diff options
29 files changed, 1231 insertions, 610 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index f05023510253..33ecf266570f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5415,6 +5415,23 @@ F: fs/fuse/ F: include/uapi/linux/fuse.h F: Documentation/filesystems/fuse.txt +FUTEX SUBSYSTEM +M: Thomas Gleixner <tglx@linutronix.de> +M: Ingo Molnar <mingo@redhat.com> +R: Peter Zijlstra <peterz@infradead.org> +R: Darren Hart <dvhart@infradead.org> +L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core +S: Maintained +F: kernel/futex.c +F: kernel/futex_compat.c +F: include/asm-generic/futex.h +F: include/linux/futex.h +F: include/uapi/linux/futex.h +F: tools/testing/selftests/futex/ +F: tools/perf/bench/futex* +F: Documentation/*futex* + FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit) M: Rik Faith <faith@cs.unc.edu> L: linux-scsi@vger.kernel.org diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 14635c5ea025..caa5798c92f4 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) return cmpxchg(&v->counter, old, new); } +#define atomic_try_cmpxchg atomic_try_cmpxchg +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return try_cmpxchg(&v->counter, old, new); +} + static inline int atomic_xchg(atomic_t *v, int new) { return xchg(&v->counter, new); @@ -201,16 +207,12 @@ static inline void atomic_##op(int i, atomic_t *v) \ } #define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ +static inline int atomic_fetch_##op(int i, atomic_t *v) \ { \ - int old, val = atomic_read(v); \ - for (;;) { \ - old = atomic_cmpxchg(v, val, val c_op i); \ - if (old == val) \ - break; \ - val = old; \ - } \ - return old; \ + int val = atomic_read(v); \ + do { \ + } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ + return val; \ } #define ATOMIC_OPS(op, c_op) \ @@ -236,16 +238,11 @@ ATOMIC_OPS(xor, ^) */ static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { - int c, old; - c = atomic_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic_cmpxchg((v), c, c + (a)); - if (likely(old == c)) + int c = atomic_read(v); + do { + if (unlikely(c == u)) break; - c = old; - } + } while (!atomic_try_cmpxchg(v, &c, c + a)); return c; } diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 89ed2f6ae2f7..6189a433c9a9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -176,6 +176,12 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) return cmpxchg(&v->counter, old, new); } +#define atomic64_try_cmpxchg atomic64_try_cmpxchg +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +{ + return try_cmpxchg(&v->counter, old, new); +} + static inline long atomic64_xchg(atomic64_t *v, long new) { return xchg(&v->counter, new); @@ -192,17 +198,12 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c, old; - c = atomic64_read(v); - for (;;) { - if (unlikely(c == (u))) - break; - old = atomic64_cmpxchg((v), c, c + (a)); - if (likely(old == c)) - break; - c = old; - } - return c != (u); + long c = atomic64_read(v); + do { + if (unlikely(c == u)) + return false; + } while (!atomic64_try_cmpxchg(v, &c, c + a)); + return true; } #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) @@ -216,17 +217,12 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long c, old, dec; - c = atomic64_read(v); - for (;;) { + long dec, c = atomic64_read(v); + do { dec = c - 1; if (unlikely(dec < 0)) break; - old = atomic64_cmpxchg((v), c, dec); - if (likely(old == c)) - break; - c = old; - } + } while (!atomic64_try_cmpxchg(v, &c, dec)); return dec; } @@ -242,14 +238,10 @@ static inline void atomic64_##op(long i, atomic64_t *v) \ #define ATOMIC64_FETCH_OP(op, c_op) \ static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ { \ - long old, val = atomic64_read(v); \ - for (;;) { \ - old = atomic64_cmpxchg(v, val, val c_op i); \ - if (old == val) \ - break; \ - val = old; \ - } \ - return old; \ + long val = atomic64_read(v); \ + do { \ + } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ + return val; \ } #define ATOMIC64_OPS(op, c_op) \ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 97848cdfcb1a..d90296d061e8 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -153,6 +153,76 @@ extern void __add_wrong_size(void) #define cmpxchg_local(ptr, old, new) \ __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) + +#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ +({ \ + bool success; \ + __typeof__(_ptr) _old = (_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(_ptr); \ + asm volatile(lock "cmpxchgb %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "q" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(_ptr); \ + asm volatile(lock "cmpxchgw %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(_ptr); \ + asm volatile(lock "cmpxchgl %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(_ptr); \ + asm volatile(lock "cmpxchgq %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); \ +}) + +#define __try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) + +#define try_cmpxchg(ptr, pold, new) \ + __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + /* * xadd() adds "inc" to "*ptr" and atomically returns the previous * value of "*ptr". diff --git a/include/linux/atomic.h b/include/linux/atomic.h index e71835bf60a9..c56be7410130 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -423,6 +423,29 @@ #endif #endif /* atomic_cmpxchg_relaxed */ +#ifndef atomic_try_cmpxchg + +#define __atomic_try_cmpxchg(type, _p, _po, _n) \ +({ \ + typeof(_po) __po = (_po); \ + typeof(*(_po)) __r, __o = *__po; \ + __r = atomic_cmpxchg##type((_p), __o, (_n)); \ + if (unlikely(__r != __o)) \ + *__po = __r; \ + likely(__r == __o); \ +}) + +#define atomic_try_cmpxchg(_p, _po, _n) __atomic_try_cmpxchg(, _p, _po, _n) +#define atomic_try_cmpxchg_relaxed(_p, _po, _n) __atomic_try_cmpxchg(_relaxed, _p, _po, _n) +#define atomic_try_cmpxchg_acquire(_p, _po, _n) __atomic_try_cmpxchg(_acquire, _p, _po, _n) +#define atomic_try_cmpxchg_release(_p, _po, _n) __atomic_try_cmpxchg(_release, _p, _po, _n) + +#else /* atomic_try_cmpxchg */ +#define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg +#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg +#define atomic_try_cmpxchg_release atomic_try_cmpxchg +#endif /* atomic_try_cmpxchg */ + /* cmpxchg_relaxed */ #ifndef cmpxchg_relaxed #define cmpxchg_relaxed cmpxchg @@ -996,6 +1019,29 @@ static inline int atomic_dec_if_positive(atomic_t *v) #endif #endif /* atomic64_cmpxchg_relaxed */ +#ifndef atomic64_try_cmpxchg + +#define __atomic64_try_cmpxchg(type, _p, _po, _n) \ +({ \ + typeof(_po) __po = (_po); \ + typeof(*(_po)) __r, __o = *__po; \ + __r = atomic64_cmpxchg##type((_p), __o, (_n)); \ + if (unlikely(__r != __o)) \ + *__po = __r; \ + likely(__r == __o); \ +}) + +#define atomic64_try_cmpxchg(_p, _po, _n) __atomic64_try_cmpxchg(, _p, _po, _n) +#define atomic64_try_cmpxchg_relaxed(_p, _po, _n) __atomic64_try_cmpxchg(_relaxed, _p, _po, _n) +#define atomic64_try_cmpxchg_acquire(_p, _po, _n) __atomic64_try_cmpxchg(_acquire, _p, _po, _n) +#define atomic64_try_cmpxchg_release(_p, _po, _n) __atomic64_try_cmpxchg(_release, _p, _po, _n) + +#else /* atomic64_try_cmpxchg */ +#define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg +#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg +#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg +#endif /* atomic64_try_cmpxchg */ + #ifndef atomic64_andnot static inline void atomic64_andnot(long long i, atomic64_t *v) { diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 91d9049f0039..2c487e0879d5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -181,6 +181,7 @@ extern struct cred init_cred; #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ .pi_waiters = RB_ROOT, \ + .pi_top_task = NULL, \ .pi_waiters_leftmost = NULL, #else # define INIT_RT_MUTEXES(tsk) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1e327bb80838..fffe49f188e6 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -361,6 +361,8 @@ static inline void lock_set_subclass(struct lockdep_map *lock, lock_set_class(lock, lock->name, lock->key, subclass, ip); } +extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); + extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); extern void lockdep_clear_current_reclaim_state(void); extern void lockdep_trace_alloc(gfp_t mask); @@ -411,6 +413,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_set_current_reclaim_state(g) do { } while (0) diff --git a/include/linux/module.h b/include/linux/module.h index 0297c5cd7cdf..9ad68561d8c2 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -493,6 +493,7 @@ static inline int module_is_live(struct module *mod) struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr); bool is_module_percpu_address(unsigned long addr); bool is_module_text_address(unsigned long addr); @@ -660,6 +661,11 @@ static inline bool is_module_percpu_address(unsigned long addr) return false; } +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + static inline bool is_module_text_address(unsigned long addr) { return false; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 56939d3f6e53..491b3f5a5f8a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -110,6 +110,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, #endif extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); extern bool is_kernel_percpu_address(unsigned long addr); #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 0023fee4bbbc..b34aa649d204 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -6,17 +6,36 @@ #include <linux/spinlock.h> #include <linux/kernel.h> +/** + * refcount_t - variant of atomic_t specialized for reference counts + * @refs: atomic_t counter field + * + * The counter saturates at UINT_MAX and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free bugs. + */ typedef struct refcount_struct { atomic_t refs; } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +/** + * refcount_set - set a refcount's value + * @r: the refcount + * @n: value to which the refcount will be set + */ static inline void refcount_set(refcount_t *r, unsigned int n) { atomic_set(&r->refs, n); } +/** + * refcount_read - get a refcount's value + * @r: the refcount + * + * Return: the refcount's value + */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); diff --git a/include/linux/sched.h b/include/linux/sched.h index b9a421f1f76f..ba080e586dae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -779,6 +779,8 @@ struct task_struct { /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root pi_waiters; struct rb_node *pi_waiters_leftmost; + /* Updated under owner's pi_lock and rq lock */ + struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 3bd668414f61..f93329aba31a 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -18,27 +18,20 @@ static inline int rt_task(struct task_struct *p) } #ifdef CONFIG_RT_MUTEXES -extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio); -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); +/* + * Must hold either p->pi_lock or task_rq(p)->lock. + */ +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) +{ + return p->pi_top_task; +} +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { return tsk->pi_blocked_on != NULL; } #else -static inline int rt_mutex_getprio(struct task_struct *p) -{ - return p->normal_prio; -} - -static inline int rt_mutex_get_effective_prio(struct task_struct *task, - int newprio) -{ - return newprio; -} - static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; diff --git a/include/linux/smp.h b/include/linux/smp.h index 8e0cb7a0f836..68123c1fe549 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); +extern int __boot_cpu_id; + +static inline int get_boot_cpu_id(void) +{ + return __boot_cpu_id; +} + #else /* !SMP */ static inline void smp_send_stop(void) { } @@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); } static inline void smp_init(void) { } #endif +static inline int get_boot_cpu_id(void) +{ + return 0; +} + #endif /* !SMP */ /* diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9e3ef6c99e4b..ae1409ffe99a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; - __entry->prio = p->prio; + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->success = 1; /* rudiment, kill when possible */ __entry->target_cpu = task_cpu(p); ), @@ -147,6 +147,7 @@ TRACE_EVENT(sched_switch, memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; + /* XXX SCHED_DEADLINE */ ), TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", @@ -181,7 +182,7 @@ TRACE_EVENT(sched_migrate_task, TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; - __entry->prio = p->prio; + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), @@ -206,7 +207,7 @@ DECLARE_EVENT_CLASS(sched_process_template, TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; - __entry->prio = p->prio; + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", @@ -253,7 +254,7 @@ TRACE_EVENT(sched_process_wait, TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); - __entry->prio = current->prio; + __entry->prio = current->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", @@ -413,9 +414,9 @@ DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, */ TRACE_EVENT(sched_pi_setprio, - TP_PROTO(struct task_struct *tsk, int newprio), + TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task), - TP_ARGS(tsk, newprio), + TP_ARGS(tsk, pi_task), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -428,7 +429,8 @@ TRACE_EVENT(sched_pi_setprio, memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->oldprio = tsk->prio; - __entry->newprio = newprio; + __entry->newprio = pi_task ? pi_task->prio : tsk->prio; + /* XXX SCHED_DEADLINE bits missing */ ), TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", diff --git a/kernel/cpu.c b/kernel/cpu.c index 37b223e4fc05..9ae6fbe5b5cf 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ +int __boot_cpu_id; + #endif /* CONFIG_SMP */ /* Boot processor state steps */ @@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void) set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); + +#ifdef CONFIG_SMP + __boot_cpu_id = cpu; +#endif } /* diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..b30196a00b0d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; + p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } diff --git a/kernel/futex.c b/kernel/futex.c index 45858ec73941..357348a6cf6b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -802,7 +802,7 @@ static int refill_pi_state_cache(void) return 0; } -static struct futex_pi_state * alloc_pi_state(void) +static struct futex_pi_state *alloc_pi_state(void) { struct futex_pi_state *pi_state = current->pi_state_cache; @@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +static void get_pi_state(struct futex_pi_state *pi_state) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); +} + /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. @@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * Look up the task based on what TID userspace gave us. * We dont trust it. */ -static struct task_struct * futex_find_get_task(pid_t pid) +static struct task_struct *futex_find_get_task(pid_t pid) { struct task_struct *p; @@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr) pi_state->owner = NULL; raw_spin_unlock_irq(&curr->pi_lock); - rt_mutex_unlock(&pi_state->pi_mutex); - + get_pi_state(pi_state); spin_unlock(&hb->lock); + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); @@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr) * * [10] There is no transient state which leaves owner and user space * TID out of sync. + * + * + * Serialization and lifetime rules: + * + * hb->lock: + * + * hb -> futex_q, relation + * futex_q -> pi_state, relation + * + * (cannot be raw because hb can contain arbitrary amount + * of futex_q's) + * + * pi_mutex->wait_lock: + * + * {uval, pi_state} + * + * (and pi_mutex 'obviously') + * + * p->pi_lock: + * + * p->pi_state_list -> pi_state->list, relation + * + * pi_state->refcount: + * + * pi_state lifetime + * + * + * Lock order: + * + * hb->lock + * pi_mutex->wait_lock + * p->pi_lock + * */ /* @@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr) * the pi_state against the user space value. If correct, attach to * it. */ -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, +static int attach_to_pi_state(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, struct futex_pi_state **ps) { pid_t pid = uval & FUTEX_TID_MASK; + u32 uval2; + int ret; /* * Userspace might have messed up non-PI and PI futexes [3] @@ -991,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, if (unlikely(!pi_state)) return -EINVAL; + /* + * We get here with hb->lock held, and having found a + * futex_top_waiter(). This means that futex_lock_pi() of said futex_q + * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), + * which in turn means that futex_lock_pi() still has a reference on + * our pi_state. + * + * The waiter holding a reference on @pi_state also protects against + * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() + * and futex_wait_requeue_pi() as it cannot go to 0 and consequently + * free pi_state before we can take a reference ourselves. + */ WARN_ON(!atomic_read(&pi_state->refcount)); /* + * Now that we have a pi_state, we can acquire wait_lock + * and do the state validation. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Since {uval, pi_state} is serialized by wait_lock, and our current + * uval was read without holding it, it can have changed. Verify it + * still is what we expect it to be, otherwise retry the entire + * operation. + */ + if (get_futex_value_locked(&uval2, uaddr)) + goto out_efault; + + if (uval != uval2) + goto out_eagain; + + /* * Handle the owner died case: */ if (uval & FUTEX_OWNER_DIED) { @@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * is not 0. Inconsistent state. [5] */ if (pid) - return -EINVAL; + goto out_einval; /* * Take a ref on the state and return success. [4] */ - goto out_state; + goto out_attach; } /* @@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * Take a ref on the state and return success. [6] */ if (!pid) - goto out_state; + goto out_attach; } else { /* * If the owner died bit is not set, then the pi_state * must have an owner. [7] */ if (!pi_state->owner) - return -EINVAL; + goto out_einval; } /* @@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * user space TID. [9/10] */ if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; -out_state: - atomic_inc(&pi_state->refcount); + goto out_einval; + +out_attach: + get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); *ps = pi_state; return 0; + +out_einval: + ret = -EINVAL; + goto out_error; + +out_eagain: + ret = -EAGAIN; + goto out_error; + +out_efault: + ret = -EFAULT; + goto out_error; + +out_error: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; } /* @@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, /* * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. */ pi_state = alloc_pi_state(); @@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, return 0; } -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, +static int lookup_pi_state(u32 __user *uaddr, u32 uval, + struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps) { - struct futex_q *match = futex_top_waiter(hb, key); + struct futex_q *top_waiter = futex_top_waiter(hb, key); /* * If there is a waiter on that futex, validate it and * attach to the pi_state when the validation succeeds. */ - if (match) - return attach_to_pi_state(uval, match->pi_state, ps); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); /* * We are the first waiter - try to look up the owner based on @@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; - /*If user space value changed, let the caller retry */ + /* If user space value changed, let the caller retry */ return curval != uval ? -EAGAIN : 0; } @@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { u32 uval, newval, vpid = task_pid_vnr(task); - struct futex_q *match; + struct futex_q *top_waiter; int ret; /* @@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * Lookup existing state first. If it exists, try to attach to * its pi_state. */ - match = futex_top_waiter(hb, key); - if (match) - return attach_to_pi_state(uval, match->pi_state, ps); + top_waiter = futex_top_waiter(hb, key); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); /* * No waiter and user TID is 0. We are here because the @@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) wake_q_add(wake_q, p); __unqueue_futex(q); /* - * The waiting task can free the futex_q as soon as - * q->lock_ptr = NULL is written, without taking any locks. A - * memory barrier is required here to prevent the following - * store to lock_ptr from getting ahead of the plist_del. + * The waiting task can free the futex_q as soon as q->lock_ptr = NULL + * is written, without taking any locks. This is possible in the event + * of a spurious wakeup, for example. A memory barrier is required here + * to prevent the following store to lock_ptr from getting ahead of the + * plist_del in __unqueue_futex(). */ - smp_wmb(); - q->lock_ptr = NULL; + smp_store_release(&q->lock_ptr, NULL); } -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, - struct futex_hash_bucket *hb) +/* + * Caller must hold a reference on @pi_state. + */ +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - struct task_struct *new_owner; - struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + struct task_struct *new_owner; + bool postunlock = false; DEFINE_WAKE_Q(wake_q); - bool deboost; int ret = 0; - if (!pi_state) - return -EINVAL; - - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - return -EINVAL; - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + if (WARN_ON_ONCE(!new_owner)) { + /* + * As per the comment in futex_unlock_pi() this should not happen. + * + * When this happens, give up our locks and try again, giving + * the futex_lock_pi() instance time to complete, either by + * waiting on the rtmutex or removing itself from the futex + * queue. + */ + ret = -EAGAIN; + goto out_unlock; + } /* - * It is possible that the next waiter (the one that brought - * this owner to the kernel) timed out and is no longer - * waiting on the lock. - */ - if (!new_owner) - new_owner = this->task; - - /* - * We pass it to the next owner. The WAITERS bit is always - * kept enabled while there is PI state around. We cleanup the - * owner died bit, because we are the owner. + * We pass it to the next owner. The WAITERS bit is always kept + * enabled while there is PI state around. We cleanup the owner + * died bit, because we are the owner. */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); @@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; + } else if (curval != uval) { /* * If a unconditional UNLOCK_PI operation (user space did not @@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else ret = -EINVAL; } - if (ret) { - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return ret; - } + + if (ret) + goto out_unlock; + + /* + * This is a point of no return; once we modify the uval there is no + * going back and subsequent operations must not fail. + */ raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, pi_state->owner = new_owner; raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); - deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - /* - * First unlock HB so the waiter does not spin on it once he got woken - * up. Second wake up the waiter before the priority is adjusted. If we - * deboost first (and lose our higher priority), then the task might get - * scheduled away before the wake up can take place. - */ - spin_unlock(&hb->lock); - wake_up_q(&wake_q); - if (deboost) - rt_mutex_adjust_prio(current); + if (postunlock) + rt_mutex_postunlock(&wake_q); - return 0; + return ret; } /* @@ -1826,7 +1913,7 @@ retry_private: * If that call succeeds then we have pi_state and an * initial refcount on it. */ - ret = lookup_pi_state(ret, hb2, &key2, &pi_state); + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); } switch (ret) { @@ -1909,7 +1996,7 @@ retry_private: * refcount on the pi_state and store the pointer in * the futex_q object of the waiter. */ - atomic_inc(&pi_state->refcount); + get_pi_state(pi_state); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, @@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb) hb_waiters_dec(hb); } -/** - * queue_me() - Enqueue the futex_q on the futex_hash_bucket - * @q: The futex_q to enqueue - * @hb: The destination hash bucket - * - * The hb->lock must be held by the caller, and is released here. A call to - * queue_me() is typically paired with exactly one call to unqueue_me(). The - * exceptions involve the PI related operations, which may use unqueue_me_pi() - * or nothing if the unqueue is done as part of the wake process and the unqueue - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). - */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + __queue_me(q, hb); spin_unlock(&hb->lock); } @@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; - struct task_struct *oldowner = pi_state->owner; u32 uval, uninitialized_var(curval), newval; + struct task_struct *oldowner; int ret; + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + oldowner = pi_state->owner; /* Owner died? */ if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; @@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the * previous highest priority waiter or we are the highest priority - * waiter but failed to get the rtmutex the first time. + * waiter but have failed to get the rtmutex the first time. + * * We have to replace the newowner TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * @@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * because we can fault here. Imagine swapped out pages or a fork * that marked all the anonymous memory readonly for cow. * - * Modifying pi_state _before_ the user space value would - * leave the pi_state in an inconsistent state when we fault - * here, because we need to drop the hash bucket lock to - * handle the fault. This might be observed in the PID check - * in lookup_pi_state. + * Modifying pi_state _before_ the user space value would leave the + * pi_state in an inconsistent state when we fault here, because we + * need to drop the locks to handle the fault. This might be observed + * in the PID check in lookup_pi_state. */ retry: if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; - while (1) { + for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) @@ -2169,47 +2264,60 @@ retry: * itself. */ if (pi_state->owner != NULL) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); } pi_state->owner = newowner; - raw_spin_lock_irq(&newowner->pi_lock); + raw_spin_lock(&newowner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock_irq(&newowner->pi_lock); + raw_spin_unlock(&newowner->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return 0; /* - * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the highest priority - * waiter itself or the task which stole the rtmutex) the - * chance to try the fixup of the pi_state. So once we are - * back from handling the fault we need to check the pi_state - * after reacquiring the hash bucket lock and before trying to - * do another fixup. When the fixup has been done already we - * simply return. + * To handle the page fault we need to drop the locks here. That gives + * the other task (either the highest priority waiter itself or the + * task which stole the rtmutex) the chance to try the fixup of the + * pi_state. So once we are back from handling the fault we need to + * check the pi_state after reacquiring the locks and before trying to + * do another fixup. When the fixup has been done already we simply + * return. + * + * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely + * drop hb->lock since the caller owns the hb -> futex_q relation. + * Dropping the pi_mutex->wait_lock requires the state revalidate. */ handle_fault: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); ret = fault_in_user_writeable(uaddr); spin_lock(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* * Check if someone else fixed it for us: */ - if (pi_state->owner != oldowner) - return 0; + if (pi_state->owner != oldowner) { + ret = 0; + goto out_unlock; + } if (ret) - return ret; + goto out_unlock; goto retry; + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; } static long futex_wait_restart(struct restart_block *restart); @@ -2231,13 +2339,16 @@ static long futex_wait_restart(struct restart_block *restart); */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { - struct task_struct *owner; int ret = 0; if (locked) { /* * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: + * + * We can safely read pi_state->owner without holding wait_lock + * because we now own the rt_mutex, only the owner will attempt + * to change it. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); @@ -2245,43 +2356,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) } /* - * Catch the rare case, where the lock was released when we were on the - * way back before we locked the hash bucket. - */ - if (q->pi_state->owner == current) { - /* - * Try to get the rt_mutex now. This might fail as some other - * task acquired the rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { - locked = 1; - goto out; - } - - /* - * pi_state is incorrect, some other task did a lock steal and - * we returned due to timeout or signal without taking the - * rt_mutex. Too late. - */ - raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); - owner = rt_mutex_owner(&q->pi_state->pi_mutex); - if (!owner) - owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); - ret = fixup_pi_state_owner(uaddr, q, owner); - goto out; - } - - /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " "pi-state %p\n", ret, q->pi_state->pi_mutex.owner, q->pi_state->owner); + } out: return ret ? ret : locked; @@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; int res, ret; @@ -2557,25 +2642,68 @@ retry_private: } } + WARN_ON(!q.pi_state); + /* * Only actually queue now that the atomic ops are done: */ - queue_me(&q, hb); + __queue_me(&q, hb); - WARN_ON(!q.pi_state); - /* - * Block on the PI mutex: - */ - if (!trylock) { - ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); - } else { - ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); /* Fixup the trylock return value: */ ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; } + rt_mutex_init_waiter(&rt_waiter); + + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling rt_mutex_start_proxy_lock(). This still fully + * serializes against futex_unlock_pi() as that does the exact same + * lock handoff sequence. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); + + if (ret) { + if (ret == 1) + ret = 0; + + spin_lock(q.lock_ptr); + goto no_block; + } + + + if (unlikely(to)) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); + + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + spin_lock(q.lock_ptr); /* + * If we failed to acquire the lock (signal/timeout), we must + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex + * wait lists consistent. + * + * In particular; it is important that futex_unlock_pi() can not + * observe this inconsistency. + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; + +no_block: + /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ @@ -2591,12 +2719,19 @@ retry_private: * If fixup_owner() faulted and was unable to handle the fault, unlock * it and return the fault to userspace. */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) - rt_mutex_unlock(&q.pi_state->pi_mutex); + if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* Unqueue and drop the lock */ unqueue_me_pi(&q); + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + goto out_put_key; out_unlock_put_key: @@ -2605,8 +2740,10 @@ out_unlock_put_key: out_put_key: put_futex_key(&q.key); out: - if (to) + if (to) { + hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); + } return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: @@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; struct futex_hash_bucket *hb; - struct futex_q *match; + struct futex_q *top_waiter; int ret; retry: @@ -2657,12 +2794,37 @@ retry: * all and we at least want to know if user space fiddled * with the futex value instead of blindly unlocking. */ - match = futex_top_waiter(hb, &key); - if (match) { - ret = wake_futex_pi(uaddr, uval, match, hb); + top_waiter = futex_top_waiter(hb, &key); + if (top_waiter) { + struct futex_pi_state *pi_state = top_waiter->pi_state; + + ret = -EINVAL; + if (!pi_state) + goto out_unlock; + /* - * In case of success wake_futex_pi dropped the hash - * bucket lock. + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + goto out_unlock; + + get_pi_state(pi_state); + /* + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we + * observed. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); + + /* + * Success, we're done! No tricky corner cases. */ if (!ret) goto out_putkey; @@ -2677,7 +2839,6 @@ retry: * setting the FUTEX_WAITERS bit. Try again. */ if (ret == -EAGAIN) { - spin_unlock(&hb->lock); put_futex_key(&key); goto retry; } @@ -2685,7 +2846,7 @@ retry: * wake_futex_pi has detected invalid state. Tell user * space. */ - goto out_unlock; + goto out_putkey; } /* @@ -2695,8 +2856,10 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + spin_unlock(&hb->lock); goto pi_faulted; + } /* * If uval has changed, let user space handle it. @@ -2710,7 +2873,6 @@ out_putkey: return ret; pi_faulted: - spin_unlock(&hb->lock); put_futex_key(&key); ret = fault_in_user_writeable(uaddr); @@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; @@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - debug_rt_mutex_init_waiter(&rt_waiter); - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); - RB_CLEAR_NODE(&rt_waiter.tree_entry); - rt_waiter.task = NULL; + rt_mutex_init_waiter(&rt_waiter); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) - rt_mutex_unlock(&q.pi_state->pi_mutex); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. @@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); - debug_rt_mutex_free_waiter(&rt_waiter); + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); spin_lock(q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + + debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. @@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * the fault, unlock the rt_mutex and return the fault to * userspace. */ - if (ret && rt_mutex_owner(pi_mutex) == current) - rt_mutex_unlock(pi_mutex); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a95e5d1f4a9c..98dd6231d43b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -660,6 +660,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + bool is_static = false; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); @@ -673,10 +674,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) /* * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: + * is the lock object itself. If the lock is in the per cpu area, + * the canonical address of the lock (per cpu offset removed) is + * used. */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; + if (unlikely(!lock->key)) { + unsigned long can_addr, addr = (unsigned long)lock; + + if (__is_kernel_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (__is_module_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (static_obj(lock)) + lock->key = (void *)lock; + else + return ERR_PTR(-EINVAL); + is_static = true; + } /* * NOTE: the class-key must be unique. For dynamic locks, a static @@ -708,7 +722,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } } - return NULL; + return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } /* @@ -726,19 +740,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); - if (likely(class)) + if (likely(!IS_ERR_OR_NULL(class))) goto out_set_class_cache; /* * Debug-check: all keys must be persistent! - */ - if (!static_obj(lock->key)) { + */ + if (IS_ERR(class)) { debug_locks_off(); printk("INFO: trying to register non-static key.\n"); printk("the code is fine but needs lockdep annotation.\n"); printk("turning off the locking correctness validator.\n"); dump_stack(); - return NULL; } @@ -3419,7 +3432,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. */ - if (!class) + if (IS_ERR_OR_NULL(class)) return 0; /* @@ -3437,13 +3450,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 0; } +/* @depth must not be zero */ +static struct held_lock *find_held_lock(struct task_struct *curr, + struct lockdep_map *lock, + unsigned int depth, int *idx) +{ + struct held_lock *ret, *hlock, *prev_hlock; + int i; + + i = depth - 1; + hlock = curr->held_locks + i; + ret = hlock; + if (match_held_lock(hlock, lock)) + goto out; + + ret = NULL; + for (i--, prev_hlock = hlock--; + i >= 0; + i--, prev_hlock = hlock--) { + /* + * We must not cross into another context: + */ + if (prev_hlock->irq_context != hlock->irq_context) { + ret = NULL; + break; + } + if (match_held_lock(hlock, lock)) { + ret = hlock; + break; + } + } + +out: + *idx = i; + return ret; +} + +static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, + int idx) +{ + struct held_lock *hlock; + + for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { + if (!__lock_acquire(hlock->instance, + hlock_class(hlock)->subclass, + hlock->trylock, + hlock->read, hlock->check, + hlock->hardirqs_off, + hlock->nest_lock, hlock->acquire_ip, + hlock->references, hlock->pin_count)) + return 1; + } + return 0; +} + static int __lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class *class; unsigned int depth; int i; @@ -3456,21 +3523,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); -found_it: lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes + 1; @@ -3478,15 +3534,46 @@ found_it: curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - for (; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) - return 0; - } + if (reacquire_held_locks(curr, depth, i)) + return 0; + + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + +static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! + */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + WARN(hlock->read, "downgrading a read lock"); + hlock->read = 1; + hlock->acquire_ip = ip; + + if (reacquire_held_locks(curr, depth, i)) + return 0; /* * I took it apart and put it back together again, except now I have @@ -3508,7 +3595,7 @@ static int __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; unsigned int depth; int i; @@ -3527,21 +3614,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * Check whether the lock exists in the current stack * of held locks: */ - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); -found_it: if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -3568,15 +3644,8 @@ found_it: curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - for (i++; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) - return 0; - } + if (reacquire_held_locks(curr, depth, i + 1)) + return 0; /* * We had N bottles of beer on the wall, we drank one, but now @@ -3741,6 +3810,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); +void lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_downgrade(lock, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_downgrade); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -3903,7 +3989,7 @@ static void __lock_contended(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class_stats *stats; unsigned int depth; int i, contention_point, contending_point; @@ -3916,22 +4002,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) { + print_lock_contention_bug(curr, lock, ip); + return; } - print_lock_contention_bug(curr, lock, ip); - return; -found_it: if (hlock->instance != lock) return; @@ -3955,7 +4031,7 @@ static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class_stats *stats; unsigned int depth; u64 now, waittime = 0; @@ -3969,22 +4045,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) { + print_lock_contention_bug(curr, lock, _RET_IP_); + return; } - print_lock_contention_bug(curr, lock, _RET_IP_); - return; -found_it: if (hlock->instance != lock) return; @@ -4172,7 +4238,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (class) + if (!IS_ERR_OR_NULL(class)) zap_class(class); } /* diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 97ee9df32e0f..32fe775a2eaf 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -174,12 +174,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) lock->name = name; } -void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) -{ -} - -void rt_mutex_deadlock_account_unlock(struct task_struct *task) -{ -} - diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index d0519c3432b6..b585af9a1b50 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -9,9 +9,6 @@ * This file contains macros used solely by rtmutex.c. Debug version. */ -extern void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6edc32ecd9c5..b95509416909 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #endif +/* + * Only use with rt_mutex_waiter_{less,equal}() + */ +#define task_to_waiter(p) \ + &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + static inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, struct rt_mutex_waiter *right) @@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, * then right waiter has a dl_prio() too. */ if (dl_prio(left->prio)) - return dl_time_before(left->task->dl.deadline, - right->task->dl.deadline); + return dl_time_before(left->deadline, right->deadline); return 0; } +static inline int +rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) +{ + if (left->prio != right->prio) + return 0; + + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. + * If left waiter has a dl_prio(), and we didn't return 0 above, + * then right waiter has a dl_prio() too. + */ + if (dl_prio(left->prio)) + return left->deadline == right->deadline; + + return 1; +} + static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { @@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->pi_tree_entry); } -/* - * Calculate task priority from the waiter tree priority - * - * Return task->normal_prio when the waiter tree is empty or when - * the waiter is not allowed to do priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->prio, - task->normal_prio); -} - -struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +static void rt_mutex_adjust_prio(struct task_struct *p) { - if (likely(!task_has_pi_waiters(task))) - return NULL; - - return task_top_pi_waiter(task)->task; -} + struct task_struct *pi_task = NULL; -/* - * Called by sched_setscheduler() to get the priority which will be - * effective after the change. - */ -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) -{ - if (!task_has_pi_waiters(task)) - return newprio; + lockdep_assert_held(&p->pi_lock); - if (task_top_pi_waiter(task)->task->prio <= newprio) - return task_top_pi_waiter(task)->task->prio; - return newprio; -} + if (task_has_pi_waiters(p)) + pi_task = task_top_pi_waiter(p)->task; -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. - */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio || dl_prio(prio)) - rt_mutex_setprio(task, prio); -} - -/* - * Adjust task priority (undo boosting). Called from the exit path of - * rt_mutex_slowunlock() and rt_mutex_slowlock(). - * - * (Note: We do this outside of the protection of lock->wait_lock to - * allow the lock to be taken while or before we readjust the priority - * of task. We do not use the spin_xx_mutex() variants here as we are - * outside of the debug path.) - */ -void rt_mutex_adjust_prio(struct task_struct *task) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + rt_mutex_setprio(p, pi_task); } /* @@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (waiter->prio == task->prio) { + if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* [7] Requeue the waiter in the lock waiter tree. */ rt_mutex_dequeue(lock, waiter); + + /* + * Update the waiter prio fields now that we're dequeued. + * + * These values can have changed through either: + * + * sys_sched_set_scheduler() / sys_sched_setattr() + * + * or + * + * DL CBS enforcement advancing the effective deadline. + * + * Even though pi_waiters also uses these fields, and that tree is only + * updated in [11], we can do this here, since we hold [L], which + * serializes all pi_waiters access and rb_erase() does not care about + * the values of the node being removed. + */ waiter->prio = task->prio; + waiter->deadline = task->dl.deadline; + rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ @@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else if (prerequeue_top_waiter == waiter) { /* @@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else { /* * Nothing changed. No need to do any priority @@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { + lockdep_assert_held(&lock->wait_lock); + /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * the top waiter priority (kernel view), * @task lost. */ - if (task->prio >= rt_mutex_top_waiter(lock)->prio) + if (!rt_mutex_waiter_less(task_to_waiter(task), + rt_mutex_top_waiter(lock))) return 0; /* @@ -938,8 +928,6 @@ takeit: */ rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, task); - return 1; } @@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex *next_lock; int chain_walk = 0, res; + lockdep_assert_held(&lock->wait_lock); + /* * Early deadlock detection. We really don't want the task to * enqueue on itself just to untangle the mess later. It's not @@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; + waiter->deadline = task->dl.deadline; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, waiter = rt_mutex_top_waiter(lock); /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. + * Remove it from current->pi_waiters and deboost. + * + * We must in fact deboost here in order to ensure we call + * rt_mutex_setprio() to update p->pi_top_task before the + * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); + rt_mutex_adjust_prio(current); /* * As we are waking up the top waiter, and the waiter stays @@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock(¤t->pi_lock); - + /* + * We deboosted before waking the top waiter task such that we don't + * run two tasks with the 'same' priority (and ensure the + * p->pi_top_task pointer points to a blocked task). This however can + * lead to priority inversion if we would get preempted after the + * deboost but before waking our donor task, hence the preempt_disable() + * before unlock. + * + * Pairs with preempt_enable() in rt_mutex_postunlock(); + */ + preempt_disable(); wake_q_add(wake_q, waiter->task); + raw_spin_unlock(¤t->pi_lock); } /* @@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock, struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; @@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); @@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || (waiter->prio == task->prio && - !dl_prio(task->prio))) { + if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task) next_lock, NULL, task); } +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->task = NULL; +} + /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take @@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, unsigned long flags; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - RB_CLEAR_NODE(&waiter.pi_tree_entry); - RB_CLEAR_NODE(&waiter.tree_entry); + rt_mutex_init_waiter(&waiter); /* * Technically we could use raw_spin_[un]lock_irq() here, but this can @@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path to release a rt-mutex. - * Return whether the current task needs to undo a potential priority boosting. + * + * Return whether the current task needs to call rt_mutex_postunlock(). */ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) @@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, debug_rt_mutex_unlock(lock); - rt_mutex_deadlock_account_unlock(current); - /* * We must be careful here if the fast path is enabled. If we * have no waiters queued we cannot set owner to NULL here @@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * Queue the next waiter for wakeup once we release the wait_lock. */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - /* check PI boosting */ - return true; + return true; /* call rt_mutex_postunlock() */ } /* @@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, enum rtmutex_chainwalk chwalk)) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - } else - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); } static inline int @@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk)) { if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - } else - return slowfn(lock, state, timeout, chwalk); + + return slowfn(lock, state, timeout, chwalk); } static inline int rt_mutex_fasttrylock(struct rt_mutex *lock, int (*slowfn)(struct rt_mutex *lock)) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 1; - } + return slowfn(lock); } +/* + * Performs the wakeup of the the top-waiter and re-enables preemption. + */ +void rt_mutex_postunlock(struct wake_q_head *wake_q) +{ + wake_up_q(wake_q); + + /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ + preempt_enable(); +} + static inline void rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, @@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock, { DEFINE_WAKE_Q(wake_q); - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); - - } else { - bool deboost = slowfn(lock, &wake_q); - - wake_up_q(&wake_q); + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; - /* Undo pi boosting if necessary: */ - if (deboost) - rt_mutex_adjust_prio(current); - } + if (slowfn(lock, &wake_q)) + rt_mutex_postunlock(&wake_q); } /** @@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /* - * Futex variant with full deadlock detection. + * Futex variant, must not use fastpath. */ -int rt_mutex_timed_futex_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *timeout) +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) { - might_sleep(); - - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_FULL_CHAINWALK, - rt_mutex_slowlock); + return rt_mutex_slowtrylock(lock); } /** @@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** - * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock - * @lock: the rt_mutex to be unlocked - * - * Returns: true/false indicating whether priority adjustment is - * required or not. + * Futex variant, that since futex variants do not use the fast-path, can be + * simple and will not need to retry. */ -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh) +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wake_q) { - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); - return false; + lockdep_assert_held(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + return false; /* done */ } - return rt_mutex_slowunlock(lock, wqh); + + /* + * We've already deboosted, mark_wakeup_next_waiter() will + * retain preempt_disabled when we drop the wait_lock, to + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). + */ + mark_wakeup_next_waiter(wake_q, lock); + + return true; /* call postunlock() */ +} + +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +{ + DEFINE_WAKE_Q(wake_q); + bool postunlock; + + raw_spin_lock_irq(&lock->wait_lock); + postunlock = __rt_mutex_futex_unlock(lock, &wake_q); + raw_spin_unlock_irq(&lock->wait_lock); + + if (postunlock) + rt_mutex_postunlock(&wake_q); } /** @@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); - rt_mutex_deadlock_account_lock(lock, proxy_owner); } /** @@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); - rt_mutex_deadlock_account_unlock(proxy_owner); } -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for FUTEX_REQUEUE_PI support. - */ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { int ret; - raw_spin_lock_irq(&lock->wait_lock); - - if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock_irq(&lock->wait_lock); + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; - } /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, @@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock_irq(&lock->wait_lock); - debug_rt_mutex_print_deadlock(waiter); return ret; } /** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) +{ + int ret; + + raw_spin_lock_irq(&lock->wait_lock); + ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** * rt_mutex_next_owner - return the next owner of the lock * * @lock: the rt lock query @@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) } /** - * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition * @lock: the rt_mutex we were woken on * @to: the timeout, null if none. hrtimer should already have * been started. * @waiter: the pre-initialized rt_mutex_waiter * - * Complete the lock acquisition started our behalf by another thread. + * Wait for the the lock acquisition started on our behalf by + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call + * rt_mutex_cleanup_proxy_lock(). * * Returns: * 0 - success * <0 - error, one of -EINTR, -ETIMEDOUT * - * Special API call for PI-futex requeue support + * Special API call for PI-futex support */ -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, +int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter) { @@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, /* sleep on the mutex */ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - if (unlikely(ret)) + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +} + +/** + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition + * @lock: the rt_mutex we were woken on + * @waiter: the pre-initialized rt_mutex_waiter + * + * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). + * + * Unless we acquired the lock; we're still enqueued on the wait-list and can + * in fact still be granted ownership until we're removed. Therefore we can + * find we are in fact the owner and must disregard the + * rt_mutex_wait_proxy_lock() failure. + * + * Returns: + * true - did the cleanup, we done. + * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned, + * caller should disregards its return value. + * + * Special API call for PI-futex support + */ +bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) +{ + bool cleanup = false; + + raw_spin_lock_irq(&lock->wait_lock); + /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { remove_waiter(lock, waiter); + fixup_rt_mutex_waiters(lock); + cleanup = true; + } /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might @@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock_irq(&lock->wait_lock); - return ret; + return cleanup; } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index c4060584c407..6607802efa8b 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -11,8 +11,6 @@ */ #define rt_mutex_deadlock_check(l) (0) -#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) -#define rt_mutex_deadlock_account_unlock(l) do { } while (0) #define debug_rt_mutex_init_waiter(w) do { } while (0) #define debug_rt_mutex_free_waiter(w) do { } while (0) #define debug_rt_mutex_lock(l) do { } while (0) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 856dfff5c33a..72ad45a9a794 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -34,6 +34,7 @@ struct rt_mutex_waiter { struct rt_mutex *deadlock_lock; #endif int prio; + u64 deadline; }; /* @@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter); -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); -extern void rt_mutex_adjust_prio(struct task_struct *task); +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter); +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); + +extern int rt_mutex_futex_trylock(struct rt_mutex *l); + +extern void rt_mutex_futex_unlock(struct rt_mutex *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh); + +extern void rt_mutex_postunlock(struct wake_q_head *wake_q); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 90a74ccd85a4..4d48b1c4870d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write); */ void downgrade_write(struct rw_semaphore *sem) { - /* - * lockdep: a downgraded write will live on as a write - * dependency. - */ + lock_downgrade(&sem->dep_map, _RET_IP_); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 6b7abb334ca6..39f56c870051 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus) struct stress { struct work_struct work; struct ww_mutex *locks; + unsigned long timeout; int nlocks; - int nloops; }; static int *get_random_order(int count) @@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work) if (!order) return; - ww_acquire_init(&ctx, &ww_class); - do { int contended = -1; int n, err; + ww_acquire_init(&ctx, &ww_class); retry: err = 0; for (n = 0; n < nlocks; n++) { @@ -433,9 +432,9 @@ retry: __func__, err); break; } - } while (--stress->nloops); - ww_acquire_fini(&ctx); + ww_acquire_fini(&ctx); + } while (!time_after(jiffies, stress->timeout)); kfree(order); kfree(stress); @@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work) kfree(order); order = NULL; - ww_acquire_init(&ctx, &ww_class); - do { + ww_acquire_init(&ctx, &ww_class); + list_for_each_entry(ll, &locks, link) { err = ww_mutex_lock(ll->lock, &ctx); if (!err) @@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work) dummy_load(stress); list_for_each_entry(ll, &locks, link) ww_mutex_unlock(ll->lock); - } while (--stress->nloops); - ww_acquire_fini(&ctx); + ww_acquire_fini(&ctx); + } while (!time_after(jiffies, stress->timeout)); out: list_for_each_entry_safe(ll, ln, &locks, link) @@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work) __func__, err); break; } - } while (--stress->nloops); + } while (!time_after(jiffies, stress->timeout)); kfree(stress); } @@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work) #define STRESS_ONE BIT(2) #define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) -static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) +static int stress(int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; int n; @@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) INIT_WORK(&stress->work, fn); stress->locks = locks; stress->nlocks = nlocks; - stress->nloops = nloops; + stress->timeout = jiffies + 2*HZ; queue_work(wq, &stress->work); nthreads--; @@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); + ret = stress(16, 2*ncpus, STRESS_INORDER); if (ret) return ret; - ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); + ret = stress(16, 2*ncpus, STRESS_REORDER); if (ret) return ret; - ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); + ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; diff --git a/kernel/module.c b/kernel/module.c index 7eba6dea4f41..6d9988031c5b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -665,16 +665,7 @@ static void percpu_modcopy(struct module *mod, memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); } -/** - * is_module_percpu_address - test whether address is from module static percpu - * @addr: address to test - * - * Test whether @addr belongs to module static percpu area. - * - * RETURNS: - * %true if @addr is from module static percpu area - */ -bool is_module_percpu_address(unsigned long addr) +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { struct module *mod; unsigned int cpu; @@ -688,9 +679,15 @@ bool is_module_percpu_address(unsigned long addr) continue; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(mod->percpu, cpu); - - if ((void *)addr >= start && - (void *)addr < start + mod->percpu_size) { + void *va = (void *)addr; + + if (va >= start && va < start + mod->percpu_size) { + if (can_addr) { + *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(mod->percpu, + get_boot_cpu_id()); + } preempt_enable(); return true; } @@ -701,6 +698,20 @@ bool is_module_percpu_address(unsigned long addr) return false; } +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + return __is_module_percpu_address(addr, NULL); +} + #else /* ... !CONFIG_SMP */ static inline void __percpu *mod_percpu(struct module *mod) @@ -732,6 +743,11 @@ bool is_module_percpu_address(unsigned long addr) return false; } +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d16db7067d6a..c51147a1204c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function); #ifdef CONFIG_RT_MUTEXES +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + /* * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) + * @p: task to boost + * @pi_task: donor task * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). @@ -3682,18 +3697,42 @@ EXPORT_SYMBOL(default_wake_function); * Used by the rt_mutex code to implement priority inheritance * logic. Call site only calls if the priority of the task changed. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int oldprio, queued, running, queue_flag = + int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; struct rq *rq; - BUG_ON(prio > MAX_PRIO); + /* XXX used to be waiter->prio, not waiter->task->prio */ + prio = __rt_effective_prio(pi_task, p->normal_prio); + + /* + * If nothing changed; bail early. + */ + if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) + return; rq = __task_rq_lock(p, &rf); update_rq_clock(rq); + /* + * Set under pi_lock && rq->lock, such that the value can be used under + * either lock. + * + * Note that there is loads of tricky to make this pointer cache work + * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to + * ensure a task is de-boosted (pi_task is set to NULL) before the + * task is allowed to run again (and can exit). This ensures the pointer + * points to a blocked task -- which guaratees the task is present. + */ + p->pi_top_task = pi_task; + + /* + * For FIFO/RR we only need to set prio, if that matches we're done. + */ + if (prio == p->prio && !dl_prio(prio)) + goto out_unlock; /* * Idle task boosting is a nono in general. There is one @@ -3713,7 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) goto out_unlock; } - trace_sched_pi_setprio(p, prio); + trace_sched_pi_setprio(p, pi_task); oldprio = p->prio; if (oldprio == prio) @@ -3737,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) * running task */ if (dl_prio(prio)) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; @@ -3775,6 +3813,11 @@ out_unlock: balance_callback(rq); preempt_enable(); } +#else +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} #endif void set_user_nice(struct task_struct *p, long nice) @@ -4021,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * Keep a potential priority boosting if called from * sched_setscheduler(). */ + p->prio = normal_prio(p); if (keep_boost) - p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); - else - p->prio = normal_prio(p); + p->prio = rt_effective_prio(p, p->prio); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -4311,7 +4353,7 @@ change: * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + new_effective_prio = rt_effective_prio(p, newprio); if (new_effective_prio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } diff --git a/lib/refcount.c b/lib/refcount.c index aa09ad3c30b0..f42124ccf295 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -37,11 +37,29 @@ #include <linux/refcount.h> #include <linux/bug.h> +/** + * refcount_add_not_zero - add a value to a refcount unless it is 0 + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + * + * Return: false if the passed refcount is 0, true otherwise + */ bool refcount_add_not_zero(unsigned int i, refcount_t *r) { - unsigned int old, new, val = atomic_read(&r->refs); + unsigned int new, val = atomic_read(&r->refs); - for (;;) { + do { if (!val) return false; @@ -51,12 +69,8 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r) new = val + i; if (new < val) new = UINT_MAX; - old = atomic_cmpxchg_relaxed(&r->refs, val, new); - if (old == val) - break; - val = old; - } + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); @@ -64,24 +78,45 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_add_not_zero); +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + */ void refcount_add(unsigned int i, refcount_t *r) { WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); } EXPORT_SYMBOL_GPL(refcount_add); -/* - * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN. +/** + * refcount_inc_not_zero - increment a refcount unless it is 0 + * @r: the refcount to increment + * + * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise */ bool refcount_inc_not_zero(refcount_t *r) { - unsigned int old, new, val = atomic_read(&r->refs); + unsigned int new, val = atomic_read(&r->refs); - for (;;) { + do { new = val + 1; if (!val) @@ -90,12 +125,7 @@ bool refcount_inc_not_zero(refcount_t *r) if (unlikely(!new)) return true; - old = atomic_cmpxchg_relaxed(&r->refs, val, new); - if (old == val) - break; - - val = old; - } + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); @@ -103,11 +133,17 @@ bool refcount_inc_not_zero(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_inc_not_zero); -/* - * Similar to atomic_inc(), will saturate at UINT_MAX and WARN. +/** + * refcount_inc - increment a refcount + * @r: the refcount to increment + * + * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN. * * Provides no memory ordering, it is assumed the caller already has a - * reference on the object, will WARN when this is not so. + * reference on the object. + * + * Will WARN if the refcount is 0, as this represents a possible use-after-free + * condition. */ void refcount_inc(refcount_t *r) { @@ -115,11 +151,31 @@ void refcount_inc(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_inc); +/** + * refcount_sub_and_test - subtract from a refcount and test if it is 0 + * @i: amount to subtract from the refcount + * @r: the refcount + * + * Similar to atomic_dec_and_test(), but it will WARN, return false and + * ultimately leak on underflow and will fail to decrement when saturated + * at UINT_MAX. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides a control dependency such that free() must come after. + * See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_dec(), or one of its variants, should instead be used to + * decrement a reference count. + * + * Return: true if the resulting refcount is 0, false otherwise + */ bool refcount_sub_and_test(unsigned int i, refcount_t *r) { - unsigned int old, new, val = atomic_read(&r->refs); + unsigned int new, val = atomic_read(&r->refs); - for (;;) { + do { if (unlikely(val == UINT_MAX)) return false; @@ -129,24 +185,24 @@ bool refcount_sub_and_test(unsigned int i, refcount_t *r) return false; } - old = atomic_cmpxchg_release(&r->refs, val, new); - if (old == val) - break; - - val = old; - } + } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return !new; } EXPORT_SYMBOL_GPL(refcount_sub_and_test); -/* +/** + * refcount_dec_and_test - decrement a refcount and test if it is 0 + * @r: the refcount + * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. + * + * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_and_test(refcount_t *r) { @@ -154,21 +210,26 @@ bool refcount_dec_and_test(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_dec_and_test); -/* +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before. */ - void refcount_dec(refcount_t *r) { WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); } EXPORT_SYMBOL_GPL(refcount_dec); -/* +/** + * refcount_dec_if_one - decrement a refcount if it is 1 + * @r: the refcount + * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * @@ -178,24 +239,33 @@ EXPORT_SYMBOL_GPL(refcount_dec); * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. + * + * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { - return atomic_cmpxchg_release(&r->refs, 1, 0) == 1; + int val = 1; + + return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL_GPL(refcount_dec_if_one); -/* +/** + * refcount_dec_not_one - decrement a refcount if it is not 1 + * @r: the refcount + * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) + * + * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { - unsigned int old, new, val = atomic_read(&r->refs); + unsigned int new, val = atomic_read(&r->refs); - for (;;) { + do { if (unlikely(val == UINT_MAX)) return true; @@ -208,24 +278,27 @@ bool refcount_dec_not_one(refcount_t *r) return true; } - old = atomic_cmpxchg_release(&r->refs, val, new); - if (old == val) - break; - - val = old; - } + } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL_GPL(refcount_dec_not_one); -/* +/** + * refcount_dec_and_mutex_lock - return holding mutex if able to decrement + * refcount to 0 + * @r: the refcount + * @lock: the mutex to be locked + * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. + * + * Return: true and hold mutex if able to decrement refcount to 0, false + * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { @@ -242,13 +315,21 @@ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) } EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock); -/* +/** + * refcount_dec_and_lock - return holding spinlock if able to decrement + * refcount to 0 + * @r: the refcount + * @lock: the spinlock to be locked + * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. + * + * Return: true and hold spinlock if able to decrement refcount to 0, false + * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { diff --git a/mm/percpu.c b/mm/percpu.c index 60a6488e9e6d..e0aa8ae7bde7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1284,18 +1284,7 @@ void free_percpu(void __percpu *ptr) } EXPORT_SYMBOL_GPL(free_percpu); -/** - * is_kernel_percpu_address - test whether address is from static percpu area - * @addr: address to test - * - * Test whether @addr belongs to in-kernel static percpu area. Module - * static percpu areas are not considered. For those, use - * is_module_percpu_address(). - * - * RETURNS: - * %true if @addr is from in-kernel static percpu area, %false otherwise. - */ -bool is_kernel_percpu_address(unsigned long addr) +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) { #ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; @@ -1304,16 +1293,39 @@ bool is_kernel_percpu_address(unsigned long addr) for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); + void *va = (void *)addr; - if ((void *)addr >= start && (void *)addr < start + static_size) + if (va >= start && va < start + static_size) { + if (can_addr) { + *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(base, get_boot_cpu_id()); + } return true; - } + } + } #endif /* on UP, can't distinguish from other static vars, always false */ return false; } /** + * is_kernel_percpu_address - test whether address is from static percpu area + * @addr: address to test + * + * Test whether @addr belongs to in-kernel static percpu area. Module + * static percpu areas are not considered. For those, use + * is_module_percpu_address(). + * + * RETURNS: + * %true if @addr is from in-kernel static percpu area, %false otherwise. + */ +bool is_kernel_percpu_address(unsigned long addr) +{ + return __is_kernel_percpu_address(addr, NULL); +} + +/** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address * |