diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-26 16:02:40 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-26 16:02:40 -0800 |
commit | 168829ad09ca9cdfdc664b2110d0e3569932c12d (patch) | |
tree | 1b6351ab5766a272dec1fc08f77272a199bba978 | |
parent | 1ae78780eda54023a0fb49ee743dbba39da148e0 (diff) | |
parent | 500543c53a54134ced386aed85cd93cf1363f981 (diff) | |
download | linux-168829ad09ca9cdfdc664b2110d0e3569932c12d.tar.bz2 |
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
"The main changes in this cycle were:
- A comprehensive rewrite of the robust/PI futex code's exit handling
to fix various exit races. (Thomas Gleixner et al)
- Rework the generic REFCOUNT_FULL implementation using
atomic_fetch_* operations so that the performance impact of the
cmpxchg() loops is mitigated for common refcount operations.
With these performance improvements the generic implementation of
refcount_t should be good enough for everybody - and this got
confirmed by performance testing, so remove ARCH_HAS_REFCOUNT and
REFCOUNT_FULL entirely, leaving the generic implementation enabled
unconditionally. (Will Deacon)
- Other misc changes, fixes, cleanups"
* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits)
lkdtm: Remove references to CONFIG_REFCOUNT_FULL
locking/refcount: Remove unused 'refcount_error_report()' function
locking/refcount: Consolidate implementations of refcount_t
locking/refcount: Consolidate REFCOUNT_{MAX,SATURATED} definitions
locking/refcount: Move saturation warnings out of line
locking/refcount: Improve performance of generic REFCOUNT_FULL code
locking/refcount: Move the bulk of the REFCOUNT_FULL implementation into the <linux/refcount.h> header
locking/refcount: Remove unused refcount_*_checked() variants
locking/refcount: Ensure integer operands are treated as signed
locking/refcount: Define constants for saturation and max refcount values
futex: Prevent exit livelock
futex: Provide distinct return value when owner is exiting
futex: Add mutex around futex exit
futex: Provide state handling for exec() as well
futex: Sanitize exit state handling
futex: Mark the begin of futex exit explicitly
futex: Set task::futex_state to DEAD right after handling futex exit
futex: Split futex_mm_release() for exit/exec
exit/exec: Separate mm_release()
futex: Replace PF_EXITPIDONE with a state
...
56 files changed, 686 insertions, 716 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 5f8a5d84dbbe..8bcc1c746142 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -892,27 +892,6 @@ config STRICT_MODULE_RWX config ARCH_HAS_PHYS_TO_DMA bool -config ARCH_HAS_REFCOUNT - bool - help - An architecture selects this when it has implemented refcount_t - using open coded assembly primitives that provide an optimized - refcount_t implementation, possibly at the expense of some full - refcount state checks of CONFIG_REFCOUNT_FULL=y. - - The refcount overflow check behavior, however, must be retained. - Catching overflows is the primary security concern for protecting - against bugs in reference counts. - -config REFCOUNT_FULL - bool "Perform full reference count validation at the expense of speed" - help - Enabling this switches the refcounting infrastructure from a fast - unchecked atomic_t implementation to a fully state checked - implementation, which can be (slightly) slower but provides protections - against various use-after-free conditions that can be used in - security flaw exploits. 
- config HAVE_ARCH_COMPILER_H bool help diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8a50efb559f3..0d3c5d7cceb7 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -117,7 +117,6 @@ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC - select REFCOUNT_FULL select RTC_LIB select SYS_SUPPORTS_APM_EMULATION # Above selects are sorted alphabetically; please add new ones diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fcc6635666b4..afe6412fe769 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -182,7 +182,6 @@ config ARM64 select PCI_SYSCALL if PCI select POWER_RESET select POWER_SUPPLY - select REFCOUNT_FULL select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 38d64030aacf..2e60c80395ab 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -62,7 +62,6 @@ CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y -CONFIG_REFCOUNT_FULL=y CONFIG_LOCK_EVENT_COUNTS=y CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d936174f9d49..9c9bc348c412 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -73,7 +73,6 @@ config X86 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL - select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_SET_MEMORY diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 1b563f9167ea..cd339b88d5d4 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -141,9 +141,6 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - # define _ASM_NOKPROBE(entry) \ .pushsection 
"_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -172,9 +169,6 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) -# define _ASM_EXTABLE_REFCOUNT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) - /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h deleted file mode 100644 index 232f856e0db0..000000000000 --- a/arch/x86/include/asm/refcount.h +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef __ASM_X86_REFCOUNT_H -#define __ASM_X86_REFCOUNT_H -/* - * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from - * PaX/grsecurity. - */ -#include <linux/refcount.h> -#include <asm/bug.h> - -/* - * This is the first portion of the refcount error handling, which lives in - * .text.unlikely, and is jumped to from the CPU flag check (in the - * following macros). This saves the refcount value location into CX for - * the exception handler to use (in mm/extable.c), and then triggers the - * central refcount exception. The fixup address for the exception points - * back to the regular execution flow in .text. - */ -#define _REFCOUNT_EXCEPTION \ - ".pushsection .text..refcount\n" \ - "111:\tlea %[var], %%" _ASM_CX "\n" \ - "112:\t" ASM_UD2 "\n" \ - ASM_UNREACHABLE \ - ".popsection\n" \ - "113:\n" \ - _ASM_EXTABLE_REFCOUNT(112b, 113b) - -/* Trigger refcount exception if refcount result is negative. */ -#define REFCOUNT_CHECK_LT_ZERO \ - "js 111f\n\t" \ - _REFCOUNT_EXCEPTION - -/* Trigger refcount exception if refcount result is zero or negative. */ -#define REFCOUNT_CHECK_LE_ZERO \ - "jz 111f\n\t" \ - REFCOUNT_CHECK_LT_ZERO - -/* Trigger refcount exception unconditionally. 
*/ -#define REFCOUNT_ERROR \ - "jmp 111f\n\t" \ - _REFCOUNT_EXCEPTION - -static __always_inline void refcount_add(unsigned int i, refcount_t *r) -{ - asm volatile(LOCK_PREFIX "addl %1,%0\n\t" - REFCOUNT_CHECK_LT_ZERO - : [var] "+m" (r->refs.counter) - : "ir" (i) - : "cc", "cx"); -} - -static __always_inline void refcount_inc(refcount_t *r) -{ - asm volatile(LOCK_PREFIX "incl %0\n\t" - REFCOUNT_CHECK_LT_ZERO - : [var] "+m" (r->refs.counter) - : : "cc", "cx"); -} - -static __always_inline void refcount_dec(refcount_t *r) -{ - asm volatile(LOCK_PREFIX "decl %0\n\t" - REFCOUNT_CHECK_LE_ZERO - : [var] "+m" (r->refs.counter) - : : "cc", "cx"); -} - -static __always_inline __must_check -bool refcount_sub_and_test(unsigned int i, refcount_t *r) -{ - bool ret = GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", - REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, e, "er", i, "cx"); - - if (ret) { - smp_acquire__after_ctrl_dep(); - return true; - } - - return false; -} - -static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) -{ - bool ret = GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", - REFCOUNT_CHECK_LT_ZERO, - r->refs.counter, e, "cx"); - - if (ret) { - smp_acquire__after_ctrl_dep(); - return true; - } - - return false; -} - -static __always_inline __must_check -bool refcount_add_not_zero(unsigned int i, refcount_t *r) -{ - int c, result; - - c = atomic_read(&(r->refs)); - do { - if (unlikely(c == 0)) - return false; - - result = c + i; - - /* Did we try to increment from/to an undesirable state? 
*/ - if (unlikely(c < 0 || c == INT_MAX || result < c)) { - asm volatile(REFCOUNT_ERROR - : : [var] "m" (r->refs.counter) - : "cc", "cx"); - break; - } - - } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); - - return c != 0; -} - -static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) -{ - return refcount_add_not_zero(1, r); -} - -#endif diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 4d75bc656f97..30bb0bd3b1b8 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -45,55 +45,6 @@ __visible bool ex_handler_fault(const struct exception_table_entry *fixup, EXPORT_SYMBOL_GPL(ex_handler_fault); /* - * Handler for UD0 exception following a failed test against the - * result of a refcount inc/dec/add/sub. - */ -__visible bool ex_handler_refcount(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - /* First unconditionally saturate the refcount. */ - *(int *)regs->cx = INT_MIN / 2; - - /* - * Strictly speaking, this reports the fixup destination, not - * the fault location, and not the actually overflowing - * instruction, which is the instruction before the "js", but - * since that instruction could be a variety of lengths, just - * report the location after the overflow, which should be close - * enough for finding the overflow, as it's at least back in - * the function, having returned from .text.unlikely. - */ - regs->ip = ex_fixup_addr(fixup); - - /* - * This function has been called because either a negative refcount - * value was seen by any of the refcount functions, or a zero - * refcount value was seen by refcount_dec(). - * - * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result - * wrapped around) will be set. Additionally, seeing the refcount - * reach 0 will set ZF (Zero Flag: result was zero). In each of - * these cases we want a report, since it's a boundary condition. 
- * The SF case is not reported since it indicates post-boundary - * manipulations below zero or above INT_MAX. And if none of the - * flags are set, something has gone very wrong, so report it. - */ - if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { - bool zero = regs->flags & X86_EFLAGS_ZF; - - refcount_error_report(regs, zero ? "hit zero" : "overflow"); - } else if ((regs->flags & X86_EFLAGS_SF) == 0) { - /* Report if none of OF, ZF, nor SF are set. */ - refcount_error_report(regs, "unexpected saturation"); - } - - return true; -} -EXPORT_SYMBOL(ex_handler_refcount); - -/* * Handler for when we fail to restore a task's FPU state. We should never get * here because the FPU state of a task using the FPU (task->thread.fpu.state) * should always be valid. However, past bugs have allowed userspace to set diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index 4c766624b20d..4a8b2e5c2af6 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -719,7 +719,7 @@ void drm_connector_list_iter_end(struct drm_connector_list_iter *iter) __drm_connector_put_safe(iter->conn); spin_unlock_irqrestore(&config->connector_list_lock, flags); } - lock_release(&connector_list_iter_dep_map, 0, _RET_IP_); + lock_release(&connector_list_iter_dep_map, _RET_IP_); } EXPORT_SYMBOL(drm_connector_list_iter_end); diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug index 00786a142ff0..1400fce39c58 100644 --- a/drivers/gpu/drm/i915/Kconfig.debug +++ b/drivers/gpu/drm/i915/Kconfig.debug @@ -22,7 +22,6 @@ config DRM_I915_DEBUG depends on DRM_I915 select DEBUG_FS select PREEMPT_COUNT - select REFCOUNT_FULL select I2C_CHARDEV select STACKDEPOT select DRM_DP_AUX_CHARDEV diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c index edd21d14e64f..1a51b3598d63 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c +++ 
b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c @@ -509,14 +509,14 @@ void i915_gem_shrinker_taints_mutex(struct drm_i915_private *i915, I915_MM_SHRINKER, 0, _RET_IP_); mutex_acquire(&mutex->dep_map, 0, 0, _RET_IP_); - mutex_release(&mutex->dep_map, 0, _RET_IP_); + mutex_release(&mutex->dep_map, _RET_IP_); - mutex_release(&i915->drm.struct_mutex.dep_map, 0, _RET_IP_); + mutex_release(&i915->drm.struct_mutex.dep_map, _RET_IP_); fs_reclaim_release(GFP_KERNEL); if (unlock) - mutex_release(&i915->drm.struct_mutex.dep_map, 0, _RET_IP_); + mutex_release(&i915->drm.struct_mutex.dep_map, _RET_IP_); } #define obj_to_i915(obj__) to_i915((obj__)->base.dev) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index 65b5ca74b394..7f647243b3b9 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -52,7 +52,7 @@ static inline unsigned long __timeline_mark_lock(struct intel_context *ce) static inline void __timeline_mark_unlock(struct intel_context *ce, unsigned long flags) { - mutex_release(&ce->timeline->mutex.dep_map, 0, _THIS_IP_); + mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); local_irq_restore(flags); } diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 1c5506822dc7..bc828a9ace84 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1495,7 +1495,7 @@ long i915_request_wait(struct i915_request *rq, dma_fence_remove_callback(&rq->fence, &wait.cb); out: - mutex_release(&rq->engine->gt->reset.mutex.dep_map, 0, _THIS_IP_); + mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_); trace_i915_request_wait_end(rq); return timeout; } diff --git a/drivers/misc/lkdtm/refcount.c b/drivers/misc/lkdtm/refcount.c index 0a146b32da13..de7c5ab528d9 100644 --- a/drivers/misc/lkdtm/refcount.c +++ b/drivers/misc/lkdtm/refcount.c @@ -6,14 +6,6 @@ #include "lkdtm.h" #include <linux/refcount.h> -#ifdef 
CONFIG_REFCOUNT_FULL -#define REFCOUNT_MAX (UINT_MAX - 1) -#define REFCOUNT_SATURATED UINT_MAX -#else -#define REFCOUNT_MAX INT_MAX -#define REFCOUNT_SATURATED (INT_MIN / 2) -#endif - static void overflow_check(refcount_t *ref) { switch (refcount_read(ref)) { @@ -127,7 +119,7 @@ void lkdtm_REFCOUNT_DEC_ZERO(void) static void check_negative(refcount_t *ref, int start) { /* - * CONFIG_REFCOUNT_FULL refuses to move a refcount at all on an + * refcount_t refuses to move a refcount at all on an * over-sub, so we have to track our starting position instead of * looking only at zero-pinning. */ @@ -210,7 +202,6 @@ static void check_from_zero(refcount_t *ref) /* * A refcount_inc() from zero should pin to zero or saturate and may WARN. - * Only CONFIG_REFCOUNT_FULL provides this protection currently. */ void lkdtm_REFCOUNT_INC_ZERO(void) { diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c index 60ff236a3d63..ce8291053af3 100644 --- a/drivers/tty/tty_ldsem.c +++ b/drivers/tty/tty_ldsem.c @@ -303,7 +303,7 @@ static int __ldsem_down_read_nested(struct ld_semaphore *sem, if (count <= 0) { lock_contended(&sem->dep_map, _RET_IP_); if (!down_read_failed(sem, count, timeout)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); return 0; } } @@ -322,7 +322,7 @@ static int __ldsem_down_write_nested(struct ld_semaphore *sem, if ((count & LDSEM_ACTIVE_MASK) != LDSEM_ACTIVE_BIAS) { lock_contended(&sem->dep_map, _RET_IP_); if (!down_write_failed(sem, count, timeout)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); return 0; } } @@ -390,7 +390,7 @@ void ldsem_up_read(struct ld_semaphore *sem) { long count; - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); count = atomic_long_add_return(-LDSEM_READ_BIAS, &sem->count); if (count < 0 && (count & LDSEM_ACTIVE_MASK) == 0) @@ -404,7 +404,7 @@ void ldsem_up_write(struct ld_semaphore *sem) { long count; - 
rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); count = atomic_long_add_return(-LDSEM_WRITE_BIAS, &sem->count); if (count < 0) diff --git a/fs/dcache.c b/fs/dcache.c index e88cf0554e65..f7931b682a0d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1319,7 +1319,7 @@ resume: if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); - spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_); + spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; diff --git a/fs/exec.c b/fs/exec.c index 555e93c7dec8..c27231234764 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1015,7 +1015,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - mm_release(tsk, old_mm); + exec_mm_release(tsk, old_mm); if (old_mm) { sync_mm_rss(old_mm); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index bee8498d7792..b25ebdcabfa3 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -713,7 +713,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) if (need_to_start) jbd2_log_start_commit(journal, tid); - rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); + rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); handle->h_buffer_credits = nblocks; /* * Restore the original nofs context because the journal restart @@ -1848,7 +1848,7 @@ int jbd2_journal_stop(handle_t *handle) wake_up(&journal->j_wait_transaction_locked); } - rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); + rwsem_release(&journal->j_trans_commit_map, _THIS_IP_); if (wait_for_commit) err = jbd2_log_wait_commit(journal, tid); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b2d9f79c4a7c..9d96e6871e1a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -438,7 +438,7 @@ void kernfs_put_active(struct kernfs_node *kn) return; if (kernfs_lockdep(kn)) 
- rwsem_release(&kn->dep_map, 1, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; @@ -476,7 +476,7 @@ static void kernfs_drain(struct kernfs_node *kn) if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); + rwsem_release(&kn->dep_map, _RET_IP_); } kernfs_drain_open_files(kn); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6e774c5ea13b..1c4c51f3df60 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1687,7 +1687,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb, spin_unlock_irqrestore(&lockres->l_lock, flags); #ifdef CONFIG_DEBUG_LOCK_ALLOC if (lockres->l_lockdep_map.key != NULL) - rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); + rwsem_release(&lockres->l_lockdep_map, caller_ip); #endif } diff --git a/include/linux/compat.h b/include/linux/compat.h index 16dafd9f4b86..c4c389c7e1b4 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -410,8 +410,6 @@ struct compat_kexec_segment; struct compat_mq_attr; struct compat_msgbuf; -extern void compat_exit_robust_list(struct task_struct *curr); - #define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) #define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) diff --git a/include/linux/futex.h b/include/linux/futex.h index ccaef0097785..5cc3fed27d4c 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -2,7 +2,9 @@ #ifndef _LINUX_FUTEX_H #define _LINUX_FUTEX_H +#include <linux/sched.h> #include <linux/ktime.h> + #include <uapi/linux/futex.h> struct inode; @@ -48,15 +50,35 @@ union futex_key { #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } #ifdef CONFIG_FUTEX -extern void exit_robust_list(struct task_struct *curr); +enum { + FUTEX_STATE_OK, + FUTEX_STATE_EXITING, + FUTEX_STATE_DEAD, +}; -long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - u32 __user *uaddr2, u32 
val2, u32 val3); -#else -static inline void exit_robust_list(struct task_struct *curr) +static inline void futex_init_task(struct task_struct *tsk) { + tsk->robust_list = NULL; +#ifdef CONFIG_COMPAT + tsk->compat_robust_list = NULL; +#endif + INIT_LIST_HEAD(&tsk->pi_state_list); + tsk->pi_state_cache = NULL; + tsk->futex_state = FUTEX_STATE_OK; + mutex_init(&tsk->futex_exit_mutex); } +void futex_exit_recursive(struct task_struct *tsk); +void futex_exit_release(struct task_struct *tsk); +void futex_exec_release(struct task_struct *tsk); + +long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3); +#else +static inline void futex_init_task(struct task_struct *tsk) { } +static inline void futex_exit_recursive(struct task_struct *tsk) { } +static inline void futex_exit_release(struct task_struct *tsk) { } +static inline void futex_exec_release(struct task_struct *tsk) { } static inline long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) @@ -65,12 +87,4 @@ static inline long do_futex(u32 __user *uaddr, int op, u32 val, } #endif -#ifdef CONFIG_FUTEX_PI -extern void exit_pi_state_list(struct task_struct *curr); -#else -static inline void exit_pi_state_list(struct task_struct *curr) -{ -} -#endif - #endif diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 603fbc4e2f70..564793c24d12 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1170,7 +1170,7 @@ struct journal_s #define jbd2_might_wait_for_commit(j) \ do { \ rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \ - rwsem_release(&j->j_trans_commit_map, 1, _THIS_IP_); \ + rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ } while (0) /* journal feature predicate functions */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d83d403dac2e..09f759228e3f 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -328,13 +328,6 @@ extern int oops_may_print(void); 
void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; -#ifdef CONFIG_ARCH_HAS_REFCOUNT -void refcount_error_report(struct pt_regs *regs, const char *err); -#else -static inline void refcount_error_report(struct pt_regs *regs, const char *err) -{ } -#endif - /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtol(const char *s, unsigned int base, long *res); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b8a835fd611b..c50d01ef1414 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -349,8 +349,7 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); -extern void lock_release(struct lockdep_map *lock, int nested, - unsigned long ip); +extern void lock_release(struct lockdep_map *lock, unsigned long ip); /* * Same "read" as for lock_acquire(), except -1 means any. 
@@ -428,7 +427,7 @@ static inline void lockdep_set_selftest_task(struct task_struct *task) } # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) -# define lock_release(l, n, i) do { } while (0) +# define lock_release(l, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) @@ -591,42 +590,42 @@ static inline void print_irqtrace_events(struct task_struct *curr) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) -#define spin_release(l, n, i) lock_release(l, n, i) +#define spin_release(l, i) lock_release(l, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) -#define rwlock_release(l, n, i) lock_release(l, n, i) +#define rwlock_release(l, i) lock_release(l, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) -#define seqcount_release(l, n, i) lock_release(l, n, i) +#define seqcount_release(l, i) lock_release(l, i) #define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) -#define mutex_release(l, n, i) lock_release(l, n, i) +#define mutex_release(l, i) lock_release(l, i) #define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) -#define rwsem_release(l, n, i) lock_release(l, n, i) +#define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_read(l) 
lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) -#define lock_map_release(l) lock_release(l, 1, _THIS_IP_) +#define lock_map_release(l) lock_release(l, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ - lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ + lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ - lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ + lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) #define lockdep_assert_irqs_enabled() do { \ diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 3998cdf9cd14..ad2ca2a89d5b 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -93,7 +93,7 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) __percpu_up_read(sem); /* Unconditional memory barrier */ preempt_enable(); - rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); + rwsem_release(&sem->rw_sem.dep_map, _RET_IP_); } extern void percpu_down_write(struct percpu_rw_semaphore *); @@ -118,7 +118,7 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, bool read, unsigned long ip) { - lock_release(&sem->rw_sem.dep_map, 1, ip); + lock_release(&sem->rw_sem.dep_map, ip); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER if (!read) atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 185dd9736863..0b7506330c87 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -210,7 +210,7 @@ static inline void rcu_lock_acquire(struct lockdep_map *map) static 
inline void rcu_lock_release(struct lockdep_map *map) { - lock_release(map, 1, _THIS_IP_); + lock_release(map, _THIS_IP_); } extern struct lockdep_map rcu_lock_map; diff --git a/include/linux/refcount.h b/include/linux/refcount.h index e28cce21bad6..0ac50cf62d06 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -1,9 +1,88 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Variant of atomic_t specialized for reference counts. + * + * The interface matches the atomic_t interface (to aid in porting) but only + * provides the few functions one should use for reference counting. + * + * Saturation semantics + * ==================== + * + * refcount_t differs from atomic_t in that the counter saturates at + * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the + * counter and causing 'spurious' use-after-free issues. In order to avoid the + * cost associated with introducing cmpxchg() loops into all of the saturating + * operations, we temporarily allow the counter to take on an unchecked value + * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow + * or overflow has occurred. Although this is racy when multiple threads + * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly + * equidistant from 0 and INT_MAX we minimise the scope for error: + * + * INT_MAX REFCOUNT_SATURATED UINT_MAX + * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) + * +--------------------------------+----------------+----------------+ + * <---------- bad value! ----------> + * + * (in a signed view of the world, the "bad value" range corresponds to + * a negative counter value). 
+ * + * As an example, consider a refcount_inc() operation that causes the counter + * to overflow: + * + * int old = atomic_fetch_add_relaxed(r); + * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) + * if (old < 0) + * atomic_set(r, REFCOUNT_SATURATED); + * + * If another thread also performs a refcount_inc() operation between the two + * atomic operations, then the count will continue to edge closer to 0. If it + * reaches a value of 1 before /any/ of the threads reset it to the saturated + * value, then a concurrent refcount_dec_and_test() may erroneously free the + * underlying object. Given the precise timing details involved with the + * round-robin scheduling of each thread manipulating the refcount and the need + * to hit the race multiple times in succession, there doesn't appear to be a + * practical avenue of attack even if using refcount_add() operations with + * larger increments. + * + * Memory ordering + * =============== + * + * Memory ordering rules are slightly relaxed wrt regular atomic_t functions + * and provide only what is strictly required for refcounts. + * + * The increments are fully relaxed; these will not provide ordering. The + * rationale is that whatever is used to obtain the object we're increasing the + * reference count on will provide the ordering. For locked data structures, + * its the lock acquire, for RCU/lockless data structures its the dependent + * load. + * + * Do note that inc_not_zero() provides a control dependency which will order + * future stores against the inc, this ensures we'll never modify the object + * if we did not in fact acquire a reference. + * + * The decrements will provide release order, such that all the prior loads and + * stores will be issued before, it also provides a control dependency, which + * will order us against the subsequent free(). + * + * The control dependency is against the load of the cmpxchg (ll/sc) that + * succeeded. 
This means the stores aren't fully ordered, but this is fine + * because the 1->0 transition indicates no concurrency. + * + * Note that the allocator is responsible for ordering things between free() + * and alloc(). + * + * The decrements dec_and_test() and sub_and_test() also provide acquire + * ordering on success. + * + */ + #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include <linux/atomic.h> +#include <linux/bug.h> #include <linux/compiler.h> +#include <linux/limits.h> #include <linux/spinlock_types.h> struct mutex; @@ -12,7 +91,7 @@ struct mutex; * struct refcount_t - variant of atomic_t specialized for reference counts * @refs: atomic_t counter field * - * The counter saturates at UINT_MAX and will not move once + * The counter saturates at REFCOUNT_SATURATED and will not move once * there. This avoids wrapping the counter and causing 'spurious' * use-after-free bugs. */ @@ -21,13 +100,25 @@ typedef struct refcount_struct { } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +#define REFCOUNT_MAX INT_MAX +#define REFCOUNT_SATURATED (INT_MIN / 2) + +enum refcount_saturation_type { + REFCOUNT_ADD_NOT_ZERO_OVF, + REFCOUNT_ADD_OVF, + REFCOUNT_ADD_UAF, + REFCOUNT_SUB_UAF, + REFCOUNT_DEC_LEAK, +}; + +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ -static inline void refcount_set(refcount_t *r, unsigned int n) +static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } @@ -43,70 +134,168 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } -extern __must_check bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r); -extern void refcount_add_checked(unsigned int i, refcount_t *r); - -extern __must_check bool refcount_inc_not_zero_checked(refcount_t *r); -extern void refcount_inc_checked(refcount_t *r); - -extern __must_check 
bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r); - -extern __must_check bool refcount_dec_and_test_checked(refcount_t *r); -extern void refcount_dec_checked(refcount_t *r); - -#ifdef CONFIG_REFCOUNT_FULL - -#define refcount_add_not_zero refcount_add_not_zero_checked -#define refcount_add refcount_add_checked - -#define refcount_inc_not_zero refcount_inc_not_zero_checked -#define refcount_inc refcount_inc_checked +/** + * refcount_add_not_zero - add a value to a refcount unless it is 0 + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. 
+ * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +{ + int old = refcount_read(r); -#define refcount_sub_and_test refcount_sub_and_test_checked + do { + if (!old) + break; + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); -#define refcount_dec_and_test refcount_dec_and_test_checked -#define refcount_dec refcount_dec_checked + if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); -#else -# ifdef CONFIG_ARCH_HAS_REFCOUNT -# include <asm/refcount.h> -# else -static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r) -{ - return atomic_add_unless(&r->refs, i, 0); + return old; } -static inline void refcount_add(unsigned int i, refcount_t *r) +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. 
+ */ +static inline void refcount_add(int i, refcount_t *r) { - atomic_add(i, &r->refs); + int old = atomic_fetch_add_relaxed(i, &r->refs); + + if (unlikely(!old)) + refcount_warn_saturate(r, REFCOUNT_ADD_UAF); + else if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } +/** + * refcount_inc_not_zero - increment a refcount unless it is 0 + * @r: the refcount to increment + * + * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED + * and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { - return atomic_add_unless(&r->refs, 1, 0); + return refcount_add_not_zero(1, r); } +/** + * refcount_inc - increment a refcount + * @r: the refcount to increment + * + * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller already has a + * reference on the object. + * + * Will WARN if the refcount is 0, as this represents a possible use-after-free + * condition. + */ static inline void refcount_inc(refcount_t *r) { - atomic_inc(&r->refs); + refcount_add(1, r); } -static inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r) +/** + * refcount_sub_and_test - subtract from a refcount and test if it is 0 + * @i: amount to subtract from the refcount + * @r: the refcount + * + * Similar to atomic_dec_and_test(), but it will WARN, return false and + * ultimately leak on underflow and will fail to decrement when saturated + * at REFCOUNT_SATURATED. 
+ * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_dec(), or one of its variants, should instead be used to + * decrement a reference count. + * + * Return: true if the resulting refcount is 0, false otherwise + */ +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { - return atomic_sub_and_test(i, &r->refs); + int old = atomic_fetch_sub_release(i, &r->refs); + + if (old == i) { + smp_acquire__after_ctrl_dep(); + return true; + } + + if (unlikely(old < 0 || old - i < 0)) + refcount_warn_saturate(r, REFCOUNT_SUB_UAF); + + return false; } +/** + * refcount_dec_and_test - decrement a refcount and test if it is 0 + * @r: the refcount + * + * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to + * decrement when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Return: true if the resulting refcount is 0, false otherwise + */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { - return atomic_dec_and_test(&r->refs); + return refcount_sub_and_test(1, r); } +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * + * Similar to atomic_dec(), it will WARN on underflow and fail to decrement + * when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before. 
+ */ static inline void refcount_dec(refcount_t *r) { - atomic_dec(&r->refs); + if (unlikely(atomic_fetch_sub_release(1, &r->refs) <= 1)) + refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } -# endif /* !CONFIG_ARCH_HAS_REFCOUNT */ -#endif /* CONFIG_REFCOUNT_FULL */ extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 86ebb4bf9c6e..abfb53ab11be 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -215,14 +215,14 @@ static inline void __raw_write_lock(rwlock_t *lock) static inline void __raw_write_unlock(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); + rwlock_release(&lock->dep_map, _RET_IP_); do_raw_write_unlock(lock); preempt_enable(); } static inline void __raw_read_unlock(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); + rwlock_release(&lock->dep_map, _RET_IP_); do_raw_read_unlock(lock); preempt_enable(); } @@ -230,7 +230,7 @@ static inline void __raw_read_unlock(rwlock_t *lock) static inline void __raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); + rwlock_release(&lock->dep_map, _RET_IP_); do_raw_read_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -238,7 +238,7 @@ __raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) static inline void __raw_read_unlock_irq(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); + rwlock_release(&lock->dep_map, _RET_IP_); do_raw_read_unlock(lock); local_irq_enable(); preempt_enable(); @@ -246,7 +246,7 @@ static inline void __raw_read_unlock_irq(rwlock_t *lock) static inline void __raw_read_unlock_bh(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); + rwlock_release(&lock->dep_map, _RET_IP_); do_raw_read_unlock(lock); __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } @@ -254,7 +254,7 @@ static inline 
void __raw_read_unlock_bh(rwlock_t *lock) static inline void __raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); + rwlock_release(&lock->dep_map, _RET_IP_); do_raw_write_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -262,7 +262,7 @@ static inline void __raw_write_unlock_irqrestore(rwlock_t *lock, static inline void __raw_write_unlock_irq(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); + rwlock_release(&lock->dep_map, _RET_IP_); do_raw_write_unlock(lock); local_irq_enable(); preempt_enable(); @@ -270,7 +270,7 @@ static inline void __raw_write_unlock_irq(rwlock_t *lock) static inline void __raw_write_unlock_bh(rwlock_t *lock) { - rwlock_release(&lock->dep_map, 1, _RET_IP_); + rwlock_release(&lock->dep_map, _RET_IP_); do_raw_write_unlock(lock); __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } diff --git a/include/linux/sched.h b/include/linux/sched.h index f72984f94a5c..07e68d9f5dc4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1059,6 +1059,8 @@ struct task_struct { #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; + struct mutex futex_exit_mutex; + unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; @@ -1447,7 +1449,6 @@ extern struct pid *cad_pid; */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ -#define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index e6770012db18..c49257a3b510 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -117,8 +117,10 @@ extern struct mm_struct *get_task_mm(struct task_struct *task); * 
succeeds. */ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); -/* Remove the current tasks stale references to the old mm_struct */ -extern void mm_release(struct task_struct *, struct mm_struct *); +/* Remove the current tasks stale references to the old mm_struct on exit() */ +extern void exit_mm_release(struct task_struct *, struct mm_struct *); +/* Remove the current tasks stale references to the old mm_struct on exec() */ +extern void exec_mm_release(struct task_struct *, struct mm_struct *); #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index bcf4cf26b8c8..0491d963d47e 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -79,7 +79,7 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) local_irq_save(flags); seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_); - seqcount_release(&l->dep_map, 1, _RET_IP_); + seqcount_release(&l->dep_map, _RET_IP_); local_irq_restore(flags); } @@ -384,7 +384,7 @@ static inline void write_seqcount_begin(seqcount_t *s) static inline void write_seqcount_end(seqcount_t *s) { - seqcount_release(&s->dep_map, 1, _RET_IP_); + seqcount_release(&s->dep_map, _RET_IP_); raw_write_seqcount_end(s); } diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index b762eaba4cdf..19a9be9d97ee 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -147,7 +147,7 @@ static inline void __raw_spin_lock(raw_spinlock_t *lock) static inline void __raw_spin_unlock(raw_spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); + spin_release(&lock->dep_map, _RET_IP_); do_raw_spin_unlock(lock); preempt_enable(); } @@ -155,7 +155,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { - spin_release(&lock->dep_map, 1, _RET_IP_); + 
spin_release(&lock->dep_map, _RET_IP_); do_raw_spin_unlock(lock); local_irq_restore(flags); preempt_enable(); @@ -163,7 +163,7 @@ static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock, static inline void __raw_spin_unlock_irq(raw_spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); + spin_release(&lock->dep_map, _RET_IP_); do_raw_spin_unlock(lock); local_irq_enable(); preempt_enable(); @@ -171,7 +171,7 @@ static inline void __raw_spin_unlock_irq(raw_spinlock_t *lock) static inline void __raw_spin_unlock_bh(raw_spinlock_t *lock) { - spin_release(&lock->dep_map, 1, _RET_IP_); + spin_release(&lock->dep_map, _RET_IP_); do_raw_spin_unlock(lock); __local_bh_enable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); } diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 3af7c0e03be5..d7554252404c 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -182,7 +182,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) { #ifdef CONFIG_DEBUG_MUTEXES - mutex_release(&ctx->dep_map, 0, _THIS_IP_); + mutex_release(&ctx->dep_map, _THIS_IP_); DEBUG_LOCKS_WARN_ON(ctx->acquired); if (!IS_ENABLED(CONFIG_PROVE_LOCKING)) diff --git a/include/net/sock.h b/include/net/sock.h index e7f697174f84..87d54ef57f00 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1488,7 +1488,7 @@ static inline void sock_release_ownership(struct sock *sk) sk->sk_lock.owned = 0; /* The sk_lock has mutex_unlock() semantics: */ - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 173e983619d7..caca752ee5e6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -339,7 +339,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * up_read_non_owner(). The rwsem_release() is called * here to release the lock from lockdep's perspective. 
*/ - rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_); + rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_); } } diff --git a/kernel/cpu.c b/kernel/cpu.c index e2cad3ee2ead..a59cc980adad 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -336,7 +336,7 @@ static void lockdep_acquire_cpus_lock(void) static void lockdep_release_cpus_lock(void) { - rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); + rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_); } /* diff --git a/kernel/exit.c b/kernel/exit.c index f2d20ab74422..0bac4b60d5f3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -437,7 +437,7 @@ static void exit_mm(void) struct mm_struct *mm = current->mm; struct core_state *core_state; - mm_release(current, mm); + exit_mm_release(current, mm); if (!mm) return; sync_mm_rss(mm); @@ -746,32 +746,12 @@ void __noreturn do_exit(long code) */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); - /* - * We can do this unlocked here. The futex code uses - * this flag just to verify whether the pi state - * cleanup has been done or not. In the worst case it - * loops once more. We pretend that the cleanup was - * done as there is no way to return. Either the - * OWNER_DIED bit is set by now or we push the blocked - * task into the wait for ever nirwana as well. - */ - tsk->flags |= PF_EXITPIDONE; + futex_exit_recursive(tsk); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } exit_signals(tsk); /* sets PF_EXITING */ - /* - * Ensure that all new tsk->pi_lock acquisitions must observe - * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). - */ - smp_mb(); - /* - * Ensure that we must observe the pi_state in exit_mm() -> - * mm_release() -> exit_pi_state_list(). 
- */ - raw_spin_lock_irq(&tsk->pi_lock); - raw_spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", @@ -846,12 +826,6 @@ void __noreturn do_exit(long code) * Make sure we are holding no locks: */ debug_check_no_locks_held(); - /* - * We can do this unlocked here. The futex code uses this flag - * just to verify whether the pi state cleanup has been done - * or not. In the worst case it loops once more. - */ - tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) exit_io_context(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 35f91ee91057..00b64f41c2b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1283,24 +1283,8 @@ static int wait_for_vfork_done(struct task_struct *child, * restoring the old one. . . * Eric Biederman 10 January 1998 */ -void mm_release(struct task_struct *tsk, struct mm_struct *mm) +static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { - /* Get rid of any futexes when releasing the mm */ -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) { - exit_robust_list(tsk); - tsk->robust_list = NULL; - } -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) { - compat_exit_robust_list(tsk); - tsk->compat_robust_list = NULL; - } -#endif - if (unlikely(!list_empty(&tsk->pi_state_list))) - exit_pi_state_list(tsk); -#endif - uprobe_free_utask(tsk); /* Get rid of any cached register state */ @@ -1333,6 +1317,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) complete_vfork_done(tsk); } +void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + futex_exit_release(tsk); + mm_release(tsk, mm); +} + +void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) +{ + futex_exec_release(tsk); + mm_release(tsk, mm); +} + /** * dup_mm() - duplicates an existing mm structure * @tsk: the task_struct with which the new mm will be associated. 
@@ -2124,14 +2120,8 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_BLOCK p->plug = NULL; #endif -#ifdef CONFIG_FUTEX - p->robust_list = NULL; -#ifdef CONFIG_COMPAT - p->compat_robust_list = NULL; -#endif - INIT_LIST_HEAD(&p->pi_state_list); - p->pi_state_cache = NULL; -#endif + futex_init_task(p); + /* * sigaltstack should be cleared when sharing the same VM */ diff --git a/kernel/futex.c b/kernel/futex.c index bd18f60e4c6c..03c518e9747e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -325,6 +325,12 @@ static inline bool should_fail_futex(bool fshared) } #endif /* CONFIG_FAIL_FUTEX */ +#ifdef CONFIG_COMPAT +static void compat_exit_robust_list(struct task_struct *curr); +#else +static inline void compat_exit_robust_list(struct task_struct *curr) { } +#endif + static inline void futex_get_mm(union futex_key *key) { mmgrab(key->private.mm); @@ -890,7 +896,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ -void exit_pi_state_list(struct task_struct *curr) +static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; @@ -960,7 +966,8 @@ void exit_pi_state_list(struct task_struct *curr) } raw_spin_unlock_irq(&curr->pi_lock); } - +#else +static inline void exit_pi_state_list(struct task_struct *curr) { } #endif /* @@ -1169,16 +1176,47 @@ out_error: return ret; } +/** + * wait_for_owner_exiting - Block until the owner has exited + * @exiting: Pointer to the exiting task + * + * Caller must hold a refcount on @exiting. + */ +static void wait_for_owner_exiting(int ret, struct task_struct *exiting) +{ + if (ret != -EBUSY) { + WARN_ON_ONCE(exiting); + return; + } + + if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) + return; + + mutex_lock(&exiting->futex_exit_mutex); + /* + * No point in doing state checking here. 
If the waiter got here + * while the task was in exec()->futex_exec_release() then it can + * have any FUTEX_STATE_* value when the waiter has acquired the + * mutex. OK, if running, EXITING or DEAD if it reached exit() + * already. Highly unlikely and not a problem. Just one more round + * through the futex maze. + */ + mutex_unlock(&exiting->futex_exit_mutex); + + put_task_struct(exiting); +} + static int handle_exit_race(u32 __user *uaddr, u32 uval, struct task_struct *tsk) { u32 uval2; /* - * If PF_EXITPIDONE is not yet set, then try again. + * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the + * caller that the alleged owner is busy. */ - if (tsk && !(tsk->flags & PF_EXITPIDONE)) - return -EAGAIN; + if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) + return -EBUSY; /* * Reread the user space value to handle the following situation: @@ -1196,8 +1234,9 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, * *uaddr = 0xC0000000; tsk = get_task(PID); * } if (!tsk->flags & PF_EXITING) { * ... attach(); - * tsk->flags |= PF_EXITPIDONE; } else { - * if (!(tsk->flags & PF_EXITPIDONE)) + * tsk->futex_state = } else { + * FUTEX_STATE_DEAD; if (tsk->futex_state != + * FUTEX_STATE_DEAD) * return -EAGAIN; * return -ESRCH; <--- FAIL * } @@ -1228,7 +1267,8 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, * it after doing proper sanity checks. */ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, - struct futex_pi_state **ps) + struct futex_pi_state **ps, + struct task_struct **exiting) { pid_t pid = uval & FUTEX_TID_MASK; struct futex_pi_state *pi_state; @@ -1253,22 +1293,33 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, } /* - * We need to look at the task state flags to figure out, - * whether the task is exiting. 
To protect against the do_exit - * change of the task flags, we do this protected by - * p->pi_lock: + * We need to look at the task state to figure out, whether the + * task is exiting. To protect against the change of the task state + * in futex_exit_release(), we do this protected by p->pi_lock: */ raw_spin_lock_irq(&p->pi_lock); - if (unlikely(p->flags & PF_EXITING)) { + if (unlikely(p->futex_state != FUTEX_STATE_OK)) { /* - * The task is on the way out. When PF_EXITPIDONE is - * set, we know that the task has finished the - * cleanup: + * The task is on the way out. When the futex state is + * FUTEX_STATE_DEAD, we know that the task has finished + * the cleanup: */ int ret = handle_exit_race(uaddr, uval, p); raw_spin_unlock_irq(&p->pi_lock); - put_task_struct(p); + /* + * If the owner task is between FUTEX_STATE_EXITING and + * FUTEX_STATE_DEAD then store the task pointer and keep + * the reference on the task struct. The calling code will + * drop all locks, wait for the task to reach + * FUTEX_STATE_DEAD and then drop the refcount. This is + * required to prevent a live lock when the current task + * preempted the exiting task between the two states. + */ + if (ret == -EBUSY) + *exiting = p; + else + put_task_struct(p); return ret; } @@ -1307,7 +1358,8 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, static int lookup_pi_state(u32 __user *uaddr, u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + union futex_key *key, struct futex_pi_state **ps, + struct task_struct **exiting) { struct futex_q *top_waiter = futex_top_waiter(hb, key); @@ -1322,7 +1374,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, * We are the first waiter - try to look up the owner based on * @uval and attach to it. 
*/ - return attach_to_pi_owner(uaddr, uval, key, ps); + return attach_to_pi_owner(uaddr, uval, key, ps, exiting); } static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) @@ -1350,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * lookup * @task: the task to perform the atomic lock work for. This will * be "current" except in the case of requeue pi. + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Return: @@ -1358,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * - <0 - error * * The hb->lock and futex_key refs shall be held by the caller. + * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. */ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, - struct task_struct *task, int set_waiters) + struct task_struct *task, + struct task_struct **exiting, + int set_waiters) { u32 uval, newval, vpid = task_pid_vnr(task); struct futex_q *top_waiter; @@ -1432,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * attach to the owner. If that fails, no harm done, we only * set the FUTEX_WAITERS bit in the user space variable. */ - return attach_to_pi_owner(uaddr, newval, key, ps); + return attach_to_pi_owner(uaddr, newval, key, ps, exiting); } /** @@ -1480,7 +1540,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) /* * Queue the task for later wakeup for after we've released - * the hb->lock. wake_q_add() grabs reference to p. + * the hb->lock. 
*/ wake_q_add_safe(wake_q, p); } @@ -1850,6 +1910,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * @key1: the from futex key * @key2: the to futex key * @ps: address to store the pi_state pointer + * @exiting: Pointer to store the task pointer of the owner task + * which is in the middle of exiting * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Try and get the lock on behalf of the top waiter if we can do it atomically. @@ -1857,16 +1919,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * + * @exiting is only set when the return value is -EBUSY. If so, this holds + * a refcount on the exiting task on return and the caller needs to drop it + * after waiting for the exit to complete. + * * Return: * - 0 - failed to acquire the lock atomically; * - >0 - acquired the lock, return value is vpid of the top_waiter * - <0 - error */ -static int futex_proxy_trylock_atomic(u32 __user *pifutex, - struct futex_hash_bucket *hb1, - struct futex_hash_bucket *hb2, - union futex_key *key1, union futex_key *key2, - struct futex_pi_state **ps, int set_waiters) +static int +futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key1, + union futex_key *key2, struct futex_pi_state **ps, + struct task_struct **exiting, int set_waiters) { struct futex_q *top_waiter = NULL; u32 curval; @@ -1903,7 +1969,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, */ vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, - set_waiters); + exiting, set_waiters); if (ret == 1) { requeue_pi_wake_futex(top_waiter, key2, hb2); return vpid; @@ -2032,6 +2098,8 @@ retry_private: } if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + struct task_struct *exiting = NULL; + /* * 
Attempt to acquire uaddr2 and wake the top waiter. If we * intend to requeue waiters, force setting the FUTEX_WAITERS @@ -2039,7 +2107,8 @@ retry_private: * faults rather in the requeue loop below. */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, nr_requeue); + &key2, &pi_state, + &exiting, nr_requeue); /* * At this point the top_waiter has either taken uaddr2 or is @@ -2066,7 +2135,8 @@ retry_private: * If that call succeeds then we have pi_state and an * initial refcount on it. */ - ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, + &pi_state, &exiting); } switch (ret) { @@ -2084,17 +2154,24 @@ retry_private: if (!ret) goto retry; goto out; + case -EBUSY: case -EAGAIN: /* * Two reasons for this: - * - Owner is exiting and we just wait for the + * - EBUSY: Owner is exiting and we just wait for the * exit to complete. - * - The user space value changed. + * - EAGAIN: The user space value changed. */ double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. 
+ */ + wait_for_owner_exiting(ret, exiting); cond_resched(); goto retry; default: @@ -2801,6 +2878,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to; struct futex_pi_state *pi_state = NULL; + struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; @@ -2822,7 +2900,8 @@ retry: retry_private: hb = queue_lock(&q); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, + &exiting, 0); if (unlikely(ret)) { /* * Atomic work succeeded and we got the lock, @@ -2835,15 +2914,22 @@ retry_private: goto out_unlock_put_key; case -EFAULT: goto uaddr_faulted; + case -EBUSY: case -EAGAIN: /* * Two reasons for this: - * - Task is exiting and we just wait for the + * - EBUSY: Task is exiting and we just wait for the * exit to complete. - * - The user space value changed. + * - EAGAIN: The user space value changed. */ queue_unlock(hb); put_futex_key(&q.key); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); cond_resched(); goto retry; default: @@ -3452,11 +3538,16 @@ err_unlock: return ret; } +/* Constants for the pending_op argument of handle_futex_death */ +#define HANDLE_DEATH_PENDING true +#define HANDLE_DEATH_LIST false + /* * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) +static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, + bool pi, bool pending_op) { u32 uval, uninitialized_var(nval), mval; int err; @@ -3469,6 +3560,42 @@ retry: if (get_user(uval, uaddr)) return -1; + /* + * Special case for regular (non PI) futexes. 
The unlock path in + * user space has two race scenarios: + * + * 1. The unlock path releases the user space futex value and + * before it can execute the futex() syscall to wake up + * waiters it is killed. + * + * 2. A woken up waiter is killed before it can acquire the + * futex in user space. + * + * In both cases the TID validation below prevents a wakeup of + * potential waiters which can cause these waiters to block + * forever. + * + * In both cases the following conditions are met: + * + * 1) task->robust_list->list_op_pending != NULL + * @pending_op == true + * 2) User space futex value == 0 + * 3) Regular futex: @pi == false + * + * If these conditions are met, it is safe to attempt waking up a + * potential waiter without touching the user space futex value and + * trying to set the OWNER_DIED bit. The user space futex value is + * uncontended and the rest of the user space mutex state is + * consistent, so a woken waiter will just take over the + * uncontended futex. Setting the OWNER_DIED bit would create + * inconsistent state and malfunction of the user space owner died + * handling. + */ + if (pending_op && !pi && !uval) { + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; + } + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) return 0; @@ -3547,7 +3674,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, * * We silently return on any sign of list-walking problem. 
*/ -void exit_robust_list(struct task_struct *curr) +static void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; @@ -3588,10 +3715,11 @@ void exit_robust_list(struct task_struct *curr) * A pending lock might already be on the list, so * don't process it twice: */ - if (entry != pending) + if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, - curr, pi)) + curr, pi, HANDLE_DEATH_LIST)) return; + } if (rc) return; entry = next_entry; @@ -3605,9 +3733,118 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } - if (pending) + if (pending) { handle_futex_death((void __user *)pending + futex_offset, - curr, pip); + curr, pip, HANDLE_DEATH_PENDING); + } +} + +static void futex_cleanup(struct task_struct *tsk) +{ + if (unlikely(tsk->robust_list)) { + exit_robust_list(tsk); + tsk->robust_list = NULL; + } + +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) { + compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } +#endif + + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); +} + +/** + * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD + * @tsk: task to set the state on + * + * Set the futex exit state of the task lockless. The futex waiter code + * observes that state when a task is exiting and loops until the task has + * actually finished the futex cleanup. The worst case for this is that the + * waiter runs through the wait loop until the state becomes visible. + * + * This is called from the recursive fault handling path in do_exit(). + * + * This is best effort. Either the futex exit code has run already or + * not. If the OWNER_DIED bit has been set on the futex then the waiter can + * take it over. If not, the problem is pushed back to user space. 
If the + * futex exit code did not run yet, then an already queued waiter might + * block forever, but there is nothing which can be done about that. + */ +void futex_exit_recursive(struct task_struct *tsk) +{ + /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ + if (tsk->futex_state == FUTEX_STATE_EXITING) + mutex_unlock(&tsk->futex_exit_mutex); + tsk->futex_state = FUTEX_STATE_DEAD; +} + +static void futex_cleanup_begin(struct task_struct *tsk) +{ + /* + * Prevent various race issues against a concurrent incoming waiter + * including live locks by forcing the waiter to block on + * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in + * attach_to_pi_owner(). + */ + mutex_lock(&tsk->futex_exit_mutex); + + /* + * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. + * + * This ensures that all subsequent checks of tsk->futex_state in + * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with + * tsk->pi_lock held. + * + * It guarantees also that a pi_state which was queued right before + * the state change under tsk->pi_lock by a concurrent waiter must + * be observed in exit_pi_state_list(). + */ + raw_spin_lock_irq(&tsk->pi_lock); + tsk->futex_state = FUTEX_STATE_EXITING; + raw_spin_unlock_irq(&tsk->pi_lock); +} + +static void futex_cleanup_end(struct task_struct *tsk, int state) +{ + /* + * Lockless store. The only side effect is that an observer might + * take another loop until it becomes visible. + */ + tsk->futex_state = state; + /* + * Drop the exit protection. This unblocks waiters which observed + * FUTEX_STATE_EXITING to reevaluate the state. + */ + mutex_unlock(&tsk->futex_exit_mutex); +} + +void futex_exec_release(struct task_struct *tsk) +{ + /* + * The state handling is done for consistency, but in the case of + * exec() there is no way to prevent further damage as the PID stays + * the same. 
But for the unlikely and arguably buggy case that a + * futex is held on exec(), this provides at least as much state + * consistency protection as is possible. + */ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + /* + * Reset the state to FUTEX_STATE_OK. The task is alive and about to + * exec a new binary. + */ + futex_cleanup_end(tsk, FUTEX_STATE_OK); +} + +void futex_exit_release(struct task_struct *tsk) +{ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, @@ -3737,7 +3974,7 @@ static void __user *futex_uaddr(struct robust_list __user *entry, * * We silently return on any sign of list-walking problem. */ -void compat_exit_robust_list(struct task_struct *curr) +static void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; @@ -3784,7 +4021,8 @@ void compat_exit_robust_list(struct task_struct *curr) if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); - if (handle_futex_death(uaddr, curr, pi)) + if (handle_futex_death(uaddr, curr, pi, + HANDLE_DEATH_LIST)) return; } if (rc) @@ -3803,7 +4041,7 @@ void compat_exit_robust_list(struct task_struct *curr) if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); - handle_futex_death(uaddr, curr, pip); + handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); } } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 233459c03b5a..32282e7112d3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4208,11 +4208,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) } /* - * Remove the lock to the list of currently held locks - this gets + * Remove the lock from the list of currently held locks - this gets * called on mutex_unlock()/spin_unlock*() (or on a failed * 
mutex_lock_interruptible()). - * - * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int __lock_release(struct lockdep_map *lock, unsigned long ip) @@ -4491,8 +4489,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); -void lock_release(struct lockdep_map *lock, int nested, - unsigned long ip) +void lock_release(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 468a9b8422e3..54cc5f9286e9 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -733,6 +733,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne */ void __sched mutex_unlock(struct mutex *lock) { +#ifdef CONFIG_DEBUG_MUTEXES + WARN_ON(in_interrupt()); +#endif #ifndef CONFIG_DEBUG_LOCK_ALLOC if (__mutex_unlock_fast(lock)) return; @@ -1091,7 +1094,7 @@ err: err_early_kill: spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); - mutex_release(&lock->dep_map, 1, ip); + mutex_release(&lock->dep_map, ip); preempt_enable(); return ret; } @@ -1225,7 +1228,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne DEFINE_WAKE_Q(wake_q); unsigned long owner; - mutex_release(&lock->dep_map, 1, ip); + mutex_release(&lock->dep_map, ip); /* * Release the lock before (potentially) taking the spinlock such that @@ -1413,6 +1416,7 @@ int __sched mutex_trylock(struct mutex *lock) #ifdef CONFIG_DEBUG_MUTEXES DEBUG_LOCKS_WARN_ON(lock->magic != lock); + WARN_ON(in_interrupt()); #endif locked = __mutex_trylock(lock); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2874bf556162..851bbb10819d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1517,7 +1517,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); if 
(ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); + mutex_release(&lock->dep_map, _RET_IP_); return ret; } @@ -1561,7 +1561,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) RT_MUTEX_MIN_CHAINWALK, rt_mutex_slowlock); if (ret) - mutex_release(&lock->dep_map, 1, _RET_IP_); + mutex_release(&lock->dep_map, _RET_IP_); return ret; } @@ -1600,7 +1600,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); */ void __sched rt_mutex_unlock(struct rt_mutex *lock) { - mutex_release(&lock->dep_map, 1, _RET_IP_); + mutex_release(&lock->dep_map, _RET_IP_); rt_mutex_fastunlock(lock, rt_mutex_slowunlock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index eef04551eae7..44e68761f432 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1504,7 +1504,7 @@ int __sched down_read_killable(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); return -EINTR; } @@ -1546,7 +1546,7 @@ int __sched down_write_killable(struct rw_semaphore *sem) if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); return -EINTR; } @@ -1573,7 +1573,7 @@ EXPORT_SYMBOL(down_write_trylock); */ void up_read(struct rw_semaphore *sem) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); __up_read(sem); } EXPORT_SYMBOL(up_read); @@ -1583,7 +1583,7 @@ EXPORT_SYMBOL(up_read); */ void up_write(struct rw_semaphore *sem) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); __up_write(sem); } EXPORT_SYMBOL(up_write); @@ -1639,7 +1639,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, 
__down_write_killable)) { - rwsem_release(&sem->dep_map, 1, _RET_IP_); + rwsem_release(&sem->dep_map, _RET_IP_); return -EINTR; } diff --git a/kernel/panic.c b/kernel/panic.c index f470a038b05b..b69ee9e76cb2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -671,17 +671,6 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif -#ifdef CONFIG_ARCH_HAS_REFCOUNT -void refcount_error_report(struct pt_regs *regs, const char *err) -{ - WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", - err, (void *)instruction_pointer(regs), - current->comm, task_pid_nr(current), - from_kuid_munged(&init_user_ns, current_uid()), - from_kuid_munged(&init_user_ns, current_euid())); -} -#endif - core_param(panic, panic_timeout, int, 0644); core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ca65327a6de8..c8be5a0f5259 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -248,7 +248,7 @@ static void __up_console_sem(unsigned long ip) { unsigned long flags; - mutex_release(&console_lock_dep_map, 1, ip); + mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); @@ -1679,20 +1679,20 @@ static int console_lock_spinning_disable_and_check(void) raw_spin_unlock(&console_owner_lock); if (!waiter) { - spin_release(&console_owner_dep_map, 1, _THIS_IP_); + spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); - spin_release(&console_owner_dep_map, 1, _THIS_IP_); + spin_release(&console_owner_dep_map, _THIS_IP_); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. 
*/ - mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } @@ -1746,7 +1746,7 @@ static int console_trylock_spinning(void) /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); - spin_release(&console_owner_dep_map, 1, _THIS_IP_); + spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d82e2f6ac41d..90e4b00ace89 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3106,7 +3106,7 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = next; diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index a1705545e6ac..14f44f59e733 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -1475,7 +1475,7 @@ static void ww_test_edeadlk_normal(void) mutex_lock(&o2.base); o2.ctx = &t2; - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); WWAI(&t); t2 = t; @@ -1500,7 +1500,7 @@ static void ww_test_edeadlk_normal_slow(void) int ret; mutex_lock(&o2.base); - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; WWAI(&t); @@ -1527,7 +1527,7 @@ static void ww_test_edeadlk_no_unlock(void) mutex_lock(&o2.base); o2.ctx = &t2; - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); WWAI(&t); t2 = t; @@ -1551,7 +1551,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) int ret; mutex_lock(&o2.base); - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; WWAI(&t); @@ -1576,7 +1576,7 @@ 
static void ww_test_edeadlk_acquire_more(void) int ret; mutex_lock(&o2.base); - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; WWAI(&t); @@ -1597,7 +1597,7 @@ static void ww_test_edeadlk_acquire_more_slow(void) int ret; mutex_lock(&o2.base); - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; WWAI(&t); @@ -1618,11 +1618,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk(void) int ret; mutex_lock(&o2.base); - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; mutex_lock(&o3.base); - mutex_release(&o3.base.dep_map, 1, _THIS_IP_); + mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; WWAI(&t); @@ -1644,11 +1644,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk_slow(void) int ret; mutex_lock(&o2.base); - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; mutex_lock(&o3.base); - mutex_release(&o3.base.dep_map, 1, _THIS_IP_); + mutex_release(&o3.base.dep_map, _THIS_IP_); o3.ctx = &t2; WWAI(&t); @@ -1669,7 +1669,7 @@ static void ww_test_edeadlk_acquire_wrong(void) int ret; mutex_lock(&o2.base); - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; WWAI(&t); @@ -1694,7 +1694,7 @@ static void ww_test_edeadlk_acquire_wrong_slow(void) int ret; mutex_lock(&o2.base); - mutex_release(&o2.base.dep_map, 1, _THIS_IP_); + mutex_release(&o2.base.dep_map, _THIS_IP_); o2.ctx = &t2; WWAI(&t); diff --git a/lib/refcount.c b/lib/refcount.c index 6e904af0fb3e..ebac8b7d15a7 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -1,41 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Variant of atomic_t specialized for reference counts. - * - * The interface matches the atomic_t interface (to aid in porting) but only - * provides the few functions one should use for reference counting. 
- * - * It differs in that the counter saturates at UINT_MAX and will not move once - * there. This avoids wrapping the counter and causing 'spurious' - * use-after-free issues. - * - * Memory ordering rules are slightly relaxed wrt regular atomic_t functions - * and provide only what is strictly required for refcounts. - * - * The increments are fully relaxed; these will not provide ordering. The - * rationale is that whatever is used to obtain the object we're increasing the - * reference count on will provide the ordering. For locked data structures, - * its the lock acquire, for RCU/lockless data structures its the dependent - * load. - * - * Do note that inc_not_zero() provides a control dependency which will order - * future stores against the inc, this ensures we'll never modify the object - * if we did not in fact acquire a reference. - * - * The decrements will provide release order, such that all the prior loads and - * stores will be issued before, it also provides a control dependency, which - * will order us against the subsequent free(). - * - * The control dependency is against the load of the cmpxchg (ll/sc) that - * succeeded. This means the stores aren't fully ordered, but this is fine - * because the 1->0 transition indicates no concurrency. - * - * Note that the allocator is responsible for ordering things between free() - * and alloc(). - * - * The decrements dec_and_test() and sub_and_test() also provide acquire - * ordering on success. - * + * Out-of-line refcount functions. */ #include <linux/mutex.h> @@ -43,199 +8,33 @@ #include <linux/spinlock.h> #include <linux/bug.h> -/** - * refcount_add_not_zero_checked - add a value to a refcount unless it is 0 - * @i: the value to add to the refcount - * @r: the refcount - * - * Will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). 
It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. - * - * Return: false if the passed refcount is 0, true otherwise - */ -bool refcount_add_not_zero_checked(unsigned int i, refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - if (!val) - return false; - - if (unlikely(val == UINT_MAX)) - return true; - - new = val + i; - if (new < val) - new = UINT_MAX; - - } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); - - WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); - - return true; -} -EXPORT_SYMBOL(refcount_add_not_zero_checked); - -/** - * refcount_add_checked - add a value to a refcount - * @i: the value to add to the refcount - * @r: the refcount - * - * Similar to atomic_add(), but will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_inc(), or one of its variants, should instead be used to - * increment a reference count. 
- */ -void refcount_add_checked(unsigned int i, refcount_t *r) -{ - WARN_ONCE(!refcount_add_not_zero_checked(i, r), "refcount_t: addition on 0; use-after-free.\n"); -} -EXPORT_SYMBOL(refcount_add_checked); - -/** - * refcount_inc_not_zero_checked - increment a refcount unless it is 0 - * @r: the refcount to increment - * - * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller has guaranteed the - * object memory to be stable (RCU, etc.). It does provide a control dependency - * and thereby orders future stores. See the comment on top. - * - * Return: true if the increment was successful, false otherwise - */ -bool refcount_inc_not_zero_checked(refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - new = val + 1; - - if (!val) - return false; - - if (unlikely(!new)) - return true; - - } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); +#define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") - WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); - - return true; -} -EXPORT_SYMBOL(refcount_inc_not_zero_checked); - -/** - * refcount_inc_checked - increment a refcount - * @r: the refcount to increment - * - * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN. - * - * Provides no memory ordering, it is assumed the caller already has a - * reference on the object. - * - * Will WARN if the refcount is 0, as this represents a possible use-after-free - * condition. 
- */ -void refcount_inc_checked(refcount_t *r) +void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { - WARN_ONCE(!refcount_inc_not_zero_checked(r), "refcount_t: increment on 0; use-after-free.\n"); -} -EXPORT_SYMBOL(refcount_inc_checked); - -/** - * refcount_sub_and_test_checked - subtract from a refcount and test if it is 0 - * @i: amount to subtract from the refcount - * @r: the refcount - * - * Similar to atomic_dec_and_test(), but it will WARN, return false and - * ultimately leak on underflow and will fail to decrement when saturated - * at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides an acquire ordering on success such that free() - * must come after. - * - * Use of this function is not recommended for the normal reference counting - * use case in which references are taken and released one at a time. In these - * cases, refcount_dec(), or one of its variants, should instead be used to - * decrement a reference count. 
- * - * Return: true if the resulting refcount is 0, false otherwise - */ -bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r) -{ - unsigned int new, val = atomic_read(&r->refs); - - do { - if (unlikely(val == UINT_MAX)) - return false; - - new = val - i; - if (new > val) { - WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); - return false; - } - - } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); - - if (!new) { - smp_acquire__after_ctrl_dep(); - return true; + refcount_set(r, REFCOUNT_SATURATED); + + switch (t) { + case REFCOUNT_ADD_NOT_ZERO_OVF: + REFCOUNT_WARN("saturated; leaking memory"); + break; + case REFCOUNT_ADD_OVF: + REFCOUNT_WARN("saturated; leaking memory"); + break; + case REFCOUNT_ADD_UAF: + REFCOUNT_WARN("addition on 0; use-after-free"); + break; + case REFCOUNT_SUB_UAF: + REFCOUNT_WARN("underflow; use-after-free"); + break; + case REFCOUNT_DEC_LEAK: + REFCOUNT_WARN("decrement hit 0; leaking memory"); + break; + default: + REFCOUNT_WARN("unknown saturation event!?"); } - return false; - -} -EXPORT_SYMBOL(refcount_sub_and_test_checked); - -/** - * refcount_dec_and_test_checked - decrement a refcount and test if it is 0 - * @r: the refcount - * - * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to - * decrement when saturated at UINT_MAX. - * - * Provides release memory ordering, such that prior loads and stores are done - * before, and provides an acquire ordering on success such that free() - * must come after. - * - * Return: true if the resulting refcount is 0, false otherwise - */ -bool refcount_dec_and_test_checked(refcount_t *r) -{ - return refcount_sub_and_test_checked(1, r); -} -EXPORT_SYMBOL(refcount_dec_and_test_checked); - -/** - * refcount_dec_checked - decrement a refcount - * @r: the refcount - * - * Similar to atomic_dec(), it will WARN on underflow and fail to decrement - * when saturated at UINT_MAX. 
- * - * Provides release memory ordering, such that prior loads and stores are done - * before. - */ -void refcount_dec_checked(refcount_t *r) -{ - WARN_ONCE(refcount_dec_and_test_checked(r), "refcount_t: decrement hit 0; leaking memory.\n"); } -EXPORT_SYMBOL(refcount_dec_checked); +EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 @@ -277,7 +76,7 @@ bool refcount_dec_not_one(refcount_t *r) unsigned int new, val = atomic_read(&r->refs); do { - if (unlikely(val == UINT_MAX)) + if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) @@ -302,7 +101,7 @@ EXPORT_SYMBOL(refcount_dec_not_one); * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail - * to decrement when saturated at UINT_MAX. + * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. @@ -333,7 +132,7 @@ EXPORT_SYMBOL(refcount_dec_and_mutex_lock); * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to - * decrement when saturated at UINT_MAX. + * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. 
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 60ba93fc42ce..bd9571653288 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) * Kernel threads bound to a single CPU can safely use * smp_processor_id(): */ - if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu))) + if (current->nr_cpus_allowed == 1) goto out; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46ad252e6d6a..01f3f8b665e9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1800,7 +1800,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) struct mem_cgroup *iter; spin_lock(&memcg_oom_lock); - mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); + mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; spin_unlock(&memcg_oom_lock); diff --git a/net/core/sock.c b/net/core/sock.c index 71787f7c4f8c..043db3ce023e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -518,7 +518,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { bh_unlock_sock(sk); atomic_inc(&sk->sk_drops); diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h index a81d91d4fc78..a6d7ee5f18ba 100644 --- a/tools/lib/lockdep/include/liblockdep/common.h +++ b/tools/lib/lockdep/include/liblockdep/common.h @@ -42,8 +42,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); -void lock_release(struct lockdep_map *lock, int nested, - unsigned long ip); +void lock_release(struct lockdep_map *lock, unsigned long ip); void 
lockdep_reset_lock(struct lockdep_map *lock); void lockdep_register_key(struct lock_class_key *key); void lockdep_unregister_key(struct lock_class_key *key); diff --git a/tools/lib/lockdep/include/liblockdep/mutex.h b/tools/lib/lockdep/include/liblockdep/mutex.h index 783dd0df06f9..bd106b82759b 100644 --- a/tools/lib/lockdep/include/liblockdep/mutex.h +++ b/tools/lib/lockdep/include/liblockdep/mutex.h @@ -42,7 +42,7 @@ static inline int liblockdep_pthread_mutex_lock(liblockdep_pthread_mutex_t *lock static inline int liblockdep_pthread_mutex_unlock(liblockdep_pthread_mutex_t *lock) { - lock_release(&lock->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&lock->dep_map, (unsigned long)_RET_IP_); return pthread_mutex_unlock(&lock->mutex); } diff --git a/tools/lib/lockdep/include/liblockdep/rwlock.h b/tools/lib/lockdep/include/liblockdep/rwlock.h index 365762e3a1ea..6d5d2932bf4d 100644 --- a/tools/lib/lockdep/include/liblockdep/rwlock.h +++ b/tools/lib/lockdep/include/liblockdep/rwlock.h @@ -44,7 +44,7 @@ static inline int liblockdep_pthread_rwlock_rdlock(liblockdep_pthread_rwlock_t * static inline int liblockdep_pthread_rwlock_unlock(liblockdep_pthread_rwlock_t *lock) { - lock_release(&lock->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&lock->dep_map, (unsigned long)_RET_IP_); return pthread_rwlock_unlock(&lock->rwlock); } diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c index 76245d16196d..8f1adbe887b2 100644 --- a/tools/lib/lockdep/preload.c +++ b/tools/lib/lockdep/preload.c @@ -270,7 +270,7 @@ int pthread_mutex_lock(pthread_mutex_t *mutex) */ r = ll_pthread_mutex_lock(mutex); if (r) - lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&__get_lock(mutex)->dep_map, (unsigned long)_RET_IP_); return r; } @@ -284,7 +284,7 @@ int pthread_mutex_trylock(pthread_mutex_t *mutex) lock_acquire(&__get_lock(mutex)->dep_map, 0, 1, 0, 1, NULL, (unsigned long)_RET_IP_); r = ll_pthread_mutex_trylock(mutex); if (r) 
- lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&__get_lock(mutex)->dep_map, (unsigned long)_RET_IP_); return r; } @@ -295,7 +295,7 @@ int pthread_mutex_unlock(pthread_mutex_t *mutex) try_init_preload(); - lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&__get_lock(mutex)->dep_map, (unsigned long)_RET_IP_); /* * Just like taking a lock, only in reverse! * @@ -355,7 +355,7 @@ int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 2, 1, NULL, (unsigned long)_RET_IP_); r = ll_pthread_rwlock_rdlock(rwlock); if (r) - lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_); return r; } @@ -369,7 +369,7 @@ int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock) lock_acquire(&__get_lock(rwlock)->dep_map, 0, 1, 2, 1, NULL, (unsigned long)_RET_IP_); r = ll_pthread_rwlock_tryrdlock(rwlock); if (r) - lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_); return r; } @@ -383,7 +383,7 @@ int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock) lock_acquire(&__get_lock(rwlock)->dep_map, 0, 1, 0, 1, NULL, (unsigned long)_RET_IP_); r = ll_pthread_rwlock_trywrlock(rwlock); if (r) - lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_); return r; } @@ -397,7 +397,7 @@ int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_); r = ll_pthread_rwlock_wrlock(rwlock); if (r) - lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_); return r; } @@ -408,7 +408,7 @@ int pthread_rwlock_unlock(pthread_rwlock_t *rwlock) init_preload(); - 
lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_); + lock_release(&__get_lock(rwlock)->dep_map, (unsigned long)_RET_IP_); r = ll_pthread_rwlock_unlock(rwlock); if (r) lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_); |