author     Linus Torvalds <torvalds@linux-foundation.org>   2020-06-09 09:54:46 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2020-06-09 09:54:46 -0700
commit     a5ad5742f671de906adbf29fbedf0a04705cebad (patch)
tree       88d1a4c18e2025a5a8335dbbc9dea8bebeba5789 /kernel
parent     013b2deba9a6b80ca02f4fafd7dedf875e9b4450 (diff)
parent     4fa7252338a56fbc90220e6330f136a379175a7a (diff)
download   linux-a5ad5742f671de906adbf29fbedf0a04705cebad.tar.bz2
Merge branch 'akpm' (patches from Andrew)
Merge even more updates from Andrew Morton:
- a kernel-wide sweep of show_stack()
- pagetable cleanups
- abstract out accesses to mmap_sem - prep for mmap_sem scalability work (see the wrapper sketch below)
- hch's user access work
Subsystems affected by this patch series: debug, mm/pagemap, mm/maccess,
mm/documentation.
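
The mmap_sem abstraction in the list above works by replacing every open-coded rwsem operation on mm->mmap_sem with a small wrapper API, so that the lock's implementation can later be changed in one place. Below is a minimal sketch of that wrapper layer, assuming the lock is still a plain rwsem and that the field name matches the ->mmap_lock comments updated in the diff; the names correspond to the mmap_read_lock()/mmap_write_lock() calls used throughout the hunks below, but the header itself is not part of this kernel/ diffstat.

	/* Sketch of the wrapper layer (cf. include/linux/mmap_lock.h); assumes the
	 * lock is still a plain rwsem stored in struct mm_struct as mmap_lock. */
	static inline void mmap_init_lock(struct mm_struct *mm)
	{
		init_rwsem(&mm->mmap_lock);
	}

	static inline void mmap_read_lock(struct mm_struct *mm)
	{
		down_read(&mm->mmap_lock);
	}

	static inline bool mmap_read_trylock(struct mm_struct *mm)
	{
		return down_read_trylock(&mm->mmap_lock) != 0;
	}

	static inline void mmap_read_unlock(struct mm_struct *mm)
	{
		up_read(&mm->mmap_lock);
	}

	static inline void mmap_write_lock(struct mm_struct *mm)
	{
		down_write(&mm->mmap_lock);
	}

	static inline int mmap_write_lock_killable(struct mm_struct *mm)
	{
		return down_write_killable(&mm->mmap_lock);
	}

	static inline void mmap_write_unlock(struct mm_struct *mm)
	{
		up_write(&mm->mmap_lock);
	}

With these in place the conversion in the hunks below is mostly mechanical, e.g. down_read(&mm->mmap_sem) becomes mmap_read_lock(mm).
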
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (93 commits)
include/linux/cache.h: expand documentation over __read_mostly
maccess: return -ERANGE when probe_kernel_read() fails
x86: use non-set_fs based maccess routines
maccess: allow architectures to provide kernel probing directly
maccess: move user access routines together
maccess: always use strict semantics for probe_kernel_read
maccess: remove strncpy_from_unsafe
tracing/kprobes: handle mixed kernel/userspace probes better
bpf: rework the compat kernel probe handling
bpf:bpf_seq_printf(): handle potentially unsafe format string better
bpf: handle the compat string in bpf_trace_copy_string better
bpf: factor out a bpf_trace_copy_string helper
maccess: unify the probe kernel arch hooks
maccess: remove probe_read_common and probe_write_common
maccess: rename strnlen_unsafe_user to strnlen_user_nofault
maccess: rename strncpy_from_unsafe_strict to strncpy_from_kernel_nofault
maccess: rename strncpy_from_unsafe_user to strncpy_from_user_nofault
maccess: update the top of file comment
maccess: clarify kerneldoc comments
maccess: remove duplicate kerneldoc comments
...
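
Most of the maccess entries above are renames toward a consistent *_nofault naming scheme, plus strict semantics for probe_kernel_read() (which now fails with -ERANGE instead of falling back to set_fs-based access). As a rough cheat sheet, reconstructed from the shortlog rather than the full interdiff, so treat the exact signatures as approximate:

	/* Kernel-address accessors: always strict, -ERANGE on an unmapped address */
	long probe_kernel_read(void *dst, const void *src, size_t size);
	long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count);
		/* was strncpy_from_unsafe_strict(); plain strncpy_from_unsafe() is removed */

	/* User-address accessors, usable without set_fs() tricks */
	long probe_user_read(void *dst, const void __user *src, size_t size);
	long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count);
		/* was strncpy_from_unsafe_user() */
	long strnlen_user_nofault(const void __user *unsafe_addr, long count);
		/* was strnlen_unsafe_user() */
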
Diffstat (limited to 'kernel')
 kernel/acct.c                  |   6
 kernel/bpf/stackmap.c          |  17
 kernel/bpf/syscall.c           |   2
 kernel/cgroup/cpuset.c         |   4
 kernel/debug/kdb/kdb_bt.c      |  15
 kernel/events/core.c           |  10
 kernel/events/uprobes.c        |  20
 kernel/exit.c                  |  11
 kernel/fork.c                  |  15
 kernel/futex.c                 |   4
 kernel/locking/lockdep.c       |   4
 kernel/locking/rtmutex-debug.c |   2
 kernel/power/snapshot.c        |   1
 kernel/relay.c                 |   2
 kernel/sched/core.c            |   8
 kernel/sched/fair.c            |   4
 kernel/sys.c                   |  22
 kernel/trace/bpf_trace.c       | 162
 kernel/trace/ftrace.c          |   8
 kernel/trace/trace_kprobe.c    |  70
 kernel/trace/trace_output.c    |   4
 21 files changed, 215 insertions(+), 176 deletions(-)
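
Several hunks below (kdb_bt.c, rtmutex-debug.c, sched/core.c, lockdep.c, ftrace.c) belong to the kernel-wide show_stack() sweep: show_stack() and print_ip_sym() now take an explicit printk loglevel instead of the backtrace code temporarily raising console_loglevel. A hedged sketch of the new calling convention; the caller function below is illustrative, not part of the patch:

	void show_stack(struct task_struct *task, unsigned long *sp, const char *loglevel);
	void print_ip_sym(const char *loglevel, unsigned long ip);

	/* Illustrative caller: dump a task's backtrace at KERN_INFO severity */
	static void example_dump_task(struct task_struct *p)
	{
		pr_info("%s/%d backtrace:\n", p->comm, task_pid_nr(p));
		show_stack(p, NULL, KERN_INFO);	/* NULL sp: start from the task's saved stack */
	}
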
diff --git a/kernel/acct.c b/kernel/acct.c index 11ff4a596d6b..b0c5b3a9f5af 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -40,7 +40,7 @@ * is one more bug... 10/11/98, AV. * * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold - * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks + * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks * a struct file opened for write. Fixed. 2/6/2000, AV. */ @@ -541,13 +541,13 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); vma = current->mm->mmap; while (vma) { vsize += vma->vm_end - vma->vm_start; vma = vma->vm_next; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); } spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 7b8381ce40a0..599488f25e40 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -33,7 +33,7 @@ struct bpf_stack_map { /* irq_work to run up_read() for build_id lookup in nmi context */ struct stack_map_irq_work { struct irq_work irq_work; - struct rw_semaphore *sem; + struct mm_struct *mm; }; static void do_up_read(struct irq_work *entry) @@ -44,8 +44,7 @@ static void do_up_read(struct irq_work *entry) return; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read_non_owner(work->sem); - work->sem = NULL; + mmap_read_unlock_non_owner(work->mm); } static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); @@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - down_read_trylock(¤t->mm->mmap_sem) == 0) { + !mmap_read_trylock_non_owner(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock_non_owner(current->mm); } else { - work->sem = ¤t->mm->mmap_sem; + work->mm = current->mm; irq_work_queue(&work->irq_work); - /* - * The irq_work will release the mmap_sem with - * up_read_non_owner(). The rwsem_release() is called - * here to release the lock from lockdep's perspective. - */ - rwsem_release(¤t->mm->mmap_sem.dep_map, _RET_IP_); } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d530b1d5683..c3ae2adaeccd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,7 +25,7 @@ #include <linux/nospec.h> #include <linux/audit.h> #include <uapi/linux/btf.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <linux/bpf_lsm.h> #include <linux/poll.h> #include <linux/bpf-netns.h> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 729d3a5c772e..642415b8c3c9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1655,7 +1655,7 @@ static void update_tasks_nodemask(struct cpuset *cs) guarantee_online_mems(cs, &newmems); /* - * The mpol_rebind_mm() call takes mmap_sem, which we couldn't + * The mpol_rebind_mm() call takes mmap_lock, which we couldn't * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. 
Because we still hold @@ -1760,7 +1760,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 3de0cc780c16..18e03aba2cfc 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,17 +21,18 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { - int old_lvl = console_loglevel; - - console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - if (!addr && kdb_task_has_cpu(p)) + if (!addr && kdb_task_has_cpu(p)) { + int old_lvl = console_loglevel; + + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_dump_stack_on_cpu(kdb_process_cpu(p)); - else - show_stack(p, addr); + console_loglevel = old_lvl; + } else { + show_stack(p, addr, KERN_EMERG); + } - console_loglevel = old_lvl; kdb_trap_printk--; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 63d66bbebbd5..856d98c36f56 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1316,7 +1316,7 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event::child_mutex; * perf_event_context::lock * perf_event::mmap_mutex - * mmap_sem + * mmap_lock * perf_addr_filters_head::lock * * cpu_hotplug_lock @@ -3080,7 +3080,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly - * registered mapping, called for every new mmap(), with mm::mmap_sem down + * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. @@ -9742,7 +9742,7 @@ static void perf_addr_filters_splice(struct perf_event *event, /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. - * Called with mm::mmap_sem down for reading. + * Called with mm::mmap_lock down for reading. */ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct mm_struct *mm, @@ -9784,7 +9784,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (!mm) goto restart; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } raw_spin_lock_irqsave(&ifh->lock, flags); @@ -9810,7 +9810,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (ifh->nr_file_filters) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); mmput(mm); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5dc2f46f38c0..bb0862873dba 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -457,7 +457,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_sem held for write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. 
*/ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@ -1054,7 +1054,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) @@ -1076,7 +1076,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) } unlock: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); @@ -1240,7 +1240,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { unsigned long vaddr; loff_t offset; @@ -1257,7 +1257,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, mm, vaddr); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return err; } @@ -1354,7 +1354,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. + * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. @@ -1444,7 +1444,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { @@ -1474,7 +1474,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } @@ -2047,7 +2047,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct uprobe *uprobe = NULL; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { @@ -2065,7 +2065,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) mmf_recalc_uprobes(mm); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return uprobe; } diff --git a/kernel/exit.c b/kernel/exit.c index c300253a7b8e..727150f28103 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -66,7 +66,6 @@ #include <linux/uaccess.h> #include <asm/unistd.h> -#include <asm/pgtable.h> #include <asm/mmu_context.h> static void __unhash_process(struct task_struct *p, bool group_dead) @@ -441,17 +440,17 @@ static void exit_mm(void) sync_mm_rss(mm); /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_state + * We must hold mmap_lock around checking core_state * and clearing tsk->mm. The core-inducing thread * will increment ->nr_threads for each thread in the * group with ->mm != NULL. 
*/ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); core_state = mm->core_state; if (core_state) { struct core_thread self; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); self.task = current; self.next = xchg(&core_state->dumper.next, &self); @@ -469,14 +468,14 @@ static void exit_mm(void) freezable_schedule(); } __set_current_state(TASK_RUNNING); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); } mmgrab(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); current->mm = NULL; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); diff --git a/kernel/fork.c b/kernel/fork.c index cefe8745c46e..142b23645d82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,7 +96,6 @@ #include <linux/kasan.h> #include <linux/scs.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -493,7 +492,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, LIST_HEAD(uf); uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { + if (mmap_write_lock_killable(oldmm)) { retval = -EINTR; goto fail_uprobe_end; } @@ -502,7 +501,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* * Not linked in yet - no deadlock potential: */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); @@ -618,9 +617,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); + mmap_write_unlock(oldmm); dup_userfaultfd_complete(&uf); fail_uprobe_end: uprobe_end_dup_mmap(); @@ -650,9 +649,9 @@ static inline void mm_free_pgd(struct mm_struct *mm) #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - down_write(&oldmm->mmap_sem); + mmap_write_lock(oldmm); RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); + mmap_write_unlock(oldmm); return 0; } #define mm_alloc_pgd(mm) (0) @@ -1023,7 +1022,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); - init_rwsem(&mm->mmap_sem); + mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; mm_pgtables_bytes_init(mm); diff --git a/kernel/futex.c b/kernel/futex.c index b4b9f960b610..e646661f6282 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -698,10 +698,10 @@ static int fault_in_user_writeable(u32 __user *uaddr) struct mm_struct *mm = current->mm; int ret; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, FAULT_FLAG_WRITE, NULL); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return ret < 0 ? 
ret : 0; } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4c057dd8e93b..38cce34d03dc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4424,7 +4424,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no more locks to release!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); @@ -5075,7 +5075,7 @@ static void print_lock_contention_bug(struct task_struct *curr, curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); pr_cont(") at:\n"); - print_ip_sym(ip); + print_ip_sym(KERN_WARNING, ip); pr_warn("but there are no locks held!\n"); pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index fd4fe1f5b458..36e69100e8e0 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task_pid_nr(task)); - show_stack(task, NULL); + show_stack(task, NULL, KERN_DEFAULT); printk("\n%s/%d's [current] stackdump:\n\n", current->comm, task_pid_nr(current)); dump_stack(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 659800157b17..881128b9351e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -34,7 +34,6 @@ #include <linux/uaccess.h> #include <asm/mmu_context.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/io.h> diff --git a/kernel/relay.c b/kernel/relay.c index 204867220f8a..72fe443ea78f 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static void relay_free_page_array(struct page **array) * * Returns 0 if ok, negative on error * - * Caller should already have grabbed mmap_sem. + * Caller should already have grabbed mmap_lock. 
*/ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8298b2c240ce..8f360326861e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3922,8 +3922,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } if (panic_on_warn) panic("scheduling while atomic\n"); @@ -6026,7 +6025,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); - show_stack(p, NULL); + show_stack(p, NULL, KERN_INFO); put_task_stack(p); } EXPORT_SYMBOL_GPL(sched_show_task); @@ -6871,8 +6870,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && !preempt_count_equals(preempt_offset)) { pr_err("Preemption disabled at:"); - print_ip_sym(preempt_disable_ip); - pr_cont("\n"); + print_ip_sym(KERN_ERR, preempt_disable_ip); } dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 35f4cc024dcf..cbcb2f71599b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2770,7 +2770,7 @@ static void task_numa_work(struct callback_head *work) return; - if (!down_read_trylock(&mm->mmap_sem)) + if (!mmap_read_trylock(mm)) return; vma = find_vma(mm, start); if (!vma) { @@ -2838,7 +2838,7 @@ out: mm->numa_scan_offset = start; else reset_ptenuma_scan(p); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Make sure tasks use at least 32x as much time to run other code diff --git a/kernel/sys.c b/kernel/sys.c index 891667a49bb7..fd46865b46ba 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1846,7 +1846,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (exe_file) { struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!vma->vm_file) continue; @@ -1855,7 +1855,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) goto exit_err; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); fput(exe_file); } @@ -1869,7 +1869,7 @@ exit: fdput(exe); return err; exit_err: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); fput(exe_file); goto exit; } @@ -2007,10 +2007,10 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } /* - * arg_lock protects concurent updates but we still need mmap_sem for + * arg_lock protects concurent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); /* * We don't validate if these members are pointing to @@ -2049,7 +2049,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -2122,10 +2122,10 @@ static int prctl_set_mm(int opt, unsigned long addr, /* * arg_lock protects concurent updates of arg boundaries, we need - * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr + * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. 
*/ - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, addr); spin_lock(&mm->arg_lock); @@ -2217,7 +2217,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: spin_unlock(&mm->arg_lock); - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return error; } @@ -2442,13 +2442,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; - if (down_write_killable(&me->mm->mmap_sem)) + if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) set_bit(MMF_DISABLE_THP, &me->mm->flags); else clear_bit(MMF_DISABLE_THP, &me->mm->flags); - up_write(&me->mm->mmap_sem); + mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3744372a24e2..e729c9e587a0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -136,17 +136,23 @@ static const struct bpf_func_proto bpf_override_return_proto = { }; #endif -BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, - const void __user *, unsafe_ptr) +static __always_inline int +bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) { - int ret = probe_user_read(dst, unsafe_ptr, size); + int ret; + ret = probe_user_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - return ret; } +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_common(dst, size, unsafe_ptr); +} + const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, @@ -156,17 +162,24 @@ const struct bpf_func_proto bpf_probe_read_user_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, - const void __user *, unsafe_ptr) +static __always_inline int +bpf_probe_read_user_str_common(void *dst, u32 size, + const void __user *unsafe_ptr) { - int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size); + int ret; + ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); - return ret; } +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_str_common(dst, size, unsafe_ptr); +} + const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, @@ -177,25 +190,25 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = { }; static __always_inline int -bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr, - const bool compat) +bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = security_locked_down(LOCKDOWN_BPF_READ); if (unlikely(ret < 0)) - goto out; - ret = compat ? 
probe_kernel_read(dst, unsafe_ptr, size) : - probe_kernel_read_strict(dst, unsafe_ptr, size); + goto fail; + ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) -out: - memset(dst, 0, size); + goto fail; + return ret; +fail: + memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false); + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_proto = { @@ -207,50 +220,37 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, - const void *, unsafe_ptr) -{ - return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true); -} - -static const struct bpf_func_proto bpf_probe_read_compat_proto = { - .func = bpf_probe_read_compat, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_ANYTHING, -}; - static __always_inline int -bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr, - const bool compat) +bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) { int ret = security_locked_down(LOCKDOWN_BPF_READ); if (unlikely(ret < 0)) - goto out; + goto fail; + /* - * The strncpy_from_unsafe_*() call will likely not fill the entire - * buffer, but that's okay in this circumstance as we're probing + * The strncpy_from_kernel_nofault() call will likely not fill the + * entire buffer, but that's okay in this circumstance as we're probing * arbitrary memory anyway similar to bpf_probe_read_*() and might * as well probe the stack. Thus, memory is explicitly cleared * only in error case, so that improper users ignoring return * code altogether don't copy garbage; otherwise length of string * is returned that can be used for bpf_perf_event_output() et al. */ - ret = compat ? 
strncpy_from_unsafe(dst, unsafe_ptr, size) : - strncpy_from_unsafe_strict(dst, unsafe_ptr, size); + ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) -out: - memset(dst, 0, size); + goto fail; + + return 0; +fail: + memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false); + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { @@ -262,10 +262,34 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE +BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); +} + +static const struct bpf_func_proto bpf_probe_read_compat_proto = { + .func = bpf_probe_read_compat, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, const void *, unsafe_ptr) { - return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true); + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_str_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { @@ -276,6 +300,7 @@ static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; +#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) @@ -324,6 +349,31 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } +static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + strncpy_from_user_nofault(buf, user_ptr, bufsz); + break; + } + fallthrough; +#endif + case 'k': + strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + break; + case 'u': + strncpy_from_user_nofault(buf, user_ptr, bufsz); + break; + } +} + /* * Only limited trace_printk() conversion specifiers allowed: * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s @@ -406,24 +456,8 @@ fmt_str: break; } - buf[0] = 0; - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - strncpy_from_unsafe(buf, unsafe_ptr, - sizeof(buf)); - break; -#endif - case 'k': - strncpy_from_unsafe_strict(buf, unsafe_ptr, - sizeof(buf)); - break; - case 'u': - strncpy_from_unsafe_user(buf, - (__force void __user *)unsafe_ptr, - sizeof(buf)); - break; - } + bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, + sizeof(buf)); goto fmt_next; } @@ -579,15 +613,17 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, } if (fmt[i] == 's') { + void *unsafe_ptr; + /* try our best to copy */ if (memcpy_cnt >= 
MAX_SEQ_PRINTF_MAX_MEMCPY) { err = -E2BIG; goto out; } - err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - MAX_SEQ_PRINTF_STR_LEN); + unsafe_ptr = (void *)(long)args[fmt_cnt]; + err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], + unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); if (err < 0) bufs->buf[memcpy_cnt][0] = '\0'; params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b5765aeea698..7d0ebd104706 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2020,12 +2020,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EFAULT: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on modifying "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; case -EINVAL: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); if (ftrace_expected) { @@ -2036,12 +2036,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) case -EPERM: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on writing "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); break; default: FTRACE_WARN_ON_ONCE(1); pr_info("ftrace faulted on unknown error "); - print_ip_sym(ip); + print_ip_sym(KERN_INFO, ip); } print_bug_type(); if (rec) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35989383ae11..ea8d0b094f1b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1202,11 +1202,25 @@ static const struct file_operations kprobe_profile_ops = { /* Return the length of string -- including null terminal byte */ static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int fetch_store_strlen(unsigned long addr) { int ret, len = 0; u8 c; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + do { ret = probe_kernel_read(&c, (u8 *)addr + len, 1); len++; @@ -1215,22 +1229,14 @@ fetch_store_strlen(unsigned long addr) return (ret < 0) ? ret : len; } -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen_user(unsigned long addr) -{ - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE); -} - /* - * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max - * length and relative data location. + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. */ static nokprobe_inline int -fetch_store_string(unsigned long addr, void *dest, void *base) +fetch_store_string_user(unsigned long addr, void *dest, void *base) { + const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); void *__dest; long ret; @@ -1240,11 +1246,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); - /* - * Try to get string again, since the string can be changed while - * probing. 
- */ - ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen); + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); @@ -1252,23 +1254,31 @@ fetch_store_string(unsigned long addr, void *dest, void *base) } /* - * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf - * with max length and relative data location. + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. */ static nokprobe_inline int -fetch_store_string_user(unsigned long addr, void *dest, void *base) +fetch_store_string(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; int maxlen = get_loc_len(*(u32 *)dest); void *__dest; long ret; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + if (unlikely(!maxlen)) return -ENOMEM; __dest = get_loc_data(dest, base); - ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen); + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); if (ret >= 0) *(u32 *)dest = make_data_loc(ret, __dest - base); @@ -1276,12 +1286,6 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) } static nokprobe_inline int -probe_mem_read(void *dest, void *src, size_t size) -{ - return probe_kernel_read(dest, src, size); -} - -static nokprobe_inline int probe_mem_read_user(void *dest, void *src, size_t size) { const void __user *uaddr = (__force const void __user *)src; @@ -1289,6 +1293,16 @@ probe_mem_read_user(void *dest, void *src, size_t size) return probe_user_read(dest, uaddr, size); } +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return probe_kernel_read(dest, src, size); +} + /* Note that we don't verify it, since the code does not come from user space */ static int process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 9a121e147102..73976de7f8cc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -393,7 +393,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (mm) { const struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, ip); if (vma) { file = vma->vm_file; @@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, trace_seq_printf(s, "[+0x%lx]", ip - vmstart); } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) trace_seq_printf(s, " <" IP_FMT ">", ip); |
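
One pattern recurs in the reworked bpf_trace.c and trace_kprobe.c helpers above: on architectures with non-overlapping user and kernel address spaces, an "unsafe" pointer below TASK_SIZE is routed to the user-space accessor and anything else to the kernel one. A condensed restatement of that dispatch follows; the helper name is illustrative, the logic mirrors probe_mem_read() and bpf_probe_read_compat() in the diff:

	/* Illustrative helper, not part of the patch: pick the right nofault
	 * accessor based on the address range of an untrusted pointer. */
	static nokprobe_inline int probe_any_read(void *dst, const void *src, size_t size)
	{
	#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
		/* user and kernel ranges cannot overlap, so the check is unambiguous */
		if ((unsigned long)src < TASK_SIZE)
			return probe_user_read(dst, (__force const void __user *)src, size);
	#endif
		return probe_kernel_read(dst, src, size);
	}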