diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 7 | ||||
-rw-r--r-- | kernel/bpf/core.c | 2 | ||||
-rw-r--r-- | kernel/bpf/disasm.c | 2 | ||||
-rw-r--r-- | kernel/bpf/disasm.h | 2 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 10 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 2 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 63 | ||||
-rw-r--r-- | kernel/dma/debug.c | 3 | ||||
-rw-r--r-- | kernel/dma/mapping.c | 3 | ||||
-rw-r--r-- | kernel/entry/common.c | 4 | ||||
-rw-r--r-- | kernel/events/core.c | 36 | ||||
-rw-r--r-- | kernel/irq/irqdomain.c | 2 | ||||
-rw-r--r-- | kernel/locking/rwbase_rt.c | 65 | ||||
-rw-r--r-- | kernel/printk/printk.c | 4 | ||||
-rw-r--r-- | kernel/rseq.c | 14 | ||||
-rw-r--r-- | kernel/sched/debug.c | 8 | ||||
-rw-r--r-- | kernel/sched/fair.c | 6 | ||||
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 3 | ||||
-rw-r--r-- | kernel/trace/blktrace.c | 8 |
19 files changed, 155 insertions, 89 deletions
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d6731c32864e..9abcc33f02cf 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *mtype, *ptype; struct bpf_prog *prog; u32 moff; + u32 flags; moff = btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); @@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + flags = st_ops->func_models[i].ret_size > 0 ? + BPF_TRAMP_F_RET_FENTRY_RET : 0; err = arch_prepare_bpf_trampoline(NULL, image, st_map->image + PAGE_SIZE, - &st_ops->func_models[i], 0, - tprogs, NULL); + &st_ops->func_models[i], + flags, tprogs, NULL); if (err < 0) goto reset_unlock; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9f4636d021b1..d6b7dfdd8066 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { atomic_long_sub(pages, &bpf_jit_current); return -EPERM; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ca3cd9aaa6ce..7b4afb7d96db 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e546b18d27da..a4b040793f44 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e8eefdf8cf3e..09a3fd97d329 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -179,7 +179,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock_non_owner(current->mm)) { + !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -204,9 +204,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock_non_owner(current->mm); + mmap_read_unlock(current->mm); } else { work->mm = current->mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 047ac4b4703b..e76b55917905 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9912,6 +9912,8 @@ static int check_btf_line(struct bpf_verifier_env *env, nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 881ce1470beb..570b0c97392a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6572,74 +6572,51 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - - /* Don't associate the sock with unrelated interrupted task's cgroup. */ - if (in_interrupt()) - return; + struct cgroup *cgroup; rcu_read_lock(); + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) { + cgroup = &cgrp_dfl_root.cgrp; + cgroup_get(cgroup); + goto out; + } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; - cgroup_bpf_get(cset->dfl_cgrp); + cgroup = cset->dfl_cgrp; break; } cpu_relax(); } - +out: + skcd->cgroup = cgroup; + cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 6c90c69e5311..95445bd6eb72 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -567,7 +567,8 @@ static void add_dma_entry(struct dma_debug_entry *entry) pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; } else if (rc == -EEXIST) { - pr_err("cacheline tracking EEXIST, overlapping mappings aren't supported\n"); + err_printk(entry->dev, entry, + "cacheline tracking EEXIST, overlapping mappings aren't supported\n"); } } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 7ee5284bff58..06fec5547e7c 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -206,7 +206,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, /** * dma_map_sg_attrs - Map the given buffer for DMA * @dev: The device for which to perform the DMA operation - * @sg: The sg_table object describing the buffer + * @sg: The sg_table object describing the buffer + * @nents: Number of entries to map * @dir: DMA direction * @attrs: Optional DMA attributes for the map operation * diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bf16395b9e13..d5a61d565ad5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) handle_signal_work(regs, ti_work); - if (ti_work & _TIF_NOTIFY_RESUME) { + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 744e8726c5b2..f23ca260307f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3707,6 +3707,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +static inline bool event_update_userpage(struct perf_event *event) +{ + if (likely(!atomic_read(&event->mmap_count))) + return false; + + perf_event_update_time(event); + perf_set_shadow_time(event, event->ctx); + perf_event_update_userpage(event); + + return true; +} + +static inline void group_update_userpage(struct perf_event *group_event) +{ + struct perf_event *event; + + if (!event_update_userpage(group_event)) + return; + + for_each_sibling_event(event, group_event) + event_update_userpage(event); +} + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; @@ -3725,14 +3748,15 @@ static int merge_sched_in(struct perf_event *event, void *data) } if (event->state == PERF_EVENT_STATE_INACTIVE) { + *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + } else { + ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpuctx); + group_update_userpage(event); } - - *can_add_hw = 0; - ctx->rotate_necessary = 1; - perf_mux_hrtimer_restart(cpuctx); } return 0; @@ -6324,6 +6348,8 @@ accounting: ring_buffer_attach(event, rb); + perf_event_update_time(event); + perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { @@ -10193,7 +10219,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) return; if (ifh->nr_file_filters) { - mm = get_task_mm(event->ctx->task); + mm = get_task_mm(task); if (!mm) goto restart; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 19e83e9b723c..4d8fc65cf38f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); * Allocates and initializes an irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 4ba15088e640..88191f6e252c 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -41,6 +41,12 @@ * The risk of writer starvation is there, but the pathological use cases * which trigger it are not necessarily the typical RT workloads. * + * Fast-path orderings: + * The lock/unlock of readers can run in fast paths: lock and unlock are only + * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE + * semantics of rwbase_rt. Atomic ops should thus provide _acquire() + * and _release() (or stronger). + * * Common code shared between RT rw_semaphore and rwlock */ @@ -53,6 +59,7 @@ static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) * set. */ for (r = atomic_read(&rwb->readers); r < 0;) { + /* Fully-ordered if cmpxchg() succeeds, provides ACQUIRE */ if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1))) return 1; } @@ -162,6 +169,8 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, /* * rwb->readers can only hit 0 when a writer is waiting for the * active readers to leave the critical section. + * + * dec_and_test() is fully ordered, provides RELEASE. */ if (unlikely(atomic_dec_and_test(&rwb->readers))) __rwbase_read_unlock(rwb, state); @@ -172,7 +181,11 @@ static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, { struct rt_mutex_base *rtm = &rwb->rtmutex; - atomic_add(READER_BIAS - bias, &rwb->readers); + /* + * _release() is needed in case that reader is in fast path, pairing + * with atomic_try_cmpxchg() in rwbase_read_trylock(), provides RELEASE + */ + (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); rwbase_rtmutex_unlock(rtm); } @@ -196,6 +209,23 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); } +static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb) +{ + /* Can do without CAS because we're serialized by wait_lock. */ + lockdep_assert_held(&rwb->rtmutex.wait_lock); + + /* + * _acquire is needed in case the reader is in the fast path, pairing + * with rwbase_read_unlock(), provides ACQUIRE. + */ + if (!atomic_read_acquire(&rwb->readers)) { + atomic_set(&rwb->readers, WRITER_BIAS); + return 1; + } + + return 0; +} + static int __sched rwbase_write_lock(struct rwbase_rt *rwb, unsigned int state) { @@ -210,34 +240,30 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, atomic_sub(READER_BIAS, &rwb->readers); raw_spin_lock_irqsave(&rtm->wait_lock, flags); - /* - * set_current_state() for rw_semaphore - * current_save_and_set_rtlock_wait_state() for rwlock - */ - rwbase_set_and_save_current_state(state); + if (__rwbase_write_trylock(rwb)) + goto out_unlock; - /* Block until all readers have left the critical section. */ - for (; atomic_read(&rwb->readers);) { + rwbase_set_and_save_current_state(state); + for (;;) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { - __set_current_state(TASK_RUNNING); + rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); return -EINTR; } + + if (__rwbase_write_trylock(rwb)) + break; + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); - /* - * Schedule and wait for the readers to leave the critical - * section. The last reader leaving it wakes the waiter. - */ - if (atomic_read(&rwb->readers) != 0) - rwbase_schedule(); set_current_state(state); - raw_spin_lock_irqsave(&rtm->wait_lock, flags); } - - atomic_set(&rwb->readers, WRITER_BIAS); rwbase_restore_current_state(); + +out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); return 0; } @@ -253,8 +279,7 @@ static inline int rwbase_write_trylock(struct rwbase_rt *rwb) atomic_sub(READER_BIAS, &rwb->readers); raw_spin_lock_irqsave(&rtm->wait_lock, flags); - if (!atomic_read(&rwb->readers)) { - atomic_set(&rwb->readers, WRITER_BIAS); + if (__rwbase_write_trylock(rwb)) { raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); return 1; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 825277e1e742..a8d0a58deebc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early) return; err_free_descs: - memblock_free(__pa(new_descs), new_descs_size); + memblock_free_ptr(new_descs, new_descs_size); err_free_log_buf: - memblock_free(__pa(new_log_buf), new_log_buf_len); + memblock_free_ptr(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel; diff --git a/kernel/rseq.c b/kernel/rseq.c index 35f7bd0fced0..6d45ac3dae7f 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -282,9 +282,17 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + + /* + * regs is NULL if and only if the caller is in a syscall path. Skip + * fixup and leave rseq_cs as is so that rseq_sycall() will detect and + * kill a misbehaving userspace on debug kernels. + */ + if (regs) { + ret = rseq_ip_fixup(regs); + if (unlikely(ret < 0)) + goto error; + } if (unlikely(rseq_update_cpu_id(t))) goto error; return; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 49716228efb4..17a653b67006 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -173,16 +173,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[16]; + unsigned int scaling; if (cnt > 15) cnt = 15; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = '\0'; - if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling)) + if (kstrtouint(buf, 10, &scaling)) return -EINVAL; + if (scaling >= SCHED_TUNABLESCALING_END) + return -EINVAL; + + sysctl_sched_tunable_scaling = scaling; if (sched_update_scaling()) return -EINVAL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f245b939..f6a05d9b5443 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4936,8 +4936,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - if (!cfs_rq->load.weight) + /* Nothing to run but something to decay (on_list)? Complete the branch */ + if (!cfs_rq->load.weight) { + if (cfs_rq->on_list) + goto unthrottle_throttle; return; + } task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ee736861b18f..643d412ac623 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1404,7 +1404,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - *newval += now; + if (*newval) + *newval += now; } /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c221e4c3f625..fa91f398f28b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1605,6 +1605,14 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + } + put_probe_ref(); synchronize_rcu(); blk_trace_free(bt); |