Diffstat (limited to 'kernel')
77 files changed, 1311 insertions, 1000 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4d09e610777f..3f5bf1af0826 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1579,7 +1579,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference(array)->items; + item = rcu_dereference_check(array, 1)->items; for (; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 620bc5024d7d..24aac0d0f412 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -479,6 +479,8 @@ static void cpu_map_free(struct bpf_map *map) * It does __not__ ensure pending flush operations (if any) are * complete. */ + + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ac1df79f3788..141710b82a6c 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -161,6 +161,7 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); + bpf_clear_redirect_map(map); synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 0c1a696b041b..98e621a29e8e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -58,6 +58,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; struct bpf_sock_progs progs; + raw_spinlock_t lock; }; struct bucket { @@ -89,9 +90,9 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; + struct bpf_map *map; struct sock **entry; struct htab_elem __rcu *hash_link; - struct bpf_htab __rcu *htab; }; struct smap_psock { @@ -343,13 +344,18 @@ static void bpf_tcp_close(struct sock *sk, long timeout) e = psock_map_pop(sk, psock); while (e) { if (e->entry) { - osk = cmpxchg(e->entry, sk, NULL); + struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); + + raw_spin_lock_bh(&stab->lock); + osk = *e->entry; if (osk == sk) { + *e->entry = NULL; smap_release_sock(psock, sk); } + raw_spin_unlock_bh(&stab->lock); } else { struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = rcu_dereference(e->htab); + struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); struct hlist_head *head; struct htab_elem *l; struct bucket *b; @@ -370,6 +376,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } raw_spin_unlock_bh(&b->lock); } + kfree(e); e = psock_map_pop(sk, psock); } rcu_read_unlock(); @@ -1641,6 +1648,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&stab->map, attr); + raw_spin_lock_init(&stab->lock); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -1675,8 +1683,10 @@ static void smap_list_map_remove(struct smap_psock *psock, spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) + if (e->entry == entry) { list_del(&e->list); + kfree(e); + } } spin_unlock_bh(&psock->maps_lock); } @@ -1690,8 +1700,10 @@ static void smap_list_hash_remove(struct smap_psock *psock, list_for_each_entry_safe(e, tmp, &psock->maps, list) { struct htab_elem *c = rcu_dereference(e->hash_link); - if (c == hash_link) + if (c == hash_link) { list_del(&e->list); + kfree(e); + } } spin_unlock_bh(&psock->maps_lock); } @@ -1711,14 +1723,15 @@ static void 
sock_map_free(struct bpf_map *map) * and a grace period expire to ensure psock is really safe to remove. */ rcu_read_lock(); + raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct smap_psock *psock; struct sock *sock; - sock = xchg(&stab->sock_map[i], NULL); + sock = stab->sock_map[i]; if (!sock) continue; - + stab->sock_map[i] = NULL; psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1730,6 +1743,7 @@ static void sock_map_free(struct bpf_map *map) smap_release_sock(psock, sock); } } + raw_spin_unlock_bh(&stab->lock); rcu_read_unlock(); sock_map_remove_complete(stab); @@ -1773,19 +1787,23 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (k >= map->max_entries) return -EINVAL; - sock = xchg(&stab->sock_map[k], NULL); + raw_spin_lock_bh(&stab->lock); + sock = stab->sock_map[k]; + stab->sock_map[k] = NULL; + raw_spin_unlock_bh(&stab->lock); if (!sock) return -EINVAL; psock = smap_psock_sk(sock); if (!psock) - goto out; - - if (psock->bpf_parse) + return 0; + if (psock->bpf_parse) { + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); + } smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); -out: return 0; } @@ -1821,11 +1839,9 @@ out: static int __sock_map_ctx_update_elem(struct bpf_map *map, struct bpf_sock_progs *progs, struct sock *sock, - struct sock **map_link, void *key) { struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; bool new = false; int err = 0; @@ -1898,14 +1914,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, new = true; } - if (map_link) { - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_free; - } - } - /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. */ @@ -1927,17 +1935,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, write_unlock_bh(&sock->sk_callback_lock); } - /* 4. Place psock in sockmap for use and stop any programs on - * the old sock assuming its not the same sock we are replacing - * it with. Because we can only have a single set of programs if - * old_sock has a strp we can stop it. 
- */ - if (map_link) { - e->entry = map_link; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - } return err; out_free: smap_release_sock(psock, sock); @@ -1948,7 +1945,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - kfree(e); return err; } @@ -1958,36 +1954,57 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock; + struct sock *osock, *sock = skops->sk; + struct smap_psock_map_entry *e; + struct smap_psock *psock; u32 i = *(u32 *)key; int err; if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) return -E2BIG; - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; - sock = skops->sk; - err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], - key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto out; - osock = xchg(&stab->sock_map[i], sock); - if (osock) { - struct smap_psock *opsock = smap_psock_sk(osock); + /* psock guaranteed to be present. */ + psock = smap_psock_sk(sock); + raw_spin_lock_bh(&stab->lock); + osock = stab->sock_map[i]; + if (osock && flags == BPF_NOEXIST) { + err = -EEXIST; + goto out_unlock; + } + if (!osock && flags == BPF_EXIST) { + err = -ENOENT; + goto out_unlock; + } - smap_list_map_remove(opsock, &stab->sock_map[i]); - smap_release_sock(opsock, osock); + e->entry = &stab->sock_map[i]; + e->map = map; + spin_lock_bh(&psock->maps_lock); + list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); + + stab->sock_map[i] = sock; + if (osock) { + psock = smap_psock_sk(osock); + smap_list_map_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, osock); } + raw_spin_unlock_bh(&stab->lock); + return 0; +out_unlock: + smap_release_sock(psock, sock); + raw_spin_unlock_bh(&stab->lock); out: + kfree(e); return err; } @@ -2350,7 +2367,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, b = __select_bucket(htab, hash); head = &b->head; - err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + err = __sock_map_ctx_update_elem(map, progs, sock, key); if (err) goto err; @@ -2376,8 +2393,7 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, } rcu_assign_pointer(e->hash_link, l_new); - rcu_assign_pointer(e->htab, - container_of(map, struct bpf_htab, map)); + e->map = map; spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); spin_unlock_bh(&psock->maps_lock); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca90679a7fe5..92246117d2b0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5844,27 +5844,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_redirect_map) { - /* Note, we cannot use prog directly as imm as subsequent - * rewrites would still change the prog pointer. The only - * stable address we can use is aux, which also works with - * prog clones during blinding. 
- */ - u64 addr = (unsigned long)prog->aux; - struct bpf_insn r4_ld[] = { - BPF_LD_IMM64(BPF_REG_4, addr), - *insn, - }; - cnt = ARRAY_SIZE(r4_ld); - - new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - } patch_call_imm: fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 4ddf61e158f6..9f8463afda9c 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -75,6 +75,7 @@ static void xsk_map_free(struct bpf_map *map) struct xsk_map *m = container_of(map, struct xsk_map, map); int i; + bpf_clear_redirect_map(map); synchronize_net(); for (i = 0; i < map->max_entries; i++) { diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 77ff1cd6a252..75568fcf2180 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -8,6 +8,32 @@ #include <linux/list.h> #include <linux/refcount.h> +#define TRACE_CGROUP_PATH_LEN 1024 +extern spinlock_t trace_cgroup_path_lock; +extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; + +/* + * cgroup_path() takes a spin lock. It is good practice not to take + * spin locks within trace point handlers, as they are mostly hidden + * from normal view. As cgroup_path() can take the kernfs_rename_lock + * spin lock, it is best to not call that function from the trace event + * handler. + * + * Note: trace_cgroup_##type##_enabled() is a static branch that will only + * be set when the trace event is enabled. + */ +#define TRACE_CGROUP_PATH(type, cgrp, ...) \ + do { \ + if (trace_cgroup_##type##_enabled()) { \ + spin_lock(&trace_cgroup_path_lock); \ + cgroup_path(cgrp, trace_cgroup_path, \ + TRACE_CGROUP_PATH_LEN); \ + trace_cgroup_##type(cgrp, trace_cgroup_path, \ + ##__VA_ARGS__); \ + spin_unlock(&trace_cgroup_path_lock); \ + } \ + } while (0) + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 8b4f0768efd6..51063e7a93c2 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -135,7 +135,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, &mgctx); if (!ret) - trace_cgroup_transfer_tasks(to, task, false); + TRACE_CGROUP_PATH(transfer_tasks, to, task, false); put_task_struct(task); } } while (task && !ret); @@ -865,7 +865,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) - trace_cgroup_rename(cgrp); + TRACE_CGROUP_PATH(rename, cgrp); mutex_unlock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 35cf3d71f8aa..aae10baf1902 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -83,6 +83,9 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif +DEFINE_SPINLOCK(trace_cgroup_path_lock); +char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; + /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. 
@@ -2638,7 +2641,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, cgroup_migrate_finish(&mgctx); if (!ret) - trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } @@ -4636,7 +4639,7 @@ static void css_release_work_fn(struct work_struct *work) struct cgroup *tcgrp; /* cgroup release path */ - trace_cgroup_release(cgrp); + TRACE_CGROUP_PATH(release, cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); @@ -4979,7 +4982,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) if (ret) goto out_destroy; - trace_cgroup_mkdir(cgrp); + TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(kn); @@ -5167,9 +5170,8 @@ int cgroup_rmdir(struct kernfs_node *kn) return 0; ret = cgroup_destroy_locked(cgrp); - if (!ret) - trace_cgroup_rmdir(cgrp); + TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b66aced5e8c2..933cb3e45b98 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,8 +14,8 @@ #include <asm/sections.h> /* vmcoreinfo stuff */ -static unsigned char *vmcoreinfo_data; -static size_t vmcoreinfo_size; +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ @@ -344,7 +344,7 @@ void crash_save_vmcoreinfo(void) if (vmcoreinfo_data_safecopy) vmcoreinfo_data = vmcoreinfo_data_safecopy; - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); update_vmcoreinfo_note(); } @@ -401,7 +401,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); #ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); + VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmap_area_list); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index d987dcd1bd56..286d82329eb0 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -178,7 +178,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * @dev: Pointer to device for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP flags to use for this allocation. + * @no_warn: Avoid printing message about failed allocation. * * This function allocates memory buffer for specified device. It uses * device specific contiguous memory area if available or the default @@ -186,12 +186,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * function. 
*/ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align, gfp_t gfp_mask) + unsigned int align, bool no_warn) { if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); + return cma_alloc(dev_get_cma_area(dev), count, align, no_warn); } /** diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c2860c5a9e96..1c35b7b945d0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -78,7 +78,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, gfp); + page = dma_alloc_from_contiguous(dev, count, page_order, + gfp & __GFP_NOWARN); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_release_from_contiguous(dev, page, count); page = NULL; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c187aa3df3c8..24a77c34e9ad 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 80f456ec5d89..2a62b96600ad 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1334,7 +1334,7 @@ static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { - return perf_event_pid_type(event, p, __PIDTYPE_TGID); + return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index aed1ba569954..3207a4d26849 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -299,8 +299,8 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Called with mm->mmap_sem held for write. * Return 0 (success) or a negative errno. 
*/ -int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, - uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; struct vm_area_struct *vma; @@ -351,7 +351,7 @@ put_old: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -366,7 +366,8 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); + return uprobe_write_opcode(auprobe, mm, vaddr, + *(uprobe_opcode_t *)&auprobe->insn); } static struct uprobe *get_uprobe(struct uprobe *uprobe) @@ -840,13 +841,8 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) return err; } -static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) -{ - consumer_add(uprobe, uc); - return register_for_each_vma(uprobe, uc); -} - -static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void +__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; @@ -860,24 +856,46 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u } /* - * uprobe_register - register a probe + * uprobe_unregister - unregister an already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +{ + struct uprobe *uprobe; + + uprobe = find_uprobe(inode, offset); + if (WARN_ON(!uprobe)) + return; + + down_write(&uprobe->register_rwsem); + __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister); + +/* + * __uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @uc: information on howto handle the probe.. * - * Apart from the access refcount, uprobe_register() takes a creation + * Apart from the access refcount, __uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of uprobe_register() is required to keep @inode + * unregisters. Caller of __uprobe_register() is required to keep @inode * (and the containing mount) referenced. 
* * Return errno if it cannot successully install probes * else return 0 (success) */ -int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +static int __uprobe_register(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; @@ -904,7 +922,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * down_write(&uprobe->register_rwsem); ret = -EAGAIN; if (likely(uprobe_is_active(uprobe))) { - ret = __uprobe_register(uprobe, uc); + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); if (ret) __uprobe_unregister(uprobe, uc); } @@ -915,6 +934,12 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * goto retry; return ret; } + +int uprobe_register(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc) +{ + return __uprobe_register(inode, offset, uc); +} EXPORT_SYMBOL_GPL(uprobe_register); /* @@ -946,27 +971,6 @@ int uprobe_apply(struct inode *inode, loff_t offset, return ret; } -/* - * uprobe_unregister - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. - * @uc: identify which probe if multiple probes are colocated. - */ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - struct uprobe *uprobe; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return; - - down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); -} -EXPORT_SYMBOL_GPL(uprobe_unregister); - static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { struct vm_area_struct *vma; diff --git a/kernel/exit.c b/kernel/exit.c index c3c7ac560114..0e21e6d21f35 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -73,6 +73,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) nr_threads--; detach_pid(p, PIDTYPE_PID); if (group_dead) { + detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -680,7 +681,8 @@ static void forget_original_parent(struct task_struct *father, t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, - SEND_SIG_NOINFO, t); + SEND_SIG_NOINFO, t, + PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to @@ -1001,14 +1003,6 @@ struct wait_opts { int notask_error; }; -static inline -struct pid *task_pid_type(struct task_struct *task, enum pid_type type) -{ - if (type != PIDTYPE_PID) - task = task->group_leader; - return task->pids[type].pid; -} - static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || diff --git a/kernel/fork.c b/kernel/fork.c index 33112315b5c0..d896e9ca38b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep; struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma; + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) vma_init(vma, mm); return vma; @@ -871,6 +872,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_MEMCG + tsk->active_memcg = NULL; +#endif return tsk; free_stack: @@ -1298,6 +1302,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->nvcsw = tsk->nivcsw = 0; #ifdef 
CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; + tsk->last_switch_time = 0; #endif tsk->mm = NULL; @@ -1422,7 +1427,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; atomic_set(&sig->count, 1); + spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + spin_unlock_irq(&current->sighand->siglock); return 0; } @@ -1484,6 +1491,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); + INIT_HLIST_HEAD(&sig->multiprocess); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); @@ -1577,10 +1585,22 @@ static void posix_cpu_timers_init(struct task_struct *tsk) static inline void posix_cpu_timers_init(struct task_struct *tsk) { } #endif +static inline void init_task_pid_links(struct task_struct *task) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&task->pid_links[type]); + } +} + static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { - task->pids[type].pid = pid; + if (type == PIDTYPE_PID) + task->thread_pid = pid; + else + task->signal->pids[type] = pid; } static inline void rcu_copy_process(struct task_struct *p) @@ -1618,6 +1638,7 @@ static __latent_entropy struct task_struct *copy_process( { int retval; struct task_struct *p; + struct multiprocess_signals delayed; /* * Don't allow sharing the root directory with processes in a different @@ -1665,6 +1686,24 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + /* + * Force any signals received before this point to be delivered + * before the fork happens. Collect up signals sent to multiple + * processes that happen during the fork and delay them so that + * they appear to happen after the fork. + */ + sigemptyset(&delayed.signal); + INIT_HLIST_NODE(&delayed.node); + + spin_lock_irq(&current->sighand->siglock); + if (!(clone_flags & CLONE_THREAD)) + hlist_add_head(&delayed.node, &current->signal->multiprocess); + recalc_sigpending(); + spin_unlock_irq(&current->sighand->siglock); + retval = -ERESTARTNOINTR; + if (signal_pending(current)) + goto fork_out; + retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) @@ -1938,29 +1977,26 @@ static __latent_entropy struct task_struct *copy_process( rseq_fork(p, clone_flags); - /* - * Process group and session signals need to be delivered to just the - * parent before the fork or both the parent and the child after the - * fork. Restart if a signal comes in before we add the new process to - * it's process group. - * A fatal signal pending means that current will exit, so the new - * thread can't slip out of an OOM kill (or normal SIGKILL).
- */ - recalc_sigpending(); - if (signal_pending(current)) { - retval = -ERESTARTNOINTR; - goto bad_fork_cancel_cgroup; - } + /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } + /* Let kill terminate clone/fork in the middle */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto bad_fork_cancel_cgroup; + } + + + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { + init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); @@ -1968,8 +2004,7 @@ static __latent_entropy struct task_struct *copy_process( ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } - - p->signal->leader_pid = pid; + p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same @@ -1980,6 +2015,7 @@ static __latent_entropy struct task_struct *copy_process( p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); + attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); @@ -1987,6 +2023,7 @@ static __latent_entropy struct task_struct *copy_process( current->signal->nr_threads++; atomic_inc(&current->signal->live); atomic_inc(&current->signal->sigcnt); + task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); list_add_tail_rcu(&p->thread_node, @@ -1995,8 +2032,8 @@ static __latent_entropy struct task_struct *copy_process( attach_pid(p, PIDTYPE_PID); nr_threads++; } - total_forks++; + hlist_del_init(&delayed.node); spin_unlock(&current->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); @@ -2061,16 +2098,19 @@ bad_fork_free: put_task_stack(p); free_task(p); fork_out: + spin_lock_irq(&current->sighand->siglock); + hlist_del_init(&delayed.node); + spin_unlock_irq(&current->sighand->siglock); return ERR_PTR(retval); } -static inline void init_idle_pids(struct pid_link *links) +static inline void init_idle_pids(struct task_struct *idle) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { - INIT_HLIST_NODE(&links[type].node); /* not really needed */ - links[type].pid = &init_struct_pid; + INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */ + init_task_pid(idle, type, &init_struct_pid); } } @@ -2080,7 +2120,7 @@ struct task_struct *fork_idle(int cpu) task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { - init_idle_pids(task->pids); + init_idle_pids(task); init_idle(task, cpu); } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 32b479468e4d..b9132d1269ef 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +/* + * Zero (default value) means use sysctl_hung_task_timeout_secs: + */ +unsigned long __read_mostly sysctl_hung_task_check_interval_secs; + int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; @@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long
timeout) if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; + t->last_switch_time = jiffies; return; } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return; trace_sched_process_hang(t); @@ -245,8 +253,13 @@ static int watchdog(void *dummy) for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; - long t = hung_timeout_jiffies(hung_last_checked, timeout); + unsigned long interval = sysctl_hung_task_check_interval_secs; + long t; + if (interval == 0) + interval = timeout; + interval = min_t(unsigned long, interval, timeout); + t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0)) check_hung_uninterruptible_tasks(timeout); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3a4656fb7047..5b77a7314e01 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -678,6 +678,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) if (!func->old_name || !func->new_func) return -EINVAL; + if (strlen(func->old_name) >= KSYM_NAME_LEN) + return -EINVAL; + INIT_LIST_HEAD(&func->stack_node); func->patched = false; func->transition = false; @@ -751,6 +754,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) if (!obj->funcs) return -EINVAL; + if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) + return -EINVAL; + obj->patched = false; obj->mod = NULL; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 7c6631e693bc..5bc349805e03 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -310,13 +310,6 @@ static bool klp_try_switch_task(struct task_struct *task) return true; /* - * For arches which don't have reliable stack traces, we have to rely - * on other methods (e.g., switching tasks at kernel exit). - */ - if (!klp_have_reliable_stack()) - return false; - - /* * Now try to check the stack for any to-be-patched or to-be-unpatched * functions. If all goes well, switch the task to the target patch * state. 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5fa4d3138bf1..e406c5fdb41e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@ #include "lockdep_internals.h" +#include <trace/events/preemptirq.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> @@ -248,12 +249,7 @@ void clear_lock_stats(struct lock_class *class) static struct lock_class_stats *get_lock_stats(struct lock_class *class) { - return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; -} - -static void put_lock_stats(struct lock_class_stats *stats) -{ - put_cpu_var(cpu_lock_stats); + return &this_cpu_ptr(cpu_lock_stats)[class - lock_classes]; } static void lock_release_holdtime(struct held_lock *hlock) @@ -271,7 +267,6 @@ static void lock_release_holdtime(struct held_lock *hlock) lock_time_inc(&stats->read_holdtime, holdtime); else lock_time_inc(&stats->write_holdtime, holdtime); - put_lock_stats(stats); } #else static inline void lock_release_holdtime(struct held_lock *hlock) @@ -2845,10 +2840,8 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -__visible void trace_hardirqs_on_caller(unsigned long ip) +void lockdep_hardirqs_on(unsigned long ip) { - time_hardirqs_on(CALLER_ADDR0, ip); - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2887,23 +2880,14 @@ __visible void trace_hardirqs_on_caller(unsigned long ip) __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; } -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -void trace_hardirqs_on(void) -{ - trace_hardirqs_on_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -__visible void trace_hardirqs_off_caller(unsigned long ip) +void lockdep_hardirqs_off(unsigned long ip) { struct task_struct *curr = current; - time_hardirqs_off(CALLER_ADDR0, ip); - if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2925,13 +2909,6 @@ __visible void trace_hardirqs_off_caller(unsigned long ip) } else debug_atomic_inc(redundant_hardirqs_off); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -void trace_hardirqs_off(void) -{ - trace_hardirqs_off_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_off); /* * Softirqs will be enabled: @@ -4090,7 +4067,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; - put_lock_stats(stats); } static void @@ -4138,7 +4114,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) } if (lock->cpu != cpu) stats->bounces[bounce_acquired + !!hlock->read]++; - put_lock_stats(stats); lock->cpu = cpu; lock->ip = ip; @@ -4338,7 +4313,7 @@ out_restore: raw_local_irq_restore(flags); } -void __init lockdep_info(void) +void __init lockdep_init(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); diff --git a/kernel/memremap.c b/kernel/memremap.c index 38283363da06..5b8600d39931 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/pfn_t.h> #include <linux/io.h> +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> #include <linux/swap.h> @@ -42,7 +43,7 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff) pgoff += 1UL << order, order = order_at((res), pgoff)) #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -int device_private_entry_fault(struct vm_area_struct *vma, +vm_fault_t 
device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, swp_entry_t entry, unsigned int flags, @@ -137,6 +138,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_begin(); arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? &pgmap->altmap : NULL); + kasan_remove_zero_shadow(__va(align_start), align_size); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); @@ -239,6 +241,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_pfn_remap; mem_hotplug_begin(); + error = kasan_add_zero_shadow(__va(align_start), align_size); + if (error) { + mem_hotplug_done(); + goto err_kasan; + } + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], @@ -267,6 +275,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) return __va(res->start); err_add_memory: + kasan_remove_zero_shadow(__va(align_start), align_size); + err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: @@ -355,7 +365,6 @@ void __put_devmap_managed_page(struct page *page) __ClearPageActive(page); __ClearPageWaiters(page); - page->mapping = NULL; mem_cgroup_uncharge(page); page->pgmap->page_free(page, page->pgmap->data); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 915e123a430f..79c9be2dbbe9 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,4 +9,27 @@ * 2 of the Licence, or (at your option) any later version. */ -extern int mod_verify_sig(const void *mod, unsigned long *_modlen); +#include <linux/elf.h> +#include <asm/module.h> + +struct load_info { + const char *name; + /* pointer to module in temporary copy, freed at end of load_module() */ + struct module *mod; + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + +extern int mod_verify_sig(const void *mod, struct load_info *info); diff --git a/kernel/module.c b/kernel/module.c index a7615d661910..6746c85511fe 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -307,24 +307,6 @@ int unregister_module_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_module_notifier); -struct load_info { - const char *name; - Elf_Ehdr *hdr; - unsigned long len; - Elf_Shdr *sechdrs; - char *secstrings, *strtab; - unsigned long symoffs, stroffs; - struct _ddebug *debug; - unsigned int num_debug; - bool sig_ok; -#ifdef CONFIG_KALLSYMS - unsigned long mod_kallsyms_init_off; -#endif - struct { - unsigned int sym, str, mod, vers, info, pcpu; - } index; -}; - /* * We require a truly strong try_module_get(): 0 means success. 
* Otherwise an error is returned due to ongoing or failed @@ -547,12 +529,30 @@ static bool check_symbol(const struct symsearch *syms, return true; } +static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +static const char *kernel_symbol_name(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return offset_to_ptr(&sym->name_offset); +#else + return sym->name; +#endif +} + static int cmp_name(const void *va, const void *vb) { const char *a; const struct kernel_symbol *b; a = va; b = vb; - return strcmp(a, b->name); + return strcmp(a, kernel_symbol_name(b)); } static bool find_symbol_in_section(const struct symsearch *syms, @@ -1339,14 +1339,12 @@ static inline int check_modstruct_version(const struct load_info *info, * locking is necessary -- use preempt_disable() to placate lockdep. */ preempt_disable(); - if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, - &crc, true, false)) { + if (!find_symbol("module_layout", NULL, &crc, true, false)) { preempt_enable(); BUG(); } preempt_enable(); - return check_version(info, VMLINUX_SYMBOL_STR(module_layout), - mod, crc); + return check_version(info, "module_layout", mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -2059,21 +2057,19 @@ static int copy_module_elf(struct module *mod, struct load_info *info) /* Elf section header table */ size = sizeof(*info->sechdrs) * info->hdr->e_shnum; - mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL); + mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL); if (mod->klp_info->sechdrs == NULL) { ret = -ENOMEM; goto free_info; } - memcpy(mod->klp_info->sechdrs, info->sechdrs, size); /* Elf section name string table */ size = info->sechdrs[info->hdr->e_shstrndx].sh_size; - mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL); + mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL); if (mod->klp_info->secstrings == NULL) { ret = -ENOMEM; goto free_sechdrs; } - memcpy(mod->klp_info->secstrings, info->secstrings, size); /* Elf symbol section index */ symndx = info->index.sym; @@ -2192,7 +2188,7 @@ void *__symbol_get(const char *symbol) sym = NULL; preempt_enable(); - return sym ? (void *)sym->value : NULL; + return sym ? (void *)kernel_symbol_value(sym) : NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -2222,10 +2218,12 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + if (find_symbol(kernel_symbol_name(s), &owner, NULL, + true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", - mod->name, s->name, module_name(owner)); + mod->name, kernel_symbol_name(s), + module_name(owner)); return -ENOEXEC; } } @@ -2274,7 +2272,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. 
*/ if (ksym && !IS_ERR(ksym)) { - sym[i].st_value = ksym->value; + sym[i].st_value = kernel_symbol_value(ksym); break; } @@ -2282,9 +2280,9 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - pr_warn("%s: Unknown symbol %s (err %li)\n", - mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; + pr_warn("%s: Unknown symbol %s (err %d)\n", + mod->name, name, ret); break; default: @@ -2486,7 +2484,11 @@ static char *get_modinfo(struct load_info *info, const char *tag) Elf_Shdr *infosec = &info->sechdrs[info->index.info]; unsigned long size = infosec->sh_size; - for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { + /* + * get_modinfo() calls made before rewrite_section_headers() + * must use sh_offset, as sh_addr isn't set! + */ + for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } @@ -2534,7 +2536,7 @@ static int is_exported(const char *name, unsigned long value, ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); else ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); - return ks != NULL && ks->value == value; + return ks != NULL && kernel_symbol_value(ks) == value; } /* As per nm */ @@ -2774,7 +2776,7 @@ static int module_sig_check(struct load_info *info, int flags) memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; - err = mod_verify_sig(mod, &info->len); + err = mod_verify_sig(mod, info); } if (!err) { @@ -2926,17 +2928,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) } /* Track but don't keep modinfo and version sections. */ - if (flags & MODULE_INIT_IGNORE_MODVERSIONS) - info->index.vers = 0; /* Pretend no __versions section! */ - else - info->index.vers = find_sec(info, "__versions"); info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; - - info->index.info = find_sec(info, ".modinfo"); - if (!info->index.info) - info->name = "(missing .modinfo section)"; - else - info->name = get_modinfo(info, "name"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; return 0; @@ -2947,23 +2939,24 @@ static int rewrite_section_headers(struct load_info *info, int flags) * search for module section index etc), and do some basic section * verification. * - * Return the temporary module pointer (we'll replace it with the final - * one when we move the module sections around). + * Set info->mod to the temporary copy of the module in info->hdr. The final one + * will be allocated in move_module(). */ -static struct module *setup_load_info(struct load_info *info, int flags) +static int setup_load_info(struct load_info *info, int flags) { unsigned int i; - int err; - struct module *mod; /* Set up the convenience variables */ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; info->secstrings = (void *)info->hdr + info->sechdrs[info->hdr->e_shstrndx].sh_offset; - err = rewrite_section_headers(info, flags); - if (err) - return ERR_PTR(err); + /* Try to find a name early so we can log errors with a module name */ + info->index.info = find_sec(info, ".modinfo"); + if (!info->index.info) + info->name = "(missing .modinfo section)"; + else + info->name = get_modinfo(info, "name"); /* Find internal symbols and strings. 
*/ for (i = 1; i < info->hdr->e_shnum; i++) { @@ -2976,34 +2969,35 @@ static struct module *setup_load_info(struct load_info *info, int flags) } } + if (info->index.sym == 0) { + pr_warn("%s: module has no symbols (stripped?)\n", info->name); + return -ENOEXEC; + } + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { pr_warn("%s: No module found in object\n", info->name ?: "(missing .modinfo name field)"); - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; } /* This is temporary: point mod into copy of data. */ - mod = (void *)info->sechdrs[info->index.mod].sh_addr; + info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset; /* - * If we didn't load the .modinfo 'name' field, fall back to + * If we didn't load the .modinfo 'name' field earlier, fall back to * on-disk struct mod 'name' field. */ if (!info->name) - info->name = mod->name; + info->name = info->mod->name; - if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", info->name); - return ERR_PTR(-ENOEXEC); - } + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) + info->index.vers = 0; /* Pretend no __versions section! */ + else + info->index.vers = find_sec(info, "__versions"); info->index.pcpu = find_pcpusec(info); - /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info, mod)) - return ERR_PTR(-ENOEXEC); - - return mod; + return 0; } static int check_modinfo(struct module *mod, struct load_info *info, int flags) @@ -3298,25 +3292,17 @@ core_param(module_blacklist, module_blacklist, charp, 0400); static struct module *layout_and_allocate(struct load_info *info, int flags) { - /* Module within temporary copy. */ struct module *mod; unsigned int ndx; int err; - mod = setup_load_info(info, flags); - if (IS_ERR(mod)) - return mod; - - if (blacklisted(info->name)) - return ERR_PTR(-EPERM); - - err = check_modinfo(mod, info, flags); + err = check_modinfo(info->mod, info, flags); if (err) return ERR_PTR(err); /* Allow arches to frob section contents and sizes. */ err = module_frob_arch_sections(info->hdr, info->sechdrs, - info->secstrings, mod); + info->secstrings, info->mod); if (err < 0) return ERR_PTR(err); @@ -3335,11 +3321,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. */ - layout_sections(mod, info); - layout_symtab(mod, info); + layout_sections(info->mod, info); + layout_symtab(info->mod, info); /* Allocate and move to the final place */ - err = move_module(mod, info); + err = move_module(info->mod, info); if (err) return ERR_PTR(err); @@ -3657,17 +3643,36 @@ static int load_module(struct load_info *info, const char __user *uargs, int flags) { struct module *mod; - long err; + long err = 0; char *after_dashes; + err = elf_header_check(info); + if (err) + goto free_copy; + + err = setup_load_info(info, flags); + if (err) + goto free_copy; + + if (blacklisted(info->name)) { + err = -EPERM; + goto free_copy; + } + err = module_sig_check(info, flags); if (err) goto free_copy; - err = elf_header_check(info); + err = rewrite_section_headers(info, flags); if (err) goto free_copy; + /* Check module struct version now, before we try to use module. */ + if (!check_modstruct_version(info, info->mod)) { + err = -ENOEXEC; + goto free_copy; + } + /* Figure out module layout, and allocate all the memory. 
*/ mod = layout_and_allocate(info, flags); if (IS_ERR(mod)) { @@ -4067,7 +4072,7 @@ static unsigned long mod_find_symname(struct module *mod, const char *name) for (i = 0; i < kallsyms->num_symtab; i++) if (strcmp(name, symname(kallsyms, i)) == 0 && - kallsyms->symtab[i].st_info != 'U') + kallsyms->symtab[i].st_shndx != SHN_UNDEF) return kallsyms->symtab[i].st_value; return 0; } @@ -4113,6 +4118,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, if (mod->state == MODULE_STATE_UNFORMED) continue; for (i = 0; i < kallsyms->num_symtab; i++) { + + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) + continue; + ret = fn(data, symname(kallsyms, i), mod, kallsyms->symtab[i].st_value); if (ret != 0) diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 937c844bee4a..f2075ce8e4b3 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -45,10 +45,10 @@ struct module_signature { /* * Verify the signature on a module. */ -int mod_verify_sig(const void *mod, unsigned long *_modlen) +int mod_verify_sig(const void *mod, struct load_info *info) { struct module_signature ms; - size_t modlen = *_modlen, sig_len; + size_t sig_len, modlen = info->len; pr_devel("==>%s(,%zu)\n", __func__, modlen); @@ -62,10 +62,11 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) if (sig_len >= modlen) return -EBADMSG; modlen -= sig_len; - *_modlen = modlen; + info->len = modlen; if (ms.id_type != PKEY_ID_PKCS7) { - pr_err("Module is not signed with expected PKCS#7 message\n"); + pr_err("%s: Module is not signed with expected PKCS#7 message\n", + info->name); return -ENOPKG; } @@ -76,7 +77,8 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) ms.__pad[0] != 0 || ms.__pad[1] != 0 || ms.__pad[2] != 0) { - pr_err("PKCS#7 signature info has unexpected non-zero params\n"); + pr_err("%s: PKCS#7 signature info has unexpected non-zero params\n", + info->name); return -EBADMSG; } diff --git a/kernel/pid.c b/kernel/pid.c index 157fe4b19971..de1cfc4f75a2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -265,27 +265,33 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); +static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) +{ + return (type == PIDTYPE_PID) ? + &task->thread_pid : + &task->signal->pids[type]; +} + /* * attach_pid() must be called with the tasklist_lock write-held. 
*/ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid_link *link = &task->pids[type]; - hlist_add_head_rcu(&link->node, &link->pid->tasks[type]); + struct pid *pid = *task_pid_ptr(task, type); + hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { - struct pid_link *link; + struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; - link = &task->pids[type]; - pid = link->pid; + pid = *pid_ptr; - hlist_del_rcu(&link->node); - link->pid = new; + hlist_del_rcu(&task->pid_links[type]); + *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (!hlist_empty(&pid->tasks[tmp])) @@ -310,8 +316,9 @@ void change_pid(struct task_struct *task, enum pid_type type, void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { - new->pids[type].pid = old->pids[type].pid; - hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); + if (type == PIDTYPE_PID) + new->thread_pid = old->thread_pid; + hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) @@ -322,7 +329,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), lockdep_tasklist_lock_is_held()); if (first) - result = hlist_entry(first, struct task_struct, pids[(type)].node); + result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } @@ -360,9 +367,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); - if (type != PIDTYPE_PID) - task = task->group_leader; - pid = get_pid(rcu_dereference(task->pids[type].pid)); + pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } @@ -420,15 +425,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) { - if (type == __PIDTYPE_TGID) - type = PIDTYPE_PID; - - task = task->group_leader; - } - nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); - } + if (likely(pid_alive(task))) + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e880ca22c5a5..3a6c2f87699e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -105,6 +105,7 @@ config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS select PM + select SRCU config PM_SLEEP_SMP def_bool y diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 90b6ab01db59..924e37fb1620 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -66,6 +66,9 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); +EXPORT_SYMBOL(ignore_console_lock_warning); + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -2788,7 +2791,8 @@ EXPORT_SYMBOL(unregister_console); void __init console_init(void) { int ret; - initcall_t *call; + initcall_t call; + initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); @@ -2797,13 +2801,14 @@ void __init console_init(void) * set up the console device so that later boot sequences can * inform about problems etc.. 
*/ - call = __con_initcall_start; + ce = __con_initcall_start; trace_initcall_level("console"); - while (call < __con_initcall_end) { - trace_initcall_start((*call)); - ret = (*call)(); - trace_initcall_finish((*call), ret); - call++; + while (ce < __con_initcall_end) { + call = initcall_from_entry(ce); + trace_initcall_start(call); + ret = call(); + trace_initcall_finish(call, ret); + ce++; } } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 454adf9f8180..625bc9897f62 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2774,6 +2774,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); + + calculate_sigpending(); } /* @@ -3159,7 +3161,7 @@ static inline void sched_tick_stop(int cpu) { } #endif #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) + defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count * then we just disabled preemption. Start timing the latency. diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1a3e9bddd17b..16f84142f2f4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -190,7 +190,7 @@ static void cpuidle_idle_call(void) */ next_state = cpuidle_select(drv, dev, &stop_tick); - if (stop_tick) + if (stop_tick || tick_nohz_tick_stopped()) tick_nohz_idle_stop_tick(); else tick_nohz_idle_retain_tick(); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 870f97b313e3..5dd47f1103d1 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -69,6 +69,8 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, wait_queue_entry_t *curr, *next; int cnt = 0; + lockdep_assert_held(&wq_head->lock); + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { curr = list_next_entry(bookmark, entry); diff --git a/kernel/signal.c b/kernel/signal.c index 8d8a940422a8..5843c541fda9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -65,14 +65,14 @@ static void __user *sig_handler(struct task_struct *t, int sig) return t->sighand->action[sig - 1].sa.sa_handler; } -static int sig_handler_ignored(void __user *handler, int sig) +static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? */ return handler == SIG_IGN || - (handler == SIG_DFL && sig_kernel_ignore(sig)); + (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, bool force) +static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; @@ -80,12 +80,12 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) - return 1; + return true; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, bool force) +static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -93,7 +93,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) - return 0; + return false; /* * Tracers may want to know about even ignored signal unless it @@ -101,7 +101,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * by SIGNAL_UNKILLABLE task. 
*/ if (t->ptrace && sig != SIGKILL) - return 0; + return false; return sig_task_ignored(t, sig, force); } @@ -110,7 +110,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. */ -static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; @@ -138,20 +138,21 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) -static int recalc_sigpending_tsk(struct task_struct *t) +static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); - return 1; + return true; } + /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ - return 0; + return false; } /* @@ -172,6 +173,17 @@ void recalc_sigpending(void) } +void calculate_sigpending(void) +{ + /* Have any signals or users of TIF_SIGPENDING been delayed + * until after fork? + */ + spin_lock_irq(¤t->sighand->siglock); + set_tsk_thread_flag(current, TIF_SIGPENDING); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); +} + /* Given the mask, find the first available signal that should be serviced. */ #define SYNCHRONOUS_MASK \ @@ -362,6 +374,20 @@ static bool task_participate_group_stop(struct task_struct *task) return false; } +void task_join_group_stop(struct task_struct *task) +{ + /* Have the new thread join an on-going signal group stop */ + unsigned long jobctl = current->jobctl; + if (jobctl & JOBCTL_STOP_PENDING) { + struct signal_struct *sig = current->signal; + unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; + unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; + if (task_set_jobctl_pending(task, signr | gstop)) { + sig->group_stop_count++; + } + } +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an @@ -504,13 +530,15 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } -int unhandled_signal(struct task_struct *tsk, int sig) +bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) - return 1; + return true; + if (handler != SIG_IGN && handler != SIG_DFL) - return 0; + return false; + /* if ptraced, let the tracer determine */ return !tsk->ptrace; } @@ -684,14 +712,14 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) * * All callers must be holding the siglock. 
*/ -static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) +static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) - return 0; + return; sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { @@ -700,7 +728,6 @@ static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) __sigqueue_free(q); } } - return 1; } static inline int is_si_special(const struct siginfo *info) @@ -717,21 +744,16 @@ static inline bool si_fromuser(const struct siginfo *info) /* * called with RCU read lock from check_kill_permission() */ -static int kill_ok_by_cred(struct task_struct *t) +static bool kill_ok_by_cred(struct task_struct *t) { const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); - if (uid_eq(cred->euid, tcred->suid) || - uid_eq(cred->euid, tcred->uid) || - uid_eq(cred->uid, tcred->suid) || - uid_eq(cred->uid, tcred->uid)) - return 1; - - if (ns_capable(tcred->user_ns, CAP_KILL)) - return 1; - - return 0; + return uid_eq(cred->euid, tcred->suid) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->uid, tcred->suid) || + uid_eq(cred->uid, tcred->uid) || + ns_capable(tcred->user_ns, CAP_KILL); } /* @@ -882,20 +904,24 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ -static inline int wants_signal(int sig, struct task_struct *p) +static inline bool wants_signal(int sig, struct task_struct *p) { if (sigismember(&p->blocked, sig)) - return 0; + return false; + if (p->flags & PF_EXITING) - return 0; + return false; + if (sig == SIGKILL) - return 1; + return true; + if (task_is_stopped_or_traced(p)) - return 0; + return false; + return task_curr(p) || !signal_pending(p); } -static void complete_signal(int sig, struct task_struct *p, int group) +static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -908,7 +934,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) */ if (wants_signal(sig, p)) t = p; - else if (!group || thread_group_empty(p)) + else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. @@ -971,7 +997,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) return; } -static inline int legacy_queue(struct sigpending *signals, int sig) +static inline bool legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } @@ -998,7 +1024,7 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str #endif static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group, int from_ancestor_ns) + enum pid_type type, int from_ancestor_ns) { struct sigpending *pending; struct sigqueue *q; @@ -1012,7 +1038,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; - pending = group ? &t->signal->shared_pending : &t->pending; + pending = (type != PIDTYPE_PID) ? 
&t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more @@ -1096,14 +1122,29 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); + + /* Let multiprocess signals appear after on-going forks */ + if (type > PIDTYPE_TGID) { + struct multiprocess_signals *delayed; + hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { + sigset_t *signal = &delayed->signal; + /* Can't queue both a stop and a continue signal */ + if (sig == SIGCONT) + sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); + else if (sig_kernel_stop(sig)) + sigdelset(signal, SIGCONT); + sigaddset(signal, sig); + } + } + + complete_signal(sig, t, type); ret: - trace_signal_generate(sig, info, t, group, result); + trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group) + enum pid_type type) { int from_ancestor_ns = 0; @@ -1112,7 +1153,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif - return __send_signal(sig, info, t, group, from_ancestor_ns); + return __send_signal(sig, info, t, type, from_ancestor_ns); } static void print_fatal_signal(int signr) @@ -1151,23 +1192,23 @@ __setup("print-fatal-signals=", setup_print_fatal_signals); int __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - return send_signal(sig, info, p, 1); + return send_signal(sig, info, p, PIDTYPE_TGID); } static int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) { - return send_signal(sig, info, t, 0); + return send_signal(sig, info, t, PIDTYPE_PID); } int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, - bool group) + enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { - ret = send_signal(sig, info, p, group); + ret = send_signal(sig, info, p, type); unlock_task_sighand(p, &flags); } @@ -1274,7 +1315,8 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, /* * send signal info to all the members of a group */ -int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + enum pid_type type) { int ret; @@ -1283,7 +1325,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) rcu_read_unlock(); if (!ret && sig) - ret = do_send_sig_info(sig, info, p, true); + ret = do_send_sig_info(sig, info, p, type); return ret; } @@ -1301,7 +1343,7 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) success = 0; retval = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - int err = group_send_sig_info(sig, info, p); + int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); success |= !err; retval = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); @@ -1317,7 +1359,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) - error = group_send_sig_info(sig, info, p); + error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; @@ -1339,14 +1381,15 @@ static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -static 
int kill_as_cred_perm(const struct cred *cred, - struct task_struct *target) +static inline bool kill_as_cred_perm(const struct cred *cred, + struct task_struct *target) { const struct cred *pcred = __task_cred(target); - if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && - !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) - return 0; - return 1; + + return uid_eq(cred->euid, pcred->suid) || + uid_eq(cred->euid, pcred->uid) || + uid_eq(cred->uid, pcred->suid) || + uid_eq(cred->uid, pcred->uid); } /* like kill_pid_info(), but doesn't use uid/euid of "current" */ @@ -1376,7 +1419,7 @@ int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, info, p, 1, 0); + ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -1420,7 +1463,8 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { - int err = group_send_sig_info(sig, info, p); + int err = group_send_sig_info(sig, info, p, + PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; @@ -1446,7 +1490,7 @@ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - return do_send_sig_info(sig, info, p, false); + return do_send_sig_info(sig, info, p, PIDTYPE_PID); } #define __si_special(priv) \ @@ -1458,8 +1502,7 @@ send_sig(int sig, struct task_struct *p, int priv) return send_sig_info(sig, __si_special(priv), p); } -void -force_sig(int sig, struct task_struct *p) +void force_sig(int sig, struct task_struct *p) { force_sig_info(sig, SEND_SIG_PRIV, p); } @@ -1470,8 +1513,7 @@ force_sig(int sig, struct task_struct *p) * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ -int -force_sigsegv(int sig, struct task_struct *p) +void force_sigsegv(int sig, struct task_struct *p) { if (sig == SIGSEGV) { unsigned long flags; @@ -1480,7 +1522,6 @@ force_sigsegv(int sig, struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } force_sig(SIGSEGV, p); - return 0; } int force_sig_fault(int sig, int code, void __user *addr @@ -1664,17 +1705,20 @@ void sigqueue_free(struct sigqueue *q) __sigqueue_free(q); } -int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) +int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) { int sig = q->info.si_signo; struct sigpending *pending; + struct task_struct *t; unsigned long flags; int ret, result; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); ret = -1; - if (!likely(lock_task_sighand(t, &flags))) + rcu_read_lock(); + t = pid_task(pid, type); + if (!t || !likely(lock_task_sighand(t, &flags))) goto ret; ret = 1; /* the signal is ignored */ @@ -1696,15 +1740,16 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) q->info.si_overrun = 0; signalfd_notify(t, sig); - pending = group ? &t->signal->shared_pending : &t->pending; + pending = (type != PIDTYPE_PID) ? 
&t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); - complete_signal(sig, t, group); + complete_signal(sig, t, type); result = TRACE_SIGNAL_DELIVERED; out: - trace_signal_generate(sig, &q->info, t, group, result); + trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); ret: + rcu_read_unlock(); return ret; } @@ -1877,10 +1922,10 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, spin_unlock_irqrestore(&sighand->siglock, flags); } -static inline int may_ptrace_stop(void) +static inline bool may_ptrace_stop(void) { if (!likely(current->ptrace)) - return 0; + return false; /* * Are we in the middle of do_coredump? * If so and our tracer is also part of the coredump stopping @@ -1896,19 +1941,19 @@ static inline int may_ptrace_stop(void) */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) - return 0; + return false; - return 1; + return true; } /* * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ -static int sigkill_pending(struct task_struct *tsk) +static bool sigkill_pending(struct task_struct *tsk) { - return sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL); + return sigismember(&tsk->pending.signal, SIGKILL) || + sigismember(&tsk->signal->shared_pending.signal, SIGKILL); } /* @@ -2288,7 +2333,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal(struct ksignal *ksig) +bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2298,7 +2343,7 @@ int get_signal(struct ksignal *ksig) task_work_run(); if (unlikely(uprobe_deny_signal())) - return 0; + return false; /* * Do this once, we can't return to user-mode if freezing() == T. @@ -2755,7 +2800,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, } #endif -static int do_sigpending(sigset_t *set) +static void do_sigpending(sigset_t *set) { spin_lock_irq(¤t->sighand->siglock); sigorsets(set, ¤t->pending.signal, @@ -2764,7 +2809,6 @@ static int do_sigpending(sigset_t *set) /* Outside the lock because only this thread touches it. */ sigandsets(set, ¤t->blocked, set); - return 0; } /** @@ -2776,15 +2820,16 @@ static int do_sigpending(sigset_t *set) SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; - int err; if (sigsetsize > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err && copy_to_user(uset, &set, sigsetsize)) - err = -EFAULT; - return err; + do_sigpending(&set); + + if (copy_to_user(uset, &set, sigsetsize)) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT @@ -2792,15 +2837,13 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; - int err; if (sigsetsize > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err) - err = put_compat_sigset(uset, &set, sigsetsize); - return err; + do_sigpending(&set); + + return put_compat_sigset(uset, &set, sigsetsize); } #endif @@ -3193,7 +3236,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) * probe. No signal is actually delivered. 
*/ if (!error && sig) { - error = do_send_sig_info(sig, info, p, false); + error = do_send_sig_info(sig, info, p, PIDTYPE_PID); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. The window is tiny, @@ -3562,25 +3605,26 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { sigset_t set; - int err; if (sizeof(old_sigset_t) > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) - err = -EFAULT; - return err; + do_sigpending(&set); + + if (copy_to_user(uset, &set, sizeof(old_sigset_t))) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; - int err = do_sigpending(&set); - if (!err) - err = put_user(set.sig[0], set32); - return err; + + do_sigpending(&set); + + return put_user(set.sig[0], set32); } #endif @@ -3651,25 +3695,23 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; - int ret = -EINVAL; + int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) - goto out; + return -EINVAL; - if (act) { - if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) - return -EFAULT; - } + if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) + return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); + if (ret) + return ret; - if (!ret && oact) { - if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) - return -EFAULT; - } -out: - return ret; + if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, @@ -3960,7 +4002,7 @@ void kdb_send_sig(struct task_struct *t, int sig) "the deadlock.\n"); return; } - ret = send_signal(sig, SEND_SIG_PRIV, t, false); + ret = send_signal(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", diff --git a/kernel/sys.c b/kernel/sys.c index e27b51d3facd..cf5c67533ff1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1237,18 +1237,19 @@ static int override_release(char __user *release, size_t len) SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { - int errno = 0; + struct new_utsname tmp; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - errno = -EFAULT; + memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!errno && override_release(name->release, sizeof(name->release))) - errno = -EFAULT; - if (!errno && override_architecture(name)) - errno = -EFAULT; - return errno; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + if (override_architecture(name)) + return -EFAULT; + return 0; } #ifdef __ARCH_WANT_SYS_OLD_UNAME @@ -1257,55 +1258,46 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) */ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) { - int error = 0; + struct old_utsname tmp; if (!name) return -EFAULT; down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof(*name))) - error = -EFAULT; + memcpy(&tmp, utsname(), sizeof(tmp)); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - if (!error 
&& override_architecture(name)) - error = -EFAULT; - return error; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + if (override_architecture(name)) + return -EFAULT; + return 0; } SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) { - int error; + struct oldold_utsname tmp = {}; if (!name) return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); + memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN); + memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN); + memcpy(&tmp.release, &utsname()->release, __OLD_UTS_LEN); + memcpy(&tmp.version, &utsname()->version, __OLD_UTS_LEN); + memcpy(&tmp.machine, &utsname()->machine, __OLD_UTS_LEN); up_read(&uts_sem); + if (copy_to_user(name, &tmp, sizeof(tmp))) + return -EFAULT; - if (!error && override_architecture(name)) - error = -EFAULT; - if (!error && override_release(name->release, sizeof(name->release))) - error = -EFAULT; - return error ? -EFAULT : 0; + if (override_architecture(name)) + return -EFAULT; + if (override_release(name->release, sizeof(name->release))) + return -EFAULT; + return 0; } #endif @@ -1319,17 +1311,18 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); + struct new_utsname *u; + down_write(&uts_sem); + u = utsname(); memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; uts_proc_notify(UTS_PROC_HOSTNAME); + up_write(&uts_sem); } - up_write(&uts_sem); return errno; } @@ -1337,8 +1330,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) { - int i, errno; + int i; struct new_utsname *u; + char tmp[__NEW_UTS_LEN + 1]; if (len < 0) return -EINVAL; @@ -1347,11 +1341,11 @@ SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) i = 1 + strlen(u->nodename); if (i > len) i = len; - errno = 0; - if (copy_to_user(name, u->nodename, i)) - errno = -EFAULT; + memcpy(tmp, u->nodename, i); up_read(&uts_sem); - return errno; + if (copy_to_user(name, tmp, i)) + return -EFAULT; + return 0; } #endif @@ -1370,17 +1364,18 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; - down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - struct new_utsname *u = utsname(); + struct new_utsname *u; + down_write(&uts_sem); + u = utsname(); memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; uts_proc_notify(UTS_PROC_DOMAINNAME); + up_write(&uts_sem); } - up_write(&uts_sem); return errno; } diff --git a/kernel/sysctl.c 
b/kernel/sysctl.c index f22f76b7a138..cc02050fd0c4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -145,7 +145,10 @@ static int minolduid; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ #ifdef CONFIG_DETECT_HUNG_TASK static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #endif @@ -222,7 +225,7 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ -/* Note: sysrq code uses it's own private copy */ +/* Note: sysrq code uses its own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(struct ctl_table *table, int write, @@ -1091,6 +1094,14 @@ static struct ctl_table kern_table[] = { .extra2 = &hung_task_timeout_max, }, { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, .maxlen = sizeof(int), @@ -1797,6 +1808,24 @@ static struct ctl_table fs_table[] = { .extra2 = &one, }, { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), @@ -1965,13 +1994,13 @@ static void warn_sysctl_write(struct ctl_table *table) } /** - * proc_first_pos_non_zero_ignore - check if firs position is allowed + * proc_first_pos_non_zero_ignore - check if first position is allowed * @ppos: file position * @table: the sysctl table * * Returns true if the first position is non-zero and the sysctl_writes_strict * mode indicates this is not allowed for numeric input types. String proc - * hadlers can ignore the return value. + * handlers can ignore the return value. */ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, struct ctl_table *table) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index f26acef5d7b4..9a65713c8309 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -139,9 +139,10 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) { struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + struct pid *leader_pid = sig->pids[PIDTYPE_TGID]; - trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); - kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); + trace_itimer_expire(ITIMER_REAL, leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid); return HRTIMER_NORESTART; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 294d7b65af33..ce32cf741b25 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -894,7 +894,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, trace_itimer_expire(signo == SIGPROF ? 
ITIMER_PROF : ITIMER_VIRTUAL, - tsk->signal->leader_pid, cur_time); + task_tgid(tsk), cur_time); __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index f23cc46ecf3e..4b9127e95430 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -333,8 +333,8 @@ void posixtimer_rearm(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { - struct task_struct *task; - int shared, ret = -1; + enum pid_type type; + int ret = -1; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). @@ -348,13 +348,8 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - rcu_read_lock(); - task = pid_task(timr->it_pid, PIDTYPE_PID); - if (task) { - shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); - ret = send_sigqueue(timr->sigq, task, shared); - } - rcu_read_unlock(); + type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; + ret = send_sigqueue(timr->sigq, timr->it_pid, type); /* If we failed to send the signal the timer stops. */ return ret > 0; } @@ -433,11 +428,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) static struct pid *good_sigevent(sigevent_t * event) { - struct task_struct *rtn = current->group_leader; + struct pid *pid = task_tgid(current); + struct task_struct *rtn; switch (event->sigev_notify) { case SIGEV_SIGNAL | SIGEV_THREAD_ID: - rtn = find_task_by_vpid(event->sigev_notify_thread_id); + pid = find_vpid(event->sigev_notify_thread_id); + rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; /* FALLTHRU */ @@ -447,7 +444,7 @@ static struct pid *good_sigevent(sigevent_t * event) return NULL; /* FALLTHRU */ case SIGEV_NONE: - return task_pid(rtn); + return pid; default: return NULL; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9a27f146fa1c..5e3de28c7677 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -47,6 +47,11 @@ config HAVE_FENTRY help Arch supports the gcc options -pg with -mfentry +config HAVE_NOP_MCOUNT + bool + help + Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount + config HAVE_C_RECORDMCOUNT bool help @@ -82,6 +87,15 @@ config RING_BUFFER_ALLOW_SWAP Allow the use of ring_buffer_swap_cpu. Adds a very slight overhead to tracing when enabled. +config PREEMPTIRQ_TRACEPOINTS + bool + depends on TRACE_PREEMPT_TOGGLE || TRACE_IRQFLAGS + select TRACING + default y + help + Create preempt/irq toggle tracepoints if needed, so that other parts + of the kernel can use them to generate or add hooks to them. + # All tracer options should select GENERIC_TRACER. For those options that are # enabled by all tracers (context switch and event tracer) they select TRACING. # This allows those options to appear when no other tracer is selected. But the @@ -155,18 +169,20 @@ config FUNCTION_GRAPH_TRACER the return value. This is done by setting the current return address on the current task structure into a stack of calls. +config TRACE_PREEMPT_TOGGLE + bool + help + Enables hooks which will be called when preemption is first disabled, + and last enabled. 
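A minimal sketch of the "toggle" semantics behind the new TRACE_PREEMPT_TOGGLE option, not taken from this patch: the hooks are meant to fire only on the outermost preempt_count() transitions, i.e. on the first disable and the matching last enable. The two *_hook() callees below are hypothetical placeholders standing in for tracer_preempt_off()/tracer_preempt_on(); preempt_count_add()/preempt_count_sub(), preempt_count() and CALLER_ADDR0/1 are existing kernel primitives.

	/* Illustrative only: invoke the hooks on outermost transitions. */
	static void sketch_preempt_disable(int val)
	{
		preempt_count_add(val);
		if (preempt_count() == val)		/* count was 0: first disable */
			preempt_off_hook(CALLER_ADDR0, CALLER_ADDR1);
	}

	static void sketch_preempt_enable(int val)
	{
		if (preempt_count() == val)		/* about to drop to 0: last enable */
			preempt_on_hook(CALLER_ADDR0, CALLER_ADDR1);
		preempt_count_sub(val);
	}
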
config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS - depends on DEBUG_PREEMPT || !PROVE_LOCKING - depends on TRACING + select TRACE_PREEMPT_TOGGLE if PREEMPT + select GENERIC_TRACER default n help Enable tracing of disable and enable events for preemption and irqs. - For tracing preempt disable/enable events, DEBUG_PREEMPT must be - enabled. For tracing irq disable/enable events, PROVE_LOCKING must - be disabled. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" @@ -203,6 +219,7 @@ config PREEMPT_TRACER select RING_BUFFER_ALLOW_SWAP select TRACER_SNAPSHOT select TRACER_SNAPSHOT_PER_CPU_SWAP + select TRACE_PREEMPT_TOGGLE help This option measures the time spent in preemption-off critical sections, with microsecond accuracy. @@ -456,6 +473,26 @@ config KPROBE_EVENTS This option is also required by perf-probe subcommand of perf tools. If you want to use perf tools, this option is strongly recommended. +config KPROBE_EVENTS_ON_NOTRACE + bool "Do NOT protect notrace function from kprobe events" + depends on KPROBE_EVENTS + depends on KPROBES_ON_FTRACE + default n + help + This is only for the developers who want to debug ftrace itself + using kprobe events. + + If kprobes can use ftrace instead of breakpoint, ftrace related + functions are protected from kprobe-events to prevent an infinit + recursion or any unexpected execution path which leads to a kernel + crash. + + This option disables such protection and allows you to put kprobe + events on ftrace functions for debugging ftrace by itself. + Note that this might let you shoot yourself in the foot. + + If unsure, say N. + config UPROBE_EVENTS bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES @@ -687,6 +724,21 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N +config PREEMPTIRQ_DELAY_TEST + tristate "Preempt / IRQ disable delay thread to test latency tracers" + depends on m + help + Select this option to build a test module that can help test latency + tracers by executing a preempt or irq disable section with a user + configurable delay. The module busy waits for the duration of the + critical section. + + For example, the following invocation forces a one-time irq-disabled + critical section for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500000 + + If unsure, say N + config TRACE_EVAL_MAP_FILE bool "Show eval mappings for trace events" depends on TRACING @@ -722,6 +774,18 @@ config TRACING_EVENTS_GPIO help Enable tracing events for gpio subsystem +config GCOV_PROFILE_FTRACE + bool "Enable GCOV profiling on ftrace subsystem" + depends on GCOV_KERNEL + help + Enable GCOV profiling on ftrace subsystem for checking + which functions/lines are tested. + + If unsure, say N. + + Note that on a kernel compiled with this config, ftrace will + run significantly slower. 
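The PREEMPTIRQ_DELAY_TEST entry above describes the intended workload; as a rough illustration (not from the patch), the critical section it creates reduces to the pattern below, which the preemptoff/irqsoff tracers then report as a max-latency sample. delay_us and the open-coded loop are stand-ins for the module's busy_wait() helper, whose full source appears further down in this diff.

	/* Illustrative only: a user-sized preempt-off critical section. */
	static void make_preempt_off_latency(unsigned long delay_us)
	{
		ktime_t end;

		preempt_disable();
		end = ktime_add_us(ktime_get(), delay_us);
		while (ktime_before(ktime_get(), end))
			cpu_relax();			/* busy wait for the duration */
		preempt_enable();
	}
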
+ endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e2538c7638d4..f81dadbc7c4a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -13,11 +13,21 @@ obj-y += trace_selftest_dynamic.o endif endif +ifdef CONFIG_FTRACE_STARTUP_TEST +CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE) +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o +endif + # If unlikely tracing is enabled, do not trace these files ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +# for GCOV coverage profiling +ifdef CONFIG_GCOV_PROFILE_FTRACE +GCOV_PROFILE := y +endif + CFLAGS_trace_benchmark.o := -I$(src) CFLAGS_trace_events_filter.o := -I$(src) @@ -33,9 +43,10 @@ obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o +obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o -obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o +obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b951aa1fac61..2868d85f1fb1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include <linux/kernel.h> #include <linux/blkdev.h> @@ -1841,6 +1829,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { + if (!!value == !!q->blk_trace) { + ret = 0; + goto out_unlock_bdev; + } if (value) ret = blk_trace_setup_queue(q, bdev); else diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ae6829804bc..08fcfe440c63 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <linux/kernel.h> #include <linux/types.h> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index caf9cbf35816..f536f601bd46 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Infrastructure for profiling code inserted by 'gcc -pg'. 
* @@ -157,30 +158,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) #endif } -/** - * ftrace_nr_registered_ops - return number of ops registered - * - * Returns the number of ftrace_ops registered and tracing functions - */ -int ftrace_nr_registered_ops(void) -{ - struct ftrace_ops *ops; - int cnt = 0; - - mutex_lock(&ftrace_lock); - - for (ops = rcu_dereference_protected(ftrace_ops_list, - lockdep_is_held(&ftrace_lock)); - ops != &ftrace_list_end; - ops = rcu_dereference_protected(ops->next, - lockdep_is_held(&ftrace_lock))) - cnt++; - - mutex_unlock(&ftrace_lock); - - return cnt; -} - static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { @@ -313,11 +290,6 @@ static void update_ftrace_function(void) ftrace_trace_function = func; } -int using_ftrace_ops_list_func(void) -{ - return ftrace_trace_function == ftrace_ops_list_func; -} - static void add_ftrace_ops(struct ftrace_ops __rcu **list, struct ftrace_ops *ops) { @@ -1049,8 +1021,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int ftrace_graph_active; #else @@ -2927,22 +2897,22 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) { /* If ops isn't enabled, ignore it */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return 0; + return false; /* If ops traces all then it includes this function */ if (ops_traces_mod(ops)) - return 1; + return true; /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) - return 0; + return false; /* If in notrace hash, we ignore it too */ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) - return 0; + return false; - return 1; + return true; } static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) @@ -2981,12 +2951,14 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) p = &pg->records[i]; p->flags = rec_flags; +#ifndef CC_USING_NOP_MCOUNT /* * Do the initial record conversion from mcount jump * to the NOP instructions. 
*/ if (!ftrace_code_disable(mod, p)) break; +#endif update_cnt++; } diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c new file mode 100644 index 000000000000..f704390db9fc --- /dev/null +++ b/kernel/trace/preemptirq_delay_test.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Preempt / IRQ disable delay thread to test latency tracers + * + * Copyright (C) 2018 Joel Fernandes (Google) <joel@joelfernandes.org> + */ + +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/string.h> + +static ulong delay = 100; +static char test_mode[10] = "irq"; + +module_param_named(delay, delay, ulong, S_IRUGO); +module_param_string(test_mode, test_mode, 10, S_IRUGO); +MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)"); + +static void busy_wait(ulong time) +{ + ktime_t start, end; + start = ktime_get(); + do { + end = ktime_get(); + if (kthread_should_stop()) + break; + } while (ktime_to_ns(ktime_sub(end, start)) < (time * 1000)); +} + +static int preemptirq_delay_run(void *data) +{ + unsigned long flags; + + if (!strcmp(test_mode, "irq")) { + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); + } else if (!strcmp(test_mode, "preempt")) { + preempt_disable(); + busy_wait(delay); + preempt_enable(); + } + + return 0; +} + +static int __init preemptirq_delay_init(void) +{ + char task_name[50]; + struct task_struct *test_task; + + snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + + test_task = kthread_run(preemptirq_delay_run, NULL, task_name); + return PTR_ERR_OR_ZERO(test_task); +} + +static void __exit preemptirq_delay_exit(void) +{ + return; +} + +module_init(preemptirq_delay_init) +module_exit(preemptirq_delay_exit) +MODULE_LICENSE("GPL v2"); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0b0b688ea166..1d92d4a982fd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic ring buffer * @@ -3221,7 +3222,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_on); * * Returns true if the ring buffer is in a state that it accepts writes. */ -int ring_buffer_record_is_on(struct ring_buffer *buffer) +bool ring_buffer_record_is_on(struct ring_buffer *buffer) { return !atomic_read(&buffer->record_disabled); } @@ -3237,7 +3238,7 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer) * ring_buffer_record_disable(), as that is a temporary disabling of * the ring buffer. 
*/ -int ring_buffer_record_is_set_on(struct ring_buffer *buffer) +bool ring_buffer_record_is_set_on(struct ring_buffer *buffer) { return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 68ee79afe31c..ffba6789c0e2 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ring buffer tester and benchmark * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 176debd3481b..bf6f1d70484d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ring buffer based function tracer * @@ -1087,7 +1088,7 @@ void disable_trace_on_warning(void) * * Shows real state of the ring buffer if it is enabled or not. */ -int tracer_tracing_is_on(struct trace_array *tr) +bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) return ring_buffer_record_is_on(tr->trace_buffer.buffer); @@ -7628,7 +7629,9 @@ rb_simple_write(struct file *filp, const char __user *ubuf, if (buffer) { mutex_lock(&trace_types_lock); - if (val) { + if (!!val == tracer_tracing_is_on(tr)) { + val = 0; /* do nothing */ + } else if (val) { tracer_tracing_on(tr); if (tr->current_trace->start) tr->current_trace->start(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f8f86231ad90..3b8c0e24ab30 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H @@ -594,7 +594,7 @@ void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); -int tracer_tracing_is_on(struct trace_array *tr); +bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); struct dentry *trace_create_file(const char *name, @@ -937,7 +937,6 @@ void ftrace_destroy_function_files(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); -int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); @@ -1533,9 +1532,6 @@ extern int event_trigger_init(struct event_trigger_ops *ops, extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); -extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file); extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); @@ -1831,6 +1827,21 @@ static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) } #endif +#ifdef CONFIG_PREEMPT_TRACER +void tracer_preempt_on(unsigned long a0, unsigned long a1); +void tracer_preempt_off(unsigned long a0, unsigned long a1); +#else +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif +#ifdef 
CONFIG_IRQSOFF_TRACER +void tracer_hardirqs_on(unsigned long a0, unsigned long a1); +void tracer_hardirqs_off(unsigned long a0, unsigned long a1); +#else +static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } +#endif + extern struct trace_iterator *tracepoint_print_iter; #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index be1d86ff753d..79e6fbe5b365 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #undef TRACE_SYSTEM #define TRACE_SYSTEM benchmark diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index d8a188e0418a..aaf6793ededa 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * tracing clocks * diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 1d67464ed95e..06bb2fd9a56c 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* * This file defines the trace event structures that go into the ring * buffer directly. They are created via macros so that changes for them diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index c79193e598f5..69a3fe926e8c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace event based perf event profiling/tracing * diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 14ff4ff3caab..f94be0c2827b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * event tracer * @@ -239,7 +240,7 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) struct trace_array_cpu *data; struct trace_pid_list *pid_list; - pid_list = rcu_dereference_sched(tr->filtered_pids); + pid_list = rcu_dereference_raw(tr->filtered_pids); if (!pid_list) return false; @@ -512,7 +513,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; - pid_list = rcu_dereference_sched(tr->filtered_pids); + pid_list = rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); } @@ -636,7 +637,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) rcu_assign_pointer(tr->filtered_pids, NULL); /* Wait till all users are no longer using pid filtering */ - synchronize_sched(); + tracepoint_synchronize_unregister(); trace_free_pid_list(pid_list); } @@ -1622,7 +1623,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, } if (filtered_pids) { - synchronize_sched(); + tracepoint_synchronize_unregister(); trace_free_pid_list(filtered_pids); } else if (pid_list) { /* @@ -3036,8 +3037,8 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); - /* Access to events are within rcu_read_lock_sched() */ - synchronize_sched(); + /* Make sure no more events are being executed */ + tracepoint_synchronize_unregister(); down_write(&trace_event_sem); __trace_remove_event_dirs(tr); diff --git a/kernel/trace/trace_events_filter.c 
b/kernel/trace/trace_events_filter.c index 893a206bcba4..84a65173b1e9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_filter - generic event filtering * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ @@ -899,7 +886,8 @@ int filter_match_preds(struct event_filter *filter, void *rec) if (!filter) return 1; - prog = rcu_dereference_sched(filter->prog); + /* Protected by either SRCU(tracepoint_srcu) or preempt_disable */ + prog = rcu_dereference_raw(filter->prog); if (!prog) return 1; @@ -1626,10 +1614,10 @@ static int process_system_preds(struct trace_subsystem_dir *dir, /* * The calls can still be using the old filters. - * Do a synchronize_sched() to ensure all calls are + * Do a synchronize_sched() and to ensure all calls are * done with them before we free them. */ - synchronize_sched(); + tracepoint_synchronize_unregister(); list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { __free_filter(filter_item->filter); list_del(&filter_item->list); @@ -1648,7 +1636,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, kfree(filter); /* If any call succeeded, we still need to sync */ if (!fail) - synchronize_sched(); + tracepoint_synchronize_unregister(); list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { __free_filter(filter_item->filter); list_del(&filter_item->list); @@ -1790,7 +1778,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); /* Make sure the filter is not being used */ - synchronize_sched(); + tracepoint_synchronize_unregister(); __free_filter(filter); return 0; @@ -1817,7 +1805,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) if (tmp) { /* Make sure the call is done with the filter */ - synchronize_sched(); + tracepoint_synchronize_unregister(); __free_filter(tmp); } } @@ -1847,7 +1835,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - synchronize_sched(); + tracepoint_synchronize_unregister(); filter_free_subsystem_filters(dir, tr); __free_filter(filter); goto out_unlock; diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h index 39d7ef4f57cb..e651dfbd345e 100644 --- a/kernel/trace/trace_events_filter_test.h +++ b/kernel/trace/trace_events_filter_test.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #undef TRACE_SYSTEM #define TRACE_SYSTEM test diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index aae18af94c94..85f6b01431c7 100644 --- a/kernel/trace/trace_events_hist.c +++ 
b/kernel/trace/trace_events_hist.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_hist - trace event hist triggers * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> */ @@ -5141,7 +5132,7 @@ static void hist_clear(struct event_trigger_data *data) if (data->name) pause_named_trigger(data); - synchronize_sched(); + tracepoint_synchronize_unregister(); tracing_map_clear(hist_data->map); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 5dea177cef53..2152d1e530cb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_events_trigger - trace event triggers * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> */ @@ -34,7 +21,9 @@ void trigger_data_free(struct event_trigger_data *data) if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); - synchronize_sched(); /* make sure current triggers exit before free */ + /* make sure current triggers exit before free */ + tracepoint_synchronize_unregister(); + kfree(data); } @@ -579,9 +568,9 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -752,7 +741,7 @@ int set_trigger_filter(char *filter_str, if (tmp) { /* Make sure the call is done with the filter */ - synchronize_sched(); + tracepoint_synchronize_unregister(); free_event_filter(tmp); } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d7c8e4ec3d9d..1e6db9cbe4dc 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_hwlatdetect.c - A simple Hardware Latency detector. * @@ -35,9 +36,6 @@ * * Includes useful feedback from Clark Williams <clark@redhat.com> * - * This file is licensed under the terms of the GNU General Public - * License version 2. 
This program is licensed "as is" without any - * warranty of any kind, whether express or implied. */ #include <linux/kthread.h> #include <linux/tracefs.h> @@ -354,6 +352,9 @@ static int start_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; + if (WARN_ON(hwlat_kthread)) + return 0; + /* Just pick the first CPU on first iteration */ current_mask = &save_cpumask; get_online_cpus(); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03ecb4465ee4..b7357f9f82a3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace irqs off critical timings * @@ -16,7 +17,6 @@ #include "trace.h" -#define CREATE_TRACE_POINTS #include <trace/events/preemptirq.h> #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) @@ -41,12 +41,12 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph); #ifdef CONFIG_PREEMPT_TRACER static inline int -preempt_trace(void) +preempt_trace(int pc) { - return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); + return ((trace_type & TRACER_PREEMPT_OFF) && pc); } #else -# define preempt_trace() (0) +# define preempt_trace(pc) (0) #endif #ifdef CONFIG_IRQSOFF_TRACER @@ -367,7 +367,7 @@ out: } static inline void -start_critical_timing(unsigned long ip, unsigned long parent_ip) +start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; struct trace_array *tr = irqsoff_trace; @@ -395,7 +395,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, pc); per_cpu(tracing_cpu, cpu) = 1; @@ -403,7 +403,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) } static inline void -stop_critical_timing(unsigned long ip, unsigned long parent_ip) +stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; struct trace_array *tr = irqsoff_trace; @@ -429,7 +429,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - __trace_function(tr, ip, parent_ip, flags, preempt_count()); + __trace_function(tr, ip, parent_ip, flags, pc); check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); @@ -438,77 +438,21 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) /* start and stop critical timings used to for stoppage (in idle) */ void start_critical_timings(void) { - if (preempt_trace() || irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + int pc = preempt_count(); + + if (preempt_trace(pc) || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { - if (preempt_trace() || irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} -EXPORT_SYMBOL_GPL(stop_critical_timings); - -#ifdef CONFIG_IRQSOFF_TRACER -#ifdef CONFIG_PROVE_LOCKING -void time_hardirqs_on(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(a0, a1); -} - -void time_hardirqs_off(unsigned long a0, unsigned long a1) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(a0, a1); -} - -#else /* !CONFIG_PROVE_LOCKING */ - -/* - * We are only interested in hardirq on/off events: - */ -static inline void tracer_hardirqs_on(void) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} + int pc = preempt_count(); -static inline void tracer_hardirqs_off(void) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); -} - -static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - stop_critical_timing(CALLER_ADDR0, caller_addr); -} - -static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) -{ - if (!preempt_trace() && irq_trace()) - start_critical_timing(CALLER_ADDR0, caller_addr); -} - -#endif /* CONFIG_PROVE_LOCKING */ -#endif /* CONFIG_IRQSOFF_TRACER */ - -#ifdef CONFIG_PREEMPT_TRACER -static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) -{ - if (preempt_trace() && !irq_trace()) - stop_critical_timing(a0, a1); -} - -static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) -{ - if (preempt_trace() && !irq_trace()) - start_critical_timing(a0, a1); + if (preempt_trace(pc) || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } -#endif /* CONFIG_PREEMPT_TRACER */ +EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_FUNCTION_TRACER static bool function_enabled; @@ -634,7 +578,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr) return 0; } -static void irqsoff_tracer_reset(struct trace_array *tr) +static void __irqsoff_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; @@ -659,12 +603,37 @@ static void irqsoff_tracer_stop(struct trace_array *tr) } #ifdef CONFIG_IRQSOFF_TRACER +/* + * We are only interested in hardirq on/off events: + */ +void tracer_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned int pc = preempt_count(); + + if (!preempt_trace(pc) && irq_trace()) + stop_critical_timing(a0, a1, pc); +} + +void tracer_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned int pc = preempt_count(); + + if (!preempt_trace(pc) && irq_trace()) + start_critical_timing(a0, a1, pc); +} + static int irqsoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_IRQS_OFF; return __irqsoff_tracer_init(tr); } + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer irqsoff_tracer 
__read_mostly = { .name = "irqsoff", @@ -684,12 +653,25 @@ static struct tracer irqsoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; -# define register_irqsoff(trace) register_tracer(&trace) -#else -# define register_irqsoff(trace) do { } while (0) -#endif +#endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER +void tracer_preempt_on(unsigned long a0, unsigned long a1) +{ + int pc = preempt_count(); + + if (preempt_trace(pc) && !irq_trace()) + stop_critical_timing(a0, a1, pc); +} + +void tracer_preempt_off(unsigned long a0, unsigned long a1) +{ + int pc = preempt_count(); + + if (preempt_trace(pc) && !irq_trace()) + start_critical_timing(a0, a1, pc); +} + static int preemptoff_tracer_init(struct trace_array *tr) { trace_type = TRACER_PREEMPT_OFF; @@ -697,11 +679,16 @@ static int preemptoff_tracer_init(struct trace_array *tr) return __irqsoff_tracer_init(tr); } +static void preemptoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer preemptoff_tracer __read_mostly = { .name = "preemptoff", .init = preemptoff_tracer_init, - .reset = irqsoff_tracer_reset, + .reset = preemptoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = true, @@ -716,13 +703,9 @@ static struct tracer preemptoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; -# define register_preemptoff(trace) register_tracer(&trace) -#else -# define register_preemptoff(trace) do { } while (0) -#endif +#endif /* CONFIG_PREEMPT_TRACER */ -#if defined(CONFIG_IRQSOFF_TRACER) && \ - defined(CONFIG_PREEMPT_TRACER) +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) static int preemptirqsoff_tracer_init(struct trace_array *tr) { @@ -731,11 +714,16 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) return __irqsoff_tracer_init(tr); } +static void preemptirqsoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + static struct tracer preemptirqsoff_tracer __read_mostly = { .name = "preemptirqsoff", .init = preemptirqsoff_tracer_init, - .reset = irqsoff_tracer_reset, + .reset = preemptirqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, .print_max = true, @@ -750,115 +738,21 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .allow_instances = true, .use_max_tr = true, }; - -# define register_preemptirqsoff(trace) register_tracer(&trace) -#else -# define register_preemptirqsoff(trace) do { } while (0) #endif __init static int init_irqsoff_tracer(void) { - register_irqsoff(irqsoff_tracer); - register_preemptoff(preemptoff_tracer); - register_preemptirqsoff(preemptirqsoff_tracer); - - return 0; -} -core_initcall(init_irqsoff_tracer); -#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ - -#ifndef CONFIG_IRQSOFF_TRACER -static inline void tracer_hardirqs_on(void) { } -static inline void tracer_hardirqs_off(void) { } -static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { } -static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { } +#ifdef CONFIG_IRQSOFF_TRACER + register_tracer(&irqsoff_tracer); #endif - -#ifndef CONFIG_PREEMPT_TRACER -static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } -static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#ifdef CONFIG_PREEMPT_TRACER + register_tracer(&preemptoff_tracer); #endif - -#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) -/* Per-cpu variable to prevent redundant 
calls when IRQs already off */ -static DEFINE_PER_CPU(int, tracing_irq_cpu); - -void trace_hardirqs_on(void) -{ - if (!this_cpu_read(tracing_irq_cpu)) - return; - - trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); - tracer_hardirqs_on(); - - this_cpu_write(tracing_irq_cpu, 0); -} -EXPORT_SYMBOL(trace_hardirqs_on); - -void trace_hardirqs_off(void) -{ - if (this_cpu_read(tracing_irq_cpu)) - return; - - this_cpu_write(tracing_irq_cpu, 1); - - trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); - tracer_hardirqs_off(); -} -EXPORT_SYMBOL(trace_hardirqs_off); - -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) -{ - if (!this_cpu_read(tracing_irq_cpu)) - return; - - trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); - tracer_hardirqs_on_caller(caller_addr); - - this_cpu_write(tracing_irq_cpu, 0); -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) -{ - if (this_cpu_read(tracing_irq_cpu)) - return; - - this_cpu_write(tracing_irq_cpu, 1); - - trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); - tracer_hardirqs_off_caller(caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) + register_tracer(&preemptirqsoff_tracer); #endif -#if defined(CONFIG_PREEMPT_TRACER) || \ - (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) -void trace_preempt_on(unsigned long a0, unsigned long a1) -{ - trace_preempt_enable_rcuidle(a0, a1); - tracer_preempt_on(a0, a1); -} - -void trace_preempt_off(unsigned long a0, unsigned long a1) -{ - trace_preempt_disable_rcuidle(a0, a1); - tracer_preempt_off(a0, a1); + return 0; } -#endif +core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e9d99463e5df..c30032367aab 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1,20 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Kprobes-based tracing events * * Created by Masami Hiramatsu <mhiramat@redhat.com> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) "trace_kprobe: " fmt @@ -23,6 +12,7 @@ #include <linux/rculist.h> #include <linux/error-injection.h> +#include "trace_kprobe_selftest.h" #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" @@ -87,6 +77,23 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +/* Return 0 if it fails to find the symbol address */ +static nokprobe_inline +unsigned long trace_kprobe_address(struct trace_kprobe *tk) +{ + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + if (addr) + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return addr; +} + bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; @@ -99,16 +106,8 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call) bool trace_kprobe_error_injectable(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - unsigned long addr; - if (tk->symbol) { - addr = (unsigned long) - kallsyms_lookup_name(trace_kprobe_symbol(tk)); - addr += tk->rp.kp.offset; - } else { - addr = (unsigned long)tk->rp.kp.addr; - } - return within_error_injection_list(addr); + return within_error_injection_list(trace_kprobe_address(tk)); } static int register_kprobe_event(struct trace_kprobe *tk); @@ -393,6 +392,20 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, return NULL; } +static inline int __enable_trace_kprobe(struct trace_kprobe *tk) +{ + int ret = 0; + + if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); + else + ret = enable_kprobe(&tk->rp.kp); + } + + return ret; +} + /* * Enable trace_probe * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
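The trace_kprobe_address() helper added above centralizes the symbol lookup that trace_kprobe_error_injectable() used to open-code, with one behavioural fix: when kallsyms_lookup_name() fails, the probe offset is no longer added to the zero result, so callers see 0 rather than a small bogus address. A minimal caller sketch of that contract (tk_error_injectable() is a hypothetical name, not part of the patch, and assumes the in-file struct trace_kprobe definition):

/*
 * Hypothetical caller, for illustration only.  A zero return from
 * trace_kprobe_address() means kallsyms_lookup_name() could not resolve
 * tk->symbol, so the value must not be treated as a kernel text address
 * (the old open-coded lookup still added tk->rp.kp.offset to that zero
 * and passed the result on).
 */
static bool tk_error_injectable(struct trace_kprobe *tk)
{
	unsigned long addr = trace_kprobe_address(tk);

	return addr && within_error_injection_list(addr);
}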
@@ -400,7 +413,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { - struct event_file_link *link = NULL; + struct event_file_link *link; int ret = 0; if (file) { @@ -414,26 +427,18 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) list_add_tail_rcu(&link->list, &tk->tp.files); tk->tp.flags |= TP_FLAG_TRACE; - } else - tk->tp.flags |= TP_FLAG_PROFILE; - - if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { - if (trace_kprobe_is_return(tk)) - ret = enable_kretprobe(&tk->rp); - else - ret = enable_kprobe(&tk->rp.kp); - } - - if (ret) { - if (file) { - /* Notice the if is true on not WARN() */ - if (!WARN_ON_ONCE(!link)) - list_del_rcu(&link->list); + ret = __enable_trace_kprobe(tk); + if (ret) { + list_del_rcu(&link->list); kfree(link); tk->tp.flags &= ~TP_FLAG_TRACE; - } else { - tk->tp.flags &= ~TP_FLAG_PROFILE; } + + } else { + tk->tp.flags |= TP_FLAG_PROFILE; + ret = __enable_trace_kprobe(tk); + if (ret) + tk->tp.flags &= ~TP_FLAG_PROFILE; } out: return ret; @@ -498,6 +503,29 @@ disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) return ret; } +#if defined(CONFIG_KPROBES_ON_FTRACE) && \ + !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long offset, size, addr; + + addr = trace_kprobe_address(tk); + if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) + return false; + + /* Get the entry address of the target function */ + addr -= offset; + + /* + * Since ftrace_location_range() does inclusive range check, we need + * to subtract 1 byte from the end address. + */ + return !ftrace_location_range(addr, addr + size - 1); +} +#else +#define within_notrace_func(tk) (false) +#endif + /* Internal register function - just handle k*probes and flags */ static int __register_trace_kprobe(struct trace_kprobe *tk) { @@ -506,6 +534,12 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (trace_probe_is_registered(&tk->tp)) return -EINVAL; + if (within_notrace_func(tk)) { + pr_warn("Could not probe notrace function %s\n", + trace_kprobe_symbol(tk)); + return -EINVAL; + } + for (i = 0; i < tk->tp.nr_args; i++) traceprobe_update_arg(&tk->tp.args[i]); @@ -1547,17 +1581,6 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST -/* - * The "__used" keeps gcc from removing the function symbol - * from the kallsyms table. 'noinline' makes sure that there - * isn't an inlined version used by the test method below - */ -static __used __init noinline int -kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) -{ - return a1 + a2 + a3 + a4 + a5 + a6; -} - static __init struct trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c new file mode 100644 index 000000000000..16548ee4c8c6 --- /dev/null +++ b/kernel/trace/trace_kprobe_selftest.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Function used during the kprobe self test. This function is in a separate + * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it + * can be probed by the selftests. 
+ */ +int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} diff --git a/kernel/trace/trace_kprobe_selftest.h b/kernel/trace/trace_kprobe_selftest.h new file mode 100644 index 000000000000..c4fc7268ba7c --- /dev/null +++ b/kernel/trace/trace_kprobe_selftest.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Function used during the kprobe self test. This function is in a separate + * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it + * can be probed by the selftests. + */ +int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1c8e30fda46a..6e6cc64faa38 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_output.c * diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index dbba03ed96de..2f742b74e7e6 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c new file mode 100644 index 000000000000..71f553cceb3c --- /dev/null +++ b/kernel/trace/trace_preemptirq.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * preemptoff and irqoff tracepoints + * + * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org> + */ + +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include "trace.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/preemptirq.h> + +#ifdef CONFIG_TRACE_IRQFLAGS +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + +void trace_hardirqs_on(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } + + lockdep_hardirqs_on(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + if (!in_nmi()) + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + } + + lockdep_hardirqs_off(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + if (this_cpu_read(tracing_irq_cpu)) { + if (!in_nmi()) + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + tracer_hardirqs_on(CALLER_ADDR0, caller_addr); + this_cpu_write(tracing_irq_cpu, 0); + } + + lockdep_hardirqs_on(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, caller_addr); + if (!in_nmi()) + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); + } + + lockdep_hardirqs_off(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); +#endif /* CONFIG_TRACE_IRQFLAGS */ + +#ifdef CONFIG_TRACE_PREEMPT_TOGGLE + +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + if (!in_nmi()) + trace_preempt_enable_rcuidle(a0, a1); + tracer_preempt_on(a0, a1); +} + +void 
trace_preempt_off(unsigned long a0, unsigned long a1) +{ + if (!in_nmi()) + trace_preempt_disable_rcuidle(a0, a1); + tracer_preempt_off(a0, a1); +} +#endif diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 50f44b7b2b32..b0875b327f5c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace binary printk * diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index daf54bda4dc8..e99c3ce7aa65 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common code for probe-based Dynamic events. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * This code was copied from kernel/trace/trace_kprobe.c written by * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> * diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 75daff22ccea..5f52668e165d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common header file for probe-based Dynamic events. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * This code was copied from kernel/trace/trace_kprobe.h written by * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> * diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index e694c9f9efa4..6b1c562ffdaf 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * trace_seq.c * diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 76d30b4ebe83..8786d17caf49 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACE_STAT_H #define __TRACE_STAT_H diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index bf89a51e740d..e696667da29a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * uprobes-based tracing events * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> */ @@ -952,7 +940,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file) list_del_rcu(&link->list); /* synchronize with u{,ret}probe_trace_func */ - synchronize_sched(); + synchronize_rcu(); kfree(link); if (!list_empty(&tu->tp.files)) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 752d8042bad4..9a1c22310323 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * tracing_map - lock-free map for tracing * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> * * tracing_map implementation inspired by lock-free map algorithms diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 053eb92b2d31..a6de61fc22de 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 #ifndef __TRACING_MAP_H #define __TRACING_MAP_H diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6dc6356c3327..bf2c06ef9afc 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -31,6 +31,9 @@ extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; +DEFINE_SRCU(tracepoint_srcu); +EXPORT_SYMBOL_GPL(tracepoint_srcu); + /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; @@ -50,6 +53,9 @@ static LIST_HEAD(tracepoint_module_list); */ static DEFINE_MUTEX(tracepoints_mutex); +static struct rcu_head *early_probes; +static bool ok_to_free_tracepoints; + /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent @@ -67,16 +73,56 @@ static inline void *allocate_probes(int count) return p == NULL ? NULL : p->probes; } -static void rcu_free_old_probes(struct rcu_head *head) +static void srcu_free_old_probes(struct rcu_head *head) { kfree(container_of(head, struct tp_probes, rcu)); } +static void rcu_free_old_probes(struct rcu_head *head) +{ + call_srcu(&tracepoint_srcu, head, srcu_free_old_probes); +} + +static __init int release_early_probes(void) +{ + struct rcu_head *tmp; + + ok_to_free_tracepoints = true; + + while (early_probes) { + tmp = early_probes; + early_probes = tmp->next; + call_rcu_sched(tmp, rcu_free_old_probes); + } + + return 0; +} + +/* SRCU is initialized at core_initcall */ +postcore_initcall(release_early_probes); + static inline void release_probes(struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); + + /* + * We can't free probes if SRCU is not initialized yet. + * Postpone the freeing till after SRCU is initialized. + */ + if (unlikely(!ok_to_free_tracepoints)) { + tp_probes->rcu.next = early_probes; + early_probes = &tp_probes->rcu; + return; + } + + /* + * Tracepoint probes are protected by both sched RCU and SRCU, + * by calling the SRCU callback in the sched RCU callback we + * cover both cases. So let us chain the SRCU and sched RCU + * callbacks to wait for both grace periods. + */ call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); } } @@ -325,6 +371,27 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +static void for_each_tracepoint_range(struct tracepoint * const *begin, + struct tracepoint * const *end, + void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + if (!begin) + return; + + if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) { + const int *iter; + + for (iter = (const int *)begin; iter < (const int *)end; iter++) + fct(offset_to_ptr(iter), priv); + } else { + struct tracepoint * const *iter; + + for (iter = begin; iter < end; iter++) + fct(*iter, priv); + } +} + #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { @@ -389,15 +456,9 @@ EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); * Ensure the tracer unregistered the module's probes before the module * teardown is performed. 
Prevents leaks of probe and data pointers. */ -static void tp_module_going_check_quiescent(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) { - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - WARN_ON_ONCE((*iter)->funcs); + WARN_ON_ONCE(tp->funcs); } static int tracepoint_module_coming(struct module *mod) @@ -448,8 +509,9 @@ static void tracepoint_module_going(struct module *mod) * Called the going notifier before checking for * quiescence. */ - tp_module_going_check_quiescent(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + for_each_tracepoint_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints, + tp_module_going_check_quiescent, NULL); break; } } @@ -501,19 +563,6 @@ static __init int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ -static void for_each_tracepoint_range(struct tracepoint * const *begin, - struct tracepoint * const *end, - void (*fct)(struct tracepoint *tp, void *priv), - void *priv) -{ - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - fct(*iter, priv); -} - /** * for_each_kernel_tracepoint - iteration on all kernel tracepoints * @fct: callback diff --git a/kernel/user.c b/kernel/user.c index 36288d840675..0df9b1640b2a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -96,7 +96,7 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { - .__count = ATOMIC_INIT(1), + .__count = REFCOUNT_INIT(1), .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, @@ -123,7 +123,7 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { - atomic_inc(&user->__count); + refcount_inc(&user->__count); return user; } } @@ -169,11 +169,8 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); - else - local_irq_restore(flags); } struct user_struct *alloc_uid(kuid_t uid) @@ -191,7 +188,7 @@ struct user_struct *alloc_uid(kuid_t uid) goto out_unlock; new->uid = uid; - atomic_set(&new->__count, 1); + refcount_set(&new->__count, 1); ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c3d7583fcd21..e5222b5fb4fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -859,7 +859,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, unsigned idx; struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; - ssize_t ret = -EINVAL; + ssize_t ret; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. 
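The kernel/user.c hunks above convert user_struct::__count from atomic_t to refcount_t, and free_uid() drops its hand-rolled local_irq_save() + atomic_dec_and_lock() sequence in favour of refcount_dec_and_lock_irqsave(). A sketch of that release idiom in general form, with placeholder names (struct my_obj, my_obj_lock and my_obj_put() are illustrative, not kernel symbols):

#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_obj {
	refcount_t	refs;
	/* ... payload ... */
};

static DEFINE_SPINLOCK(my_obj_lock);

static void my_obj_put(struct my_obj *obj)
{
	unsigned long flags;

	/*
	 * Returns true only for the final reference drop, and then
	 * my_obj_lock is held with interrupts disabled, replacing the
	 * explicit local_irq_save() + atomic_dec_and_lock() pair.
	 */
	if (refcount_dec_and_lock_irqsave(&obj->refs, &my_obj_lock, &flags)) {
		/* unhash/unlink under the lock here */
		spin_unlock_irqrestore(&my_obj_lock, flags);
		kfree(obj);
	}
}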
@@ -895,19 +904,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Only allow < page size writes at the beginning of the file */ - ret = -EINVAL; - if ((*ppos != 0) || (count >= PAGE_SIZE)) - goto out; - - /* Slurp in the user data */ - kbuf = memdup_user_nul(buf, count); - if (IS_ERR(kbuf)) { - ret = PTR_ERR(kbuf); - kbuf = NULL; - goto out; - } - /* Parse the user data */ ret = -EINVAL; pos = kbuf; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 233cd8fc6910..258033d62cb3 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -18,7 +18,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(struct ctl_table *table, int write) +static void *get_uts(struct ctl_table *table) { char *which = table->data; struct uts_namespace *uts_ns; @@ -26,21 +26,9 @@ static void *get_uts(struct ctl_table *table, int write) uts_ns = current->nsproxy->uts_ns; which = (which - (char *)&init_uts_ns) + (char *)uts_ns; - if (!write) - down_read(&uts_sem); - else - down_write(&uts_sem); return which; } -static void put_uts(struct ctl_table *table, int write, void *which) -{ - if (!write) - up_read(&uts_sem); - else - up_write(&uts_sem); -} - /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? @@ -50,13 +38,34 @@ static int proc_do_uts_string(struct ctl_table *table, int write, { struct ctl_table uts_table; int r; + char tmp_data[__NEW_UTS_LEN + 1]; + memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); + uts_table.data = tmp_data; + + /* + * Buffer the value in tmp_data so that proc_dostring() can be called + * without holding any locks. + * We also need to read the original value in the write==1 case to + * support partial writes. + */ + down_read(&uts_sem); + memcpy(tmp_data, get_uts(table), sizeof(tmp_data)); + up_read(&uts_sem); r = proc_dostring(&uts_table, write, buffer, lenp, ppos); - put_uts(table, write, uts_table.data); - if (write) + if (write) { + /* + * Write back the new value. + * Note that, since we dropped uts_sem, the result can + * theoretically be incorrect if there are two parallel writes + * at non-zero offsets to the same sysctl. + */ + down_write(&uts_sem); + memcpy(get_uts(table), tmp_data, sizeof(tmp_data)); + up_write(&uts_sem); proc_sys_poll_notify(table->poll); + } return r; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 78b192071ef7..60e80198c3df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2652,6 +2652,9 @@ void flush_workqueue(struct workqueue_struct *wq) if (WARN_ON(!wq_online)) return; + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); + mutex_lock(&wq->mutex); /* @@ -2843,7 +2846,8 @@ reflush: } EXPORT_SYMBOL_GPL(drain_workqueue); -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, + bool from_cancel) { struct worker *worker = NULL; struct worker_pool *pool; @@ -2885,7 +2889,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * workqueues the deadlock happens when the rescuer stalls, blocking * forward progress. 
*/ - if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) { + if (!from_cancel && + (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) { lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } @@ -2896,6 +2901,27 @@ already_gone: return false; } +static bool __flush_work(struct work_struct *work, bool from_cancel) +{ + struct wq_barrier barr; + + if (WARN_ON(!wq_online)) + return false; + + if (!from_cancel) { + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + } + + if (start_flush_work(work, &barr, from_cancel)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } +} + /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush @@ -2909,18 +2935,7 @@ already_gone: */ bool flush_work(struct work_struct *work) { - struct wq_barrier barr; - - if (WARN_ON(!wq_online)) - return false; - - if (start_flush_work(work, &barr)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else { - return false; - } + return __flush_work(work, false); } EXPORT_SYMBOL_GPL(flush_work); @@ -2986,7 +3001,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) * isn't executing. */ if (wq_online) - flush_work(work); + __flush_work(work, true); clear_work_data(work); |
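The final workqueue hunks split flush_work() into a __flush_work() helper taking a from_cancel argument, so the lockdep annotations on the work's and the workqueue's lockdep_map are only recorded for ordinary flushes, while flush_workqueue() now always records an acquire/release of wq->lockdep_map up front. Those acquire/release pairs never take a real lock; they only feed a dependency edge into lockdep, as in this illustrative helper (assert_may_wait_on() is a made-up name, not kernel code):

#include <linux/lockdep.h>

/*
 * Made-up helper showing the annotation idiom used in flush_workqueue()
 * and __flush_work(): acquiring and immediately releasing a lockdep_map
 * takes no lock, it just tells lockdep "this context may wait on
 * whatever the map represents", so an inversion is reported even when
 * the deadlock never actually triggers at runtime.
 */
static inline void assert_may_wait_on(struct lockdep_map *map)
{
	lock_map_acquire(map);
	lock_map_release(map);
}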
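Earlier in this diff, kernel/tracepoint.c introduces tracepoint_srcu and frees old probe arrays only after both a sched-RCU and an SRCU grace period by chaining the callbacks: call_rcu_sched() invokes rcu_free_old_probes(), which re-queues the same rcu_head through call_srcu(), and the final kfree() runs in srcu_free_old_probes(). A condensed restatement of that chaining idiom, using placeholder names (my_srcu, struct my_probes and the my_* functions are illustrative only):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

DEFINE_SRCU(my_srcu);

struct my_probes {
	struct rcu_head	rcu;
	/* ... old probe array ... */
};

/* Final stage: runs only after an SRCU grace period has elapsed. */
static void my_free_srcu(struct rcu_head *head)
{
	kfree(container_of(head, struct my_probes, rcu));
}

/*
 * First stage: runs after a sched-RCU grace period, then re-queues the
 * same rcu_head on the SRCU side, so the memory outlives readers of
 * either flavour before it is actually freed.
 */
static void my_free_rcu(struct rcu_head *head)
{
	call_srcu(&my_srcu, head, my_free_srcu);
}

static void my_release(struct my_probes *p)
{
	call_rcu_sched(&p->rcu, my_free_rcu);
}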