diff options
Diffstat (limited to 'kernel')
32 files changed, 1804 insertions, 748 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7b96bf0e654..ce166a002d16 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6678,17 +6678,17 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; - if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose(env, - "unrecognized bpf_ld_imm64 insn\n"); + if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || + insn[1].imm != 0) { + verbose(env, "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } - f = fdget(insn->imm); + f = fdget(insn[0].imm); map = __bpf_map_get(f); if (IS_ERR(map)) { verbose(env, "fd %d is not pointing to valid bpf_map\n", - insn->imm); + insn[0].imm); return PTR_ERR(map); } diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c9a35f09e4b9..30e39f3932ad 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -7,6 +7,7 @@ #include <linux/workqueue.h> #include <linux/list.h> #include <linux/refcount.h> +#include <linux/fs_context.h> #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; @@ -37,6 +38,31 @@ extern void __init enable_debug_cgroup(void); } while (0) /* + * The cgroup filesystem superblock creation/mount context. + */ +struct cgroup_fs_context { + struct kernfs_fs_context kfc; + struct cgroup_root *root; + struct cgroup_namespace *ns; + unsigned int flags; /* CGRP_ROOT_* flags */ + + /* cgroup1 bits */ + bool cpuset_clone_children; + bool none; /* User explicitly requested empty subsystem */ + bool all_ss; /* Seen 'all' option */ + u16 subsys_mask; /* Selected subsystems */ + char *name; /* Hierarchy name */ + char *release_agent; /* Path for release notifications */ +}; + +static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct cgroup_fs_context, kfc); +} + +/* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other * direction, a css_set is naturally associated with multiple cgroups. @@ -117,16 +143,6 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; @@ -197,12 +213,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); void cgroup_free_root(struct cgroup_root *root); -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); +void init_cgroup_root(struct cgroup_fs_context *ctx); int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns); +int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); @@ -246,14 +260,15 @@ extern const struct proc_ns_operations cgroupns_operations; */ extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; +extern const struct fs_parameter_description cgroup1_fs_parameters; int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); void cgroup1_check_for_release(struct cgroup *cgrp); -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns); +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param); +int cgroup1_get_tree(struct fs_context *fc); +int cgroup1_reconfigure(struct fs_context *ctx); #endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f94a7229974e..c126b34fd4ff 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -13,9 +13,12 @@ #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> +#include <linux/fs_parser.h> #include <trace/events/cgroup.h> +#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) + /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -906,172 +909,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo return 0; } -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); +enum cgroup1_param { + Opt_all, + Opt_clone_children, + Opt_cpuset_v2_mode, + Opt_name, + Opt_none, + Opt_noprefix, + Opt_release_agent, + Opt_xattr, +}; - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; +static const struct fs_parameter_spec cgroup1_param_specs[] = { + fsparam_flag ("all", Opt_all), + fsparam_flag ("clone_children", Opt_clone_children), + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + fsparam_string("name", Opt_name), + fsparam_flag ("none", Opt_none), + fsparam_flag ("noprefix", Opt_noprefix), + fsparam_string("release_agent", Opt_release_agent), + fsparam_flag ("xattr", Opt_xattr), + {} +}; - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "cpuset_v2_mode")) { - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - - /* blocked by boot param? */ - if (cgroup_no_v1_named) - return -ENOENT; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; +const struct fs_parameter_description cgroup1_fs_parameters = { + .name = "cgroup1", + .specs = cgroup1_param_specs, +}; - continue; +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct cgroup_subsys *ss; + struct fs_parse_result result; + int opt, i; + + opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result); + if (opt == -ENOPARAM) { + if (strcmp(param->key, "source") == 0) { + fc->source = param->string; + param->string = NULL; + return 0; } - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name)) continue; - if (!cgroup_ssid_enabled(i)) + ctx->subsys_mask |= (1 << i); + return 0; + } + return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key); + } + if (opt < 0) + return opt; + + switch (opt) { + case Opt_none: + /* Explicitly have no subsystems */ + ctx->none = true; + break; + case Opt_all: + ctx->all_ss = true; + break; + case Opt_noprefix: + ctx->flags |= CGRP_ROOT_NOPREFIX; + break; + case Opt_clone_children: + ctx->cpuset_clone_children = true; + break; + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + break; + case Opt_xattr: + ctx->flags |= CGRP_ROOT_XATTR; + break; + case Opt_release_agent: + /* Specifying two release agents is forbidden */ + if (ctx->release_agent) + return cg_invalf(fc, "cgroup1: release_agent respecified"); + ctx->release_agent = param->string; + param->string = NULL; + break; + case Opt_name: + /* blocked by boot param? */ + if (cgroup_no_v1_named) + return -ENOENT; + /* Can't specify an empty name */ + if (!param->size) + return cg_invalf(fc, "cgroup1: Empty name"); + if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) + return cg_invalf(fc, "cgroup1: Name too long"); + /* Must match [\w.-]+ */ + for (i = 0; i < param->size; i++) { + char c = param->string[i]; + if (isalnum(c)) continue; - if (cgroup1_ssid_disabled(i)) + if ((c == '.') || (c == '-') || (c == '_')) continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; + return cg_invalf(fc, "cgroup1: Invalid name"); } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + /* Specifying two names is forbidden */ + if (ctx->name) + return cg_invalf(fc, "cgroup1: name respecified"); + ctx->name = param->string; + param->string = NULL; + break; } + return 0; +} + +static int check_cgroupfs_options(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + u16 mask = U16_MAX; + u16 enabled = 0; + struct cgroup_subsys *ss; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + enabled |= 1 << i; + + ctx->subsys_mask &= enabled; /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' + * In absense of 'none', 'name=' or subsystem name options, + * let's default to 'all'. */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) - opts->subsys_mask |= (1 << i); + if (!ctx->subsys_mask && !ctx->none && !ctx->name) + ctx->all_ss = true; + + if (ctx->all_ss) { + /* Mutually exclusive option 'all' + subsystem name */ + if (ctx->subsys_mask) + return cg_invalf(fc, "cgroup1: subsys name conflicts with all"); + /* 'all' => select all the subsystems */ + ctx->subsys_mask = enabled; + } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + if (!ctx->subsys_mask && !ctx->name) + return cg_invalf(fc, "cgroup1: Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; + if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) + return cg_invalf(fc, "cgroup1: noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; + if (ctx->subsys_mask && ctx->none) + return cg_invalf(fc, "cgroup1: none used incorrectly"); return 0; } -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +int cgroup1_reconfigure(struct fs_context *fc) { - int ret = 0; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; + int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = ctx->subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); + if ((ctx->flags ^ root->flags) || + (ctx->name && strcmp(ctx->name, root->name))) { + cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", + ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1088,17 +1114,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - if (opts.release_agent) { + if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); + strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: - kfree(opts.release_agent); - kfree(opts.name); mutex_unlock(&cgroup_mutex); return ret; } @@ -1106,28 +1130,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, - .remount_fs = cgroup1_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns) +/* + * The guts of cgroup1 mount - find or create cgroup_root to use. + * Called with cgroup_mutex held; returns 0 on success, -E... on + * error and positive - in case when the candidate is busy dying. + * On success it stashes a reference to cgroup_root into given + * cgroup_fs_context; that reference is *NOT* counting towards the + * cgroup_root refcount. + */ +static int cgroup1_root_to_use(struct fs_context *fc) { - struct cgroup_sb_opts opts; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; - struct dentry *dentry; int i, ret; - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) - goto out_unlock; + return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may @@ -1137,16 +1163,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || + if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) + return 1; /* restart */ cgroup_put(&ss->root->cgrp); } @@ -1161,8 +1183,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ - if (opts.name) { - if (strcmp(opts.name, root->name)) + if (ctx->name) { + if (strcmp(ctx->name, root->name)) continue; name_match = true; } @@ -1171,19 +1193,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { + if ((ctx->subsys_mask || ctx->none) && + (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; - ret = -EBUSY; - goto out_unlock; + return -EBUSY; } - if (root->flags ^ opts.flags) + if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - ret = 0; - goto out_unlock; + ctx->root = root; + return 0; } /* @@ -1191,55 +1212,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } + if (!ctx->subsys_mask && !ctx->none) + return cg_invalf(fc, "cgroup1: No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } + if (ctx->ns != &init_cgroup_ns) + return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } + if (!root) + return -ENOMEM; - init_cgroup_root(root, &opts); + ctx->root = root; + init_cgroup_root(ctx); - ret = cgroup_setup_root(root, opts.subsys_mask); + ret = cgroup_setup_root(root, ctx->subsys_mask); if (ret) cgroup_free_root(root); + return ret; +} -out_unlock: - if (!ret && !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); +int cgroup1_get_tree(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - if (ret) - return ERR_PTR(ret); + /* Check if the caller has permission to mount. */ + if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); + + ret = cgroup1_root_to_use(fc); + if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) + ret = 1; /* restart */ - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); + mutex_unlock(&cgroup_mutex); - if (!IS_ERR(dentry) && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - struct super_block *sb = dentry->d_sb; - dput(dentry); + if (!ret) + ret = cgroup_do_get_tree(fc); + + if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); deactivate_locked_super(sb); + ret = 1; + } + + if (unlikely(ret > 0)) { msleep(10); - dentry = ERR_PTR(restart_syscall()); + return restart_syscall(); } - return dentry; + return ret; } static int __init cgroup1_wq_init(void) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index eef24a25bda7..3f2b4bde0f9c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -54,6 +54,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> +#include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/psi.h> #include <net/sock.h> @@ -1772,26 +1773,37 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) -{ - char *token; +enum cgroup2_param { + Opt_nsdelegate, + nr__cgroup2_params +}; - *root_flags = 0; +static const struct fs_parameter_spec cgroup2_param_specs[] = { + fsparam_flag ("nsdelegate", Opt_nsdelegate), + {} +}; - if (!data || *data == '\0') - return 0; +static const struct fs_parameter_description cgroup2_fs_parameters = { + .name = "cgroup2", + .specs = cgroup2_param_specs, +}; - while ((token = strsep(&data, ",")) != NULL) { - if (!strcmp(token, "nsdelegate")) { - *root_flags |= CGRP_ROOT_NS_DELEGATE; - continue; - } +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; - pr_err("cgroup2: unknown option \"%s\"\n", token); - return -EINVAL; - } + opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result); + if (opt < 0) + return opt; - return 0; + switch (opt) { + case Opt_nsdelegate: + ctx->flags |= CGRP_ROOT_NS_DELEGATE; + return 0; + } + return -EINVAL; } static void apply_cgroup_root_flags(unsigned int root_flags) @@ -1811,16 +1823,11 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root return 0; } -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +static int cgroup_reconfigure(struct fs_context *fc) { - unsigned int root_flags; - int ret; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) - return ret; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - apply_cgroup_root_flags(root_flags); + apply_cgroup_root_flags(ctx->flags); return 0; } @@ -1908,8 +1915,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_fs_context *ctx) { + struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); @@ -1918,12 +1926,12 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); - root->flags = opts->flags; - if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); - if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); - if (opts->cpuset_clone_children) + root->flags = ctx->flags; + if (ctx->release_agent) + strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); + if (ctx->name) + strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); + if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } @@ -2028,60 +2036,104 @@ out: return ret; } -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns) +int cgroup_do_get_tree(struct fs_context *fc) { - struct dentry *dentry; - bool new_sb = false; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); + ctx->kfc.root = ctx->root->kf_root; + if (fc->fs_type == &cgroup2_fs_type) + ctx->kfc.magic = CGROUP2_SUPER_MAGIC; + else + ctx->kfc.magic = CGROUP_SUPER_MAGIC; + ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; - struct super_block *sb = dentry->d_sb; + struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); - cgrp = cset_cgroup_from_root(ns->root_cset, root); + cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); nsdentry = kernfs_node_dentry(cgrp->kn, sb); - dput(dentry); - if (IS_ERR(nsdentry)) + dput(fc->root); + fc->root = nsdentry; + if (IS_ERR(nsdentry)) { + ret = PTR_ERR(nsdentry); deactivate_locked_super(sb); - dentry = nsdentry; + } } - if (!new_sb) - cgroup_put(&root->cgrp); + if (!ctx->kfc.new_sb_created) + cgroup_put(&ctx->root->cgrp); - return dentry; + return ret; } -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +/* + * Destroy a cgroup filesystem context. + */ +static void cgroup_fs_context_free(struct fs_context *fc) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct dentry *dentry; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + + kfree(ctx->name); + kfree(ctx->release_agent); + put_cgroup_ns(ctx->ns); + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static int cgroup_get_tree(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; - get_cgroup_ns(ns); + cgrp_dfl_visible = true; + cgroup_get_live(&cgrp_dfl_root.cgrp); + ctx->root = &cgrp_dfl_root; - /* Check if the caller has permission to mount. */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } + ret = cgroup_do_get_tree(fc); + if (!ret) + apply_cgroup_root_flags(ctx->flags); + return ret; +} + +static const struct fs_context_operations cgroup_fs_context_ops = { + .free = cgroup_fs_context_free, + .parse_param = cgroup2_parse_param, + .get_tree = cgroup_get_tree, + .reconfigure = cgroup_reconfigure, +}; + +static const struct fs_context_operations cgroup1_fs_context_ops = { + .free = cgroup_fs_context_free, + .parse_param = cgroup1_parse_param, + .get_tree = cgroup1_get_tree, + .reconfigure = cgroup1_reconfigure, +}; + +/* + * Initialise the cgroup filesystem creation/reconfiguration context. Notably, + * we select the namespace we're going to use. + */ +static int cgroup_init_fs_context(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx; + + ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; /* * The first time anyone tries to mount a cgroup, enable the list @@ -2090,29 +2142,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - if (fs_type == &cgroup2_fs_type) { - unsigned int root_flags; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) { - put_cgroup_ns(ns); - return ERR_PTR(ret); - } - - cgrp_dfl_visible = true; - cgroup_get_live(&cgrp_dfl_root.cgrp); - - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, - CGROUP2_SUPER_MAGIC, ns); - if (!IS_ERR(dentry)) - apply_cgroup_root_flags(root_flags); - } else { - dentry = cgroup1_mount(&cgroup_fs_type, flags, data, - CGROUP_SUPER_MAGIC, ns); - } - - put_cgroup_ns(ns); - return dentry; + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + fc->fs_private = &ctx->kfc; + if (fc->fs_type == &cgroup2_fs_type) + fc->ops = &cgroup_fs_context_ops; + else + fc->ops = &cgroup1_fs_context_ops; + if (fc->user_ns) + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->ns->user_ns); + fc->global = true; + return 0; } static void cgroup_kill_sb(struct super_block *sb) @@ -2135,17 +2176,19 @@ static void cgroup_kill_sb(struct super_block *sb) } struct file_system_type cgroup_fs_type = { - .name = "cgroup", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "cgroup", + .init_fs_context = cgroup_init_fs_context, + .parameters = &cgroup1_fs_parameters, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { - .name = "cgroup2", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "cgroup2", + .init_fs_context = cgroup_init_fs_context, + .parameters = &cgroup2_fs_parameters, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, @@ -5280,7 +5323,6 @@ int cgroup_rmdir(struct kernfs_node *kn) static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, @@ -5347,11 +5389,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts; + static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; - init_cgroup_root(&cgrp_dfl_root, &opts); + ctx.root = &cgrp_dfl_root; + init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 72afd55f70c6..4834c4214e9c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -39,6 +39,7 @@ #include <linux/memory.h> #include <linux/export.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> @@ -359,25 +360,52 @@ static inline bool is_in_v2_mode(void) * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ -static struct dentry *cpuset_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, void *data) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - struct dentry *ret = ERR_PTR(-ENODEV); - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->mount(cgroup_fs, flags, - unused_dev_name, mountopts); - put_filesystem(cgroup_fs); +static int cpuset_get_tree(struct fs_context *fc) +{ + struct file_system_type *cgroup_fs; + struct fs_context *new_fc; + int ret; + + cgroup_fs = get_fs_type("cgroup"); + if (!cgroup_fs) + return -ENODEV; + + new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags); + if (IS_ERR(new_fc)) { + ret = PTR_ERR(new_fc); + } else { + static const char agent_path[] = "/sbin/cpuset_release_agent"; + ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0); + if (!ret) + ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0); + if (!ret) + ret = vfs_parse_fs_string(new_fc, "release_agent", + agent_path, sizeof(agent_path) - 1); + if (!ret) + ret = vfs_get_tree(new_fc); + if (!ret) { /* steal the result */ + fc->root = new_fc->root; + new_fc->root = NULL; + } + put_fs_context(new_fc); } + put_filesystem(cgroup_fs); return ret; } +static const struct fs_context_operations cpuset_fs_context_ops = { + .get_tree = cpuset_get_tree, +}; + +static int cpuset_init_fs_context(struct fs_context *fc) +{ + fc->ops = &cpuset_fs_context_ops; + return 0; +} + static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .mount = cpuset_mount, + .name = "cpuset", + .init_fs_context = cpuset_init_fs_context, }; /* diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9f5851064aea..53012db1e53c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -199,6 +199,7 @@ void __init swiotlb_update_mem_attributes(void) int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { unsigned long i, bytes; + size_t alloc_size; bytes = nslabs << IO_TLB_SHIFT; @@ -211,12 +212,18 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. */ - io_tlb_list = memblock_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), - PAGE_SIZE); - io_tlb_orig_addr = memblock_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), - PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int)); + io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_list) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)); + io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_orig_addr) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; @@ -249,7 +256,7 @@ swiotlb_init(int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ - vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index affa830a198c..c5cde87329c7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -53,7 +53,7 @@ static struct percpu_rw_semaphore dup_mmap_sem; struct uprobe { struct rb_node rb_node; /* node in the rb tree */ - atomic_t ref; + refcount_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; @@ -547,13 +547,13 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v static struct uprobe *get_uprobe(struct uprobe *uprobe) { - atomic_inc(&uprobe->ref); + refcount_inc(&uprobe->ref); return uprobe; } static void put_uprobe(struct uprobe *uprobe) { - if (atomic_dec_and_test(&uprobe->ref)) { + if (refcount_dec_and_test(&uprobe->ref)) { /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from @@ -644,7 +644,7 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) rb_link_node(&uprobe->rb_node, parent, p); rb_insert_color(&uprobe->rb_node, &uprobes_tree); /* get access + creation ref */ - atomic_set(&uprobe->ref, 2); + refcount_set(&uprobe->ref, 2); return u; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4802b039b89f..f08a1e4ee1d4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -965,6 +965,9 @@ void __init __register_nosave_region(unsigned long start_pfn, /* This allocation cannot fail */ region = memblock_alloc(sizeof(struct nosave_region), SMP_CACHE_BYTES); + if (!region) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(struct nosave_region)); } region->start_pfn = start_pfn; region->end_pfn = end_pfn; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8eee85bb2687..02ca827b8fac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -65,6 +65,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); @@ -1143,14 +1144,7 @@ void __init setup_log_buf(int early) if (!new_log_buf_len) return; - if (early) { - new_log_buf = - memblock_alloc(new_log_buf_len, LOG_ALIGN); - } else { - new_log_buf = memblock_alloc_nopanic(new_log_buf_len, - LOG_ALIGN); - } - + new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu bytes not available\n", new_log_buf_len); diff --git a/kernel/relay.c b/kernel/relay.c index 9e0f52375487..ade14fb7ce2e 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,7 +1177,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, diff --git a/kernel/resource.c b/kernel/resource.c index e81b17b53fa5..92190f62ebc5 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -382,7 +382,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, int (*func)(struct resource *, void *)) { struct resource res; - int ret = -1; + int ret = -EINVAL; while (start < end && !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { @@ -452,6 +452,9 @@ int walk_mem_res(u64 start, u64 end, void *arg, * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. + * + * This will find System RAM ranges that are children of top-level resources + * in addition to top-level System RAM resources. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -460,14 +463,14 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long flags; struct resource res; unsigned long pfn, end_pfn; - int ret = -1; + int ret = -EINVAL; start = (u64) start_pfn << PAGE_SHIFT; end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, - true, &res)) { + false, &res)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -1128,6 +1131,15 @@ struct resource * __request_region(struct resource *parent, conflict = __request_resource(parent, res); if (!conflict) break; + /* + * mm/hmm.c reserves physical addresses which then + * become unavailable to other users. Conflicts are + * not expected. Warn to aid debugging if encountered. + */ + if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_warn("Unaddressable device %s %pR conflicts with %pR", + conflict->name, conflict, res); + } if (conflict != parent) { if (!(conflict->flags & IORESOURCE_BUSY)) { parent = conflict; diff --git a/kernel/signal.c b/kernel/signal.c index 5d53183e2705..b7953934aa99 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -19,7 +19,9 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> @@ -3487,6 +3489,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, #endif #endif +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info) +{ + clear_siginfo(info); + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_tgid_vnr(current); + info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); +} + /** * sys_kill - send a signal to a process * @pid: the PID of the process @@ -3496,16 +3508,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; - clear_siginfo(&info); - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } +#ifdef CONFIG_PROC_FS +/* + * Verify that the signaler and signalee either are in the same pid namespace + * or that the signaler's pid namespace is an ancestor of the signalee's pid + * namespace. + */ +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) +{ +#ifdef CONFIG_COMPAT + /* + * Avoid hooking up compat syscalls and instead handle necessary + * conversions here. Note, this is a stop-gap measure and should not be + * considered a generic solution. + */ + if (in_compat_syscall()) + return copy_siginfo_from_user32( + kinfo, (struct compat_siginfo __user *)info); +#endif + return copy_siginfo_from_user(kinfo, info); +} + +/** + * sys_pidfd_send_signal - send a signal to a process through a task file + * descriptor + * @pidfd: the file descriptor of the process + * @sig: signal to be sent + * @info: the signal info + * @flags: future flags to be passed + * + * The syscall currently only signals via PIDTYPE_PID which covers + * kill(<positive-pid>, <signal>. It does not signal threads or process + * groups. + * In order to extend the syscall to threads and process groups the @flags + * argument should be used. In essence, the @flags argument will determine + * what is signaled and not the file descriptor itself. Put in other words, + * grouping is a property of the flags argument not a property of the file + * descriptor. + * + * Return: 0 on success, negative errno on failure + */ +SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, + siginfo_t __user *, info, unsigned int, flags) +{ + int ret; + struct fd f; + struct pid *pid; + kernel_siginfo_t kinfo; + + /* Enforce flags be set to 0 until we add an extension. */ + if (flags) + return -EINVAL; + + f = fdget_raw(pidfd); + if (!f.file) + return -EBADF; + + /* Is this a pidfd? */ + pid = tgid_pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } + + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + if (info) { + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + goto err; + + ret = -EINVAL; + if (unlikely(sig != kinfo.si_signo)) + goto err; + + if ((task_pid(current) != pid) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) { + /* Only allow sending arbitrary signals to yourself. */ + ret = -EPERM; + if (kinfo.si_code != SI_USER) + goto err; + + /* Turn this into a regular kill signal. */ + prepare_kill_siginfo(sig, &kinfo); + } + } else { + prepare_kill_siginfo(sig, &kinfo); + } + + ret = kill_pid_info(sig, &kinfo, pid); + +err: + fdput(f); + return ret; +} +#endif /* CONFIG_PROC_FS */ + static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 51d7c6794bf1..d21f4befaea4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -168,6 +168,7 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ /* kernel/signal.c */ +COND_SYSCALL(pidfd_send_signal); /* kernel/sys.c */ COND_SYSCALL(setregid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3fb1405f3f8c..e5da394d1ca3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2643,23 +2643,25 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { + int tmp, ret; struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? &tmp : valp; + + ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + if (write) { - int val = *negp ? -*lvalp : *lvalp; - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) return -EINVAL; - *valp = val; - } else { - int val = *valp; - if (val < 0) { - *negp = true; - *lvalp = -(unsigned long)val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } + *valp = tmp; } + return 0; } @@ -2708,22 +2710,22 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { + int ret; + unsigned int tmp; struct do_proc_douintvec_minmax_conv_param *param = data; + /* write via temporary local uint for bounds-checking */ + unsigned int *up = write ? &tmp : valp; - if (write) { - unsigned int val = *lvalp; - - if (*lvalp > UINT_MAX) - return -EINVAL; + ret = do_proc_douintvec_conv(lvalp, up, write, data); + if (ret) + return ret; - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) return -ERANGE; - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; + *valp = tmp; } return 0; @@ -3326,7 +3328,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ -#ifdef CONFIG_BPF_SYSCALL +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fa8b1fe824f3..8bd1d6d001d7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -370,6 +370,7 @@ config PROFILE_ANNOTATED_BRANCHES config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING + imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..e1c6d79fb4cc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -723,6 +723,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; + /* fall through */ case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aac7847c0214..fa79323331b2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3702,6 +3702,31 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) } static int +add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, + int clear_filter) +{ + long index = simple_strtoul(func_g->search, NULL, 0); + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + /* The index starts at 1 */ + if (--index < 0) + return 0; + + do_for_each_ftrace_rec(pg, rec) { + if (pg->index <= index) { + index -= pg->index; + /* this is a double loop, break goes to the next page */ + break; + } + rec = &pg->records[index]; + enter_record(hash, rec, clear_filter); + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + +static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) { @@ -3769,6 +3794,11 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) if (unlikely(ftrace_disabled)) goto out_unlock; + if (func_g.type == MATCH_INDEX) { + found = add_rec_by_index(hash, &func_g, clear_filter); + goto out_unlock; + } + do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_DISABLED) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 06e864a334bb..41b6f96e5366 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -353,20 +353,6 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } -/** - * ring_buffer_page_len - the size of data on the page. - * @page: The page to read - * - * Returns the amount of data on the page, including buffer page header. - */ -size_t ring_buffer_page_len(void *page) -{ - struct buffer_data_page *bpage = page; - - return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) - + BUF_PAGE_HDR_SIZE; -} - /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -4205,6 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over + * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer recording @@ -4222,7 +4209,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -4230,7 +4217,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), flags); if (!iter) return NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c4238b441624..21153e64bf1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -894,7 +894,7 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -void tracing_snapshot_instance(struct trace_array *tr) +void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -920,10 +920,15 @@ void tracing_snapshot_instance(struct trace_array *tr) } local_irq_save(flags); - update_max_tr(tr, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + /** * tracing_snapshot - take a snapshot of the current buffer. * @@ -946,6 +951,54 @@ void tracing_snapshot(void) } EXPORT_SYMBOL_GPL(tracing_snapshot); +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. + * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. + */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_snapshot_cond_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. + * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); @@ -1025,12 +1078,111 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot; + int ret = 0; + + cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + mutex_lock(&trace_types_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) + goto fail_unlock; + + if (tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto fail_unlock; + } + + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. + */ + if (tr->cond_snapshot) { + ret = -EBUSY; + goto fail_unlock; + } + + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = cond_snapshot; + arch_spin_unlock(&tr->max_lock); + + mutex_unlock(&trace_types_lock); + + return ret; + + fail_unlock: + mutex_unlock(&trace_types_lock); + kfree(cond_snapshot); + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); int tracing_alloc_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); @@ -1043,6 +1195,21 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) +{ + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1330,7 +1497,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use @@ -1354,12 +1521,14 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that initiated the trace. + * @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) { if (tr->stop_count) return; @@ -1380,9 +1549,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) else ring_buffer_record_off(tr->max_buffer.buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) + goto out_unlock; +#endif swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); + + out_unlock: arch_spin_unlock(&tr->max_lock); } @@ -1748,7 +1923,7 @@ static inline char *get_saved_cmdlines(int idx) static inline void set_cmdline(int idx, const char *cmdline) { - memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); + strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static int allocate_cmdlines_buffer(unsigned int val, @@ -3904,7 +4079,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -3914,7 +4090,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); @@ -4702,6 +4879,7 @@ static const char readme_msg[] = "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" + "\t [:<handler>.<action>]\n" "\t [if <filter>]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -4742,8 +4920,21 @@ static const char readme_msg[] = "\t unchanged.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" - "\t already-attached hist trigger. The syntax is analagous to\n" - "\t the enable_event and disable_event triggers.\n" + "\t already-attached hist trigger. The syntax is analogous to\n" + "\t the enable_event and disable_event triggers.\n\n" + "\t Hist trigger handlers and actions are executed whenever a\n" + "\t a histogram entry is added or updated. They take the form:\n\n" + "\t <handler>.<action>\n\n" + "\t The available handlers are:\n\n" + "\t onmatch(matching.event) - invoke on addition or update\n" + "\t onmax(var) - invoke if var exceeds current max\n" + "\t onchange(var) - invoke action if var changes\n\n" + "\t The available actions are:\n\n" + "\t trace(<synthetic_event>,param list) - generate synthetic event\n" + "\t save(field,...) - save current event fields\n" +#ifdef CONFIG_TRACER_SNAPSHOT + "\t snapshot() - snapshot the trace buffer\n" +#endif #endif ; @@ -5388,6 +5579,16 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; +#ifdef CONFIG_TRACER_SNAPSHOT + if (t->use_max_tr) { + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + if (ret) + goto out; + } +#endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", @@ -5626,7 +5827,6 @@ out: return ret; fail: - kfree(iter->trace); kfree(iter); __trace_array_put(tr); mutex_unlock(&trace_types_lock); @@ -5825,7 +6025,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, } static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -6470,6 +6669,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, goto out; } + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + if (ret) + goto out; + switch (val) { case 0: if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { @@ -6495,7 +6701,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - update_max_tr(tr, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), NULL); else update_max_tr_single(tr, current, iter->cpu_file); local_irq_enable(); @@ -6845,7 +7051,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .steal = generic_pipe_buf_steal, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 08900828d282..d80cee49e0eb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -194,6 +194,51 @@ struct trace_pid_list { unsigned long *pids; }; +typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); + +/** + * struct cond_snapshot - conditional snapshot data and callback + * + * The cond_snapshot structure encapsulates a callback function and + * data associated with the snapshot for a given tracing instance. + * + * When a snapshot is taken conditionally, by invoking + * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is + * passed in turn to the cond_snapshot.update() function. That data + * can be compared by the update() implementation with the cond_data + * contained wihin the struct cond_snapshot instance associated with + * the trace_array. Because the tr->max_lock is held throughout the + * update() call, the update() function can directly retrieve the + * cond_snapshot and cond_data associated with the per-instance + * snapshot associated with the trace_array. + * + * The cond_snapshot.update() implementation can save data to be + * associated with the snapshot if it decides to, and returns 'true' + * in that case, or it returns 'false' if the conditional snapshot + * shouldn't be taken. + * + * The cond_snapshot instance is created and associated with the + * user-defined cond_data by tracing_cond_snapshot_enable(). + * Likewise, the cond_snapshot instance is destroyed and is no longer + * associated with the trace instance by + * tracing_cond_snapshot_disable(). + * + * The method below is required. + * + * @update: When a conditional snapshot is invoked, the update() + * callback function is invoked with the tr->max_lock held. The + * update() implementation signals whether or not to actually + * take the snapshot, by returning 'true' if so, 'false' if no + * snapshot should be taken. Because the max_lock is held for + * the duration of update(), the implementation is safe to + * directly retrieven and save any implementation data it needs + * to in association with the snapshot. + */ +struct cond_snapshot { + void *cond_data; + cond_update_fn_t update; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -277,6 +322,9 @@ struct trace_array { #endif int time_stamp_abs_ref; struct list_head hist_vars; +#ifdef CONFIG_TRACER_SNAPSHOT + struct cond_snapshot *cond_snapshot; +#endif }; enum { @@ -727,7 +775,8 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, const char __user *ubuf, size_t cnt); #ifdef CONFIG_TRACER_MAX_TRACE -void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ @@ -855,10 +904,11 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 -#define TRACE_GRAPH_PRINT_TAIL 0x80 -#define TRACE_GRAPH_SLEEP_TIME 0x100 -#define TRACE_GRAPH_GRAPH_TIME 0x200 +#define TRACE_GRAPH_PRINT_REL_TIME 0x40 +#define TRACE_GRAPH_PRINT_IRQS 0x80 +#define TRACE_GRAPH_PRINT_TAIL 0x100 +#define TRACE_GRAPH_SLEEP_TIME 0x200 +#define TRACE_GRAPH_GRAPH_TIME 0x400 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -1458,6 +1508,7 @@ enum regex_type { MATCH_MIDDLE_ONLY, MATCH_END_ONLY, MATCH_GLOB, + MATCH_INDEX, }; struct regex { @@ -1808,6 +1859,11 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops) extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); +extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); +extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); + +extern int tracing_snapshot_cond_disable(struct trace_array *tr); +extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 06bb2fd9a56c..fc8e97328e54 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -65,7 +65,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + F_printk(" %ps <-- %ps", + (void *)__entry->ip, (void *)__entry->parent_ip), FILTER_TRACE_FN, @@ -83,7 +84,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth), + F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth), FILTER_OTHER ); @@ -102,8 +103,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_desc( int, ret, depth ) ), - F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", - __entry->func, __entry->depth, + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth), @@ -167,12 +168,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, #define FTRACE_STACK_ENTRIES 8 -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, @@ -182,12 +177,13 @@ FTRACE_ENTRY(kernel_stack, stack_entry, __dynamic_array(unsigned long, caller ) ), - F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", - __entry->caller[0], __entry->caller[1], __entry->caller[2], - __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]), + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]), FILTER_OTHER ); @@ -201,12 +197,13 @@ FTRACE_ENTRY(user_stack, userstack_entry, __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), - F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", - __entry->caller[0], __entry->caller[1], __entry->caller[2], - __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]), + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]), FILTER_OTHER ); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 76217bbef815..4629a6104474 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -299,15 +299,13 @@ int perf_uprobe_init(struct perf_event *p_event, if (!p_event->attr.uprobe_path) return -EINVAL; - path = kzalloc(PATH_MAX, GFP_KERNEL); - if (!path) - return -ENOMEM; - ret = strncpy_from_user( - path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); - if (ret == PATH_MAX) - return -E2BIG; - if (ret < 0) - goto out; + + path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), + PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + return (ret == -EINVAL) ? -E2BIG : ret; + } if (path[0] == '\0') { ret = -EINVAL; goto out; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 217ef481fbbb..05a66493a164 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -491,10 +491,12 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, break; case '&': case '|': + /* accepting only "&&" or "||" */ if (next[1] == next[0]) { ptr++; break; } + /* fall through */ default: parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); @@ -823,6 +825,9 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) *search = buff; + if (isdigit(buff[0])) + return MATCH_INDEX; + for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { @@ -860,6 +865,8 @@ static void filter_build_regex(struct filter_pred *pred) } switch (type) { + /* MATCH_INDEX should not happen, but if it does, match full */ + case MATCH_INDEX: case MATCH_FULL: r->match = regex_match_full; break; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 449d90cfa151..ca46339f3009 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -313,9 +313,9 @@ struct hist_trigger_data { struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; unsigned int n_field_var_hists; - struct field_var *max_vars[SYNTH_FIELDS_MAX]; - unsigned int n_max_vars; - unsigned int n_max_var_str; + struct field_var *save_vars[SYNTH_FIELDS_MAX]; + unsigned int n_save_vars; + unsigned int n_save_var_str; }; static int synth_event_create(int argc, const char **argv); @@ -383,41 +383,157 @@ struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); +typedef bool (*check_track_val_fn_t) (u64 track_val, u64 var_val); + +enum handler_id { + HANDLER_ONMATCH = 1, + HANDLER_ONMAX, + HANDLER_ONCHANGE, +}; + +enum action_id { + ACTION_SAVE = 1, + ACTION_TRACE, + ACTION_SNAPSHOT, +}; + struct action_data { + enum handler_id handler; + enum action_id action; + char *action_name; action_fn_t fn; + unsigned int n_params; char *params[SYNTH_FIELDS_MAX]; + /* + * When a histogram trigger is hit, the values of any + * references to variables, including variables being passed + * as parameters to synthetic events, are collected into a + * var_ref_vals array. This var_ref_idx is the index of the + * first param in the array to be passed to the synthetic + * event invocation. + */ + unsigned int var_ref_idx; + struct synth_event *synth_event; + bool use_trace_keyword; + char *synth_event_name; + union { struct { - /* - * When a histogram trigger is hit, the values of any - * references to variables, including variables being passed - * as parameters to synthetic events, are collected into a - * var_ref_vals array. This var_ref_idx is the index of the - * first param in the array to be passed to the synthetic - * event invocation. - */ - unsigned int var_ref_idx; - char *match_event; - char *match_event_system; - char *synth_event_name; - struct synth_event *synth_event; - } onmatch; + char *event; + char *event_system; + } match_data; struct { + /* + * var_str contains the $-unstripped variable + * name referenced by var_ref, and used when + * printing the action. Because var_ref + * creation is deferred to create_actions(), + * we need a per-action way to save it until + * then, thus var_str. + */ char *var_str; - char *fn_name; - unsigned int max_var_ref_idx; - struct hist_field *max_var; - struct hist_field *var; - } onmax; + + /* + * var_ref refers to the variable being + * tracked e.g onmax($var). + */ + struct hist_field *var_ref; + + /* + * track_var contains the 'invisible' tracking + * variable created to keep the current + * e.g. max value. + */ + struct hist_field *track_var; + + check_track_val_fn_t check_val; + action_fn_t save_data; + } track_data; }; }; +struct track_data { + u64 track_val; + bool updated; + + unsigned int key_len; + void *key; + struct tracing_map_elt elt; + + struct action_data *action_data; + struct hist_trigger_data *hist_data; +}; + +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; +}; + +struct snapshot_context { + struct tracing_map_elt *elt; + void *key; +}; + +static void track_data_free(struct track_data *track_data) +{ + struct hist_elt_data *elt_data; + + if (!track_data) + return; + + kfree(track_data->key); + + elt_data = track_data->elt.private_data; + if (elt_data) { + kfree(elt_data->comm); + kfree(elt_data); + } + + kfree(track_data); +} + +static struct track_data *track_data_alloc(unsigned int key_len, + struct action_data *action_data, + struct hist_trigger_data *hist_data) +{ + struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + struct hist_elt_data *elt_data; + + if (!data) + return ERR_PTR(-ENOMEM); + + data->key = kzalloc(key_len, GFP_KERNEL); + if (!data->key) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + data->key_len = key_len; + data->action_data = action_data; + data->hist_data = hist_data; + + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + data->elt.private_data = elt_data; + + elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!elt_data->comm) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + return data; +} static char last_hist_cmd[MAX_FILTER_STR_VAL]; static char hist_err_str[MAX_FILTER_STR_VAL]; @@ -1078,12 +1194,12 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, static void action_trace(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { - struct synth_event *event = data->onmatch.synth_event; + struct synth_event *event = data->synth_event; - trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx); + trace_synth(event, var_ref_vals, data->var_ref_idx); } struct hist_var_data { @@ -1200,8 +1316,8 @@ static int synth_event_create(int argc, const char **argv) /* This interface accepts group name prefix */ if (strchr(name, '/')) { - len = sizeof(SYNTH_SYSTEM "/") - 1; - if (strncmp(name, SYNTH_SYSTEM "/", len)) + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) return -EINVAL; name += len; } @@ -1644,9 +1760,9 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) { - char *system = data->onmatch.match_event_system; - char *event_name = data->onmatch.match_event; + if (data->handler == HANDLER_ONMATCH) { + char *system = data->match_data.event_system; + char *event_name = data->match_data.event; file = find_var_file(tr, system, event_name, var_name); if (!file) @@ -1691,12 +1807,6 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, return hist_field; } -struct hist_elt_data { - char *comm; - u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; -}; - static u64 hist_field_var_ref(struct hist_field *hist_field, struct tracing_map_elt *elt, struct ring_buffer_event *rbe, @@ -1882,7 +1992,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) return ret; if ((str_has_prefix(str, "onmatch(")) || - (str_has_prefix(str, "onmax("))) { + (str_has_prefix(str, "onmax(")) || + (str_has_prefix(str, "onchange("))) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -2030,7 +2141,7 @@ static inline void save_comm(char *comm, struct task_struct *task) return; } - memcpy(comm, task->comm, TASK_COMM_LEN); + strncpy(comm, task->comm, TASK_COMM_LEN); } static void hist_elt_data_free(struct hist_elt_data *elt_data) @@ -2076,7 +2187,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_max_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; size = STR_VAR_LEN_MAX; @@ -3050,7 +3161,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err_event("onmatch: Too many field variables defined: ", + hist_err_event("trace action: Too many field variables defined: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3058,7 +3169,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err_event("onmatch: Event file not found: ", + hist_err_event("trace action: Event file not found: ", subsys_name, event_name, field_name); ret = PTR_ERR(file); return ERR_PTR(ret); @@ -3072,7 +3183,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err_event("onmatch: Matching event histogram not found: ", + hist_err_event("trace action: Matching event histogram not found: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3134,7 +3245,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("onmatch: Couldn't create histogram for field: ", + hist_err_event("trace action: Couldn't create histogram for field: ", subsys_name, event_name, field_name); return ERR_PTR(ret); } @@ -3147,7 +3258,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("onmatch: Couldn't find synthetic variable: ", + hist_err_event("trace action: Couldn't find synthetic variable: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3225,13 +3336,13 @@ static void update_field_vars(struct hist_trigger_data *hist_data, hist_data->n_field_vars, 0); } -static void update_max_vars(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - struct ring_buffer_event *rbe, - void *rec) +static void save_track_data_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - __update_field_vars(elt, rbe, rec, hist_data->max_vars, - hist_data->n_max_vars, hist_data->n_field_var_str); + __update_field_vars(elt, rbe, rec, hist_data->save_vars, + hist_data->n_save_vars, hist_data->n_field_var_str); } static struct hist_field *create_var(struct hist_trigger_data *hist_data, @@ -3366,18 +3477,190 @@ create_target_field_var(struct hist_trigger_data *target_hist_data, return create_field_var(target_hist_data, file, var_name); } -static void onmax_print(struct seq_file *m, - struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - struct action_data *data) +static bool check_track_val_max(u64 track_val, u64 var_val) { - unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx; + if (var_val <= track_val) + return false; + + return true; +} - seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); +static bool check_track_val_changed(u64 track_val, u64 var_val) +{ + if (var_val == track_val) + return false; + + return true; +} + +static u64 get_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + unsigned int track_var_idx = data->track_data.track_var->var.idx; + u64 track_val; + + track_val = tracing_map_read_var(elt, track_var_idx); + + return track_val; +} + +static void save_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data, u64 var_val) +{ + unsigned int track_var_idx = data->track_data.track_var->var.idx; + + tracing_map_set_var(elt, track_var_idx, var_val); +} + +static void save_track_data(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) +{ + if (data->track_data.save_data) + data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); +} + +static bool check_track_val(struct tracing_map_elt *elt, + struct action_data *data, + u64 var_val) +{ + struct hist_trigger_data *hist_data; + u64 track_val; + + hist_data = data->track_data.track_var->hist_data; + track_val = get_track_val(hist_data, elt, data); + + return data->track_data.check_val(track_val, var_val); +} - for (i = 0; i < hist_data->n_max_vars; i++) { - struct hist_field *save_val = hist_data->max_vars[i]->val; - struct hist_field *save_var = hist_data->max_vars[i]->var; +#ifdef CONFIG_TRACER_SNAPSHOT +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + /* called with tr->max_lock held */ + struct track_data *track_data = tr->cond_snapshot->cond_data; + struct hist_elt_data *elt_data, *track_elt_data; + struct snapshot_context *context = cond_data; + u64 track_val; + + if (!track_data) + return false; + + track_val = get_track_val(track_data->hist_data, context->elt, + track_data->action_data); + + track_data->track_val = track_val; + memcpy(track_data->key, context->key, track_data->key_len); + + elt_data = context->elt->private_data; + track_elt_data = track_data->elt.private_data; + if (elt_data->comm) + strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + + track_data->updated = true; + + return true; +} + +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) +{ + struct trace_event_file *file = hist_data->event_file; + struct snapshot_context context; + + context.elt = elt; + context.key = key; + + tracing_snapshot_cond(file->tr, &context); +} + +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt); + +static struct action_data *snapshot_action(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + if (!hist_data->n_actions) + return NULL; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->action == ACTION_SNAPSHOT) + return data; + } + + return NULL; +} + +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct trace_event_file *file = hist_data->event_file; + struct track_data *track_data; + struct action_data *action; + + track_data = tracing_cond_snapshot_data(file->tr); + if (!track_data) + return; + + if (!track_data->updated) + return; + + action = snapshot_action(hist_data); + if (!action) + return; + + seq_puts(m, "\nSnapshot taken (see tracing/snapshot). Details:\n"); + seq_printf(m, "\ttriggering value { %s(%s) }: %10llu", + action->handler == HANDLER_ONMAX ? "onmax" : "onchange", + action->track_data.var_str, track_data->track_val); + + seq_puts(m, "\ttriggered by event with key: "); + hist_trigger_print_key(m, hist_data, track_data->key, &track_data->elt); + seq_putc(m, '\n'); +} +#else +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + return false; +} +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) {} +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) {} +#endif /* CONFIG_TRACER_SNAPSHOT */ + +static void track_data_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + u64 track_val = get_track_val(hist_data, elt, data); + unsigned int i, save_var_idx; + + if (data->handler == HANDLER_ONMAX) + seq_printf(m, "\n\tmax: %10llu", track_val); + else if (data->handler == HANDLER_ONCHANGE) + seq_printf(m, "\n\tchanged: %10llu", track_val); + + if (data->action == ACTION_SNAPSHOT) + return; + + for (i = 0; i < hist_data->n_save_vars; i++) { + struct hist_field *save_val = hist_data->save_vars[i]->val; + struct hist_field *save_var = hist_data->save_vars[i]->var; u64 val; save_var_idx = save_var->var.idx; @@ -3392,64 +3675,82 @@ static void onmax_print(struct seq_file *m, } } -static void onmax_save(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, - struct action_data *data, u64 *var_ref_vals) +static void ontrack_action(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - unsigned int max_idx = data->onmax.max_var->var.idx; - unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx; + u64 var_val = var_ref_vals[data->track_data.var_ref->var_ref_idx]; - u64 var_val, max_val; - - var_val = var_ref_vals[max_var_ref_idx]; - max_val = tracing_map_read_var(elt, max_idx); - - if (var_val <= max_val) - return; - - tracing_map_set_var(elt, max_idx, var_val); - - update_max_vars(hist_data, elt, rbe, rec); + if (check_track_val(elt, data, var_val)) { + save_track_val(hist_data, elt, data, var_val); + save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + } } -static void onmax_destroy(struct action_data *data) +static void action_data_destroy(struct action_data *data) { unsigned int i; - destroy_hist_field(data->onmax.max_var, 0); - destroy_hist_field(data->onmax.var, 0); + lockdep_assert_held(&event_mutex); - kfree(data->onmax.var_str); - kfree(data->onmax.fn_name); + kfree(data->action_name); for (i = 0; i < data->n_params; i++) kfree(data->params[i]); + if (data->synth_event) + data->synth_event->ref--; + + kfree(data->synth_event_name); + kfree(data); } -static int onmax_create(struct hist_trigger_data *hist_data, - struct action_data *data) +static void track_data_destroy(struct hist_trigger_data *hist_data, + struct action_data *data) { struct trace_event_file *file = hist_data->event_file; - struct hist_field *var_field, *ref_field, *max_var; - unsigned int var_ref_idx = hist_data->n_var_refs; - struct field_var *field_var; - char *onmax_var_str, *param; - unsigned int i; + + destroy_hist_field(data->track_data.track_var, 0); + destroy_hist_field(data->track_data.var_ref, 0); + + if (data->action == ACTION_SNAPSHOT) { + struct track_data *track_data; + + track_data = tracing_cond_snapshot_data(file->tr); + if (track_data && track_data->hist_data == hist_data) { + tracing_snapshot_cond_disable(file->tr); + track_data_free(track_data); + } + } + + kfree(data->track_data.var_str); + + action_data_destroy(data); +} + +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data); + +static int track_data_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct hist_field *var_field, *ref_field, *track_var = NULL; + struct trace_event_file *file = hist_data->event_file; + char *track_data_var_str; int ret = 0; - onmax_var_str = data->onmax.var_str; - if (onmax_var_str[0] != '$') { - hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str); + track_data_var_str = data->track_data.var_str; + if (track_data_var_str[0] != '$') { + hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); return -EINVAL; } - onmax_var_str++; + track_data_var_str++; - var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); + var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str); + hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); return -EINVAL; } @@ -3457,39 +3758,26 @@ static int onmax_create(struct hist_trigger_data *hist_data, if (!ref_field) return -ENOMEM; - data->onmax.var = ref_field; + data->track_data.var_ref = ref_field; - data->fn = onmax_save; - data->onmax.max_var_ref_idx = var_ref_idx; - max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); - if (IS_ERR(max_var)) { - hist_err("onmax: Couldn't create onmax variable: ", "max"); - ret = PTR_ERR(max_var); + if (data->handler == HANDLER_ONMAX) + track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err("Couldn't create onmax variable: ", "__max"); + ret = PTR_ERR(track_var); goto out; } - data->onmax.max_var = max_var; - - for (i = 0; i < data->n_params; i++) { - param = kstrdup(data->params[i], GFP_KERNEL); - if (!param) { - ret = -ENOMEM; - goto out; - } - - field_var = create_target_field_var(hist_data, NULL, NULL, param); - if (IS_ERR(field_var)) { - hist_err("onmax: Couldn't create field variable: ", param); - ret = PTR_ERR(field_var); - kfree(param); - goto out; - } - - hist_data->max_vars[hist_data->n_max_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) - hist_data->n_max_var_str++; - kfree(param); + if (data->handler == HANDLER_ONCHANGE) + track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err("Couldn't create onchange variable: ", "__change"); + ret = PTR_ERR(track_var); + goto out; } + data->track_data.track_var = track_var; + + ret = action_create(hist_data, data); out: return ret; } @@ -3497,14 +3785,18 @@ static int onmax_create(struct hist_trigger_data *hist_data, static int parse_action_params(char *params, struct action_data *data) { char *param, *saved_param; + bool first_param = true; int ret = 0; while (params) { - if (data->n_params >= SYNTH_FIELDS_MAX) + if (data->n_params >= SYNTH_FIELDS_MAX) { + hist_err("Too many action params", ""); goto out; + } param = strsep(¶ms, ","); if (!param) { + hist_err("No action param found", ""); ret = -EINVAL; goto out; } @@ -3522,86 +3814,164 @@ static int parse_action_params(char *params, struct action_data *data) goto out; } + if (first_param && data->use_trace_keyword) { + data->synth_event_name = saved_param; + first_param = false; + continue; + } + first_param = false; + data->params[data->n_params++] = saved_param; } out: return ret; } -static struct action_data *onmax_parse(char *str) +static int action_parse(char *str, struct action_data *data, + enum handler_id handler) { - char *onmax_fn_name, *onmax_var_str; - struct action_data *data; - int ret = -EINVAL; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return ERR_PTR(-ENOMEM); + char *action_name; + int ret = 0; - onmax_var_str = strsep(&str, ")"); - if (!onmax_var_str || !str) { + strsep(&str, "."); + if (!str) { + hist_err("action parsing: No action found", ""); ret = -EINVAL; - goto free; + goto out; } - data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL); - if (!data->onmax.var_str) { - ret = -ENOMEM; - goto free; + action_name = strsep(&str, "("); + if (!action_name || !str) { + hist_err("action parsing: No action found", ""); + ret = -EINVAL; + goto out; } - strsep(&str, "."); - if (!str) - goto free; - - onmax_fn_name = strsep(&str, "("); - if (!onmax_fn_name || !str) - goto free; - - if (str_has_prefix(onmax_fn_name, "save")) { + if (str_has_prefix(action_name, "save")) { char *params = strsep(&str, ")"); if (!params) { + hist_err("action parsing: No params found for %s", "save"); ret = -EINVAL; - goto free; + goto out; } ret = parse_action_params(params, data); if (ret) - goto free; - } else + goto out; + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + else { + hist_err("action parsing: Handler doesn't support action: ", action_name); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_vars; + data->fn = ontrack_action; + data->action = ACTION_SAVE; + } else if (str_has_prefix(action_name, "snapshot")) { + char *params = strsep(&str, ")"); + + if (!str) { + hist_err("action parsing: No closing paren found: %s", params); + ret = -EINVAL; + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + else { + hist_err("action parsing: Handler doesn't support action: ", action_name); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_snapshot; + data->fn = ontrack_action; + data->action = ACTION_SNAPSHOT; + } else { + char *params = strsep(&str, ")"); + + if (str_has_prefix(action_name, "trace")) + data->use_trace_keyword = true; + + if (params) { + ret = parse_action_params(params, data); + if (ret) + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + + if (handler != HANDLER_ONMATCH) { + data->track_data.save_data = action_trace; + data->fn = ontrack_action; + } else + data->fn = action_trace; + + data->action = ACTION_TRACE; + } + + data->action_name = kstrdup(action_name, GFP_KERNEL); + if (!data->action_name) { + ret = -ENOMEM; + goto out; + } + + data->handler = handler; + out: + return ret; +} + +static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, + char *str, enum handler_id handler) +{ + struct action_data *data; + int ret = -EINVAL; + char *var_str; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + var_str = strsep(&str, ")"); + if (!var_str || !str) { + ret = -EINVAL; goto free; + } - data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL); - if (!data->onmax.fn_name) { + data->track_data.var_str = kstrdup(var_str, GFP_KERNEL); + if (!data->track_data.var_str) { ret = -ENOMEM; goto free; } + + ret = action_parse(str, data, handler); + if (ret) + goto free; out: return data; free: - onmax_destroy(data); + track_data_destroy(hist_data, data); data = ERR_PTR(ret); goto out; } static void onmatch_destroy(struct action_data *data) { - unsigned int i; - - lockdep_assert_held(&event_mutex); - - kfree(data->onmatch.match_event); - kfree(data->onmatch.match_event_system); - kfree(data->onmatch.synth_event_name); - - for (i = 0; i < data->n_params; i++) - kfree(data->params[i]); + kfree(data->match_data.event); + kfree(data->match_data.event_system); - if (data->onmatch.synth_event) - data->onmatch.synth_event->ref--; - - kfree(data); + action_data_destroy(data); } static void destroy_field_var(struct field_var *field_var) @@ -3651,8 +4021,9 @@ static int check_synth_field(struct synth_event *event, } static struct hist_field * -onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, - char *system, char *event, char *var) +trace_action_find_var(struct hist_trigger_data *hist_data, + struct action_data *data, + char *system, char *event, char *var) { struct hist_field *hist_field; @@ -3660,24 +4031,24 @@ onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, hist_field = find_target_event_var(hist_data, system, event, var); if (!hist_field) { - if (!system) { - system = data->onmatch.match_event_system; - event = data->onmatch.match_event; + if (!system && data->handler == HANDLER_ONMATCH) { + system = data->match_data.event_system; + event = data->match_data.event; } hist_field = find_event_var(hist_data, system, event, var); } if (!hist_field) - hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var); + hist_err_event("trace action: Couldn't find param: $", system, event, var); return hist_field; } static struct hist_field * -onmatch_create_field_var(struct hist_trigger_data *hist_data, - struct action_data *data, char *system, - char *event, char *var) +trace_action_create_field_var(struct hist_trigger_data *hist_data, + struct action_data *data, char *system, + char *event, char *var) { struct hist_field *hist_field = NULL; struct field_var *field_var; @@ -3700,9 +4071,9 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, * looking for fields on the onmatch(system.event.xxx) * event. */ - if (!system) { - system = data->onmatch.match_event_system; - event = data->onmatch.match_event; + if (!system && data->handler == HANDLER_ONMATCH) { + system = data->match_data.event_system; + event = data->match_data.event; } /* @@ -3724,24 +4095,30 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, goto out; } -static int onmatch_create(struct hist_trigger_data *hist_data, - struct trace_event_file *file, - struct action_data *data) +static int trace_action_create(struct hist_trigger_data *hist_data, + struct action_data *data) { char *event_name, *param, *system = NULL; struct hist_field *hist_field, *var_ref; unsigned int i, var_ref_idx; unsigned int field_pos = 0; struct synth_event *event; + char *synth_event_name; int ret = 0; lockdep_assert_held(&event_mutex); - event = find_synth_event(data->onmatch.synth_event_name); + if (data->use_trace_keyword) + synth_event_name = data->synth_event_name; + else + synth_event_name = data->action_name; + + event = find_synth_event(synth_event_name); if (!event) { - hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); + hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); return -EINVAL; } + event->ref++; var_ref_idx = hist_data->n_var_refs; @@ -3769,13 +4146,15 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (param[0] == '$') - hist_field = onmatch_find_var(hist_data, data, system, - event_name, param); + hist_field = trace_action_find_var(hist_data, data, + system, event_name, + param); else - hist_field = onmatch_create_field_var(hist_data, data, - system, - event_name, - param); + hist_field = trace_action_create_field_var(hist_data, + data, + system, + event_name, + param); if (!hist_field) { kfree(p); @@ -3797,7 +4176,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data, continue; } - hist_err_event("onmatch: Param type doesn't match synthetic event field type: ", + hist_err_event("trace action: Param type doesn't match synthetic event field type: ", system, event_name, param); kfree(p); ret = -EINVAL; @@ -3805,14 +4184,13 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (field_pos != event->n_fields) { - hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name); + hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); ret = -EINVAL; goto err; } - data->fn = action_trace; - data->onmatch.synth_event = event; - data->onmatch.var_ref_idx = var_ref_idx; + data->synth_event = event; + data->var_ref_idx = var_ref_idx; out: return ret; err: @@ -3821,10 +4199,75 @@ static int onmatch_create(struct hist_trigger_data *hist_data, goto out; } +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_event_file *file = hist_data->event_file; + struct track_data *track_data; + struct field_var *field_var; + unsigned int i; + char *param; + int ret = 0; + + if (data->action == ACTION_TRACE) + return trace_action_create(hist_data, data); + + if (data->action == ACTION_SNAPSHOT) { + track_data = track_data_alloc(hist_data->key_size, data, hist_data); + if (IS_ERR(track_data)) { + ret = PTR_ERR(track_data); + goto out; + } + + ret = tracing_snapshot_cond_enable(file->tr, track_data, + cond_snapshot_update); + if (ret) + track_data_free(track_data); + + goto out; + } + + if (data->action == ACTION_SAVE) { + if (hist_data->n_save_vars) { + ret = -EEXIST; + hist_err("save action: Can't have more than one save() action per hist", ""); + goto out; + } + + for (i = 0; i < data->n_params; i++) { + param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto out; + } + + field_var = create_target_field_var(hist_data, NULL, NULL, param); + if (IS_ERR(field_var)) { + hist_err("save action: Couldn't create field variable: ", param); + ret = PTR_ERR(field_var); + kfree(param); + goto out; + } + + hist_data->save_vars[hist_data->n_save_vars++] = field_var; + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_save_var_str++; + kfree(param); + } + } + out: + return ret; +} + +static int onmatch_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + return action_create(hist_data, data); +} + static struct action_data *onmatch_parse(struct trace_array *tr, char *str) { char *match_event, *match_event_system; - char *synth_event_name, *params; struct action_data *data; int ret = -EINVAL; @@ -3850,43 +4293,19 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) goto free; } - data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); - if (!data->onmatch.match_event) { - ret = -ENOMEM; - goto free; - } - - data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL); - if (!data->onmatch.match_event_system) { + data->match_data.event = kstrdup(match_event, GFP_KERNEL); + if (!data->match_data.event) { ret = -ENOMEM; goto free; } - strsep(&str, "."); - if (!str) { - hist_err("onmatch: Missing . after onmatch(): ", str); - goto free; - } - - synth_event_name = strsep(&str, "("); - if (!synth_event_name || !str) { - hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name); - goto free; - } - - data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); - if (!data->onmatch.synth_event_name) { + data->match_data.event_system = kstrdup(match_event_system, GFP_KERNEL); + if (!data->match_data.event_system) { ret = -ENOMEM; goto free; } - params = strsep(&str, ")"); - if (!params || !str || (str && strlen(str))) { - hist_err("onmatch: Missing closing paramlist paren: ", params); - goto free; - } - - ret = parse_action_params(params, data); + ret = action_parse(str, data, HANDLER_ONMATCH); if (ret) goto free; out: @@ -4326,10 +4745,11 @@ static void destroy_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) + if (data->handler == HANDLER_ONMATCH) onmatch_destroy(data); - else if (data->fn == onmax_save) - onmax_destroy(data); + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + track_data_destroy(hist_data, data); else kfree(data); } @@ -4355,16 +4775,24 @@ static int parse_actions(struct hist_trigger_data *hist_data) ret = PTR_ERR(data); break; } - data->fn = action_trace; } else if ((len = str_has_prefix(str, "onmax("))) { char *action_str = str + len; - data = onmax_parse(action_str); + data = track_data_parse(hist_data, action_str, + HANDLER_ONMAX); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + } else if ((len = str_has_prefix(str, "onchange("))) { + char *action_str = str + len; + + data = track_data_parse(hist_data, action_str, + HANDLER_ONCHANGE); if (IS_ERR(data)) { ret = PTR_ERR(data); break; } - data->fn = onmax_save; } else { ret = -EINVAL; break; @@ -4376,8 +4804,7 @@ static int parse_actions(struct hist_trigger_data *hist_data) return ret; } -static int create_actions(struct hist_trigger_data *hist_data, - struct trace_event_file *file) +static int create_actions(struct hist_trigger_data *hist_data) { struct action_data *data; unsigned int i; @@ -4386,14 +4813,18 @@ static int create_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->attrs->n_actions; i++) { data = hist_data->actions[i]; - if (data->fn == action_trace) { - ret = onmatch_create(hist_data, file, data); + if (data->handler == HANDLER_ONMATCH) { + ret = onmatch_create(hist_data, data); if (ret) - return ret; - } else if (data->fn == onmax_save) { - ret = onmax_create(hist_data, data); + break; + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + ret = track_data_create(hist_data, data); if (ret) - return ret; + break; + } else { + ret = -EINVAL; + break; } } @@ -4409,26 +4840,51 @@ static void print_actions(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == onmax_save) - onmax_print(m, hist_data, elt, data); + if (data->action == ACTION_SNAPSHOT) + continue; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + track_data_print(m, hist_data, elt, data); } } -static void print_onmax_spec(struct seq_file *m, - struct hist_trigger_data *hist_data, - struct action_data *data) +static void print_action_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) { unsigned int i; - seq_puts(m, ":onmax("); - seq_printf(m, "%s", data->onmax.var_str); - seq_printf(m, ").%s(", data->onmax.fn_name); - - for (i = 0; i < hist_data->n_max_vars; i++) { - seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name); - if (i < hist_data->n_max_vars - 1) - seq_puts(m, ","); + if (data->action == ACTION_SAVE) { + for (i = 0; i < hist_data->n_save_vars; i++) { + seq_printf(m, "%s", hist_data->save_vars[i]->var->var.name); + if (i < hist_data->n_save_vars - 1) + seq_puts(m, ","); + } + } else if (data->action == ACTION_TRACE) { + if (data->use_trace_keyword) + seq_printf(m, "%s", data->synth_event_name); + for (i = 0; i < data->n_params; i++) { + if (i || data->use_trace_keyword) + seq_puts(m, ","); + seq_printf(m, "%s", data->params[i]); + } } +} + +static void print_track_data_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + if (data->handler == HANDLER_ONMAX) + seq_puts(m, ":onmax("); + else if (data->handler == HANDLER_ONCHANGE) + seq_puts(m, ":onchange("); + seq_printf(m, "%s", data->track_data.var_str); + seq_printf(m, ").%s(", data->action_name); + + print_action_spec(m, hist_data, data); + seq_puts(m, ")"); } @@ -4436,18 +4892,12 @@ static void print_onmatch_spec(struct seq_file *m, struct hist_trigger_data *hist_data, struct action_data *data) { - unsigned int i; - - seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, - data->onmatch.match_event); + seq_printf(m, ":onmatch(%s.%s).", data->match_data.event_system, + data->match_data.event); - seq_printf(m, "%s(", data->onmatch.synth_event->name); + seq_printf(m, "%s(", data->action_name); - for (i = 0; i < data->n_params; i++) { - if (i) - seq_puts(m, ","); - seq_printf(m, "%s", data->params[i]); - } + print_action_spec(m, hist_data, data); seq_puts(m, ")"); } @@ -4463,8 +4913,11 @@ static bool actions_match(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; struct action_data *data_test = hist_data_test->actions[i]; + char *action_name, *action_name_test; - if (data->fn != data_test->fn) + if (data->handler != data_test->handler) + return false; + if (data->action != data_test->action) return false; if (data->n_params != data_test->n_params) @@ -4475,22 +4928,30 @@ static bool actions_match(struct hist_trigger_data *hist_data, return false; } - if (data->fn == action_trace) { - if (strcmp(data->onmatch.synth_event_name, - data_test->onmatch.synth_event_name) != 0) - return false; - if (strcmp(data->onmatch.match_event_system, - data_test->onmatch.match_event_system) != 0) - return false; - if (strcmp(data->onmatch.match_event, - data_test->onmatch.match_event) != 0) + if (data->use_trace_keyword) + action_name = data->synth_event_name; + else + action_name = data->action_name; + + if (data_test->use_trace_keyword) + action_name_test = data_test->synth_event_name; + else + action_name_test = data_test->action_name; + + if (strcmp(action_name, action_name_test) != 0) + return false; + + if (data->handler == HANDLER_ONMATCH) { + if (strcmp(data->match_data.event_system, + data_test->match_data.event_system) != 0) return false; - } else if (data->fn == onmax_save) { - if (strcmp(data->onmax.var_str, - data_test->onmax.var_str) != 0) + if (strcmp(data->match_data.event, + data_test->match_data.event) != 0) return false; - if (strcmp(data->onmax.fn_name, - data_test->onmax.fn_name) != 0) + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + if (strcmp(data->track_data.var_str, + data_test->track_data.var_str) != 0) return false; } } @@ -4507,10 +4968,11 @@ static void print_actions_spec(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) + if (data->handler == HANDLER_ONMATCH) print_onmatch_spec(m, hist_data, data); - else if (data->fn == onmax_save) - print_onmax_spec(m, hist_data, data); + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + print_track_data_spec(m, hist_data, data); } } @@ -4695,22 +5157,24 @@ static inline void add_to_key(char *compound_key, void *key, /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - } - memcpy(compound_key + key_field->offset, key, size); + strncpy(compound_key + key_field->offset, (char *)key, size); + } else + memcpy(compound_key + key_field->offset, key, size); } static void hist_trigger_actions(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, u64 *var_ref_vals) + struct ring_buffer_event *rbe, void *key, + u64 *var_ref_vals) { struct action_data *data; unsigned int i; for (i = 0; i < hist_data->n_actions; i++) { data = hist_data->actions[i]; - data->fn(hist_data, elt, rec, rbe, data, var_ref_vals); + data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals); } } @@ -4771,7 +5235,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals); + hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -4792,10 +5256,10 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, } } -static void -hist_trigger_entry_print(struct seq_file *m, - struct hist_trigger_data *hist_data, void *key, - struct tracing_map_elt *elt) +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) { struct hist_field *key_field; char str[KSYM_SYMBOL_LEN]; @@ -4871,6 +5335,17 @@ hist_trigger_entry_print(struct seq_file *m, seq_puts(m, " "); seq_puts(m, "}"); +} + +static void hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) +{ + const char *field_name; + unsigned int i; + + hist_trigger_print_key(m, hist_data, key, elt); seq_printf(m, " hitcount: %10llu", tracing_map_read_sum(elt, HITCOUNT_IDX)); @@ -4937,6 +5412,8 @@ static void hist_trigger_show(struct seq_file *m, if (n_entries < 0) n_entries = 0; + track_data_snapshot_print(m, hist_data); + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", (u64)atomic64_read(&hist_data->map->hits), n_entries, (u64)atomic64_read(&hist_data->map->drops)); @@ -5683,7 +6160,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (has_hist_vars(hist_data)) save_hist_vars(hist_data); - ret = create_actions(hist_data, file); + ret = create_actions(hist_data); if (ret) goto out_unreg; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c2af1560e856..69ebf3c2f1b5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -380,6 +380,7 @@ static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { trace_seq_putc(s, ' '); trace_print_lat_fmt(s, entry); + trace_seq_puts(s, " | "); } /* If the pid changed since the last trace, output this event */ @@ -501,6 +502,17 @@ static void print_graph_abs_time(u64 t, struct trace_seq *s) } static void +print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s) +{ + unsigned long long usecs; + + usecs = iter->ts - iter->trace_buffer->time_start; + do_div(usecs, NSEC_PER_USEC); + + trace_seq_printf(s, "%9llu us | ", usecs); +} + +static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { @@ -517,6 +529,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + /* Cpu */ if (flags & TRACE_GRAPH_PRINT_CPU) print_graph_cpu(s, cpu); @@ -725,6 +741,10 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + /* Cpu */ if (flags & TRACE_GRAPH_PRINT_CPU) print_graph_cpu(s, cpu); @@ -1101,6 +1121,8 @@ static void print_lat_header(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_ABS_TIME) size += 16; + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + size += 16; if (flags & TRACE_GRAPH_PRINT_CPU) size += 4; if (flags & TRACE_GRAPH_PRINT_PROC) @@ -1125,12 +1147,14 @@ static void __print_graph_headers_flags(struct trace_array *tr, seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_puts(s, " TIME "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " REL TIME "); if (flags & TRACE_GRAPH_PRINT_CPU) seq_puts(s, " CPU"); if (flags & TRACE_GRAPH_PRINT_PROC) seq_puts(s, " TASK/PID "); if (lat) - seq_puts(s, "||||"); + seq_puts(s, "|||| "); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_puts(s, " DURATION "); seq_puts(s, " FUNCTION CALLS\n"); @@ -1139,12 +1163,14 @@ static void __print_graph_headers_flags(struct trace_array *tr, seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_CPU) seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_PROC) seq_puts(s, " | | "); if (lat) - seq_puts(s, "||||"); + seq_puts(s, "|||| "); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_puts(s, " | | "); seq_puts(s, " | | | |\n"); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d42a473b8240..a745b0cee5d3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -239,7 +239,7 @@ static void irqsoff_trace_close(struct trace_iterator *iter) #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_REL_TIME | \ TRACE_GRAPH_PRINT_DURATION) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d953c163a079..810d78a8d14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -51,14 +51,16 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu_file, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 99592c27465e..5d5129b05df7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ static struct dyn_event_operations trace_kprobe_ops = { .match = trace_kprobe_match, }; -/** +/* * Kprobe event core functions */ struct trace_kprobe { @@ -221,7 +221,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->rp.maxactive = maxactive; - if (!event || !is_good_name(event)) { + if (!event || !group) { ret = -EINVAL; goto error; } @@ -231,11 +231,6 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk->tp.call.name) goto error; - if (!group || !is_good_name(group)) { - ret = -EINVAL; - goto error; - } - tk->tp.class.system = kstrdup(group, GFP_KERNEL); if (!tk->tp.class.system) goto error; @@ -624,7 +619,11 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (event) event++; - if (is_return && isdigit(argv[0][1])) { + if (isdigit(argv[0][1])) { + if (!is_return) { + pr_info("Maxactive is not for kprobe"); + return -EINVAL; + } if (event) len = event - &argv[0][1] - 1; else @@ -634,8 +633,8 @@ static int trace_kprobe_create(int argc, const char *argv[]) memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); - if (ret) { - pr_info("Failed to parse maxactive.\n"); + if (ret || !maxactive) { + pr_info("Invalid maxactive number\n"); return ret; } /* kretprobes instances are iterated over via a list. The @@ -694,9 +693,9 @@ static int trace_kprobe_create(int argc, const char *argv[]) tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); if (IS_ERR(tk)) { - pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tk)); ret = PTR_ERR(tk); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto out; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9962cb5da8ac..8f8411e7835f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,7 +13,7 @@ #include "trace_probe.h" -const char *reserved_field_names[] = { +static const char *reserved_field_names[] = { "common_type", "common_flags", "common_preempt_count", @@ -159,6 +159,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf) { const char *slash, *event = *pevent; + int len; slash = strchr(event, '/'); if (slash) { @@ -171,12 +172,25 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, return -E2BIG; } strlcpy(buf, event, slash - event + 1); + if (!is_good_name(buf)) { + pr_info("Group name must follow the same rules as C identifiers\n"); + return -EINVAL; + } *pgroup = buf; *pevent = slash + 1; + event = *pevent; } - if (strlen(event) == 0) { + len = strlen(event); + if (len == 0) { pr_info("Event name is not specified\n"); return -EINVAL; + } else if (len > MAX_EVENT_NAME_LEN) { + pr_info("Event name is too long\n"); + return -E2BIG; + } + if (!is_good_name(event)) { + pr_info("Event name must follow the same rules as C identifiers\n"); + return -EINVAL; } return 0; } @@ -300,6 +314,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, case '+': /* deref memory */ arg++; /* Skip '+', because kstrtol() rejects it. */ + /* fall through */ case '-': tmp = strchr(arg, '('); if (!tmp) @@ -547,6 +562,8 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, body = strchr(arg, '='); if (body) { + if (body - arg > MAX_ARG_NAME_LEN || body == arg) + return -EINVAL; parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8a63f8bc01bc..2177c206de15 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -32,6 +32,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 +#define MAX_ARG_NAME_LEN 32 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4ea7e6845efb..743b2b520d34 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -180,8 +180,11 @@ static void wakeup_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ - TRACE_GRAPH_PRINT_DURATION) + TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_REL_TIME | \ + TRACE_GRAPH_PRINT_DURATION | \ + TRACE_GRAPH_PRINT_OVERHEAD | \ + TRACE_GRAPH_PRINT_IRQS) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { @@ -472,6 +475,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); + __trace_stack(wakeup_trace, flags, 0, pc); T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); @@ -482,7 +486,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, if (likely(!is_tracing_stopped())) { wakeup_trace->max_latency = delta; - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu, NULL); } out_unlock: @@ -583,6 +587,7 @@ probe_wakeup(void *ignore, struct task_struct *p) data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + __trace_stack(wakeup_trace, flags, 0, pc); /* * We must be careful in using CALLER_ADDR2. But since wake_up diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9bde07c06362..be78d99ee6bc 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -273,10 +273,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; - if (!event || !is_good_name(event)) - return ERR_PTR(-EINVAL); - - if (!group || !is_good_name(group)) + if (!event || !group) return ERR_PTR(-EINVAL); tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); @@ -524,8 +521,9 @@ static int trace_uprobe_create(int argc, const char **argv) tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { - pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto fail_address_parse; } tu->offset = offset; |