diff options
Diffstat (limited to 'kernel')
65 files changed, 5038 insertions, 1308 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index be1c28fd4d57..227db99b0f19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -85,13 +85,13 @@ static int audit_initialized; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 -u32 audit_enabled; -u32 audit_ever_enabled; +u32 audit_enabled = AUDIT_OFF; +bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ -static u32 audit_default; +static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; @@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid_t auditd_pid; struct pid *req_pid = task_tgid(current); - /* sanity check - PID values must match */ - if (new_pid != pid_vnr(req_pid)) + /* Sanity check - PID values must match. Setting + * pid to 0 is how auditd ends auditing. */ + if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); - /* only the current auditd can unregister itself */ - if ((!new_pid) && (new_pid != auditd_pid)) { - audit_log_config_change("audit_pid", new_pid, - auditd_pid, 0); - return -EACCES; - } - /* replacing a healthy auditd is not allowed */ - if (auditd_pid && new_pid) { - audit_log_config_change("audit_pid", new_pid, - auditd_pid, 0); - return -EEXIST; + if (auditd_pid) { + /* replacing a healthy auditd is not allowed */ + if (new_pid) { + audit_log_config_change("audit_pid", + new_pid, auditd_pid, 0); + return -EEXIST; + } + /* only current auditd can unregister itself */ + if (pid_vnr(req_pid) != auditd_pid) { + audit_log_config_change("audit_pid", + new_pid, auditd_pid, 0); + return -EACCES; + } } if (new_pid) { @@ -1549,8 +1552,6 @@ static int __init audit_init(void) register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; - audit_enabled = 
audit_default; - audit_ever_enabled |= !!audit_default; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { @@ -1564,14 +1565,21 @@ static int __init audit_init(void) return 0; } -__initcall(audit_init); +postcore_initcall(audit_init); /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ static int __init audit_enable(char *str) { - audit_default = !!simple_strtol(str, NULL, 0); - if (!audit_default) + long val; + + if (kstrtol(str, 0, &val)) + panic("audit: invalid 'audit' parameter value (%s)\n", str); + audit_default = (val ? AUDIT_ON : AUDIT_OFF); + + if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; + if (audit_set_enabled(audit_default)) + panic("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); @@ -2337,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, } } -#ifdef CONFIG_SECURITY -/** - * audit_log_secctx - Converts and logs SELinux context - * @ab: audit_buffer - * @secid: security number - * - * This is a helper function that calls security_secid_to_secctx to convert - * secid to secctx and then adds the (converted) SELinux context to the audit - * log by calling audit_log_format, thus also preventing leak of internal secid - * to userspace. If secid cannot be converted audit_panic is called. 
- */ -void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ - u32 len; - char *secctx; - - if (security_secid_to_secctx(secid, &secctx, &len)) { - audit_panic("Cannot convert secid to context"); - } else { - audit_log_format(ab, " obj=%s", secctx); - security_release_secctx(secctx, len); - } -} -EXPORT_SYMBOL(audit_log_secctx); -#endif - EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); diff --git a/kernel/audit.h b/kernel/audit.h index 9b110ae17ee3..af5bc59487ed 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -208,7 +208,7 @@ struct audit_context { struct audit_proctitle proctitle; }; -extern u32 audit_ever_enabled; +extern bool audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index d4b050d9a66e..fd353120e0d9 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1008,7 +1008,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify * We are guaranteed to have at least one reference to the mark from * either the inode or the caller of fsnotify_destroy_mark(). 
*/ - BUG_ON(atomic_read(&entry->refcnt) < 1); + BUG_ON(refcount_read(&entry->refcnt) < 1); } static const struct fsnotify_ops audit_tree_ops = { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0b0aa5854dac..4a1758adb222 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[3]), LIST_HEAD_INIT(audit_filter_list[4]), LIST_HEAD_INIT(audit_filter_list[5]), -#if AUDIT_NR_FILTERS != 6 + LIST_HEAD_INIT(audit_filter_list[6]), +#if AUDIT_NR_FILTERS != 7 #error Fix audit_filter_list initialiser #endif }; @@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_rules_list[3]), LIST_HEAD_INIT(audit_rules_list[4]), LIST_HEAD_INIT(audit_rules_list[5]), + LIST_HEAD_INIT(audit_rules_list[6]), }; DEFINE_MUTEX(audit_filter_mutex); @@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * #endif case AUDIT_FILTER_USER: case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { @@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; + case AUDIT_FSTYPE: + if (entry->rule.listnr != AUDIT_FILTER_FS) + return -EINVAL; + break; + } + + switch(entry->rule.listnr) { + case AUDIT_FILTER_FS: + switch(f->type) { + case AUDIT_FSTYPE: + case AUDIT_FILTERKEY: + break; + default: + return -EINVAL; + } } switch(f->type) { @@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return -EINVAL; /* FALL THROUGH */ case AUDIT_ARCH: + case AUDIT_FSTYPE: if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; break; @@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry) #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; - /* If either of these, don't count towards 
total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) + /* If any of these, don't count towards total */ + switch(entry->rule.listnr) { + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: dont_count = 1; + } #endif mutex_lock(&audit_filter_mutex); @@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry) #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) + /* If any of these, don't count towards total */ + switch(entry->rule.listnr) { + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: dont_count = 1; + } #endif mutex_lock(&audit_filter_mutex); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ecc23e25c9eb..e80459f7e132 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent, struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; + struct audit_entry *e; + struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; + int i; if (!context->in_syscall) return; + rcu_read_lock(); + if (!list_empty(list)) { + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE) { + if (audit_comparator(parent->i_sb->s_magic, + f->op, f->val)) { + if (e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; + } + } + } + } + } + } + rcu_read_unlock(); + if (inode) handle_one(inode); @@ -2390,6 +2413,12 @@ void __audit_log_kern_module(char *name) context->type = AUDIT_KERN_MODULE; } +void __audit_fanotify(unsigned int response) +{ + audit_log(current->audit_context, GFP_KERNEL, + AUDIT_FANOTIFY, "resp=%u", response); +} + static void audit_log_task(struct 
audit_buffer *ab) { kuid_t auid, uid; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index af3ab6164ff5..e691da0b3bab 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,8 +3,11 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c4b9ab01bba5..7c25426d3cf5 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -19,6 +19,9 @@ #include "map_in_map.h" +#define ARRAY_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || + attr->value_size == 0 || + attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..b789ab78d28f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,405 @@ void cgroup_bpf_put(struct cgroup *cgrp) { unsigned int type; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { - struct bpf_prog *prog = cgrp->bpf.prog[type]; - - if (prog) { - bpf_prog_put(prog); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog_list *pl, *tmp; + + list_for_each_entry_safe(pl, tmp, progs, node) { + list_del(&pl->node); + 
bpf_prog_put(pl->prog); + kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_prog_array_free(cgrp->bpf.effective[type]); + } +} + +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + list_for_each_entry(pl, head, node) { + if (!pl->prog) + continue; + cnt++; } + return cnt; +} + +/* if parent has non-overridable prog attached, + * disallow attaching new programs to the descendent cgroup. + * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, + enum bpf_attach_type type, + u32 new_flags) +{ + struct cgroup *p; + + p = cgroup_parent(cgrp); + if (!p) + return true; + do { + u32 flags = p->bpf.flags[type]; + u32 cnt; + + if (flags & BPF_F_ALLOW_MULTI) + return true; + cnt = prog_list_length(&p->bpf.progs[type]); + WARN_ON_ONCE(cnt > 1); + if (cnt == 1) + return !!(flags & BPF_F_ALLOW_OVERRIDE); + p = cgroup_parent(p); + } while (p); + return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. 
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding + * to programs in this cgroup + */ +static int compute_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu **array) +{ + struct bpf_prog_array __rcu *progs; + struct bpf_prog_list *pl; + struct cgroup *p = cgrp; + int cnt = 0; + + /* count number of effective programs by walking parents */ + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[type]); + p = cgroup_parent(p); + } while (p); + + progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!progs) + return -ENOMEM; + + /* populate the array with effective progs */ + cnt = 0; + p = cgrp; + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + list_for_each_entry(pl, + &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + rcu_dereference_protected(progs, 1)-> + progs[cnt++] = pl->prog; + } + p = cgroup_parent(p); + } while (p); + + *array = progs; + return 0; +} + +static void activate_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu *array) +{ + struct bpf_prog_array __rcu *old_array; + + old_array = xchg(&cgrp->bpf.effective[type], array); + /* free prog array after grace period, since __cgroup_bpf_run_*() + * might be still walking the array + */ + bpf_prog_array_free(old_array); } /** * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify - * @parent: the parent to inherit from */ -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +int cgroup_bpf_inherit(struct cgroup *cgrp) { - unsigned int type; +/* has to use marco instead of const int, since compiler thinks + * that array below is variable length + */ +#define NR ARRAY_SIZE(cgrp->bpf.effective) + struct bpf_prog_array __rcu *arrays[NR] = {}; + int i; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { - struct bpf_prog *e; + for (i = 0; i < NR; i++) + 
INIT_LIST_HEAD(&cgrp->bpf.progs[i]); - e = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - rcu_assign_pointer(cgrp->bpf.effective[type], e); - cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; - } + for (i = 0; i < NR; i++) + if (compute_effective_progs(cgrp, i, &arrays[i])) + goto cleanup; + + for (i = 0; i < NR; i++) + activate_effective_progs(cgrp, i, arrays[i]); + + return 0; +cleanup: + for (i = 0; i < NR; i++) + bpf_prog_array_free(arrays[i]); + return -ENOMEM; } +#define BPF_CGROUP_MAX_PROGS 64 + /** - * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * __cgroup_bpf_attach() - Attach the program to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse - * @parent: The parent of @cgrp, or %NULL if @cgrp is the root - * @prog: A new program to pin - * @type: Type of pinning operation (ingress/egress) - * - * Each cgroup has a set of two pointers for bpf programs; one for eBPF - * programs it owns, and which is effective for execution. - * - * If @prog is not %NULL, this function attaches a new program to the cgroup - * and releases the one that is currently attached, if any. @prog is then made - * the effective program of type @type in that cgroup. - * - * If @prog is %NULL, the currently attached program of type @type is released, - * and the effective program of the parent cgroup (if any) is inherited to - * @cgrp. - * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation * * Must be called with cgroup_mutex held. 
*/ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct bpf_prog *old_prog, *effective = NULL; - struct cgroup_subsys_state *pos; - bool overridable = true; - - if (parent) { - overridable = !parent->bpf.disallow_override[type]; - effective = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - } - - if (prog && effective && !overridable) - /* if parent has non-overridable prog attached, disallow - * attaching new programs to descendent cgroup - */ + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + bool pl_was_allocated; + int err; + + if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + /* invalid combination */ + return -EINVAL; + + if (!hierarchy_allows_attach(cgrp, type, flags)) return -EPERM; - if (prog && effective && overridable != new_overridable) - /* if parent has overridable prog attached, only - * allow overridable programs in descendent cgroup + if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + /* Disallow attaching non-overridable on top + * of existing overridable in this cgroup. 
+ * Disallow attaching multi-prog if overridable or none */ return -EPERM; - old_prog = cgrp->bpf.prog[type]; - - if (prog) { - overridable = new_overridable; - effective = prog; - if (old_prog && - cgrp->bpf.disallow_override[type] == new_overridable) - /* disallow attaching non-overridable on top - * of existing overridable in this cgroup - * and vice versa - */ - return -EPERM; + if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + return -E2BIG; + + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + pl->prog = prog; + list_add_tail(&pl->node, progs); + } else { + if (list_empty(progs)) { + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + list_add_tail(&pl->node, progs); + } else { + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl_was_allocated = false; + } + pl->prog = prog; } - if (!prog && !old_prog) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; + cgrp->bpf.flags[type] = flags; - cgrp->bpf.prog[type] = prog; + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); - css_for_each_descendant_pre(pos, &cgrp->self) { - struct cgroup *desc = container_of(pos, struct cgroup, self); - - /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) { - pos = css_rightmost_descendant(pos); - } else { - rcu_assign_pointer(desc->bpf.effective[type], - effective); - desc->bpf.disallow_override[type] = !overridable; - } + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; } - if (prog) - static_branch_inc(&cgroup_bpf_enabled_key); + /* all allocations were 
successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + static_branch_inc(&cgroup_bpf_enabled_key); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); + } + return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. 
+ */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 unused_flags) +{ + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + int err; + + if (flags & BPF_F_ALLOW_MULTI) { + if (!prog) + /* to detach MULTI prog the user has to specify valid FD + * of the program to be detached + */ + return -EINVAL; + } else { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + } + + if (flags & BPF_F_ALLOW_MULTI) { + /* find the prog and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog != prog) + continue; + old_prog = prog; + /* mark it deleted, so it's ignored while + * recomputing effective + */ + pl->prog = NULL; + break; + } + if (!old_prog) + return -ENOENT; + } else { + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) + */ + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl->prog = NULL; + } + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. 
Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* now can actually delete it from this cgroup list */ + list_del(&pl->node); + kfree(pl); + if (list_empty(progs)) + /* last program was detached, reset flags to zero */ + cgrp->bpf.flags[type] = 0; + + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and restore back old_prog */ + pl->prog = old_prog; + return err; +} + +/* Must be called with cgroup_mutex held to avoid races. */ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + 
prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; } /** @@ -171,36 +447,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type) { - struct bpf_prog *prog; + unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk; struct cgroup *cgrp; - int ret = 0; + int ret; if (!sk || !sk_fullsock(sk)) return 0; - if (sk->sk_family != AF_INET && - sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) { - unsigned int offset = skb->data - skb_network_header(skb); - struct sock *save_sk = skb->sk; - - skb->sk = sk; - __skb_push(skb, offset); - ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; - __skb_pull(skb, offset); - skb->sk = save_sk; - } - - rcu_read_unlock(); - - return ret; + save_sk = skb->sk; + skb->sk = sk; + __skb_push(skb, offset); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + bpf_prog_run_save_cb); + __skb_pull(skb, offset); + skb->sk = save_sk; + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -221,19 +487,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; + int ret; - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); - - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + return ret == 1 ? 
0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -258,18 +515,77 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; + int ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, + BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type) +{ + struct cgroup *cgrp; + struct bpf_cgroup_dev_ctx ctx = { + .access_type = (access << 16) | dev_type, + .major = major, + .minor = minor, + }; + int allow = 1; rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN); + rcu_read_unlock(); - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; + return !allow; +} +EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); - rcu_read_unlock(); +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + default: + return NULL; + } +} - return ret; +static bool cgroup_dev_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) + return false; + /* The verifier guarantees that size > 0. 
*/ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +const struct bpf_prog_ops cg_dev_prog_ops = { +}; + +const struct bpf_verifier_ops cg_dev_verifier_ops = { + .get_func_proto = cgroup_dev_func_proto, + .is_valid_access = cgroup_dev_is_valid_access, +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7b62df86be1d..b9f8686a84cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - kmemcheck_annotate_bitfield(fp, meta); - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { - kmemcheck_annotate_bitfield(fp, meta); - memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; @@ -309,12 +305,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { + const char *end = sym + KSYM_NAME_LEN; + BUILD_BUG_ON(sizeof("bpf_prog_") + - sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + sizeof(prog->tag) * 2 + + /* name has been null terminated. + * We should need +1 for the '_' preceding + * the name. However, the null character + * is double counted between the name and the + * sizeof("bpf_prog_") above, so we omit + * the +1 here. 
+ */ + sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); - *sym = 0; + if (prog->aux->name[0]) + snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); + else + *sym = 0; } static __always_inline unsigned long @@ -662,8 +671,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); if (fp != NULL) { - kmemcheck_annotate_bitfield(fp, meta); - /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. @@ -1367,7 +1374,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ - fp = bpf_int_jit_compile(fp); + if (!bpf_prog_is_dev_bound(fp->aux)) { + fp = bpf_int_jit_compile(fp); + } else { + *err = bpf_prog_offload_compile(fp); + if (*err) + return fp; + } bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1381,11 +1394,163 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +static unsigned int __bpf_prog_ret1(const void *ctx, + const struct bpf_insn *insn) +{ + return 1; +} + +static struct bpf_prog_dummy { + struct bpf_prog prog; +} dummy_bpf_prog = { + .prog = { + .bpf_func = __bpf_prog_ret1, + }, +}; + +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'empty_prog_array' + * It will not be modified the caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +static struct { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +} empty_prog_array = { + .null_prog = NULL, +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, 
gfp_t flags) +{ + if (prog_cnt) + return kzalloc(sizeof(struct bpf_prog_array) + + sizeof(struct bpf_prog *) * (prog_cnt + 1), + flags); + + return &empty_prog_array.hdr; +} + +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +{ + if (!progs || + progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + return; + kfree_rcu(progs, rcu); +} + +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, + struct bpf_prog *old_prog) +{ + struct bpf_prog **prog = progs->progs; + + for (; *prog; prog++) + if (*prog == old_prog) { + WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + break; + } +} + +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, + struct bpf_prog *exclude_prog, + struct bpf_prog *include_prog, + struct bpf_prog_array **new_array) +{ + int new_prog_cnt, carry_prog_cnt = 0; + struct bpf_prog **existing_prog; + struct bpf_prog_array *array; + int new_prog_idx = 0; + + /* Figure out how many existing progs we need to carry over to + * the new array. 
+ */ + if (old_array) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) { + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + carry_prog_cnt++; + if (*existing_prog == include_prog) + return -EEXIST; + } + } + + /* How many progs (not NULL) will be in the new array? */ + new_prog_cnt = carry_prog_cnt; + if (include_prog) + new_prog_cnt += 1; + + /* Do we have any prog (not NULL) in the new array? */ + if (!new_prog_cnt) { + *new_array = NULL; + return 0; + } + + /* +1 as the end of prog_array is marked with NULL */ + array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); + if (!array) + return -ENOMEM; + + /* Fill in the new prog array */ + if (carry_prog_cnt) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + array->progs[new_prog_idx++] = *existing_prog; + } + if (include_prog) + array->progs[new_prog_idx++] = include_prog; + array->progs[new_prog_idx] = NULL; + *new_array = array; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; aux = container_of(work, struct bpf_prog_aux, work); + if (bpf_prog_is_dev_bound(aux)) + bpf_prog_offload_destroy(aux->prog); bpf_jit_free(aux->prog); } @@ -1498,5 +1663,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); +#endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c new file mode 100644 index 000000000000..ce5b669003b2 --- /dev/null +++ b/kernel/bpf/cpumap.c @@ -0,0 +1,706 @@ +/* bpf/cpumap.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. 
+ */ + +/* The 'cpumap' is primarily used as a backend map for XDP BPF helper + * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. + * + * Unlike devmap which redirects XDP frames out another NIC device, + * this map type redirects raw XDP frames to another CPU. The remote + * CPU will do SKB-allocation and call the normal network stack. + * + * This is a scalability and isolation mechanism, that allow + * separating the early driver network XDP layer, from the rest of the + * netstack, and assigning dedicated CPUs for this stage. This + * basically allows for 10G wirespeed pre-filtering via bpf. + */ +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/ptr_ring.h> + +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/capability.h> +#include <trace/events/xdp.h> + +#include <linux/netdevice.h> /* netif_receive_skb_core */ +#include <linux/etherdevice.h> /* eth_type_trans */ + +/* General idea: XDP packets getting XDP redirected to another CPU, + * will maximum be stored/queued for one driver ->poll() call. It is + * guaranteed that setting flush bit and flush operation happen on + * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() + * which queue in bpf_cpu_map_entry contains packets. 
+ */ + +#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct xdp_bulk_queue { + void *q[CPU_MAP_BULK_SIZE]; + unsigned int count; +}; + +/* Struct for every remote "destination" CPU in map */ +struct bpf_cpu_map_entry { + u32 cpu; /* kthread CPU and map index */ + int map_id; /* Back reference to map */ + u32 qsize; /* Queue size placeholder for map lookup */ + + /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ + struct xdp_bulk_queue __percpu *bulkq; + + /* Queue with potential multi-producers, and single-consumer kthread */ + struct ptr_ring *queue; + struct task_struct *kthread; + struct work_struct kthread_stop_wq; + + atomic_t refcnt; /* Control when this struct can be free'ed */ + struct rcu_head rcu; +}; + +struct bpf_cpu_map { + struct bpf_map map; + /* Below members specific for map type */ + struct bpf_cpu_map_entry **cpu_map; + unsigned long __percpu *flush_needed; +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq); + +static u64 cpu_map_bitmap_size(const union bpf_attr *attr) +{ + return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +} + +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) +{ + struct bpf_cpu_map *cmap; + int err = -ENOMEM; + u64 cost; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + return ERR_PTR(-EINVAL); + + cmap = kzalloc(sizeof(*cmap), GFP_USER); + if (!cmap) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + cmap->map.map_type = attr->map_type; + cmap->map.key_size = attr->key_size; + cmap->map.value_size = attr->value_size; + cmap->map.max_entries = attr->max_entries; + cmap->map.map_flags = attr->map_flags; + cmap->map.numa_node = bpf_map_attr_numa_node(attr); + + /* Pre-limit array size based on NR_CPUS, not final CPU check */ + 
if (cmap->map.max_entries > NR_CPUS) { + err = -E2BIG; + goto free_cmap; + } + + /* make sure page count doesn't overflow */ + cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); + cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_cmap; + cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + ret = bpf_map_precharge_memlock(cmap->map.pages); + if (ret) { + err = ret; + goto free_cmap; + } + + /* A per cpu bitfield with a bit per possible CPU in map */ + cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), + __alignof__(unsigned long)); + if (!cmap->flush_needed) + goto free_cmap; + + /* Alloc array for possible remote "destination" CPUs */ + cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * + sizeof(struct bpf_cpu_map_entry *), + cmap->map.numa_node); + if (!cmap->cpu_map) + goto free_percpu; + + return &cmap->map; +free_percpu: + free_percpu(cmap->flush_needed); +free_cmap: + kfree(cmap); + return ERR_PTR(err); +} + +void __cpu_map_queue_destructor(void *ptr) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. 
+ */ + if (WARN_ON_ONCE(ptr)) + page_frag_free(ptr); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); + kfree(rcpu->queue); + kfree(rcpu); + } +} + +static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + atomic_inc(&rcpu->refcnt); +} + +/* called from workqueue, to workaround syscall using preempt_disable */ +static void cpu_map_kthread_stop(struct work_struct *work) +{ + struct bpf_cpu_map_entry *rcpu; + + rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + + /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, + * as it waits until all in-flight call_rcu() callbacks complete. + */ + rcu_barrier(); + + /* kthread_stop will wake_up_process and wait for it to complete */ + kthread_stop(rcpu->kthread); +} + +/* For now, xdp_pkt is a cpumap internal data structure, with info + * carried between enqueue to dequeue. It is mapped into the top + * headroom of the packet, to avoid allocating separate mem. + */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; + u16 metasize; + struct net_device *dev_rx; +}; + +/* Convert xdp_buff to xdp_pkt */ +static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) +{ + struct xdp_pkt *xdp_pkt; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? 
metasize : 0; + if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) + return NULL; + + /* Store info in top of packet */ + xdp_pkt = xdp->data_hard_start; + + xdp_pkt->data = xdp->data; + xdp_pkt->len = xdp->data_end - xdp->data; + xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); + xdp_pkt->metasize = metasize; + + return xdp_pkt; +} + +struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) +{ + unsigned int frame_size; + void *pkt_data_start; + struct sk_buff *skb; + + /* build_skb need to place skb_shared_info after SKB end, and + * also want to know the memory "truesize". Thus, need to + * know the memory frame size backing xdp_buff. + * + * XDP was designed to have PAGE_SIZE frames, but this + * assumption is not longer true with ixgbe and i40e. It + * would be preferred to set frame_size to 2048 or 4096 + * depending on the driver. + * frame_size = 2048; + * frame_len = frame_size - sizeof(*xdp_pkt); + * + * Instead, with info avail, skb_shared_info in placed after + * packet len. This, unfortunately fakes the truesize. + * Another disadvantage of this approach, the skb_shared_info + * is not at a fixed memory location, with mixed length + * packets, which is bad for cache-line hotness. 
+ */ + frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + skb = build_skb(pkt_data_start, frame_size); + if (!skb) + return NULL; + + skb_reserve(skb, xdp_pkt->headroom); + __skb_put(skb, xdp_pkt->len); + if (xdp_pkt->metasize) + skb_metadata_set(skb, xdp_pkt->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + return skb; +} + +static int cpu_map_kthread_run(void *data) +{ + struct bpf_cpu_map_entry *rcpu = data; + + set_current_state(TASK_INTERRUPTIBLE); + + /* When kthread gives stop order, then rcpu have been disconnected + * from map, thus no new packets can enter. Remaining in-flight + * per CPU stored packets are flushed to this queue. Wait honoring + * kthread_stop signal until queue is empty. + */ + while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + unsigned int processed = 0, drops = 0, sched = 0; + struct xdp_pkt *xdp_pkt; + + /* Release CPU reschedule checks */ + if (__ptr_ring_empty(rcpu->queue)) { + set_current_state(TASK_INTERRUPTIBLE); + /* Recheck to avoid lost wake-up */ + if (__ptr_ring_empty(rcpu->queue)) { + schedule(); + sched = 1; + } else { + __set_current_state(TASK_RUNNING); + } + } else { + sched = cond_resched(); + } + + /* Process packets in rcpu->queue */ + local_bh_disable(); + /* + * The bpf_cpu_map_entry is single consumer, with this + * kthread CPU pinned. Lockless access to ptr_ring + * consume side valid as no-resize allowed of queue. 
+ */ + while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + struct sk_buff *skb; + int ret; + + skb = cpu_map_build_skb(rcpu, xdp_pkt); + if (!skb) { + page_frag_free(xdp_pkt); + continue; + } + + /* Inject into network stack */ + ret = netif_receive_skb_core(skb); + if (ret == NET_RX_DROP) + drops++; + + /* Limit BH-disable period */ + if (++processed == 8) + break; + } + /* Feedback loop via tracepoint */ + trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + + local_bh_enable(); /* resched point, may call do_softirq() */ + } + __set_current_state(TASK_RUNNING); + + put_cpu_map_entry(rcpu); + return 0; +} + +struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +{ + gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + struct bpf_cpu_map_entry *rcpu; + int numa, err; + + /* Have map->numa_node, but choose node of redirect target CPU */ + numa = cpu_to_node(cpu); + + rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + if (!rcpu) + return NULL; + + /* Alloc percpu bulkq */ + rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), + sizeof(void *), gfp); + if (!rcpu->bulkq) + goto free_rcu; + + /* Alloc queue */ + rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + if (!rcpu->queue) + goto free_bulkq; + + err = ptr_ring_init(rcpu->queue, qsize, gfp); + if (err) + goto free_queue; + + rcpu->cpu = cpu; + rcpu->map_id = map_id; + rcpu->qsize = qsize; + + /* Setup kthread */ + rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, + "cpumap/%d/map:%d", cpu, map_id); + if (IS_ERR(rcpu->kthread)) + goto free_ptr_ring; + + get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ + get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + + /* Make sure kthread runs on a single CPU */ + kthread_bind(rcpu->kthread, cpu); + wake_up_process(rcpu->kthread); + + return rcpu; + +free_ptr_ring: + ptr_ring_cleanup(rcpu->queue, NULL); +free_queue: + kfree(rcpu->queue); +free_bulkq: + free_percpu(rcpu->bulkq); +free_rcu: + 
kfree(rcpu); + return NULL; +} + +void __cpu_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_cpu_map_entry *rcpu; + int cpu; + + /* This cpu_map_entry have been disconnected from map and one + * RCU graze-period have elapsed. Thus, XDP cannot queue any + * new packets and cannot change/set flush_needed that can + * find this entry. + */ + rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + + /* Flush remaining packets in percpu bulkq */ + for_each_online_cpu(cpu) { + struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); + + /* No concurrent bq_enqueue can run at this point */ + bq_flush_to_queue(rcpu, bq); + } + free_percpu(rcpu->bulkq); + /* Cannot kthread_stop() here, last put free rcpu resources */ + put_cpu_map_entry(rcpu); +} + +/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to + * ensure any driver rcu critical sections have completed, but this + * does not guarantee a flush has happened yet. Because driver side + * rcu_read_lock/unlock only protects the running XDP program. The + * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a + * pending flush op doesn't fail. + * + * The bpf_cpu_map_entry is still used by the kthread, and there can + * still be pending packets (in queue and percpu bulkq). A refcnt + * makes sure to last user (kthread_stop vs. call_rcu) free memory + * resources. + * + * The rcu callback __cpu_map_entry_free flush remaining packets in + * percpu bulkq to queue. Due to caller map_delete_elem() disable + * preemption, cannot call kthread_stop() to make sure queue is empty. + * Instead a work_queue is started for stopping kthread, + * cpu_map_kthread_stop, which waits for an RCU graze period before + * stopping kthread, emptying the queue. 
+ */ +void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +{ + struct bpf_cpu_map_entry *old_rcpu; + + old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + if (old_rcpu) { + call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); + INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); + schedule_work(&old_rcpu->kthread_stop_wq); + } +} + +int cpu_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 key_cpu = *(u32 *)key; + + if (key_cpu >= map->max_entries) + return -EINVAL; + + /* notice caller map_delete_elem() use preempt_disable() */ + __cpu_map_entry_replace(cmap, key_cpu, NULL); + return 0; +} + +int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + /* Array index key correspond to CPU number */ + u32 key_cpu = *(u32 *)key; + /* Value is the queue size */ + u32 qsize = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(key_cpu >= cmap->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + return -EOVERFLOW; + + /* Make sure CPU is a valid possible cpu */ + if (!cpu_possible(key_cpu)) + return -ENODEV; + + if (qsize == 0) { + rcpu = NULL; /* Same as deleting */ + } else { + /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ + rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + if (!rcpu) + return -ENOMEM; + } + rcu_read_lock(); + __cpu_map_entry_replace(cmap, key_cpu, rcpu); + rcu_read_unlock(); + return 0; +} + +void cpu_map_free(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + int cpu; + u32 i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the bpf programs 
(can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. The rcu critical section only guarantees + * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. + * It does __not__ ensure pending flush operations (if any) are + * complete. + */ + synchronize_rcu(); + + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. + */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + + while (!bitmap_empty(bitmap, cmap->map.max_entries)) + cond_resched(); + } + + /* For cpu_map the remote CPUs can still be using the entries + * (struct bpf_cpu_map_entry). + */ + for (i = 0; i < cmap->map.max_entries; i++) { + struct bpf_cpu_map_entry *rcpu; + + rcpu = READ_ONCE(cmap->cpu_map[i]); + if (!rcpu) + continue; + + /* bq flush and cleanup happens after RCU graze-period */ + __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ + } + free_percpu(cmap->flush_needed); + bpf_map_area_free(cmap->cpu_map); + kfree(cmap); +} + +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + if (key >= map->max_entries) + return NULL; + + rcpu = READ_ONCE(cmap->cpu_map[key]); + return rcpu; +} + +static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map_entry *rcpu = + __cpu_map_lookup_elem(map, *(u32 *)key); + + return rcpu ? &rcpu->qsize : NULL; +} + +static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 index = key ? 
*(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= cmap->map.max_entries) { + *next = 0; + return 0; + } + + if (index == cmap->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +const struct bpf_map_ops cpu_map_ops = { + .map_alloc = cpu_map_alloc, + .map_free = cpu_map_free, + .map_delete_elem = cpu_map_delete_elem, + .map_update_elem = cpu_map_update_elem, + .map_lookup_elem = cpu_map_lookup_elem, + .map_get_next_key = cpu_map_get_next_key, +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq) +{ + unsigned int processed = 0, drops = 0; + const int to_cpu = rcpu->cpu; + struct ptr_ring *q; + int i; + + if (unlikely(!bq->count)) + return 0; + + q = rcpu->queue; + spin_lock(&q->producer_lock); + + for (i = 0; i < bq->count; i++) { + void *xdp_pkt = bq->q[i]; + int err; + + err = __ptr_ring_produce(q, xdp_pkt); + if (err) { + drops++; + page_frag_free(xdp_pkt); /* Free xdp_pkt */ + } + processed++; + } + bq->count = 0; + spin_unlock(&q->producer_lock); + + /* Feedback loop via tracepoints */ + trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); + return 0; +} + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + + if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) + bq_flush_to_queue(rcpu, bq); + + /* Notice, xdp_buff/page MUST be queued here, long enough for + * driver to code invoking us to finished, due to driver + * (e.g. ixgbe) recycle tricks based on page-refcnt. + * + * Thus, incoming xdp_pkt is always queued here (else we race + * with another CPU on page-refcnt and remaining driver code). + * Queue time is very short, as driver will invoke flush + * operation, when completing napi->poll call. 
+ */ + bq->q[bq->count++] = xdp_pkt; + return 0; +} + +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct xdp_pkt *xdp_pkt; + + xdp_pkt = convert_to_xdp_pkt(xdp); + if (unlikely(!xdp_pkt)) + return -EOVERFLOW; + + /* Info needed when constructing SKB on remote CPU */ + xdp_pkt->dev_rx = dev_rx; + + bq_enqueue(rcpu, xdp_pkt); + return 0; +} + +void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + + __set_bit(bit, bitmap); +} + +void __cpu_map_flush(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + u32 bit; + + /* The napi->poll softirq makes sure __cpu_map_insert_ctx() + * and __cpu_map_flush() happen on same CPU. Thus, the percpu + * bitmap indicate which percpu bulkq have packets. + */ + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); + struct xdp_bulk_queue *bq; + + /* This is possible if entry is removed by user space + * between xdp redirect and flush op. 
+ */ + if (unlikely(!rcpu)) + continue; + + __clear_bit(bit, bitmap); + + /* Flush all frames in bulkq to real queue */ + bq = this_cpu_ptr(rcpu->bulkq); + bq_flush_to_queue(rcpu, bq); + + /* If already running, costs spin_lock_irqsave + smb_mb */ + wake_up_process(rcpu->kthread); + } +} diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e745d6a88224..ebdef54bf7df 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -50,6 +50,9 @@ #include <linux/bpf.h> #include <linux/filter.h> +#define DEV_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_dtab_netdev { struct net_device *dev; struct bpf_dtab *dtab; @@ -83,7 +86,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); dtab = kzalloc(sizeof(*dtab), GFP_USER); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c new file mode 100644 index 000000000000..e682850c9715 --- /dev/null +++ b/kernel/bpf/disasm.c @@ -0,0 +1,214 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include <linux/bpf.h> + +#include "disasm.h" + +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + +const char *const bpf_class_string[8] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +const char *const bpf_alu_string[16] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", + [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", + [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_end_insn(bpf_insn_print_cb verbose, + struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", + insn->imm, insn->dst_reg); +} + +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose(env, "BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(verbose, env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose(env, "(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + } else { + verbose(env, "(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? 
"(u32) " : "", + insn->imm); + } + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose(env, "BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_st_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_ldx_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. 
+ */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !allow_ptr_leaks) + imm = 0; + + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); + } else { + verbose(env, "BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose(env, "(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose(env, "(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); + } +} diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h new file mode 100644 index 000000000000..8de977e420b6 --- /dev/null +++ b/kernel/bpf/disasm.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#ifndef __BPF_DISASM_H__ +#define __BPF_DISASM_H__ + +#include <linux/bpf.h> +#include <linux/kernel.h> +#include <linux/stringify.h> + +extern const char *const bpf_alu_string[16]; +extern const char *const bpf_class_string[8]; + +const char *func_id_name(int id); + +struct bpf_verifier_env; +typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, + const char *, ...); +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks); + +#endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 6533f08d1238..e469e05c8e83 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,8 +18,9 @@ #include "bpf_lru_list.h" #include "map_in_map.h" -#define HTAB_CREATE_FLAG_MASK \ - (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) +#define HTAB_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) struct bucket { struct hlist_nulls_head head; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be1dde967208..01aaef1a77c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -295,7 +295,7 @@ out: } static void *bpf_obj_do_get(const struct filename *pathname, - enum bpf_type *type) + enum bpf_type *type, int flags) { struct inode *inode; struct path path; @@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname, return ERR_PTR(ret); inode = d_backing_inode(path.dentry); - ret = inode_permission(inode, MAY_WRITE); + ret = inode_permission(inode, ACC_MODE(flags)); if (ret) goto out; @@ -326,18 +326,23 @@ out: return ERR_PTR(ret); } -int bpf_obj_get_user(const char __user *pathname) +int bpf_obj_get_user(const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; struct filename *pname; int ret = -ENOENT; + int f_flags; void *raw; + f_flags = bpf_get_file_flag(flags); + if (f_flags < 0) + return f_flags; + pname = getname(pathname); if (IS_ERR(pname)) 
return PTR_ERR(pname); - raw = bpf_obj_do_get(pname, &type); + raw = bpf_obj_do_get(pname, &type, f_flags); if (IS_ERR(raw)) { ret = PTR_ERR(raw); goto out; @@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname) if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) - ret = bpf_map_new_fd(raw); + ret = bpf_map_new_fd(raw, f_flags); else goto out; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1b767844a76f..885e45479680 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -389,10 +389,99 @@ out: return ret; } -static int trie_delete_elem(struct bpf_map *map, void *key) +/* Called from syscall or from eBPF program */ +static int trie_delete_elem(struct bpf_map *map, void *_key) { - /* TODO */ - return -ENOSYS; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key; + struct lpm_trie_node __rcu **trim, **trim2; + struct lpm_trie_node *node, *parent; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Walk the tree looking for an exact key/length match and keeping + * track of the path we traverse. We will need to know the node + * we wish to delete, and the slot that points to the node we want + * to delete. We may also need to know the node's parent and the + * slot that contains it. 
+ */ + trim = &trie->root; + trim2 = trim; + parent = NULL; + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + parent = node; + trim2 = trim; + next_bit = extract_bit(key->data, node->prefixlen); + trim = &node->child[next_bit]; + } + + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + ret = -ENOENT; + goto out; + } + + trie->n_entries--; + + /* If the node we are removing has two children, simply mark it + * as intermediate and we are done. + */ + if (rcu_access_pointer(node->child[0]) && + rcu_access_pointer(node->child[1])) { + node->flags |= LPM_TREE_NODE_FLAG_IM; + goto out; + } + + /* If the parent of the node we are about to delete is an intermediate + * node, and the deleted node doesn't have any children, we can delete + * the intermediate parent as well and promote its other child + * up the tree. Doing this maintains the invariant that all + * intermediate nodes have exactly 2 children and that there are no + * unnecessary intermediate nodes in the tree. + */ + if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) && + !node->child[0] && !node->child[1]) { + if (node == rcu_access_pointer(parent->child[0])) + rcu_assign_pointer( + *trim2, rcu_access_pointer(parent->child[1])); + else + rcu_assign_pointer( + *trim2, rcu_access_pointer(parent->child[0])); + kfree_rcu(parent, rcu); + kfree_rcu(node, rcu); + goto out; + } + + /* The node we are removing has either zero or one child. If there + * is a child, move it into the removed node's slot then delete + * the node. Otherwise just clear the slot and delete the node. 
+ */ + if (node->child[0]) + rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0])); + else if (node->child[1]) + rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); + else + RCU_INIT_POINTER(*trim, NULL); + kfree_rcu(node, rcu); + +out: + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; } #define LPM_DATA_SIZE_MAX 256 @@ -406,7 +495,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key) #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) -#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) +#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) static struct bpf_map *trie_alloc(union bpf_attr *attr) { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c new file mode 100644 index 000000000000..2816feb38be1 --- /dev/null +++ b/kernel/bpf/offload.c @@ -0,0 +1,194 @@ +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/bug.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/printk.h> +#include <linux/rtnetlink.h> + +/* protected by RTNL */ +static LIST_HEAD(bpf_prog_offload_devs); + +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_dev_offload *offload; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->prog_flags) + return -EINVAL; + + offload = kzalloc(sizeof(*offload), GFP_USER); + if (!offload) + return -ENOMEM; + + offload->prog = prog; + init_waitqueue_head(&offload->verifier_done); + + rtnl_lock(); + offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex); + if (!offload->netdev) { + rtnl_unlock(); + kfree(offload); + return -EINVAL; + } + + prog->aux->offload = offload; + list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + rtnl_unlock(); + + return 0; +} + +static int __bpf_offload_ndo(struct bpf_prog *prog, enum 
bpf_netdev_command cmd, + struct netdev_bpf *data) +{ + struct net_device *netdev = prog->aux->offload->netdev; + + ASSERT_RTNL(); + + if (!netdev) + return -ENODEV; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + data->command = cmd; + + return netdev->netdev_ops->ndo_bpf(netdev, data); +} + +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + struct netdev_bpf data = {}; + int err; + + data.verifier.prog = env->prog; + + rtnl_lock(); + err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); + if (err) + goto exit_unlock; + + env->dev_ops = data.verifier.ops; + + env->prog->aux->offload->dev_state = true; + env->prog->aux->offload->verifier_running = true; +exit_unlock: + rtnl_unlock(); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + + data.offload.prog = prog; + + if (offload->verifier_running) + wait_event(offload->verifier_done, !offload->verifier_running); + + if (offload->dev_state) + WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + + offload->dev_state = false; + list_del_init(&offload->offloads); + offload->netdev = NULL; +} + +void bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + __bpf_prog_offload_destroy(prog); + rtnl_unlock(); + + kfree(offload); +} + +static int bpf_prog_offload_translate(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + int ret; + + data.offload.prog = prog; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); + rtnl_unlock(); + + return ret; +} + +static unsigned int bpf_prog_warn_on_exec(const void *ctx, + const struct bpf_insn *insn) 
+{ + WARN(1, "attempt to execute device eBPF program on the host!"); + return 0; +} + +int bpf_prog_offload_compile(struct bpf_prog *prog) +{ + prog->bpf_func = bpf_prog_warn_on_exec; + + return bpf_prog_offload_translate(prog); +} + +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + u32 ifindex; + + rtnl_lock(); + ifindex = offload->netdev ? offload->netdev->ifindex : 0; + rtnl_unlock(); + + return ifindex; +} + +const struct bpf_prog_ops bpf_offload_prog_ops = { +}; + +static int bpf_offload_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dev_offload *offload, *tmp; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_UNREGISTER: + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, + offloads) { + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block bpf_offload_notifier = { + .notifier_call = bpf_offload_notification, +}; + +static int __init bpf_offload_init(void) +{ + register_netdevice_notifier(&bpf_offload_notifier); + return 0; +} + +subsys_initcall(bpf_offload_init); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 5c51d1985b51..673fa6fe2d73 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; + unsigned long flags; int orig_cpu, cpu; + local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock(&head->lock); + 
raw_spin_unlock_irqrestore(&head->lock, flags); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) + if (cpu == orig_cpu) { + local_irq_restore(flags); return NULL; + } } } diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index dbd7b322a86b..5ee2e41893d9 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -41,6 +41,9 @@ #include <net/strparser.h> #include <net/tcp.h> +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_stab { struct bpf_map map; struct sock **sock_map; @@ -122,7 +125,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.map = NULL; skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); + bpf_compute_data_pointers(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -385,7 +388,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. 
*/ skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); @@ -508,7 +511,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 135be433e9a0..a15bc636cc98 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,9 @@ #include <linux/perf_event.h> #include "percpu_freelist.h" +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; @@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_flags & ~BPF_F_NUMA_NODE) + if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 25d074920a00..09badc37e864 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@ #include <linux/version.h> #include <linux/kernel.h> #include <linux/idr.h> +#include <linux/cred.h> +#include <linux/timekeeping.h> +#include <linux/ctype.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -31,6 +34,8 @@ #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) + DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); @@ -207,6 +212,7 @@ 
static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); bpf_map_uncharge_memlock(map); + security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -291,17 +297,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) } #endif -static const struct file_operations bpf_map_fops = { +static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, + loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_READ. + */ + return -EINVAL; +} + +static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, + size_t siz, loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_WRITE. + */ + return -EINVAL; +} + +const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif .release = bpf_map_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; -int bpf_map_new_fd(struct bpf_map *map) +int bpf_map_new_fd(struct bpf_map *map, int flags) { + int ret; + + ret = security_bpf_map(map, OPEN_FMODE(flags)); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, - O_RDWR | O_CLOEXEC); + flags | O_CLOEXEC); +} + +int bpf_get_file_flag(int flags) +{ + if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) + return -EINVAL; + if (flags & BPF_F_RDONLY) + return O_RDONLY; + if (flags & BPF_F_WRONLY) + return O_WRONLY; + return O_RDWR; } /* helper macro to check that unused fields 'union bpf_attr' are zero */ @@ -312,18 +355,46 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD numa_node +/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. + * Return 0 on success and < 0 on error. 
+ */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ + const char *end = src + BPF_OBJ_NAME_LEN; + + memset(dst, 0, BPF_OBJ_NAME_LEN); + + /* Copy all isalnum() and '_' char */ + while (src < end && *src) { + if (!isalnum(*src) && *src != '_') + return -EINVAL; + *dst++ = *src++; + } + + /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + if (src == end) + return -EINVAL; + + return 0; +} + +#define BPF_MAP_CREATE_LAST_FIELD map_name /* called via syscall */ static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map *map; + int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) + return f_flags; + if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) @@ -334,18 +405,26 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = bpf_obj_name_cpy(map->name, attr->map_name); + if (err) + goto free_map_nouncharge; + atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - err = bpf_map_charge_memlock(map); + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; + err = bpf_map_charge_memlock(map); + if (err) + goto free_map_sec; + err = bpf_map_alloc_id(map); if (err) goto free_map; - err = bpf_map_new_fd(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. 
* bpf_map_put() is needed because the above @@ -362,6 +441,8 @@ static int map_create(union bpf_attr *attr) free_map: bpf_map_uncharge_memlock(map); +free_map_sec: + security_bpf_map_free(map); free_map_nouncharge: map->ops->map_free(map); return err; @@ -460,6 +541,11 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -540,6 +626,11 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -562,6 +653,12 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* Need to create a kthread, thus must support schedule */ + if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + err = map->ops->map_update_elem(map, key, value, attr->flags); + goto out; + } + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from * inside bpf map update or delete otherwise deadlocks are possible */ @@ -592,7 +689,7 @@ static int map_update_elem(union bpf_attr *attr) } __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_update_elem(map, ufd, key, value); free_value: @@ -623,6 +720,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -666,6 +768,11 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + if (ukey) { key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { @@ 
-703,9 +810,9 @@ err_put: return err; } -static const struct bpf_verifier_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _ops) \ - [_id] = &_ops, +static const struct bpf_prog_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE @@ -717,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; - prog->aux->ops = bpf_prog_types[type]; + if (!bpf_prog_is_dev_bound(prog->aux)) + prog->aux->ops = bpf_prog_types[type]; + else + prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; return 0; } @@ -820,6 +930,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); + security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -867,15 +978,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) } #endif -static const struct file_operations bpf_prog_fops = { +const struct file_operations bpf_prog_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_prog_show_fdinfo, #endif .release = bpf_prog_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; int bpf_prog_new_fd(struct bpf_prog *prog) { + int ret; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); } @@ -938,7 +1057,22 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) +static bool bpf_prog_can_attach(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, + struct net_device *netdev) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + if (prog->type != *attach_type) + return false; + if (offload && offload->netdev != netdev) + return false; 
+ + return true; +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, + struct net_device *netdev) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -946,7 +1080,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (type && prog->type != *type) { + if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -959,12 +1093,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL); + return __bpf_prog_get(ufd, NULL, NULL); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -972,8 +1106,19 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) } EXPORT_SYMBOL_GPL(bpf_prog_get_type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev) +{ + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; +} +EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_flags +#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex static int bpf_prog_load(union bpf_attr *attr) { @@ -1015,10 +1160,14 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; - err = bpf_prog_charge_memlock(prog); + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_sec; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -1032,11 +1181,22 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 
1 : 0; + if (attr->prog_target_ifindex) { + err = bpf_prog_offload_init(prog, attr); + if (err) + goto free_prog; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) goto free_prog; + prog->aux->load_time = ktime_get_boot_ns(); + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); + if (err) + goto free_prog; + /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) @@ -1071,16 +1231,18 @@ free_used_maps: free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); +free_prog_sec: + security_bpf_prog_free(prog->aux); free_prog_nouncharge: bpf_prog_free(prog); return err; } -#define BPF_OBJ_LAST_FIELD bpf_fd +#define BPF_OBJ_LAST_FIELD file_flags static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ)) + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) return -EINVAL; return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); @@ -1088,10 +1250,12 @@ static int bpf_obj_pin(const union bpf_attr *attr) static int bpf_obj_get(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || + attr->file_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; - return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), + attr->file_flags); } #ifdef CONFIG_CGROUP_BPF @@ -1132,6 +1296,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return 0; } +#define BPF_F_ATTACH_MASK \ + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; @@ -1145,7 +1312,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + if (attr->attach_flags & ~BPF_F_ATTACH_MASK) return -EINVAL; switch (attr->attach_type) { @@ -1159,6 +1326,9 @@ static int bpf_prog_attach(const 
union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, true); @@ -1176,8 +1346,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, - attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); if (ret) bpf_prog_put(prog); cgroup_put(cgrp); @@ -1189,6 +1359,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; + struct bpf_prog *prog; struct cgroup *cgrp; int ret; @@ -1201,26 +1373,71 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; case BPF_CGROUP_SOCK_OPS: - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); - cgroup_put(cgrp); + ptype = BPF_PROG_TYPE_SOCK_OPS; + break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ret = sockmap_get_from_fd(attr, false); - break; + return sockmap_get_from_fd(attr, false); default: return -EINVAL; } + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + cgroup_put(cgrp); return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int 
bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_DEVICE: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1305,20 +1522,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } -#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags static int bpf_map_get_fd_by_id(const union bpf_attr *attr) { struct bpf_map *map; u32 id = attr->map_id; + int f_flags; int fd; - if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || + attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + f_flags = bpf_get_file_flag(attr->open_flags); + if (f_flags < 0) + return f_flags; + spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) @@ -1330,7 +1553,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - fd = bpf_map_new_fd(map); + fd = bpf_map_new_fd(map, f_flags); if (fd < 0) bpf_map_put(map); @@ -1358,8 +1581,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; + info.load_time = prog->aux->load_time; + info.created_by_uid = from_kuid_munged(current_user_ns(), + prog->aux->user->uid); memcpy(info.tag, prog->tag, sizeof(prog->tag)); + memcpy(info.name, 
prog->aux->name, sizeof(prog->aux->name)); + + ulen = info.nr_map_ids; + info.nr_map_ids = prog->aux->used_map_cnt; + ulen = min_t(u32, info.nr_map_ids, ulen); + if (ulen) { + u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); + u32 i; + + for (i = 0; i < ulen; i++) + if (put_user(prog->aux->used_maps[i]->id, + &user_map_ids[i])) + return -EFAULT; + } if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; @@ -1385,6 +1625,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + info.status |= BPF_PROG_STATUS_DEV_BOUND; + info.ifindex = bpf_prog_offload_ifindex(prog); + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -1413,6 +1658,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + memcpy(info.name, map->name, sizeof(map->name)); if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -1467,6 +1713,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; + err = security_bpf(cmd, &attr, size); + if (err < 0) + return err; + switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr); @@ -1499,6 +1749,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; + case BPF_PROG_QUERY: + err = bpf_prog_query(&attr, uattr); + break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c48ca2a34b5e..dd54d20ace2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,17 @@ #include <linux/vmalloc.h> #include <linux/stringify.h> +#include "disasm.h" + +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define 
BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -153,28 +164,42 @@ struct bpf_call_arg_meta { int access_size; }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static u32 log_level, log_size, log_len; -static char *log_buf; - static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) 
{ + struct bpf_verifer_log *log = &env->log; + unsigned int n; va_list args; - if (log_level == 0 || log_len >= log_size - 1) + if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); va_end(args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; +} + +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; } /* string representation of 'enum bpf_reg_type' */ @@ -187,26 +212,12 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; -#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) -static const char * const func_id_str[] = { - __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) -}; -#undef __BPF_FUNC_STR_FN - -static const char *func_id_name(int id) -{ - BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - - if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) - return func_id_str[id]; - else - return "unknown"; -} - -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *state) { struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -217,21 +228,21 @@ static void print_verifier_state(struct bpf_verifier_state *state) t = reg->type; if (t == NOT_INIT) continue; - verbose(" R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && 
tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose("%lld", reg->var_off.value + reg->off); + verbose(env, "%lld", reg->var_off.value + reg->off); } else { - verbose("(id=%d", reg->id); + verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) - verbose(",r=%d", reg->range); + verbose(env, ",off=%d", reg->off); + if (type_is_pkt_pointer(t)) + verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(",ks=%d,vs=%d", + verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -239,243 +250,174 @@ static void print_verifier_state(struct bpf_verifier_state *state) * could be a pointer whose offset is too big * for reg->off */ - verbose(",imm=%llx", reg->var_off.value); + verbose(env, ",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", + verbose(env, ",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", + verbose(env, ",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(",umin_value=%llu", + verbose(env, ",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", + verbose(env, ",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + verbose(env, ",var_off=%s", tn_buf); } } - verbose(")"); + verbose(env, ")"); } } - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); + for (i = 0; i < 
state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] == STACK_SPILL) + verbose(env, " fp%d=%s", + -MAX_BPF_STACK + i * BPF_REG_SIZE, + reg_type_str[state->stack[i].spilled_ptr.type]); } - verbose("\n"); + verbose(env, "\n"); } -static const char *const bpf_class_string[] = { - [BPF_LD] = "ld", - [BPF_LDX] = "ldx", - [BPF_ST] = "st", - [BPF_STX] = "stx", - [BPF_ALU] = "alu", - [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", - [BPF_ALU64] = "alu64", -}; - -static const char *const bpf_alu_string[16] = { - [BPF_ADD >> 4] = "+=", - [BPF_SUB >> 4] = "-=", - [BPF_MUL >> 4] = "*=", - [BPF_DIV >> 4] = "/=", - [BPF_OR >> 4] = "|=", - [BPF_AND >> 4] = "&=", - [BPF_LSH >> 4] = "<<=", - [BPF_RSH >> 4] = ">>=", - [BPF_NEG >> 4] = "neg", - [BPF_MOD >> 4] = "%=", - [BPF_XOR >> 4] = "^=", - [BPF_MOV >> 4] = "=", - [BPF_ARSH >> 4] = "s>>=", - [BPF_END >> 4] = "endian", -}; - -static const char *const bpf_ldst_string[] = { - [BPF_W >> 3] = "u32", - [BPF_H >> 3] = "u16", - [BPF_B >> 3] = "u8", - [BPF_DW >> 3] = "u64", -}; - -static const char *const bpf_jmp_string[16] = { - [BPF_JA >> 4] = "jmp", - [BPF_JEQ >> 4] = "==", - [BPF_JGT >> 4] = ">", - [BPF_JLT >> 4] = "<", - [BPF_JGE >> 4] = ">=", - [BPF_JLE >> 4] = "<=", - [BPF_JSET >> 4] = "&", - [BPF_JNE >> 4] = "!=", - [BPF_JSGT >> 4] = "s>", - [BPF_JSLT >> 4] = "s<", - [BPF_JSGE >> 4] = "s>=", - [BPF_JSLE >> 4] = "s<=", - [BPF_CALL >> 4] = "call", - [BPF_EXIT >> 4] = "exit", -}; +static int copy_stack_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) +{ + if (!src->stack) + return 0; + if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { + /* internal bug, make state invalid to reject the program */ + memset(dst, 0, sizeof(*dst)); + return -EFAULT; + } + memcpy(dst->stack, src->stack, + sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); + return 0; +} -static void print_bpf_insn(const struct bpf_verifier_env *env, - const struct bpf_insn *insn) +/* do_check() 
starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, + bool copy_old) { - u8 class = BPF_CLASS(insn->code); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_SRC(insn->code) == BPF_X) - verbose("(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->src_reg); - else - verbose("(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->imm); - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_MEM) - verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, insn->off, - insn->src_reg); - else - verbose("BUG_%02x\n", insn->code); - } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_st_%02x\n", insn->code); - return; - } - verbose("(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); - } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_ldx_%02x\n", insn->code); - return; + u32 old_size = state->allocated_stack; + struct bpf_stack_state *new_stack; + int slot = size / BPF_REG_SIZE; 
+ + if (size <= old_size || !size) { + if (copy_old) + return 0; + state->allocated_stack = slot * BPF_REG_SIZE; + if (!size && old_size) { + kfree(state->stack); + state->stack = NULL; } - verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", - insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->off); - } else if (class == BPF_LD) { - if (BPF_MODE(insn->code) == BPF_ABS) { - verbose("(%02x) r0 = *(%s *)skb[%d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM && - BPF_SIZE(insn->code) == BPF_DW) { - /* At this point, we already made sure that the second - * part of the ldimm64 insn is accessible. - */ - u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + return 0; + } + new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!new_stack) + return -ENOMEM; + if (copy_old) { + if (state->stack) + memcpy(new_stack, state->stack, + sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); + memset(new_stack + old_size / BPF_REG_SIZE, 0, + sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); + } + state->allocated_stack = slot * BPF_REG_SIZE; + kfree(state->stack); + state->stack = new_stack; + return 0; +} - if (map_ptr && !env->allow_ptr_leaks) - imm = 0; +static void free_verifier_state(struct bpf_verifier_state *state, + bool free_self) +{ + kfree(state->stack); + if (free_self) + kfree(state); +} - verbose("(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); - } else { - verbose("BUG_ld_%02x\n", insn->code); - return; - } - } else if (class == BPF_JMP) { - u8 opcode = BPF_OP(insn->code); +/* copy verifier state from src to dst growing dst stack space + * when necessary to 
accommodate larger src stack + */ +static int copy_verifier_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) +{ + int err; - if (opcode == BPF_CALL) { - verbose("(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); - } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose("(%02x) goto pc%+d\n", - insn->code, insn->off); - } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose("(%02x) exit\n", insn->code); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->src_reg, insn->off); - } else { - verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); - } - } else { - verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); - } + err = realloc_verifier_state(dst, src->allocated_stack, false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + return copy_stack_state(dst, src); } -static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, + int *insn_idx) { - struct bpf_verifier_stack_elem *elem; - int insn_idx; + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_stack_elem *elem, *head = env->head; + int err; if (env->head == NULL) - return -1; + return -ENOENT; - memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); - insn_idx = env->head->insn_idx; + if (cur) { + err = copy_verifier_state(cur, &head->st); + if (err) + return err; + } + if (insn_idx) + *insn_idx = head->insn_idx; if (prev_insn_idx) - *prev_insn_idx = env->head->prev_insn_idx; - elem = env->head->next; - kfree(env->head); + *prev_insn_idx = head->prev_insn_idx; + elem = head->next; + free_verifier_state(&head->st, false); + kfree(head); env->head = elem; env->stack_size--; - return 
insn_idx; + return 0; } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { + struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; + int err; - elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; - memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; env->head = elem; env->stack_size++; + err = copy_verifier_state(&elem->st, cur); + if (err) + goto err; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose("BPF program is too complex\n"); + verbose(env, "BPF program is too complex\n"); goto err; } return &elem->st; err: /* pop all elements and return */ - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); return NULL; } @@ -507,10 +449,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_known_zero(regs, %u)\n", regno); + verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -519,6 +462,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. 
*/ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -595,10 +563,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_unknown(regs, %u)\n", regno); + verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -613,10 +582,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_not_init(regs, %u)\n", regno); + verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -625,22 +595,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(regs, i); + mark_reg_not_init(env, regs, i); regs[i].live = 
REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(regs, BPF_REG_FP); + mark_reg_known_zero(env, regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(regs, BPF_REG_1); + mark_reg_known_zero(env, regs, BPF_REG_1); } enum reg_arg_type { @@ -671,29 +642,29 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { - verbose("R%d is invalid\n", regno); + verbose(env, "R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); + verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(&env->cur_state, regno); + mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose("frame pointer is read only\n"); + verbose(env, "frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(regs, regno); + mark_reg_unknown(env, regs, regno); } return 0; } @@ -706,6 +677,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -717,35 +689,48 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_state *state, int off, +static int check_stack_write(struct bpf_verifier_env *env, + struct bpf_verifier_state 
*state, int off, int size, int value_regno) { - int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + + err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); + if (err) + return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ + if (!env->allow_ptr_leaks && + state->stack[spi].slot_type[0] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose(env, "attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } if (value_regno >= 0 && is_spillable_regtype(state->regs[value_regno].type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } /* save register state */ - state->spilled_regs[spi] = state->regs[value_regno]; - state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; + state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + state->stack[spi].slot_type[i] = STACK_SPILL; } else { /* regular write of data into stack */ - state->spilled_regs[spi] = (struct bpf_reg_state) {}; + state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = + STACK_MISC; } return 0; } @@ -756,66 +741,72 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... 
then we depend on parent's value */ - parent->spilled_regs[slot].live |= REG_LIVE_READ; + parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; } } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { - u8 *slot_type; - int i, spi; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + u8 *stype; - slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + if (state->allocated_stack <= slot) { + verbose(env, "invalid read from stack off %d+0 size %d\n", + off, size); + return -EACCES; + } + stype = state->stack[spi].slot_type; - if (slot_type[0] == STACK_SPILL) { + if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { - if (slot_type[i] != STACK_SPILL) { - verbose("corrupted spill memory\n"); + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { + verbose(env, "corrupted spill memory\n"); return -EACCES; } } - spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; - if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->spilled_regs[spi]; + state->regs[value_regno] = state->stack[spi].spilled_ptr; mark_stack_slot_read(state, spi); } return 0; } else { for (i = 0; i < size; i++) { - if (slot_type[i] != STACK_MISC) { - verbose("invalid read from stack off %d+%d size %d\n", + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { + verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); return 0; } } /* check read/write into map element returned by bpf_map_lookup_elem() */ static 
int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) + int size, bool zero_size_allowed) { - struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; - if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || + off + size > map->value_size) { + verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -824,9 +815,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size, bool zero_size_allowed) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -834,8 +825,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (log_level) - print_verifier_state(state); + if (env->log.level) + print_verifier_state(env, state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our @@ -843,13 +834,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * will have a set floor within our range. 
*/ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size); + err = __check_map_access(env, regno, reg->smin_value + off, size, + zero_size_allowed); if (err) { - verbose("R%d min value is outside of the array range\n", regno); + verbose(env, "R%d min value is outside of the array range\n", + regno); return err; } @@ -858,13 +851,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size); + err = __check_map_access(env, regno, reg->umax_value + off, size, + zero_size_allowed); if (err) - verbose("R%d max value is outside of the array range\n", regno); + verbose(env, "R%d max value is outside of the array range\n", + regno); return err; } @@ -897,13 +892,14 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; - if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || + (u64)off + size > reg->range) { + verbose(env, "invalid access to packet, 
off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -911,9 +907,9 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, } static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) + int size, bool zero_size_allowed) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; int err; @@ -926,13 +922,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size); + err = __check_packet_access(env, regno, off, size, zero_size_allowed); if (err) { - verbose("R%d offset is outside of the packet\n", regno); + verbose(env, "R%d offset is outside of the packet\n", regno); return err; } return err; @@ -946,12 +942,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - /* for analyzer ctx accesses are already validated and converted */ - if (env->analyzer_ops) - return 0; - - if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -959,16 +951,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. 
*/ - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; return 0; } - verbose("invalid bpf_context access off=%d size=%d\n", off, size); + verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -983,10 +975,11 @@ static bool __is_pointer_value(bool allow_ptr_leaks, static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); + return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1011,7 +1004,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + verbose(env, + "misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1019,7 +1013,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1034,7 +1029,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned %saccess off %s+%d+%d size %d\n", + verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, 
tn_buf, reg->off, off, size); return -EACCES; } @@ -1051,8 +1046,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ - return check_pkt_ptr_alignment(reg, off, size, strict); + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ + return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1065,7 +1063,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, default: break; } - return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); + return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, + strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -1078,8 +1077,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = &env->cur_state; - struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1097,27 +1097,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into map\n", value_regno); + verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - err = check_map_access(env, regno, off, size); + err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type 
= SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into ctx\n", value_regno); + verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } /* ctx accesses must be at a fixed offset, so that we can * determine what type of data were returned. */ if (reg->off) { - verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", + verbose(env, + "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", regno, reg->off, off - reg->off); return -EACCES; } @@ -1125,24 +1126,26 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s off=%d size=%d", + verbose(env, + "variable ctx access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. 
*/ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); - state->regs[value_regno].id = 0; - state->regs[value_regno].off = 0; - state->regs[value_regno].range = 0; - state->regs[value_regno].type = reg_type; + mark_reg_known_zero(env, regs, + value_regno); + regs[value_regno].id = 0; + regs[value_regno].off = 0; + regs[value_regno].range = 0; + regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -1154,55 +1157,52 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); + verbose(env, "invalid stack off=%d size=%d\n", off, + size); return -EACCES; } if (env->prog->aux->stack_depth < -off) env->prog->aux->stack_depth = -off; - if (t == BPF_WRITE) { - if (!env->allow_ptr_leaks && - state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && - size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); - return -EACCES; - } - err = check_stack_write(state, off, size, value_regno); - } else { - err = check_stack_read(state, off, size, value_regno); - } - } else if (reg->type == PTR_TO_PACKET) { + if (t == BPF_WRITE) + err = check_stack_write(env, state, off, size, + value_regno); + else + err = check_stack_read(env, state, off, size, + value_regno); + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose("cannot write into packet\n"); + verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, 
value_regno)) { - verbose("R%d leaks addr into packet\n", value_regno); + verbose(env, "R%d leaks addr into packet\n", + value_regno); return -EACCES; } - err = check_packet_access(env, regno, off, size); + err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else { - verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[reg->type]); + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str[reg->type]); return -EACCES; } if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && - state->regs[value_regno].type == SCALAR_VALUE) { + regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - state->regs[value_regno].var_off = tnum_cast( - state->regs[value_regno].var_off, size); - __update_reg_bounds(&state->regs[value_regno]); + regs[value_regno].var_off = + tnum_cast(regs[value_regno].var_off, size); + __update_reg_bounds(&regs[value_regno]); } return err; } @@ -1213,7 +1213,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose("BPF_XADD uses reserved fields\n"); + verbose(env, "BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -1228,7 +1228,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d leaks addr into mem\n", insn->src_reg); + verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } @@ -1259,9 +1259,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs =
state->regs; - int off, i; + int off, i, slot, spi; if (regs[regno].type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1269,7 +1269,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, register_is_null(regs[regno])) return 0; - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; @@ -1280,13 +1280,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); - verbose("invalid variable stack read R%d var_off=%s\n", + verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size <= 0) { - verbose("invalid stack type R%d off=%d access_size=%d\n", + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } @@ -1301,8 +1301,12 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { - verbose("invalid indirect read from stack off %d+%d size %d\n", + slot = -(off + i) - 1; + spi = slot / BPF_REG_SIZE; + if (state->allocated_stack <= slot || + state->stack[spi].slot_type[slot % BPF_REG_SIZE] != + STACK_MISC) { + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } @@ -1314,13 +1318,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg =
&regs[regno]; switch (reg->type) { case PTR_TO_PACKET: - return check_packet_access(env, regno, reg->off, access_size); + case PTR_TO_PACKET_META: + return check_packet_access(env, regno, reg->off, access_size, + zero_size_allowed); case PTR_TO_MAP_VALUE: - return check_map_access(env, regno, reg->off, access_size); + return check_map_access(env, regno, reg->off, access_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -1331,7 +1338,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1344,22 +1351,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose("R%d leaks addr into helper function\n", regno); + verbose(env, "R%d leaks addr into helper function\n", + regno); return -EACCES; } return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose("helper access to the packet is not allowed\n"); + verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1383,12 +1392,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && +
else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { - verbose("unsupported arg_type %d\n", arg_type); + verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -1406,12 +1416,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->key\n"); + verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, - meta->map_ptr->key_size); + meta->map_ptr->key_size, + false); else err = check_stack_boundary(env, regno, meta->map_ptr->key_size, @@ -1422,12 +1433,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->value\n"); + verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, - meta->map_ptr->value_size); + meta->map_ptr->value_size, + false); else err = check_stack_boundary(env, regno, meta->map_ptr->value_size, @@ -1442,7 +1454,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_SIZE cannot be first argument\n"); + verbose(env, + "ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1459,7 +1472,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return 
-EACCES; } @@ -1473,7 +1486,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } @@ -1484,12 +1497,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; err_type: - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int check_map_func_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, int func_id) { if (!map) return 0; @@ -1502,7 +1516,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && - func_id != BPF_FUNC_perf_event_output) + func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_perf_event_read_value) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -1522,6 +1537,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (func_id != BPF_FUNC_redirect_map) goto error; break; + /* Restrict bpf side of cpumap, open when use-cases appear */ + case BPF_MAP_TYPE_CPUMAP: + if (func_id != BPF_FUNC_redirect_map) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) @@ -1545,6 +1565,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: + case BPF_FUNC_perf_event_read_value: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -1558,7 +1579,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case 
BPF_FUNC_redirect_map: - if (map->map_type != BPF_MAP_TYPE_DEVMAP) + if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_CPUMAP) goto error; break; case BPF_FUNC_sk_redirect_map: @@ -1575,7 +1597,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %s#%d\n", + verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1598,57 +1620,55 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) - mark_reg_unknown(regs, i); + if (reg_is_pkt_pointer_any(&regs[i])) + mark_reg_unknown(env, regs, i); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + reg = &state->stack[i].spilled_ptr; + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { - struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct bpf_reg_state *regs
= state->regs; + struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; int i, err; /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } - if (env->prog->aux->ops->get_func_proto) - fn = env->prog->aux->ops->get_func_proto(func_id); + if (env->ops->get_func_proto) + fn = env->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "unknown func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose("cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1662,7 +1682,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } @@ -1693,16 +1713,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return err; } + regs = cur_regs(env); /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1710,14 +1731,15 
@@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(regs, BPF_REG_0); + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose("kernel subsystem misconfigured verifier\n"); + verbose(env, + "kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -1728,12 +1750,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) else if (insn_aux->map_ptr != meta.map_ptr) insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose("unknown return type %d of func %s#%d\n", + verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - err = check_map_func_compatibility(meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; @@ -1780,7 +1802,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1792,39 +1814,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = &regs[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad sbounds\n"); + print_verifier_state(env, env->cur_state); + verbose(env, + "verifier internal error: known but
bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad ubounds\n"); + print_verifier_state(env, env->cur_state); + verbose(env, + "verifier internal error: known but bad ubounds\n"); return -EINVAL; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (!env->allow_ptr_leaks) - verbose("R%d 32-bit pointer arithmetic prohibited\n", + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", dst); return -EACCES; } @@ -1879,7 +1904,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->range = 0; @@ -1889,7 +1914,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (dst_reg == off_reg) { /* scalar -= pointer. 
Creates an unknown scalar */ if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract pointer from scalar\n", + verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -1899,7 +1924,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, */ if (ptr_reg->type == PTR_TO_STACK) { if (!env->allow_ptr_leaks) - verbose("R%d subtraction from stack pointer prohibited\n", + verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -1939,7 +1964,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -1954,13 +1979,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * ptr &= ~3 which would reduce min_value by 3.) */ if (!env->allow_ptr_leaks) - verbose("R%d bitwise operator %s on pointer prohibited\n", + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic with %s operator prohibited\n", + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -1976,7 +2001,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; s64 smin_val, smax_val; @@ -2126,7 +2151,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. 
This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -2154,7 +2179,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ @@ -2182,7 +2207,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -2197,7 +2222,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2214,12 +2239,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. 
*/ if (!env->allow_ptr_leaks) { - verbose("R%d pointer %s pointer prohibited\n", + verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); return 0; } else { /* scalar += pointer @@ -2271,13 +2296,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env, env->cur_state); + verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: no src_reg\n"); + print_verifier_state(env, env->cur_state); + verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2286,7 +2311,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; @@ -2295,14 +2320,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose("BPF_NEG uses reserved fields\n"); + verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose("BPF_END uses reserved fields\n"); + verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -2313,7 +2338,7 @@ 
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", + verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -2327,7 +2352,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -2337,7 +2362,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -2357,11 +2382,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d partial copy of pointer\n", + verbose(env, + "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); @@ -2376,14 +2402,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose("invalid BPF_ALU opcode %x\n", opcode); + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... 
*/ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -2392,7 +2418,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -2404,7 +2430,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose("div by zero\n"); + verbose(env, "div by zero\n"); return -EINVAL; } @@ -2413,7 +2439,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose("invalid shift %d\n", insn->imm); + verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -2431,6 +2457,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) static void find_good_pkt_pointers(struct bpf_verifier_state *state, struct bpf_reg_state *dst_reg, + enum bpf_reg_type type, bool range_right_open) { struct bpf_reg_state *regs = state->regs, *reg; @@ -2501,15 +2528,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + reg = &state->stack[i].spilled_ptr; + if (reg->type == type && reg->id == dst_reg->id) reg->range = max(reg->range, new_range); } } @@ -2758,29 +2785,122 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } } +static bool try_match_pkt_pointers(const struct bpf_insn *insn, + struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg, + struct bpf_verifier_state *this_branch, + struct bpf_verifier_state *other_branch) +{ + if (BPF_SRC(insn->code) != BPF_X) + return false; + + switch (BPF_OP(insn->code)) { + case BPF_JGT: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ + find_good_pkt_pointers(this_branch, dst_reg, + dst_reg->type, false); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, 
PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end > pkt_data', pkt_data > pkt_meta' */ + find_good_pkt_pointers(other_branch, src_reg, + src_reg->type, true); + } else { + return false; + } + break; + case BPF_JLT: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ + find_good_pkt_pointers(other_branch, dst_reg, + dst_reg->type, true); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end < pkt_data', pkt_data > pkt_meta' */ + find_good_pkt_pointers(this_branch, src_reg, + src_reg->type, false); + } else { + return false; + } + break; + case BPF_JGE: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ + find_good_pkt_pointers(this_branch, dst_reg, + dst_reg->type, true); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ + find_good_pkt_pointers(other_branch, src_reg, + src_reg->type, false); + } else { + return false; + } + break; + case BPF_JLE: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ + find_good_pkt_pointers(other_branch, dst_reg, + dst_reg->type, false); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, 
PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ + find_good_pkt_pointers(this_branch, src_reg, + src_reg->type, true); + } else { + return false; + } + break; + default: + return false; + } + + return true; +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; if (opcode > BPF_JSLE) { - verbose("invalid BPF_JMP opcode %x\n", opcode); + verbose(env, "invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -2790,13 +2910,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer comparison prohibited\n", + verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -2871,52 +2991,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' > pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end > 
pkt_data' */ - find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' < pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end < pkt_data' */ - find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' >= pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end >= pkt_data' */ - find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' <= pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end <= pkt_data' */ - find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true); - } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + } else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg], + this_branch, other_branch) && + is_pointer_value(env, insn->dst_reg)) { + verbose(env, "R%d pointer comparison prohibited\n", + insn->dst_reg); return -EACCES; } - if (log_level) - print_verifier_state(this_branch); + if (env->log.level) + print_verifier_state(env, this_branch);
return 0; } @@ -2931,15 +3014,15 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose("invalid BPF_LD_IMM insn\n"); + verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose("BPF_LD_IMM64 uses reserved fields\n"); + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -2992,19 +3075,19 @@ static bool may_access_skb(enum bpf_prog_type type) */ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 mode = BPF_MODE(insn->code); int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); + verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -3014,7 +3097,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (regs[BPF_REG_6].type != PTR_TO_CTX) { - verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose(env, + "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -3027,7 +3111,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); 
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -3035,7 +3119,45 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); + return 0; +} + +static int check_return_code(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *reg; + struct tnum range = tnum_range(0, 1); + + switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + break; + default: + return 0; + } + + reg = cur_regs(env) + BPF_REG_0; + if (reg->type != SCALAR_VALUE) { + verbose(env, "At program exit the register R0 is not a known value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + + if (!tnum_in(range, reg->var_off)) { + verbose(env, "At program exit the register R0 "); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "has value %s", tn_buf); + } else { + verbose(env, "has unknown scalar value"); + } + verbose(env, " should have been 0 or 1\n"); + return -EINVAL; + } return 0; } @@ -3099,7 +3221,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose("jump out of range from insn %d to %d\n", t, w); + verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -3116,13 +3238,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose("back-edge from insn %d to %d\n", t, w); + verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose("insn state internal bug\n"); + verbose(env, 
"insn state internal bug\n"); return -EFAULT; } return 0; @@ -3216,7 +3338,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose("pop stack internal bug\n"); + verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -3225,7 +3347,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose("unreachable insn %d\n", i); + verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -3340,8 +3462,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are @@ -3379,6 +3502,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } +static bool stacksafe(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + struct idpair *idmap) +{ + int i, spi; + + /* if explored stack has more populated slots than current stack + * such stacks are not equivalent + */ + if (old->allocated_stack > cur->allocated_stack) + return false; + + /* walk slots of the explored stack and ignore any additional + * slots in the current stack, since explored(safe) state + * didn't use them + */ + for (i = 0; i < old->allocated_stack; i++) { + spi = i / BPF_REG_SIZE; + + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + 
return false; + if (i % BPF_REG_SIZE) + continue; + if (old->stack[spi].slot_type[0] != STACK_SPILL) + continue; + if (!regsafe(&old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, + idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} + * but current path has stored: + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + } + return true; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -3423,37 +3597,8 @@ static bool states_equal(struct bpf_verifier_env *env, goto out_free; } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (old->stack_slot_type[i] == STACK_INVALID) - continue; - if (old->stack_slot_type[i] != cur->stack_slot_type[i]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - goto out_free; - if (i % BPF_REG_SIZE) - continue; - if (old->stack_slot_type[i] != STACK_SPILL) - continue; - if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - idmap)) - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. - * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. 
- * return false to continue verification of this path - */ - goto out_free; - else - continue; - } + if (!stacksafe(old, cur, idmap)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -3489,17 +3634,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, } } /* ... and stack slots */ - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { - if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) continue; - if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - if (parent->spilled_regs[i].live & REG_LIVE_READ) + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; - if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) + if (writes && + (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) continue; - if (state->spilled_regs[i].live & REG_LIVE_READ) { - parent->spilled_regs[i].live |= REG_LIVE_READ; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { + parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; touched = true; } } @@ -3529,7 +3676,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - int i; + struct bpf_verifier_state *cur = env->cur_state; + int i, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3539,7 +3687,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) { + if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. 
@@ -3550,7 +3698,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, &env->cur_state); + propagate_liveness(&sl->state, cur); return 1; } sl = sl->next; @@ -3562,16 +3710,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; /* add new state to the head of linked list */ - memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + err = copy_verifier_state(&new_sl->state, cur); + if (err) { + free_verifier_state(&new_sl->state, false); + kfree(new_sl); + return err; + } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - env->cur_state.parent = &new_sl->state; + cur->parent = &new_sl->state; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -3579,33 +3732,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) 
*/ for (i = 0; i < BPF_REG_FP; i++) - env->cur_state.regs[i].live = REG_LIVE_NONE; - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) - if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) - env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; + cur->regs[i].live = REG_LIVE_NONE; + for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) + if (cur->stack[i].slot_type[0] == STACK_SPILL) + cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; return 0; } static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) - return 0; + if (env->dev_ops && env->dev_ops->insn_hook) + return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); + return 0; } static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + env->cur_state = state; + init_reg_state(env, state->regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3614,7 +3771,7 @@ static int do_check(struct bpf_verifier_env *env) int err; if (insn_idx >= insn_cnt) { - verbose("invalid insn idx %d insn_cnt %d\n", + verbose(env, "invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } @@ -3623,7 +3780,8 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Processed %d insn\n", + verbose(env, + "BPF program is too large. 
Processed %d insn\n", insn_processed); return -E2BIG; } @@ -3633,12 +3791,12 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (log_level) { + if (env->log.level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", + verbose(env, "\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else - verbose("%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", insn_idx); } goto process_bpf_exit; } @@ -3646,25 +3804,27 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (log_level > 1 || (log_level && do_print_state)) { - if (log_level > 1) - verbose("%d:", insn_idx); + if (env->log.level > 1 || (env->log.level && do_print_state)) { + if (env->log.level > 1) + verbose(env, "%d:", insn_idx); else - verbose("\nfrom %d to %d:", + verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + print_verifier_state(env, state); do_print_state = false; } - if (log_level) { - verbose("%d: ", insn_idx); - print_bpf_insn(env, insn); + if (env->log.level) { + verbose(env, "%d: ", insn_idx); + print_bpf_insn(verbose, env, insn, + env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); if (err) return err; + regs = cur_regs(env); if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -3714,7 +3874,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. 
*/ - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -3754,14 +3914,14 @@ static int do_check(struct bpf_verifier_env *env) } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX)) { - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose("BPF_ST uses reserved fields\n"); + verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -3784,7 +3944,7 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_CALL uses reserved fields\n"); + verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -3797,7 +3957,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_JA uses reserved fields\n"); + verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -3809,7 +3969,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_EXIT uses reserved fields\n"); + verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -3824,13 +3984,18 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose("R0 leaks addr as return value\n"); + verbose(env, "R0 leaks addr as return value\n"); return -EACCES; } + err = check_return_code(env); + if (err) + return err; process_bpf_exit: - insn_idx = pop_stack(env, &prev_insn_idx); - if (insn_idx < 0) { + err = pop_stack(env, &prev_insn_idx, &insn_idx); + if (err < 0) { + if (err != -ENOENT) + return 
err; break; } else { do_print_state = true; @@ -3856,19 +4021,19 @@ process_bpf_exit: insn_idx++; } else { - verbose("invalid BPF_LD mode\n"); + verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose("unknown insn class %d\n", class); + verbose(env, "unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } - verbose("processed %d insns, stack depth %d\n", - insn_processed, env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth %d\n", insn_processed, + env->prog->aux->stack_depth); return 0; } @@ -3880,7 +4045,8 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, struct bpf_prog *prog) { @@ -3891,12 +4057,12 @@ static int check_map_prog_compatibility(struct bpf_map *map, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose("perf_event programs can only use preallocated hash map\n"); + verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose("perf_event programs can only use preallocated inner hash map\n"); + verbose(env, "perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } @@ -3919,14 +4085,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose("BPF_LDX uses reserved fields\n"); + verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose("BPF_STX uses reserved fields\n"); + verbose(env, "BPF_STX uses reserved fields\n"); 
return -EINVAL; } @@ -3937,7 +4103,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose("invalid bpf_ld_imm64 insn\n"); + verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -3946,19 +4112,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) goto next_insn; if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose("unrecognized bpf_ld_imm64 insn\n"); + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose("fd %d is not pointing to valid bpf_map\n", + verbose(env, "fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(map, env->prog); + err = check_map_prog_compatibility(env, map, env->prog); if (err) { fdput(f); return err; @@ -4067,7 +4234,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4080,7 +4247,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4128,7 +4295,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose("bpf verifier narrow ctx access misconfigured\n"); + verbose(env, "bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } 
@@ -4147,7 +4314,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4229,7 +4396,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4268,12 +4435,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, + "kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -4297,6 +4465,7 @@ static void free_states(struct bpf_verifier_env *env) if (sl) while (sl != STATE_LIST_MARK) { sln = sl->next; + free_verifier_state(&sl->state, false); kfree(sl); sl = sln; } @@ -4307,16 +4476,21 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - char __user *log_ubuf = NULL; struct bpf_verifier_env *env; + struct bpf_verifer_log *log; int ret = -EINVAL; + /* no program is valid */ + if (ARRAY_SIZE(bpf_verifier_ops) == 0) + return -EINVAL; + /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -4324,6 
+4498,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->insn_aux_data) goto err_free_env; env->prog = *prog; + env->ops = bpf_verifier_ops[env->prog->type]; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -4332,29 +4507,27 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log_level = attr->log_level; - log_ubuf = (char __user *) (unsigned long) attr->log_buf; - log_size = attr->log_size; - log_len = 0; + log->level = attr->log_level; + log->ubuf = (char __user *) (unsigned long) attr->log_buf; + log->len_total = attr->log_size; ret = -EINVAL; - /* log_* values have to be sane */ - if (log_size < 128 || log_size > UINT_MAX >> 8 || - log_level == 0 || log_ubuf == NULL) - goto err_unlock; - - ret = -ENOMEM; - log_buf = vmalloc(log_size); - if (!log_buf) + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) goto err_unlock; - } else { - log_level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + if (env->prog->aux->offload) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto err_unlock; + } + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; @@ -4373,9 +4546,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret == 0) @@ -4385,17 +4562,11 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log_level && log_len >= log_size - 1) { - BUG_ON(log_len >= 
log_size); - /* verifier log exceeded user supplied buffer */ + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; - /* fall through to return what was recorded */ - } - - /* copy verifier log back to user space including trailing zero */ - if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + if (log->level && !log->ubuf) { ret = -EFAULT; - goto free_log_buf; + goto err_release_maps; } if (ret == 0 && env->used_map_cnt) { @@ -4406,7 +4577,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto free_log_buf; + goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -4419,9 +4590,7 @@ skip_full_check: convert_pseudo_ld_imm64(env); } -free_log_buf: - if (log_level) - vfree(log_buf); +err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. @@ -4435,58 +4604,3 @@ err_free_env: kfree(env); return ret; } - -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv) -{ - struct bpf_verifier_env *env; - int ret; - - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); - if (!env) - return -ENOMEM; - - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - prog->len); - ret = -ENOMEM; - if (!env->insn_aux_data) - goto err_free_env; - env->prog = prog; - env->analyzer_ops = ops; - env->analyzer_priv = priv; - - /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); - - log_level = 0; - - env->strict_alignment = false; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - env->strict_alignment = true; - - env->explored_states = kcalloc(env->prog->len, - sizeof(struct bpf_verifier_state_list *), - GFP_KERNEL); - ret = -ENOMEM; - if (!env->explored_states) - goto skip_full_check; - - ret = check_cfg(env); - if (ret < 0) - goto skip_full_check; - - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); 
- - ret = do_check(env); - -skip_full_check: - while (pop_stack(env, NULL) >= 0); - free_states(env); - - mutex_unlock(&bpf_verifier_lock); - vfree(env->insn_aux_data); -err_free_env: - kfree(env); - return ret; -} -EXPORT_SYMBOL_GPL(bpf_analyzer); diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ae448f7632cc..2be89a003185 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o namespace.o cgroup-v1.o +obj-y := cgroup.o stat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index bf54ade001be..b928b27050c6 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -201,6 +201,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); /* + * stat.c + */ +void cgroup_stat_flush(struct cgroup *cgrp); +int cgroup_stat_init(struct cgroup *cgrp); +void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_show_cputime(struct seq_file *seq); +void cgroup_stat_boot(void); + +/* * namespace.c */ extern const struct proc_ns_operations cgroupns_operations; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 44857278eb8a..0b1ffe147f24 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS +static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. 
*/ -struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -462,6 +464,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, } /** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist + * or is offline, %NULL is returned. + */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (!css || !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + +/** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) @@ -647,6 +671,14 @@ struct css_set init_css_set = { .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), + + /* + * The following field is re-initialized when this cset gets linked + * in cgroup_init(). However, let's initialize the field + * statically too so that the default cgroup can be accessed safely + * early during boot. 
+ */ + .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ @@ -1896,6 +1928,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) if (ret) goto destroy_root; + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + trace_cgroup_setup_root(root); /* @@ -3312,6 +3347,37 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) +{ + struct cgroup_subsys *ss = cgroup_subsys[ssid]; + struct cgroup_subsys_state *css; + int ret; + + if (!ss->css_extra_stat_show) + return 0; + + css = cgroup_tryget_css(cgrp, ss); + if (!css) + return 0; + + ret = ss->css_extra_stat_show(seq, css); + css_put(css); + return ret; +} + +static int cpu_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; + int ret = 0; + + cgroup_stat_show_cputime(seq); +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); +#endif + return ret; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4419,6 +4485,11 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cpu.stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_stat_show, + }, { } /* terminate */ }; @@ -4479,6 +4550,8 @@ static void css_free_work_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_exit(cgrp); kfree(cgrp); } else { /* @@ -4523,6 +4596,9 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ trace_cgroup_release(cgrp); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_flush(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; @@ -4706,6 +4782,12 @@ static struct cgroup *cgroup_create(struct cgroup 
*parent) if (ret) goto out_free_cgrp; + if (cgroup_on_dfl(parent)) { + ret = cgroup_stat_init(cgrp); + if (ret) + goto out_cancel_ref; + } + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. @@ -4713,7 +4795,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_cancel_ref; + goto out_stat_exit; } init_cgroup_housekeeping(cgrp); @@ -4721,6 +4803,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_idr_free; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4755,13 +4840,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - if (parent) - cgroup_bpf_inherit(cgrp, parent); - cgroup_propagate_control(cgrp); return cgrp; +out_idr_free: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_stat_exit: + if (cgroup_on_dfl(parent)) + cgroup_stat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5156,6 +5243,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + cgroup_stat_boot(); + /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. 
@@ -5744,15 +5833,103 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable) +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct cgroup *parent = cgroup_parent(cgrp); int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); + ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_detach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_query(cgrp, attr, uattr); mutex_unlock(&cgroup_mutex); return ret; } #endif /* CONFIG_CGROUP_BPF */ + +#ifdef CONFIG_SYSFS +static ssize_t show_delegatable_files(struct cftype *files, char *buf, + ssize_t size, const char *prefix) +{ + struct cftype *cft; + ssize_t ret = 0; + + for (cft = files; cft && cft->name[0] != '\0'; cft++) { + if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) + continue; + + if (prefix) + ret += snprintf(buf + ret, size - ret, "%s.", prefix); + + ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); + + if (unlikely(ret >= size)) { + WARN_ON(1); + break; + } + } + + return ret; +} + +static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct cgroup_subsys *ss; + int ssid; + ssize_t ret = 0; + + ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, + NULL); + + for_each_subsys(ss, ssid) + ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, + PAGE_SIZE - ret, + 
cgroup_subsys_name[ssid]); + + return ret; +} +static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); + +static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); +} +static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); + +static struct attribute *cgroup_sysfs_attrs[] = { + &cgroup_delegate_attr.attr, + &cgroup_features_attr.attr, + NULL, +}; + +static const struct attribute_group cgroup_sysfs_attr_group = { + .attrs = cgroup_sysfs_attrs, + .name = "cgroup", +}; + +static int __init cgroup_sysfs_init(void) +{ + return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); +} +subsys_initcall(cgroup_sysfs_init); +#endif /* CONFIG_SYSFS */ diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c new file mode 100644 index 000000000000..133b465691d6 --- /dev/null +++ b/kernel/cgroup/stat.c @@ -0,0 +1,334 @@ +#include "cgroup-internal.h" + +#include <linux/sched/cputime.h> + +static DEFINE_MUTEX(cgroup_stat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); + +static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->cpu_stat, cpu); +} + +/** + * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * @cgrp: target cgroup + * @cpu: cpu on which cpu_stat was updated + * + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching + * cpu_stat->updated_children list. See the comment on top of + * cgroup_cpu_stat definition for details. + */ +static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. 
+ * + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_cpu_stat(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (cstat->updated_next) + break; + + cstat->updated_next = pcstat->updated_children; + pcstat->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + +/** + * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree + * @pos: current position + * @root: root of the tree to traverse + * @cpu: target cpu + * + * Walks the updated cpu_stat tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_cpu_stat_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_cpu_stat *cstat; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point.
+ */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + cstat = cgroup_cpu_stat(pos, cpu); + if (cstat->updated_children == pos) + break; + pos = cstat->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && cstat->updated_next) { + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + struct cgroup_cpu_stat *ncstat; + struct cgroup **nextp; + + nextp = &pcstat->updated_children; + while (true) { + ncstat = cgroup_cpu_stat(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &ncstat->updated_next; + } + + *nextp = cstat->updated_next; + cstat->updated_next = NULL; + } + + return pos; +} + +static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, + struct cgroup_stat *src_stat) +{ + dst_stat->cputime.utime += src_stat->cputime.utime; + dst_stat->cputime.stime += src_stat->cputime.stime; + dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; +} + +static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct task_cputime *last_cputime = &cstat->last_cputime; + struct task_cputime cputime; + struct cgroup_stat delta; + unsigned seq; + + lockdep_assert_held(&cgroup_stat_mutex); + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&cstat->sync); + cputime = cstat->cputime; + } while (__u64_stats_fetch_retry(&cstat->sync, seq)); + + /* accumulate the deltas to propagate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; +
delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_stat_accumulate(&delta, &cgrp->pending_stat); + memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_stat_accumulate(&cgrp->stat, &delta); + if (parent) + cgroup_stat_accumulate(&parent->pending_stat, &delta); +} + +/* see cgroup_stat_flush() */ +static void cgroup_stat_flush_locked(struct cgroup *cgrp) +{ + int cpu; + + lockdep_assert_held(&cgroup_stat_mutex); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *pos = NULL; + + raw_spin_lock_irq(cpu_lock); + while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) + cgroup_cpu_stat_flush_one(pos, cpu); + raw_spin_unlock_irq(cpu_lock); + } +} + +/** + * cgroup_stat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. 
+ */ +void cgroup_stat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_stat_mutex); + cgroup_stat_flush_locked(cgrp); + mutex_unlock(&cgroup_stat_mutex); +} + +static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +{ + struct cgroup_cpu_stat *cstat; + + cstat = get_cpu_ptr(cgrp->cpu_stat); + u64_stats_update_begin(&cstat->sync); + return cstat; +} + +static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, + struct cgroup_cpu_stat *cstat) +{ + u64_stats_update_end(&cstat->sync); + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(cstat); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + cstat->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + cstat->cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + cstat->cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void cgroup_stat_show_cputime(struct seq_file *seq) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + mutex_lock(&cgroup_stat_mutex); + + cgroup_stat_flush_locked(cgrp); + + usage = cgrp->stat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, + &utime, &stime); + + mutex_unlock(&cgroup_stat_mutex); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); +} + +int cgroup_stat_init(struct cgroup 
*cgrp) +{ + int cpu; + + /* the root cgrp has cpu_stat preallocated */ + if (!cgrp->cpu_stat) { + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); + if (!cgrp->cpu_stat) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) + cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + + prev_cputime_init(&cgrp->stat.prev_cputime); + + return 0; +} + +void cgroup_stat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_stat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || + WARN_ON_ONCE(cstat->updated_next)) + return; + } + + free_percpu(cgrp->cpu_stat); + cgrp->cpu_stat = NULL; +} + +void __init cgroup_stat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); +} diff --git a/kernel/compat.c b/kernel/compat.c index 772e038d04d9..d1cee656a7ed 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } -int get_compat_itimerspec(struct itimerspec *dst, - const struct compat_itimerspec __user *src) -{ - if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || - __compat_get_timespec(&dst->it_value, &src->it_value)) - return -EFAULT; - return 0; -} - -int put_compat_itimerspec(struct compat_itimerspec __user *dst, - const struct itimerspec *src) -{ - if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || - __compat_put_timespec(&src->it_value, &dst->it_value)) - return -EFAULT; - return 0; -} - int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { @@ -485,27 +467,44 @@ Efault: return -EFAULT; } -void -sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) +int +get_compat_sigset(sigset_t 
*set, const compat_sigset_t __user *compat) { +#ifdef __BIG_ENDIAN + compat_sigset_t v; + if (copy_from_user(&v, compat, sizeof(compat_sigset_t))) + return -EFAULT; switch (_NSIG_WORDS) { - case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); - case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); - case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); - case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); + case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } +#else + if (copy_from_user(set, compat, sizeof(compat_sigset_t))) + return -EFAULT; +#endif + return 0; } -EXPORT_SYMBOL_GPL(sigset_from_compat); +EXPORT_SYMBOL_GPL(get_compat_sigset); -void -sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +int +put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, + unsigned int size) { + /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ +#ifdef __BIG_ENDIAN + compat_sigset_t v; switch (_NSIG_WORDS) { - case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; - case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; - case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; - case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; + case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; + case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; + case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; + case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; } + return copy_to_user(compat, &v, size) ? -EFAULT : 0; +#else + return copy_to_user(compat, set, size) ? 
-EFAULT : 0; +#endif } #ifdef CONFIG_NUMA @@ -563,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, } #endif -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct compat_timespec __user *, interval) -{ - struct timespec t; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); - set_fs(old_fs); - if (compat_put_timespec(&t, interval)) - return -EFAULT; - return ret; -} - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c39c05e029a..3939a4674e0a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3601,7 +3601,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } - /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise @@ -7867,11 +7866,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { - struct bpf_prog *prog = call->prog; - - if (prog) { + if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; - if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } @@ -8060,13 +8057,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; + int ret; if (event->attr.type != PERF_TYPE_TRACEPOINT) return perf_event_set_bpf_handler(event, prog_fd); - if (event->tp_event->prog) - return -EEXIST; - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = 
is_syscall_trace_event(event->tp_event); @@ -8094,26 +8089,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EACCES; } } - event->tp_event->prog = prog; - event->tp_event->bpf_prog_owner = event; - return 0; + ret = perf_event_attach_bpf_prog(event, prog); + if (ret) + bpf_prog_put(prog); + return ret; } static void perf_event_free_bpf_prog(struct perf_event *event) { - struct bpf_prog *prog; - - perf_event_free_bpf_handler(event); - - if (!event->tp_event) + if (event->attr.type != PERF_TYPE_TRACEPOINT) { + perf_event_free_bpf_handler(event); return; - - prog = event->tp_event->prog; - if (prog && event->tp_event->bpf_prog_owner == event) { - event->tp_event->prog = NULL; - bpf_prog_put(prog); } + perf_event_detach_bpf_prog(event); } #else diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f3e37971c842..141aa2ca8728 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -411,6 +411,7 @@ err: return NULL; } +EXPORT_SYMBOL_GPL(perf_aux_output_begin); static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) { @@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) rb_free_aux(rb); ring_buffer_put(rb); } +EXPORT_SYMBOL_GPL(perf_aux_output_end); /* * Skip over a given number of bytes in the AUX buffer, due to, for example, @@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) return 0; } +EXPORT_SYMBOL_GPL(perf_aux_output_skip); void *perf_get_aux(struct perf_output_handle *handle) { @@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle) return handle->rb->aux_priv; } +EXPORT_SYMBOL_GPL(perf_get_aux); #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..4e55eedba8d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -469,7 +469,7 @@ void __init fork_init(void) /* create a slab 
on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -817,8 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); - mm_nr_pmds_init(mm); + mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -872,12 +871,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (atomic_long_read(&mm->nr_ptes)) - pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - atomic_long_read(&mm->nr_ptes)); - if (mm_nr_pmds(mm)) - pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", - mm_nr_pmds(mm)); + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); @@ -2209,18 +2205,18 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); + SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* * FIXME! 
The "sizeof(struct mm_struct)" currently includes the @@ -2231,7 +2227,7 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f2edcf85780d..49b54e9979cc 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -862,6 +862,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) return 0; } +EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); void kstat_incr_irq_this_cpu(unsigned int irq) { diff --git a/kernel/kthread.c b/kernel/kthread.c index ba3992c8c375..8af313081b0d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,7 +20,6 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> -#include <linux/cgroup.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -47,6 +46,9 @@ struct kthread { void *data; struct completion parked; struct completion exited; +#ifdef CONFIG_BLK_CGROUP + struct cgroup_subsys_state *blkcg_css; +#endif }; enum KTHREAD_BITS { @@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k) void free_kthread_struct(struct task_struct *k) { + struct kthread *kthread; + /* * Can be NULL if this kthread was created by kernel_thread() * or if kmalloc() in kthread() failed. */ - kfree(to_kthread(k)); + kthread = to_kthread(k); +#ifdef CONFIG_BLK_CGROUP + WARN_ON_ONCE(kthread && kthread->blkcg_css); +#endif + kfree(kthread); } /** @@ -196,7 +204,7 @@ static int kthread(void *_create) struct kthread *self; int ret; - self = kmalloc(sizeof(*self), GFP_KERNEL); + self = kzalloc(sizeof(*self), GFP_KERNEL); set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. 
*/ @@ -212,7 +220,6 @@ static int kthread(void *_create) do_exit(-ENOMEM); } - self->flags = 0; self->data = data; init_completion(&self->exited); init_completion(&self->parked); @@ -1152,3 +1159,54 @@ void kthread_destroy_worker(struct kthread_worker *worker) kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); + +#ifdef CONFIG_BLK_CGROUP +/** + * kthread_associate_blkcg - associate blkcg to current kthread + * @css: the cgroup info + * + * Current thread must be a kthread. The thread is running jobs on behalf of + * other threads. In some cases, we expect the jobs attach cgroup info of + * original threads instead of that of current thread. This function stores + * original thread's cgroup info in current kthread context for later + * retrieval. + */ +void kthread_associate_blkcg(struct cgroup_subsys_state *css) +{ + struct kthread *kthread; + + if (!(current->flags & PF_KTHREAD)) + return; + kthread = to_kthread(current); + if (!kthread) + return; + + if (kthread->blkcg_css) { + css_put(kthread->blkcg_css); + kthread->blkcg_css = NULL; + } + if (css) { + css_get(css); + kthread->blkcg_css = css; + } +} +EXPORT_SYMBOL(kthread_associate_blkcg); + +/** + * kthread_blkcg - get associated blkcg css of current kthread + * + * Current thread must be a kthread. 
+ */ +struct cgroup_subsys_state *kthread_blkcg(void) +{ + struct kthread *kthread; + + if (current->flags & PF_KTHREAD) { + kthread = to_kthread(current); + if (kthread) + return kthread->blkcg_css; + } + return NULL; +} +EXPORT_SYMBOL(kthread_blkcg); +#endif diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 2b8bdb1925da..b36ceda6488e 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o transition.o +livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bf8c8fd72589..de9e45dca70f 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj) return obj->name; } -static bool klp_is_object_loaded(struct klp_object *obj) -{ - return !obj->name || obj->mod; -} - /* sets obj->mod if object is not vmlinux and module is found */ static void klp_find_object_module(struct klp_object *obj) { @@ -285,6 +280,11 @@ static int klp_write_object_relocations(struct module *pmod, static int __klp_disable_patch(struct klp_patch *patch) { + struct klp_object *obj; + + if (WARN_ON(!patch->enabled)) + return -EINVAL; + if (klp_transition_patch) return -EBUSY; @@ -295,6 +295,10 @@ static int __klp_disable_patch(struct klp_patch *patch) klp_init_transition(patch, KLP_UNPATCHED); + klp_for_each_object(patch, obj) + if (obj->patched) + klp_pre_unpatch_callback(obj); + /* * Enforce the order of the func->transition writes in * klp_init_transition() and the TIF_PATCH_PENDING writes in @@ -388,13 +392,18 @@ static int __klp_enable_patch(struct klp_patch *patch) if (!klp_is_object_loaded(obj)) continue; - ret = klp_patch_object(obj); + ret = klp_pre_patch_callback(obj); if (ret) { - pr_warn("failed to enable patch '%s'\n", - patch->mod->name); + pr_warn("pre-patch callback failed for object '%s'\n", + klp_is_module(obj) ? 
obj->name : "vmlinux"); + goto err; + } - klp_cancel_transition(); - return ret; + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to patch object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; } } @@ -403,6 +412,11 @@ static int __klp_enable_patch(struct klp_patch *patch) patch->enabled = true; return 0; +err: + pr_warn("failed to enable patch '%s'\n", patch->mod->name); + + klp_cancel_transition(); + return ret; } /** @@ -854,9 +868,15 @@ static void klp_cleanup_module_patches_limited(struct module *mod, * is in transition. */ if (patch->enabled || patch == klp_transition_patch) { + + if (patch != klp_transition_patch) + klp_pre_unpatch_callback(obj); + pr_notice("reverting patch '%s' on unloading module '%s'\n", patch->mod->name, obj->mod->name); klp_unpatch_object(obj); + + klp_post_unpatch_callback(obj); } klp_free_object_loaded(obj); @@ -906,13 +926,25 @@ int klp_module_coming(struct module *mod) pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); + ret = klp_pre_patch_callback(obj); + if (ret) { + pr_warn("pre-patch callback failed for object '%s'\n", + obj->name); + goto err; + } + ret = klp_patch_object(obj); if (ret) { pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); + + klp_post_unpatch_callback(obj); goto err; } + if (patch != klp_transition_patch) + klp_post_patch_callback(obj); + break; } } diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index a351601d7f76..48a83d4364cf 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -2,6 +2,46 @@ #ifndef _LIVEPATCH_CORE_H #define _LIVEPATCH_CORE_H +#include <linux/livepatch.h> + extern struct mutex klp_mutex; +static inline bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +static inline int klp_pre_patch_callback(struct klp_object *obj) +{ + int ret = 0; + + if (obj->callbacks.pre_patch) + ret = 
(*obj->callbacks.pre_patch)(obj); + + obj->callbacks.post_unpatch_enabled = !ret; + + return ret; +} + +static inline void klp_post_patch_callback(struct klp_object *obj) +{ + if (obj->callbacks.post_patch) + (*obj->callbacks.post_patch)(obj); +} + +static inline void klp_pre_unpatch_callback(struct klp_object *obj) +{ + if (obj->callbacks.pre_unpatch) + (*obj->callbacks.pre_unpatch)(obj); +} + +static inline void klp_post_unpatch_callback(struct klp_object *obj) +{ + if (obj->callbacks.post_unpatch_enabled && + obj->callbacks.post_unpatch) + (*obj->callbacks.post_unpatch)(obj); + + obj->callbacks.post_unpatch_enabled = false; +} + #endif /* _LIVEPATCH_CORE_H */ diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 52c4e907c14b..82d584225dc6 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/bug.h> #include <linux/printk.h> +#include "core.h" #include "patch.h" #include "transition.h" diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c new file mode 100644 index 000000000000..fdac27588d60 --- /dev/null +++ b/kernel/livepatch/shadow.c @@ -0,0 +1,277 @@ +/* + * shadow.c - Shadow Variables + * + * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com> + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/** + * DOC: Shadow variable API concurrency notes: + * + * The shadow variable API provides a simple relationship between an + * <obj, id> pair and a pointer value. It is the responsibility of the + * caller to provide any mutual exclusion required of the shadow data. + * + * Once a shadow variable is attached to its parent object via the + * klp_shadow_*alloc() API calls, it is considered live: any subsequent + * call to klp_shadow_get() may then return the shadow variable's data + * pointer. Callers of klp_shadow_*alloc() should prepare shadow data + * accordingly. + * + * The klp_shadow_*alloc() API calls may allocate memory for new shadow + * variable structures. Their implementation does not call kmalloc + * inside any spinlocks, but API callers should pass GFP flags according + * to their specific needs. + * + * The klp_shadow_hash is an RCU-enabled hashtable and is safe against + * concurrent klp_shadow_free() and klp_shadow_get() operations. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/hashtable.h> +#include <linux/slab.h> +#include <linux/livepatch.h> + +static DEFINE_HASHTABLE(klp_shadow_hash, 12); + +/* + * klp_shadow_lock provides exclusive access to the klp_shadow_hash and + * the shadow variables it references. 
+ */ +static DEFINE_SPINLOCK(klp_shadow_lock); + +/** + * struct klp_shadow - shadow variable structure + * @node: klp_shadow_hash hash table node + * @rcu_head: RCU is used to safely free this structure + * @obj: pointer to parent object + * @id: data identifier + * @data: data area + */ +struct klp_shadow { + struct hlist_node node; + struct rcu_head rcu_head; + void *obj; + unsigned long id; + char data[]; +}; + +/** + * klp_shadow_match() - verify a shadow variable matches given <obj, id> + * @shadow: shadow variable to match + * @obj: pointer to parent object + * @id: data identifier + * + * Return: true if the shadow variable matches. + */ +static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj, + unsigned long id) +{ + return shadow->obj == obj && shadow->id == id; +} + +/** + * klp_shadow_get() - retrieve a shadow variable data pointer + * @obj: pointer to parent object + * @id: data identifier + * + * Return: the shadow variable data element, NULL on failure. + */ +void *klp_shadow_get(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + + rcu_read_lock(); + + hash_for_each_possible_rcu(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + rcu_read_unlock(); + return shadow->data; + } + } + + rcu_read_unlock(); + + return NULL; +} +EXPORT_SYMBOL_GPL(klp_shadow_get); + +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags, bool warn_on_exist) +{ + struct klp_shadow *new_shadow; + void *shadow_data; + unsigned long flags; + + /* Check if the shadow variable already exists */ + shadow_data = klp_shadow_get(obj, id); + if (shadow_data) + goto exists; + + /* Allocate a new shadow variable for use inside the lock below */ + new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); + if (!new_shadow) + return NULL; + + new_shadow->obj = obj; + new_shadow->id = id; + + /* Initialize the shadow variable if data provided */ + if 
(data) + memcpy(new_shadow->data, data, size); + + /* Look for <obj, id> again under the lock */ + spin_lock_irqsave(&klp_shadow_lock, flags); + shadow_data = klp_shadow_get(obj, id); + if (unlikely(shadow_data)) { + /* + * Shadow variable was found, throw away speculative + * allocation. + */ + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + goto exists; + } + + /* No <obj, id> found, so attach the newly allocated one */ + hash_add_rcu(klp_shadow_hash, &new_shadow->node, + (unsigned long)new_shadow->obj); + spin_unlock_irqrestore(&klp_shadow_lock, flags); + + return new_shadow->data; + +exists: + if (warn_on_exist) { + WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id); + return NULL; + } + + return shadow_data; +} + +/** + * klp_shadow_alloc() - allocate and add a new shadow variable + * @obj: pointer to parent object + * @id: data identifier + * @data: pointer to data to attach to parent + * @size: size of attached data + * @gfp_flags: GFP mask for allocation + * + * Allocates @size bytes for new shadow variable data using @gfp_flags + * and copies @size bytes from @data into the new shadow variable's own + * data space. If @data is NULL, @size bytes are still allocated, but + * no copy is performed. The new shadow variable is then added to the + * global hashtable. + * + * If an existing <obj, id> shadow variable can be found, this routine + * will issue a WARN, exit early and return NULL. + * + * Return: the shadow variable data element, NULL on duplicate or + * failure. 
+ */ +void *klp_shadow_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); +} +EXPORT_SYMBOL_GPL(klp_shadow_alloc); + +/** + * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable + * @obj: pointer to parent object + * @id: data identifier + * @data: pointer to data to attach to parent + * @size: size of attached data + * @gfp_flags: GFP mask for allocation + * + * Returns a pointer to existing shadow data if an <obj, id> shadow + * variable is already present. Otherwise, it creates a new shadow + * variable like klp_shadow_alloc(). + * + * This function guarantees that only one shadow variable exists with + * the given @id for the given @obj. It also guarantees that the shadow + * variable will be initialized by the given @data only when it did not + * exist before. + * + * Return: the shadow variable data element, NULL on failure. + */ +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); +} +EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); + +/** + * klp_shadow_free() - detach and free a <obj, id> shadow variable + * @obj: pointer to parent object + * @id: data identifier + * + * This function releases the memory for this <obj, id> shadow variable + * instance, callers should stop referencing it accordingly. 
+ */ +void klp_shadow_free(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete <obj, id> from hash */ + hash_for_each_possible(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + break; + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free); + +/** + * klp_shadow_free_all() - detach and free all <*, id> shadow variables + * @id: data identifier + * + * This function releases the memory for all <*, id> shadow variable + * instances, callers should stop referencing them accordingly. + */ +void klp_shadow_free_all(unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + int i; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete all <*, id> from hash */ + hash_for_each(klp_shadow_hash, i, shadow, node) { + if (klp_shadow_match(shadow, shadow->obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free_all); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index b004a1fb6032..56add6327736 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -82,6 +82,10 @@ static void klp_complete_transition(void) unsigned int cpu; bool immediate_func = false; + pr_debug("'%s': completing %s transition\n", + klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? 
"patching" : "unpatching"); + if (klp_target_state == KLP_UNPATCHED) { /* * All tasks have transitioned to KLP_UNPATCHED so we can now @@ -109,9 +113,6 @@ static void klp_complete_transition(void) } } - if (klp_target_state == KLP_UNPATCHED && !immediate_func) - module_put(klp_transition_patch->mod); - /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) klp_synchronize_transition(); @@ -130,6 +131,27 @@ static void klp_complete_transition(void) } done: + klp_for_each_object(klp_transition_patch, obj) { + if (!klp_is_object_loaded(obj)) + continue; + if (klp_target_state == KLP_PATCHED) + klp_post_patch_callback(obj); + else if (klp_target_state == KLP_UNPATCHED) + klp_post_unpatch_callback(obj); + } + + pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + + /* + * See complementary comment in __klp_enable_patch() for why we + * keep the module reference for immediate patches. + */ + if (!klp_transition_patch->immediate && !immediate_func && + klp_target_state == KLP_UNPATCHED) { + module_put(klp_transition_patch->mod); + } + klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; } @@ -145,6 +167,9 @@ void klp_cancel_transition(void) if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) return; + pr_debug("'%s': canceling patching transition, going to unpatch\n", + klp_transition_patch->mod->name); + klp_target_state = KLP_UNPATCHED; klp_complete_transition(); } @@ -408,9 +433,6 @@ void klp_try_complete_transition(void) } success: - pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? 
"patching" : "unpatching"); - /* we're done, now cleanup the data structures */ klp_complete_transition(); } @@ -426,7 +448,8 @@ void klp_start_transition(void) WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); - pr_notice("'%s': %s...\n", klp_transition_patch->mod->name, + pr_notice("'%s': starting %s transition\n", + klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); /* @@ -482,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state) */ klp_target_state = state; + pr_debug("'%s': initializing %s transition\n", patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + /* * If the patch can be applied or reverted immediately, skip the * per-task transitions. @@ -547,6 +573,11 @@ void klp_reverse_transition(void) unsigned int cpu; struct task_struct *g, *task; + pr_debug("'%s': reversing transition from %s\n", + klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching to unpatching" : + "unpatching to patching"); + klp_transition_patch->enabled = !klp_transition_patch->enabled; klp_target_state = !klp_target_state; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index db933d063bfc..9776da8db180 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -47,7 +47,6 @@ #include <linux/stringify.h> #include <linux/bitops.h> #include <linux/gfp.h> -#include <linux/kmemcheck.h> #include <linux/random.h> #include <linux/jhash.h> @@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, { int i; - kmemcheck_mark_initialized(lock, sizeof(*lock)); - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) lock->class_cache[i] = NULL; diff --git a/kernel/module.c b/kernel/module.c index 32c2cdaccd93..222aba4aa960 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -847,10 +847,8 @@ static int add_module_usage(struct module *a, struct module *b) pr_debug("Allocating new usage for %s.\n", a->name); use = 
kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - pr_warn("%s: out of memory loading\n", a->name); + if (!use) return -ENOMEM; - } use->source = a; use->target = b; diff --git a/kernel/padata.c b/kernel/padata.c index 868f947166d7..f262c9a4e70a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->cb_cpu = cb_cpu; target_cpu = padata_cpu_hash(pd); + padata->cpu = target_cpu; queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -275,11 +276,51 @@ static void padata_reorder(struct parallel_data *pd) return; } +static void invoke_padata_reorder(struct work_struct *work) +{ + struct padata_parallel_queue *pqueue; + struct parallel_data *pd; + + local_bh_disable(); + pqueue = container_of(work, struct padata_parallel_queue, reorder_work); + pd = pqueue->pd; + padata_reorder(pd); + local_bh_enable(); +} + static void padata_reorder_timer(unsigned long arg) { struct parallel_data *pd = (struct parallel_data *)arg; + unsigned int weight; + int target_cpu, cpu; - padata_reorder(pd); + cpu = get_cpu(); + + /* We don't lock pd here to not interfere with parallel processing + * padata_reorder() calls on other CPUs. We just need any CPU out of + * the cpumask.pcpu set. It would be nice if it's the right one but + * it doesn't matter if we're off to the next one by using an outdated + * pd->processed value. + */ + weight = cpumask_weight(pd->cpumask.pcpu); + target_cpu = padata_index_to_cpu(pd, pd->processed % weight); + + /* ensure to call the reorder callback on the correct CPU */ + if (cpu != target_cpu) { + struct padata_parallel_queue *pqueue; + struct padata_instance *pinst; + + /* The timer function is serialized wrt itself -- no locking + * needed. 
+ */ + pinst = pd->pinst; + pqueue = per_cpu_ptr(pd->pqueue, target_cpu); + queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work); + } else { + padata_reorder(pd); + } + + put_cpu(); } static void padata_serial_worker(struct work_struct *serial_work) @@ -323,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata) int cpu; struct padata_parallel_queue *pqueue; struct parallel_data *pd; + int reorder_via_wq = 0; pd = padata->pd; cpu = get_cpu(); + + /* We need to run on the same CPU padata_do_parallel(.., padata, ..) + * was called on -- or, at least, enqueue the padata object into the + * correct per-cpu queue. + */ + if (cpu != padata->cpu) { + reorder_via_wq = 1; + cpu = padata->cpu; + } + pqueue = per_cpu_ptr(pd->pqueue, cpu); spin_lock(&pqueue->reorder.lock); @@ -336,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata) put_cpu(); - padata_reorder(pd); + /* If we're running on the wrong CPU, call padata_reorder() via a + * kernel worker. + */ + if (reorder_via_wq) + queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work); + else + padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); @@ -384,8 +442,14 @@ static void padata_init_pqueues(struct parallel_data *pd) struct padata_parallel_queue *pqueue; cpu_index = 0; - for_each_cpu(cpu, pd->cpumask.pcpu) { + for_each_possible_cpu(cpu) { pqueue = per_cpu_ptr(pd->pqueue, cpu); + + if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) { + pqueue->cpu_index = -1; + continue; + } + pqueue->pd = pd; pqueue->cpu_index = cpu_index; cpu_index++; @@ -393,6 +457,7 @@ static void padata_init_pqueues(struct parallel_data *pd) __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); + INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder); atomic_set(&pqueue->num_obj, 0); } } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a917a301e201..bce0464524d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ 
-1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) */ static inline int get_highmem_buffer(int safe_needed) { - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + buffer = get_image_page(GFP_ATOMIC, safe_needed); return buffer ? 0 : -ENOMEM; } @@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm, while (nr_pages-- > 0) { struct page *page; - page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_image_page(GFP_ATOMIC); if (!page) goto err_out; memory_bm_set_bit(copy_bm, page_to_pfn(page)); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b82a0073532..75554f366fd3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include <linux/init_task.h> #include <linux/context_tracking.h> #include <linux/rcupdate_wait.h> +#include <linux/compat.h> #include <linux/blkdev.h> #include <linux/kprobes.h> @@ -5107,13 +5108,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * Return: On success, 0 and the timeslice is in @interval. Otherwise, * an error code. */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) +static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; unsigned int time_slice; struct rq_flags rf; - struct timespec t; struct rq *rq; int retval; @@ -5137,15 +5136,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, task_rq_unlock(rq, p, &rf); rcu_read_unlock(); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; - return retval; + jiffies_to_timespec64(time_slice, t); + return 0; out_unlock: rcu_read_unlock(); return retval; } +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_timespec64(&t, interval); + + return retval; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, + compat_pid_t, pid, + struct compat_timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = compat_put_timespec64(&t, interval); + return retval; +} +#endif + void sched_show_task(struct task_struct *p) { unsigned long free = 0; @@ -6620,7 +6644,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct seq_file *sf, void *v) +static int cpu_cfs_stat_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; @@ -6660,7 +6684,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ -static struct cftype cpu_files[] = { +static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -6681,7 +6705,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .seq_show = cpu_stats_show, + .seq_show = cpu_cfs_stat_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -6699,16 +6723,182 @@ static struct cftype cpu_files[] = { { } /* Terminate */ }; +static int cpu_extra_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) +{ +#ifdef CONFIG_CFS_BANDWIDTH + { + struct task_group *tg = css_tg(css); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 throttled_usec; + + throttled_usec = cfs_b->throttled_time; + do_div(throttled_usec, NSEC_PER_USEC); + + seq_printf(sf, "nr_periods %d\n" + "nr_throttled %d\n" + 
"throttled_usec %llu\n", + cfs_b->nr_periods, cfs_b->nr_throttled, + throttled_usec); + } +#endif + return 0; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + u64 weight = scale_load_down(tg->shares); + + return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); +} + +static int cpu_weight_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 weight) +{ + /* + * cgroup weight knobs should use the common MIN, DFL and MAX + * values which are 1, 100 and 10000 respectively. While it loses + * a bit of range on both ends, it maps pretty well onto the shares + * value used by scheduler and the round-trip conversions preserve + * the original value over the entire range. + */ + if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + return -ERANGE; + + weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + + return sched_group_set_shares(css_tg(css), scale_load(weight)); +} + +static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + unsigned long weight = scale_load_down(css_tg(css)->shares); + int last_delta = INT_MAX; + int prio, delta; + + /* find the closest nice value to the current weight */ + for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) { + delta = abs(sched_prio_to_weight[prio] - weight); + if (delta >= last_delta) + break; + last_delta = delta; + } + + return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); +} + +static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 nice) +{ + unsigned long weight; + + if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; + + weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); +} +#endif + +static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, + long period, long quota) +{ + if (quota 
< 0) + seq_puts(sf, "max"); + else + seq_printf(sf, "%ld", quota); + + seq_printf(sf, " %ld\n", period); +} + +/* caller should put the current value in *@periodp before calling */ +static int __maybe_unused cpu_period_quota_parse(char *buf, + u64 *periodp, u64 *quotap) +{ + char tok[21]; /* U64_MAX */ + + if (!sscanf(buf, "%s %llu", tok, periodp)) + return -EINVAL; + + *periodp *= NSEC_PER_USEC; + + if (sscanf(tok, "%llu", quotap)) + *quotap *= NSEC_PER_USEC; + else if (!strcmp(tok, "max")) + *quotap = RUNTIME_INF; + else + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_CFS_BANDWIDTH +static int cpu_max_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + return 0; +} + +static ssize_t cpu_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct task_group *tg = css_tg(of_css(of)); + u64 period = tg_get_cfs_period(tg); + u64 quota; + int ret; + + ret = cpu_period_quota_parse(buf, &period, &quota); + if (!ret) + ret = tg_set_cfs_bandwidth(tg, period, quota); + return ret ?: nbytes; +} +#endif + +static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_weight_read_u64, + .write_u64 = cpu_weight_write_u64, + }, + { + .name = "weight.nice", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_weight_nice_read_s64, + .write_s64 = cpu_weight_nice_write_s64, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_max_show, + .write = cpu_max_write, + }, +#endif + { } /* terminate */ +}; + struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = 
cpu_cgroup_attach, - .legacy_cftypes = cpu_files, + .legacy_cftypes = cpu_legacy_files, + .dfl_cftypes = cpu_files, .early_init = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h deleted file mode 100644 index a8358a57a316..000000000000 --- a/kernel/sched/cpuacct.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_CGROUP_CPUACCT - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); - -#else - -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ -} - -static inline void -cpuacct_account_field(struct task_struct *tsk, int index, u64 val) -{ -} - -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9be8b68a66da..bac6ac9a4ec7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __this_cpu_add(kernel_cpustat.cpustat[index], tmp); - cpuacct_account_field(p, index, tmp); + cgroup_account_cputime_field(p, index, tmp); } /* @@ -446,6 +446,13 @@ void vtime_account_irq_enter(struct task_struct *tsk) EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) +{ + *ut = curr->utime; + *st = curr->stime; +} + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { *ut = p->utime; @@ -584,9 +591,8 @@ drop_precision: * * Assuming that rtime_i+1 >= rtime_i. 
*/ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - u64 *ut, u64 *st) +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) { u64 rtime, stime, utime; unsigned long flags; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f349f7e98dec..2473736c7616 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1144,7 +1144,7 @@ static void update_curr_dl(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0989676c50e9..4037e19bbca2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -844,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); + cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d8c43d73e078..4056c19ca3f0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -969,7 +969,7 @@ static void update_curr_rt(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 45ab0bf564e7..b19552a212de 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -30,6 +30,7 @@ #include <linux/irq_work.h> #include <linux/tick.h> #include <linux/slab.h> +#include <linux/cgroup.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> @@ -37,7 +38,6 @@ #include "cpupri.h" #include "cpudeadline.h" -#include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG # define 
SCHED_WARN_ON(x) WARN_ONCE(x, #x) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 45caf90b24cd..210b1f2146ff 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -72,7 +72,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index f8159698aa4d..84cb3acd9260 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo */ static __sched int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - int (*action)(atomic_t *), unsigned mode) + wait_atomic_t_action_f action, unsigned int mode) { atomic_t *val; int ret = 0; @@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en val = wbq_entry->key.flags; if (atomic_read(val) == 0) break; - ret = (*action)(val); + ret = (*action)(val, mode); } while (!ret && atomic_read(val) != 0); finish_wait(wq_head, &wbq_entry->wq_entry); return ret; @@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en }, \ } -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, + wait_atomic_t_action_f action, + unsigned int mode) { struct wait_queue_head *wq_head = atomic_t_waitqueue(p); DEFINE_WAIT_ATOMIC_T(wq_entry, p); @@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), } EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); +__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) +{ + schedule(); 
+ if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(atomic_t_wait); + /** * wake_up_atomic_t - Wake up a waiter on a atomic_t * @p: The atomic_t being waited on, a kernel virtual address diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..babb36d3d039 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, - override_rlimit); + q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { @@ -2600,7 +2599,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t old_set = current->blocked; /* XXX: Don't preclude handling different sized sigset_t's. */ @@ -2608,38 +2606,22 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return -EINVAL; if (nset) { - compat_sigset_t new32; sigset_t new_set; int error; - if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&new_set, nset)) return -EFAULT; - - sigset_from_compat(&new_set, &new32); sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); if (error) return error; } - if (oset) { - compat_sigset_t old32; - sigset_to_compat(&old32, &old_set); - if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return 0; -#else - return sys_rt_sigprocmask(how, (sigset_t __user *)nset, - (sigset_t __user *)oset, sigsetsize); -#endif + return oset ? 
put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; } #endif -static int do_sigpending(void *set, unsigned long sigsetsize) +static int do_sigpending(sigset_t *set) { - if (sigsetsize > sizeof(sigset_t)) - return -EINVAL; - spin_lock_irq(&current->sighand->siglock); sigorsets(set, &current->pending.signal, &current->signal->shared_pending.signal); @@ -2659,7 +2641,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize) SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; - int err = do_sigpending(&set, sigsetsize); + int err; + + if (sigsetsize > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); if (!err && copy_to_user(uset, &set, sigsetsize)) err = -EFAULT; return err; @@ -2669,20 +2656,16 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t set; - int err = do_sigpending(&set, sigsetsize); - if (!err) { - compat_sigset_t set32; - sigset_to_compat(&set32, &set); - /* we can get here only if sigsetsize <= sizeof(set) */ - if (copy_to_user(uset, &set32, sigsetsize)) - err = -EFAULT; - } + int err; + + if (sigsetsize > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); + if (!err) + err = put_compat_sigset(uset, &set, sigsetsize); return err; -#else - return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); -#endif } #endif @@ -2916,7 +2899,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct compat_timespec __user *, uts, compat_size_t, sigsetsize) { - compat_sigset_t s32; sigset_t s; struct timespec t; siginfo_t info; @@ -2925,9 +2907,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + if (get_compat_sigset(&s, uthese)) return -EFAULT; 
- sigset_from_compat(&s, &s32); if (uts) { if (compat_get_timespec(&t, uts)) @@ -3345,15 +3326,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { -#ifdef __BIG_ENDIAN sigset_t set; - int err = do_sigpending(&set, sizeof(set.sig[0])); + int err = do_sigpending(&set); if (!err) err = put_user(set.sig[0], set32); return err; -#else - return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32)); -#endif } #endif @@ -3451,7 +3428,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; - compat_sigset_t mask; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif @@ -3469,19 +3445,18 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif - ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); + ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; - sigset_from_compat(&new_ka.sa.sa_mask, &mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
&old_ka : NULL); if (!ret && oact) { - sigset_to_compat(&mask, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); + ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, + sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), @@ -3661,22 +3636,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t newset; - compat_sigset_t newset32; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&newset, unewset)) return -EFAULT; - sigset_from_compat(&newset, &newset32); return sigsuspend(&newset); -#else - /* on little-endian bitmaps don't care about granularity */ - return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); -#endif } #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 662f7b1b7a78..2f5e87f1bae2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); -void __tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - lockdep_assert_irqs_disabled(); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); -} -EXPORT_SYMBOL(__tasklet_hi_schedule_first); - static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/sys.c b/kernel/sys.c index 524a4cb9bbe2..83ffd7dccf23 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -111,6 +111,12 @@ #ifndef SET_FP_MODE # define 
SET_FP_MODE(a,b) (-EINVAL) #endif +#ifndef SVE_SET_VL +# define SVE_SET_VL(a) (-EINVAL) +#endif +#ifndef SVE_GET_VL +# define SVE_GET_VL() (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2386,6 +2392,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; + case PR_SVE_SET_VL: + error = SVE_SET_VL(arg2); + break; + case PR_SVE_GET_VL: + error = SVE_GET_VL(); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d9c31bc2eaea..4a13a389e99b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,7 +30,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/kmemcheck.h> #include <linux/kmemleak.h> #include <linux/fs.h> #include <linux/init.h> @@ -1174,15 +1173,6 @@ static struct ctl_table kern_table[] = { .extra2 = &one_thousand, }, #endif -#ifdef CONFIG_KMEMCHECK - { - .procname = "kmemcheck", - .data = &kmemcheck_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "panic_on_warn", .data = &panic_on_warn, @@ -1342,11 +1332,6 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { - .procname = "nr_pdflush_threads", - .mode = 0444 /* read-only */, - .proc_handler = pdflush_proc_obsolete, - }, - { .procname = "swappiness", .data = &vm_swappiness, .maxlen = sizeof(vm_swappiness), @@ -1371,6 +1356,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "hugetlb_shm_group", diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..f54b7b6b4a4b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ 
-224,7 +224,7 @@ config HWLAT_TRACER select GENERIC_TRACER help This tracer, when enabled will create one or more kernel threads, - depening on what the cpumask file is set to, which each thread + depending on what the cpumask file is set to, which each thread spinning in a loop looking for interruptions caused by something other than the kernel. For example, if a System Management Interrupt (SMI) takes a noticeable amount of @@ -239,7 +239,7 @@ config HWLAT_TRACER iteration A kernel thread is created that will spin with interrupts disabled - for "width" microseconds in every "widow" cycle. It will not spin + for "width" microseconds in every "window" cycle. It will not spin for "window - width" microseconds, where the system can continue to operate. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 45a3928544ce..206e0e2ace53 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static atomic_t blk_probes_ref = ATOMIC_INIT(0); +static DEFINE_MUTEX(blk_probe_mutex); +static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); @@ -329,14 +330,29 @@ static void blk_trace_free(struct blk_trace *bt) kfree(bt); } +static void get_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (++blk_probes_ref == 1) + blk_register_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void put_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (!--blk_probes_ref) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + static void blk_trace_cleanup(struct blk_trace *bt) { blk_trace_free(bt); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); + put_probe_ref(); } -int blk_trace_remove(struct request_queue *q) +static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; @@ -349,6 +365,17 @@ int blk_trace_remove(struct 
request_queue *q) return 0; } + +int blk_trace_remove(struct request_queue *q) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_remove(q); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, @@ -538,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); ret = 0; err: @@ -550,9 +576,8 @@ err: return ret; } -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) +static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -571,6 +596,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; } + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_setup(q, name, dev, bdev, arg); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) @@ -607,7 +645,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, } #endif -int blk_trace_startstop(struct request_queue *q, int start) +static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; struct blk_trace *bt = q->blk_trace; @@ -646,6 +684,17 @@ int blk_trace_startstop(struct request_queue *q, int start) return ret; } + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_startstop(q, start); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} 
EXPORT_SYMBOL_GPL(blk_trace_startstop); /* @@ -676,7 +725,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -687,10 +736,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) case BLKTRACESTART: start = 1; case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); + ret = __blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); + ret = __blk_trace_remove(q); break; default: ret = -ENOTTY; @@ -708,10 +757,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { + mutex_lock(&q->blk_trace_mutex); + if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); + __blk_trace_startstop(q, 0); + __blk_trace_remove(q); } + + mutex_unlock(&q->blk_trace_mutex); } #ifdef CONFIG_BLK_CGROUP @@ -1558,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - + put_probe_ref(); blk_trace_free(bt); return 0; } @@ -1591,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q, if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); return 0; free_bt: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 95888ae6c263..a5580c670866 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -15,9 +15,11 @@ #include <linux/ctype.h> #include "trace.h" +u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + /** * trace_call_bpf - invoke BPF program - * @prog: BPF program + * @call: 
tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. @@ -29,7 +31,7 @@ * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; @@ -49,9 +51,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) goto out; } - rcu_read_lock(); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); + /* + * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock + * to all call sites, we did a bpf_prog_array_valid() there to check + * whether call->prog_array is empty or not, which is + * a heurisitc to speed up execution. + * + * If bpf_prog_array_valid() fetched prog_array was + * non-NULL, we go into trace_call_bpf() and do the actual + * proper rcu_dereference() under RCU lock. + * If it turns out that prog_array is NULL then, we bail out. + * For the opposite, if the bpf_prog_array_valid() fetched pointer + * was NULL, you'll skip the prog_array with the risk of missing + * out of events when it was updated in between this and the + * rcu_dereference() which is accepted risk. 
+ */ + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); out: __this_cpu_dec(bpf_prog_active); @@ -63,12 +78,16 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - int ret; + int ret = 0; + + if (unlikely(size == 0)) + goto out; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); + out: return ret; } @@ -77,7 +96,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; @@ -255,14 +274,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +static __always_inline int +get_map_perf_counter(struct bpf_map *map, u64 flags, + u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - u64 value = 0; - int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -275,7 +294,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value, NULL, NULL); + return perf_event_read_local(ee->event, value, enabled, running); +} + +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +{ + u64 value = 0; + int err; + + err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi @@ -293,6 +320,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, + struct bpf_perf_event_value *, buf, u32, 
size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .func = bpf_perf_event_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); static __always_inline u64 @@ -499,6 +553,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; default: return tracing_func_proto(func_id); } @@ -524,11 +580,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; +const struct bpf_prog_ops kprobe_prog_ops = { +}; + BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -576,6 +635,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + 
return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { + .func = bpf_perf_prog_read_value_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -583,6 +668,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto_tp; default: return tracing_func_proto(func_id); } @@ -602,11 +689,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -662,8 +752,67 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; + +const struct bpf_prog_ops perf_event_prog_ops = { +}; + +static DEFINE_MUTEX(bpf_event_mutex); + +int perf_event_attach_bpf_prog(struct perf_event *event, + struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret = -EEXIST; + + mutex_lock(&bpf_event_mutex); + + if (event->prog) + goto unlock; + + old_array = event->tp_event->prog_array; + ret = 
bpf_prog_array_copy(old_array, NULL, prog, &new_array); + if (ret < 0) + goto unlock; + + /* set the new array to event->tp_event and set event->prog */ + event->prog = prog; + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + +unlock: + mutex_unlock(&bpf_event_mutex); + return ret; +} + +void perf_event_detach_bpf_prog(struct perf_event *event) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret; + + mutex_lock(&bpf_event_mutex); + + if (!event->prog) + goto unlock; + + old_array = event->tp_event->prog_array; + ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret < 0) { + bpf_prog_array_delete_safe(old_array, event->prog); + } else { + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + } + + bpf_prog_put(event->prog); + event->prog = NULL; + +unlock: + mutex_unlock(&bpf_event_mutex); +} diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 845f3805c73d..d57fede84b38 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -13,7 +13,6 @@ #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kthread.h> /* for self test */ -#include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> @@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); /* account for padding bytes */ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); @@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 
8a907e12b6b9..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,13 +1174,12 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); @@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a2a642f2c64f..19bcaaac884b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -560,9 +560,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) { +static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -574,7 +575,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) param.args[i] = rec->args[i]; - return trace_call_bpf(prog, &param); + return trace_call_bpf(call, &param); 
} static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) @@ -582,7 +583,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -597,9 +598,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; - prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -615,7 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + if ((valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; @@ -660,8 +662,9 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } -static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_trace_exit *rec) { +static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_trace_exit *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -671,7 +674,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, *(struct pt_regs **)&param = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; - return trace_call_bpf(prog, &param); + return trace_call_bpf(call, &param); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) @@ -679,7 +682,7 @@ static 
void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -694,9 +697,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); + if (!valid_prog_array && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -710,7 +713,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + if ((valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4525e0271a53..153c0e411461 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; - struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); diff --git a/kernel/user.c b/kernel/user.c index 00281add65b2..9a20acce460d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -26,26 +26,32 @@ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 
0, + .count = 4294967295U, + }, }, }, .gid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .projid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .count = ATOMIC_INIT(3), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d32b45662fb6..246d4d4ce5c7 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -23,6 +23,8 @@ #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> +#include <linux/bsearch.h> +#include <linux/sort.h> static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); @@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->gid_map.forward); + kfree(ns->gid_map.reverse); + } + if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->uid_map.forward); + kfree(ns->uid_map.reverse); + } + if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->projid_map.forward); + kfree(ns->projid_map.reverse); + } retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); @@ -198,26 +212,101 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +/** + * idmap_key struct holds the information necessary to find an idmapping in a + * sorted idmap array. It is passed to cmp_map_id() as first argument. 
+ */ +struct idmap_key { + bool map_up; /* true -> id from kid; false -> kid from id */ + u32 id; /* id to find */ + u32 count; /* == 0 unless used with map_id_range_down() */ +}; + +/** + * cmp_map_id - Function to be passed to bsearch() to find the requested + * idmapping. Expects struct idmap_key to be passed via @k. + */ +static int cmp_map_id(const void *k, const void *e) { - unsigned idx, extents; + u32 first, last, id2; + const struct idmap_key *key = k; + const struct uid_gid_extent *el = e; + + id2 = key->id + key->count - 1; + + /* handle map_id_{down,up}() */ + if (key->map_up) + first = el->lower_first; + else + first = el->first; + + last = first + el->count - 1; + + if (key->id >= first && key->id <= last && + (id2 >= first && id2 <= last)) + return 0; + + if (key->id < first || id2 < first) + return -1; + + return 1; +} + +/** + * map_id_range_down_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static struct uid_gid_extent * +map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) +{ + struct idmap_key key; + + key.map_up = false; + key.count = count; + key.id = id; + + return bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); +} + +/** + * map_id_range_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. 
+ */ +static struct uid_gid_extent * +map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) +{ + unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) - break; + return &map->extent[idx]; } + return NULL; +} + +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +{ + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = map_id_range_down_base(extents, map, id, count); + else + extent = map_id_range_down_max(extents, map, id, count); + /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].lower_first; + if (extent) + id = (id - extent->first) + extent->lower_first; else id = (u32) -1; @@ -226,44 +315,61 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) static u32 map_id_down(struct uid_gid_map *map, u32 id) { - unsigned idx, extents; + return map_id_range_down(map, id, 1); +} + +/** + * map_id_up_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. 
+ */ +static struct uid_gid_extent * +map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) +{ + unsigned idx; u32 first, last; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { - first = map->extent[idx].first; + first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].lower_first; - else - id = (u32) -1; + return NULL; +} - return id; +/** + * map_id_up_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static struct uid_gid_extent * +map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) +{ + struct idmap_key key; + + key.map_up = true; + key.count = 1; + key.id = id; + + return bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); } static u32 map_id_up(struct uid_gid_map *map, u32 id) { - unsigned idx, extents; - u32 first, last; - - /* Find the matching extent */ - extents = map->nr_extents; + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); - for (idx = 0; idx < extents; idx++) { - first = map->extent[idx].lower_first; - last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) - break; - } + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = map_id_up_base(extents, map, id); + else + extent = map_id_up_max(extents, map, id); + /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].first; + if (extent) + id = (id - extent->lower_first) + extent->first; else id = (u32) -1; @@ -540,13 +646,17 @@ static int projid_m_show(struct seq_file *seq, void *v) static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { - struct uid_gid_extent *extent = NULL; 
loff_t pos = *ppos; + unsigned extents = map->nr_extents; + smp_rmb(); - if (pos < map->nr_extents) - extent = &map->extent[pos]; + if (pos >= extents) + return NULL; - return extent; + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return &map->extent[pos]; + + return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) @@ -618,7 +728,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map, u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; - prev = &new_map->extent[idx]; + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + prev = &new_map->extent[idx]; + else + prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; @@ -638,6 +751,101 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } +/** + * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. + * Takes care to allocate a 4K block of memory if the number of mappings exceeds + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) +{ + struct uid_gid_extent *dest; + + if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { + struct uid_gid_extent *forward; + + /* Allocate memory for 340 mappings. */ + forward = kmalloc(sizeof(struct uid_gid_extent) * + UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL); + if (!forward) + return -ENOMEM; + + /* Copy over memory. Only set up memory for the forward pointer. + * Defer the memory setup for the reverse pointer. 
+ */ + memcpy(forward, map->extent, + map->nr_extents * sizeof(map->extent[0])); + + map->forward = forward; + map->reverse = NULL; + } + + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) + dest = &map->extent[map->nr_extents]; + else + dest = &map->forward[map->nr_extents]; + + *dest = *extent; + map->nr_extents++; + return 0; +} + +/* cmp function to sort() forward mappings */ +static int cmp_extents_forward(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->first < e2->first) + return -1; + + if (e1->first > e2->first) + return 1; + + return 0; +} + +/* cmp function to sort() reverse mappings */ +static int cmp_extents_reverse(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->lower_first < e2->lower_first) + return -1; + + if (e1->lower_first > e2->lower_first) + return 1; + + return 0; +} + +/** + * sort_idmaps - Sorts an array of idmap entries. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int sort_idmaps(struct uid_gid_map *map) +{ + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return 0; + + /* Sort forward array. */ + sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_forward, NULL); + + /* Only copy the memory from forward we actually need. */ + map->reverse = kmemdup(map->forward, + map->nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL); + if (!map->reverse) + return -ENOMEM; + + /* Sort reverse array. 
*/ + sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_reverse, NULL); + + return 0; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -648,7 +856,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent = NULL; + struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; @@ -673,6 +881,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, */ mutex_lock(&userns_state_mutex); + memset(&new_map, 0, sizeof(struct uid_gid_map)); + ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) @@ -700,9 +910,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Parse the user data */ ret = -EINVAL; pos = kbuf; - new_map.nr_extents = 0; for (; pos; pos = next_line) { - extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); @@ -714,17 +922,17 @@ static ssize_t map_write(struct file *file, const char __user *buf, } pos = skip_spaces(pos); - extent->first = simple_strtoul(pos, &pos, 10); + extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->lower_first = simple_strtoul(pos, &pos, 10); + extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->count = simple_strtoul(pos, &pos, 10); + extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; @@ -734,29 +942,31 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; /* Verify we have been given valid starting values */ - if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1)) + if ((extent.first == (u32) -1) || + (extent.lower_first == (u32) 
-1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ - if ((extent->first + extent->count) <= extent->first) + if ((extent.first + extent.count) <= extent.first) goto out; - if ((extent->lower_first + extent->count) <= - extent->lower_first) + if ((extent.lower_first + extent.count) <= + extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ - if (mappings_overlap(&new_map, extent)) + if (mappings_overlap(&new_map, &extent)) goto out; - new_map.nr_extents++; - - /* Fail if the file contains too many extents */ - if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && + if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; + + ret = insert_extent(&new_map, &extent); + if (ret < 0) + goto out; + ret = -EINVAL; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) @@ -767,16 +977,26 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + + ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { + struct uid_gid_extent *e; u32 lower_first; - extent = &new_map.extent[idx]; + + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + e = &new_map.extent[idx]; + else + e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, - extent->lower_first, - extent->count); + e->lower_first, + e->count); /* Fail if we can not map the specified extent to * the kernel global id space. 
@@ -784,18 +1004,31 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (lower_first == (u32) -1) goto out; - extent->lower_first = lower_first; + e->lower_first = lower_first; } /* Install the map */ - memcpy(map->extent, new_map.extent, - new_map.nr_extents*sizeof(new_map.extent[0])); + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + memcpy(map->extent, new_map.extent, + new_map.nr_extents * sizeof(new_map.extent[0])); + } else { + map->forward = new_map.forward; + map->reverse = new_map.reverse; + } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: + if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(new_map.forward); + kfree(new_map.reverse); + map->forward = NULL; + map->reverse = NULL; + map->nr_extents = 0; + } + mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7368b57842ea..dde6298f6b22 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4990,9 +4990,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) * * Unbound workqueues have the following extra attributes. * - * id RO int : the associated pool ID + * pool_ids RO int : the associated pool IDs for each node * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers + * numa RW bool : whether enable NUMA affinity */ struct wq_device { struct workqueue_struct *wq; |