diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/Makefile | 1 | ||||
-rw-r--r-- | kernel/bpf/cgroup.c | 97 | ||||
-rw-r--r-- | kernel/bpf/core.c | 6 | ||||
-rw-r--r-- | kernel/bpf/cpumap.c | 76 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 78 | ||||
-rw-r--r-- | kernel/bpf/dispatcher.c | 158 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 63 | ||||
-rw-r--r-- | kernel/bpf/trampoline.c | 24 | ||||
-rw-r--r-- | kernel/bpf/xskmap.c | 18 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 5 |
10 files changed, 332 insertions, 194 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3f671bf617e8..d4f330351f87 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o +obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 4fb20ab179fe..ee871d4a51f7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -103,8 +103,7 @@ static u32 prog_list_length(struct list_head *head) * if parent has overridable or multi-prog, allow attaching */ static bool hierarchy_allows_attach(struct cgroup *cgrp, - enum bpf_attach_type type, - u32 new_flags) + enum bpf_attach_type type) { struct cgroup *p; @@ -283,31 +282,34 @@ cleanup: * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * * Must be called with cgroup_mutex held. */ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_prog *replace_prog, enum bpf_attach_type type, u32 flags) { + u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; - struct bpf_prog_list *pl; - bool pl_was_allocated; int err; - if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || + ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; - if (!hierarchy_allows_attach(cgrp, type, flags)) + if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; - if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none @@ -317,6 +319,21 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + if (pl->prog == replace_prog) + replace_pl = pl; + } + if ((flags & BPF_F_REPLACE) && !replace_pl) + /* prog to replace not found for cgroup */ + return -ENOENT; + } else if (!list_empty(progs)) { + replace_pl = list_first_entry(progs, typeof(*pl), node); + } + for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storage[stype])) { @@ -327,53 +344,28 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } } - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) { - /* disallow attaching the same prog twice */ - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -EINVAL; - } + if (replace_pl) { + pl = replace_pl; + old_prog = pl->prog; + for_each_cgroup_storage_type(stype) { + old_storage[stype] = pl->storage[stype]; + bpf_cgroup_storage_unlink(old_storage[stype]); } - + } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } - - pl_was_allocated = true; - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); - } else { - if (list_empty(progs)) { - pl = kmalloc(sizeof(*pl), GFP_KERNEL); - if (!pl) { - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); - return -ENOMEM; - } - pl_was_allocated = true; - list_add_tail(&pl->node, progs); - } else { - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - for_each_cgroup_storage_type(stype) { - old_storage[stype] = pl->storage[stype]; - bpf_cgroup_storage_unlink(old_storage[stype]); - } - pl_was_allocated = false; - } - pl->prog = prog; - for_each_cgroup_storage_type(stype) - pl->storage[stype] = storage[stype]; } - cgrp->bpf.flags[type] = flags; + pl->prog = prog; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; + + cgrp->bpf.flags[type] = saved_flags; err = update_effective_progs(cgrp, type); if (err) @@ -401,7 +393,7 @@ cleanup: pl->storage[stype] = old_storage[stype]; bpf_cgroup_storage_link(old_storage[stype], cgrp, type); } - if (pl_was_allocated) { + if (!replace_pl) { list_del(&pl->node); kfree(pl); } @@ -539,6 +531,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { + struct bpf_prog *replace_prog = NULL; struct cgroup *cgrp; int ret; @@ -546,8 +539,20 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + if ((attr->attach_flags & BPF_F_ALLOW_MULTI) && + (attr->attach_flags & BPF_F_REPLACE)) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype); + if (IS_ERR(replace_prog)) { + cgroup_put(cgrp); + return PTR_ERR(replace_prog); + } + } + + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, attr->attach_flags); + + if (replace_prog) + bpf_prog_put(replace_prog); cgroup_put(cgrp); return ret; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index af6b738cf435..29d47aae0dd1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -222,8 +222,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, u32 pages, delta; int ret; - BUG_ON(fp_old == NULL); - size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) @@ -520,9 +518,9 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ -int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); +int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; -int bpf_jit_kallsyms __read_mostly; long bpf_jit_limit __read_mostly; static __always_inline void diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ef49e17ae47c..70f71b154fa5 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -72,17 +72,18 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); +static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); + +static int bq_flush_to_queue(struct xdp_bulk_queue *bq); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; - int ret, cpu; u64 cost; + int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -106,7 +107,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ ret = bpf_map_charge_init(&cmap->map.memory, cost); @@ -115,23 +115,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - cmap->flush_list = alloc_percpu(struct list_head); - if (!cmap->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_percpu; + goto free_charge; return &cmap->map; -free_percpu: - free_percpu(cmap->flush_list); free_charge: bpf_map_charge_finish(&cmap->map.memory); free_cmap: @@ -399,22 +390,14 @@ free_rcu: static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; - int cpu; /* This cpu_map_entry have been disconnected from map and one - * RCU graze-period have elapsed. Thus, XDP cannot queue any + * RCU grace-period have elapsed. Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); - /* Flush remaining packets in percpu bulkq */ - for_each_online_cpu(cpu) { - struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); - - /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(bq, false); - } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ put_cpu_map_entry(rcpu); @@ -436,7 +419,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) * percpu bulkq to queue. Due to caller map_delete_elem() disable * preemption, cannot call kthread_stop() to make sure queue is empty. * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU graze period before + * cpu_map_kthread_stop, which waits for an RCU grace period before * stopping kthread, emptying the queue. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, @@ -507,7 +490,6 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - int cpu; u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, @@ -522,18 +504,6 @@ static void cpu_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); - /* To ensure all pending flush operations have completed wait for flush - * list be empty on _all_ cpus. Because the above synchronize_rcu() - * ensures the map is disconnected from the program we can assume no new - * items will be added to the list. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - /* For cpu_map the remote CPUs can still be using the entries * (struct bpf_cpu_map_entry). */ @@ -544,10 +514,9 @@ static void cpu_map_free(struct bpf_map *map) if (!rcpu) continue; - /* bq flush and cleanup happens after RCU graze-period */ + /* bq flush and cleanup happens after RCU grace-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -599,7 +568,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -620,10 +589,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } @@ -642,11 +608,11 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -681,16 +647,26 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_flush(struct bpf_map *map) +void __cpu_map_flush(void) { - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { - bq_flush_to_queue(bq, true); + bq_flush_to_queue(bq); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); } } + +static int __init cpu_map_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu)); + return 0; +} + +subsys_initcall(cpu_map_init); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3d3d61b5985b..da9c832fc5c8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -75,7 +75,6 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ - struct list_head __percpu *flush_list; struct list_head list; /* these are only used for DEVMAP_HASH type maps */ @@ -85,6 +84,7 @@ struct bpf_dtab { u32 n_buckets; }; +static DEFINE_PER_CPU(struct list_head, dev_map_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); @@ -109,8 +109,8 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - int err, cpu; - u64 cost; + u64 cost = 0; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -125,9 +125,6 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) bpf_map_init_from_attr(&dtab->map, attr); - /* make sure page count doesn't overflow */ - cost = (u64) sizeof(struct list_head) * num_possible_cpus(); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); @@ -143,17 +140,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (err) return -EINVAL; - dtab->flush_list = alloc_percpu(struct list_head); - if (!dtab->flush_list) - goto free_charge; - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets); if (!dtab->dev_index_head) - goto free_percpu; + goto free_charge; spin_lock_init(&dtab->index_lock); } else { @@ -161,13 +151,11 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_percpu; + goto free_charge; } return 0; -free_percpu: - free_percpu(dtab->flush_list); free_charge: bpf_map_charge_finish(&dtab->map.memory); return -ENOMEM; @@ -201,7 +189,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i, cpu; + int i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -221,18 +209,6 @@ static void dev_map_free(struct bpf_map *map) /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); - /* To ensure all pending flush operations have completed wait for flush - * list to empty on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new items will be added. - */ - for_each_online_cpu(cpu) { - struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - - while (!list_empty(flush_list)) - cond_resched(); - } - if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; @@ -266,7 +242,6 @@ static void dev_map_free(struct bpf_map *map) bpf_map_area_free(dtab->netdev_map); } - free_percpu(dtab->flush_list); kfree(dtab); } @@ -345,8 +320,7 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } -static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, - bool in_napi_ctx) +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags) { struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; @@ -384,11 +358,7 @@ error: for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; - /* RX path under NAPI protection, can return frames faster */ - if (likely(in_napi_ctx)) - xdp_return_frame_rx_napi(xdpf); - else - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); drops++; } goto out; @@ -401,15 +371,14 @@ error: * net device can be torn down. On devmap tear down we ensure the flush list * is empty before completing to ensure all flush operations have completed. */ -void __dev_map_flush(struct bpf_map *map) +void __dev_map_flush(void) { - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq, *tmp; rcu_read_lock(); list_for_each_entry_safe(bq, tmp, flush_list, flush_node) - bq_xmit_all(bq, XDP_XMIT_FLUSH, true); + bq_xmit_all(bq, XDP_XMIT_FLUSH); rcu_read_unlock(); } @@ -436,11 +405,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); + struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(bq, 0, true); + bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -509,27 +478,11 @@ static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) return dev ? &dev->ifindex : NULL; } -static void dev_map_flush_old(struct bpf_dtab_netdev *dev) -{ - if (dev->dev->netdev_ops->ndo_xdp_xmit) { - struct xdp_bulk_queue *bq; - int cpu; - - rcu_read_lock(); - for_each_online_cpu(cpu) { - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(bq, XDP_XMIT_FLUSH, false); - } - rcu_read_unlock(); - } -} - static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); - dev_map_flush_old(dev); free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); @@ -810,10 +763,15 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + int cpu; + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu)); return 0; } diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c new file mode 100644 index 000000000000..204ee61a3904 --- /dev/null +++ b/kernel/bpf/dispatcher.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2019 Intel Corporation. */ + +#include <linux/hash.h> +#include <linux/bpf.h> +#include <linux/filter.h> + +/* The BPF dispatcher is a multiway branch code generator. The + * dispatcher is a mechanism to avoid the performance penalty of an + * indirect call, which is expensive when retpolines are enabled. A + * dispatch client registers a BPF program into the dispatcher, and if + * there is available room in the dispatcher a direct call to the BPF + * program will be generated. All calls to the BPF programs called via + * the dispatcher will then be a direct call, instead of an + * indirect. The dispatcher hijacks a trampoline function it via the + * __fentry__ of the trampoline. The trampoline function has the + * following signature: + * + * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi, + * unsigned int (*bpf_func)(const void *, + * const struct bpf_insn *)); + */ + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog( + struct bpf_dispatcher *d, struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (prog == d->progs[i].prog) + return &d->progs[i]; + } + return NULL; +} + +static struct bpf_dispatcher_prog *bpf_dispatcher_find_free( + struct bpf_dispatcher *d) +{ + return bpf_dispatcher_find_prog(d, NULL); +} + +static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (entry) { + refcount_inc(&entry->users); + return false; + } + + entry = bpf_dispatcher_find_free(d); + if (!entry) + return false; + + bpf_prog_inc(prog); + entry->prog = prog; + refcount_set(&entry->users, 1); + d->num_progs++; + return true; +} + +static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d, + struct bpf_prog *prog) +{ + struct bpf_dispatcher_prog *entry; + + if (!prog) + return false; + + entry = bpf_dispatcher_find_prog(d, prog); + if (!entry) + return false; + + if (refcount_dec_and_test(&entry->users)) { + entry->prog = NULL; + bpf_prog_put(prog); + d->num_progs--; + return true; + } + return false; +} + +int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs) +{ + return -ENOTSUPP; +} + +static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image) +{ + s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; + int i; + + for (i = 0; i < BPF_DISPATCHER_MAX; i++) { + if (d->progs[i].prog) + *ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func; + } + return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs); +} + +static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) +{ + void *old, *new; + u32 noff; + int err; + + if (!prev_num_progs) { + old = NULL; + noff = 0; + } else { + old = d->image + d->image_off; + noff = d->image_off ^ (PAGE_SIZE / 2); + } + + new = d->num_progs ? d->image + noff : NULL; + if (new) { + if (bpf_dispatcher_prepare(d, new)) + return; + } + + err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new); + if (err || !new) + return; + + d->image_off = noff; +} + +void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, + struct bpf_prog *to) +{ + bool changed = false; + int prev_num_progs; + + if (from == to) + return; + + mutex_lock(&d->mutex); + if (!d->image) { + d->image = bpf_jit_alloc_exec_page(); + if (!d->image) + goto out; + } + + prev_num_progs = d->num_progs; + changed |= bpf_dispatcher_remove_prog(d, from); + changed |= bpf_dispatcher_add_prog(d, to); + + if (!changed) + goto out; + + bpf_dispatcher_update(d, prev_num_progs); +out: + mutex_unlock(&d->mutex); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3461ec59570..81ee8595dfee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,7 @@ #include <linux/timekeeping.h> #include <linux/ctype.h> #include <linux/nospec.h> +#include <linux/audit.h> #include <uapi/linux/btf.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -1306,6 +1307,36 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return 0; } +enum bpf_audit { + BPF_AUDIT_LOAD, + BPF_AUDIT_UNLOAD, + BPF_AUDIT_MAX, +}; + +static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { + [BPF_AUDIT_LOAD] = "LOAD", + [BPF_AUDIT_UNLOAD] = "UNLOAD", +}; + +static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) +{ + struct audit_context *ctx = NULL; + struct audit_buffer *ab; + + if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) + return; + if (audit_enabled == AUDIT_OFF) + return; + if (op == BPF_AUDIT_LOAD) + ctx = audit_context(); + ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) + return; + audit_log_format(ab, "prog-id=%u op=%s", + prog->aux->id, bpf_audit_str[op]); + audit_log_end(ab); +} + int __bpf_prog_charge(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -1421,6 +1452,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic64_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); __bpf_prog_put_noref(prog, true); @@ -1830,6 +1862,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) @@ -2040,10 +2073,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd #define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) static int bpf_prog_attach(const union bpf_attr *attr) { @@ -2305,17 +2338,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id -static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +struct bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; - u32 id = attr->prog_id; - int fd; - - if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); spin_lock_bh(&prog_idr_lock); prog = idr_find(&prog_idr, id); @@ -2324,7 +2352,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) else prog = ERR_PTR(-ENOENT); spin_unlock_bh(&prog_idr_lock); + return prog; +} + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 23b0d5cfd47e..505f4e4b31d2 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -14,6 +14,22 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +void *bpf_jit_alloc_exec_page(void) +{ + void *image; + + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) + return NULL; + + set_vm_flush_reset_perms(image); + /* Keep image as writeable. The alternative is to keep flipping ro/rw + * everytime new program is attached or detached. + */ + set_memory_x((long)image, 1); + return image; +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -34,7 +50,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec(PAGE_SIZE); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -48,12 +64,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); - - set_vm_flush_reset_perms(image); - /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. - */ - set_memory_x((long)image, 1); tr->image = image; out: mutex_unlock(&trampoline_mutex); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 90c4fce1c981..2cc5c8f4c800 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -72,9 +72,9 @@ static void xsk_map_sock_delete(struct xdp_sock *xs, static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { struct bpf_map_memory mem; - int cpu, err, numa_node; + int err, numa_node; struct xsk_map *m; - u64 cost, size; + u64 size; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -86,9 +86,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) numa_node = bpf_map_attr_numa_node(attr); size = struct_size(m, xsk_map, attr->max_entries); - cost = size + array_size(sizeof(*m->flush_list), num_possible_cpus()); - err = bpf_map_charge_init(&mem, cost); + err = bpf_map_charge_init(&mem, size); if (err < 0) return ERR_PTR(err); @@ -102,16 +101,6 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) bpf_map_charge_move(&m->map.memory, &mem); spin_lock_init(&m->lock); - m->flush_list = alloc_percpu(struct list_head); - if (!m->flush_list) { - bpf_map_charge_finish(&m->map.memory); - bpf_map_area_free(m); - return ERR_PTR(-ENOMEM); - } - - for_each_possible_cpu(cpu) - INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); - return &m->map; } @@ -121,7 +110,6 @@ static void xsk_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_net(); - free_percpu(m->flush_list); bpf_map_area_free(m); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 735af8f15f95..725365df066d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6288,12 +6288,13 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + struct bpf_prog *replace_prog, enum bpf_attach_type type, + u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } |