From 67f29e07e131ffa13ea158c259a513f474c7df27 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:46 +0200 Subject: bpf: devmap introduce dev_map_enqueue Functionality is the same, but the ndo_xdp_xmit call is now simply invoked from inside the devmap.c code. V2: Fix compile issue reported by kbuild test robot V5: Cleanups requested by Daniel - Newlines before func definition - Use BUILD_BUG_ON checks - Remove unnecessary return value store in dev_map_enqueue Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 565f9ece9115..06c400e7e4ff 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,13 +48,15 @@ * calls will fail at this point. */ #include <linux/bpf.h> +#include <net/xdp.h> #include <linux/filter.h> +#include <trace/events/xdp.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct bpf_dtab_netdev { - struct net_device *dev; + struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; struct rcu_head rcu; @@ -240,21 +242,38 @@ void __dev_map_flush(struct bpf_map *map) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *dev; + struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; - dev = READ_ONCE(dtab->netdev_map[key]); - return dev ? dev->dev : NULL; + obj = READ_ONCE(dtab->netdev_map[key]); + return obj; +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +{ + struct net_device *dev = dst->dev; + struct xdp_frame *xdpf; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + /* TODO: implement a bulking/enqueue step later */ + return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { - struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); + struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); + struct net_device *dev = dev = obj ? obj->dev : NULL; return dev ? &dev->ifindex : NULL; } @@ -405,6 +424,9 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ + BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != + offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } -- cgit v1.2.3 From 5d053f9da4311a86bc58be8588bb5660fb3f0724 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:51 +0200 Subject: bpf: devmap prepare xdp frames for bulking Like cpumap, create a queue for xdp frames that will be bulked. For now, this patch simply invokes ndo_xdp_xmit for each frame. This happens either when the map flush operation is invoked, or when the limit DEV_MAP_BULK_SIZE is reached.
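The bulking scheme the message describes reduces to a small amount of per-CPU state and two operations. As a condensed sketch (the names match the diff that follows; this is an illustration of the scheme, not the patch itself):

	#define DEV_MAP_BULK_SIZE 16

	struct xdp_bulk_queue {
		struct xdp_frame *q[DEV_MAP_BULK_SIZE];
		unsigned int count;
	};

	/* Enqueue one frame on this CPU's queue; drain it through
	 * ndo_xdp_xmit first if it is already full. The second drain
	 * trigger is the map flush operation at napi->poll() end.
	 */
	static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
	{
		struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);

		if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
			bq_xmit_all(obj, bq);

		bq->q[bq->count++] = xdpf;
		return 0;
	}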
V5: Avoid memleak on error path in dev_map_update_elem() Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 06c400e7e4ff..15293b9dfb77 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -55,10 +55,17 @@ #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define DEV_MAP_BULK_SIZE 16 +struct xdp_bulk_queue { + struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + unsigned int count; +}; + struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; + struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; }; @@ -208,6 +215,34 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) __set_bit(bit, bitmap); } +static int bq_xmit_all(struct bpf_dtab_netdev *obj, + struct xdp_bulk_queue *bq) +{ + struct net_device *dev = obj->dev; + int i; + + if (unlikely(!bq->count)) + return 0; + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + prefetch(xdpf); + } + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + int err; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + if (err) + xdp_return_frame(xdpf); + } + bq->count = 0; + + return 0; +} + /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled @@ -223,6 +258,7 @@ void __dev_map_flush(struct bpf_map *map) for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); + struct xdp_bulk_queue *bq; struct net_device *netdev; /* This is possible if the dev entry is removed by user space @@ -232,6 +268,9 @@ void __dev_map_flush(struct bpf_map *map) continue; __clear_bit(bit, bitmap); + + bq = this_cpu_ptr(dev->bulkq); + bq_xmit_all(dev, bq); netdev = dev->dev; if (likely(netdev->netdev_ops->ndo_xdp_flush)) netdev->netdev_ops->ndo_xdp_flush(netdev); @@ -254,6 +293,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) return obj; } +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. 
+ */ +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + + if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) + bq_xmit_all(obj, bq); + + bq->q[bq->count++] = xdpf; + return 0; +} + int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) { struct net_device *dev = dst->dev; @@ -266,8 +319,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) if (unlikely(!xdpf)) return -EOVERFLOW; - /* TODO: implement a bulking/enqueue step later */ - return dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); + return bq_enqueue(dst, xdpf); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) @@ -282,13 +334,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_flush) { struct net_device *fl = dev->dev; + struct xdp_bulk_queue *bq; unsigned long *bitmap; + int cpu; for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); + bq = per_cpu_ptr(dev->bulkq, cpu); + bq_xmit_all(dev, bq); + fl->netdev_ops->ndo_xdp_flush(dev->dev); } } @@ -300,6 +357,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) dev = container_of(rcu, struct bpf_dtab_netdev, rcu); dev_map_flush_old(dev); + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -332,6 +390,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct net *net = current->nsproxy->net_ns; + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; @@ -346,13 +405,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); if (!dev) return -ENOMEM; + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return -ENOMEM; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { + free_percpu(dev->bulkq); kfree(dev); return -EINVAL; } -- cgit v1.2.3 From 38edddb81172e8b8decb057c0cd23271583a5fa0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:57 +0200 Subject: xdp: add tracepoint for devmap like cpumap have Notice how this allows us to get XDP statistics without affecting XDP performance, as the tracepoint is no longer activated on a per-packet basis. V5: Spotted by John Fastabend: 'sent' also counted 'drops' in this patch; a later patch corrects this, but it was a mistake in this intermediate step.
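Once the tracepoint below is in place, the per-bulk statistics can be read from userspace without touching the datapath at all. A minimal sketch of a consumer, assuming tracefs is mounted at /sys/kernel/tracing (the event name xdp_devmap_xmit and its fields come from the diff that follows):

	#include <stdio.h>

	int main(void)
	{
		char line[512];
		FILE *f;

		/* enable just this event (needs root) */
		f = fopen("/sys/kernel/tracing/events/xdp/xdp_devmap_xmit/enable", "w");
		if (!f)
			return 1;
		fputs("1", f);
		fclose(f);

		/* stream rendered events; each line looks roughly like
		 * "ndo_xdp_xmit map_id=7 map_index=0 action=REDIRECT sent=16 drops=0 ..."
		 */
		f = fopen("/sys/kernel/tracing/trace_pipe", "r");
		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}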
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++-- include/trace/events/xdp.h | 39 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/devmap.c | 27 +++++++++++++++++++++++---- net/core/filter.c | 2 +- 4 files changed, 67 insertions(+), 7 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 23a809da452d..bbe297436e5d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -492,7 +492,8 @@ struct xdp_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp); +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); @@ -579,7 +580,8 @@ struct xdp_buff; struct bpf_dtab_netdev; static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) { return 0; } diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 96104610d40e..2e9ef0650144 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -229,6 +229,45 @@ TRACE_EVENT(xdp_cpumap_enqueue, __entry->to_cpu) ); +TRACE_EVENT(xdp_devmap_xmit, + + TP_PROTO(const struct bpf_map *map, u32 map_index, + int sent, int drops, + const struct net_device *from_dev, + const struct net_device *to_dev), + + TP_ARGS(map, map_index, sent, drops, from_dev, to_dev), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(u32, map_index) + __field(int, drops) + __field(int, sent) + __field(int, from_ifindex) + __field(int, to_ifindex) + ), + + TP_fast_assign( + __entry->map_id = map->id; + __entry->act = XDP_REDIRECT; + __entry->map_index = map_index; + __entry->drops = drops; + __entry->sent = sent; + __entry->from_ifindex = from_dev->ifindex; + __entry->to_ifindex = to_dev->ifindex; + ), + + TP_printk("ndo_xdp_xmit" " map_id=%d map_index=%d action=%s" " sent=%d drops=%d" " from_ifindex=%d to_ifindex=%d", __entry->map_id, __entry->map_index, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->from_ifindex, __entry->to_ifindex) ); #endif /* _TRACE_XDP_H */ #include <trace/define_trace.h> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 15293b9dfb77..ff2f3bf59f2f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -58,6 +58,7 @@ #define DEV_MAP_BULK_SIZE 16 struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct net_device *dev_rx; unsigned int count; }; @@ -219,6 +220,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, struct xdp_bulk_queue *bq) { struct net_device *dev = obj->dev; + int sent = 0, drops = 0; int i; if (unlikely(!bq->count)) @@ -235,11 +237,18 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, int err; err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) + if (err) { + drops++; xdp_return_frame(xdpf); + } else { + sent++; + } } bq->count = 0; + trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + sent, drops, bq->dev_rx, dev); + bq->dev_rx = NULL; return 0; } @@ -296,18 +305,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq
under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, + struct net_device *dev_rx) + { struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(obj, bq); + /* Ingress dev_rx will be the same for all xdp_frame's in + * bulk_queue, because bq stored per-CPU and must be flushed + * from net_device drivers NAPI func end. + */ + if (!bq->dev_rx) + bq->dev_rx = dev_rx; + bq->q[bq->count++] = xdpf; return 0; } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; @@ -319,7 +338,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dst, xdpf); + return bq_enqueue(dst, xdpf, dev_rx); } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) diff --git a/net/core/filter.c b/net/core/filter.c index c867106d3707..36cf2f87d742 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3067,7 +3067,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, case BPF_MAP_TYPE_DEVMAP: { struct bpf_dtab_netdev *dst = fwd; - err = dev_map_enqueue(dst, xdp); + err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; __dev_map_insert_ctx(map, index); -- cgit v1.2.3 From 389ab7f01af988c2a1ec5617eb0c7e220df1ef1c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:07 +0200 Subject: xdp: introduce xdp_return_frame_rx_napi When sending an xdp_frame through the xdp_do_redirect call, error cases can happen where the xdp_frame needs to be dropped, and returning an -errno code isn't sufficient/possible any longer (e.g. for the cpumap case). This is already fully supported, by simply calling xdp_return_frame. This patch is an optimization, which provides xdp_return_frame_rx_napi, which is a faster variant for these error cases. It takes advantage of the protection provided by XDP RX running under NAPI protection. This change is mostly relevant for drivers using the page_pool allocator as it can take advantage of this. (Tested with mlx5). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/net/page_pool.h | 5 +++-- include/net/xdp.h | 1 + kernel/bpf/cpumap.c | 2 +- kernel/bpf/devmap.c | 2 +- net/core/xdp.c | 20 ++++++++++++++++---- 5 files changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index c79087153148..694d055e01ef 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -115,13 +115,14 @@ void page_pool_destroy(struct page_pool *pool); void __page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct); -static inline void page_pool_put_page(struct page_pool *pool, struct page *page) +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
*/ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, false); + __page_pool_put_page(pool, page, allow_direct); #endif } /* Very limited use-cases allow recycle direct */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 0b689cf561c7..7ad779237ae8 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c95b04ec103e..e0918d180f08 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } processed++; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ff2f3bf59f2f..a9cd5c93dd2b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -239,7 +239,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) { drops++; - xdp_return_frame(xdpf); + xdp_return_frame_rx_napi(xdpf); } else { sent++; } diff --git a/net/core/xdp.c b/net/core/xdp.c index bf6758f74339..cb8c4e061a5a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,7 +308,13 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -static void xdp_return(void *data, struct xdp_mem_info *mem) +/* XDP RX runs under NAPI protection, and in different delivery error + * scenarios (e.g. queue full), it is possible to return the xdp_frame + * while still leveraging this protection. The @napi_direct boolean + * is used for those call sites. Thus, allowing for faster recycling + * of xdp_frames/pages in those cases. + */ +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -320,7 +326,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); if (xa) - page_pool_put_page(xa->page_pool, page); + page_pool_put_page(xa->page_pool, page, napi_direct); else put_page(page); rcu_read_unlock(); @@ -340,12 +346,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem) void xdp_return_frame(struct xdp_frame *xdpf) { - xdp_return(xdpf->data, &xdpf->mem); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); +void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) +{ + __xdp_return(xdpf->data, &xdpf->mem, true); +} +EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); + void xdp_return_buff(struct xdp_buff *xdp) { - xdp_return(xdp->data, &xdp->rxq->mem); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_buff); -- cgit v1.2.3 From 735fc4054b3a25034445c6713d259da0f96f8131 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:12 +0200 Subject: xdp: change ndo_xdp_xmit API to support bulking This patch changes the API for ndo_xdp_xmit to support bulking xdp_frames. When the kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown. Most of the slowdown is caused by DMA API indirect function calls, but also the net_device->ndo_xdp_xmit() call.
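All the per-driver conversions below share one shape; a generic sketch of it (foo_xdp_xmit() and foo_xmit_one() are hypothetical stand-ins for i40e_xdp_xmit()/i40e_xmit_xdp_ring() and friends):

	/* One indirect ndo call now covers a whole bulk of frames, which
	 * is what amortizes the retpoline-era indirect-call overhead. */
	static int foo_xdp_xmit(struct net_device *dev, int n,
				struct xdp_frame **frames)
	{
		int drops = 0;
		int i;

		for (i = 0; i < n; i++) {
			struct xdp_frame *xdpf = frames[i];

			if (foo_xmit_one(dev, xdpf)) {
				/* driver owns failed frames: return them */
				xdp_return_frame_rx_napi(xdpf);
				drops++;
			}
		}
		return n - drops;	/* frames accepted for transmission */
	}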
Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed performance improved: for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps With frames available as a bulk inside the driver ndo_xdp_xmit call, further optimizations are possible, like bulk DMA-mapping for TX. Testing without CONFIG_RETPOLINE shows the same performance for physical NIC drivers. The virtual NIC driver tun sees a huge performance boost, as it can avoid doing per frame producer locking, but instead amortize the locking cost over the bulk. V2: Fix compile errors reported by kbuild test robot V4: Isolated ndo, driver changes and callers. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 26 ++++++++--- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 21 ++++++--- drivers/net/tun.c | 37 +++++++++------ drivers/net/virtio_net.c | 66 ++++++++++++++++++++------- include/linux/netdevice.h | 14 ++++-- kernel/bpf/devmap.c | 29 +++++++----- net/core/filter.c | 8 ++-- 8 files changed, 139 insertions(+), 64 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 5efa68de935b..9b698c5acd05 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev) * @dev: netdev * @xdp: XDP buffer * - * Returns Zero if sent, else an error code + * Returns number of frames successfully sent. Frames that fail are + * freed via XDP return API. + * + * For error cases, a negative errno code is returned and no frames + * are transmitted (caller must handle freeing frames).
**/ -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) { struct i40e_netdev_priv *np = netdev_priv(dev); unsigned int queue_index = smp_processor_id(); struct i40e_vsi *vsi = np->vsi; - int err; + int drops = 0; + int i; if (test_bit(__I40E_VSI_DOWN, vsi->state)) return -ENETDOWN; @@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) return -ENXIO; - err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); - if (err != I40E_XDP_TX) - return -ENOSPC; + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + int err; - return 0; + err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]); + if (err != I40E_XDP_TX) { + xdp_return_frame_rx_napi(xdpf); + drops++; + } + } + + return n - drops; } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index fdd2c55f03a6..eb8804b3d7b6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw); void i40e_detect_recover_hung(struct i40e_vsi *vsi); int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); -int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf); +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames); void i40e_xdp_flush(struct net_device *dev); /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6652b201df5b..9645619f7729 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10017,11 +10017,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) +static int ixgbe_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_ring *ring; - int err; + int drops = 0; + int i; if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) return -ENETDOWN; @@ -10033,11 +10035,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) if (unlikely(!ring)) return -ENXIO; - err = ixgbe_xmit_xdp_ring(adapter, xdpf); - if (err != IXGBE_XDP_TX) - return -ENOSPC; + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + int err; - return 0; + err = ixgbe_xmit_xdp_ring(adapter, xdpf); + if (err != IXGBE_XDP_TX) { + xdp_return_frame_rx_napi(xdpf); + drops++; + } + } + + return n - drops; } static void ixgbe_xdp_flush(struct net_device *dev) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 44d4f3d25350..d3dcfcb1c4b3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_get_stats64 = tun_net_get_stats64, }; -static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame) +static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; - int ret = 0; + int drops = 0; + int cnt = n; + int i; rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { - ret = -ENOSPC; - goto 
out; + rcu_read_unlock(); + return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); - /* Encode the XDP flag into lowest bit for consumer to differ - * XDP buffer from sk_buff. - */ - if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) { - this_cpu_inc(tun->pcpu_stats->tx_dropped); - ret = -ENOSPC; + + spin_lock(&tfile->tx_ring.producer_lock); + for (i = 0; i < n; i++) { + struct xdp_frame *xdp = frames[i]; + /* Encode the XDP flag into lowest bit for consumer to differ + * XDP buffer from sk_buff. + */ + void *frame = tun_xdp_to_ptr(xdp); + + if (__ptr_ring_produce(&tfile->tx_ring, frame)) { + this_cpu_inc(tun->pcpu_stats->tx_dropped); + xdp_return_frame_rx_napi(xdp); + drops++; + } } + spin_unlock(&tfile->tx_ring.producer_lock); -out: rcu_read_unlock(); - return ret; + return cnt - drops; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) @@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) if (unlikely(!frame)) return -EOVERFLOW; - return tun_xdp_xmit(dev, frame); + return tun_xdp_xmit(dev, 1, &frame); } static void tun_xdp_flush(struct net_device *dev) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f34794a76c4d..39a0783d1cde 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev) virtqueue_kick(sq->vq); } -static int __virtnet_xdp_xmit(struct virtnet_info *vi, - struct xdp_frame *xdpf) +static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, + struct send_queue *sq, + struct xdp_frame *xdpf) { struct virtio_net_hdr_mrg_rxbuf *hdr; - struct xdp_frame *xdpf_sent; - struct send_queue *sq; - unsigned int len; - unsigned int qp; int err; - qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); - sq = &vi->sq[qp]; - - /* Free up any pending old buffers before queueing new ones. */ - while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) - xdp_return_frame(xdpf_sent); - /* virtqueue want to use data area in-front of packet */ if (unlikely(xdpf->metasize > 0)) return -EOPNOTSUPP; @@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi, return 0; } -static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) +static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi, + struct xdp_frame *xdpf) +{ + struct xdp_frame *xdpf_sent; + struct send_queue *sq; + unsigned int len; + unsigned int qp; + + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); + sq = &vi->sq[qp]; + + /* Free up any pending old buffers before queueing new ones. */ + while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) + xdp_return_frame(xdpf_sent); + + return __virtnet_xdp_xmit_one(vi, sq, xdpf); +} + +static int virtnet_xdp_xmit(struct net_device *dev, + int n, struct xdp_frame **frames) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq = vi->rq; + struct xdp_frame *xdpf_sent; struct bpf_prog *xdp_prog; + struct send_queue *sq; + unsigned int len; + unsigned int qp; + int drops = 0; + int err; + int i; + + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); + sq = &vi->sq[qp]; /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this * indicate XDP resources have been successfully allocated. 
@@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf) if (!xdp_prog) return -ENXIO; - return __virtnet_xdp_xmit(vi, xdpf); + /* Free up any pending old buffers before queueing new ones. */ + while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) + xdp_return_frame(xdpf_sent); + + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + + err = __virtnet_xdp_xmit_one(vi, sq, xdpf); + if (err) { + xdp_return_frame_rx_napi(xdpf); + drops++; + } + } + return n - drops; } static unsigned int virtnet_get_headroom(struct virtnet_info *vi) @@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev, xdpf = convert_to_xdp_frame(&xdp); if (unlikely(!xdpf)) goto err_xdp; - err = __virtnet_xdp_xmit(vi, xdpf); + err = __virtnet_xdp_tx_xmit(vi, xdpf); if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); goto err_xdp; @@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, xdpf = convert_to_xdp_frame(&xdp); if (unlikely(!xdpf)) goto err_xdp; - err = __virtnet_xdp_xmit(vi, xdpf); + err = __virtnet_xdp_tx_xmit(vi, xdpf); if (unlikely(err)) { trace_xdp_exception(vi->dev, xdp_prog, act); if (unlikely(xdp_page != page)) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03ed492c4e14..debdb6286170 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1185,9 +1185,13 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); - * This function is used to submit a XDP packet for transmit on a - * netdevice. + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * This function is used to submit @n XDP packets for transmit on a + * netdevice. Returns number of frames successfully transmitted, frames + * that got dropped are freed/returned via xdp_return_frame(). + * Returns negative number, means general error invoking ndo, meaning + * no frames were xmit'ed and core-caller will free all frames. + * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. 
@@ -1375,8 +1379,8 @@ struct net_device_ops { int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); - int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_frame *xdp); + int (*ndo_xdp_xmit)(struct net_device *dev, int n, + struct xdp_frame **xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a9cd5c93dd2b..77908311ec98 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -232,24 +232,31 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, prefetch(xdpf); } - for (i = 0; i < bq->count; i++) { - struct xdp_frame *xdpf = bq->q[i]; - int err; - - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) { - drops++; - xdp_return_frame_rx_napi(xdpf); - } else { - sent++; - } + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); + if (sent < 0) { + sent = 0; + goto error; } + drops = bq->count - sent; +out: bq->count = 0; trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev); bq->dev_rx = NULL; return 0; +error: + /* If ndo_xdp_xmit fails with an errno, no frames have been + * xmit'ed and it's our responsibility to free them all. + */ + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + /* RX path under NAPI protection, can return frames faster */ + xdp_return_frame_rx_napi(xdpf); + drops++; + } + goto out; } /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled diff --git a/net/core/filter.c b/net/core/filter.c index 36cf2f87d742..1d75f9322275 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3039,7 +3039,7 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int err; + int sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } @@ -3049,9 +3049,9 @@ static int __bpf_tx_xdp(struct net_device *dev, if (unlikely(!xdpf)) return -EOVERFLOW; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); - if (err) - return err; + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf); + if (sent <= 0) + return sent; dev->netdev_ops->ndo_xdp_flush(dev); return 0; } -- cgit v1.2.3 From e74de52e55c092e7113f839e74400ce9dbe12ceb Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:17 +0200 Subject: xdp/trace: extend tracepoint in devmap with an err Extending the tracepoint xdp:xdp_devmap_xmit in devmap with an err code allows people to more easily identify why the ndo_xdp_xmit call to a given driver is failing.
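With the extra field, the two failure modes visible in bq_xmit_all() can be told apart when decoding an event; roughly (the errno values are the ones used by the drivers converted above, listed here for illustration):

	/* sketch: interpreting one xdp_devmap_xmit event */
	if (err < 0) {
		/* whole bulk rejected before any TX attempt: sent == 0 and
		 * devmap freed every frame itself; e.g. -ENETDOWN (device
		 * down) or -ENXIO (XDP TX resources not configured). */
	} else {
		/* ndo_xdp_xmit ran: 'drops' counts individual frames the
		 * driver failed to queue and already freed/returned. */
	}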
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/trace/events/xdp.h | 10 ++++++---- kernel/bpf/devmap.c | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 2e9ef0650144..1ecf4c67fcf7 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -234,9 +234,9 @@ TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct bpf_map *map, u32 map_index, int sent, int drops, const struct net_device *from_dev, - const struct net_device *to_dev), + const struct net_device *to_dev, int err), - TP_ARGS(map, map_index, sent, drops, from_dev, to_dev), + TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), TP_STRUCT__entry( __field(int, map_id) @@ -246,6 +246,7 @@ TRACE_EVENT(xdp_devmap_xmit, __field(int, sent) __field(int, from_ifindex) __field(int, to_ifindex) + __field(int, err) ), TP_fast_assign( @@ -256,16 +257,17 @@ TRACE_EVENT(xdp_devmap_xmit, __entry->sent = sent; __entry->from_ifindex = from_dev->ifindex; __entry->to_ifindex = to_dev->ifindex; + __entry->err = err; ), TP_printk("ndo_xdp_xmit" " map_id=%d map_index=%d action=%s" " sent=%d drops=%d" - " from_ifindex=%d to_ifindex=%d", + " from_ifindex=%d to_ifindex=%d err=%d", __entry->map_id, __entry->map_index, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, - __entry->from_ifindex, __entry->to_ifindex) + __entry->from_ifindex, __entry->to_ifindex, __entry->err) ); #endif /* _TRACE_XDP_H */ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 77908311ec98..ae16d0c373ef 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -220,7 +220,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, struct xdp_bulk_queue *bq) { struct net_device *dev = obj->dev; - int sent = 0, drops = 0; + int sent = 0, drops = 0, err = 0; int i; if (unlikely(!bq->count)) @@ -234,6 +234,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); if (sent < 0) { + err = sent; sent = 0; goto error; } @@ -242,7 +243,7 @@ out: bq->count = 0; trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, - sent, drops, bq->dev_rx, dev); + sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; return 0; error: -- cgit v1.2.3 From 71b2c87df3ac37f5f83e166db136b0c1d065a781 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 30 May 2018 16:09:16 +0100 Subject: bpf: devmap: remove redundant assignment of dev = dev The assignment dev = dev is redundant and should be removed. Detected by CoverityScan, CID#1469486 ("Evaluation order violation") Signed-off-by: Colin Ian King Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf/devmap.c') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ae16d0c373ef..1fe3fe60508a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -352,7 +352,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); - struct net_device *dev = dev = obj ? obj->dev : NULL; + struct net_device *dev = obj ? obj->dev : NULL; return dev ? 
&dev->ifindex : NULL; } -- cgit v1.2.3 From 42b33468987bac0dd95c30f14820c7abac04a153 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 10:59:47 +0200 Subject: xdp: add flags argument to ndo_xdp_xmit API This patch only changes the API and rejects any use of flags. This is an intermediate step that allows us to implement the flush flag operation later, for each individual driver in a separate patch. The plan is to implement flush operation via XDP_XMIT_FLUSH flag and then remove XDP_XMIT_FLAGS_NONE when done. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 6 +++++- drivers/net/ethernet/intel/i40e/i40e_txrx.h | 3 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 ++++- drivers/net/tun.c | 8 ++++++-- drivers/net/virtio_net.c | 5 ++++- include/linux/netdevice.h | 7 ++++--- include/net/xdp.h | 5 +++++ kernel/bpf/devmap.c | 2 +- net/core/filter.c | 2 +- 9 files changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 9b698c5acd05..c0451d6e0790 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3670,7 +3670,8 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev) * For error cases, a negative errno code is returned and no frames * are transmitted (caller must handle freeing frames). **/ -int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags) { struct i40e_netdev_priv *np = netdev_priv(dev); unsigned int queue_index = smp_processor_id(); @@ -3684,6 +3685,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) return -ENXIO; + if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE)) + return -EINVAL; + for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; int err; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index eb8804b3d7b6..820f76db251b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -487,7 +487,8 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw); void i40e_detect_recover_hung(struct i40e_vsi *vsi); int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); bool __i40e_chk_linearize(struct sk_buff *skb); -int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames); +int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags); void i40e_xdp_flush(struct net_device *dev); /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 031d65c4178d..87f088f4af52 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10023,7 +10023,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) } static int ixgbe_xdp_xmit(struct net_device *dev, int n, - struct xdp_frame **frames) + struct xdp_frame **frames, u32 flags) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_ring *ring; @@ -10033,6 +10033,9 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n, if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) return -ENETDOWN; + if (unlikely(flags &
~XDP_XMIT_FLAGS_NONE)) + return -EINVAL; + /* During program transitions its possible adapter->xdp_prog is assigned * but ring has not been configured yet. In this case simply abort xmit. */ diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2265d2ccea47..b182b8cdd219 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1285,7 +1285,8 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_get_stats64 = tun_net_get_stats64, }; -static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) +static int tun_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; @@ -1294,6 +1295,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames int cnt = n; int i; + if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE)) + return -EINVAL; + rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); @@ -1332,7 +1336,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) if (unlikely(!frame)) return -EOVERFLOW; - return tun_xdp_xmit(dev, 1, &frame); + return tun_xdp_xmit(dev, 1, &frame, 0); } static void tun_xdp_flush(struct net_device *dev) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b2647dd5d302..4ed823625953 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -468,7 +468,7 @@ static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi, } static int virtnet_xdp_xmit(struct net_device *dev, - int n, struct xdp_frame **frames) + int n, struct xdp_frame **frames, u32 flags) { struct virtnet_info *vi = netdev_priv(dev); struct receive_queue *rq = vi->rq; @@ -481,6 +481,9 @@ static int virtnet_xdp_xmit(struct net_device *dev, int err; int i; + if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE)) + return -EINVAL; + qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); sq = &vi->sq[qp]; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8452f72087ef..7f17785a59d7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1185,13 +1185,13 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, + * u32 flags); * This function is used to submit @n XDP packets for transmit on a * netdevice. Returns number of frames successfully transmitted, frames * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. - * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. 
@@ -1380,7 +1380,8 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, int n, - struct xdp_frame **xdp); + struct xdp_frame **xdp, + u32 flags); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/include/net/xdp.h b/include/net/xdp.h index 7ad779237ae8..0c45f0f943ed 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,11 @@ enum xdp_mem_type { MEM_TYPE_MAX, }; +/* XDP flags for ndo_xdp_xmit */ +#define XDP_XMIT_FLAGS_NONE 0U +#define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ +#define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH + struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1fe3fe60508a..037e234056f7 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -232,7 +232,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, prefetch(xdpf); } - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q); + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0); if (sent < 0) { err = sent; sent = 0; diff --git a/net/core/filter.c b/net/core/filter.c index 28e864777c0f..56e40dafdde7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3056,7 +3056,7 @@ static int __bpf_tx_xdp(struct net_device *dev, if (unlikely(!xdpf)) return -EOVERFLOW; - sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf); + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0); if (sent <= 0) return sent; dev->netdev_ops->ndo_xdp_flush(dev); -- cgit v1.2.3 From c1ece6b245bd12a57124da78abafbf8a511394d6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 11:00:23 +0200 Subject: bpf/xdp: devmap can avoid calling ndo_xdp_flush The XDP_REDIRECT map devmap can avoid using ndo_xdp_flush, by instead instructing ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag in appropriate places. Notice after this patch it is possible to remove ndo_xdp_flush completely, as this is the last user of ndo_xdp_flush. This is left for later patches, to keep driver changes separate. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/devmap.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel/bpf/devmap.c') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 037e234056f7..a7cc7b3494a9 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -217,7 +217,7 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) } static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq) + struct xdp_bulk_queue *bq, u32 flags) { struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; @@ -232,7 +232,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj, prefetch(xdpf); } - sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0); + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); if (sent < 0) { err = sent; sent = 0; @@ -276,7 +276,6 @@ void __dev_map_flush(struct bpf_map *map) for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); struct xdp_bulk_queue *bq; - struct net_device *netdev; /* This is possible if the dev entry is removed by user space * between xdp redirect and flush op. 
@@ -287,10 +286,7 @@ void __dev_map_flush(struct bpf_map *map) __clear_bit(bit, bitmap); bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq); - netdev = dev->dev; - if (likely(netdev->netdev_ops->ndo_xdp_flush)) - netdev->netdev_ops->ndo_xdp_flush(netdev); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); } } @@ -320,7 +316,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq); + bq_xmit_all(obj, bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -359,8 +355,7 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { - if (dev->dev->netdev_ops->ndo_xdp_flush) { - struct net_device *fl = dev->dev; + if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; unsigned long *bitmap; @@ -371,9 +366,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) __clear_bit(dev->bit, bitmap); bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq); - - fl->netdev_ops->ndo_xdp_flush(dev->dev); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); } } } -- cgit v1.2.3
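Taken together, the last two patches point at the end state for drivers: honor XDP_XMIT_FLUSH inside ndo_xdp_xmit and delete ndo_xdp_flush. A sketch of that end state (not part of this series; at this point the drivers above still reject the flag, and foo_xmit_one()/foo_ring_doorbell() are hypothetical per-frame TX and tail-pointer helpers):

	static int foo_xdp_xmit(struct net_device *dev, int n,
				struct xdp_frame **frames, u32 flags)
	{
		int drops = 0;
		int i;

		if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
			return -EINVAL;

		for (i = 0; i < n; i++)
			if (foo_xmit_one(dev, frames[i])) {
				xdp_return_frame_rx_napi(frames[i]);
				drops++;
			}

		if (flags & XDP_XMIT_FLUSH)
			foo_ring_doorbell(dev);	/* one doorbell per bulk */

		return n - drops;
	}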