From 735fc4054b3a25034445c6713d259da0f96f8131 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:12 +0200 Subject: xdp: change ndo_xdp_xmit API to support bulking This patch change the API for ndo_xdp_xmit to support bulking xdp_frames. When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown. Most of the slowdown is caused by DMA API indirect function calls, but also the net_device->ndo_xdp_xmit() call. Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed performance improved: for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps With frames avail as a bulk inside the driver ndo_xdp_xmit call, further optimizations are possible, like bulk DMA-mapping for TX. Testing without CONFIG_RETPOLINE show the same performance for physical NIC drivers. The virtual NIC driver tun sees a huge performance boost, as it can avoid doing per frame producer locking, but instead amortize the locking cost over the bulk. V2: Fix compile errors reported by kbuild test robot V4: Isolated ndo, driver changes and callers. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- drivers/net/tun.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'drivers/net/tun.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 44d4f3d25350..d3dcfcb1c4b3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_get_stats64 = tun_net_get_stats64, }; -static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame) +static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; - int ret = 0; + int drops = 0; + int cnt = n; + int i; rcu_read_lock(); numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { - ret = -ENOSPC; - goto out; + rcu_read_unlock(); + return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); - /* Encode the XDP flag into lowest bit for consumer to differ - * XDP buffer from sk_buff. - */ - if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) { - this_cpu_inc(tun->pcpu_stats->tx_dropped); - ret = -ENOSPC; + + spin_lock(&tfile->tx_ring.producer_lock); + for (i = 0; i < n; i++) { + struct xdp_frame *xdp = frames[i]; + /* Encode the XDP flag into lowest bit for consumer to differ + * XDP buffer from sk_buff. + */ + void *frame = tun_xdp_to_ptr(xdp); + + if (__ptr_ring_produce(&tfile->tx_ring, frame)) { + this_cpu_inc(tun->pcpu_stats->tx_dropped); + xdp_return_frame_rx_napi(xdp); + drops++; + } } + spin_unlock(&tfile->tx_ring.producer_lock); -out: rcu_read_unlock(); - return ret; + return cnt - drops; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) @@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) if (unlikely(!frame)) return -EOVERFLOW; - return tun_xdp_xmit(dev, frame); + return tun_xdp_xmit(dev, 1, &frame); } static void tun_xdp_flush(struct net_device *dev) -- cgit v1.2.3