summaryrefslogtreecommitdiffstats
path: root/drivers/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2018-09-13 09:25:41 -0700
committerDavid S. Miller <davem@davemloft.net>2018-09-13 09:25:41 -0700
commit8bb83b78382c93ad05d0e898a9018d5aa3a24b79 (patch)
tree29727608e82f7bd2aab425093725e70978d202ff /drivers/net
parent9708d2b5b7c648e8e0a40d11e8cea12f6277f33c (diff)
parent0a0be13b8fe2cac11da2063fb03f0f39359b3069 (diff)
downloadlinux-8bb83b78382c93ad05d0e898a9018d5aa3a24b79.tar.bz2
Merge branch 'vhost_net-TX-batching'
Jason Wang says: ==================== vhost_net TX batching This series tries to batch submitting packets to underlayer socket through msg_control during sendmsg(). This is done by: 1) Doing userspace copy inside vhost_net 2) Build XDP buff 3) Batch at most 64 (VHOST_NET_BATCH) XDP buffs and submit them once through msg_control during sendmsg(). 4) Underlayer sockets can use XDP buffs directly when XDP is enalbed, or build skb based on XDP buff. For the packet that can not be built easily with XDP or for the case that batch submission is hard (e.g sndbuf is limited). We will go for the previous slow path, passing iov iterator to underlayer socket through sendmsg() once per packet. This can help to improve cache utilization and avoid lots of indirect calls with sendmsg(). It can also co-operate with the batching support of the underlayer sockets (e.g the case of XDP redirection through maps). Testpmd(txonly) in guest shows obvious improvements: Test /+pps% XDP_DROP on TAP /+44.8% XDP_REDIRECT on TAP /+29% macvtap (skb) /+26% Netperf TCP_STREAM TX from guest shows obvious improvements on small packet: size/session/+thu%/+normalize% 64/ 1/ +2%/ 0% 64/ 2/ +3%/ +1% 64/ 4/ +7%/ +5% 64/ 8/ +8%/ +6% 256/ 1/ +3%/ 0% 256/ 2/ +10%/ +7% 256/ 4/ +26%/ +22% 256/ 8/ +27%/ +23% 512/ 1/ +3%/ +2% 512/ 2/ +19%/ +14% 512/ 4/ +43%/ +40% 512/ 8/ +45%/ +41% 1024/ 1/ +4%/ 0% 1024/ 2/ +27%/ +21% 1024/ 4/ +38%/ +73% 1024/ 8/ +15%/ +24% 2048/ 1/ +10%/ +7% 2048/ 2/ +16%/ +12% 2048/ 4/ 0%/ +2% 2048/ 8/ 0%/ +2% 4096/ 1/ +36%/ +60% 4096/ 2/ -11%/ -26% 4096/ 4/ 0%/ +14% 4096/ 8/ 0%/ +4% 16384/ 1/ -1%/ +5% 16384/ 2/ 0%/ +2% 16384/ 4/ 0%/ -3% 16384/ 8/ 0%/ +4% 65535/ 1/ 0%/ +10% 65535/ 2/ 0%/ +8% 65535/ 4/ 0%/ +1% 65535/ 8/ 0%/ +3% Please review. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net')
-rw-r--r--drivers/net/tap.c88
-rw-r--r--drivers/net/tun.c267
2 files changed, 290 insertions, 65 deletions
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index f0f7cd977667..a4ab4a791fe7 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -619,7 +619,7 @@ static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad,
#define TAP_RESERVE HH_DATA_OFF(ETH_HLEN)
/* Get packet from user space buffer */
-static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
+static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
struct iov_iter *from, int noblock)
{
int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
@@ -663,7 +663,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
if (unlikely(len < ETH_HLEN))
goto err;
- if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
+ if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
struct iov_iter i;
copylen = vnet_hdr.hdr_len ?
@@ -724,11 +724,11 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
tap = rcu_dereference(q->tap);
/* copy skb_ubuf_info for callback when skb has no error */
if (zerocopy) {
- skb_shinfo(skb)->destructor_arg = m->msg_control;
+ skb_shinfo(skb)->destructor_arg = msg_control;
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
- } else if (m && m->msg_control) {
- struct ubuf_info *uarg = m->msg_control;
+ } else if (msg_control) {
+ struct ubuf_info *uarg = msg_control;
uarg->callback(uarg, false);
}
@@ -1146,11 +1146,87 @@ static const struct file_operations tap_fops = {
#endif
};
+static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
+{
+ struct tun_xdp_hdr *hdr = xdp->data_hard_start;
+ struct virtio_net_hdr *gso = &hdr->gso;
+ int buflen = hdr->buflen;
+ int vnet_hdr_len = 0;
+ struct tap_dev *tap;
+ struct sk_buff *skb;
+ int err, depth;
+
+ if (q->flags & IFF_VNET_HDR)
+ vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
+
+ skb = build_skb(xdp->data_hard_start, buflen);
+ if (!skb) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ skb_put(skb, xdp->data_end - xdp->data);
+
+ skb_set_network_header(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb->protocol = eth_hdr(skb)->h_proto;
+
+ if (vnet_hdr_len) {
+ err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
+ if (err)
+ goto err_kfree;
+ }
+
+ skb_probe_transport_header(skb, ETH_HLEN);
+
+ /* Move network header to the right position for VLAN tagged packets */
+ if ((skb->protocol == htons(ETH_P_8021Q) ||
+ skb->protocol == htons(ETH_P_8021AD)) &&
+ __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
+ skb_set_network_header(skb, depth);
+
+ rcu_read_lock();
+ tap = rcu_dereference(q->tap);
+ if (tap) {
+ skb->dev = tap->dev;
+ dev_queue_xmit(skb);
+ } else {
+ kfree_skb(skb);
+ }
+ rcu_read_unlock();
+
+ return 0;
+
+err_kfree:
+ kfree_skb(skb);
+err:
+ rcu_read_lock();
+ tap = rcu_dereference(q->tap);
+ if (tap && tap->count_tx_dropped)
+ tap->count_tx_dropped(tap);
+ rcu_read_unlock();
+ return err;
+}
+
static int tap_sendmsg(struct socket *sock, struct msghdr *m,
size_t total_len)
{
struct tap_queue *q = container_of(sock, struct tap_queue, sock);
- return tap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT);
+ struct tun_msg_ctl *ctl = m->msg_control;
+ struct xdp_buff *xdp;
+ int i;
+
+ if (ctl && (ctl->type == TUN_MSG_PTR)) {
+ for (i = 0; i < ctl->num; i++) {
+ xdp = &((struct xdp_buff *)ctl->ptr)[i];
+ tap_get_user_xdp(q, xdp);
+ }
+ return 0;
+ }
+
+ return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
+ m->msg_flags & MSG_DONTWAIT);
}
static int tap_recvmsg(struct socket *sock, struct msghdr *m,
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ebd07ad82431..2a2cd35853b7 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -113,7 +113,6 @@ do { \
} while (0)
#endif
-#define TUN_HEADROOM 256
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
/* TUN device flags */
@@ -869,6 +868,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
tun_napi_init(tun, tfile, napi);
}
+ if (rtnl_dereference(tun->xdp_prog))
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+
tun_set_real_num_queues(tun);
/* device is allowed to go away first, so no need to hold extra
@@ -1241,13 +1243,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netlink_ext_ack *extack)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct tun_file *tfile;
struct bpf_prog *old_prog;
+ int i;
old_prog = rtnl_dereference(tun->xdp_prog);
rcu_assign_pointer(tun->xdp_prog, prog);
if (old_prog)
bpf_prog_put(old_prog);
+ for (i = 0; i < tun->numqueues; i++) {
+ tfile = rtnl_dereference(tun->tfiles[i]);
+ if (prog)
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+ else
+ sock_reset_flag(&tfile->sk, SOCK_XDP);
+ }
+ list_for_each_entry(tfile, &tun->disabled, next) {
+ if (prog)
+ sock_set_flag(&tfile->sk, SOCK_XDP);
+ else
+ sock_reset_flag(&tfile->sk, SOCK_XDP);
+ }
+
return 0;
}
@@ -1617,6 +1635,55 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
return true;
}
+static struct sk_buff *__tun_build_skb(struct page_frag *alloc_frag, char *buf,
+ int buflen, int len, int pad)
+{
+ struct sk_buff *skb = build_skb(buf, buflen);
+
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ skb_reserve(skb, pad);
+ skb_put(skb, len);
+
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ return skb;
+}
+
+static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ struct xdp_buff *xdp, u32 act)
+{
+ int err;
+
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+ if (err)
+ return err;
+ break;
+ case XDP_TX:
+ err = tun_xdp_tx(tun->dev, xdp);
+ if (err < 0)
+ return err;
+ break;
+ case XDP_PASS:
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ break;
+ }
+
+ return act;
+}
+
static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct tun_file *tfile,
struct iov_iter *from,
@@ -1624,18 +1691,17 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
int len, int *skb_xdp)
{
struct page_frag *alloc_frag = &current->task_frag;
- struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- unsigned int delta = 0;
char *buf;
size_t copied;
- int err, pad = TUN_RX_PAD;
+ int pad = TUN_RX_PAD;
+ int err = 0;
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog)
- pad += TUN_HEADROOM;
+ pad += XDP_PACKET_HEADROOM;
buflen += SKB_DATA_ALIGN(len + pad);
rcu_read_unlock();
@@ -1654,17 +1720,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
* of xdp_prog above, this should be rare and for simplicity
* we do XDP on skb in case the headroom is not enough.
*/
- if (hdr->gso_type || !xdp_prog)
+ if (hdr->gso_type || !xdp_prog) {
*skb_xdp = 1;
- else
- *skb_xdp = 0;
+ return __tun_build_skb(alloc_frag, buf, buflen, len, pad);
+ }
+
+ *skb_xdp = 0;
local_bh_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
- if (xdp_prog && !*skb_xdp) {
+ if (xdp_prog) {
struct xdp_buff xdp;
- void *orig_data;
u32 act;
xdp.data_hard_start = buf;
@@ -1672,66 +1739,33 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &tfile->xdp_rxq;
- orig_data = xdp.data;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
- switch (act) {
- case XDP_REDIRECT:
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
- err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
- xdp_do_flush_map();
- if (err)
- goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
- case XDP_TX:
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ if (act == XDP_REDIRECT || act == XDP_TX) {
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
- if (tun_xdp_tx(tun->dev, &xdp) < 0)
- goto err_redirect;
- rcu_read_unlock();
- local_bh_enable();
- return NULL;
- case XDP_PASS:
- delta = orig_data - xdp.data;
- len = xdp.data_end - xdp.data;
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
- case XDP_ABORTED:
- trace_xdp_exception(tun->dev, xdp_prog, act);
- /* fall through */
- case XDP_DROP:
- goto err_xdp;
}
- }
+ err = tun_xdp_act(tun, xdp_prog, &xdp, act);
+ if (err < 0)
+ goto err_xdp;
+ if (err == XDP_REDIRECT)
+ xdp_do_flush_map();
+ if (err != XDP_PASS)
+ goto out;
- skb = build_skb(buf, buflen);
- if (!skb) {
- rcu_read_unlock();
- local_bh_enable();
- return ERR_PTR(-ENOMEM);
+ pad = xdp.data - xdp.data_hard_start;
+ len = xdp.data_end - xdp.data;
}
-
- skb_reserve(skb, pad - delta);
- skb_put(skb, len);
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
-
rcu_read_unlock();
local_bh_enable();
- return skb;
+ return __tun_build_skb(alloc_frag, buf, buflen, len, pad);
-err_redirect:
- put_page(alloc_frag->page);
err_xdp:
+ put_page(alloc_frag->page);
+out:
rcu_read_unlock();
local_bh_enable();
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
return NULL;
}
@@ -2392,18 +2426,133 @@ static void tun_sock_write_space(struct sock *sk)
kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}
+static int tun_xdp_one(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct xdp_buff *xdp, int *flush)
+{
+ struct tun_xdp_hdr *hdr = xdp->data_hard_start;
+ struct virtio_net_hdr *gso = &hdr->gso;
+ struct tun_pcpu_stats *stats;
+ struct bpf_prog *xdp_prog;
+ struct sk_buff *skb = NULL;
+ u32 rxhash = 0, act;
+ int buflen = hdr->buflen;
+ int err = 0;
+ bool skb_xdp = false;
+
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ if (gso->gso_type) {
+ skb_xdp = true;
+ goto build;
+ }
+ xdp_set_data_meta_invalid(xdp);
+ xdp->rxq = &tfile->xdp_rxq;
+
+ act = bpf_prog_run_xdp(xdp_prog, xdp);
+ err = tun_xdp_act(tun, xdp_prog, xdp, act);
+ if (err < 0) {
+ put_page(virt_to_head_page(xdp->data));
+ return err;
+ }
+
+ switch (err) {
+ case XDP_REDIRECT:
+ *flush = true;
+ /* fall through */
+ case XDP_TX:
+ return 0;
+ case XDP_PASS:
+ break;
+ default:
+ put_page(virt_to_head_page(xdp->data));
+ return 0;
+ }
+ }
+
+build:
+ skb = build_skb(xdp->data_hard_start, buflen);
+ if (!skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ skb_put(skb, xdp->data_end - xdp->data);
+
+ if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+ kfree_skb(skb);
+ err = -EINVAL;
+ goto out;
+ }
+
+ skb->protocol = eth_type_trans(skb, tun->dev);
+ skb_reset_network_header(skb);
+ skb_probe_transport_header(skb, 0);
+
+ if (skb_xdp) {
+ err = do_xdp_generic(xdp_prog, skb);
+ if (err != XDP_PASS)
+ goto out;
+ }
+
+ if (!rcu_dereference(tun->steering_prog))
+ rxhash = __skb_get_hash_symmetric(skb);
+
+ netif_receive_skb(skb);
+
+ stats = get_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
+ put_cpu_ptr(stats);
+
+ if (rxhash)
+ tun_flow_update(tun, rxhash, tfile);
+
+out:
+ return err;
+}
+
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
- int ret;
+ int ret, i;
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
+ struct tun_msg_ctl *ctl = m->msg_control;
+ struct xdp_buff *xdp;
if (!tun)
return -EBADFD;
- ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+ if (ctl && (ctl->type == TUN_MSG_PTR)) {
+ int n = ctl->num;
+ int flush = 0;
+
+ local_bh_disable();
+ rcu_read_lock();
+
+ for (i = 0; i < n; i++) {
+ xdp = &((struct xdp_buff *)ctl->ptr)[i];
+ tun_xdp_one(tun, tfile, xdp, &flush);
+ }
+
+ if (flush)
+ xdp_do_flush_map();
+
+ rcu_read_unlock();
+ local_bh_enable();
+
+ ret = total_len;
+ goto out;
+ }
+
+ ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
m->msg_flags & MSG_DONTWAIT,
m->msg_flags & MSG_MORE);
+out:
tun_put(tun);
return ret;
}