From e4a2a3048ed93f0c354ad837f1d45fc8d389d538 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:16:59 +0800 Subject: net: sock: introduce SOCK_XDP This patch introduces a new sock flag - SOCK_XDP. This will be used for notifying the upper layer that XDP program is attached on the lower socket, and requires for extra headroom. TUN will be the first user. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ebd07ad82431..2c548bd20393 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -869,6 +869,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file, tun_napi_init(tun, tfile, napi); } + if (rtnl_dereference(tun->xdp_prog)) + sock_set_flag(&tfile->sk, SOCK_XDP); + tun_set_real_num_queues(tun); /* device is allowed to go away first, so no need to hold extra @@ -1241,13 +1244,29 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); + struct tun_file *tfile; struct bpf_prog *old_prog; + int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); + for (i = 0; i < tun->numqueues; i++) { + tfile = rtnl_dereference(tun->tfiles[i]); + if (prog) + sock_set_flag(&tfile->sk, SOCK_XDP); + else + sock_reset_flag(&tfile->sk, SOCK_XDP); + } + list_for_each_entry(tfile, &tun->disabled, next) { + if (prog) + sock_set_flag(&tfile->sk, SOCK_XDP); + else + sock_reset_flag(&tfile->sk, SOCK_XDP); + } + return 0; } -- cgit v1.2.3 From 4f23aff8713c1f98735f7290a2cde63845b7ec88 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:00 +0800 Subject: tuntap: switch to use XDP_PACKET_HEADROOM Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2c548bd20393..d3677a544b56 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -113,7 +113,6 @@ do { \ } while (0) #endif -#define TUN_HEADROOM 256 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ @@ -1654,7 +1653,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) - pad += TUN_HEADROOM; + pad += XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); -- cgit v1.2.3 From 291aeb2b1dba0d0296673d994200824a7185585e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:01 +0800 Subject: tuntap: enable bh early during processing XDP This patch moves the bh enabling a little bit earlier; this will be used for factoring out the core XDP logic of tuntap. Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d3677a544b56..372caf7d67d9 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1726,22 +1726,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, goto err_xdp; } } + rcu_read_unlock(); + local_bh_enable(); skb = build_skb(buf, buflen); - if (!skb) { - rcu_read_unlock(); - local_bh_enable(); + if (!skb) return ERR_PTR(-ENOMEM); - } skb_reserve(skb, pad - delta); skb_put(skb, len); get_page(alloc_frag->page); alloc_frag->offset += buflen; - rcu_read_unlock(); - local_bh_enable(); - return skb; err_redirect: -- cgit v1.2.3 From f7053b6ccb65a36fdfed6ad92b808464839f0eb6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:02 +0800 Subject: tuntap: simplify error handling in tun_build_skb() There's no need to duplicate page get logic in each action. So this patch tries to get page and calculate the offset before processing XDP actions (except for XDP_DROP), and undo them when we meet errors (we don't care about the performance on errors). This will be used for factoring out XDP logic. Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 372caf7d67d9..257cf7342d54 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1701,17 +1701,13 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp_do_flush_map(); if (err) goto err_redirect; - rcu_read_unlock(); - local_bh_enable(); - return NULL; + goto out; case XDP_TX: get_page(alloc_frag->page); alloc_frag->offset += buflen; if (tun_xdp_tx(tun->dev, &xdp) < 0) goto err_redirect; - rcu_read_unlock(); - local_bh_enable(); - return NULL; + goto out; case XDP_PASS: delta = orig_data - xdp.data; len = xdp.data_end - xdp.data; @@ -1742,7 +1738,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, err_redirect: put_page(alloc_frag->page); -err_xdp: +out: rcu_read_unlock(); local_bh_enable(); this_cpu_inc(tun->pcpu_stats->rx_dropped); -- cgit v1.2.3 From ac1f1f6c5ae8352a39afea7b0786c59bcd730712 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:03 +0800 Subject: tuntap: tweak on the path of skb XDP case in tun_build_skb() If we're sure not to go native XDP, there's no need for several things like bh and rcu stuff. So this patch introduces a helper to build skb and hold page refcnt. When we find that we will go through the skb path, build the skb directly. Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 257cf7342d54..946c6148ed75 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1635,6 +1635,23 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, return true; } +static struct sk_buff *__tun_build_skb(struct page_frag *alloc_frag, char *buf, + int buflen, int len, int pad, int delta) +{ + struct sk_buff *skb = build_skb(buf, buflen); + + if (!skb) + return ERR_PTR(-ENOMEM); + + skb_reserve(skb, pad - delta); + skb_put(skb, len); + + get_page(alloc_frag->page); + alloc_frag->offset += buflen; + + return skb; +} + static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, @@ -1642,7 +1659,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, int len, int *skb_xdp) { struct page_frag *alloc_frag = &current->task_frag; - struct sk_buff *skb; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); unsigned int delta = 0; @@ -1672,10 +1688,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. 
*/ - if (hdr->gso_type || !xdp_prog) + if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; - else - *skb_xdp = 0; + return __tun_build_skb(alloc_frag, buf, buflen, len, pad, delta); + } + + *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); @@ -1719,22 +1737,13 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, trace_xdp_exception(tun->dev, xdp_prog, act); /* fall through */ case XDP_DROP: - goto err_xdp; + goto out; } } rcu_read_unlock(); local_bh_enable(); - skb = build_skb(buf, buflen); - if (!skb) - return ERR_PTR(-ENOMEM); - - skb_reserve(skb, pad - delta); - skb_put(skb, len); - get_page(alloc_frag->page); - alloc_frag->offset += buflen; - - return skb; + return __tun_build_skb(alloc_frag, buf, buflen, len, pad, delta); err_redirect: put_page(alloc_frag->page); -- cgit v1.2.3 From 8ae1aff0b331ab154c39910f2e0ed239bf942d56 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:04 +0800 Subject: tuntap: split out XDP logic This patch splits out the XDP logic into a single function. This allows it to be reused by the XDP batching path in the following patch. Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/tun.c | 88 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 37 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 946c6148ed75..14fe94098180 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1636,14 +1636,14 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, } static struct sk_buff *__tun_build_skb(struct page_frag *alloc_frag, char *buf, - int buflen, int len, int pad, int delta) + int buflen, int len, int pad) { struct sk_buff *skb = build_skb(buf, buflen); if (!skb) return ERR_PTR(-ENOMEM); - skb_reserve(skb, pad - delta); + skb_reserve(skb, pad); skb_put(skb, len); get_page(alloc_frag->page); @@ -1652,6 +1652,39 @@ static struct sk_buff *__tun_build_skb(struct page_frag *alloc_frag, char *buf, return skb; } +static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, + struct xdp_buff *xdp, u32 act) +{ + int err; + + switch (act) { + case XDP_REDIRECT: + err = xdp_do_redirect(tun->dev, xdp, xdp_prog); + xdp_do_flush_map(); + if (err) + return err; + break; + case XDP_TX: + err = tun_xdp_tx(tun->dev, xdp); + if (err < 0) + return err; + break; + case XDP_PASS: + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(tun->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + this_cpu_inc(tun->pcpu_stats->rx_dropped); + break; + } + + return act; +} + static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, @@ -1661,10 +1694,10 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct page_frag *alloc_frag = &current->task_frag; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - unsigned int delta = 0; char *buf; size_t copied; - int err, pad = TUN_RX_PAD; + int pad = TUN_RX_PAD; + int err = 0; rcu_read_lock(); xdp_prog = 
rcu_dereference(tun->xdp_prog); @@ -1690,7 +1723,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; - return __tun_build_skb(alloc_frag, buf, buflen, len, pad, delta); + return __tun_build_skb(alloc_frag, buf, buflen, len, pad); } *skb_xdp = 0; @@ -1698,9 +1731,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); - if (xdp_prog && !*skb_xdp) { + if (xdp_prog) { struct xdp_buff xdp; - void *orig_data; u32 act; xdp.data_hard_start = buf; @@ -1708,49 +1740,31 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; xdp.rxq = &tfile->xdp_rxq; - orig_data = xdp.data; - act = bpf_prog_run_xdp(xdp_prog, &xdp); - switch (act) { - case XDP_REDIRECT: - get_page(alloc_frag->page); - alloc_frag->offset += buflen; - err = xdp_do_redirect(tun->dev, &xdp, xdp_prog); - xdp_do_flush_map(); - if (err) - goto err_redirect; - goto out; - case XDP_TX: + act = bpf_prog_run_xdp(xdp_prog, &xdp); + if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; - if (tun_xdp_tx(tun->dev, &xdp) < 0) - goto err_redirect; - goto out; - case XDP_PASS: - delta = orig_data - xdp.data; - len = xdp.data_end - xdp.data; - break; - default: - bpf_warn_invalid_xdp_action(act); - /* fall through */ - case XDP_ABORTED: - trace_xdp_exception(tun->dev, xdp_prog, act); - /* fall through */ - case XDP_DROP: - goto out; } + err = tun_xdp_act(tun, xdp_prog, &xdp, act); + if (err < 0) + goto err_xdp; + if (err != XDP_PASS) + goto out; + + pad = xdp.data - xdp.data_hard_start; + len = xdp.data_end - xdp.data; } rcu_read_unlock(); local_bh_enable(); - return __tun_build_skb(alloc_frag, buf, buflen, len, pad, delta); + return __tun_build_skb(alloc_frag, buf, buflen, len, pad); -err_redirect: +err_xdp: put_page(alloc_frag->page); out: 
rcu_read_unlock(); local_bh_enable(); - this_cpu_inc(tun->pcpu_stats->rx_dropped); return NULL; } -- cgit v1.2.3 From 1a097910adda6b3328fc235575bba0e9ee408492 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:05 +0800 Subject: tuntap: move XDP flushing out of tun_do_xdp() This will allow adding batch flushing on top. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 14fe94098180..3ae539374f6b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1660,7 +1660,6 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); - xdp_do_flush_map(); if (err) return err; break; @@ -1749,6 +1748,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) goto err_xdp; + if (err == XDP_REDIRECT) + xdp_do_flush_map(); if (err != XDP_PASS) goto out; -- cgit v1.2.3 From fe8dd45bb7556246c6b76277b1ba4296c91c2505 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:06 +0800 Subject: tun: switch to new type of msg_control This patch introduces a new tun/tap specific msg_control: #define TUN_MSG_UBUF 1 #define TUN_MSG_PTR 2 struct tun_msg_ctl { int type; void *ptr; }; This allows us to pass different kinds of msg_control through sendmsg(). The first supported type is ubuf (TUN_MSG_UBUF) which will be used by the existing vhost_net zerocopy code. The second is XDP buff, which allows vhost_net to pass XDP buff to TUN. This could be used to implement accepting an array of XDP buffs from vhost_net in the following patches. Signed-off-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/tap.c | 18 ++++++++++++------ drivers/net/tun.c | 6 +++++- drivers/vhost/net.c | 7 +++++-- include/linux/if_tun.h | 14 ++++++++++++++ 4 files changed, 36 insertions(+), 9 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index f0f7cd977667..7996ed7cbf18 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -619,7 +619,7 @@ static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad, #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN) /* Get packet from user space buffer */ -static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m, +static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct iov_iter *from, int noblock) { int good_linear = SKB_MAX_HEAD(TAP_RESERVE); @@ -663,7 +663,7 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m, if (unlikely(len < ETH_HLEN)) goto err; - if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { + if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i; copylen = vnet_hdr.hdr_len ? 
@@ -724,11 +724,11 @@ static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m, tap = rcu_dereference(q->tap); /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { - skb_shinfo(skb)->destructor_arg = m->msg_control; + skb_shinfo(skb)->destructor_arg = msg_control; skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY; skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; - } else if (m && m->msg_control) { - struct ubuf_info *uarg = m->msg_control; + } else if (msg_control) { + struct ubuf_info *uarg = msg_control; uarg->callback(uarg, false); } @@ -1150,7 +1150,13 @@ static int tap_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); - return tap_get_user(q, m, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); + struct tun_msg_ctl *ctl = m->msg_control; + + if (ctl && ctl->type != TUN_MSG_UBUF) + return -EINVAL; + + return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, + m->msg_flags & MSG_DONTWAIT); } static int tap_recvmsg(struct socket *sock, struct msghdr *m, diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3ae539374f6b..89779b58c7ca 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2431,11 +2431,15 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) int ret; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); + struct tun_msg_ctl *ctl = m->msg_control; if (!tun) return -EBADFD; - ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter, + if (ctl && ctl->type != TUN_MSG_UBUF) + return -EINVAL; + + ret = tun_get_user(tun, tfile, ctl ? 
ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); tun_put(tun); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 4e656f89cb22..fb01ce6d981c 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -620,6 +620,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; + struct tun_msg_ctl ctl; size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); @@ -664,8 +665,10 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; refcount_set(&ubuf->refcnt, 1); - msg.msg_control = ubuf; - msg.msg_controllen = sizeof(ubuf); + msg.msg_control = &ctl; + ctl.type = TUN_MSG_UBUF; + ctl.ptr = ubuf; + msg.msg_controllen = sizeof(ctl); ubufs = nvq->ubufs; atomic_inc(&ubufs->refcount); nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV; diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 3d2996dc7d85..12e3eebf0ce6 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -16,9 +16,23 @@ #define __IF_TUN_H #include +#include #define TUN_XDP_FLAG 0x1UL +#define TUN_MSG_UBUF 1 +#define TUN_MSG_PTR 2 +struct tun_msg_ctl { + unsigned short type; + unsigned short num; + void *ptr; +}; + +struct tun_xdp_hdr { + int buflen; + struct virtio_net_hdr gso; +}; + #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -- cgit v1.2.3 From 043d222f93ab8c76b56a3b315cd8692e35affb6c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:07 +0800 Subject: tuntap: accept an array of XDP buffs through sendmsg() This patch implements the TUN_MSG_PTR msg_control type. This type allows the caller to pass an array of XDP buffs to tuntap through the ptr field of struct tun_msg_ctl. 
If an XDP program is attached, tuntap can run the XDP program directly. 
If not, tuntap will build skb and do a fast receiving since part of the work has been done by vhost_net. This will avoid lots of indirect calls thus improves the icache utilization and allows to do XDP batched flushing when doing XDP redirection. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 3 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 89779b58c7ca..2a2cd35853b7 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2426,22 +2426,133 @@ static void tun_sock_write_space(struct sock *sk) kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } +static int tun_xdp_one(struct tun_struct *tun, + struct tun_file *tfile, + struct xdp_buff *xdp, int *flush) +{ + struct tun_xdp_hdr *hdr = xdp->data_hard_start; + struct virtio_net_hdr *gso = &hdr->gso; + struct tun_pcpu_stats *stats; + struct bpf_prog *xdp_prog; + struct sk_buff *skb = NULL; + u32 rxhash = 0, act; + int buflen = hdr->buflen; + int err = 0; + bool skb_xdp = false; + + xdp_prog = rcu_dereference(tun->xdp_prog); + if (xdp_prog) { + if (gso->gso_type) { + skb_xdp = true; + goto build; + } + xdp_set_data_meta_invalid(xdp); + xdp->rxq = &tfile->xdp_rxq; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + err = tun_xdp_act(tun, xdp_prog, xdp, act); + if (err < 0) { + put_page(virt_to_head_page(xdp->data)); + return err; + } + + switch (err) { + case XDP_REDIRECT: + *flush = true; + /* fall through */ + case XDP_TX: + return 0; + case XDP_PASS: + break; + default: + put_page(virt_to_head_page(xdp->data)); + return 0; + } + } + +build: + skb = build_skb(xdp->data_hard_start, buflen); + if (!skb) { + err = -ENOMEM; + goto out; + } + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + skb_put(skb, xdp->data_end - xdp->data); + + if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { + this_cpu_inc(tun->pcpu_stats->rx_frame_errors); + 
kfree_skb(skb); + err = -EINVAL; + goto out; + } + + skb->protocol = eth_type_trans(skb, tun->dev); + skb_reset_network_header(skb); + skb_probe_transport_header(skb, 0); + + if (skb_xdp) { + err = do_xdp_generic(xdp_prog, skb); + if (err != XDP_PASS) + goto out; + } + + if (!rcu_dereference(tun->steering_prog)) + rxhash = __skb_get_hash_symmetric(skb); + + netif_receive_skb(skb); + + stats = get_cpu_ptr(tun->pcpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); + put_cpu_ptr(stats); + + if (rxhash) + tun_flow_update(tun, rxhash, tfile); + +out: + return err; +} + static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { - int ret; + int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); struct tun_msg_ctl *ctl = m->msg_control; + struct xdp_buff *xdp; if (!tun) return -EBADFD; - if (ctl && ctl->type != TUN_MSG_UBUF) - return -EINVAL; + if (ctl && (ctl->type == TUN_MSG_PTR)) { + int n = ctl->num; + int flush = 0; + + local_bh_disable(); + rcu_read_lock(); + + for (i = 0; i < n; i++) { + xdp = &((struct xdp_buff *)ctl->ptr)[i]; + tun_xdp_one(tun, tfile, xdp, &flush); + } + + if (flush) + xdp_do_flush_map(); + + rcu_read_unlock(); + local_bh_enable(); + + ret = total_len; + goto out; + } ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); +out: tun_put(tun); return ret; } -- cgit v1.2.3 From 0efac27791ee068075d80f07c55a229b1335ce12 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 12 Sep 2018 11:17:08 +0800 Subject: tap: accept an array of XDP buffs through sendmsg() This patch implement TUN_MSG_PTR msg_control type. This type allows the caller to pass an array of XDP buffs to tuntap through ptr field of the tun_msg_control. Tap will build skb through those XDP buffers. 
This will avoid lots of indirect calls thus improves the icache utilization and allows to do XDP batched flushing when doing XDP redirection. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tap.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 7996ed7cbf18..a4ab4a791fe7 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1146,14 +1146,84 @@ static const struct file_operations tap_fops = { #endif }; +static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) +{ + struct tun_xdp_hdr *hdr = xdp->data_hard_start; + struct virtio_net_hdr *gso = &hdr->gso; + int buflen = hdr->buflen; + int vnet_hdr_len = 0; + struct tap_dev *tap; + struct sk_buff *skb; + int err, depth; + + if (q->flags & IFF_VNET_HDR) + vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); + + skb = build_skb(xdp->data_hard_start, buflen); + if (!skb) { + err = -ENOMEM; + goto err; + } + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + skb_put(skb, xdp->data_end - xdp->data); + + skb_set_network_header(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth_hdr(skb)->h_proto; + + if (vnet_hdr_len) { + err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q)); + if (err) + goto err_kfree; + } + + skb_probe_transport_header(skb, ETH_HLEN); + + /* Move network header to the right position for VLAN tagged packets */ + if ((skb->protocol == htons(ETH_P_8021Q) || + skb->protocol == htons(ETH_P_8021AD)) && + __vlan_get_protocol(skb, skb->protocol, &depth) != 0) + skb_set_network_header(skb, depth); + + rcu_read_lock(); + tap = rcu_dereference(q->tap); + if (tap) { + skb->dev = tap->dev; + dev_queue_xmit(skb); + } else { + kfree_skb(skb); + } + rcu_read_unlock(); + + return 0; + +err_kfree: + kfree_skb(skb); +err: + rcu_read_lock(); + tap = rcu_dereference(q->tap); + if (tap && tap->count_tx_dropped) + 
tap->count_tx_dropped(tap); + rcu_read_unlock(); + return err; +} + static int tap_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct tun_msg_ctl *ctl = m->msg_control; + struct xdp_buff *xdp; + int i; - if (ctl && ctl->type != TUN_MSG_UBUF) - return -EINVAL; + if (ctl && (ctl->type == TUN_MSG_PTR)) { + for (i = 0; i < ctl->num; i++) { + xdp = &((struct xdp_buff *)ctl->ptr)[i]; + tap_get_user_xdp(q, xdp); + } + return 0; + } return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); -- cgit v1.2.3