diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-06 14:45:08 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-06 14:45:08 -0700 |
commit | aae3dbb4776e7916b6cd442d00159bea27a695c1 (patch) | |
tree | d074c5d783a81e7e2e084b1eba77f57459da7e37 /drivers/net/tun.c | |
parent | ec3604c7a5aae8953545b0d05495357009a960e5 (diff) | |
parent | 66bed8465a808400eb14562510e26c8818082cb8 (diff) | |
download | linux-aae3dbb4776e7916b6cd442d00159bea27a695c1.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Support ipv6 checksum offload in sunvnet driver, from Shannon
Nelson.
2) Move to RB-tree instead of custom AVL code in inetpeer, from Eric
Dumazet.
3) Allow generic XDP to work on virtual devices, from John Fastabend.
4) Add bpf device maps and XDP_REDIRECT, which can be used to build
arbitrary switching frameworks using XDP. From John Fastabend.
5) Remove UFO offloads from the tree, gave us little other than bugs.
6) Remove the IPSEC flow cache, from Florian Westphal.
7) Support ipv6 route offload in mlxsw driver.
8) Support VF representors in bnxt_en, from Sathya Perla.
9) Add support for forward error correction modes to ethtool, from
Vidya Sagar Ravipati.
10) Add time filter for packet scheduler action dumping, from Jamal Hadi
Salim.
11) Extend the zerocopy sendmsg() used by virtio and tap to regular
sockets via MSG_ZEROCOPY. From Willem de Bruijn.
12) Significantly rework value tracking in the BPF verifier, from Edward
Cree.
13) Add new jump instructions to eBPF, from Daniel Borkmann.
14) Rework rtnetlink plumbing so that operations can be run without
taking the RTNL semaphore. From Florian Westphal.
15) Support XDP in tap driver, from Jason Wang.
16) Add 32-bit eBPF JIT for ARM, from Shubham Bansal.
17) Add Huawei hinic ethernet driver.
18) Allow to report MD5 keys in TCP inet_diag dumps, from Ivan
Delalande.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1780 commits)
i40e: point wb_desc at the nvm_wb_desc during i40e_read_nvm_aq
i40e: avoid NVM acquire deadlock during NVM update
drivers: net: xgene: Remove return statement from void function
drivers: net: xgene: Configure tx/rx delay for ACPI
drivers: net: xgene: Read tx/rx delay for ACPI
rocker: fix kcalloc parameter order
rds: Fix non-atomic operation on shared flag variable
net: sched: don't use GFP_KERNEL under spin lock
vhost_net: correctly check tx avail during rx busy polling
net: mdio-mux: add mdio_mux parameter to mdio_mux_init()
rxrpc: Make service connection lookup always check for retry
net: stmmac: Delete dead code for MDIO registration
gianfar: Fix Tx flow control deactivation
cxgb4: Ignore MPS_TX_INT_CAUSE[Bubble] for T6
cxgb4: Fix pause frame count in t4_get_port_stats
cxgb4: fix memory leak
tun: rename generic_xdp to skb_xdp
tun: reserve extra headroom only when XDP is set
net: dsa: bcm_sf2: Configure IMP port TC2QOS mapping
net: dsa: bcm_sf2: Advertise number of egress queues
...
Diffstat (limited to 'drivers/net/tun.c')
-rw-r--r-- | drivers/net/tun.c | 267 |
1 files changed, 238 insertions, 29 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0a2c0a42283f..3c9985f29950 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -73,6 +73,8 @@ #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> +#include <linux/bpf.h> +#include <linux/bpf_trace.h> #include <linux/uaccess.h> @@ -105,6 +107,9 @@ do { \ } while (0) #endif +#define TUN_HEADROOM 256 +#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) + /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, @@ -199,7 +204,7 @@ struct tun_struct { struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ - NETIF_F_TSO6|NETIF_F_UFO) + NETIF_F_TSO6) int align; int vnet_hdr_sz; @@ -221,6 +226,7 @@ struct tun_struct { u32 flow_count; u32 rx_batched; struct tun_pcpu_stats __percpu *pcpu_stats; + struct bpf_prog __rcu *xdp_prog; }; #ifdef CONFIG_TUN_VNET_CROSS_LE @@ -585,6 +591,7 @@ static void tun_detach(struct tun_file *tfile, bool clean) static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); + struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; @@ -617,6 +624,9 @@ static void tun_detach_all(struct net_device *dev) } BUG_ON(tun->numdisabled != 0); + if (xdp_prog) + bpf_prog_put(xdp_prog); + if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } @@ -892,7 +902,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) sk_filter(tfile->socket.sk, skb)) goto drop; - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; skb_tx_timestamp(skb); @@ -1003,6 +1013,46 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->tx_dropped = tx_dropped; } +static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + struct tun_struct *tun = netdev_priv(dev); + struct bpf_prog *old_prog; + + old_prog = rtnl_dereference(tun->xdp_prog); + rcu_assign_pointer(tun->xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + + return 0; +} + +static u32 tun_xdp_query(struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + const struct bpf_prog *xdp_prog; + + xdp_prog = rtnl_dereference(tun->xdp_prog); + if (xdp_prog) + return xdp_prog->aux->id; + + return 0; +} + +static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return tun_xdp_set(dev, xdp->prog, xdp->extack); + case XDP_QUERY_PROG: + xdp->prog_id = tun_xdp_query(dev); + xdp->prog_attached = !!xdp->prog_id; + return 0; + default: + return -EINVAL; + } +} + static const struct net_device_ops tun_netdev_ops = { .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, @@ -1033,6 +1083,7 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, + .ndo_xdp = tun_xdp, }; static void tun_flow_init(struct tun_struct *tun) @@ -1190,6 +1241,138 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, } } +static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, + int len, int noblock, bool zerocopy) +{ + if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) + return false; + + if (tfile->socket.sk->sk_sndbuf != INT_MAX) + return false; + + if (!noblock) + return false; + + if (zerocopy) + return false; + + if (SKB_DATA_ALIGN(len + TUN_RX_PAD) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) + return false; + + return true; +} + +static struct sk_buff *tun_build_skb(struct tun_struct *tun, + struct tun_file *tfile, + struct iov_iter *from, + struct virtio_net_hdr *hdr, + int len, int *skb_xdp) +{ + struct page_frag *alloc_frag = ¤t->task_frag; + struct sk_buff *skb; + struct bpf_prog *xdp_prog; + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + unsigned int delta = 0; + char *buf; + size_t copied; + bool xdp_xmit = false; + int err, pad = TUN_RX_PAD; + + rcu_read_lock(); + xdp_prog = rcu_dereference(tun->xdp_prog); + if (xdp_prog) + pad += TUN_HEADROOM; + buflen += SKB_DATA_ALIGN(len + pad); + rcu_read_unlock(); + + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) + return ERR_PTR(-ENOMEM); + + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; + copied = copy_page_from_iter(alloc_frag->page, + alloc_frag->offset + pad, + len, from); + if (copied != len) + return ERR_PTR(-EFAULT); + + /* There's a small window that XDP may be set after the check + * of xdp_prog above, this should be rare and for simplicity + * we do XDP on skb in case the headroom is not enough. + */ + if (hdr->gso_type || !xdp_prog) + *skb_xdp = 1; + else + *skb_xdp = 0; + + rcu_read_lock(); + xdp_prog = rcu_dereference(tun->xdp_prog); + if (xdp_prog && !*skb_xdp) { + struct xdp_buff xdp; + void *orig_data; + u32 act; + + xdp.data_hard_start = buf; + xdp.data = buf + pad; + xdp.data_end = xdp.data + len; + orig_data = xdp.data; + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + switch (act) { + case XDP_REDIRECT: + get_page(alloc_frag->page); + alloc_frag->offset += buflen; + err = xdp_do_redirect(tun->dev, &xdp, xdp_prog); + if (err) + goto err_redirect; + return NULL; + case XDP_TX: + xdp_xmit = true; + /* fall through */ + case XDP_PASS: + delta = orig_data - xdp.data; + break; + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(tun->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + goto err_xdp; + } + } + + skb = build_skb(buf, buflen); + if (!skb) { + rcu_read_unlock(); + return ERR_PTR(-ENOMEM); + } + + skb_reserve(skb, pad - delta); + skb_put(skb, len + delta); + get_page(alloc_frag->page); + alloc_frag->offset += buflen; + + if (xdp_xmit) { + skb->dev = tun->dev; + generic_xdp_tx(skb, xdp_prog); + rcu_read_lock(); + return NULL; + } + + rcu_read_unlock(); + + return skb; + +err_redirect: + put_page(alloc_frag->page); +err_xdp: + rcu_read_unlock(); + this_cpu_inc(tun->pcpu_stats->rx_dropped); + return NULL; +} + /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, @@ -1206,6 +1389,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, bool zerocopy = false; int err; u32 rxhash; + int skb_xdp = 1; if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -1263,30 +1447,44 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, zerocopy = true; } - if (!zerocopy) { - copylen = len; - if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) - linear = good_linear; - else - linear = tun16_to_cpu(tun, gso.hdr_len); - } - - skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); - if (IS_ERR(skb)) { - if (PTR_ERR(skb) != -EAGAIN) + if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { + /* For the packet that is not easy to be processed + * (e.g gso or jumbo packet), we will do it at after + * skb was created with generic XDP routine. + */ + skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); + if (IS_ERR(skb)) { this_cpu_inc(tun->pcpu_stats->rx_dropped); - return PTR_ERR(skb); - } + return PTR_ERR(skb); + } + if (!skb) + return total_len; + } else { + if (!zerocopy) { + copylen = len; + if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) + linear = good_linear; + else + linear = tun16_to_cpu(tun, gso.hdr_len); + } - if (zerocopy) - err = zerocopy_sg_from_iter(skb, from); - else - err = skb_copy_datagram_from_iter(skb, 0, from, len); + skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); + if (IS_ERR(skb)) { + if (PTR_ERR(skb) != -EAGAIN) + this_cpu_inc(tun->pcpu_stats->rx_dropped); + return PTR_ERR(skb); + } - if (err) { - this_cpu_inc(tun->pcpu_stats->rx_dropped); - kfree_skb(skb); - return -EFAULT; + if (zerocopy) + err = zerocopy_sg_from_iter(skb, from); + else + err = skb_copy_datagram_from_iter(skb, 0, from, len); + + if (err) { + this_cpu_inc(tun->pcpu_stats->rx_dropped); + kfree_skb(skb); + return -EFAULT; + } } if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { @@ -1334,6 +1532,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); + if (skb_xdp) { + struct bpf_prog *xdp_prog; + int ret; + + rcu_read_lock(); + xdp_prog = rcu_dereference(tun->xdp_prog); + if (xdp_prog) { + ret = do_xdp_generic(xdp_prog, skb); + if (ret != XDP_PASS) { + rcu_read_unlock(); + return total_len; + } + } + rcu_read_unlock(); + } + rxhash = __skb_get_hash_symmetric(skb); #ifndef CONFIG_4KSTACKS tun_rx_batched(tun, tfile, skb, more); @@ -1924,11 +2138,6 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } - - if (arg & TUN_F_UFO) { - features |= NETIF_F_UFO; - arg &= ~TUN_F_UFO; - } } /* This gives the user a way to test for new features in future by @@ -2540,7 +2749,7 @@ static int tun_queue_resize(struct tun_struct *tun) int n = tun->numqueues + tun->numdisabled; int ret, i; - arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL); + arrays = kmalloc_array(n, sizeof(*arrays), GFP_KERNEL); if (!arrays) return -ENOMEM; |