From d445516966dcb2924741b13b27738b54df2af01a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:26:45 -0700 Subject: net: xdp: support xdp generic on virtual devices XDP generic allows users to test XDP programs and/or run them with degraded performance on devices that do not yet support XDP. For testing I typically test eBPF programs using a set of veth devices. This allows testing topologies that would otherwise be difficult to setup especially in the early stages of development. This patch adds a xdp generic hook to the netif_rx_internal() function which is called from dev_forward_skb(). With this addition attaching XDP programs to veth devices works as expected! Also I noticed multiple drivers using netif_rx(). These devices will also benefit and generic XDP will work for them as well. Signed-off-by: John Fastabend Tested-by: Andy Gospodarek Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 208 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 113 insertions(+), 95 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 02440518dd69..a1ed7b41b3e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3865,6 +3865,107 @@ drop: return NET_RX_DROP; } +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. + */ +static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } +} + +static struct static_key generic_xdp_needed __read_mostly; + +static int do_xdp_generic(struct sk_buff *skb) +{ + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + + if (act != XDP_PASS) { + if (act == XDP_TX) + generic_xdp_tx(skb, xdp_prog); + return XDP_DROP; + } + } + return XDP_PASS; +} + static int netif_rx_internal(struct sk_buff *skb) { int ret; @@ -3872,6 +3973,14 @@ static int netif_rx_internal(struct sk_buff *skb) net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); + + if (static_key_false(&generic_xdp_needed)) { + int ret = do_xdp_generic(skb); + + if (ret != XDP_PASS) + return NET_RX_DROP; + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4338,8 +4447,6 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static struct static_key generic_xdp_needed __read_mostly; - static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@ -4373,89 +4480,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) return ret; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, - struct bpf_prog *xdp_prog) -{ - struct xdp_buff xdp; - u32 act = XDP_DROP; - void *orig_data; - int hlen, off; - u32 mac_len; - - /* Reinjected packets coming from act_mirred or similar should - * not get XDP generic processing. - */ - if (skb_cloned(skb)) - return XDP_PASS; - - if (skb_linearize(skb)) - goto do_drop; - - /* The XDP program wants to see the packet starting at the MAC - * header. - */ - mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start = skb->data - skb_headroom(skb); - orig_data = xdp.data; - - act = bpf_prog_run_xdp(xdp_prog, &xdp); - - off = xdp.data - orig_data; - if (off > 0) - __skb_pull(skb, off); - else if (off < 0) - __skb_push(skb, -off); - - switch (act) { - case XDP_TX: - __skb_push(skb, mac_len); - /* fall through */ - case XDP_PASS: - break; - - default: - bpf_warn_invalid_xdp_action(act); - /* fall through */ - case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); - /* fall through */ - case XDP_DROP: - do_drop: - kfree_skb(skb); - break; - } - - return act; -} - -/* When doing generic XDP we have to bypass the qdisc layer and the - * network taps in order to match in-driver-XDP behavior. - */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) -{ - struct net_device *dev = skb->dev; - struct netdev_queue *txq; - bool free_skb = true; - int cpu, rc; - - txq = netdev_pick_tx(dev, skb, NULL); - cpu = smp_processor_id(); - HARD_TX_LOCK(dev, txq, cpu); - if (!netif_xmit_stopped(txq)) { - rc = netdev_start_xmit(skb, dev, txq, 0); - if (dev_xmit_complete(rc)) - free_skb = false; - } - HARD_TX_UNLOCK(dev, txq); - if (free_skb) { - trace_xdp_exception(dev, xdp_prog, XDP_TX); - kfree_skb(skb); - } -} - static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -4468,17 +4492,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); if (static_key_false(&generic_xdp_needed)) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); - - if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); + int ret = do_xdp_generic(skb); - if (act != XDP_PASS) { - rcu_read_unlock(); - if (act == XDP_TX) - generic_xdp_tx(skb, xdp_prog); - return NET_RX_DROP; - } + if (ret != XDP_PASS) { + rcu_read_unlock(); + return NET_RX_DROP; } } -- cgit v1.2.3 From 6103aa96ec077c976e851e0b89cc2446cb76573d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:27:50 -0700 Subject: net: implement XDP_REDIRECT for xdp generic Add support for redirect to xdp generic creating a fall back for devices that do not yet have support and allowing test infrastructure using veth pairs to be built. Signed-off-by: John Fastabend Tested-by: Andy Gospodarek Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 1 + net/core/dev.c | 22 ++++++++++++++++++++-- net/core/filter.c | 26 ++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/filter.h b/include/linux/filter.h index 64cae7a08148..10df7daf5ec6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,6 +712,7 @@ bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp); void bpf_warn_invalid_xdp_action(u32 act); diff --git a/net/core/dev.c b/net/core/dev.c index a1ed7b41b3e8..9f3f4083ada5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3902,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_push(skb, -off); switch (act) { + case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); /* fall through */ @@ -3956,14 +3957,27 @@ static int do_xdp_generic(struct sk_buff *skb) if (xdp_prog) { u32 act = netif_receive_generic_xdp(skb, xdp_prog); + int err; if (act != XDP_PASS) { - if (act == XDP_TX) + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect(skb->dev, skb); + if (err) + goto out_redir; + /* fallthru to submit skb */ + case XDP_TX: generic_xdp_tx(skb, xdp_prog); + break; + } return XDP_DROP; } } return XDP_PASS; +out_redir: + trace_xdp_exception(skb->dev, xdp_prog, XDP_REDIRECT); + kfree_skb(skb); + return XDP_DROP; } static int netif_rx_internal(struct sk_buff *skb) @@ -3977,8 +3991,12 @@ static int netif_rx_internal(struct sk_buff *skb) if (static_key_false(&generic_xdp_needed)) { int ret = do_xdp_generic(skb); + /* Consider XDP consuming the packet a success from + * the netdev point of view we do not want to count + * this as an error. + */ if (ret != XDP_PASS) - return NET_RX_DROP; + return NET_RX_SUCCESS; } #ifdef CONFIG_RPS diff --git a/net/core/filter.c b/net/core/filter.c index d606a66d1040..eeb713461c25 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2437,6 +2437,32 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_do_redirect); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + unsigned int len; + + dev = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + ri->ifindex = 0; + if (unlikely(!dev)) { + bpf_warn_invalid_xdp_redirect(ri->ifindex); + goto err; + } + + if (unlikely(!(dev->flags & IFF_UP))) + goto err; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len > len) + goto err; + + skb->dev = dev; + return 0; +err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); + BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); -- cgit v1.2.3 From d4c023f4f3dd96734ef53d4b588136a872300046 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 3 Jul 2017 07:04:22 -0700 Subject: net: Remove references to NETIF_F_UFO in netdev_fix_features(). It is going away. Signed-off-by: David S. Miller --- net/core/dev.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9f3f4083ada5..467420eda02e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7271,24 +7271,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_GSO; } - /* UFO needs SG and checksumming */ - if (features & NETIF_F_UFO) { - /* maybe split UFO into V4 and V6? */ - if (!(features & NETIF_F_HW_CSUM) && - ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != - (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no checksum offload features.\n"); - features &= ~NETIF_F_UFO; - } - - if (!(features & NETIF_F_SG)) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); - features &= ~NETIF_F_UFO; - } - } - /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { -- cgit v1.2.3 From 7051b88a35c7dde5705923833117e14f9cc17d92 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 18 Jul 2017 15:59:27 -0700 Subject: net: make dev_close and related functions void There is no useful return value from dev_close. All paths return 0. Change dev_close and helper functions to void. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/dev.c | 26 +++++++++++--------------- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c60351b84323..614642eb7eb7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2432,8 +2432,8 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int dev_open(struct net_device *dev); -int dev_close(struct net_device *dev); -int dev_close_many(struct list_head *head, bool unlink); +void dev_close(struct net_device *dev); +void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 467420eda02e..d1b9c9b6c970 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1413,7 +1413,7 @@ int dev_open(struct net_device *dev) } EXPORT_SYMBOL(dev_open); -static int __dev_close_many(struct list_head *head) +static void __dev_close_many(struct list_head *head) { struct net_device *dev; @@ -1455,23 +1455,18 @@ static int __dev_close_many(struct list_head *head) dev->flags &= ~IFF_UP; netpoll_poll_enable(dev); } - - return 0; } -static int __dev_close(struct net_device *dev) +static void __dev_close(struct net_device *dev) { - int retval; LIST_HEAD(single); list_add(&dev->close_list, &single); - retval = __dev_close_many(&single); + __dev_close_many(&single); list_del(&single); - - return retval; } -int dev_close_many(struct list_head *head, bool unlink) +void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1488,8 +1483,6 @@ int dev_close_many(struct list_head *head, bool unlink) if (unlink) list_del_init(&dev->close_list); } - - return 0; } EXPORT_SYMBOL(dev_close_many); @@ -1502,7 +1495,7 @@ EXPORT_SYMBOL(dev_close_many); * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ -int dev_close(struct net_device *dev) +void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); @@ -1511,7 +1504,6 @@ int dev_close(struct net_device *dev) dev_close_many(&single, true); list_del(&single); } - return 0; } EXPORT_SYMBOL(dev_close); @@ -6725,8 +6717,12 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) - ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); + if ((old_flags ^ flags) & IFF_UP) { + if (old_flags & IFF_UP) + __dev_close(dev); + else + ret = __dev_open(dev); + } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; -- cgit v1.2.3 From d764a122cc7af7ab1c40c08745f0fcd33cc2f7db Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 21 Jul 2017 12:49:28 +0200 Subject: net: add new netdevice feature for offload of RX port for UDP tunnels This adds a new netdevice feature, so that the offloading of RX port for UDP tunnels can be disabled by the administrator on some netdevices, using the "rx-udp_tunnel-port-offload" feature in ethtool. This feature is set for all devices that provide ndo_udp_tunnel_add. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ net/core/dev.c | 6 ++++++ net/core/ethtool.c | 1 + 3 files changed, 9 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index ebd273627334..dc8b4896b77b 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -75,6 +75,7 @@ enum { NETIF_F_HW_TC_BIT, /* Offload TC infrastructure */ NETIF_F_HW_ESP_BIT, /* Hardware ESP transformation offload */ NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ + NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ /* * Add your fresh new feature above and remember to update @@ -138,6 +139,7 @@ enum { #define NETIF_F_HW_TC __NETIF_F(HW_TC) #define NETIF_F_HW_ESP __NETIF_F(HW_ESP) #define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) +#define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) diff --git a/net/core/dev.c b/net/core/dev.c index 509af6ce8831..9081134adc0d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7530,6 +7530,12 @@ int register_netdevice(struct net_device *dev) */ dev->hw_features |= NETIF_F_SOFT_FEATURES; dev->features |= NETIF_F_SOFT_FEATURES; + + if (dev->netdev_ops->ndo_udp_tunnel_add) { + dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; + dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; + } + dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 78408ab77a10..b987bc475fc8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -105,6 +105,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_HW_TC_BIT] = "hw-tc-offload", [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", + [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", }; static const char -- cgit v1.2.3 From ae847f40b6418a7d6e197f6ef0d85f40e313c4d4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 21 Jul 2017 12:49:31 +0200 Subject: net: call udp_tunnel_get_rx_info when NETIF_F_RX_UDP_TUNNEL_PORT is toggled NETIF_F_RX_UDP_TUNNEL_PORT is special, in that we need to do more than just flip the bit in dev->features. When disabling we must also clear currently offloaded ports from the device, and when enabling we must tell the device to offload the ports it can. Because vxlan stores its sockets in a hashtable and they are inserted at the head of per-bucket lists, switching the feature off and then on can result in a different set of ports being offloaded (depending on the HW's limits). Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/core/dev.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9081134adc0d..8ea6b4b42611 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -144,6 +144,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -7327,8 +7328,27 @@ sync_lower: netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); - if (!err) + if (!err) { + netdev_features_t diff = features ^ dev->features; + + if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { + /* udp_tunnel_{get,drop}_rx_info both need + * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the + * device, or they won't do anything. + * Thus we need to update dev->features + * *before* calling udp_tunnel_get_rx_info, + * but *after* calling udp_tunnel_drop_rx_info. + */ + if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { + dev->features = features; + udp_tunnel_get_rx_info(dev); + } else { + udp_tunnel_drop_rx_info(dev); + } + } + dev->features = features; + } return err < 0 ? 0 : 1; } -- cgit v1.2.3 From 1f8b977ab32dc5d148f103326e80d9097f1cefb5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:41 -0400 Subject: sock: enable MSG_ZEROCOPY Prepare the datapath for refcounted ubuf_info. Clone ubuf_info with skb_zerocopy_clone() wherever needed due to skb split, merge, resize or clone. Split skb_orphan_frags into two variants. The split, merge, .. paths support reference counted zerocopy buffers, so do not do a deep copy. Add skb_orphan_frags_rx for paths that may loop packets to receive sockets. That is not allowed, as it may cause unbounded latency. Deep copy all zerocopy copy buffers, ref-counted or not, in this path. The exact locations to modify were chosen by exhaustively searching through all code that might modify skb_frag references and/or the the SKBTX_DEV_ZEROCOPY tx_flags bit. The changes err on the safe side, in two ways. (1) legacy ubuf_info paths virtio and tap are not modified. They keep a 1:1 ubuf_info to sk_buff relationship. Calls to skb_orphan_frags still call skb_copy_ubufs and thus copy frags in this case. (2) not all copies deep in the stack are addressed yet. skb_shift, skb_split and skb_try_coalesce can be refined to avoid copying. These are not in the hot path and this patch is hairy enough as is, so that is left for future refinement. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 +- drivers/vhost/net.c | 1 + include/linux/skbuff.h | 14 +++++++++++++- net/core/dev.c | 4 ++-- net/core/skbuff.c | 48 +++++++++++++++++++----------------------------- 5 files changed, 36 insertions(+), 33 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 68add55f8460..d21510d47aa2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -892,7 +892,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) sk_filter(tfile->socket.sk, skb)) goto drop; - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; skb_tx_timestamp(skb); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 06d044862e58..ba08b78ed630 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -533,6 +533,7 @@ static void handle_tx(struct vhost_net *net) ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; + atomic_set(&ubuf->refcnt, 1); msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); ubufs = nvq->ubufs; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59cff7aa494e..e5387932c266 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2512,7 +2512,17 @@ static inline void skb_orphan(struct sk_buff *skb) */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { - if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))) + if (likely(!skb_zcopy(skb))) + return 0; + if (skb_uarg(skb)->callback == sock_zerocopy_callback) + return 0; + return skb_copy_ubufs(skb, gfp_mask); +} + +/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ +static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) +{ + if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } @@ -2944,6 +2954,8 @@ static inline int skb_add_data(struct sk_buff *skb, static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { + if (skb_zcopy(skb)) + return false; if (i) { const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; diff --git a/net/core/dev.c b/net/core/dev.c index 8ea6b4b42611..1d75499add72 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1853,7 +1853,7 @@ static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); @@ -4412,7 +4412,7 @@ skip_classify: } if (pt_prev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 29e34bc6a17c..74d3c36f8419 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -567,21 +567,10 @@ static void skb_release_data(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]); - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = shinfo->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } - if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); + skb_zcopy_clear(skb, true); skb_free_head(skb); } @@ -695,14 +684,7 @@ EXPORT_SYMBOL(kfree_skb_list); */ void skb_tx_error(struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, false); - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; - } + skb_zcopy_clear(skb, true); } EXPORT_SYMBOL(skb_tx_error); @@ -1029,9 +1011,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); -/* unused only until next patch in the series; will remove attribute */ -static int __attribute__((unused)) - skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, +static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, gfp_t gfp_mask) { if (skb_zcopy(orig)) { @@ -1068,7 +1048,6 @@ static int __attribute__((unused)) */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { - struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; int i, new_frags; @@ -1127,8 +1106,6 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); - uarg->callback(uarg, false); - /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); @@ -1137,7 +1114,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; + skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); @@ -1298,7 +1275,8 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_orphan_frags(skb, gfp_mask)) { + if (skb_orphan_frags(skb, gfp_mask) || + skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; @@ -1375,9 +1353,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, * be since all we did is relocate the values */ if (skb_cloned(skb)) { - /* copy this zero copy skb frags */ if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; + if (skb_zcopy(skb)) + atomic_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); @@ -1872,6 +1851,9 @@ end: skb->tail += delta; skb->data_len -= delta; + if (!skb->data_len) + skb_zcopy_clear(skb, false); + return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); @@ -2627,6 +2609,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) skb_tx_error(from); return -ENOMEM; } + skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { if (!len) @@ -2924,6 +2907,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2967,6 +2951,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) if (skb_headlen(skb)) return 0; + if (skb_zcopy(tgt) || skb_zcopy(skb)) + return 0; todo = shiftlen; from = 0; @@ -3540,6 +3526,8 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; + if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) + goto err; while (pos < offset + len) { if (i >= nfrags) { @@ -4663,6 +4651,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_has_frag_list(to) || skb_has_frag_list(from)) return false; + if (skb_zcopy(to) || skb_zcopy(from)) + return false; if (skb_headlen(from) != 0) { struct page *page; -- cgit v1.2.3 From 939912216fa8f62331de7d04edff492d5dc8e6e9 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 10 Aug 2017 20:16:29 -0700 Subject: net: skb_needs_check() removes CHECKSUM_UNNECESSARY check for tx. Because we remove the UFO support, we will also remove the CHECKSUM_UNNECESSARY check in skb_needs_check(). Cc: Willem de Bruijn Signed-off-by: Tonghao Zhang Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3f69f6e71824..1024d3741d12 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2731,8 +2731,7 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_UNNECESSARY; + return skb->ip_summed != CHECKSUM_PARTIAL; return skb->ip_summed == CHECKSUM_NONE; } -- cgit v1.2.3 From 7c4974786f4794178f04e96318fc3b2f2850cbc6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 11 Aug 2017 19:41:17 +0800 Subject: net: export some generic xdp helpers This patch tries to export some generic xdp helpers to drivers. This can let driver to do XDP for a specific skb. This is useful for the case when the packet is hard to be processed at page level directly (e.g jumbo/GSO frame). With this patch, there's no need for driver to forbid the XDP set when configuration is not suitable. Instead, it can defer the XDP for packets that is hard to be processed directly after skb is created. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d238d54c484..0f1c4cb2441e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3243,6 +3243,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); } +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1024d3741d12..40b28e417072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3919,7 +3919,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3940,13 +3940,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) kfree_skb(skb); } } +EXPORT_SYMBOL_GPL(generic_xdp_tx); static struct static_key generic_xdp_needed __read_mostly; -static int do_xdp_generic(struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); - if (xdp_prog) { u32 act = netif_receive_generic_xdp(skb, xdp_prog); int err; @@ -3971,6 +3970,7 @@ out_redir: kfree_skb(skb); return XDP_DROP; } +EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { @@ -3981,7 +3981,8 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4502,7 +4503,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); if (ret != XDP_PASS) { rcu_read_unlock(); -- cgit v1.2.3 From 2facaad6000f2322eb40ca379aced31c957f0a41 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 Aug 2017 12:33:08 +0200 Subject: xdp: make generic xdp redirect use tracepoint trace_xdp_redirect If the xdp_do_generic_redirect() call fails, it trigger the trace_xdp_exception tracepoint. It seems better to use the same tracepoint trace_xdp_redirect, as the native xdp_do_redirect{,_map} does. Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 3 ++- net/core/dev.c | 4 ++-- net/core/filter.c | 35 +++++++++++++++++++++++------------ 3 files changed, 27 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7015116331af..d29e58fde364 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -718,7 +718,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software. */ -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); diff --git a/net/core/dev.c b/net/core/dev.c index 40b28e417072..270b54754821 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3953,7 +3953,8 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb); + err = xdp_do_generic_redirect(skb->dev, skb, + xdp_prog); if (err) goto out_redir; /* fallthru to submit skb */ @@ -3966,7 +3967,6 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - trace_xdp_exception(skb->dev, xdp_prog, XDP_REDIRECT); kfree_skb(skb); return XDP_DROP; } diff --git a/net/core/filter.c b/net/core/filter.c index a5e5f31b41b9..a04680331033 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2553,26 +2553,37 @@ out: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb) +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - unsigned int len; u32 index = ri->ifindex; + struct net_device *fwd; + unsigned int len; + int err = 0; - dev = dev_get_by_index_rcu(dev_net(dev), index); + fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; - if (unlikely(!dev)) { - return -EINVAL; + if (unlikely(!fwd)) { + err = -EINVAL; + goto out; } - if (unlikely(!(dev->flags & IFF_UP))) - return -ENETDOWN; - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len > len) - return -E2BIG; + if (unlikely(!(fwd->flags & IFF_UP))) { + err = -ENETDOWN; + goto out; + } - skb->dev = dev; - return 0; + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) { + err = -EMSGSIZE; + goto out; + } + + skb->dev = fwd; +out: + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); + return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3