diff options
author | David S. Miller <davem@davemloft.net> | 2018-08-13 10:07:23 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2018-08-13 10:07:23 -0700 |
commit | c1617fb4c5eea2ad38b5ffb937a732dbb137117f (patch) | |
tree | 7331bdc1fd66799533d2339816c688ded60e0475 | |
parent | 961d9735357ec59be3e3e2c37c0ca6494f04461b (diff) | |
parent | 2ce3206b9eb3943de09f3bf4ec9134568420d8b9 (diff) | |
download | linux-c1617fb4c5eea2ad38b5ffb937a732dbb137117f.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says:
====================
pull-request: bpf-next 2018-08-13
The following pull-request contains BPF updates for your *net-next* tree.
The main changes are:
1) Add driver XDP support for veth. This can be used in conjunction with
redirect of another XDP program e.g. sitting on NIC so the xdp_frame
can be forwarded to the peer veth directly without modification,
from Toshiaki.
2) Add a new BPF map type REUSEPORT_SOCKARRAY and prog type SK_REUSEPORT
in order to provide more control and visibility on where a SO_REUSEPORT
sk should be located, and the latter enables to directly select a sk
from the bpf map. This also enables map-in-map for application migration
use cases, from Martin.
3) Add a new BPF helper bpf_skb_ancestor_cgroup_id() that returns the id
of cgroup v2 that is the ancestor of the cgroup associated with the
skb at the ancestor_level, from Andrey.
4) Implement BPF fs map pretty-print support based on BTF data for regular
hash table and LRU map, from Yonghong.
5) Decouple the ability to attach BTF for a map from the key and value
pretty-printer in BPF fs, and enable further support of BTF for maps for
percpu and LPM trie, from Daniel.
6) Implement a better BPF sample of using XDP's CPU redirect feature for
load balancing SKB processing to remote CPU. The sample implements the
same XDP load balancing as Suricata does which is symmetric hash based
on IP and L4 protocol, from Jesper.
7) Revert adding NULL pointer check with WARN_ON_ONCE() in __xdp_return()'s
critical path as it is ensured that the allocator is present, from Björn.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
56 files changed, 3707 insertions, 176 deletions
diff --git a/drivers/net/veth.c b/drivers/net/veth.c index a69ad39ee57e..e3202af72df5 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -17,22 +17,47 @@ #include <net/rtnetlink.h> #include <net/dst.h> #include <net/xfrm.h> +#include <net/xdp.h> #include <linux/veth.h> #include <linux/module.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/ptr_ring.h> +#include <linux/bpf_trace.h> #define DRV_NAME "veth" #define DRV_VERSION "1.0" +#define VETH_XDP_FLAG BIT(0) +#define VETH_RING_SIZE 256 +#define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) + +/* Separating two types of XDP xmit */ +#define VETH_XDP_TX BIT(0) +#define VETH_XDP_REDIR BIT(1) + struct pcpu_vstats { u64 packets; u64 bytes; struct u64_stats_sync syncp; }; +struct veth_rq { + struct napi_struct xdp_napi; + struct net_device *dev; + struct bpf_prog __rcu *xdp_prog; + struct xdp_mem_info xdp_mem; + bool rx_notify_masked; + struct ptr_ring xdp_ring; + struct xdp_rxq_info xdp_rxq; +}; + struct veth_priv { struct net_device __rcu *peer; atomic64_t dropped; - unsigned requested_headroom; + struct bpf_prog *_xdp_prog; + struct veth_rq *rq; + unsigned int requested_headroom; }; /* @@ -98,11 +123,67 @@ static const struct ethtool_ops veth_ethtool_ops = { .get_link_ksettings = veth_get_link_ksettings, }; +/* general routines */ + +static bool veth_is_xdp_frame(void *ptr) +{ + return (unsigned long)ptr & VETH_XDP_FLAG; +} + +static void *veth_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); +} + +static void *veth_xdp_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr | VETH_XDP_FLAG); +} + +static void veth_ptr_free(void *ptr) +{ + if (veth_is_xdp_frame(ptr)) + xdp_return_frame(veth_ptr_to_xdp(ptr)); + else + kfree_skb(ptr); +} + +static void __veth_xdp_flush(struct veth_rq *rq) +{ + /* Write ptr_ring before reading rx_notify_masked */ + smp_mb(); + if (!rq->rx_notify_masked) { + rq->rx_notify_masked = true; + napi_schedule(&rq->xdp_napi); + } +} + +static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) +{ + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { + dev_kfree_skb_any(skb); + return NET_RX_DROP; + } + + return NET_RX_SUCCESS; +} + +static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, + struct veth_rq *rq, bool xdp) +{ + return __dev_forward_skb(dev, skb) ?: xdp ? + veth_xdp_rx(rq, skb) : + netif_rx(skb); +} + static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { - struct veth_priv *priv = netdev_priv(dev); + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct veth_rq *rq = NULL; struct net_device *rcv; int length = skb->len; + bool rcv_xdp = false; + int rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); @@ -111,7 +192,16 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) goto drop; } - if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) { + rcv_priv = netdev_priv(rcv); + rxq = skb_get_queue_mapping(skb); + if (rxq < rcv->real_num_rx_queues) { + rq = &rcv_priv->rq[rxq]; + rcv_xdp = rcu_access_pointer(rq->xdp_prog); + if (rcv_xdp) + skb_record_rx_queue(skb, rxq); + } + + if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); u64_stats_update_begin(&stats->syncp); @@ -122,14 +212,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) drop: atomic64_inc(&priv->dropped); } + + if (rcv_xdp) + __veth_xdp_flush(rq); + rcu_read_unlock(); + return NETDEV_TX_OK; } -/* - * general routines - */ - static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); @@ -179,18 +270,502 @@ static void veth_set_multicast_list(struct net_device *dev) { } +static struct sk_buff *veth_build_skb(void *head, int headroom, int len, + int buflen) +{ + struct sk_buff *skb; + + if (!buflen) { + buflen = SKB_DATA_ALIGN(headroom + len) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + } + skb = build_skb(head, buflen); + if (!skb) + return NULL; + + skb_reserve(skb, headroom); + skb_put(skb, len); + + return skb; +} + +static int veth_select_rxq(struct net_device *dev) +{ + return smp_processor_id() % dev->real_num_rx_queues; +} + +static int veth_xdp_xmit(struct net_device *dev, int n, + struct xdp_frame **frames, u32 flags) +{ + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct net_device *rcv; + unsigned int max_len; + struct veth_rq *rq; + int i, drops = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + rcv = rcu_dereference(priv->peer); + if (unlikely(!rcv)) + return -ENXIO; + + rcv_priv = netdev_priv(rcv); + rq = &rcv_priv->rq[veth_select_rxq(rcv)]; + /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive + * side. This means an XDP program is loaded on the peer and the peer + * device is up. + */ + if (!rcu_access_pointer(rq->xdp_prog)) + return -ENXIO; + + max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; + + spin_lock(&rq->xdp_ring.producer_lock); + for (i = 0; i < n; i++) { + struct xdp_frame *frame = frames[i]; + void *ptr = veth_xdp_to_ptr(frame); + + if (unlikely(frame->len > max_len || + __ptr_ring_produce(&rq->xdp_ring, ptr))) { + xdp_return_frame_rx_napi(frame); + drops++; + } + } + spin_unlock(&rq->xdp_ring.producer_lock); + + if (flags & XDP_XMIT_FLUSH) + __veth_xdp_flush(rq); + + return n - drops; +} + +static void veth_xdp_flush(struct net_device *dev) +{ + struct veth_priv *rcv_priv, *priv = netdev_priv(dev); + struct net_device *rcv; + struct veth_rq *rq; + + rcu_read_lock(); + rcv = rcu_dereference(priv->peer); + if (unlikely(!rcv)) + goto out; + + rcv_priv = netdev_priv(rcv); + rq = &rcv_priv->rq[veth_select_rxq(rcv)]; + /* xdp_ring is initialized on receive side? */ + if (unlikely(!rcu_access_pointer(rq->xdp_prog))) + goto out; + + __veth_xdp_flush(rq); +out: + rcu_read_unlock(); +} + +static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) +{ + struct xdp_frame *frame = convert_to_xdp_frame(xdp); + + if (unlikely(!frame)) + return -EOVERFLOW; + + return veth_xdp_xmit(dev, 1, &frame, 0); +} + +static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, + struct xdp_frame *frame, + unsigned int *xdp_xmit) +{ + void *hard_start = frame->data - frame->headroom; + void *head = hard_start - sizeof(struct xdp_frame); + int len = frame->len, delta = 0; + struct xdp_frame orig_frame; + struct bpf_prog *xdp_prog; + unsigned int headroom; + struct sk_buff *skb; + + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); + if (likely(xdp_prog)) { + struct xdp_buff xdp; + u32 act; + + xdp.data_hard_start = hard_start; + xdp.data = frame->data; + xdp.data_end = frame->data + frame->len; + xdp.data_meta = frame->data - frame->metasize; + xdp.rxq = &rq->xdp_rxq; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + switch (act) { + case XDP_PASS: + delta = frame->data - xdp.data; + len = xdp.data_end - xdp.data; + break; + case XDP_TX: + orig_frame = *frame; + xdp.data_hard_start = head; + xdp.rxq->mem = frame->mem; + if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { + trace_xdp_exception(rq->dev, xdp_prog, act); + frame = &orig_frame; + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_TX; + rcu_read_unlock(); + goto xdp_xmit; + case XDP_REDIRECT: + orig_frame = *frame; + xdp.data_hard_start = head; + xdp.rxq->mem = frame->mem; + if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { + frame = &orig_frame; + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_REDIR; + rcu_read_unlock(); + goto xdp_xmit; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(rq->dev, xdp_prog, act); + case XDP_DROP: + goto err_xdp; + } + } + rcu_read_unlock(); + + headroom = sizeof(struct xdp_frame) + frame->headroom - delta; + skb = veth_build_skb(head, headroom, len, 0); + if (!skb) { + xdp_return_frame(frame); + goto err; + } + + xdp_scrub_frame(frame); + skb->protocol = eth_type_trans(skb, rq->dev); +err: + return skb; +err_xdp: + rcu_read_unlock(); + xdp_return_frame(frame); +xdp_xmit: + return NULL; +} + +static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, + unsigned int *xdp_xmit) +{ + u32 pktlen, headroom, act, metalen; + void *orig_data, *orig_data_end; + struct bpf_prog *xdp_prog; + int mac_len, delta, off; + struct xdp_buff xdp; + + rcu_read_lock(); + xdp_prog = rcu_dereference(rq->xdp_prog); + if (unlikely(!xdp_prog)) { + rcu_read_unlock(); + goto out; + } + + mac_len = skb->data - skb_mac_header(skb); + pktlen = skb->len + mac_len; + headroom = skb_headroom(skb) - mac_len; + + if (skb_shared(skb) || skb_head_is_locked(skb) || + skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { + struct sk_buff *nskb; + int size, head_off; + void *head, *start; + struct page *page; + + size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + if (size > PAGE_SIZE) + goto drop; + + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); + if (!page) + goto drop; + + head = page_address(page); + start = head + VETH_XDP_HEADROOM; + if (skb_copy_bits(skb, -mac_len, start, pktlen)) { + page_frag_free(head); + goto drop; + } + + nskb = veth_build_skb(head, + VETH_XDP_HEADROOM + mac_len, skb->len, + PAGE_SIZE); + if (!nskb) { + page_frag_free(head); + goto drop; + } + + skb_copy_header(nskb, skb); + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + if (skb->sk) + skb_set_owner_w(nskb, skb->sk); + consume_skb(skb); + skb = nskb; + } + + xdp.data_hard_start = skb->head; + xdp.data = skb_mac_header(skb); + xdp.data_end = xdp.data + pktlen; + xdp.data_meta = xdp.data; + xdp.rxq = &rq->xdp_rxq; + orig_data = xdp.data; + orig_data_end = xdp.data_end; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + switch (act) { + case XDP_PASS: + break; + case XDP_TX: + get_page(virt_to_page(xdp.data)); + consume_skb(skb); + xdp.rxq->mem = rq->xdp_mem; + if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { + trace_xdp_exception(rq->dev, xdp_prog, act); + goto err_xdp; + } + *xdp_xmit |= VETH_XDP_TX; + rcu_read_unlock(); + goto xdp_xmit; + case XDP_REDIRECT: + get_page(virt_to_page(xdp.data)); + consume_skb(skb); + xdp.rxq->mem = rq->xdp_mem; + if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) + goto err_xdp; + *xdp_xmit |= VETH_XDP_REDIR; + rcu_read_unlock(); + goto xdp_xmit; + default: + bpf_warn_invalid_xdp_action(act); + case XDP_ABORTED: + trace_xdp_exception(rq->dev, xdp_prog, act); + case XDP_DROP: + goto drop; + } + rcu_read_unlock(); + + delta = orig_data - xdp.data; + off = mac_len + delta; + if (off > 0) + __skb_push(skb, off); + else if (off < 0) + __skb_pull(skb, -off); + skb->mac_header -= delta; + off = xdp.data_end - orig_data_end; + if (off != 0) + __skb_put(skb, off); + skb->protocol = eth_type_trans(skb, rq->dev); + + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); +out: + return skb; +drop: + rcu_read_unlock(); + kfree_skb(skb); + return NULL; +err_xdp: + rcu_read_unlock(); + page_frag_free(xdp.data); +xdp_xmit: + return NULL; +} + +static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) +{ + int i, done = 0; + + for (i = 0; i < budget; i++) { + void *ptr = __ptr_ring_consume(&rq->xdp_ring); + struct sk_buff *skb; + + if (!ptr) + break; + + if (veth_is_xdp_frame(ptr)) { + skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr), + xdp_xmit); + } else { + skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit); + } + + if (skb) + napi_gro_receive(&rq->xdp_napi, skb); + + done++; + } + + return done; +} + +static int veth_poll(struct napi_struct *napi, int budget) +{ + struct veth_rq *rq = + container_of(napi, struct veth_rq, xdp_napi); + unsigned int xdp_xmit = 0; + int done; + + xdp_set_return_frame_no_direct(); + done = veth_xdp_rcv(rq, budget, &xdp_xmit); + + if (done < budget && napi_complete_done(napi, done)) { + /* Write rx_notify_masked before reading ptr_ring */ + smp_store_mb(rq->rx_notify_masked, false); + if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { + rq->rx_notify_masked = true; + napi_schedule(&rq->xdp_napi); + } + } + + if (xdp_xmit & VETH_XDP_TX) + veth_xdp_flush(rq->dev); + if (xdp_xmit & VETH_XDP_REDIR) + xdp_do_flush_map(); + xdp_clear_return_frame_no_direct(); + + return done; +} + +static int veth_napi_add(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int err, i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); + if (err) + goto err_xdp_ring; + } + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); + napi_enable(&rq->xdp_napi); + } + + return 0; +err_xdp_ring: + for (i--; i >= 0; i--) + ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); + + return err; +} + +static void veth_napi_del(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + napi_disable(&rq->xdp_napi); + napi_hash_del(&rq->xdp_napi); + } + synchronize_net(); + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + netif_napi_del(&rq->xdp_napi); + rq->rx_notify_masked = false; + ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); + } +} + +static int veth_enable_xdp(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int err, i; + + if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); + if (err < 0) + goto err_rxq_reg; + + err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, + MEM_TYPE_PAGE_SHARED, + NULL); + if (err < 0) + goto err_reg_mem; + + /* Save original mem info as it can be overwritten */ + rq->xdp_mem = rq->xdp_rxq.mem; + } + + err = veth_napi_add(dev); + if (err) + goto err_rxq_reg; + } + + for (i = 0; i < dev->real_num_rx_queues; i++) + rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); + + return 0; +err_reg_mem: + xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); +err_rxq_reg: + for (i--; i >= 0; i--) + xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); + + return err; +} + +static void veth_disable_xdp(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) + rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); + veth_napi_del(dev); + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + rq->xdp_rxq.mem = rq->xdp_mem; + xdp_rxq_info_unreg(&rq->xdp_rxq); + } +} + static int veth_open(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); struct net_device *peer = rtnl_dereference(priv->peer); + int err; if (!peer) return -ENOTCONN; + if (priv->_xdp_prog) { + err = veth_enable_xdp(dev); + if (err) + return err; + } + if (peer->flags & IFF_UP) { netif_carrier_on(dev); netif_carrier_on(peer); } + return 0; } @@ -203,6 +778,9 @@ static int veth_close(struct net_device *dev) if (peer) netif_carrier_off(peer); + if (priv->_xdp_prog) + veth_disable_xdp(dev); + return 0; } @@ -228,7 +806,7 @@ static void veth_dev_free(struct net_device *dev) static void veth_poll_controller(struct net_device *dev) { /* veth only receives frames when its peer sends one - * Since it's a synchronous operation, we are guaranteed + * Since it has nothing to do with disabling irqs, we are guaranteed * never to have pending data when we poll for it so * there is nothing to do here. * @@ -253,6 +831,23 @@ static int veth_get_iflink(const struct net_device *dev) return iflink; } +static netdev_features_t veth_fix_features(struct net_device *dev, + netdev_features_t features) +{ + struct veth_priv *priv = netdev_priv(dev); + struct net_device *peer; + + peer = rtnl_dereference(priv->peer); + if (peer) { + struct veth_priv *peer_priv = netdev_priv(peer); + + if (peer_priv->_xdp_prog) + features &= ~NETIF_F_GSO_SOFTWARE; + } + + return features; +} + static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); @@ -276,6 +871,103 @@ out: rcu_read_unlock(); } +static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + struct veth_priv *priv = netdev_priv(dev); + struct bpf_prog *old_prog; + struct net_device *peer; + unsigned int max_mtu; + int err; + + old_prog = priv->_xdp_prog; + priv->_xdp_prog = prog; + peer = rtnl_dereference(priv->peer); + + if (prog) { + if (!peer) { + NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); + err = -ENOTCONN; + goto err; + } + + max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - + peer->hard_header_len - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + if (peer->mtu > max_mtu) { + NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); + err = -ERANGE; + goto err; + } + + if (dev->real_num_rx_queues < peer->real_num_tx_queues) { + NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); + err = -ENOSPC; + goto err; + } + + if (dev->flags & IFF_UP) { + err = veth_enable_xdp(dev); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); + goto err; + } + } + + if (!old_prog) { + peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; + peer->max_mtu = max_mtu; + } + } + + if (old_prog) { + if (!prog) { + if (dev->flags & IFF_UP) + veth_disable_xdp(dev); + + if (peer) { + peer->hw_features |= NETIF_F_GSO_SOFTWARE; + peer->max_mtu = ETH_MAX_MTU; + } + } + bpf_prog_put(old_prog); + } + + if ((!!old_prog ^ !!prog) && peer) + netdev_update_features(peer); + + return 0; +err: + priv->_xdp_prog = old_prog; + + return err; +} + +static u32 veth_xdp_query(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + const struct bpf_prog *xdp_prog; + + xdp_prog = priv->_xdp_prog; + if (xdp_prog) + return xdp_prog->aux->id; + + return 0; +} + +static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return veth_xdp_set(dev, xdp->prog, xdp->extack); + case XDP_QUERY_PROG: + xdp->prog_id = veth_xdp_query(dev); + return 0; + default: + return -EINVAL; + } +} + static const struct net_device_ops veth_netdev_ops = { .ndo_init = veth_dev_init, .ndo_open = veth_open, @@ -288,8 +980,11 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_poll_controller = veth_poll_controller, #endif .ndo_get_iflink = veth_get_iflink, + .ndo_fix_features = veth_fix_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = veth_set_rx_headroom, + .ndo_bpf = veth_xdp, + .ndo_xdp_xmit = veth_xdp_xmit, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ @@ -345,13 +1040,31 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } +static int veth_alloc_queues(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); + if (!priv->rq) + return -ENOMEM; + + return 0; +} + +static void veth_free_queues(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + kfree(priv->rq); +} + static struct rtnl_link_ops veth_link_ops; static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - int err; + int err, i; struct net_device *peer; struct veth_priv *priv; char ifname[IFNAMSIZ]; @@ -404,6 +1117,12 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, return PTR_ERR(peer); } + err = veth_alloc_queues(peer); + if (err) { + put_net(net); + goto err_peer_alloc_queues; + } + if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); @@ -432,6 +1151,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, * should be re-allocated */ + err = veth_alloc_queues(dev); + if (err) + goto err_alloc_queues; + if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); @@ -451,19 +1174,28 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, */ priv = netdev_priv(dev); + for (i = 0; i < dev->real_num_rx_queues; i++) + priv->rq[i].dev = dev; rcu_assign_pointer(priv->peer, peer); priv = netdev_priv(peer); + for (i = 0; i < peer->real_num_rx_queues; i++) + priv->rq[i].dev = peer; rcu_assign_pointer(priv->peer, dev); + return 0; err_register_dev: + veth_free_queues(dev); +err_alloc_queues: /* nothing to do */ err_configure_peer: unregister_netdevice(peer); return err; err_register_peer: + veth_free_queues(peer); +err_peer_alloc_queues: free_netdev(peer); return err; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd8790d2c6ed..523481a3471b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,7 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; -struct btf; +struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { @@ -48,8 +48,9 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf, - u32 key_type_id, u32 value_type_id); + int (*map_check_btf)(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); }; struct bpf_map { @@ -118,9 +119,13 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map) static inline bool bpf_map_support_seq_show(const struct bpf_map *map) { - return map->ops->map_seq_show_elem && map->ops->map_check_btf; + return map->btf && map->ops->map_seq_show_elem; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ @@ -524,6 +529,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); +int array_map_alloc_check(union bpf_attr *attr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) @@ -769,6 +775,33 @@ static inline void __xsk_map_flush(struct bpf_map *map) } #endif +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); +#else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL +static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, + void *key, void *value) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, + void *key, void *value, + u64 map_flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_BPF_SYSCALL */ +#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index add08be53b6f..cd26c090e7c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #endif +#ifdef CONFIG_INET +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) @@ -60,4 +63,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif +#ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) +#endif #endif diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c9fdf6f57913..32c553556bbd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -554,6 +554,36 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, } /** + * cgroup_ancestor - find ancestor of cgroup + * @cgrp: cgroup to find ancestor of + * @ancestor_level: level of ancestor to find starting from root + * + * Find ancestor of cgroup at specified level starting from root if it exists + * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at + * @ancestor_level. + * + * This function is safe to call as long as @cgrp is accessible. + */ +static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, + int ancestor_level) +{ + struct cgroup *ptr; + + if (cgrp->level < ancestor_level) + return NULL; + + for (ptr = cgrp; + ptr && ptr->level > ancestor_level; + ptr = cgroup_parent(ptr)) + ; + + if (ptr && ptr->level == ancestor_level) + return ptr; + + return NULL; +} + +/** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup diff --git a/include/linux/filter.h b/include/linux/filter.h index c73dd7396886..5d565c50bcb2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -32,6 +32,7 @@ struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; +struct sock_reuseport; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -537,6 +538,20 @@ struct sk_msg_buff { struct list_head list; }; +struct bpf_redirect_info { + u32 ifindex; + u32 flags; + struct bpf_map *map; + struct bpf_map *map_to_flush; + unsigned long map_owner; + u32 kern_flags; +}; + +DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); + +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -738,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); +void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); @@ -765,6 +781,27 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline bool xdp_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_set_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +} + +static inline void xdp_clear_return_frame_no_direct(void) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +} + static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { @@ -798,6 +835,20 @@ void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_msg_redirect_map(struct sk_msg_buff *md); +#ifdef CONFIG_INET +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash); +#else +static inline struct sock * +bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7e237a63a70c..17a13e4785fc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1038,6 +1038,7 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, } struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); +void skb_headers_offset_update(struct sk_buff *skb, int off); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 5f43f7a70fe6..6def0351bcc3 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); +bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 0054b3a9b923..8a5f70c7cdf2 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -5,25 +5,36 @@ #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/types.h> +#include <linux/spinlock.h> #include <net/sock.h> +extern spinlock_t reuseport_lock; + struct sock_reuseport { struct rcu_head rcu; u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ + /* The last synq overflow event timestamp of this + * reuse->socks[] group. + */ + unsigned int synq_overflow_ts; + /* ID stays the same even after the size of socks[] grows. */ + unsigned int reuseport_id; + bool bind_inany; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; -extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); +extern int reuseport_alloc(struct sock *sk, bool bind_inany); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, + bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); -extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, - struct bpf_prog *prog); +extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); +int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d769dc20359b..d196901c9dba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -36,6 +36,7 @@ #include <net/inet_hashtables.h> #include <net/checksum.h> #include <net/request_sock.h> +#include <net/sock_reuseport.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> @@ -473,9 +474,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); */ static inline void tcp_synq_overflow(const struct sock *sk) { - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow; unsigned int now = jiffies; + if (sk->sk_reuseport) { + struct sock_reuseport *reuse; + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); + if (time_after32(now, last_overflow + HZ)) + WRITE_ONCE(reuse->synq_overflow_ts, now); + return; + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; if (time_after32(now, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } @@ -483,9 +497,21 @@ static inline void tcp_synq_overflow(const struct sock *sk) /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow; unsigned int now = jiffies; + if (sk->sk_reuseport) { + struct sock_reuseport *reuse; + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); + return time_after32(now, last_overflow + + TCP_SYNCOOKIE_VALID); + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); } diff --git a/include/net/xdp.h b/include/net/xdp.h index fcb033f51d8c..76b95256c266 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -84,6 +84,13 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +/* Clear kernel pointers in xdp_frame */ +static inline void xdp_scrub_frame(struct xdp_frame *frame) +{ + frame->data = NULL; + frame->dev_rx = NULL; +} + /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd5758dc35d3..66917a4eba27 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -126,6 +126,7 @@ enum bpf_map_type { BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, }; enum bpf_prog_type { @@ -150,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2091,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based @@ -2113,6 +2133,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map + * It checks the selected sk is matching the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2196,7 +2224,9 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(sk_select_reuseport), \ + FN(skb_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2413,6 +2443,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e8906cbad81f..0488b8258321 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -23,3 +23,6 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2aa55d030c77..0c17aab3ce5f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -54,7 +54,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static int array_map_alloc_check(union bpf_attr *attr) +int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); @@ -358,27 +358,20 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, - u32 btf_key_id, u32 btf_value_id) +static int array_map_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) { - const struct btf_type *key_type, *value_type; - u32 key_size, value_size; u32 int_data; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes - * sure that the btf matches the attr used during map_create. + /* bpf array can only take a u32 key. This check makes sure + * that the btf matches the attr used during map_create. */ - if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || - BTF_INT_OFFSET(int_data)) - return -EINVAL; - - value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size != map->value_size) + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; @@ -405,6 +398,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_check_btf = array_map_check_btf, }; static int fd_array_map_alloc_check(union bpf_attr *attr) @@ -546,6 +540,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, + .map_check_btf = map_check_no_btf, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -634,6 +629,7 @@ const struct bpf_map_ops perf_event_array_map_ops = { .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, + .map_check_btf = map_check_no_btf, }; #ifdef CONFIG_CGROUPS @@ -665,6 +661,7 @@ const struct bpf_map_ops cgroup_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, + .map_check_btf = map_check_no_btf, }; #endif @@ -749,4 +746,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 46f5f29605d4..620bc5024d7d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -555,6 +555,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_update_elem = cpu_map_update_elem, .map_lookup_elem = cpu_map_lookup_elem, .map_get_next_key = cpu_map_get_next_key, + .map_check_btf = map_check_no_btf, }; static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 750d45edae79..ac1df79f3788 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -488,6 +488,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int dev_map_notification(struct notifier_block *notifier, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 513d9dfcf4ee..04b8eda94e7d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -11,9 +11,11 @@ * General Public License for more details. */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> +#include <uapi/linux/btf.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -1162,6 +1164,27 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } +static void htab_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = htab_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1171,6 +1194,7 @@ const struct bpf_map_ops htab_map_ops = { .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, }; const struct bpf_map_ops htab_lru_map_ops = { @@ -1182,6 +1206,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, + .map_seq_show_elem = htab_map_seq_show_elem, }; /* Called from eBPF program */ @@ -1408,4 +1433,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = htab_of_map_gen_lookup, + .map_check_btf = map_check_no_btf, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 76efe9a183f5..2ada5e21dfa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -196,19 +196,21 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; + void *prev_key; if (map_iter(m)->done) return NULL; if (unlikely(v == SEQ_START_TOKEN)) - goto done; + prev_key = NULL; + else + prev_key = key; - if (map->ops->map_get_next_key(map, key, key)) { + if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; return NULL; } -done: ++(*pos); return key; } @@ -332,7 +334,8 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, - map->btf ? &bpffs_map_fops : &bpffs_obj_fops); + bpf_map_support_seq_show(map) ? + &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index fc4e37f68f2a..22ad967d1e5f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -246,6 +246,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_lookup_elem = cgroup_storage_lookup_elem, .map_update_elem = cgroup_storage_update_elem, .map_delete_elem = cgroup_storage_delete_elem, + .map_check_btf = map_check_no_btf, }; int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1603492c9cc7..9058317ba9de 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -10,11 +10,13 @@ */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <net/ipv6.h> +#include <uapi/linux/btf.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -686,6 +688,15 @@ free_stack: return err; } +static int trie_check_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + /* Keys must have struct bpf_lpm_trie_key embedded. */ + return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? + -EINVAL : 0; +} + const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, @@ -693,4 +704,5 @@ const struct bpf_map_ops trie_map_ops = { .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, + .map_check_btf = trie_check_btf, }; diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c new file mode 100644 index 000000000000..18e225de80ff --- /dev/null +++ b/kernel/bpf/reuseport_array.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Facebook + */ +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/sock_diag.h> +#include <net/sock_reuseport.h> + +struct reuseport_array { + struct bpf_map map; + struct sock __rcu *ptrs[]; +}; + +static struct reuseport_array *reuseport_array(struct bpf_map *map) +{ + return (struct reuseport_array *)map; +} + +/* The caller must hold the reuseport_lock */ +void bpf_sk_reuseport_detach(struct sock *sk) +{ + struct sock __rcu **socks; + + write_lock_bh(&sk->sk_callback_lock); + socks = sk->sk_user_data; + if (socks) { + WRITE_ONCE(sk->sk_user_data, NULL); + /* + * Do not move this NULL assignment outside of + * sk->sk_callback_lock because there is + * a race with reuseport_array_free() + * which does not hold the reuseport_lock. + */ + RCU_INIT_POINTER(*socks, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); +} + +static int reuseport_array_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size != sizeof(u32) && + attr->value_size != sizeof(u64)) + return -EINVAL; + + return array_map_alloc_check(attr); +} + +static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return rcu_dereference(array->ptrs[index]); +} + +/* Called from syscall only */ +static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = *(u32 *)key; + struct sock *sk; + int err; + + if (index >= map->max_entries) + return -E2BIG; + + if (!rcu_access_pointer(array->ptrs[index])) + return -ENOENT; + + spin_lock_bh(&reuseport_lock); + + sk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + WRITE_ONCE(sk->sk_user_data, NULL); + RCU_INIT_POINTER(array->ptrs[index], NULL); + write_unlock_bh(&sk->sk_callback_lock); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&reuseport_lock); + + return err; +} + +static void reuseport_array_free(struct bpf_map *map) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *sk; + u32 i; + + synchronize_rcu(); + + /* + * ops->map_*_elem() will not be able to access this + * array now. Hence, this function only races with + * bpf_sk_reuseport_detach() which was triggerred by + * close() or disconnect(). + * + * This function and bpf_sk_reuseport_detach() are + * both removing sk from "array". Who removes it + * first does not matter. + * + * The only concern here is bpf_sk_reuseport_detach() + * may access "array" which is being freed here. + * bpf_sk_reuseport_detach() access this "array" + * through sk->sk_user_data _and_ with sk->sk_callback_lock + * held which is enough because this "array" is not freed + * until all sk->sk_user_data has stopped referencing this "array". + * + * Hence, due to the above, taking "reuseport_lock" is not + * needed here. + */ + + /* + * Since reuseport_lock is not taken, sk is accessed under + * rcu_read_lock() + */ + rcu_read_lock(); + for (i = 0; i < map->max_entries; i++) { + sk = rcu_dereference(array->ptrs[i]); + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + /* + * No need for WRITE_ONCE(). At this point, + * no one is reading it without taking the + * sk->sk_callback_lock. + */ + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); + RCU_INIT_POINTER(array->ptrs[i], NULL); + } + } + rcu_read_unlock(); + + /* + * Once reaching here, all sk->sk_user_data is not + * referenceing this "array". "array" can be freed now. + */ + bpf_map_area_free(array); +} + +static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) +{ + int err, numa_node = bpf_map_attr_numa_node(attr); + struct reuseport_array *array; + u64 cost, array_size; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + array_size = sizeof(*array); + array_size += (u64)attr->max_entries * sizeof(struct sock *); + + /* make sure there is no u32 overflow later in round_up() */ + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(cost); + if (err) + return ERR_PTR(err); + + /* allocate all map elements and zero-initialize them */ + array = bpf_map_area_alloc(array_size, numa_node); + if (!array) + return ERR_PTR(-ENOMEM); + + /* copy mandatory map attributes */ + bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; + + return &array->map; +} + +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value) +{ + struct sock *sk; + int err; + + if (map->value_size != sizeof(u64)) + return -ENOSPC; + + rcu_read_lock(); + sk = reuseport_array_lookup_elem(map, key); + if (sk) { + *(u64 *)value = sock_gen_cookie(sk); + err = 0; + } else { + err = -ENOENT; + } + rcu_read_unlock(); + + return err; +} + +static int +reuseport_array_update_check(const struct reuseport_array *array, + const struct sock *nsk, + const struct sock *osk, + const struct sock_reuseport *nsk_reuse, + u32 map_flags) +{ + if (osk && map_flags == BPF_NOEXIST) + return -EEXIST; + + if (!osk && map_flags == BPF_EXIST) + return -ENOENT; + + if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) + return -ENOTSUPP; + + if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) + return -ENOTSUPP; + + if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) + return -ENOTSUPP; + + /* + * sk must be hashed (i.e. listening in the TCP case or binded + * in the UDP case) and + * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). + * + * Also, sk will be used in bpf helper that is protected by + * rcu_read_lock(). + */ + if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) + return -EINVAL; + + /* READ_ONCE because the sk->sk_callback_lock may not be held here */ + if (READ_ONCE(nsk->sk_user_data)) + return -EBUSY; + + return 0; +} + +/* + * Called from syscall only. + * The "nsk" in the fd refcnt. + * The "osk" and "reuse" are protected by reuseport_lock. + */ +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct reuseport_array *array = reuseport_array(map); + struct sock *free_osk = NULL, *osk, *nsk; + struct sock_reuseport *reuse; + u32 index = *(u32 *)key; + struct socket *socket; + int err, fd; + + if (map_flags > BPF_EXIST) + return -EINVAL; + + if (index >= map->max_entries) + return -E2BIG; + + if (map->value_size == sizeof(u64)) { + u64 fd64 = *(u64 *)value; + + if (fd64 > S32_MAX) + return -EINVAL; + fd = fd64; + } else { + fd = *(int *)value; + } + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + nsk = socket->sk; + if (!nsk) { + err = -EINVAL; + goto put_file; + } + + /* Quick checks before taking reuseport_lock */ + err = reuseport_array_update_check(array, nsk, + rcu_access_pointer(array->ptrs[index]), + rcu_access_pointer(nsk->sk_reuseport_cb), + map_flags); + if (err) + goto put_file; + + spin_lock_bh(&reuseport_lock); + /* + * Some of the checks only need reuseport_lock + * but it is done under sk_callback_lock also + * for simplicity reason. + */ + write_lock_bh(&nsk->sk_callback_lock); + + osk = rcu_dereference_protected(array->ptrs[index], + lockdep_is_held(&reuseport_lock)); + reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); + if (err) + goto put_file_unlock; + + /* Ensure reuse->reuseport_id is set */ + err = reuseport_get_id(reuse); + if (err < 0) + goto put_file_unlock; + + WRITE_ONCE(nsk->sk_user_data, &array->ptrs[index]); + rcu_assign_pointer(array->ptrs[index], nsk); + free_osk = osk; + err = 0; + +put_file_unlock: + write_unlock_bh(&nsk->sk_callback_lock); + + if (free_osk) { + write_lock_bh(&free_osk->sk_callback_lock); + WRITE_ONCE(free_osk->sk_user_data, NULL); + write_unlock_bh(&free_osk->sk_callback_lock); + } + + spin_unlock_bh(&reuseport_lock); +put_file: + fput(socket->file); + return err; +} + +/* Called from syscall */ +static int reuseport_array_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + struct reuseport_array *array = reuseport_array(map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +const struct bpf_map_ops reuseport_array_ops = { + .map_alloc_check = reuseport_array_alloc_check, + .map_alloc = reuseport_array_alloc, + .map_free = reuseport_array_free, + .map_lookup_elem = reuseport_array_lookup_elem, + .map_get_next_key = reuseport_array_get_next_key, + .map_delete_elem = reuseport_array_delete_elem, +}; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 70c0755e8fc4..0c1a696b041b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2498,6 +2498,7 @@ const struct bpf_map_ops sock_map_ops = { .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; const struct bpf_map_ops sock_hash_ops = { @@ -2508,6 +2509,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, .map_release_uref = sock_map_release, + .map_check_btf = map_check_no_btf, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b675a3f3d141..8061a439ef18 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -607,6 +607,7 @@ const struct bpf_map_ops stack_map_ops = { .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, + .map_check_btf = map_check_no_btf, }; static int __init stack_map_init(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5af4e9e2722d..43727ed0d94a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -103,6 +103,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, const struct bpf_map_ops bpf_map_offload_ops = { .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, + .map_check_btf = map_check_no_btf, }; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) @@ -455,6 +456,34 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } +int map_check_no_btf(const struct bpf_map *map, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + return -ENOTSUPP; +} + +static int map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + int ret = 0; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size != map->value_size) + return -EINVAL; + + if (map->ops->map_check_btf) + ret = map->ops->map_check_btf(map, key_type, value_type); + + return ret; +} + #define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -489,8 +518,7 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - if (bpf_map_support_seq_show(map) && - (attr->btf_key_type_id || attr->btf_value_type_id)) { + if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; if (!attr->btf_key_type_id || !attr->btf_value_type_id) { @@ -504,8 +532,8 @@ static int map_create(union bpf_attr *attr) goto free_map_nouncharge; } - err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, - attr->btf_value_type_id); + err = map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); if (err) { btf_put(btf); goto free_map_nouncharge; @@ -684,6 +712,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -790,6 +820,10 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_htab_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { + /* rcu_read_lock() is not needed */ + err = bpf_fd_reuseport_array_update_elem(map, key, value, + attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587468a9c37d..ca90679a7fe5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) goto error; break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index b3c557476a8d..4ddf61e158f6 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -227,6 +227,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_lookup_elem = xsk_map_lookup_elem, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, }; - - diff --git a/net/core/filter.c b/net/core/filter.c index 587bbfbd7db3..15b9d2df92ca 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } -static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) -{ - struct bpf_prog *old_prog; - int err; - - if (bpf_prog_size(prog->len) > sysctl_optmem_max) - return -ENOMEM; - - if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk); - if (err) - return err; - } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { - /* The socket wasn't bound with SO_REUSEPORT */ - return -EINVAL; - } - - old_prog = reuseport_attach_prog(sk, prog); - if (old_prog) - bpf_prog_destroy(old_prog); - - return 0; -} - static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + err = -ENOMEM; + else + err = reuseport_attach_prog(sk, prog); + + if (err) __bpf_prog_release(prog); - return err; - } - return 0; + return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) @@ -1586,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog = __get_bpf(ufd, sk); + struct bpf_prog *prog; int err; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { - bpf_prog_put(prog); - return err; + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER + * bpf prog (e.g. sockmap). It depends on the + * limitation imposed by bpf_prog_load(). + * Hence, sysctl_optmem_max is not checked. + */ + if ((sk->sk_type != SOCK_STREAM && + sk->sk_type != SOCK_DGRAM) || + (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) || + (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6)) { + err = -ENOTSUPP; + goto err_prog_put; + } + } else { + /* BPF_PROG_TYPE_SOCKET_FILTER */ + if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + err = -ENOMEM; + goto err_prog_put; + } } - return 0; + err = reuseport_attach_prog(sk, prog); +err_prog_put: + if (err) + bpf_prog_put(prog); + + return err; +} + +void sk_reuseport_prog_free(struct bpf_prog *prog) +{ + if (!prog) + return; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + bpf_prog_put(prog); + else + bpf_prog_destroy(prog); } struct bpf_scratchpad { @@ -2082,19 +2099,12 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; -struct redirect_info { - u32 ifindex; - u32 flags; - struct bpf_map *map; - struct bpf_map *map_to_flush; - unsigned long map_owner; -}; - -static DEFINE_PER_CPU(struct redirect_info, redirect_info); +DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; @@ -2107,7 +2117,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) int skb_do_redirect(struct sk_buff *skb) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); @@ -3200,7 +3210,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, void xdp_do_flush_map(void) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; @@ -3245,7 +3255,7 @@ static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3285,7 +3295,7 @@ err: int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *fwd; u32 index = ri->ifindex; int err; @@ -3317,7 +3327,7 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; @@ -3368,7 +3378,7 @@ err: int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; @@ -3399,7 +3409,7 @@ EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3423,7 +3433,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, unsigned long, map_owner) { - struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; @@ -3768,6 +3778,32 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; + +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *ancestor; + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + + return ancestor->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { + .func = bpf_skb_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -4956,6 +4992,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; #endif default: return bpf_base_func_proto(func_id); @@ -7020,3 +7058,270 @@ out: release_sock(sk); return ret; } + +#ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; + +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + action = BPF_PROG_RUN(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + struct sock_reuseport *reuse; + struct sock *selected_sk; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) + /* selected_sk is unhashed (e.g. by close()) after the + * above map_lookup_elem(). Treat selected_sk has already + * been removed from the map. + */ + return -ENOENT; + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk; + + if (unlikely(!reuse_kern->reuseport_id)) + /* There is a small race between adding the + * sk to the map and setting the + * reuse_kern->reuseport_id. + * Treat it as the sk has not been added to + * the bpf map yet. + */ + return -ENOENT; + + sk = reuse_kern->sk; + if (sk->sk_protocol != selected_sk->sk_protocol) + return -EPROTOTYPE; + else if (sk->sk_family != selected_sk->sk_family) + return -EAFNOSUPPORT; + + /* Catch all. Likely bound to a different sockaddr. */ + return -EBADFD; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + /* Fields that allow narrowing */ + case offsetof(struct sk_reuseport_md, eth_protocol): + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) + return false; + case offsetof(struct sk_reuseport_md, ip_protocol): + case offsetof(struct sk_reuseport_md, bind_inany): + case offsetof(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD, BPF_SIZE, EXTRA_OFF) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, + BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian + * aware. No further narrowing or masking is needed. + */ + *target_size = 1; + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; +#endif /* CONFIG_INET */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8d574a88125d..c996c09d095f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1291,7 +1291,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) } EXPORT_SYMBOL(skb_clone); -static void skb_headers_offset_update(struct sk_buff *skb, int off) +void skb_headers_offset_update(struct sk_buff *skb, int off) { /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) @@ -1305,6 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->inner_network_header += off; skb->inner_mac_header += off; } +EXPORT_SYMBOL(skb_headers_offset_update); void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 064acb04be0f..ba5cba56f574 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -8,11 +8,34 @@ #include <net/sock_reuseport.h> #include <linux/bpf.h> +#include <linux/idr.h> +#include <linux/filter.h> #include <linux/rcupdate.h> #define INIT_SOCKS 128 -static DEFINE_SPINLOCK(reuseport_lock); +DEFINE_SPINLOCK(reuseport_lock); + +#define REUSEPORT_MIN_ID 1 +static DEFINE_IDA(reuseport_ida); + +int reuseport_get_id(struct sock_reuseport *reuse) +{ + int id; + + if (reuse->reuseport_id) + return reuse->reuseport_id; + + id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0, + /* Called under reuseport_lock */ + GFP_ATOMIC); + if (id < 0) + return id; + + reuse->reuseport_id = id; + + return reuse->reuseport_id; +} static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { @@ -29,7 +52,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) return reuse; } -int reuseport_alloc(struct sock *sk) +int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; @@ -41,9 +64,17 @@ int reuseport_alloc(struct sock *sk) /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ - if (rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock))) + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. + */ + if (bind_inany) + reuse->bind_inany = bind_inany; goto out; + } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { @@ -53,6 +84,7 @@ int reuseport_alloc(struct sock *sk) reuse->socks[0] = sk; reuse->num_socks = 1; + reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -78,9 +110,12 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->max_socks = more_socks_size; more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; + more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); for (i = 0; i < reuse->num_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, @@ -99,8 +134,9 @@ static void reuseport_free_rcu(struct rcu_head *head) struct sock_reuseport *reuse; reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); + if (reuse->reuseport_id) + ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); } @@ -110,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { - int err = reuseport_alloc(sk2); + int err = reuseport_alloc(sk2, bind_inany); if (err) return err; @@ -160,6 +196,14 @@ void reuseport_detach_sock(struct sock *sk) spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); + + /* At least one of the sk in this reuseport group is added to + * a bpf map. Notify the bpf side. The bpf map logic will + * remove the sk if it is indeed added to a bpf map. + */ + if (reuse->reuseport_id) + bpf_sk_reuseport_detach(sk); + rcu_assign_pointer(sk->sk_reuseport_cb, NULL); for (i = 0; i < reuse->num_socks; i++) { @@ -175,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk) } EXPORT_SYMBOL(reuseport_detach_sock); -static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, - struct bpf_prog *prog, struct sk_buff *skb, - int hdr_len) +static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) { struct sk_buff *nskb = NULL; u32 index; @@ -238,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk, /* paired with smp_wmb() in reuseport_add_sock() */ smp_rmb(); - if (prog && skb) - sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + if (!prog || !skb) + goto select_by_hash; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + else + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); +select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; @@ -252,12 +302,21 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); -struct bpf_prog * -reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; + if (sk_unhashed(sk) && sk->sk_reuseport) { + int err = reuseport_alloc(sk, false); + + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); @@ -266,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) rcu_assign_pointer(reuse->prog, prog); spin_unlock_bh(&reuseport_lock); - return old_prog; + sk_reuseport_prog_free(old_prog); + return 0; } EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/core/xdp.c b/net/core/xdp.c index c013b836006b..3dd99e1c04f5 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -330,10 +330,12 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (xa) + if (xa) { + napi_direct &= !xdp_return_frame_no_direct(); page_pool_put_page(xa->page_pool, page, napi_direct); - else + } else { put_page(page); + } rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: @@ -348,8 +350,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, rcu_read_lock(); /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - if (!WARN_ON_ONCE(!xa)) - xa->zc_alloc->free(xa->zc_alloc, handle); + xa->zc_alloc->free(xa->zc_alloc, handle); rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 33a88e045efd..dfd5009f96ef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, } EXPORT_SYMBOL(inet_rcv_saddr_equal); +bool inet_rcv_saddr_any(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#endif + return !sk->sk_rcv_saddr; +} + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3647167c8fa3..f5c9ef2586de 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with INADDR_ANY */ @@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each_rcu(sk, &ilb->head) { @@ -352,12 +353,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); @@ -567,10 +571,11 @@ static int inet_reuseport_add_sock(struct sock *sk, inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 060e841dde40..f4e35b2ff8b8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** @@ -498,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -512,6 +515,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 595ad408dba0..3d7c7460a0c5 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with in6addr_any */ @@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet6_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each(sk, &ilb->head) { @@ -214,12 +215,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6b96956a8ed..83f4c77c79d8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -249,6 +251,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } diff --git a/samples/bpf/hash_func01.h b/samples/bpf/hash_func01.h new file mode 100644 index 000000000000..38255812e376 --- /dev/null +++ b/samples/bpf/hash_func01.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: LGPL-2.1 + * + * Based on Paul Hsieh's (LGPG 2.1) hash function + * From: http://www.azillionmonkeys.com/qed/hash.html + */ + +#define get16bits(d) (*((const __u16 *) (d))) + +static __always_inline +__u32 SuperFastHash (const char *data, int len, __u32 initval) { + __u32 hash = initval; + __u32 tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ +#pragma clang loop unroll(full) + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (__u16); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= ((signed char)data[sizeof (__u16)]) << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += (signed char)*data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c index 0cc3d71057f0..a306d1c75622 100644 --- a/samples/bpf/xdp_redirect_cpu_kern.c +++ b/samples/bpf/xdp_redirect_cpu_kern.c @@ -13,6 +13,7 @@ #include <uapi/linux/bpf.h> #include "bpf_helpers.h" +#include "hash_func01.h" #define MAX_CPUS 64 /* WARNING - sync with _user.c */ @@ -461,6 +462,108 @@ int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) return bpf_redirect_map(&cpu_map, cpu_dest, 0); } +/* Hashing initval */ +#define INITVAL 15485863 + +static __always_inline +u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + u32 cpu_hash; + + if (iph + 1 > data_end) + return 0; + + cpu_hash = iph->saddr + iph->daddr; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol); + + return cpu_hash; +} + +static __always_inline +u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + u32 cpu_hash; + + if (ip6h + 1 > data_end) + return 0; + + cpu_hash = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0]; + cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1]; + cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2]; + cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3]; + cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr); + + return cpu_hash; +} + +/* Load-Balance traffic based on hashing IP-addrs + L4-proto. The + * hashing scheme is symmetric, meaning swapping IP src/dest still hit + * same CPU. + */ +SEC("xdp_cpu_map5_lb_hash_ip_pairs") +int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 *cpu_max; + u32 cpu_hash; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key); + if (!cpu_max) + return XDP_ABORTED; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Hash for IPv4 and IPv6 */ + switch (eth_proto) { + case ETH_P_IP: + cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_IPV6: + cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset); + break; + case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */ + default: + cpu_hash = 0; + } + + /* Choose CPU based on hash */ + cpu_idx = cpu_hash % *cpu_max; + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 4b4d78fffe30..9a6c7e0a6dd1 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -22,7 +22,7 @@ static const char *__doc__ = #define MAX_CPUS 64 /* WARNING - sync with _kern.c */ /* How many xdp_progs are defined in _kern.c */ -#define MAX_PROG 5 +#define MAX_PROG 6 /* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead * use bpf/libbpf.h), but cannot as (currently) needed for XDP @@ -567,7 +567,7 @@ int main(int argc, char **argv) int added_cpus = 0; int longindex = 0; int interval = 2; - int prog_num = 0; + int prog_num = 5; int add_cpu = -1; __u32 qsize; int opt; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index dd5758dc35d3..66917a4eba27 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -126,6 +126,7 @@ enum bpf_map_type { BPF_MAP_TYPE_XSKMAP, BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, }; enum bpf_prog_type { @@ -150,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2091,6 +2093,24 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based @@ -2113,6 +2133,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map + * It checks the selected sk is matching the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2196,7 +2224,9 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(sk_select_reuseport), \ + FN(skb_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2413,6 +2443,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 9ddc89dae962..60aa4ca8b2c5 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -92,6 +92,7 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) attr.btf_key_type_id = create_attr->btf_key_type_id; attr.btf_value_type_id = create_attr->btf_value_type_id; attr.map_ifindex = create_attr->map_ifindex; + attr.inner_map_fd = create_attr->inner_map_fd; return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 0639a30a457d..6f38164b2618 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -39,6 +39,7 @@ struct bpf_create_map_attr { __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 map_ifindex; + __u32 inner_map_fd; }; int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 40211b51427a..2abd0f112627 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1501,6 +1501,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_SK_REUSEPORT: return false; case BPF_PROG_TYPE_UNSPEC: case BPF_PROG_TYPE_KPROBE: diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 17a7a5818ee1..fff7fb1285fc 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -23,7 +23,7 @@ $(TEST_CUSTOM_PROGS): $(OUTPUT)/%: %.c TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ - test_socket_cookie test_cgroup_storage + test_socket_cookie test_cgroup_storage test_select_reuseport TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ @@ -34,7 +34,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \ test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ - get_cgroup_id_kern.o socket_cookie_prog.o + get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ + test_skb_cgroup_id_kern.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ @@ -45,10 +46,11 @@ TEST_PROGS := test_kmod.sh \ test_sock_addr.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ - test_lirc_mode2.sh + test_lirc_mode2.sh \ + test_skb_cgroup_id.sh # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr +TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user include ../lib.mk @@ -59,6 +61,7 @@ $(TEST_GEN_PROGS): $(BPFOBJ) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c +$(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_sock: cgroup_helpers.c $(OUTPUT)/test_sock_addr: cgroup_helpers.c $(OUTPUT)/test_socket_cookie: cgroup_helpers.c diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 9ba1c72d7cf5..e4be7730222d 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -111,6 +111,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) = static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, int size, int flags) = (void *) BPF_FUNC_skb_get_xfrm_state; +static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) = + (void *) BPF_FUNC_sk_select_reuseport; static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = (void *) BPF_FUNC_get_stack; static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, @@ -137,6 +139,10 @@ static unsigned long long (*bpf_get_current_cgroup_id)(void) = (void *) BPF_FUNC_get_current_cgroup_id; static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = (void *) BPF_FUNC_get_local_storage; +static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = + (void *) BPF_FUNC_skb_cgroup_id; +static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = + (void *) BPF_FUNC_skb_ancestor_cgroup_id; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions @@ -173,6 +179,8 @@ struct bpf_map_def { static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = (void *) BPF_FUNC_skb_load_bytes; +static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) = + (void *) BPF_FUNC_skb_load_bytes_relative; static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = (void *) BPF_FUNC_skb_store_bytes; static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index d0811b3d6a6f..315a44fa32af 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -44,4 +44,8 @@ static inline unsigned int bpf_num_possible_cpus(void) name[bpf_num_possible_cpus()] #define bpf_percpu(name, cpu) name[(cpu)].v +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + #endif /* __BPF_UTIL__ */ diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index 6b1b302310fe..5f377ec53f2f 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -18,10 +18,7 @@ #include "../../../include/linux/filter.h" #include "bpf_rlimit.h" - -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif +#include "bpf_util.h" #define MAX_INSNS 512 #define MAX_MATCHES 16 diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index ffdd27737c9e..6b5cfeb7a9cc 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -19,6 +19,7 @@ #include <bpf/btf.h> #include "bpf_rlimit.h" +#include "bpf_util.h" static uint32_t pass_cnt; static uint32_t error_cnt; @@ -93,10 +94,6 @@ static int __base_pr(const char *format, ...) #define MAX_NR_RAW_TYPES 1024 #define BTF_LOG_BUF_SIZE 65535 -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - static struct args { unsigned int raw_test_num; unsigned int file_test_num; @@ -131,6 +128,8 @@ struct btf_raw_test { __u32 max_entries; bool btf_load_err; bool map_create_err; + bool ordered_map; + bool lossless_map; int hdr_len_delta; int type_off_delta; int str_off_delta; @@ -2093,8 +2092,7 @@ struct pprint_mapv { } aenum; }; -static struct btf_raw_test pprint_test = { - .descr = "BTF pretty print test #1", +static struct btf_raw_test pprint_test_template = { .raw_types = { /* unsighed char */ /* [1] */ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1), @@ -2146,8 +2144,6 @@ static struct btf_raw_test pprint_test = { }, .str_sec = "\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum", .str_sec_size = sizeof("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum"), - .map_type = BPF_MAP_TYPE_ARRAY, - .map_name = "pprint_test", .key_size = sizeof(unsigned int), .value_size = sizeof(struct pprint_mapv), .key_type_id = 3, /* unsigned int */ @@ -2155,6 +2151,40 @@ static struct btf_raw_test pprint_test = { .max_entries = 128 * 1024, }; +static struct btf_pprint_test_meta { + const char *descr; + enum bpf_map_type map_type; + const char *map_name; + bool ordered_map; + bool lossless_map; +} pprint_tests_meta[] = { +{ + .descr = "BTF pretty print array", + .map_type = BPF_MAP_TYPE_ARRAY, + .map_name = "pprint_test_array", + .ordered_map = true, + .lossless_map = true, +}, + +{ + .descr = "BTF pretty print hash", + .map_type = BPF_MAP_TYPE_HASH, + .map_name = "pprint_test_hash", + .ordered_map = false, + .lossless_map = true, +}, + +{ + .descr = "BTF pretty print lru hash", + .map_type = BPF_MAP_TYPE_LRU_HASH, + .map_name = "pprint_test_lru_hash", + .ordered_map = false, + .lossless_map = false, +}, + +}; + + static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i) { v->ui32 = i; @@ -2166,10 +2196,12 @@ static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i) v->aenum = i & 0x03; } -static int test_pprint(void) +static int do_test_pprint(void) { - const struct btf_raw_test *test = &pprint_test; + const struct btf_raw_test *test = &pprint_test_template; struct bpf_create_map_attr create_attr = {}; + unsigned int key, nr_read_elems; + bool ordered_map, lossless_map; int map_fd = -1, btf_fd = -1; struct pprint_mapv mapv = {}; unsigned int raw_btf_size; @@ -2178,7 +2210,6 @@ static int test_pprint(void) char pin_path[255]; size_t line_len = 0; char *line = NULL; - unsigned int key; uint8_t *raw_btf; ssize_t nread; int err, ret; @@ -2251,14 +2282,18 @@ static int test_pprint(void) goto done; } - key = 0; + nr_read_elems = 0; + ordered_map = test->ordered_map; + lossless_map = test->lossless_map; do { ssize_t nexpected_line; + unsigned int next_key; - set_pprint_mapv(&mapv, key); + next_key = ordered_map ? nr_read_elems : atoi(line); + set_pprint_mapv(&mapv, next_key); nexpected_line = snprintf(expected_line, sizeof(expected_line), "%u: {%u,0,%d,0x%x,0x%x,0x%x,{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n", - key, + next_key, mapv.ui32, mapv.si32, mapv.unused_bits2a, mapv.bits28, mapv.unused_bits2b, mapv.ui64, @@ -2281,11 +2316,12 @@ static int test_pprint(void) } nread = getline(&line, &line_len, pin_file); - } while (++key < test->max_entries && nread > 0); + } while (++nr_read_elems < test->max_entries && nread > 0); - if (CHECK(key < test->max_entries, - "Unexpected EOF. key:%u test->max_entries:%u", - key, test->max_entries)) { + if (lossless_map && + CHECK(nr_read_elems < test->max_entries, + "Unexpected EOF. nr_read_elems:%u test->max_entries:%u", + nr_read_elems, test->max_entries)) { err = -1; goto done; } @@ -2314,6 +2350,24 @@ done: return err; } +static int test_pprint(void) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < ARRAY_SIZE(pprint_tests_meta); i++) { + pprint_test_template.descr = pprint_tests_meta[i].descr; + pprint_test_template.map_type = pprint_tests_meta[i].map_type; + pprint_test_template.map_name = pprint_tests_meta[i].map_name; + pprint_test_template.ordered_map = pprint_tests_meta[i].ordered_map; + pprint_test_template.lossless_map = pprint_tests_meta[i].lossless_map; + + err |= count_result(do_test_pprint()); + } + + return err; +} + static void usage(const char *cmd) { fprintf(stderr, "Usage: %s [-l] [[-r test_num (1 - %zu)] | [-g test_num (1 - %zu)] | [-f test_num (1 - %zu)] | [-p]]\n", @@ -2409,7 +2463,7 @@ int main(int argc, char **argv) err |= test_file(); if (args.pprint_test) - err |= count_result(test_pprint()); + err |= test_pprint(); if (args.raw_test || args.get_info_test || args.file_test || args.pprint_test) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 6c253343a6f9..4b7c74f5faa7 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -17,7 +17,8 @@ #include <stdlib.h> #include <sys/wait.h> - +#include <sys/socket.h> +#include <netinet/in.h> #include <linux/bpf.h> #include <bpf/bpf.h> @@ -26,8 +27,21 @@ #include "bpf_util.h" #include "bpf_rlimit.h" +#ifndef ENOTSUPP +#define ENOTSUPP 524 +#endif + static int map_flags; +#define CHECK(condition, tag, format...) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ + printf(format); \ + exit(-1); \ + } \ +}) + static void test_hashmap(int task, void *data) { long long key, next_key, first_key, value; @@ -1150,6 +1164,250 @@ static void test_map_wronly(void) assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM); } +static void prepare_reuseport_grp(int type, int map_fd, + __s64 *fds64, __u64 *sk_cookies, + unsigned int n) +{ + socklen_t optlen, addrlen; + struct sockaddr_in6 s6; + const __u32 index0 = 0; + const int optval = 1; + unsigned int i; + u64 sk_cookie; + __s64 fd64; + int err; + + s6.sin6_family = AF_INET6; + s6.sin6_addr = in6addr_any; + s6.sin6_port = 0; + addrlen = sizeof(s6); + optlen = sizeof(sk_cookie); + + for (i = 0; i < n; i++) { + fd64 = socket(AF_INET6, type, 0); + CHECK(fd64 == -1, "socket()", + "sock_type:%d fd64:%lld errno:%d\n", + type, fd64, errno); + + err = setsockopt(fd64, SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof(optval)); + CHECK(err == -1, "setsockopt(SO_REUSEEPORT)", + "err:%d errno:%d\n", err, errno); + + /* reuseport_array does not allow unbound sk */ + err = bpf_map_update_elem(map_fd, &index0, &fd64, + BPF_ANY); + CHECK(err != -1 || errno != EINVAL, + "reuseport array update unbound sk", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + + err = bind(fd64, (struct sockaddr *)&s6, sizeof(s6)); + CHECK(err == -1, "bind()", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + + if (i == 0) { + err = getsockname(fd64, (struct sockaddr *)&s6, + &addrlen); + CHECK(err == -1, "getsockname()", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + } + + err = getsockopt(fd64, SOL_SOCKET, SO_COOKIE, &sk_cookie, + &optlen); + CHECK(err == -1, "getsockopt(SO_COOKIE)", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + + if (type == SOCK_STREAM) { + /* + * reuseport_array does not allow + * non-listening tcp sk. + */ + err = bpf_map_update_elem(map_fd, &index0, &fd64, + BPF_ANY); + CHECK(err != -1 || errno != EINVAL, + "reuseport array update non-listening sk", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + err = listen(fd64, 0); + CHECK(err == -1, "listen()", + "sock_type:%d, err:%d errno:%d\n", + type, err, errno); + } + + fds64[i] = fd64; + sk_cookies[i] = sk_cookie; + } +} + +static void test_reuseport_array(void) +{ +#define REUSEPORT_FD_IDX(err, last) ({ (err) ? last : !last; }) + + const __u32 array_size = 4, index0 = 0, index3 = 3; + int types[2] = { SOCK_STREAM, SOCK_DGRAM }, type; + __u64 grpa_cookies[2], sk_cookie, map_cookie; + __s64 grpa_fds64[2] = { -1, -1 }, fd64 = -1; + const __u32 bad_index = array_size; + int map_fd, err, t, f; + __u32 fds_idx = 0; + int fd; + + map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + sizeof(__u32), sizeof(__u64), array_size, 0); + CHECK(map_fd == -1, "reuseport array create", + "map_fd:%d, errno:%d\n", map_fd, errno); + + /* Test lookup/update/delete with invalid index */ + err = bpf_map_delete_elem(map_fd, &bad_index); + CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries", + "err:%d errno:%d\n", err, errno); + + err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY); + CHECK(err != -1 || errno != E2BIG, + "reuseport array update >=max_entries", + "err:%d errno:%d\n", err, errno); + + err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie); + CHECK(err != -1 || errno != ENOENT, + "reuseport array update >=max_entries", + "err:%d errno:%d\n", err, errno); + + /* Test lookup/delete non existence elem */ + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); + CHECK(err != -1 || errno != ENOENT, + "reuseport array lookup not-exist elem", + "err:%d errno:%d\n", err, errno); + err = bpf_map_delete_elem(map_fd, &index3); + CHECK(err != -1 || errno != ENOENT, + "reuseport array del not-exist elem", + "err:%d errno:%d\n", err, errno); + + for (t = 0; t < ARRAY_SIZE(types); t++) { + type = types[t]; + + prepare_reuseport_grp(type, map_fd, grpa_fds64, + grpa_cookies, ARRAY_SIZE(grpa_fds64)); + + /* Test BPF_* update flags */ + /* BPF_EXIST failure case */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_EXIST); + CHECK(err != -1 || errno != ENOENT, + "reuseport array update empty elem BPF_EXIST", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); + + /* BPF_NOEXIST success case */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_NOEXIST); + CHECK(err == -1, + "reuseport array update empty elem BPF_NOEXIST", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); + + /* BPF_EXIST success case. */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_EXIST); + CHECK(err == -1, + "reuseport array update same elem BPF_EXIST", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); + + /* BPF_NOEXIST failure case */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_NOEXIST); + CHECK(err != -1 || errno != EEXIST, + "reuseport array update non-empty elem BPF_NOEXIST", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + fds_idx = REUSEPORT_FD_IDX(err, fds_idx); + + /* BPF_ANY case (always succeed) */ + err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx], + BPF_ANY); + CHECK(err == -1, + "reuseport array update same sk with BPF_ANY", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + + fd64 = grpa_fds64[fds_idx]; + sk_cookie = grpa_cookies[fds_idx]; + + /* The same sk cannot be added to reuseport_array twice */ + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY); + CHECK(err != -1 || errno != EBUSY, + "reuseport array update same sk with same index", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + + err = bpf_map_update_elem(map_fd, &index0, &fd64, BPF_ANY); + CHECK(err != -1 || errno != EBUSY, + "reuseport array update same sk with different index", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + + /* Test delete elem */ + err = bpf_map_delete_elem(map_fd, &index3); + CHECK(err == -1, "reuseport array delete sk", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + + /* Add it back with BPF_NOEXIST */ + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); + CHECK(err == -1, + "reuseport array re-add with BPF_NOEXIST after del", + "sock_type:%d err:%d errno:%d\n", type, err, errno); + + /* Test cookie */ + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); + CHECK(err == -1 || sk_cookie != map_cookie, + "reuseport array lookup re-added sk", + "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llxn", + type, err, errno, sk_cookie, map_cookie); + + /* Test elem removed by close() */ + for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++) + close(grpa_fds64[f]); + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); + CHECK(err != -1 || errno != ENOENT, + "reuseport array lookup after close()", + "sock_type:%d err:%d errno:%d\n", + type, err, errno); + } + + /* Test SOCK_RAW */ + fd64 = socket(AF_INET6, SOCK_RAW, IPPROTO_UDP); + CHECK(fd64 == -1, "socket(SOCK_RAW)", "err:%d errno:%d\n", + err, errno); + err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST); + CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW", + "err:%d errno:%d\n", err, errno); + close(fd64); + + /* Close the 64 bit value map */ + close(map_fd); + + /* Test 32 bit fd */ + map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + sizeof(__u32), sizeof(__u32), array_size, 0); + CHECK(map_fd == -1, "reuseport array create", + "map_fd:%d, errno:%d\n", map_fd, errno); + prepare_reuseport_grp(SOCK_STREAM, map_fd, &fd64, &sk_cookie, 1); + fd = fd64; + err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST); + CHECK(err == -1, "reuseport array update 32 bit fd", + "err:%d errno:%d\n", err, errno); + err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie); + CHECK(err != -1 || errno != ENOSPC, + "reuseport array lookup 32 bit fd", + "err:%d errno:%d\n", err, errno); + close(fd); + close(map_fd); +} + static void run_all_tests(void) { test_hashmap(0, NULL); @@ -1170,6 +1428,8 @@ static void run_all_tests(void) test_map_rdonly(); test_map_wronly(); + + test_reuseport_array(); } int main(void) diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c new file mode 100644 index 000000000000..75646d9b34aa --- /dev/null +++ b/tools/testing/selftests/bpf/test_select_reuseport.c @@ -0,0 +1,688 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook */ + +#include <stdlib.h> +#include <unistd.h> +#include <stdbool.h> +#include <string.h> +#include <errno.h> +#include <assert.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/types.h> +#include <linux/if_ether.h> +#include <sys/types.h> +#include <sys/epoll.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include "bpf_rlimit.h" +#include "bpf_util.h" +#include "test_select_reuseport_common.h" + +#define MIN_TCPHDR_LEN 20 +#define UDPHDR_LEN 8 + +#define TCP_SYNCOOKIE_SYSCTL "/proc/sys/net/ipv4/tcp_syncookies" +#define TCP_FO_SYSCTL "/proc/sys/net/ipv4/tcp_fastopen" +#define REUSEPORT_ARRAY_SIZE 32 + +static int result_map, tmp_index_ovr_map, linum_map, data_check_map; +static enum result expected_results[NR_RESULTS]; +static int sk_fds[REUSEPORT_ARRAY_SIZE]; +static int reuseport_array, outer_map; +static int select_by_skb_data_prog; +static int saved_tcp_syncookie; +static struct bpf_object *obj; +static int saved_tcp_fo; +static __u32 index_zero; +static int epfd; + +static union sa46 { + struct sockaddr_in6 v6; + struct sockaddr_in v4; + sa_family_t family; +} srv_sa; + +#define CHECK(condition, tag, format...) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ + printf(format); \ + exit(-1); \ + } \ +}) + +static void create_maps(void) +{ + struct bpf_create_map_attr attr = {}; + + /* Creating reuseport_array */ + attr.name = "reuseport_array"; + attr.map_type = BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; + attr.key_size = sizeof(__u32); + attr.value_size = sizeof(__u32); + attr.max_entries = REUSEPORT_ARRAY_SIZE; + + reuseport_array = bpf_create_map_xattr(&attr); + CHECK(reuseport_array == -1, "creating reuseport_array", + "reuseport_array:%d errno:%d\n", reuseport_array, errno); + + /* Creating outer_map */ + attr.name = "outer_map"; + attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS; + attr.key_size = sizeof(__u32); + attr.value_size = sizeof(__u32); + attr.max_entries = 1; + attr.inner_map_fd = reuseport_array; + outer_map = bpf_create_map_xattr(&attr); + CHECK(outer_map == -1, "creating outer_map", + "outer_map:%d errno:%d\n", outer_map, errno); +} + +static void prepare_bpf_obj(void) +{ + struct bpf_program *prog; + struct bpf_map *map; + int err; + struct bpf_object_open_attr attr = { + .file = "test_select_reuseport_kern.o", + .prog_type = BPF_PROG_TYPE_SK_REUSEPORT, + }; + + obj = bpf_object__open_xattr(&attr); + CHECK(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o", + "obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj)); + + prog = bpf_program__next(NULL, obj); + CHECK(!prog, "get first bpf_program", "!prog\n"); + bpf_program__set_type(prog, attr.prog_type); + + map = bpf_object__find_map_by_name(obj, "outer_map"); + CHECK(!map, "find outer_map", "!map\n"); + err = bpf_map__reuse_fd(map, outer_map); + CHECK(err, "reuse outer_map", "err:%d\n", err); + + err = bpf_object__load(obj); + CHECK(err, "load bpf_object", "err:%d\n", err); + + select_by_skb_data_prog = bpf_program__fd(prog); + CHECK(select_by_skb_data_prog == -1, "get prog fd", + "select_by_skb_data_prog:%d\n", select_by_skb_data_prog); + + map = bpf_object__find_map_by_name(obj, "result_map"); + CHECK(!map, "find result_map", "!map\n"); + result_map = bpf_map__fd(map); + CHECK(result_map == -1, "get result_map fd", + "result_map:%d\n", result_map); + + map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map"); + CHECK(!map, "find tmp_index_ovr_map", "!map\n"); + tmp_index_ovr_map = bpf_map__fd(map); + CHECK(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd", + "tmp_index_ovr_map:%d\n", tmp_index_ovr_map); + + map = bpf_object__find_map_by_name(obj, "linum_map"); + CHECK(!map, "find linum_map", "!map\n"); + linum_map = bpf_map__fd(map); + CHECK(linum_map == -1, "get linum_map fd", + "linum_map:%d\n", linum_map); + + map = bpf_object__find_map_by_name(obj, "data_check_map"); + CHECK(!map, "find data_check_map", "!map\n"); + data_check_map = bpf_map__fd(map); + CHECK(data_check_map == -1, "get data_check_map fd", + "data_check_map:%d\n", data_check_map); +} + +static void sa46_init_loopback(union sa46 *sa, sa_family_t family) +{ + memset(sa, 0, sizeof(*sa)); + sa->family = family; + if (sa->family == AF_INET6) + sa->v6.sin6_addr = in6addr_loopback; + else + sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); +} + +static void sa46_init_inany(union sa46 *sa, sa_family_t family) +{ + memset(sa, 0, sizeof(*sa)); + sa->family = family; + if (sa->family == AF_INET6) + sa->v6.sin6_addr = in6addr_any; + else + sa->v4.sin_addr.s_addr = INADDR_ANY; +} + +static int read_int_sysctl(const char *sysctl) +{ + char buf[16]; + int fd, ret; + + fd = open(sysctl, 0); + CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", + sysctl, fd, errno); + + ret = read(fd, buf, sizeof(buf)); + CHECK(ret <= 0, "read(sysctl)", "sysctl:%s ret:%d errno:%d\n", + sysctl, ret, errno); + close(fd); + + return atoi(buf); +} + +static void write_int_sysctl(const char *sysctl, int v) +{ + int fd, ret, size; + char buf[16]; + + fd = open(sysctl, O_RDWR); + CHECK(fd == -1, "open(sysctl)", "sysctl:%s fd:%d errno:%d\n", + sysctl, fd, errno); + + size = snprintf(buf, sizeof(buf), "%d", v); + ret = write(fd, buf, size); + CHECK(ret != size, "write(sysctl)", + "sysctl:%s ret:%d size:%d errno:%d\n", sysctl, ret, size, errno); + close(fd); +} + +static void restore_sysctls(void) +{ + write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo); + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie); +} + +static void enable_fastopen(void) +{ + int fo; + + fo = read_int_sysctl(TCP_FO_SYSCTL); + write_int_sysctl(TCP_FO_SYSCTL, fo | 7); +} + +static void enable_syncookie(void) +{ + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 2); +} + +static void disable_syncookie(void) +{ + write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, 0); +} + +static __u32 get_linum(void) +{ + __u32 linum; + int err; + + err = bpf_map_lookup_elem(linum_map, &index_zero, &linum); + CHECK(err == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n", + err, errno); + + return linum; +} + +static void check_data(int type, sa_family_t family, const struct cmd *cmd, + int cli_fd) +{ + struct data_check expected = {}, result; + union sa46 cli_sa; + socklen_t addrlen; + int err; + + addrlen = sizeof(cli_sa); + err = getsockname(cli_fd, (struct sockaddr *)&cli_sa, + &addrlen); + CHECK(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n", + err, errno); + + err = bpf_map_lookup_elem(data_check_map, &index_zero, &result); + CHECK(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n", + err, errno); + + if (type == SOCK_STREAM) { + expected.len = MIN_TCPHDR_LEN; + expected.ip_protocol = IPPROTO_TCP; + } else { + expected.len = UDPHDR_LEN; + expected.ip_protocol = IPPROTO_UDP; + } + + if (family == AF_INET6) { + expected.eth_protocol = htons(ETH_P_IPV6); + expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] && + !srv_sa.v6.sin6_addr.s6_addr32[2] && + !srv_sa.v6.sin6_addr.s6_addr32[1] && + !srv_sa.v6.sin6_addr.s6_addr32[0]; + + memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32, + sizeof(cli_sa.v6.sin6_addr)); + memcpy(&expected.skb_addrs[4], &in6addr_loopback, + sizeof(in6addr_loopback)); + expected.skb_ports[0] = cli_sa.v6.sin6_port; + expected.skb_ports[1] = srv_sa.v6.sin6_port; + } else { + expected.eth_protocol = htons(ETH_P_IP); + expected.bind_inany = !srv_sa.v4.sin_addr.s_addr; + + expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr; + expected.skb_addrs[1] = htonl(INADDR_LOOPBACK); + expected.skb_ports[0] = cli_sa.v4.sin_port; + expected.skb_ports[1] = srv_sa.v4.sin_port; + } + + if (memcmp(&result, &expected, offsetof(struct data_check, + equal_check_end))) { + printf("unexpected data_check\n"); + printf(" result: (0x%x, %u, %u)\n", + result.eth_protocol, result.ip_protocol, + result.bind_inany); + printf("expected: (0x%x, %u, %u)\n", + expected.eth_protocol, expected.ip_protocol, + expected.bind_inany); + CHECK(1, "data_check result != expected", + "bpf_prog_linum:%u\n", get_linum()); + } + + CHECK(!result.hash, "data_check result.hash empty", + "result.hash:%u", result.hash); + + expected.len += cmd ? sizeof(*cmd) : 0; + if (type == SOCK_STREAM) + CHECK(expected.len > result.len, "expected.len > result.len", + "expected.len:%u result.len:%u bpf_prog_linum:%u\n", + expected.len, result.len, get_linum()); + else + CHECK(expected.len != result.len, "expected.len != result.len", + "expected.len:%u result.len:%u bpf_prog_linum:%u\n", + expected.len, result.len, get_linum()); +} + +static void check_results(void) +{ + __u32 results[NR_RESULTS]; + __u32 i, broken = 0; + int err; + + for (i = 0; i < NR_RESULTS; i++) { + err = bpf_map_lookup_elem(result_map, &i, &results[i]); + CHECK(err == -1, "lookup_elem(result_map)", + "i:%u err:%d errno:%d\n", i, err, errno); + } + + for (i = 0; i < NR_RESULTS; i++) { + if (results[i] != expected_results[i]) { + broken = i; + break; + } + } + + if (i == NR_RESULTS) + return; + + printf("unexpected result\n"); + printf(" result: ["); + printf("%u", results[0]); + for (i = 1; i < NR_RESULTS; i++) + printf(", %u", results[i]); + printf("]\n"); + + printf("expected: ["); + printf("%u", expected_results[0]); + for (i = 1; i < NR_RESULTS; i++) + printf(", %u", expected_results[i]); + printf("]\n"); + + CHECK(expected_results[broken] != results[broken], + "unexpected result", + "expected_results[%u] != results[%u] bpf_prog_linum:%u\n", + broken, broken, get_linum()); +} + +static int send_data(int type, sa_family_t family, void *data, size_t len, + enum result expected) +{ + union sa46 cli_sa; + int fd, err; + + fd = socket(family, type, 0); + CHECK(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno); + + sa46_init_loopback(&cli_sa, family); + err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa)); + CHECK(fd == -1, "bind(cli_sa)", "err:%d errno:%d\n", err, errno); + + err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa, + sizeof(srv_sa)); + CHECK(err != len && expected >= PASS, + "sendto()", "family:%u err:%d errno:%d expected:%d\n", + family, err, errno, expected); + + return fd; +} + +static void do_test(int type, sa_family_t family, struct cmd *cmd, + enum result expected) +{ + int nev, srv_fd, cli_fd; + struct epoll_event ev; + struct cmd rcv_cmd; + ssize_t nread; + + cli_fd = send_data(type, family, cmd, cmd ? sizeof(*cmd) : 0, + expected); + nev = epoll_wait(epfd, &ev, 1, expected >= PASS ? 5 : 0); + CHECK((nev <= 0 && expected >= PASS) || + (nev > 0 && expected < PASS), + "nev <> expected", + "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n", + nev, expected, type, family, + cmd ? cmd->reuseport_index : -1, + cmd ? cmd->pass_on_failure : -1); + check_results(); + check_data(type, family, cmd, cli_fd); + + if (expected < PASS) + return; + + CHECK(expected != PASS_ERR_SK_SELECT_REUSEPORT && + cmd->reuseport_index != ev.data.u32, + "check cmd->reuseport_index", + "cmd:(%u, %u) ev.data.u32:%u\n", + cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32); + + srv_fd = sk_fds[ev.data.u32]; + if (type == SOCK_STREAM) { + int new_fd = accept(srv_fd, NULL, 0); + + CHECK(new_fd == -1, "accept(srv_fd)", + "ev.data.u32:%u new_fd:%d errno:%d\n", + ev.data.u32, new_fd, errno); + + nread = recv(new_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); + CHECK(nread != sizeof(rcv_cmd), + "recv(new_fd)", + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", + ev.data.u32, nread, sizeof(rcv_cmd), errno); + + close(new_fd); + } else { + nread = recv(srv_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT); + CHECK(nread != sizeof(rcv_cmd), + "recv(sk_fds)", + "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n", + ev.data.u32, nread, sizeof(rcv_cmd), errno); + } + + close(cli_fd); +} + +static void test_err_inner_map(int type, sa_family_t family) +{ + struct cmd cmd = { + .reuseport_index = 0, + .pass_on_failure = 0, + }; + + printf("%s: ", __func__); + expected_results[DROP_ERR_INNER_MAP]++; + do_test(type, family, &cmd, DROP_ERR_INNER_MAP); + printf("OK\n"); +} + +static void test_err_skb_data(int type, sa_family_t family) +{ + printf("%s: ", __func__); + expected_results[DROP_ERR_SKB_DATA]++; + do_test(type, family, NULL, DROP_ERR_SKB_DATA); + printf("OK\n"); +} + +static void test_err_sk_select_port(int type, sa_family_t family) +{ + struct cmd cmd = { + .reuseport_index = REUSEPORT_ARRAY_SIZE, + .pass_on_failure = 0, + }; + + printf("%s: ", __func__); + expected_results[DROP_ERR_SK_SELECT_REUSEPORT]++; + do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT); + printf("OK\n"); +} + +static void test_pass(int type, sa_family_t family) +{ + struct cmd cmd; + int i; + + printf("%s: ", __func__); + cmd.pass_on_failure = 0; + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { + expected_results[PASS]++; + cmd.reuseport_index = i; + do_test(type, family, &cmd, PASS); + } + printf("OK\n"); +} + +static void test_syncookie(int type, sa_family_t family) +{ + int err, tmp_index = 1; + struct cmd cmd = { + .reuseport_index = 0, + .pass_on_failure = 0, + }; + + if (type != SOCK_STREAM) + return; + + printf("%s: ", __func__); + /* + * +1 for TCP-SYN and + * +1 for the TCP-ACK (ack the syncookie) + */ + expected_results[PASS] += 2; + enable_syncookie(); + /* + * Simulate TCP-SYN and TCP-ACK are handled by two different sk: + * TCP-SYN: select sk_fds[tmp_index = 1] tmp_index is from the + * tmp_index_ovr_map + * TCP-ACK: select sk_fds[reuseport_index = 0] reuseport_index + * is from the cmd.reuseport_index + */ + err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, + &tmp_index, BPF_ANY); + CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)", + "err:%d errno:%d\n", err, errno); + do_test(type, family, &cmd, PASS); + err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero, + &tmp_index); + CHECK(err == -1 || tmp_index != -1, + "lookup_elem(tmp_index_ovr_map)", + "err:%d errno:%d tmp_index:%d\n", + err, errno, tmp_index); + disable_syncookie(); + printf("OK\n"); +} + +static void test_pass_on_err(int type, sa_family_t family) +{ + struct cmd cmd = { + .reuseport_index = REUSEPORT_ARRAY_SIZE, + .pass_on_failure = 1, + }; + + printf("%s: ", __func__); + expected_results[PASS_ERR_SK_SELECT_REUSEPORT] += 1; + do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT); + printf("OK\n"); +} + +static void prepare_sk_fds(int type, sa_family_t family, bool inany) +{ + const int first = REUSEPORT_ARRAY_SIZE - 1; + int i, err, optval = 1; + struct epoll_event ev; + socklen_t addrlen; + + if (inany) + sa46_init_inany(&srv_sa, family); + else + sa46_init_loopback(&srv_sa, family); + addrlen = sizeof(srv_sa); + + /* + * The sk_fds[] is filled from the back such that the order + * is exactly opposite to the (struct sock_reuseport *)reuse->socks[]. + */ + for (i = first; i >= 0; i--) { + sk_fds[i] = socket(family, type, 0); + CHECK(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n", + i, sk_fds[i], errno); + err = setsockopt(sk_fds[i], SOL_SOCKET, SO_REUSEPORT, + &optval, sizeof(optval)); + CHECK(err == -1, "setsockopt(SO_REUSEPORT)", + "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); + + if (i == first) { + err = setsockopt(sk_fds[i], SOL_SOCKET, + SO_ATTACH_REUSEPORT_EBPF, + &select_by_skb_data_prog, + sizeof(select_by_skb_data_prog)); + CHECK(err == -1, "setsockopt(SO_ATTACH_REUEPORT_EBPF)", + "err:%d errno:%d\n", err, errno); + } + + err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen); + CHECK(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); + + if (type == SOCK_STREAM) { + err = listen(sk_fds[i], 10); + CHECK(err == -1, "listen()", + "sk_fds[%d] err:%d errno:%d\n", + i, err, errno); + } + + err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i], + BPF_NOEXIST); + CHECK(err == -1, "update_elem(reuseport_array)", + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); + + if (i == first) { + socklen_t addrlen = sizeof(srv_sa); + + err = getsockname(sk_fds[i], (struct sockaddr *)&srv_sa, + &addrlen); + CHECK(err == -1, "getsockname()", + "sk_fds[%d] err:%d errno:%d\n", i, err, errno); + } + } + + epfd = epoll_create(1); + CHECK(epfd == -1, "epoll_create(1)", + "epfd:%d errno:%d\n", epfd, errno); + + ev.events = EPOLLIN; + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) { + ev.data.u32 = i; + err = epoll_ctl(epfd, EPOLL_CTL_ADD, sk_fds[i], &ev); + CHECK(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i); + } +} + +static void setup_per_test(int type, unsigned short family, bool inany) +{ + int ovr = -1, err; + + prepare_sk_fds(type, family, inany); + err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero, &ovr, + BPF_ANY); + CHECK(err == -1, "update_elem(tmp_index_ovr_map, 0, -1)", + "err:%d errno:%d\n", err, errno); +} + +static void cleanup_per_test(void) +{ + int i, err; + + for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) + close(sk_fds[i]); + close(epfd); + + err = bpf_map_delete_elem(outer_map, &index_zero); + CHECK(err == -1, "delete_elem(outer_map)", + "err:%d errno:%d\n", err, errno); +} + +static void cleanup(void) +{ + close(outer_map); + close(reuseport_array); + bpf_object__close(obj); +} + +static void test_all(void) +{ + /* Extra SOCK_STREAM to test bind_inany==true */ + const int types[] = { SOCK_STREAM, SOCK_DGRAM, SOCK_STREAM }; + const char * const type_strings[] = { "TCP", "UDP", "TCP" }; + const char * const family_strings[] = { "IPv6", "IPv4" }; + const unsigned short families[] = { AF_INET6, AF_INET }; + const bool bind_inany[] = { false, false, true }; + int t, f, err; + + for (f = 0; f < ARRAY_SIZE(families); f++) { + unsigned short family = families[f]; + + for (t = 0; t < ARRAY_SIZE(types); t++) { + bool inany = bind_inany[t]; + int type = types[t]; + + printf("######## %s/%s %s ########\n", + family_strings[f], type_strings[t], + inany ? " INANY " : "LOOPBACK"); + + setup_per_test(type, family, inany); + + test_err_inner_map(type, family); + + /* Install reuseport_array to the outer_map */ + err = bpf_map_update_elem(outer_map, &index_zero, + &reuseport_array, BPF_ANY); + CHECK(err == -1, "update_elem(outer_map)", + "err:%d errno:%d\n", err, errno); + + test_err_skb_data(type, family); + test_err_sk_select_port(type, family); + test_pass(type, family); + test_syncookie(type, family); + test_pass_on_err(type, family); + + cleanup_per_test(); + printf("\n"); + } + } +} + +int main(int argc, const char **argv) +{ + create_maps(); + prepare_bpf_obj(); + saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); + saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); + enable_fastopen(); + disable_syncookie(); + atexit(restore_sysctls); + + test_all(); + + cleanup(); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_select_reuseport_common.h b/tools/testing/selftests/bpf/test_select_reuseport_common.h new file mode 100644 index 000000000000..08eb2a9f145f --- /dev/null +++ b/tools/testing/selftests/bpf/test_select_reuseport_common.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#ifndef __TEST_SELECT_REUSEPORT_COMMON_H +#define __TEST_SELECT_REUSEPORT_COMMON_H + +#include <linux/types.h> + +enum result { + DROP_ERR_INNER_MAP, + DROP_ERR_SKB_DATA, + DROP_ERR_SK_SELECT_REUSEPORT, + DROP_MISC, + PASS, + PASS_ERR_SK_SELECT_REUSEPORT, + NR_RESULTS, +}; + +struct cmd { + __u32 reuseport_index; + __u32 pass_on_failure; +}; + +struct data_check { + __u32 ip_protocol; + __u32 skb_addrs[8]; + __u16 skb_ports[2]; + __u16 eth_protocol; + __u8 bind_inany; + __u8 equal_check_end[0]; + + __u32 len; + __u32 hash; +}; + +#endif diff --git a/tools/testing/selftests/bpf/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/test_select_reuseport_kern.c new file mode 100644 index 000000000000..5b54ec637ada --- /dev/null +++ b/tools/testing/selftests/bpf/test_select_reuseport_kern.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook */ + +#include <stdlib.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/bpf.h> +#include <linux/types.h> +#include <linux/if_ether.h> + +#include "bpf_endian.h" +#include "bpf_helpers.h" +#include "test_select_reuseport_common.h" + +int _version SEC("version") = 1; + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +struct bpf_map_def SEC("maps") outer_map = { + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") result_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = NR_RESULTS, +}; + +struct bpf_map_def SEC("maps") tmp_index_ovr_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") linum_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") data_check_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct data_check), + .max_entries = 1, +}; + +#define GOTO_DONE(_result) ({ \ + result = (_result); \ + linum = __LINE__; \ + goto done; \ +}) + +SEC("select_by_skb_data") +int _select_by_skb_data(struct sk_reuseport_md *reuse_md) +{ + __u32 linum, index = 0, flags = 0, index_zero = 0; + __u32 *result_cnt, *linum_value; + struct data_check data_check = {}; + struct cmd *cmd, cmd_copy; + void *data, *data_end; + void *reuseport_array; + enum result result; + int *index_ovr; + int err; + + data = reuse_md->data; + data_end = reuse_md->data_end; + data_check.len = reuse_md->len; + data_check.eth_protocol = reuse_md->eth_protocol; + data_check.ip_protocol = reuse_md->ip_protocol; + data_check.hash = reuse_md->hash; + data_check.bind_inany = reuse_md->bind_inany; + if (data_check.eth_protocol == bpf_htons(ETH_P_IP)) { + if (bpf_skb_load_bytes_relative(reuse_md, + offsetof(struct iphdr, saddr), + data_check.skb_addrs, 8, + BPF_HDR_START_NET)) + GOTO_DONE(DROP_MISC); + } else { + if (bpf_skb_load_bytes_relative(reuse_md, + offsetof(struct ipv6hdr, saddr), + data_check.skb_addrs, 32, + BPF_HDR_START_NET)) + GOTO_DONE(DROP_MISC); + } + + /* + * The ip_protocol could be a compile time decision + * if the bpf_prog.o is dedicated to either TCP or + * UDP. + * + * Otherwise, reuse_md->ip_protocol or + * the protocol field in the iphdr can be used. + */ + if (data_check.ip_protocol == IPPROTO_TCP) { + struct tcphdr *th = data; + + if (th + 1 > data_end) + GOTO_DONE(DROP_MISC); + + data_check.skb_ports[0] = th->source; + data_check.skb_ports[1] = th->dest; + + if ((th->doff << 2) + sizeof(*cmd) > data_check.len) + GOTO_DONE(DROP_ERR_SKB_DATA); + if (bpf_skb_load_bytes(reuse_md, th->doff << 2, &cmd_copy, + sizeof(cmd_copy))) + GOTO_DONE(DROP_MISC); + cmd = &cmd_copy; + } else if (data_check.ip_protocol == IPPROTO_UDP) { + struct udphdr *uh = data; + + if (uh + 1 > data_end) + GOTO_DONE(DROP_MISC); + + data_check.skb_ports[0] = uh->source; + data_check.skb_ports[1] = uh->dest; + + if (sizeof(struct udphdr) + sizeof(*cmd) > data_check.len) + GOTO_DONE(DROP_ERR_SKB_DATA); + if (data + sizeof(struct udphdr) + sizeof(*cmd) > data_end) { + if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr), + &cmd_copy, sizeof(cmd_copy))) + GOTO_DONE(DROP_MISC); + cmd = &cmd_copy; + } else { + cmd = data + sizeof(struct udphdr); + } + } else { + GOTO_DONE(DROP_MISC); + } + + reuseport_array = bpf_map_lookup_elem(&outer_map, &index_zero); + if (!reuseport_array) + GOTO_DONE(DROP_ERR_INNER_MAP); + + index = cmd->reuseport_index; + index_ovr = bpf_map_lookup_elem(&tmp_index_ovr_map, &index_zero); + if (!index_ovr) + GOTO_DONE(DROP_MISC); + + if (*index_ovr != -1) { + index = *index_ovr; + *index_ovr = -1; + } + err = bpf_sk_select_reuseport(reuse_md, reuseport_array, &index, + flags); + if (!err) + GOTO_DONE(PASS); + + if (cmd->pass_on_failure) + GOTO_DONE(PASS_ERR_SK_SELECT_REUSEPORT); + else + GOTO_DONE(DROP_ERR_SK_SELECT_REUSEPORT); + +done: + result_cnt = bpf_map_lookup_elem(&result_map, &result); + if (!result_cnt) + return SK_DROP; + + bpf_map_update_elem(&linum_map, &index_zero, &linum, BPF_ANY); + bpf_map_update_elem(&data_check_map, &index_zero, &data_check, BPF_ANY); + + (*result_cnt)++; + return result < PASS ? SK_DROP : SK_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh new file mode 100755 index 000000000000..42544a969abc --- /dev/null +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh @@ -0,0 +1,62 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Facebook + +set -eu + +wait_for_ip() +{ + local _i + echo -n "Wait for testing link-local IP to become available " + for _i in $(seq ${MAX_PING_TRIES}); do + echo -n "." + if ping -6 -q -c 1 -W 1 ff02::1%${TEST_IF} >/dev/null 2>&1; then + echo " OK" + return + fi + sleep 1 + done + echo 1>&2 "ERROR: Timeout waiting for test IP to become available." + exit 1 +} + +setup() +{ + # Create testing interfaces not to interfere with current environment. + ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} + ip link set ${TEST_IF} up + ip link set ${TEST_IF_PEER} up + + wait_for_ip + + tc qdisc add dev ${TEST_IF} clsact + tc filter add dev ${TEST_IF} egress bpf obj ${BPF_PROG_OBJ} \ + sec ${BPF_PROG_SECTION} da + + BPF_PROG_ID=$(tc filter show dev ${TEST_IF} egress | \ + awk '/ id / {sub(/.* id /, "", $0); print($1)}') +} + +cleanup() +{ + ip link del ${TEST_IF} 2>/dev/null || : + ip link del ${TEST_IF_PEER} 2>/dev/null || : +} + +main() +{ + trap cleanup EXIT 2 3 6 15 + setup + ${PROG} ${TEST_IF} ${BPF_PROG_ID} +} + +DIR=$(dirname $0) +TEST_IF="test_cgid_1" +TEST_IF_PEER="test_cgid_2" +MAX_PING_TRIES=5 +BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o" +BPF_PROG_SECTION="cgroup_id_logger" +BPF_PROG_ID=0 +PROG="${DIR}/test_skb_cgroup_id_user" + +main diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c new file mode 100644 index 000000000000..68cf9829f5a7 --- /dev/null +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <linux/bpf.h> +#include <linux/pkt_cls.h> + +#include <string.h> + +#include "bpf_helpers.h" + +#define NUM_CGROUP_LEVELS 4 + +struct bpf_map_def SEC("maps") cgroup_ids = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u64), + .max_entries = NUM_CGROUP_LEVELS, +}; + +static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level) +{ + __u64 id; + + /* [1] &level passed to external function that may change it, it's + * incompatible with loop unroll. + */ + id = bpf_skb_ancestor_cgroup_id(skb, level); + bpf_map_update_elem(&cgroup_ids, &level, &id, 0); +} + +SEC("cgroup_id_logger") +int log_cgroup_id(struct __sk_buff *skb) +{ + /* Loop unroll can't be used here due to [1]. Unrolling manually. + * Number of calls should be in sync with NUM_CGROUP_LEVELS. + */ + log_nth_level(skb, 0); + log_nth_level(skb, 1); + log_nth_level(skb, 2); + log_nth_level(skb, 3); + + return TC_ACT_OK; +} + +int _version SEC("version") = 1; + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c new file mode 100644 index 000000000000..c121cc59f314 --- /dev/null +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <arpa/inet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <sys/socket.h> +#include <sys/types.h> + + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "bpf_rlimit.h" +#include "cgroup_helpers.h" + +#define CGROUP_PATH "/skb_cgroup_test" +#define NUM_CGROUP_LEVELS 4 + +/* RFC 4291, Section 2.7.1 */ +#define LINKLOCAL_MULTICAST "ff02::1" + +static int mk_dst_addr(const char *ip, const char *iface, + struct sockaddr_in6 *dst) +{ + memset(dst, 0, sizeof(*dst)); + + dst->sin6_family = AF_INET6; + dst->sin6_port = htons(1025); + + if (inet_pton(AF_INET6, ip, &dst->sin6_addr) != 1) { + log_err("Invalid IPv6: %s", ip); + return -1; + } + + dst->sin6_scope_id = if_nametoindex(iface); + if (!dst->sin6_scope_id) { + log_err("Failed to get index of iface: %s", iface); + return -1; + } + + return 0; +} + +static int send_packet(const char *iface) +{ + struct sockaddr_in6 dst; + char msg[] = "msg"; + int err = 0; + int fd = -1; + + if (mk_dst_addr(LINKLOCAL_MULTICAST, iface, &dst)) + goto err; + + fd = socket(AF_INET6, SOCK_DGRAM, 0); + if (fd == -1) { + log_err("Failed to create UDP socket"); + goto err; + } + + if (sendto(fd, &msg, sizeof(msg), 0, (const struct sockaddr *)&dst, + sizeof(dst)) == -1) { + log_err("Failed to send datagram"); + goto err; + } + + goto out; +err: + err = -1; +out: + if (fd >= 0) + close(fd); + return err; +} + +int get_map_fd_by_prog_id(int prog_id) +{ + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + __u32 map_ids[1]; + int prog_fd = -1; + int map_fd = -1; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) { + log_err("Failed to get fd by prog id %d", prog_id); + goto err; + } + + info.nr_map_ids = 1; + info.map_ids = (__u64) (unsigned long) map_ids; + + if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) { + log_err("Failed to get info by prog fd %d", prog_fd); + goto err; + } + + if (!info.nr_map_ids) { + log_err("No maps found for prog fd %d", prog_fd); + goto err; + } + + map_fd = bpf_map_get_fd_by_id(map_ids[0]); + if (map_fd < 0) + log_err("Failed to get fd by map id %d", map_ids[0]); +err: + if (prog_fd >= 0) + close(prog_fd); + return map_fd; +} + +int check_ancestor_cgroup_ids(int prog_id) +{ + __u64 actual_ids[NUM_CGROUP_LEVELS], expected_ids[NUM_CGROUP_LEVELS]; + __u32 level; + int err = 0; + int map_fd; + + expected_ids[0] = 0x100000001; /* root cgroup */ + expected_ids[1] = get_cgroup_id(""); + expected_ids[2] = get_cgroup_id(CGROUP_PATH); + expected_ids[3] = 0; /* non-existent cgroup */ + + map_fd = get_map_fd_by_prog_id(prog_id); + if (map_fd < 0) + goto err; + + for (level = 0; level < NUM_CGROUP_LEVELS; ++level) { + if (bpf_map_lookup_elem(map_fd, &level, &actual_ids[level])) { + log_err("Failed to lookup key %d", level); + goto err; + } + if (actual_ids[level] != expected_ids[level]) { + log_err("%llx (actual) != %llx (expected), level: %u\n", + actual_ids[level], expected_ids[level], level); + goto err; + } + } + + goto out; +err: + err = -1; +out: + if (map_fd >= 0) + close(map_fd); + return err; +} + +int main(int argc, char **argv) +{ + int cgfd = -1; + int err = 0; + + if (argc < 3) { + fprintf(stderr, "Usage: %s iface prog_id\n", argv[0]); + exit(EXIT_FAILURE); + } + + if (setup_cgroup_environment()) + goto err; + + cgfd = create_and_get_cgroup(CGROUP_PATH); + if (!cgfd) + goto err; + + if (join_cgroup(CGROUP_PATH)) + goto err; + + if (send_packet(argv[1])) + goto err; + + if (check_ancestor_cgroup_ids(atoi(argv[2]))) + goto err; + + goto out; +err: + err = -1; +out: + close(cgfd); + cleanup_cgroup_environment(); + printf("[%s]\n", err ? "FAIL" : "PASS"); + return err; +} diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index f4d99fabc56d..b8ebe2f58074 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -14,10 +14,7 @@ #include "cgroup_helpers.h" #include "bpf_rlimit.h" - -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif +#include "bpf_util.h" #define CG_PATH "/foo" #define MAX_INSNS 512 diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 2e45c92d1111..aeeb76a54d63 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -20,15 +20,12 @@ #include "cgroup_helpers.h" #include "bpf_rlimit.h" +#include "bpf_util.h" #ifndef ENOTSUPP # define ENOTSUPP 524 #endif -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - #define CG_PATH "/foo" #define CONNECT4_PROG_PATH "./connect4_prog.o" #define CONNECT6_PROG_PATH "./connect6_prog.o" diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 452cf5c6c784..67c412d19c09 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -42,12 +42,9 @@ #endif #include "bpf_rlimit.h" #include "bpf_rand.h" +#include "bpf_util.h" #include "../../../include/linux/filter.h" -#ifndef ARRAY_SIZE -# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - #define MAX_INSNS BPF_MAXINSNS #define MAX_FIXUPS 8 #define MAX_NR_MAPS 8 |