diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-18 12:34:53 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-18 12:34:53 -0700 |
commit | 81160dda9a7aad13c04e78bb2cfd3c4630e3afab (patch) | |
tree | 4bf79ffa9fc7dc5e2915ff978778c3402c491113 /samples/bpf | |
parent | 8b53c76533aa4356602aea98f98a2f3b4051464c (diff) | |
parent | 1bab8d4c488be22d57f9dd09968c90a0ddc413bf (diff) | |
download | linux-81160dda9a7aad13c04e78bb2cfd3c4630e3afab.tar.bz2 |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
1) Support IPV6 RA Captive Portal Identifier, from Maciej Żenczykowski.
2) Use bio_vec in the networking code instead of the custom skb_frag_t, from
Matthew Wilcox.
3) Make use of xmit_more in r8169 driver, from Heiner Kallweit.
4) Add devmap_hash to xdp, from Toke Høiland-Jørgensen.
5) Support all variants of 5750X bnxt_en chips, from Michael Chan.
6) More RTNL avoidance work in the core and mlx5 driver, from Vlad
Buslov.
7) Add TCP syn cookies bpf helper, from Petar Penkov.
8) Add 'nettest' to selftests and use it, from David Ahern.
9) Add extack support to drop_monitor, add packet alert mode and
support for HW drops, from Ido Schimmel.
10) Add VLAN offload to stmmac, from Jose Abreu.
11) Lots of devm_platform_ioremap_resource() conversions, from
YueHaibing.
12) Add IONIC driver, from Shannon Nelson.
13) Several kTLS cleanups, from Jakub Kicinski.
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1930 commits)
mlxsw: spectrum_buffers: Add the ability to query the CPU port's shared buffer
mlxsw: spectrum: Register CPU port with devlink
mlxsw: spectrum_buffers: Prevent changing CPU port's configuration
net: ena: fix incorrect update of intr_delay_resolution
net: ena: fix retrieval of nonadaptive interrupt moderation intervals
net: ena: fix update of interrupt moderation register
net: ena: remove all old adaptive rx interrupt moderation code from ena_com
net: ena: remove ena_restore_ethtool_params() and relevant fields
net: ena: remove old adaptive interrupt moderation code from ena_netdev
net: ena: remove code duplication in ena_com_update_nonadaptive_moderation_interval _*()
net: ena: enable the interrupt_moderation in driver_supported_features
net: ena: reimplement set/get_coalesce()
net: ena: switch to dim algorithm for rx adaptive interrupt moderation
net: ena: add intr_moder_rx_interval to struct ena_com_dev and use it
net: phy: adin: implement Energy Detect Powerdown mode via phy-tunable
ethtool: implement Energy Detect Powerdown support via phy-tunable
xen-netfront: do not assume sk_buff_head list is empty in error handling
s390/ctcm: Delete unnecessary checks before the macro call “dev_kfree_skb”
net: ena: don't wake up tx queue when down
drop_monitor: Better sanitize notified packets
...
Diffstat (limited to 'samples/bpf')
-rw-r--r-- | samples/bpf/syscall_nrs.c | 6 | ||||
-rw-r--r-- | samples/bpf/trace_output_user.c | 43 | ||||
-rw-r--r-- | samples/bpf/tracex5_kern.c | 13 | ||||
-rw-r--r-- | samples/bpf/xdp_fwd_kern.c | 39 | ||||
-rw-r--r-- | samples/bpf/xdp_fwd_user.c | 35 | ||||
-rw-r--r-- | samples/bpf/xdp_sample_pkts_user.c | 61 | ||||
-rw-r--r-- | samples/bpf/xdpsock_user.c | 243 |
7 files changed, 261 insertions, 179 deletions
diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c index 516e255cbe8f..88f940052450 100644 --- a/samples/bpf/syscall_nrs.c +++ b/samples/bpf/syscall_nrs.c @@ -9,5 +9,11 @@ void syscall_defines(void) COMMENT("Linux system call numbers."); SYSNR(__NR_write); SYSNR(__NR_read); +#ifdef __NR_mmap2 + SYSNR(__NR_mmap2); +#endif +#ifdef __NR_mmap SYSNR(__NR_mmap); +#endif + } diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index 2dd1d39b152a..8ee47699a870 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -18,9 +18,6 @@ #include <libbpf.h> #include "bpf_load.h" #include "perf-sys.h" -#include "trace_helpers.h" - -static int pmu_fd; static __u64 time_get_ns(void) { @@ -31,12 +28,12 @@ static __u64 time_get_ns(void) } static __u64 start_time; +static __u64 cnt; #define MAX_CNT 100000ll -static int print_bpf_output(void *data, int size) +static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) { - static __u64 cnt; struct { __u64 pid; __u64 cookie; @@ -45,7 +42,7 @@ static int print_bpf_output(void *data, int size) if (e->cookie != 0x12345678) { printf("BUG pid %llx cookie %llx sized %d\n", e->pid, e->cookie, size); - return LIBBPF_PERF_EVENT_ERROR; + return; } cnt++; @@ -53,30 +50,14 @@ static int print_bpf_output(void *data, int size) if (cnt == MAX_CNT) { printf("recv %lld events per sec\n", MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); - return LIBBPF_PERF_EVENT_DONE; + return; } - - return LIBBPF_PERF_EVENT_CONT; -} - -static void test_bpf_perf_event(void) -{ - struct perf_event_attr attr = { - .sample_type = PERF_SAMPLE_RAW, - .type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_BPF_OUTPUT, - }; - int key = 0; - - pmu_fd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); - - assert(pmu_fd >= 0); - assert(bpf_map_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY) == 0); - ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); } int main(int argc, char **argv) { + 
struct perf_buffer_opts pb_opts = {}; + struct perf_buffer *pb; char filename[256]; FILE *f; int ret; @@ -88,16 +69,20 @@ int main(int argc, char **argv) return 1; } - test_bpf_perf_event(); - - if (perf_event_mmap(pmu_fd) < 0) + pb_opts.sample_cb = print_bpf_output; + pb = perf_buffer__new(map_fd[0], 8, &pb_opts); + ret = libbpf_get_error(pb); + if (ret) { + printf("failed to setup perf_buffer: %d\n", ret); return 1; + } f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r"); (void) f; start_time = time_get_ns(); - ret = perf_event_poller(pmu_fd, print_bpf_output); + while ((ret = perf_buffer__poll(pb, 1000)) >= 0 && cnt < MAX_CNT) { + } kill(0, SIGINT); return ret; } diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c index f57f4e1ea1ec..35cb0eed3be5 100644 --- a/samples/bpf/tracex5_kern.c +++ b/samples/bpf/tracex5_kern.c @@ -68,12 +68,25 @@ PROG(SYS__NR_read)(struct pt_regs *ctx) return 0; } +#ifdef __NR_mmap2 +PROG(SYS__NR_mmap2)(struct pt_regs *ctx) +{ + char fmt[] = "mmap2\n"; + + bpf_trace_printk(fmt, sizeof(fmt)); + return 0; +} +#endif + +#ifdef __NR_mmap PROG(SYS__NR_mmap)(struct pt_regs *ctx) { char fmt[] = "mmap\n"; + bpf_trace_printk(fmt, sizeof(fmt)); return 0; } +#endif char _license[] SEC("license") = "GPL"; u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c index a7e94e7ff87d..701a30f258b1 100644 --- a/samples/bpf/xdp_fwd_kern.c +++ b/samples/bpf/xdp_fwd_kern.c @@ -23,7 +23,8 @@ #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) -struct bpf_map_def SEC("maps") tx_port = { +/* For TX-traffic redirect requires net_device ifindex to be in this devmap */ +struct bpf_map_def SEC("maps") xdp_tx_ports = { .type = BPF_MAP_TYPE_DEVMAP, .key_size = sizeof(int), .value_size = sizeof(int), @@ -102,14 +103,34 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) fib_params.ifindex = ctx->ingress_ifindex; rc = bpf_fib_lookup(ctx, &fib_params, 
sizeof(fib_params), flags); - - /* verify egress index has xdp support - * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with - * cannot pass map_type 14 into func bpf_map_lookup_elem#1: - * NOTE: without verification that egress index supports XDP - * forwarding packets are dropped. + /* + * Some rc (return codes) from bpf_fib_lookup() are important, + * to understand how this XDP-prog interacts with network stack. + * + * BPF_FIB_LKUP_RET_NO_NEIGH: + * Even if route lookup was a success, then the MAC-addresses are also + * needed. This is obtained from arp/neighbour table, but if table is + * (still) empty then BPF_FIB_LKUP_RET_NO_NEIGH is returned. To avoid + * doing ARP lookup directly from XDP, then send packet to normal + * network stack via XDP_PASS and expect it will do ARP resolution. + * + * BPF_FIB_LKUP_RET_FWD_DISABLED: + * The bpf_fib_lookup respect sysctl net.ipv{4,6}.conf.all.forwarding + * setting, and will return BPF_FIB_LKUP_RET_FWD_DISABLED if not + * enabled this on ingress device. */ - if (rc == 0) { + if (rc == BPF_FIB_LKUP_RET_SUCCESS) { + /* Verify egress index has been configured as TX-port. + * (Note: User can still have inserted an egress ifindex that + * doesn't support XDP xmit, which will result in packet drops). + * + * Note: lookup in devmap supported since 0cdbb4b09a0. 
+ * If not supported will fail with: + * cannot pass map_type 14 into func bpf_map_lookup_elem#1: + */ + if (!bpf_map_lookup_elem(&xdp_tx_ports, &fib_params.ifindex)) + return XDP_PASS; + if (h_proto == htons(ETH_P_IP)) ip_decrease_ttl(iph); else if (h_proto == htons(ETH_P_IPV6)) @@ -117,7 +138,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); memcpy(eth->h_source, fib_params.smac, ETH_ALEN); - return bpf_redirect_map(&tx_port, fib_params.ifindex, 0); + return bpf_redirect_map(&xdp_tx_ports, fib_params.ifindex, 0); } return XDP_PASS; diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index 5b46ee12c696..97ff1dad7669 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -27,14 +27,20 @@ #include "libbpf.h" #include <bpf/bpf.h> - -static int do_attach(int idx, int fd, const char *name) +static int do_attach(int idx, int prog_fd, int map_fd, const char *name) { int err; - err = bpf_set_link_xdp_fd(idx, fd, 0); - if (err < 0) + err = bpf_set_link_xdp_fd(idx, prog_fd, 0); + if (err < 0) { printf("ERROR: failed to attach program to %s\n", name); + return err; + } + + /* Adding ifindex as a possible egress TX port */ + err = bpf_map_update_elem(map_fd, &idx, &idx, 0); + if (err) + printf("ERROR: failed using device %s as TX-port\n", name); return err; } @@ -47,6 +53,9 @@ static int do_detach(int idx, const char *name) if (err < 0) printf("ERROR: failed to detach program from %s\n", name); + /* TODO: Remember to cleanup map, when adding use of shared map + * bpf_map_delete_elem((map_fd, &idx); + */ return err; } @@ -67,10 +76,10 @@ int main(int argc, char **argv) }; const char *prog_name = "xdp_fwd"; struct bpf_program *prog; + int prog_fd, map_fd = -1; char filename[PATH_MAX]; struct bpf_object *obj; int opt, i, idx, err; - int prog_fd, map_fd; int attach = 1; int ret = 0; @@ -103,8 +112,14 @@ int main(int argc, char **argv) return 1; } - if 
(bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) + err = bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd); + if (err) { + printf("Does kernel support devmap lookup?\n"); + /* If not, the error message will be: + * "cannot pass map_type 14 into func bpf_map_lookup_elem#1" + */ return 1; + } prog = bpf_object__find_program_by_title(obj, prog_name); prog_fd = bpf_program__fd(prog); @@ -113,16 +128,12 @@ int main(int argc, char **argv) return 1; } map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj, - "tx_port")); + "xdp_tx_ports")); if (map_fd < 0) { printf("map not found: %s\n", strerror(map_fd)); return 1; } } - if (attach) { - for (i = 1; i < 64; ++i) - bpf_map_update_elem(map_fd, &i, &i, 0); - } for (i = optind; i < argc; ++i) { idx = if_nametoindex(argv[i]); @@ -138,7 +149,7 @@ int main(int argc, char **argv) if (err) ret = err; } else { - err = do_attach(idx, prog_fd, argv[i]); + err = do_attach(idx, prog_fd, map_fd, argv[i]); if (err) ret = err; } diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c index dc66345a929a..3002714e3cd5 100644 --- a/samples/bpf/xdp_sample_pkts_user.c +++ b/samples/bpf/xdp_sample_pkts_user.c @@ -17,14 +17,13 @@ #include <linux/if_link.h> #include "perf-sys.h" -#include "trace_helpers.h" #define MAX_CPUS 128 -static int pmu_fds[MAX_CPUS], if_idx; -static struct perf_event_mmap_page *headers[MAX_CPUS]; +static int if_idx; static char *if_name; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; static __u32 prog_id; +static struct perf_buffer *pb = NULL; static int do_attach(int idx, int fd, const char *name) { @@ -73,7 +72,7 @@ static int do_detach(int idx, const char *name) #define SAMPLE_SIZE 64 -static int print_bpf_output(void *data, int size) +static void print_bpf_output(void *ctx, int cpu, void *data, __u32 size) { struct { __u16 cookie; @@ -83,45 +82,20 @@ static int print_bpf_output(void *data, int size) int i; if (e->cookie != 0xdead) { - printf("BUG cookie %x sized %d\n", - 
e->cookie, size); - return LIBBPF_PERF_EVENT_ERROR; + printf("BUG cookie %x sized %d\n", e->cookie, size); + return; } printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len); for (i = 0; i < 14 && i < e->pkt_len; i++) printf("%02x ", e->pkt_data[i]); printf("\n"); - - return LIBBPF_PERF_EVENT_CONT; -} - -static void test_bpf_perf_event(int map_fd, int num) -{ - struct perf_event_attr attr = { - .sample_type = PERF_SAMPLE_RAW, - .type = PERF_TYPE_SOFTWARE, - .config = PERF_COUNT_SW_BPF_OUTPUT, - .wakeup_events = 1, /* get an fd notification for every event */ - }; - int i; - - for (i = 0; i < num; i++) { - int key = i; - - pmu_fds[i] = sys_perf_event_open(&attr, -1/*pid*/, i/*cpu*/, - -1/*group_fd*/, 0); - - assert(pmu_fds[i] >= 0); - assert(bpf_map_update_elem(map_fd, &key, - &pmu_fds[i], BPF_ANY) == 0); - ioctl(pmu_fds[i], PERF_EVENT_IOC_ENABLE, 0); - } } static void sig_handler(int signo) { do_detach(if_idx, if_name); + perf_buffer__free(pb); exit(0); } @@ -140,13 +114,13 @@ int main(int argc, char **argv) struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; + struct perf_buffer_opts pb_opts = {}; const char *optstr = "F"; int prog_fd, map_fd, opt; struct bpf_object *obj; struct bpf_map *map; char filename[256]; - int ret, err, i; - int numcpus; + int ret, err; while ((opt = getopt(argc, argv, optstr)) != -1) { switch (opt) { @@ -169,10 +143,6 @@ int main(int argc, char **argv) return 1; } - numcpus = get_nprocs(); - if (numcpus > MAX_CPUS) - numcpus = MAX_CPUS; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); prog_load_attr.file = filename; @@ -211,14 +181,17 @@ int main(int argc, char **argv) return 1; } - test_bpf_perf_event(map_fd, numcpus); + pb_opts.sample_cb = print_bpf_output; + pb = perf_buffer__new(map_fd, 8, &pb_opts); + err = libbpf_get_error(pb); + if (err) { + perror("perf_buffer setup failed"); + return 1; + } - for (i = 0; i < numcpus; i++) - if (perf_event_mmap_header(pmu_fds[i], &headers[i]) < 0) - 
return 1; + while ((ret = perf_buffer__poll(pb, 1000)) >= 0) { + } - ret = perf_event_poller_multi(pmu_fds, headers, numcpus, - print_bpf_output); kill(0, SIGINT); return ret; } diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 93eaaf7239b2..df011ac33402 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -67,8 +67,14 @@ static int opt_ifindex; static int opt_queue; static int opt_poll; static int opt_interval = 1; +static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP; +static u32 opt_umem_flags; +static int opt_unaligned_chunks; +static int opt_mmap_flags; static u32 opt_xdp_bind_flags; static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; +static int opt_timeout = 1000; +static bool opt_need_wakeup = true; static __u32 prog_id; struct xsk_umem_info { @@ -282,7 +288,9 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, .frame_size = opt_xsk_frame_size, .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, + .flags = opt_umem_flags }; + int ret; umem = calloc(1, sizeof(*umem)); @@ -291,6 +299,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, &cfg); + if (ret) exit_with_error(-ret); @@ -352,6 +361,8 @@ static struct option long_options[] = { {"zero-copy", no_argument, 0, 'z'}, {"copy", no_argument, 0, 'c'}, {"frame-size", required_argument, 0, 'f'}, + {"no-need-wakeup", no_argument, 0, 'm'}, + {"unaligned", no_argument, 0, 'u'}, {0, 0, 0, 0} }; @@ -372,6 +383,9 @@ static void usage(const char *prog) " -z, --zero-copy Force zero-copy mode.\n" " -c, --copy Force copy mode.\n" " -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n" + " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n" + " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n" + " -u, --unaligned 
Enable unaligned chunk placement\n" "\n"; fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE); exit(EXIT_FAILURE); @@ -384,8 +398,8 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options, - &option_index); + c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:mu", + long_options, &option_index); if (c == -1) break; @@ -424,12 +438,21 @@ static void parse_command_line(int argc, char **argv) case 'c': opt_xdp_bind_flags |= XDP_COPY; break; + case 'u': + opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG; + opt_unaligned_chunks = 1; + opt_mmap_flags = MAP_HUGETLB; + break; case 'F': opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; break; case 'f': opt_xsk_frame_size = atoi(optarg); + case 'm': + opt_need_wakeup = false; + opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP; break; + default: usage(basename(argv[0])); } @@ -442,7 +465,8 @@ static void parse_command_line(int argc, char **argv) usage(basename(argv[0])); } - if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) { + if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) && + !opt_unaligned_chunks) { fprintf(stderr, "--frame-size=%d is not a power of two\n", opt_xsk_frame_size); usage(basename(argv[0])); @@ -459,8 +483,10 @@ static void kick_tx(struct xsk_socket_info *xsk) exit_with_error(errno); } -static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) +static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, + struct pollfd *fds) { + struct xsk_umem_info *umem = xsk->umem; u32 idx_cq = 0, idx_fq = 0; unsigned int rcvd; size_t ndescs; @@ -468,27 +494,30 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) if (!xsk->outstanding_tx) return; - kick_tx(xsk); + if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); + ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? 
BATCH_SIZE : xsk->outstanding_tx; /* re-add completed Tx buffers */ - rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq); + rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq); if (rcvd > 0) { unsigned int i; int ret; - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); - ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, - &idx_fq); + if (xsk_ring_prod__needs_wakeup(&umem->fq)) + ret = poll(fds, num_socks, opt_timeout); + ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq); } + for (i = 0; i < rcvd; i++) - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = - *xsk_ring_cons__comp_addr(&xsk->umem->cq, - idx_cq++); + *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) = + *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++); xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->umem->cq, rcvd); @@ -505,7 +534,8 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk) if (!xsk->outstanding_tx) return; - kick_tx(xsk); + if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx); if (rcvd > 0) { @@ -515,30 +545,38 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk) } } -static void rx_drop(struct xsk_socket_info *xsk) +static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds) { unsigned int rcvd, i; u32 idx_rx = 0, idx_fq = 0; int ret; rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); - if (!rcvd) + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + ret = poll(fds, num_socks, opt_timeout); return; + } ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); while (ret != rcvd) { if (ret < 0) exit_with_error(-ret); + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + ret = poll(fds, num_socks, opt_timeout); ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); } for (i 
= 0; i < rcvd; i++) { u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + u64 orig = xsk_umem__extract_addr(addr); + + addr = xsk_umem__add_offset_to_addr(addr); char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); hex_dump(pkt, len, addr); - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr; + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig; } xsk_ring_prod__submit(&xsk->umem->fq, rcvd); @@ -549,42 +587,65 @@ static void rx_drop(struct xsk_socket_info *xsk) static void rx_drop_all(void) { struct pollfd fds[MAX_SOCKS + 1]; - int i, ret, timeout, nfds = 1; + int i, ret; memset(fds, 0, sizeof(fds)); for (i = 0; i < num_socks; i++) { fds[i].fd = xsk_socket__fd(xsks[i]->xsk); fds[i].events = POLLIN; - timeout = 1000; /* 1sn */ } for (;;) { if (opt_poll) { - ret = poll(fds, nfds, timeout); + ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; } for (i = 0; i < num_socks; i++) - rx_drop(xsks[i]); + rx_drop(xsks[i], fds); + } +} + +static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb) +{ + u32 idx; + + if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) { + unsigned int i; + + for (i = 0; i < BATCH_SIZE; i++) { + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr = + (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = + sizeof(pkt_data) - 1; + } + + xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE); + xsk->outstanding_tx += BATCH_SIZE; + frame_nb += BATCH_SIZE; + frame_nb %= NUM_FRAMES; } + + complete_tx_only(xsk); } -static void tx_only(struct xsk_socket_info *xsk) +static void tx_only_all(void) { - int timeout, ret, nfds = 1; - struct pollfd fds[nfds + 1]; - u32 idx, frame_nb = 0; + struct pollfd fds[MAX_SOCKS]; + u32 frame_nb[MAX_SOCKS] = {}; + int i, ret; memset(fds, 0, sizeof(fds)); - fds[0].fd = xsk_socket__fd(xsk->xsk); - fds[0].events = POLLOUT; - timeout = 1000; /* 1sn */ + for (i = 0; i < 
num_socks; i++) { + fds[0].fd = xsk_socket__fd(xsks[i]->xsk); + fds[0].events = POLLOUT; + } for (;;) { if (opt_poll) { - ret = poll(fds, nfds, timeout); + ret = poll(fds, num_socks, opt_timeout); if (ret <= 0) continue; @@ -592,69 +653,78 @@ static void tx_only(struct xsk_socket_info *xsk) continue; } - if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == - BATCH_SIZE) { - unsigned int i; - - for (i = 0; i < BATCH_SIZE; i++) { - xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr - = (frame_nb + i) * opt_xsk_frame_size; - xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = - sizeof(pkt_data) - 1; - } - - xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE); - xsk->outstanding_tx += BATCH_SIZE; - frame_nb += BATCH_SIZE; - frame_nb %= NUM_FRAMES; - } - - complete_tx_only(xsk); + for (i = 0; i < num_socks; i++) + tx_only(xsks[i], frame_nb[i]); } } -static void l2fwd(struct xsk_socket_info *xsk) +static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) { - for (;;) { - unsigned int rcvd, i; - u32 idx_rx = 0, idx_tx = 0; - int ret; + unsigned int rcvd, i; + u32 idx_rx = 0, idx_tx = 0; + int ret; - for (;;) { - complete_tx_l2fwd(xsk); + complete_tx_l2fwd(xsk, fds); - rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, - &idx_rx); - if (rcvd > 0) - break; - } + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); + if (!rcvd) { + if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) + ret = poll(fds, num_socks, opt_timeout); + return; + } + ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) + kick_tx(xsk); ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); - while (ret != rcvd) { - if (ret < 0) - exit_with_error(-ret); - ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); - } + } - for (i = 0; i < rcvd; i++) { - u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, - idx_rx)->addr; - u32 len = xsk_ring_cons__rx_desc(&xsk->rx, - idx_rx++)->len; - char *pkt = 
xsk_umem__get_data(xsk->umem->buffer, addr); + for (i = 0; i < rcvd; i++) { + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + u64 orig = addr; - swap_mac_addresses(pkt); + addr = xsk_umem__add_offset_to_addr(addr); + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - hex_dump(pkt, len, addr); - xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr; - xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; - } + swap_mac_addresses(pkt); + + hex_dump(pkt, len, addr); + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig; + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; + } + + xsk_ring_prod__submit(&xsk->tx, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); + + xsk->rx_npkts += rcvd; + xsk->outstanding_tx += rcvd; +} - xsk_ring_prod__submit(&xsk->tx, rcvd); - xsk_ring_cons__release(&xsk->rx, rcvd); +static void l2fwd_all(void) +{ + struct pollfd fds[MAX_SOCKS]; + int i, ret; + + memset(fds, 0, sizeof(fds)); - xsk->rx_npkts += rcvd; - xsk->outstanding_tx += rcvd; + for (i = 0; i < num_socks; i++) { + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); + fds[i].events = POLLOUT | POLLIN; + } + + for (;;) { + if (opt_poll) { + ret = poll(fds, num_socks, opt_timeout); + if (ret <= 0) + continue; + } + + for (i = 0; i < num_socks; i++) + l2fwd(xsks[i], fds); } } @@ -674,11 +744,14 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } - ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ - NUM_FRAMES * opt_xsk_frame_size); - if (ret) - exit_with_error(ret); - + /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */ + bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0); + if (bufs == MAP_FAILED) { + printf("ERROR: mmap failed\n"); + exit(EXIT_FAILURE); + } /* Create sockets... 
*/ umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size); xsks[num_socks++] = xsk_configure_socket(umem); @@ -705,9 +778,9 @@ int main(int argc, char **argv) if (opt_bench == BENCH_RXDROP) rx_drop_all(); else if (opt_bench == BENCH_TXONLY) - tx_only(xsks[0]); + tx_only_all(); else - l2fwd(xsks[0]); + l2fwd_all(); return 0; } |