diff options
Diffstat (limited to 'samples')
-rw-r--r-- | samples/bpf/Makefile | 4 | ||||
-rw-r--r-- | samples/bpf/cpustat_kern.c | 281 | ||||
-rw-r--r-- | samples/bpf/cpustat_user.c | 219 | ||||
-rw-r--r-- | samples/bpf/tcbpf2_kern.c | 6 | ||||
-rwxr-xr-x | samples/bpf/test_cgrp2_sock.sh | 1 | ||||
-rwxr-xr-x | samples/bpf/test_cgrp2_sock2.sh | 3 | ||||
-rwxr-xr-x | samples/bpf/test_tunnel_bpf.sh | 5 | ||||
-rw-r--r-- | samples/bpf/xdp_redirect_user.c | 7 | ||||
-rw-r--r-- | samples/sockmap/Makefile | 2 | ||||
-rw-r--r-- | samples/sockmap/sockmap_user.c | 1 |
10 files changed, 524 insertions, 5 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ec3fc8d88e87..2c2a587e0942 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp +hostprogs-y += cpustat # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o @@ -93,6 +94,7 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o +cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -144,6 +146,7 @@ always += xdp_monitor_kern.o always += xdp_rxq_info_kern.o always += xdp2skb_meta_kern.o always += syscall_tp_kern.o +always += cpustat_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -188,6 +191,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf +HOSTLOADLIBES_cpustat += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c new file mode 100644 index 000000000000..68c84da065b1 --- /dev/null +++ b/samples/bpf/cpustat_kern.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* + * The CPU number, cstate number and pstate number are based + * on 96boards Hikey with octa CA53 CPUs. 
+ * + * Every CPU has three idle states for cstate: + * WFI, CPU_OFF, CLUSTER_OFF + * + * Every CPU has 5 operating points: + * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz + * + * This code is based on these assumptions and other platforms + * need to adjust these definitions. + */ +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 + +static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; + +/* + * my_map is used to record the cstate and pstate index and + * timestamp (Idx, Ts); when a new event comes in we need to update the + * combination with the new state index and timestamp (Idx`, Ts`). + * + * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time + * interval for the previous state: Duration(Idx) = Ts` - Ts. + * + * Every CPU has one array as below for recording the state index and + * timestamp, recorded separately for cstate and pstate: + * + * +--------------------------+ + * | cstate timestamp | + * +--------------------------+ + * | cstate index | + * +--------------------------+ + * | pstate timestamp | + * +--------------------------+ + * | pstate index | + * +--------------------------+ + */ +#define MAP_OFF_CSTATE_TIME 0 +#define MAP_OFF_CSTATE_IDX 1 +#define MAP_OFF_PSTATE_TIME 2 +#define MAP_OFF_PSTATE_IDX 3 +#define MAP_OFF_NUM 4 + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * MAP_OFF_NUM, +}; + +/* cstate_duration records duration time for every idle state per CPU */ +struct bpf_map_def SEC("maps") cstate_duration = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES, +}; + +/* pstate_duration records duration time for every operating point per CPU */ +struct bpf_map_def SEC("maps") pstate_duration = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * 
MAX_PSTATE_ENTRIES, +}; + +/* + * The trace events for cpu_idle and cpu_frequency are taken from: + * /sys/kernel/debug/tracing/events/power/cpu_idle/format + * /sys/kernel/debug/tracing/events/power/cpu_frequency/format + * + * These two events have same format, so define one common structure. + */ +struct cpu_args { + u64 pad; + u32 state; + u32 cpu_id; +}; + +/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */ +static u32 find_cpu_pstate_idx(u32 frequency) +{ + u32 i; + + for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) { + if (frequency == cpu_opps[i]) + return i; + } + + return i; +} + +/* + * cpu_idle tracepoint handler: on idle entry (state != (u32)-1) it adds + * the time since the last pstate timestamp to pstate_duration; on idle + * exit (state == (u32)-1) it adds the time since the last cstate + * timestamp to cstate_duration for the state that was just left. + */ +SEC("tracepoint/power/cpu_idle") +int bpf_prog1(struct cpu_args *ctx) +{ + u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + /* Valid CPU ids are 0..MAX_CPU-1, the maps have no slot for MAX_CPU */ + if (ctx->cpu_id >= MAX_CPU) + return 0; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME; + cts = bpf_map_lookup_elem(&my_map, &key); + if (!cts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + prev_state = *cstate; + *cstate = ctx->state; + + if (!*cts) { + *cts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *cts; + *cts = cur_ts; + + /* + * When state doesn't equal to (u32)-1, the cpu will enter + * one idle state; for this case we need to record interval + * for the pstate. 
+ * + * OPP2 + * +---------------------+ + * OPP1 | | + * ---------+ | + * | Idle state + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + if (ctx->state != (u32)-1) { + + /* record pstate after have first cpu_frequency event */ + if (!*pts) + return 0; + + delta = cur_ts - *pts; + + pstate_idx = find_cpu_pstate_idx(*pstate); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + /* + * When state equals (u32)-1, the cpu just exits from one + * specific idle state; for this case we need to record + * interval for the cstate. + * + * OPP2 + * -----------+ + * | OPP1 + * | +----------- + * | Idle state | + * +---------------------+ + * + * |<- cstate duration ->| + * ^ ^ + * cts cur_ts + */ + } else { + + /* + * Skip idle states the table has no slot for; an + * out-of-range index would be charged to another CPU. + */ + if (prev_state < MAX_CSTATE_ENTRIES) { + key = cpu * MAX_CSTATE_ENTRIES + prev_state; + val = bpf_map_lookup_elem(&cstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + } + } + + /* Update timestamp for pstate as new start time */ + if (*pts) + *pts = cur_ts; + + return 0; +} + +/* + * cpu_frequency tracepoint handler: stores the new frequency and, unless + * the CPU is idle, accounts the elapsed time in pstate_duration. + */ +SEC("tracepoint/power/cpu_frequency") +int bpf_prog2(struct cpu_args *ctx) +{ + u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + /* The maps only have slots for CPU ids 0..MAX_CPU-1 */ + if (ctx->cpu_id >= MAX_CPU) + return 0; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + prev_state = *pstate; + *pstate = ctx->state; + + if (!*pts) { + *pts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *pts; + *pts = cur_ts; + + /* When CPU is in idle, bail out to skip pstate statistics */ + if (*cstate 
!= (u32)(-1)) + return 0; + + /* + * The cpu changes to another different OPP (in below diagram + * change frequency from OPP3 to OPP1), need recording interval + * for previous frequency OPP3 and update timestamp as start + * time for new frequency OPP1. + * + * OPP3 + * +---------------------+ + * OPP2 | | + * ---------+ | + * | OPP1 + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + /* The elapsed interval was spent at the previous frequency */ + pstate_idx = find_cpu_pstate_idx(prev_state); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c new file mode 100644 index 000000000000..2b4cd1ae57c5 --- /dev/null +++ b/samples/bpf/cpustat_user.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sched.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <locale.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/wait.h> + +#include "libbpf.h" +#include "bpf_load.h" + +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 +#define MAX_STARS 40 + +/* Frequencies in kHz: lowest (208MHz) and highest (1.2GHz) operating point */ +#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq" +#define CPUFREQ_LOWEST_FREQ "208000" +#define CPUFREQ_HIGHEST_FREQ "1200000" + +struct cpu_stat_data { + unsigned long cstate[MAX_CSTATE_ENTRIES]; + unsigned long pstate[MAX_PSTATE_ENTRIES]; +}; + +static struct cpu_stat_data stat_data[MAX_CPU]; + +static void cpu_stat_print(void) +{ + int i, j; + char state_str[sizeof("cstate-9")]; + struct cpu_stat_data *data; + + /* Clear screen */ + 
printf("\033[2J"); + + /* Header */ + printf("\nCPU states statistics:\n"); + printf("%-10s ", "state(ms)"); + + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + sprintf(state_str, "cstate-%d", i); + printf("%-11s ", state_str); + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + sprintf(state_str, "pstate-%d", i); + printf("%-11s ", state_str); + } + + printf("\n"); + + for (j = 0; j < MAX_CPU; j++) { + data = &stat_data[j]; + + printf("CPU-%-6d ", j); + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) + printf("%-11ld ", data->cstate[i] / 1000000); + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) + printf("%-11ld ", data->pstate[i] / 1000000); + + printf("\n"); + } +} + +/* + * Copy every per-CPU cstate/pstate duration from the two BPF maps into + * stat_data. The key and value widths match the maps' key_size (u32) and + * value_size (u64); a failed lookup keeps the previously stored value. + */ +static void cpu_stat_update(int cstate_fd, int pstate_fd) +{ + __u32 key; + __u64 value; + int c, i; + + for (c = 0; c < MAX_CPU; c++) { + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + key = c * MAX_CSTATE_ENTRIES + i; + if (bpf_map_lookup_elem(cstate_fd, &key, &value)) + continue; + stat_data[c].cstate[i] = value; + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + key = c * MAX_PSTATE_ENTRIES + i; + if (bpf_map_lookup_elem(pstate_fd, &key, &value)) + continue; + stat_data[c].pstate[i] = value; + } + } +} + +/* + * This function is copied from 'idlestat' tool function + * idlestat_wake_all() in idlestate.c. + * + * It sets the self running task affinity to cpus one by one so can wake up + * the specific CPU to handle scheduling; this results in all cpus can be + * waken up once and produce ftrace event 'trace_cpu_idle'. 
+ */ +static int cpu_stat_inject_cpu_idle_event(void) +{ + int rcpu, i, ret; + cpu_set_t cpumask; + cpu_set_t original_cpumask; + + /* The number of configured CPUs bounds the wake-up loop below */ + ret = sysconf(_SC_NPROCESSORS_CONF); + if (ret < 0) + return -1; + + rcpu = sched_getcpu(); + if (rcpu < 0) + return -1; + + /* Keep track of the CPUs we will run on */ + sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask); + + for (i = 0; i < ret; i++) { + + /* Pointless to wake up ourself */ + if (i == rcpu) + continue; + + /* Pointless to wake CPUs we will not run on */ + if (!CPU_ISSET(i, &original_cpumask)) + continue; + + CPU_ZERO(&cpumask); + CPU_SET(i, &cpumask); + + /* Pin this task to CPU i only; the return value is not checked */ + sched_setaffinity(0, sizeof(cpumask), &cpumask); + } + + /* Enable all the CPUs of the original mask */ + sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask); + return 0; +} + +/* + * It's possible to have no frequency change for a long time, in which case no + * ftrace event 'trace_cpu_frequency' arrives for a long period; this introduces + * a big deviation in the pstate statistics. + * + * To solve this issue, the code below forces 'scaling_max_freq' to 208MHz + * to trigger the ftrace event 'trace_cpu_frequency' and then restores + * the maximum frequency value 1.2GHz. 
+ */ +static int cpu_stat_inject_cpu_frequency_event(void) +{ + int len, fd; + + fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY); + if (fd < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + return fd; + } + + /* Drop the limit to the lowest OPP first, then restore the highest */ + len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ)); + if (len < 0) { + printf("failed to write scaling_max_freq, errno=%d\n", errno); + goto err; + } + + len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ)); + if (len < 0) { + printf("failed to write scaling_max_freq, errno=%d\n", errno); + goto err; + } + +err: + close(fd); + return len; +} + +static void int_exit(int sig) +{ + cpu_stat_inject_cpu_idle_event(); + cpu_stat_inject_cpu_frequency_event(); + cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_print(); + exit(0); +} + +int main(int argc, char **argv) +{ + char filename[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + ret = cpu_stat_inject_cpu_idle_event(); + if (ret < 0) + return 1; + + ret = cpu_stat_inject_cpu_frequency_event(); + if (ret < 0) + return 1; + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + while (1) { + cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_print(); + sleep(5); + } + + return 0; +} diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index efdc16d195ff..9a8db7bd6db4 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -52,7 +52,8 @@ int _gre_set_tunnel(struct __sk_buff *skb) key.tunnel_tos = 0; key.tunnel_ttl = 64; - ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER); if (ret < 0) { ERROR(ret); return TC_ACT_SHOT; @@ -92,7 +93,8 @@ int _ip6gretap_set_tunnel(struct __sk_buff *skb) key.tunnel_label = 0xabcde; ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), - BPF_F_TUNINFO_IPV6 | 
BPF_F_ZERO_CSUM_TX); + BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_SEQ_NUMBER); if (ret < 0) { ERROR(ret); return TC_ACT_SHOT; diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh index 8ee0371a100a..9f6174236856 100755 --- a/samples/bpf/test_cgrp2_sock.sh +++ b/samples/bpf/test_cgrp2_sock.sh @@ -61,6 +61,7 @@ cleanup_and_exit() [ -n "$msg" ] && echo "ERROR: $msg" + test_cgrp2_sock -d ${CGRP_MNT}/sockopts ip li del cgrp2_sock umount ${CGRP_MNT} diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh index fc4e64d00cb3..0f396a86e0cb 100755 --- a/samples/bpf/test_cgrp2_sock2.sh +++ b/samples/bpf/test_cgrp2_sock2.sh @@ -28,6 +28,9 @@ function attach_bpf { } function cleanup { + if [ -d /tmp/cgroupv2/foo ]; then + test_cgrp2_sock -d /tmp/cgroupv2/foo + fi ip link del veth0b ip netns delete at_ns0 umount /tmp/cgroupv2 diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index 43ce049996ee..c265863ccdf9 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -23,7 +23,8 @@ function config_device { function add_gre_tunnel { # in namespace ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE key 2 local 172.16.1.100 remote 172.16.1.200 + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local 172.16.1.100 remote 172.16.1.200 ip netns exec at_ns0 ip link set dev $DEV_NS up ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 @@ -43,7 +44,7 @@ function add_ip6gretap_tunnel { # in namespace ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE flowlabel 0xbcdef key 2 \ + ip link add dev $DEV_NS type $TYPE seq flowlabel 0xbcdef key 2 \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index d54e91eb6cbf..b701b5c21342 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -20,6 +20,7 @@ #include <string.h> #include 
<unistd.h> #include <libgen.h> +#include <sys/resource.h> #include "bpf_load.h" #include "bpf_util.h" @@ -75,6 +76,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int ret, opt, key = 0; @@ -98,6 +100,11 @@ int main(int argc, char **argv) return 1; } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex_in = strtoul(argv[optind], NULL, 0); ifindex_out = strtoul(argv[optind + 1], NULL, 0); printf("input: %d output: %d\n", ifindex_in, ifindex_out); diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile index 73f1da4d116c..9bf2881bd11b 100644 --- a/samples/sockmap/Makefile +++ b/samples/sockmap/Makefile @@ -2,7 +2,7 @@ hostprogs-y := sockmap # Libbpf dependencies -LIBBPF := ../../tools/lib/bpf/bpf.o +LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 7c25c0c112bc..95a54a89a532 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -566,6 +566,7 @@ run: else fprintf(stderr, "unknown test\n"); out: + bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS); close(s1); close(s2); close(p1); |