 -rw-r--r--  include/linux/bpf.h         | 50
 -rw-r--r--  include/linux/filter.h      |  3
 -rw-r--r--  kernel/bpf/cgroup.c         | 25
 -rw-r--r--  kernel/bpf/syscall.c        | 12
 -rw-r--r--  kernel/bpf/verifier.c       | 16
 -rw-r--r--  net/ipv4/ip_output.c        | 34
 -rw-r--r--  net/ipv6/ip6_output.c       | 26
 -rwxr-xr-x  samples/bpf/do_hbm_test.sh  | 10
 -rw-r--r--  samples/bpf/hbm.c           | 51
 -rw-r--r--  samples/bpf/hbm.h           |  9
 -rw-r--r--  samples/bpf/hbm_kern.h      | 66
 -rw-r--r--  samples/bpf/hbm_out_kern.c  | 48
 12 files changed, 299 insertions(+), 51 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ff3e00ff84d2..2cc58fc0f413 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -552,6 +552,56 @@ _out: \
_ret; \
})
+/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
+ * so BPF programs can request cwr for TCP packets.
+ *
+ * Current cgroup skb programs can only return 0 or 1 (0 to drop the
+ * packet, 1 to keep it). This macro changes the behavior so the low
+ * order bit indicates whether the packet should be dropped (0) or
+ * not (1) and the next bit is a congestion notification bit. This
+ * could be used by TCP to call tcp_enter_cwr().
+ *
+ * Hence, the new allowed return values of CGROUP EGRESS BPF programs are:
+ *   0: drop packet
+ *   1: keep packet
+ *   2: drop packet and cn
+ *   3: keep packet and cn
+ *
+ * This macro then converts it to one of the NET_XMIT values or an
+ * error code that is then interpreted as drop packet (and no cn):
+ *   0: NET_XMIT_SUCCESS  skb should be transmitted
+ *   1: NET_XMIT_DROP     skb should be dropped and cn
+ *   2: NET_XMIT_CN       skb should be transmitted and cn
+ *   3: -EPERM            skb should be dropped
+ */
+#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \
+ ({ \
+ struct bpf_prog_array_item *_item; \
+ struct bpf_prog *_prog; \
+ struct bpf_prog_array *_array; \
+ u32 ret; \
+ u32 _ret = 1; \
+ u32 _cn = 0; \
+ preempt_disable(); \
+ rcu_read_lock(); \
+ _array = rcu_dereference(array); \
+ _item = &_array->items[0]; \
+ while ((_prog = READ_ONCE(_item->prog))) { \
+ bpf_cgroup_storage_set(_item->cgroup_storage); \
+ ret = func(_prog, ctx); \
+ _ret &= (ret & 1); \
+ _cn |= (ret & 2); \
+ _item++; \
+ } \
+ rcu_read_unlock(); \
+ preempt_enable(); \
+ if (_ret) \
+ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \
+ else \
+ _ret = (_cn ? NET_XMIT_DROP : -EPERM); \
+ _ret; \
+ })
+
#define BPF_PROG_RUN_ARRAY(array, ctx, func) \
__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
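
For illustration, a minimal cgroup/skb egress program written against this new convention might look like the sketch below. This is not part of the patch: the header location of bpf_helpers.h varies by libbpf version, and the length threshold is an arbitrary stand-in for a real congestion signal.

```c
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup_skb/egress")
int egress_cn_demo(struct __sk_buff *skb)
{
	/* Return value: bit 0 = keep (1) or drop (0) the packet,
	 * bit 1 = request a congestion notification (cn).
	 */
	if (skb->len > 1400)
		return 3;	/* keep the packet, but signal congestion */
	return 1;		/* keep the packet, no cn */
}

char _license[] SEC("license") = "GPL";
```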
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ba8b65270e0d..43b45d6db36d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -526,7 +526,8 @@ struct bpf_prog {
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
- has_callchain_buf:1; /* callchain buffer allocated? */
+ has_callchain_buf:1, /* callchain buffer allocated? */
+ enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ff594eb86fd7..1b65ab0df457 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -587,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* The program type passed in via @type must be suitable for network
* filtering. No further check is performed to assert that.
*
- * This function will return %-EPERM if any if an attached program was found
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ * For egress packets, this function can return:
+ * NET_XMIT_SUCCESS (0) - continue with packet output
+ * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
+ * NET_XMIT_CN (2) - continue with packet output and notify TCP
+ * to call cwr
+ * -EPERM - drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
@@ -614,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
+ if (type == BPF_CGROUP_INET_EGRESS) {
+ ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
+ cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ } else {
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ __bpf_prog_run_save_cb);
+ ret = (ret == 1 ? 0 : -EPERM);
+ }
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
- return ret == 1 ? 0 : -EPERM;
+
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
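
The conversion the macro performs can be restated in plain C. In the sketch below (a restatement for clarity, not code from the patch), keep is the AND of every program's low bit and cn is the OR of every program's second bit:

```c
#include <linux/errno.h>
#include <linux/netdevice.h>	/* NET_XMIT_SUCCESS, NET_XMIT_DROP, NET_XMIT_CN */

static int egress_verdict(bool keep, bool cn)
{
	if (keep)
		return cn ? NET_XMIT_CN : NET_XMIT_SUCCESS;	/* 2 : 0 */
	return cn ? NET_XMIT_DROP : -EPERM;			/* 1 : -EPERM */
}
```

So a single program returning 0 forces a drop for the whole array, while any program in the array may raise the congestion bit.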
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3d546b6f4646..1539774d78c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ return 0;
+ default:
+ return -EINVAL;
+ }
default:
return 0;
}
@@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
default:
return 0;
}
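
From userspace, expected_attach_type is supplied at program load time. A hedged sketch using the raw bpf(2) syscall (field names are from the UAPI union bpf_attr; error handling and instruction generation are elided):

```c
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_cgroup_skb_egress(const struct bpf_insn *insns, unsigned int cnt)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_CGROUP_SKB;
	attr.expected_attach_type = BPF_CGROUP_INET_EGRESS;
	attr.insns = (unsigned long)insns;
	attr.insn_cnt = cnt;
	attr.license = (unsigned long)"GPL";

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}
```

With this patch, attaching such a program to BPF_CGROUP_INET_INGRESS later fails with -EINVAL, but only if the verifier saw that it can actually return 2 or 3 (see the enforce_expected_attach_type logic in the verifier hunk below).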
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2778417e6e0c..5c2cb5bd84ce 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
static int check_return_code(struct bpf_verifier_env *env)
{
+ struct tnum enforce_attach_type_range = tnum_unknown;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SKB:
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
+ range = tnum_range(0, 3);
+ enforce_attach_type_range = tnum_range(2, 3);
+ }
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_SOCK_OPS:
@@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env)
}
if (!tnum_in(range, reg->var_off)) {
+ char tn_buf[48];
+
verbose(env, "At program exit the register R0 ");
if (!tnum_is_unknown(reg->var_off)) {
- char tn_buf[48];
-
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "has value %s", tn_buf);
} else {
verbose(env, "has unknown scalar value");
}
- verbose(env, " should have been 0 or 1\n");
+ tnum_strn(tn_buf, sizeof(tn_buf), range);
+ verbose(env, " should have been %s\n", tn_buf);
return -EINVAL;
}
+
+ if (!tnum_is_unknown(enforce_attach_type_range) &&
+ tnum_in(enforce_attach_type_range, reg->var_off))
+ env->prog->enforce_expected_attach_type = 1;
return 0;
}
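
For readers unfamiliar with the verifier's tracked numbers: a tnum is a (value, mask) pair in which mask bits are unknown. The sketch below is a simplified userland restatement (condensed, by assumption, from kernel/bpf/tnum.c) of the two helpers the hunk relies on:

```c
#include <stdbool.h>
#include <stdint.h>

struct tnum {
	uint64_t value;	/* bits known to be 1 */
	uint64_t mask;	/* bits whose value is unknown */
};

/* Smallest tnum covering every integer in [min, max]; tnum_range(0, 3)
 * yields {value = 0, mask = 3}: low two bits unknown, the rest known 0. */
static struct tnum tnum_range(uint64_t min, uint64_t max)
{
	uint64_t chi = min ^ max, delta;
	int bits = chi ? 64 - __builtin_clzll(chi) : 0;

	delta = bits ? ((bits < 64) ? (1ULL << bits) - 1 : ~0ULL) : 0;
	return (struct tnum){ .value = min & ~delta, .mask = delta };
}

/* True iff every value that x may take fits inside range. */
static bool tnum_in(struct tnum range, struct tnum x)
{
	if (x.mask & ~range.mask)
		return false;
	x.value &= ~range.mask;
	return range.value == x.value;
}
```

So tnum_in(tnum_range(0, 3), reg->var_off) accepts R0 only when the verifier has proven it lies in {0, 1, 2, 3}, and enforce_expected_attach_type is set only when R0 can actually reach 2 or 3.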
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index bfd0ca554977..1217a53381c2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
return ret;
}
-static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
- int ret;
-
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
- return ret;
- }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -315,18 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return ip_finish_output2(net, sk, skb);
}
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb(skb);
+ return ret;
+ }
+}
+
static int ip_mc_finish_output(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
int ret;
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return dev_loopback_xmit(net, sk, skb);
+ case NET_XMIT_CN:
+ return dev_loopback_xmit(net, sk, skb) ? : ret;
+ default:
kfree_skb(skb);
return ret;
}
-
- return dev_loopback_xmit(net, sk, skb);
}
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
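
The `expr ? : ret` form used above is the GNU C conditional with an omitted middle operand: it evaluates expr once and yields it when nonzero, otherwise yields ret. The NET_XMIT_CN case therefore expands to the following sketch:

```c
case NET_XMIT_CN: {
	int err = __ip_finish_output(net, sk, skb);

	/* transmit, but surface the congestion verdict when the
	 * lower layer reported plain success (0) */
	return err ? err : NET_XMIT_CN;
}
```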
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index adef2236abe2..a75bc21d8c88 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -128,16 +128,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
return -EINVAL;
}
-static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- int ret;
-
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
- return ret;
- }
-
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
@@ -154,6 +146,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
return ip6_finish_output2(net, sk, skb);
}
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip6_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip6_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb(skb);
+ return ret;
+ }
+}
+
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
index 56c8b4115c95..e48b047d4646 100755
--- a/samples/bpf/do_hbm_test.sh
+++ b/samples/bpf/do_hbm_test.sh
@@ -13,10 +13,10 @@ Usage() {
echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create"
echo "loads. The output is the goodput in Mbps (unless -D was used)."
echo ""
- echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]"
- echo " [-d=<delay>|--delay=<delay>] [--debug] [-E]"
+ echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
+ echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E]"
echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
- echo " [-l] [-N] [-p=<port>|--port=<port>] [-P]"
+ echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]"
echo " Where:"
@@ -33,6 +33,7 @@ Usage() {
echo " -f or --flows number of concurrent flows (default=1)"
echo " -i or --id cgroup id (an integer, default is 1)"
echo " -N use netperf instead of iperf3"
+ echo " --no_cn Do not return CN notifications"
echo " -l do not limit flows using loopback"
echo " -h Help"
echo " -p or --port iperf3 port (default is 5201)"
@@ -115,6 +116,9 @@ processArgs () {
-c=*|--cc=*)
cc="${i#*=}"
;;
+ --no_cn)
+ flags="$flags --no_cn"
+ ;;
--debug)
flags="$flags -d"
debug_flag=1
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index a79828ab273f..480b7ad6a1f2 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -16,6 +16,7 @@
* -l Also limit flows doing loopback
* -n <#> To create cgroup \"/hbm#\" and attach prog
* Default is /hbm1
+ * --no_cn Do not return cn notifications
* -r <rate> Rate limit in Mbps
* -s Get HBM stats (marked, dropped, etc.)
* -t <time> Exit after specified seconds (default is 0)
@@ -42,6 +43,7 @@
#include <linux/bpf.h>
#include <bpf/bpf.h>
+#include <getopt.h>
#include "bpf_load.h"
#include "bpf_rlimit.h"
@@ -59,6 +61,7 @@ bool stats_flag;
bool loopback_flag;
bool debugFlag;
bool work_conserving_flag;
+bool no_cn_flag;
static void Usage(void);
static void read_trace_pipe2(void);
@@ -185,6 +188,7 @@ static int run_bpf_prog(char *prog, int cg_id)
qstats.rate = rate;
qstats.stats = stats_flag ? 1 : 0;
qstats.loopback = loopback_flag ? 1 : 0;
+ qstats.no_cn = no_cn_flag ? 1 : 0;
if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
printf("ERROR: Could not update map element\n");
goto err;
@@ -312,6 +316,14 @@ static int run_bpf_prog(char *prog, int cg_id)
double percent_pkts, percent_bytes;
char fname[100];
FILE *fout;
+ int k;
+ static const char *returnValNames[] = {
+ "DROP_PKT",
+ "ALLOW_PKT",
+ "DROP_PKT_CWR",
+ "ALLOW_PKT_CWR"
+ };
+#define RET_VAL_COUNT 4
// Future support of ingress
// if (!outFlag)
@@ -346,6 +358,31 @@ static int run_bpf_prog(char *prog, int cg_id)
(qstats.bytes_total + 1);
fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
+
+ // ECN CE markings
+ percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
+ (qstats.pkts_total + 1);
+ fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
+ (int)qstats.pkts_ecn_ce);
+
+ // Average cwnd
+ fprintf(fout, "avg cwnd:%d\n",
+ (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
+ // Average rtt
+ fprintf(fout, "avg rtt:%d\n",
+ (int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
+ // Average credit
+ fprintf(fout, "avg credit:%d\n",
+ (int)(qstats.sum_credit /
+ (1500 * ((int)qstats.pkts_total) + 1)));
+
+ // Return values stats
+ for (k = 0; k < RET_VAL_COUNT; k++) {
+ percent_pkts = (qstats.returnValCount[k] * 100.0) /
+ (qstats.pkts_total + 1);
+ fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
+ percent_pkts, (int)qstats.returnValCount[k]);
+ }
fclose(fout);
}
@@ -366,14 +403,15 @@ static void Usage(void)
{
printf("This program loads a cgroup skb BPF program to enforce\n"
"cgroup output (egress) bandwidth limits.\n\n"
- "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n"
- " [-t <secs>] [-w] [-h] [prog]\n"
+ "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
+ " [-s] [-t <secs>] [-w] [-h] [prog]\n"
" Where:\n"
" -o indicates egress direction (default)\n"
" -d print BPF trace debug buffer\n"
" -l also limit flows using loopback\n"
" -n <#> to create cgroup \"/hbm#\" and attach prog\n"
" Default is /hbm1\n"
+ " --no_cn disable CN notifcations\n"
" -r <rate> Rate in Mbps\n"
" -s Update HBM stats\n"
" -t <time> Exit after specified seconds (default is 0)\n"
@@ -393,9 +431,16 @@ int main(int argc, char **argv)
int k;
int cg_id = 1;
char *optstring = "iodln:r:st:wh";
+ struct option loptions[] = {
+ {"no_cn", 0, NULL, 1},
+ {NULL, 0, NULL, 0}
+ };
- while ((k = getopt(argc, argv, optstring)) != -1) {
+ while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
switch (k) {
+ case 1:
+ no_cn_flag = true;
+ break;
case'o':
break;
case 'd':
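
The long-only option is wired up with a val of 1, a value that cannot collide with any short-option character. A standalone sketch of the same getopt_long pattern (a hypothetical program, not the sample itself):

```c
#include <getopt.h>
#include <stdbool.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	bool no_cn_flag = false;
	static const struct option loptions[] = {
		{"no_cn", no_argument, NULL, 1},
		{NULL, 0, NULL, 0}
	};
	int k;

	while ((k = getopt_long(argc, argv, "r:", loptions, NULL)) != -1) {
		switch (k) {
		case 1:			/* --no_cn */
			no_cn_flag = true;
			break;
		case 'r':
			printf("rate=%s Mbps\n", optarg);
			break;
		}
	}
	printf("no_cn=%d\n", no_cn_flag);
	return 0;
}
```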
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
index 518e8147d084..f0963ed6a562 100644
--- a/samples/bpf/hbm.h
+++ b/samples/bpf/hbm.h
@@ -19,7 +19,8 @@ struct hbm_vqueue {
struct hbm_queue_stats {
unsigned long rate; /* in Mbps*/
unsigned long stats:1, /* get HBM stats (marked, dropped,..) */
- loopback:1; /* also limit flows using loopback */
+ loopback:1, /* also limit flows using loopback */
+ no_cn:1; /* do not use cn flags */
unsigned long long pkts_marked;
unsigned long long bytes_marked;
unsigned long long pkts_dropped;
@@ -28,4 +29,10 @@ struct hbm_queue_stats {
unsigned long long bytes_total;
unsigned long long firstPacketTime;
unsigned long long lastPacketTime;
+ unsigned long long pkts_ecn_ce;
+ unsigned long long returnValCount[4];
+ unsigned long long sum_cwnd;
+ unsigned long long sum_rtt;
+ unsigned long long sum_cwnd_cnt;
+ long long sum_credit;
};
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
index 41384be233b9..be19cf1d5cd5 100644
--- a/samples/bpf/hbm_kern.h
+++ b/samples/bpf/hbm_kern.h
@@ -65,17 +65,43 @@ struct bpf_map_def SEC("maps") queue_stats = {
BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
struct hbm_pkt_info {
+ int cwnd;
+ int rtt;
bool is_ip;
bool is_tcp;
short ecn;
};
+static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti)
+{
+ struct bpf_sock *sk;
+ struct bpf_tcp_sock *tp;
+
+ sk = skb->sk;
+ if (sk) {
+ sk = bpf_sk_fullsock(sk);
+ if (sk) {
+ if (sk->protocol == IPPROTO_TCP) {
+ tp = bpf_tcp_sock(sk);
+ if (tp) {
+ pkti->cwnd = tp->snd_cwnd;
+ pkti->rtt = tp->srtt_us >> 3;
+ return 0;
+ }
+ }
+ }
+ }
+ return 1;
+}
+
static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
struct hbm_pkt_info *pkti)
{
struct iphdr iph;
struct ipv6hdr *ip6h;
+ pkti->cwnd = 0;
+ pkti->rtt = 0;
bpf_skb_load_bytes(skb, 0, &iph, 12);
if (iph.version == 6) {
ip6h = (struct ipv6hdr *)&iph;
@@ -91,6 +117,8 @@ static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
pkti->is_tcp = false;
pkti->ecn = 0;
}
+ if (pkti->is_tcp)
+ get_tcp_info(skb, pkti);
}
static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
@@ -105,8 +133,14 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
int len,
unsigned long long curtime,
bool congestion_flag,
- bool drop_flag)
+ bool drop_flag,
+ bool cwr_flag,
+ bool ecn_ce_flag,
+ struct hbm_pkt_info *pkti,
+ int credit)
{
+ int rv = ALLOW_PKT;
+
if (qsp != NULL) {
// Following is needed for work conserving
__sync_add_and_fetch(&(qsp->bytes_total), len);
@@ -116,7 +150,7 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
qsp->firstPacketTime = curtime;
qsp->lastPacketTime = curtime;
__sync_add_and_fetch(&(qsp->pkts_total), 1);
- if (congestion_flag || drop_flag) {
+ if (congestion_flag) {
__sync_add_and_fetch(&(qsp->pkts_marked), 1);
__sync_add_and_fetch(&(qsp->bytes_marked), len);
}
@@ -125,6 +159,34 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
__sync_add_and_fetch(&(qsp->bytes_dropped),
len);
}
+ if (ecn_ce_flag)
+ __sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1);
+ if (pkti->cwnd) {
+ __sync_add_and_fetch(&(qsp->sum_cwnd),
+ pkti->cwnd);
+ __sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1);
+ }
+ if (pkti->rtt)
+ __sync_add_and_fetch(&(qsp->sum_rtt),
+ pkti->rtt);
+ __sync_add_and_fetch(&(qsp->sum_credit), credit);
+
+ if (drop_flag)
+ rv = DROP_PKT;
+ if (cwr_flag)
+ rv |= 2;
+ if (rv == DROP_PKT)
+ __sync_add_and_fetch(&(qsp->returnValCount[0]),
+ 1);
+ else if (rv == ALLOW_PKT)
+ __sync_add_and_fetch(&(qsp->returnValCount[1]),
+ 1);
+ else if (rv == 2)
+ __sync_add_and_fetch(&(qsp->returnValCount[2]),
+ 1);
+ else if (rv == 3)
+ __sync_add_and_fetch(&(qsp->returnValCount[3]),
+ 1);
}
}
}
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
index f806863d0b79..829934bd43cb 100644
--- a/samples/bpf/hbm_out_kern.c
+++ b/samples/bpf/hbm_out_kern.c
@@ -62,11 +62,12 @@ int _hbm_out_cg(struct __sk_buff *skb)
unsigned int queue_index = 0;
unsigned long long curtime;
int credit;
- signed long long delta = 0, zero = 0;
+ signed long long delta = 0, new_credit;
int max_credit = MAX_CREDIT;
bool congestion_flag = false;
bool drop_flag = false;
bool cwr_flag = false;
+ bool ecn_ce_flag = false;
struct hbm_vqueue *qdp;
struct hbm_queue_stats *qsp = NULL;
int rv = ALLOW_PKT;
@@ -99,9 +100,11 @@ int _hbm_out_cg(struct __sk_buff *skb)
*/
if (delta > 0) {
qdp->lasttime = curtime;
- credit += CREDIT_PER_NS(delta, qdp->rate);
- if (credit > MAX_CREDIT)
+ new_credit = credit + CREDIT_PER_NS(delta, qdp->rate);
+ if (new_credit > MAX_CREDIT)
credit = MAX_CREDIT;
+ else
+ credit = new_credit;
}
credit -= len;
qdp->credit = credit;
@@ -119,13 +122,16 @@ int _hbm_out_cg(struct __sk_buff *skb)
// Set flags (drop, congestion, cwr)
// Dropping => we are congested, so ignore congestion flag
if (credit < -DROP_THRESH ||
- (len > LARGE_PKT_THRESH &&
- credit < -LARGE_PKT_DROP_THRESH)) {
- // Very congested, set drop flag
+ (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) {
+ // Very congested, drop the packet
drop_flag = true;
+ if (pkti.ecn)
+ congestion_flag = true;
+ else if (pkti.is_tcp)
+ cwr_flag = true;
} else if (credit < 0) {
// Congested, set congestion flag
- if (pkti.ecn) {
+ if (pkti.ecn || pkti.is_tcp) {
if (credit < -MARK_THRESH)
congestion_flag = true;
else
@@ -136,22 +142,38 @@ int _hbm_out_cg(struct __sk_buff *skb)
}
if (congestion_flag) {
- if (!bpf_skb_ecn_set_ce(skb)) {
- if (len > LARGE_PKT_THRESH) {
+ if (bpf_skb_ecn_set_ce(skb)) {
+ ecn_ce_flag = true;
+ } else {
+ if (pkti.is_tcp) {
+ unsigned int rand = bpf_get_prandom_u32();
+
+ if (-credit >= MARK_THRESH +
+ (rand % MARK_REGION_SIZE)) {
+ // Do congestion control
+ cwr_flag = true;
+ }
+ } else if (len > LARGE_PKT_THRESH) {
// Problem if too many small packets?
drop_flag = true;
}
}
}
- if (drop_flag)
- rv = DROP_PKT;
+ if (qsp != NULL && qsp->no_cn)
+ cwr_flag = false;
- hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
+ hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
+ cwr_flag, ecn_ce_flag, &pkti, credit);
- if (rv == DROP_PKT)
+ if (drop_flag) {
__sync_add_and_fetch(&(qdp->credit), len);
+ rv = DROP_PKT;
+ }
+ if (cwr_flag)
+ rv |= 2;
return rv;
}
char _license[] SEC("license") = "GPL";
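
The rate limiter at the heart of _hbm_out_cg is a token bucket, and the new_credit temporary apparently exists so the sum is computed in 64 bits before the clamp, avoiding intermediate overflow of the int credit. A standalone sketch of the arithmetic (CREDIT_PER_NS is replaced here by a stand-in; a rate of R Mbps replenishes R/8000 bytes per nanosecond):

```c
/* Token-bucket credit update, sketched outside BPF for clarity. */
static long long update_credit(long long credit, long long delta_ns,
			       long long rate_mbps, int pkt_len,
			       long long max_credit)
{
	/* R Mbps == R * 10^6 / 8 bytes per second == R / 8000 bytes per ns */
	credit += delta_ns * rate_mbps / 8000;
	if (credit > max_credit)
		credit = max_credit;	/* cap the accumulated burst */
	return credit - pkt_len;	/* spend credit on this packet */
}
```

Positive credit after the update means the flow is within its rate; increasingly negative credit drives the marking, cwr, and drop thresholds seen above.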