summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaolo Abeni <pabeni@redhat.com>2022-05-03 12:10:51 +0200
committerPaolo Abeni <pabeni@redhat.com>2022-05-03 12:10:52 +0200
commitf4f1fd7646203d36b844645704636e2208a54268 (patch)
tree67ca60400f8dac193ec6c0428eb77fd0e8d6bddb
parentcb636b3e372bac984ac54ab7b39bef560bbcf7d5 (diff)
parent1d267aa8699b5985a297f697c493aadc507711a9 (diff)
downloadlinux-f4f1fd7646203d36b844645704636e2208a54268.tar.bz2
Merge branch 'mlxsw-remove-size-limitations-on-egress-descriptor-buffer'
Ido Schimmel says: ==================== mlxsw: Remove size limitations on egress descriptor buffer Petr says: Spectrum machines have two resources related to keeping packets in an internal buffer: bytes (allocated in cell-sized units) for packet payload, and descriptors, for keeping headers. Currently, mlxsw only configures the bytes part of the resource management. Spectrum switches permit a full parallel configuration for the descriptor resources, including port-pool and port-TC-pool quotas. By default, these are all configured to use pool 14, with an infinite quota. The ingress pool 14 is then infinite in size. However, egress pool 14 has finite size by default. The size is chip dependent, but always much lower than what the chip actually permits. As a result, we can easily construct workloads that exhaust the configured descriptor limit. Going forward, mlxsw will have to fix this issue properly by maintaining descriptor buffer sizes, TC bindings, and quotas that match the architecture recommendation. Short term, fix the issue by configuring the egress descriptor pool to be infinite in size as well. This will maintain the same configuration philosophy, but will unlock all chip resources to be usable. In this patchset, patch #1 first adds the "desc" field into the pool configuration register. Then in patch #2, the new field is used to configure both ingress and egress pool 14 as infinite. In patches #3 and #4, add a selftest that verifies that a large burst can be absorbed by the shared buffer. This test specifically exercises a scenario where descriptor buffer is the limiting factor and the test fails without the above patches. ==================== Link: https://lore.kernel.org/r/20220502084926.365268-1-idosch@nvidia.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/reg.h6
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c26
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_burst.sh480
-rwxr-xr-xtools/testing/selftests/net/forwarding/lib.sh21
4 files changed, 530 insertions, 3 deletions
diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 04c4d7a4bd83..078e3aa04383 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -12641,6 +12641,12 @@ static inline void mlxsw_reg_tidem_pack(char *payload, u8 underlay_ecn,
MLXSW_REG_DEFINE(sbpr, MLXSW_REG_SBPR_ID, MLXSW_REG_SBPR_LEN);
+/* reg_sbpr_desc
+ * When set, configures descriptor buffer.
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, sbpr, desc, 0x00, 31, 1);
+
/* shared direstion enum for SBPR, SBCM, SBPM */
enum mlxsw_reg_sbxx_dir {
MLXSW_REG_SBXX_DIR_INGRESS,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
index 98f26f596e30..c68fc8f7ca99 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
@@ -202,6 +202,21 @@ static int mlxsw_sp_sb_pr_write(struct mlxsw_sp *mlxsw_sp, u16 pool_index,
return 0;
}
+static int mlxsw_sp_sb_pr_desc_write(struct mlxsw_sp *mlxsw_sp,
+ enum mlxsw_reg_sbxx_dir dir,
+ enum mlxsw_reg_sbpr_mode mode,
+ u32 size, bool infi_size)
+{
+ char sbpr_pl[MLXSW_REG_SBPR_LEN];
+
+ /* The FW default descriptor buffer configuration uses only pool 14 for
+ * descriptors.
+ */
+ mlxsw_reg_sbpr_pack(sbpr_pl, 14, dir, mode, size, infi_size);
+ mlxsw_reg_sbpr_desc_set(sbpr_pl, true);
+ return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sbpr), sbpr_pl);
+}
+
static int mlxsw_sp_sb_cm_write(struct mlxsw_sp *mlxsw_sp, u16 local_port,
u8 pg_buff, u32 min_buff, u32 max_buff,
bool infi_max, u16 pool_index)
@@ -775,6 +790,17 @@ static int mlxsw_sp_sb_prs_init(struct mlxsw_sp *mlxsw_sp,
if (err)
return err;
}
+
+ err = mlxsw_sp_sb_pr_desc_write(mlxsw_sp, MLXSW_REG_SBXX_DIR_INGRESS,
+ MLXSW_REG_SBPR_MODE_DYNAMIC, 0, true);
+ if (err)
+ return err;
+
+ err = mlxsw_sp_sb_pr_desc_write(mlxsw_sp, MLXSW_REG_SBXX_DIR_EGRESS,
+ MLXSW_REG_SBPR_MODE_DYNAMIC, 0, true);
+ if (err)
+ return err;
+
return 0;
}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_burst.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_burst.sh
new file mode 100755
index 000000000000..82a47b903f92
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_burst.sh
@@ -0,0 +1,480 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test sends 1Gbps of traffic through the switch, into which it then
+# injects a burst of traffic and tests that there are no drops.
+#
+# The 1Gbps stream is created by sending >1Gbps stream from H1. This stream
+# ingresses through $swp1, and is forwarded through a small temporary pool to a
+# 1Gbps $swp3.
+#
+# Thus a 1Gbps stream enters $swp4, and is forwarded through a large pool to
+# $swp2, and eventually to H2. Since $swp2 is a 1Gbps port as well, no backlog
+# is generated.
+#
+# At this point, a burst of traffic is forwarded from H3. This enters $swp5, is
+# forwarded to $swp2, which is fully subscribed by the 1Gbps stream. The
+# expectation is that the burst is wholly absorbed by the large pool and no
+# drops are caused. After the burst, there should be a backlog that is hard to
+# get rid of, because $swp2 is fully subscribed. But because each individual
+# packet is scheduled soon after getting enqueued, SLL and HLL do not impact the
+# test.
+#
+# +-----------------------+ +-----------------------+
+# | H1 | | H3 |
+# | + $h1.111 | | $h3.111 + |
+# | | 192.0.2.33/28 | | 192.0.2.35/28 | |
+# | | | | | |
+# | + $h1 | | $h3 + |
+# +---|-------------------+ +--------------------+ +------------------|----+
+# | | | |
+# +---|----------------------|--------------------|----------------------|----+
+# | + $swp1 $swp3 + + $swp4 $swp5 | |
+# | | iPOOL1 iPOOL0 | | iPOOL2 iPOOL2 | |
+# | | ePOOL4 ePOOL5 | | ePOOL4 ePOOL4 | |
+# | | 1Gbps | | 1Gbps | |
+# | +-|----------------------|-+ +-|----------------------|-+ |
+# | | + $swp1.111 $swp3.111 + | | + $swp4.111 $swp5.111 + | |
+# | | | | | |
+# | | BR1 | | BR2 | |
+# | | | | | |
+# | | | | + $swp2.111 | |
+# | +--------------------------+ +---------|----------------+ |
+# | | |
+# | iPOOL0: 500KB dynamic | |
+# | iPOOL1: 500KB dynamic | |
+# | iPOOL2: 50MB static + $swp2 |
+# | ePOOL4: 500KB dynamic | iPOOL0 |
+# | ePOOL5: 500KB dynamic | ePOOL6 |
+# | ePOOL6: 50MB static | 1Gbps |
+# +-------------------------------------------------------|-------------------+
+# |
+# +---|-------------------+
+# | + $h2 H2 |
+# | | 1Gbps |
+# | | |
+# | + $h2.111 |
+# | 192.0.2.34/28 |
+# +-----------------------+
+#
+# iPOOL0+ePOOL4 are helper pools for control traffic etc.
+# iPOOL1+ePOOL5 are helper pools for modeling the 1Gbps stream
+# iPOOL2+ePOOL6 are pools for soaking the burst traffic
+
+ALL_TESTS="
+ ping_ipv4
+ test_8K
+ test_800
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=8
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+source qos_lib.sh
+source mlxsw_lib.sh
+
+_1KB=1000
+_500KB=$((500 * _1KB))
+_1MB=$((1000 * _1KB))
+
+# The failure mode that this specifically tests is exhaustion of descriptor
+# buffer. The point is to produce a burst that shared buffer should be able
+# to accommodate, but produce it with small enough packets that the machine
+# runs out of the descriptor buffer space with default configuration.
+#
+# The machine therefore needs to be able to produce line rate with as small
+# packets as possible, and at the same time have large enough buffer that
+# when filled with these small packets, it runs out of descriptors.
+# Spectrum-2 is very close, but cannot perform this test. Therefore use
+# Spectrum-3 as a minimum, and permit larger burst size, and therefore
+# larger packets, to reduce spurious failures.
+#
+mlxsw_only_on_spectrum 3+ || exit
+
+BURST_SIZE=$((50000000))
+POOL_SIZE=$BURST_SIZE
+
+h1_create()
+{
+ simple_if_init $h1
+ mtu_set $h1 10000
+
+ vlan_create $h1 111 v$h1 192.0.2.33/28
+ ip link set dev $h1.111 type vlan egress-qos-map 0:1
+}
+
+h1_destroy()
+{
+ vlan_destroy $h1 111
+
+ mtu_restore $h1
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2
+ mtu_set $h2 10000
+ ethtool -s $h2 speed 1000 autoneg off
+
+ vlan_create $h2 111 v$h2 192.0.2.34/28
+}
+
+h2_destroy()
+{
+ vlan_destroy $h2 111
+
+ ethtool -s $h2 autoneg on
+ mtu_restore $h2
+ simple_if_fini $h2
+}
+
+h3_create()
+{
+ simple_if_init $h3
+ mtu_set $h3 10000
+
+ vlan_create $h3 111 v$h3 192.0.2.35/28
+}
+
+h3_destroy()
+{
+ vlan_destroy $h3 111
+
+ mtu_restore $h3
+ simple_if_fini $h3
+}
+
+switch_create()
+{
+ # pools
+ # -----
+
+ devlink_pool_size_thtype_save 0
+ devlink_pool_size_thtype_save 4
+ devlink_pool_size_thtype_save 1
+ devlink_pool_size_thtype_save 5
+ devlink_pool_size_thtype_save 2
+ devlink_pool_size_thtype_save 6
+
+ devlink_port_pool_th_save $swp1 1
+ devlink_port_pool_th_save $swp2 6
+ devlink_port_pool_th_save $swp3 5
+ devlink_port_pool_th_save $swp4 2
+ devlink_port_pool_th_save $swp5 2
+
+ devlink_tc_bind_pool_th_save $swp1 1 ingress
+ devlink_tc_bind_pool_th_save $swp2 1 egress
+ devlink_tc_bind_pool_th_save $swp3 1 egress
+ devlink_tc_bind_pool_th_save $swp4 1 ingress
+ devlink_tc_bind_pool_th_save $swp5 1 ingress
+
+ # Control traffic pools. Just reduce the size.
+ devlink_pool_size_thtype_set 0 dynamic $_500KB
+ devlink_pool_size_thtype_set 4 dynamic $_500KB
+
+ # Stream modeling pools.
+ devlink_pool_size_thtype_set 1 dynamic $_500KB
+ devlink_pool_size_thtype_set 5 dynamic $_500KB
+
+ # Burst soak pools.
+ devlink_pool_size_thtype_set 2 static $POOL_SIZE
+ devlink_pool_size_thtype_set 6 static $POOL_SIZE
+
+ # $swp1
+ # -----
+
+ ip link set dev $swp1 up
+ mtu_set $swp1 10000
+ vlan_create $swp1 111
+ ip link set dev $swp1.111 type vlan ingress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp1 1 16
+ devlink_tc_bind_pool_th_set $swp1 1 ingress 1 16
+
+ # Configure qdisc...
+ tc qdisc replace dev $swp1 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+ # ... so that we can assign prio1 traffic to PG1.
+ dcb buffer set dev $swp1 prio-buffer all:0 1:1
+
+ # $swp2
+ # -----
+
+ ip link set dev $swp2 up
+ mtu_set $swp2 10000
+ ethtool -s $swp2 speed 1000 autoneg off
+ vlan_create $swp2 111
+ ip link set dev $swp2.111 type vlan egress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp2 6 $POOL_SIZE
+ devlink_tc_bind_pool_th_set $swp2 1 egress 6 $POOL_SIZE
+
+ # prio 0->TC0 (band 7), 1->TC1 (band 6)
+ tc qdisc replace dev $swp2 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+
+ # $swp3
+ # -----
+
+ ip link set dev $swp3 up
+ mtu_set $swp3 10000
+ ethtool -s $swp3 speed 1000 autoneg off
+ vlan_create $swp3 111
+ ip link set dev $swp3.111 type vlan egress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp3 5 16
+ devlink_tc_bind_pool_th_set $swp3 1 egress 5 16
+
+ # prio 0->TC0 (band 7), 1->TC1 (band 6)
+ tc qdisc replace dev $swp3 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+
+ # $swp4
+ # -----
+
+ ip link set dev $swp4 up
+ mtu_set $swp4 10000
+ ethtool -s $swp4 speed 1000 autoneg off
+ vlan_create $swp4 111
+ ip link set dev $swp4.111 type vlan ingress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp4 2 $POOL_SIZE
+ devlink_tc_bind_pool_th_set $swp4 1 ingress 2 $POOL_SIZE
+
+ # Configure qdisc...
+ tc qdisc replace dev $swp4 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+ # ... so that we can assign prio1 traffic to PG1.
+ dcb buffer set dev $swp4 prio-buffer all:0 1:1
+
+ # $swp5
+ # -----
+
+ ip link set dev $swp5 up
+ mtu_set $swp5 10000
+ vlan_create $swp5 111
+ ip link set dev $swp5.111 type vlan ingress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp5 2 $POOL_SIZE
+ devlink_tc_bind_pool_th_set $swp5 1 ingress 2 $POOL_SIZE
+
+ # Configure qdisc...
+ tc qdisc replace dev $swp5 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+ # ... so that we can assign prio1 traffic to PG1.
+ dcb buffer set dev $swp5 prio-buffer all:0 1:1
+
+ # bridges
+ # -------
+
+ ip link add name br1 type bridge vlan_filtering 0
+ ip link set dev $swp1.111 master br1
+ ip link set dev $swp3.111 master br1
+ ip link set dev br1 up
+
+ ip link add name br2 type bridge vlan_filtering 0
+ ip link set dev $swp2.111 master br2
+ ip link set dev $swp4.111 master br2
+ ip link set dev $swp5.111 master br2
+ ip link set dev br2 up
+}
+
+switch_destroy()
+{
+ # Do this first so that we can reset the limits to values that are only
+ # valid for the original static / dynamic setting.
+ devlink_pool_size_thtype_restore 6
+ devlink_pool_size_thtype_restore 5
+ devlink_pool_size_thtype_restore 4
+ devlink_pool_size_thtype_restore 2
+ devlink_pool_size_thtype_restore 1
+ devlink_pool_size_thtype_restore 0
+
+ # bridges
+ # -------
+
+ ip link set dev br2 down
+ ip link set dev $swp5.111 nomaster
+ ip link set dev $swp4.111 nomaster
+ ip link set dev $swp2.111 nomaster
+ ip link del dev br2
+
+ ip link set dev br1 down
+ ip link set dev $swp3.111 nomaster
+ ip link set dev $swp1.111 nomaster
+ ip link del dev br1
+
+ # $swp5
+ # -----
+
+ dcb buffer set dev $swp5 prio-buffer all:0
+ tc qdisc del dev $swp5 root
+
+ devlink_tc_bind_pool_th_restore $swp5 1 ingress
+ devlink_port_pool_th_restore $swp5 2
+
+ vlan_destroy $swp5 111
+ mtu_restore $swp5
+ ip link set dev $swp5 down
+
+ # $swp4
+ # -----
+
+ dcb buffer set dev $swp4 prio-buffer all:0
+ tc qdisc del dev $swp4 root
+
+ devlink_tc_bind_pool_th_restore $swp4 1 ingress
+ devlink_port_pool_th_restore $swp4 2
+
+ vlan_destroy $swp4 111
+ ethtool -s $swp4 autoneg on
+ mtu_restore $swp4
+ ip link set dev $swp4 down
+
+ # $swp3
+ # -----
+
+ tc qdisc del dev $swp3 root
+
+ devlink_tc_bind_pool_th_restore $swp3 1 egress
+ devlink_port_pool_th_restore $swp3 5
+
+ vlan_destroy $swp3 111
+ ethtool -s $swp3 autoneg on
+ mtu_restore $swp3
+ ip link set dev $swp3 down
+
+ # $swp2
+ # -----
+
+ tc qdisc del dev $swp2 root
+
+ devlink_tc_bind_pool_th_restore $swp2 1 egress
+ devlink_port_pool_th_restore $swp2 6
+
+ vlan_destroy $swp2 111
+ ethtool -s $swp2 autoneg on
+ mtu_restore $swp2
+ ip link set dev $swp2 down
+
+ # $swp1
+ # -----
+
+ dcb buffer set dev $swp1 prio-buffer all:0
+ tc qdisc del dev $swp1 root
+
+ devlink_tc_bind_pool_th_restore $swp1 1 ingress
+ devlink_port_pool_th_restore $swp1 1
+
+ vlan_destroy $swp1 111
+ mtu_restore $swp1
+ ip link set dev $swp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ swp4=${NETIFS[p6]}
+
+ swp5=${NETIFS[p7]}
+ h3=${NETIFS[p8]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.34 " h1->h2"
+ ping_test $h3 192.0.2.34 " h3->h2"
+}
+
+__test_qos_burst()
+{
+ local pktsize=$1; shift
+
+ RET=0
+
+ start_traffic_pktsize $pktsize $h1.111 192.0.2.33 192.0.2.34 $h2mac
+ sleep 1
+
+ local q0=$(ethtool_stats_get $swp2 tc_transmit_queue_tc_1)
+ ((q0 == 0))
+ check_err $? "Transmit queue non-zero?"
+
+ local d0=$(ethtool_stats_get $swp2 tc_no_buffer_discard_uc_tc_1)
+
+ local cell_size=$(devlink_cell_size_get)
+ local cells=$((BURST_SIZE / cell_size))
+ # Each packet is $pktsize of payload + headers.
+ local pkt_cells=$(((pktsize + 50 + cell_size - 1) / cell_size))
+ # How many packets can we admit:
+ local pkts=$((cells / pkt_cells))
+
+ $MZ $h3 -p $pktsize -Q 1:111 -A 192.0.2.35 -B 192.0.2.34 \
+ -a own -b $h2mac -c $pkts -t udp -q
+ sleep 1
+
+ local d1=$(ethtool_stats_get $swp2 tc_no_buffer_discard_uc_tc_1)
+ ((d1 == d0))
+ check_err $? "Drops seen on egress port: $d0 -> $d1 ($((d1 - d0)))"
+
+ # Check that the queue is somewhat close to the burst size. This
+ # makes sure that the lack of drops above was not due to port
+ # undersubscription.
+ local q0=$(ethtool_stats_get $swp2 tc_transmit_queue_tc_1)
+ local qe=$((90 * BURST_SIZE / 100))
+ ((q0 > qe))
+ check_err $? "Queue size expected >$qe, got $q0"
+
+ stop_traffic
+ sleep 2
+
+ log_test "Burst: absorb $pkts ${pktsize}-B packets"
+}
+
+test_8K()
+{
+ __test_qos_burst 8000
+}
+
+test_800()
+{
+ __test_qos_burst 800
+}
+
+bail_on_lldpad
+
+trap cleanup EXIT
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 5386c826e46a..66681a2bcdd3 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1375,25 +1375,40 @@ flood_test()
__start_traffic()
{
+ local pktsize=$1; shift
local proto=$1; shift
local h_in=$1; shift # Where the traffic egresses the host
local sip=$1; shift
local dip=$1; shift
local dmac=$1; shift
- $MZ $h_in -p 8000 -A $sip -B $dip -c 0 \
+ $MZ $h_in -p $pktsize -A $sip -B $dip -c 0 \
-a own -b $dmac -t "$proto" -q "$@" &
sleep 1
}
+start_traffic_pktsize()
+{
+ local pktsize=$1; shift
+
+ __start_traffic $pktsize udp "$@"
+}
+
+start_tcp_traffic_pktsize()
+{
+ local pktsize=$1; shift
+
+ __start_traffic $pktsize tcp "$@"
+}
+
start_traffic()
{
- __start_traffic udp "$@"
+ start_traffic_pktsize 8000 "$@"
}
start_tcp_traffic()
{
- __start_traffic tcp "$@"
+ start_tcp_traffic_pktsize 8000 "$@"
}
stop_traffic()