summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2022-06-30 21:18:18 -0700
committerJakub Kicinski <kuba@kernel.org>2022-06-30 21:18:19 -0700
commit087b79854b9bf8cff3f7b4dd2a25380aac6858a4 (patch)
tree5ff0dddc936cd8561624623fe3ae925492460bff
parentc7e5c423cb59caddbf296717af484dcbacd76a3f (diff)
parent837ced3a1a5d8bb1a637dd584711f31ae6b54d93 (diff)
downloadlinux-087b79854b9bf8cff3f7b4dd2a25380aac6858a4.tar.bz2
Merge branch 'prevent-permanently-closed-tc-taprio-gates-from-blocking-a-felix-dsa-switch-port'
Vladimir Oltean says: ==================== Prevent permanently closed tc-taprio gates from blocking a Felix DSA switch port Richie Pearn reports that if we install a tc-taprio schedule on a Felix switch port, and that schedule has at least one gate that never opens (for example TC0 below): tc qdisc add dev swp1 root taprio num_tc 8 map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ base-time 0 sched-entry S fe 1000000 flags 0x2 then packets classified to the permanently closed traffic class will not be dequeued by the egress port. They will just remain in the queue system, to consume resources. Frame aging does not trigger either, because in order for that to happen, the packets need to be eligible for egress scheduling in the first place, which they aren't. If that port is allowed to consume the entire shared buffer of the switch (as we configure things by default using devlink-sb), then eventually, by sending enough packets, the entire switch will hang. If we think enough about the problem, we realize that this is only a special case of a more general issue, and can also be reproduced with gates that aren't permanently closed, but are not large enough to send an entire frame. In that sense, a permanently closed gate is simply a case where all frames are oversized. The ENETC has logic to reject transmitted packets that would overrun the time window - see commit 285e8dedb4bd ("net: enetc: count the tc-taprio window drops"). The Felix switch has no such thing on a per-packet basis, but it has a register replicated per {egress port, TC} which essentially limits the max MTU. A packet which exceeds the per-port-TC MTU is immediately discarded and therefore will not hang the port anymore (albeit, sadly, this only bumps a generic drop hardware counter and we cannot really infer the reason such as to offer a dedicated counter for these events). This patch set calculates the max MTU per {port, TC} when the tc-taprio config, or link speed, or port-global MTU values change. This solves the larger "gate too small for packet" problem, but also the original issue with the gate permanently closed that was reported by Richie. ==================== Link: https://lore.kernel.org/r/20220628145238.3247853-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--drivers/net/dsa/ocelot/felix.c9
-rw-r--r--drivers/net/dsa/ocelot/felix.h1
-rw-r--r--drivers/net/dsa/ocelot/felix_vsc9959.c242
-rw-r--r--include/linux/time64.h3
-rw-r--r--include/soc/mscc/ocelot.h5
-rw-r--r--net/sched/sch_taprio.c5
6 files changed, 238 insertions, 27 deletions
diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 3e07dc39007a..859196898a7d 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -1553,9 +1553,18 @@ static void felix_txtstamp(struct dsa_switch *ds, int port,
static int felix_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
{
struct ocelot *ocelot = ds->priv;
+ struct ocelot_port *ocelot_port = ocelot->ports[port];
+ struct felix *felix = ocelot_to_felix(ocelot);
ocelot_port_set_maxlen(ocelot, port, new_mtu);
+ mutex_lock(&ocelot->tas_lock);
+
+ if (ocelot_port->taprio && felix->info->tas_guard_bands_update)
+ felix->info->tas_guard_bands_update(ocelot, port);
+
+ mutex_unlock(&ocelot->tas_lock);
+
return 0;
}
diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h
index 9e07eb7ee28d..deb8dde1fc19 100644
--- a/drivers/net/dsa/ocelot/felix.h
+++ b/drivers/net/dsa/ocelot/felix.h
@@ -53,6 +53,7 @@ struct felix_info {
struct phylink_link_state *state);
int (*port_setup_tc)(struct dsa_switch *ds, int port,
enum tc_setup_type type, void *type_data);
+ void (*tas_guard_bands_update)(struct ocelot *ocelot, int port);
void (*port_sched_speed_set)(struct ocelot *ocelot, int port,
u32 speed);
struct regmap *(*init_regmap)(struct ocelot *ocelot,
diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 693cd6ffbace..61ed317602e7 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -16,6 +16,7 @@
#include <linux/iopoll.h>
#include <linux/mdio.h>
#include <linux/pci.h>
+#include <linux/time.h>
#include "felix.h"
#define VSC9959_NUM_PORTS 6
@@ -1127,9 +1128,199 @@ static void vsc9959_mdio_bus_free(struct ocelot *ocelot)
mdiobus_free(felix->imdio);
}
+/* Extract shortest continuous gate open intervals in ns for each traffic class
+ * of a cyclic tc-taprio schedule. If a gate is always open, the duration is
+ * considered U64_MAX. If the gate is always closed, it is considered 0.
+ */
+static void vsc9959_tas_min_gate_lengths(struct tc_taprio_qopt_offload *taprio,
+ u64 min_gate_len[OCELOT_NUM_TC])
+{
+ struct tc_taprio_sched_entry *entry;
+ u64 gate_len[OCELOT_NUM_TC];
+ int tc, i, n;
+
+ /* Initialize arrays */
+ for (tc = 0; tc < OCELOT_NUM_TC; tc++) {
+ min_gate_len[tc] = U64_MAX;
+ gate_len[tc] = 0;
+ }
+
+ /* If we don't have taprio, consider all gates as permanently open */
+ if (!taprio)
+ return;
+
+ n = taprio->num_entries;
+
+ /* Walk through the gate list twice to determine the length
+ * of consecutively open gates for a traffic class, including
+ * open gates that wrap around. We are just interested in the
+ * minimum window size, and this doesn't change what the
+ * minimum is (if the gate never closes, min_gate_len will
+ * remain U64_MAX).
+ */
+ for (i = 0; i < 2 * n; i++) {
+ entry = &taprio->entries[i % n];
+
+ for (tc = 0; tc < OCELOT_NUM_TC; tc++) {
+ if (entry->gate_mask & BIT(tc)) {
+ gate_len[tc] += entry->interval;
+ } else {
+ /* Gate closes now, record a potential new
+ * minimum and reinitialize length
+ */
+ if (min_gate_len[tc] > gate_len[tc])
+ min_gate_len[tc] = gate_len[tc];
+ gate_len[tc] = 0;
+ }
+ }
+ }
+}
+
+/* Update QSYS_PORT_MAX_SDU to make sure the static guard bands added by the
+ * switch (see the ALWAYS_GUARD_BAND_SCH_Q comment) are correct at all MTU
+ * values (the default value is 1518). Also, for traffic class windows smaller
+ * than one MTU sized frame, update QSYS_QMAXSDU_CFG to enable oversized frame
+ * dropping, such that these won't hang the port, as they will never be sent.
+ */
+static void vsc9959_tas_guard_bands_update(struct ocelot *ocelot, int port)
+{
+ struct ocelot_port *ocelot_port = ocelot->ports[port];
+ u64 min_gate_len[OCELOT_NUM_TC];
+ int speed, picos_per_byte;
+ u64 needed_bit_time_ps;
+ u32 val, maxlen;
+ u8 tas_speed;
+ int tc;
+
+ lockdep_assert_held(&ocelot->tas_lock);
+
+ val = ocelot_read_rix(ocelot, QSYS_TAG_CONFIG, port);
+ tas_speed = QSYS_TAG_CONFIG_LINK_SPEED_X(val);
+
+ switch (tas_speed) {
+ case OCELOT_SPEED_10:
+ speed = SPEED_10;
+ break;
+ case OCELOT_SPEED_100:
+ speed = SPEED_100;
+ break;
+ case OCELOT_SPEED_1000:
+ speed = SPEED_1000;
+ break;
+ case OCELOT_SPEED_2500:
+ speed = SPEED_2500;
+ break;
+ default:
+ return;
+ }
+
+ picos_per_byte = (USEC_PER_SEC * 8) / speed;
+
+ val = ocelot_port_readl(ocelot_port, DEV_MAC_MAXLEN_CFG);
+ /* MAXLEN_CFG accounts automatically for VLAN. We need to include it
+ * manually in the bit time calculation, plus the preamble and SFD.
+ */
+ maxlen = val + 2 * VLAN_HLEN;
+ /* Consider the standard Ethernet overhead of 8 octets preamble+SFD,
+ * 4 octets FCS, 12 octets IFG.
+ */
+ needed_bit_time_ps = (maxlen + 24) * picos_per_byte;
+
+ dev_dbg(ocelot->dev,
+ "port %d: max frame size %d needs %llu ps at speed %d\n",
+ port, maxlen, needed_bit_time_ps, speed);
+
+ vsc9959_tas_min_gate_lengths(ocelot_port->taprio, min_gate_len);
+
+ for (tc = 0; tc < OCELOT_NUM_TC; tc++) {
+ u32 max_sdu;
+
+ if (min_gate_len[tc] == U64_MAX /* Gate always open */ ||
+ min_gate_len[tc] * PSEC_PER_NSEC > needed_bit_time_ps) {
+ /* Setting QMAXSDU_CFG to 0 disables oversized frame
+ * dropping.
+ */
+ max_sdu = 0;
+ dev_dbg(ocelot->dev,
+ "port %d tc %d min gate len %llu"
+ ", sending all frames\n",
+ port, tc, min_gate_len[tc]);
+ } else {
+ /* If traffic class doesn't support a full MTU sized
+ * frame, make sure to enable oversize frame dropping
+ * for frames larger than the smallest that would fit.
+ */
+ max_sdu = div_u64(min_gate_len[tc] * PSEC_PER_NSEC,
+ picos_per_byte);
+ /* A TC gate may be completely closed, which is a
+ * special case where all packets are oversized.
+ * Any limit smaller than 64 octets accomplishes this
+ */
+ if (!max_sdu)
+ max_sdu = 1;
+ /* Take L1 overhead into account, but just don't allow
+ * max_sdu to go negative or to 0. Here we use 20
+ * because QSYS_MAXSDU_CFG_* already counts the 4 FCS
+ * octets as part of packet size.
+ */
+ if (max_sdu > 20)
+ max_sdu -= 20;
+ dev_info(ocelot->dev,
+ "port %d tc %d min gate length %llu"
+ " ns not enough for max frame size %d at %d"
+ " Mbps, dropping frames over %d"
+ " octets including FCS\n",
+ port, tc, min_gate_len[tc], maxlen, speed,
+ max_sdu);
+ }
+
+ /* ocelot_write_rix is a macro that concatenates
+ * QSYS_MAXSDU_CFG_* with _RSZ, so we need to spell out
+ * the writes to each traffic class
+ */
+ switch (tc) {
+ case 0:
+ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_0,
+ port);
+ break;
+ case 1:
+ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_1,
+ port);
+ break;
+ case 2:
+ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_2,
+ port);
+ break;
+ case 3:
+ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_3,
+ port);
+ break;
+ case 4:
+ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_4,
+ port);
+ break;
+ case 5:
+ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_5,
+ port);
+ break;
+ case 6:
+ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_6,
+ port);
+ break;
+ case 7:
+ ocelot_write_rix(ocelot, max_sdu, QSYS_QMAXSDU_CFG_7,
+ port);
+ break;
+ }
+ }
+
+ ocelot_write_rix(ocelot, maxlen, QSYS_PORT_MAX_SDU, port);
+}
+
static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port,
u32 speed)
{
+ struct ocelot_port *ocelot_port = ocelot->ports[port];
u8 tas_speed;
switch (speed) {
@@ -1154,6 +1345,13 @@ static void vsc9959_sched_speed_set(struct ocelot *ocelot, int port,
QSYS_TAG_CONFIG_LINK_SPEED(tas_speed),
QSYS_TAG_CONFIG_LINK_SPEED_M,
QSYS_TAG_CONFIG, port);
+
+ mutex_lock(&ocelot->tas_lock);
+
+ if (ocelot_port->taprio)
+ vsc9959_tas_guard_bands_update(ocelot, port);
+
+ mutex_unlock(&ocelot->tas_lock);
}
static void vsc9959_new_base_time(struct ocelot *ocelot, ktime_t base_time,
@@ -1204,12 +1402,14 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
mutex_lock(&ocelot->tas_lock);
if (!taprio->enable) {
- ocelot_rmw_rix(ocelot,
- QSYS_TAG_CONFIG_INIT_GATE_STATE(0xFF),
- QSYS_TAG_CONFIG_ENABLE |
- QSYS_TAG_CONFIG_INIT_GATE_STATE_M,
+ ocelot_rmw_rix(ocelot, 0, QSYS_TAG_CONFIG_ENABLE,
QSYS_TAG_CONFIG, port);
+ taprio_offload_free(ocelot_port->taprio);
+ ocelot_port->taprio = NULL;
+
+ vsc9959_tas_guard_bands_update(ocelot, port);
+
mutex_unlock(&ocelot->tas_lock);
return 0;
}
@@ -1258,8 +1458,6 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
QSYS_TAG_CONFIG_SCH_TRAFFIC_QUEUES_M,
QSYS_TAG_CONFIG, port);
- ocelot_port->base_time = taprio->base_time;
-
vsc9959_new_base_time(ocelot, taprio->base_time,
taprio->cycle_time, &base_ts);
ocelot_write(ocelot, base_ts.tv_nsec, QSYS_PARAM_CFG_REG_1);
@@ -1282,6 +1480,11 @@ static int vsc9959_qos_port_tas_set(struct ocelot *ocelot, int port,
ret = readx_poll_timeout(vsc9959_tas_read_cfg_status, ocelot, val,
!(val & QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE),
10, 100000);
+ if (ret)
+ goto err;
+
+ ocelot_port->taprio = taprio_offload_get(taprio);
+ vsc9959_tas_guard_bands_update(ocelot, port);
err:
mutex_unlock(&ocelot->tas_lock);
@@ -1291,17 +1494,18 @@ err:
static void vsc9959_tas_clock_adjust(struct ocelot *ocelot)
{
+ struct tc_taprio_qopt_offload *taprio;
struct ocelot_port *ocelot_port;
struct timespec64 base_ts;
- u64 cycletime;
int port;
u32 val;
mutex_lock(&ocelot->tas_lock);
for (port = 0; port < ocelot->num_phys_ports; port++) {
- val = ocelot_read_rix(ocelot, QSYS_TAG_CONFIG, port);
- if (!(val & QSYS_TAG_CONFIG_ENABLE))
+ ocelot_port = ocelot->ports[port];
+ taprio = ocelot_port->taprio;
+ if (!taprio)
continue;
ocelot_rmw(ocelot,
@@ -1309,17 +1513,12 @@ static void vsc9959_tas_clock_adjust(struct ocelot *ocelot)
QSYS_TAS_PARAM_CFG_CTRL_PORT_NUM_M,
QSYS_TAS_PARAM_CFG_CTRL);
- ocelot_rmw_rix(ocelot,
- QSYS_TAG_CONFIG_INIT_GATE_STATE(0xFF),
- QSYS_TAG_CONFIG_ENABLE |
- QSYS_TAG_CONFIG_INIT_GATE_STATE_M,
+ /* Disable time-aware shaper */
+ ocelot_rmw_rix(ocelot, 0, QSYS_TAG_CONFIG_ENABLE,
QSYS_TAG_CONFIG, port);
- cycletime = ocelot_read(ocelot, QSYS_PARAM_CFG_REG_4);
- ocelot_port = ocelot->ports[port];
-
- vsc9959_new_base_time(ocelot, ocelot_port->base_time,
- cycletime, &base_ts);
+ vsc9959_new_base_time(ocelot, taprio->base_time,
+ taprio->cycle_time, &base_ts);
ocelot_write(ocelot, base_ts.tv_nsec, QSYS_PARAM_CFG_REG_1);
ocelot_write(ocelot, lower_32_bits(base_ts.tv_sec),
@@ -1334,11 +1533,9 @@ static void vsc9959_tas_clock_adjust(struct ocelot *ocelot)
QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE,
QSYS_TAS_PARAM_CFG_CTRL);
- ocelot_rmw_rix(ocelot,
- QSYS_TAG_CONFIG_INIT_GATE_STATE(0xFF) |
+ /* Re-enable time-aware shaper */
+ ocelot_rmw_rix(ocelot, QSYS_TAG_CONFIG_ENABLE,
QSYS_TAG_CONFIG_ENABLE,
- QSYS_TAG_CONFIG_ENABLE |
- QSYS_TAG_CONFIG_INIT_GATE_STATE_M,
QSYS_TAG_CONFIG, port);
}
mutex_unlock(&ocelot->tas_lock);
@@ -2311,6 +2508,7 @@ static const struct felix_info felix_info_vsc9959 = {
.port_modes = vsc9959_port_modes,
.port_setup_tc = vsc9959_port_setup_tc,
.port_sched_speed_set = vsc9959_sched_speed_set,
+ .tas_guard_bands_update = vsc9959_tas_guard_bands_update,
.init_regmap = ocelot_regmap_init,
};
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 81b9686a2079..2fb8232cff1d 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -20,6 +20,9 @@ struct itimerspec64 {
struct timespec64 it_value;
};
+/* Parameters used to convert the timespec values: */
+#define PSEC_PER_NSEC 1000L
+
/* Located here for timespec[64]_valid_strict */
#define TIME64_MAX ((s64)~((u64)1 << 63))
#define TIME64_MIN (-TIME64_MAX - 1)
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 3737570116c3..ac151ecc7f19 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -670,6 +670,8 @@ struct ocelot_port {
/* VLAN that untagged frames are classified to, on ingress */
const struct ocelot_bridge_vlan *pvid_vlan;
+ struct tc_taprio_qopt_offload *taprio;
+
phy_interface_t phy_mode;
unsigned int ptp_skbs_in_flight;
@@ -692,9 +694,6 @@ struct ocelot_port {
int bridge_num;
int speed;
-
- /* Store the AdminBaseTime of EST fetched from userspace. */
- s64 base_time;
};
struct ocelot {
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index b9c71a304d39..0b941dd63d26 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/time.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -176,7 +177,7 @@ static ktime_t get_interval_end_time(struct sched_gate_list *sched,
static int length_to_duration(struct taprio_sched *q, int len)
{
- return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+ return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
}
/* Returns the entry corresponding to next available interval. If
@@ -551,7 +552,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
{
atomic_set(&entry->budget,
- div64_u64((u64)entry->interval * 1000,
+ div64_u64((u64)entry->interval * PSEC_PER_NSEC,
atomic64_read(&q->picos_per_byte)));
}