summaryrefslogtreecommitdiffstats
path: root/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2020-09-22 17:44:59 -0700
committerDavid S. Miller <davem@davemloft.net>2020-09-22 17:44:59 -0700
commit573a8095f68c7a73b362b5c0f65bb6a0a8a53a03 (patch)
tree4a93cd0afd57d6836ef3fea1f9fb89b2c9da4709 /drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
parent748d1c8a425ec529d541f082ee7a81f6a51fa120 (diff)
parent5af75c747e2a868abbf8611494b50ed5e076fca7 (diff)
downloadlinux-573a8095f68c7a73b362b5c0f65bb6a0a8a53a03.tar.bz2
Merge tag 'mlx5-updates-2020-09-21' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux
Saeed Mahameed says: ==================== mlx5-updates-2020-09-21 Multi packet TX descriptor support for SKBs. This series introduces some refactoring of the regular TX data path in mlx5 and adds the Enhanced TX MPWQE feature support. MPWQE stands for multi-packet work queue element, and it can serve multiple packets, reducing the PCI bandwidth spent on control traffic. It should improve performance in scenarios where PCI is the bottleneck, and xmit_more is signaled by the kernel. The refactoring done in this series also improves the packet rate on its own. MPWQE is already implemented in the XDP tx path, this series adds the support of MPWQE for regular kernel SKB tx path. MPWQE is supported from ConnectX-5 and onward, for legacy devices we need to keep backward compatibility for regular (Single packet) WQE descriptor. MPWQE is not compatible with certain offloads and features, such as TLS offload, TSO, nonlinear SKBs. If such incompatible features are in use, the driver gracefully falls back to non-MPWQE per SKB. Prior to the final patch "net/mlx5e: Enhanced TX MPWQE for SKBs" that adds the actual support, Maxim did some refactoring to the tx data path to split it into stages and smaller helper functions that can be utilized and reused for both legacy and new MPWQE feature. Performance testing: UDP performance is improved in a single stream pktgen test: Packet rate: 16.86 Mpps (±0.15 Mpps) -> 20.94 Mpps (±0.33 Mpps) Instructions per packet: 434 -> 329 Cycles per packet: 158 -> 123 Instructions per cycle: 2.75 -> 2.67 TCP and XDP_TX single stream tests show no performance difference. MPWQE can reduce PCI bandwidth: PCI Gen2, pktgen at fixed rate of 36864000 pps on 24 CPU cores: Inbound PCI utilization with MPWQE off: 80.3% Inbound PCI utilization with MPWQE on: 59.0% PCI Gen3, pktgen at fixed rate of 56064000 pps on 24 CPU cores: Inbound PCI utilization with MPWQE off: 65.4% Inbound PCI utilization with MPWQE on: 49.3% MPWQE can also reduce CPU load, increasing the packet rate in case of CPU bottleneck: PCI Gen2, pktgen at full rate on 24 CPU cores: Packet rate with MPWQE off: 37.5 Mpps Packet rate with MPWQE on: 49.0 Mpps PCI Gen3, pktgen at full rate on 24 CPU cores: Packet rate with MPWQE off: 57.0 Mpps Packet rate with MPWQE on: 66.8 Mpps Burst size in all pktgen tests is 32. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz (x86_64) NIC: Mellanox ConnectX-6 Dx GCC 10.2.0 ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c35
1 files changed, 19 insertions, 16 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 562bc465f82b..ae90d533a350 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -59,7 +59,7 @@ static inline bool
mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
struct mlx5e_dma_info *di, struct xdp_buff *xdp)
{
- struct mlx5e_xdp_xmit_data xdptxd;
+ struct mlx5e_xmit_data xdptxd;
struct mlx5e_xdp_info xdpi;
struct xdp_frame *xdpf;
dma_addr_t dma_addr;
@@ -194,18 +194,22 @@ static u16 mlx5e_xdpsq_get_next_pi(struct mlx5e_xdpsq *sq, u16 size)
static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq)
{
- struct mlx5e_xdp_mpwqe *session = &sq->mpwqe;
+ struct mlx5e_tx_mpwqe *session = &sq->mpwqe;
struct mlx5e_xdpsq_stats *stats = sq->stats;
+ struct mlx5e_tx_wqe *wqe;
u16 pi;
- pi = mlx5e_xdpsq_get_next_pi(sq, MLX5_SEND_WQE_MAX_WQEBBS);
- session->wqe = MLX5E_TX_FETCH_WQE(sq, pi);
+ pi = mlx5e_xdpsq_get_next_pi(sq, MLX5E_TX_MPW_MAX_WQEBBS);
+ wqe = MLX5E_TX_FETCH_WQE(sq, pi);
+ net_prefetchw(wqe->data);
- net_prefetchw(session->wqe->data);
- session->ds_count = MLX5E_XDP_TX_EMPTY_DS_COUNT;
- session->pkt_count = 0;
-
- mlx5e_xdp_update_inline_state(sq);
+ *session = (struct mlx5e_tx_mpwqe) {
+ .wqe = wqe,
+ .bytes_count = 0,
+ .ds_count = MLX5E_TX_WQE_EMPTY_DS_COUNT,
+ .pkt_count = 0,
+ .inline_on = mlx5e_xdp_get_inline_state(sq, session->inline_on),
+ };
stats->mpwqe++;
}
@@ -213,7 +217,7 @@ static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq)
void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq)
{
struct mlx5_wq_cyc *wq = &sq->wq;
- struct mlx5e_xdp_mpwqe *session = &sq->mpwqe;
+ struct mlx5e_tx_mpwqe *session = &sq->mpwqe;
struct mlx5_wqe_ctrl_seg *cseg = &session->wqe->ctrl;
u16 ds_count = session->ds_count;
u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
@@ -258,10 +262,10 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq
}
INDIRECT_CALLABLE_SCOPE bool
-mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_xmit_data *xdptxd,
+mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
struct mlx5e_xdp_info *xdpi, int check_result)
{
- struct mlx5e_xdp_mpwqe *session = &sq->mpwqe;
+ struct mlx5e_tx_mpwqe *session = &sq->mpwqe;
struct mlx5e_xdpsq_stats *stats = sq->stats;
if (unlikely(xdptxd->len > sq->hw_mtu)) {
@@ -284,8 +288,7 @@ mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_xmit_data *x
mlx5e_xdp_mpwqe_add_dseg(sq, xdptxd, stats);
- if (unlikely(mlx5e_xdp_no_room_for_inline_pkt(session) ||
- session->ds_count == MLX5E_XDP_MPW_MAX_NUM_DS))
+ if (unlikely(mlx5e_xdp_mpqwe_is_full(session)))
mlx5e_xdp_mpwqe_complete(sq);
mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi);
@@ -306,7 +309,7 @@ INDIRECT_CALLABLE_SCOPE int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq)
}
INDIRECT_CALLABLE_SCOPE bool
-mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_xmit_data *xdptxd,
+mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd,
struct mlx5e_xdp_info *xdpi, int check_result)
{
struct mlx5_wq_cyc *wq = &sq->wq;
@@ -503,7 +506,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
- struct mlx5e_xdp_xmit_data xdptxd;
+ struct mlx5e_xmit_data xdptxd;
struct mlx5e_xdp_info xdpi;
bool ret;