summaryrefslogtreecommitdiffstats
path: root/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
diff options
context:
space:
mode:
authorEran Ben Elisha2019-01-17 22:59:19 +0100
committerDavid S. Miller2019-01-18 23:51:23 +0100
commitce019faa70f81555fa17ebc1d5a03651f2e7e15a (patch)
tree24f45a4882a0a3f1852c7ae7909fabc0b693aae2 /drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
parentnet/mlx5e: Add TX reporter support (diff)
downloadkernel-qcow2-linux-ce019faa70f81555fa17ebc1d5a03651f2e7e15a.tar.gz
kernel-qcow2-linux-ce019faa70f81555fa17ebc1d5a03651f2e7e15a.tar.xz
kernel-qcow2-linux-ce019faa70f81555fa17ebc1d5a03651f2e7e15a.zip
net/mlx5e: Add TX timeout support for mlx5e TX reporter
With this patch, ndo_tx_timeout callback will be redirected to the TX reporter in order to detect a TX timeout error and report it to the devlink health. (The watchdog detects TX timeouts, but the driver verify the issue still exists before launching any recover method). In addition, recover from TX timeout in case of lost interrupt was added to the TX reporter recover method. The TX timeout recover from lost interrupt is not a new feature in the driver, this patch re-organize the functionality and move it to the TX reporter recovery flow. TX timeout example: (with auto_recover set to false, if set to true, the manual recover and diagnose sections are irrelevant) $cat /sys/kernel/debug/tracing/trace ... devlink_health_report: bus_name=pci dev_name=0000:00:09.0 driver_name=mlx5_core reporter_name=TX: TX timeout on queue: 0, SQ: 0xd8a, CQ: 0x406, SQ Cons: 0x2 SQ Prod: 0x2, usecs since last trans: 13972000 $devlink health diagnose pci/0000:00:09 reporter TX SQ 0xd8a: HW state: 1, stopped: 1 SQ 0xe44: HW state: 1, stopped: 0 SQ 0xeb4: HW state: 1, stopped: 0 SQ 0xf1f: HW state: 1, stopped: 0 SQ 0xf80: HW state: 1, stopped: 0 SQ 0xfe5: HW state: 1, stopped: 0 $devlink health recover pci/0000:00:09 reporter TX $devlink health show pci/0000:00:09.0: name TX state healthy #err 1 #recover 1 last_dump_ts N/A dump_available false attributes: grace_period 500 auto_recover false Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> Reviewed-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c35
1 files changed, 35 insertions, 0 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index 9800df4909c2..d9675afbb924 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -126,6 +126,41 @@ void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq)
&err_ctx);
}
+static int mlx5e_tx_reporter_timeout_recover(struct mlx5e_txqsq *sq)
+{
+ struct mlx5_eq_comp *eq = sq->cq.mcq.eq;
+ u32 eqe_count;
+
+ netdev_err(sq->channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
+ eq->core.eqn, eq->core.cons_index, eq->core.irqn);
+
+ eqe_count = mlx5_eq_poll_irq_disabled(eq);
+ if (!eqe_count) {
+ clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+ return 1;
+ }
+
+ netdev_err(sq->channel->netdev, "Recover %d eqes on EQ 0x%x\n",
+ eqe_count, eq->core.eqn);
+ sq->channel->stats->eq_rearm++;
+ return 0;
+}
+
+void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq)
+{
+ struct mlx5e_tx_err_ctx err_ctx;
+ char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN];
+
+ err_ctx.sq = sq;
+ err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
+ sprintf(err_str,
+ "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
+ sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
+ jiffies_to_usecs(jiffies - sq->txq->trans_start));
+ devlink_health_report(sq->channel->priv->tx_reporter, err_str,
+ &err_ctx);
+}
+
/* state lock cannot be grabbed within this function.
* It can cause a dead lock or a read-after-free.
*/