summaryrefslogtreecommitdiffstats
path: root/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
diff options
context:
space:
mode:
authorEran Ben Elisha2017-12-19 13:52:29 +0100
committerSaeed Mahameed2018-01-19 21:41:33 +0100
commit7ca560b5af70b5f578c9bf32c8fbfbd68d22252f (patch)
tree1ea15c68151f7fc60fd605c929d55084541dedbc /drivers/net/ethernet/mellanox/mlx5/core/en_main.c
parentnet/mlx5e: Add Event Queue meta data info for TX timeout logs (diff)
downloadkernel-qcow2-linux-7ca560b5af70b5f578c9bf32c8fbfbd68d22252f.tar.gz
kernel-qcow2-linux-7ca560b5af70b5f578c9bf32c8fbfbd68d22252f.tar.xz
kernel-qcow2-linux-7ca560b5af70b5f578c9bf32c8fbfbd68d22252f.zip
net/mlx5e: Poll event queue upon TX timeout before performing full channels recovery
Up until this patch, on every TX timeout we would try to do channels recovery. However, in case of a lost interrupt for an EQ, the channel associated to it cannot be recovered if reopened as it would never get another interrupt on sent/received traffic, and eventually ends up with another TX timeout (Restarting the EQ is not part of channel recovery). This patch adds a mechanism for explicitly polling EQ in case of a TX timeout in order to recover from a lost interrupt. If this is not the case (no pending EQEs), perform a channels full recovery as usual. Once a lost EQE is recovered, it triggers the NAPI to run and handle all pending completions. This will free some budget in the bql (via calling netdev_tx_completed_queue) or by clearing pending TXWQEs and waking up the queue. One of the above actions will move the queue to be ready for transmit again. Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> Reviewed-by: Tariq Toukan <tariqt@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/en_main.c')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c51
1 files changed, 36 insertions, 15 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 71b8b171c565..12ea5e319e38 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3757,10 +3757,37 @@ static netdev_features_t mlx5e_features_check(struct sk_buff *skb,
return features;
}
+static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev,
+ struct mlx5e_txqsq *sq)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5_core_dev *mdev = priv->mdev;
+ int irqn_not_used, eqn;
+ struct mlx5_eq *eq;
+ u32 eqe_count;
+
+ if (mlx5_vector2eqn(mdev, sq->cq.mcq.vector, &eqn, &irqn_not_used))
+ return false;
+
+ eq = mlx5_eqn2eq(mdev, eqn);
+ if (IS_ERR(eq))
+ return false;
+
+ netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
+ eqn, eq->cons_index, eq->irqn);
+
+ eqe_count = mlx5_eq_poll_irq_disabled(eq);
+ if (!eqe_count)
+ return false;
+
+ netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->eqn);
+ return true;
+}
+
static void mlx5e_tx_timeout(struct net_device *dev)
{
struct mlx5e_priv *priv = netdev_priv(dev);
- bool sched_work = false;
+ bool reopen_channels = false;
int i;
netdev_err(dev, "TX timeout detected\n");
@@ -3768,29 +3795,23 @@ static void mlx5e_tx_timeout(struct net_device *dev)
for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) {
struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i);
struct mlx5e_txqsq *sq = priv->txq2sq[i];
- struct mlx5_core_dev *mdev = priv->mdev;
- int irqn_not_used, eqn;
- struct mlx5_eq *eq;
if (!netif_xmit_stopped(dev_queue))
continue;
- sched_work = true;
- clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
netdev_err(dev, "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n",
i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
jiffies_to_usecs(jiffies - dev_queue->trans_start));
- if (mlx5_vector2eqn(mdev, sq->cq.mcq.vector, &eqn,
- &irqn_not_used))
- continue;
-
- eq = mlx5_eqn2eq(mdev, eqn);
- if (!IS_ERR(eq))
- netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
- eqn, eq->cons_index, eq->irqn);
+ /* If we recover a lost interrupt, most likely TX timeout will
+ * be resolved, skip reopening channels
+ */
+ if (!mlx5e_tx_timeout_eq_recover(dev, sq)) {
+ clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+ reopen_channels = true;
+ }
}
- if (sched_work && test_bit(MLX5E_STATE_OPENED, &priv->state))
+ if (reopen_channels && test_bit(MLX5E_STATE_OPENED, &priv->state))
schedule_work(&priv->tx_timeout_work);
}