1
0
Fork 0
mirror of synced 2025-03-06 20:59:54 +01:00

net/mlx5: Increase FW pre-init timeout for health recovery

Currently, health recovery will reload driver to recover it from fatal
errors. During the driver's load process, it would wait for FW to set the
pre-init bit for up to 120 seconds, beyond this threshold it would abort
the load process. In some cases, such as a FW upgrade on the DPU, this
timeout period is insufficient, and the user has no way to recover the
host device.

To solve this issue, introduce a new FW pre-init timeout for health
recovery, which is set to 2 hours.

The timeout for devlink reload and probe will use the original one because
they are user triggered flows, and therefore should not have a
significantly long timeout, during which the user command would hang.

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
This commit is contained in:
Gavin Li 2022-03-27 17:36:44 +03:00 committed by Saeed Mahameed
parent 8324a02c34
commit 37ca95e62e
6 changed files with 20 additions and 13 deletions

View file

@ -178,13 +178,13 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
*actions_performed = BIT(action);
switch (action) {
case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
return mlx5_load_one(dev);
return mlx5_load_one(dev, false);
case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
break;
/* On fw_activate action, also driver is reloaded and reinit performed */
*actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
return mlx5_load_one(dev);
return mlx5_load_one(dev, false);
default:
/* Unsupported action should not get to this function */
WARN_ON(1);

View file

@ -148,7 +148,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
complete(&fw_reset->done);
} else {
mlx5_load_one(dev);
mlx5_load_one(dev, false);
devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));

View file

@ -10,6 +10,7 @@ struct mlx5_timeouts {
static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
[MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
[MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
[MLX5_TO_FW_INIT_MS] = 2000,

View file

@ -7,6 +7,7 @@
enum mlx5_timeouts_types {
/* pre init timeouts (not read from FW) */
MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
MLX5_TO_FW_PRE_INIT_WAIT_MS,

View file

@ -1003,7 +1003,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
mlx5_devcom_unregister_device(dev->priv.devcom);
}
static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
{
int err;
@ -1018,11 +1018,11 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
/* wait for firmware to accept initialization segments configurations
*/
err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
err = wait_fw_init(dev, timeout,
mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
if (err) {
mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
timeout);
return err;
}
@ -1272,7 +1272,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
mutex_lock(&dev->intf_state_mutex);
dev->state = MLX5_DEVICE_STATE_UP;
err = mlx5_function_setup(dev, true);
err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
if (err)
goto err_function;
@ -1336,9 +1336,10 @@ out:
mutex_unlock(&dev->intf_state_mutex);
}
int mlx5_load_one(struct mlx5_core_dev *dev)
int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
{
int err = 0;
u64 timeout;
mutex_lock(&dev->intf_state_mutex);
if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@ -1348,7 +1349,11 @@ int mlx5_load_one(struct mlx5_core_dev *dev)
/* remove any previous indication of internal error */
dev->state = MLX5_DEVICE_STATE_UP;
err = mlx5_function_setup(dev, false);
if (recovery)
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
else
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
err = mlx5_function_setup(dev, timeout);
if (err)
goto err_function;
@ -1719,7 +1724,7 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
mlx5_pci_trace(dev, "Enter, loading driver..\n");
err = mlx5_load_one(dev);
err = mlx5_load_one(dev, false);
mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
!err ? "recovered" : "Failed");
@ -1807,7 +1812,7 @@ static int mlx5_resume(struct pci_dev *pdev)
{
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
return mlx5_load_one(dev);
return mlx5_load_one(dev, false);
}
static const struct pci_device_id mlx5_core_pci_table[] = {
@ -1852,7 +1857,7 @@ int mlx5_recover_device(struct mlx5_core_dev *dev)
return -EIO;
}
return mlx5_load_one(dev);
return mlx5_load_one(dev, true);
}
static struct pci_driver mlx5_core_driver = {

View file

@ -290,7 +290,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev);
int mlx5_init_one(struct mlx5_core_dev *dev);
void mlx5_uninit_one(struct mlx5_core_dev *dev);
void mlx5_unload_one(struct mlx5_core_dev *dev);
int mlx5_load_one(struct mlx5_core_dev *dev);
int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);