RDMA/mlx5: Add a UMR recovery flow
When a UMR fails, the UMR QP moves to an error state, so all further UMR operations fail as well. Add a recovery flow for the UMR QP and repost the flushed WQEs. Link: https://lore.kernel.org/r/6cc24816cca049bd8541317f5e41d3ac659445d3.1652588303.git.leonro@nvidia.com Signed-off-by: Aharon Landau <aharonl@nvidia.com> Reviewed-by: Michael Guralnik <michaelgur@nvidia.com> Signed-off-by: Leon Romanovsky <leon@kernel.org>
This commit is contained in:
parent
650126a890
commit
158e71bb69
3 changed files with 83 additions and 11 deletions
|
@ -523,6 +523,10 @@ repoll:
|
||||||
"Requestor" : "Responder", cq->mcq.cqn);
|
"Requestor" : "Responder", cq->mcq.cqn);
|
||||||
mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
|
mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
|
||||||
err_cqe->syndrome, err_cqe->vendor_err_synd);
|
err_cqe->syndrome, err_cqe->vendor_err_synd);
|
||||||
|
if (wc->status != IB_WC_WR_FLUSH_ERR &&
|
||||||
|
(*cur_qp)->type == MLX5_IB_QPT_REG_UMR)
|
||||||
|
dev->umrc.state = MLX5_UMR_STATE_RECOVER;
|
||||||
|
|
||||||
if (opcode == MLX5_CQE_REQ_ERR) {
|
if (opcode == MLX5_CQE_REQ_ERR) {
|
||||||
wq = &(*cur_qp)->sq;
|
wq = &(*cur_qp)->sq;
|
||||||
wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
|
wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
|
||||||
|
|
|
@ -717,13 +717,23 @@ struct mlx5_ib_umr_context {
|
||||||
struct completion done;
|
struct completion done;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 * States of the shared UMR QP, stored in umr_common.state.
 * mlx5r_umr_post_send_wait() checks the state under umr_common.lock before
 * posting a UMR WQE.
 */
enum {
|
||||||
|
/* UMR QP is usable; set by mlx5r_umr_recover() on success. */
MLX5_UMR_STATE_ACTIVE,
|
||||||
|
/*
 * Set by the CQ poll path when a UMR QP completion reports an error other
 * than IB_WC_WR_FLUSH_ERR. While in this state, posters sleep and retry.
 */
MLX5_UMR_STATE_RECOVER,
|
||||||
|
/* Recovery failed; posters give up and return -EFAULT. */
MLX5_UMR_STATE_ERR,
|
||||||
|
};
|
||||||
|
|
||||||
struct umr_common {
|
struct umr_common {
|
||||||
struct ib_pd *pd;
|
struct ib_pd *pd;
|
||||||
struct ib_cq *cq;
|
struct ib_cq *cq;
|
||||||
struct ib_qp *qp;
|
struct ib_qp *qp;
|
||||||
/* control access to UMR QP
|
/* Protects from UMR QP overflow
|
||||||
*/
|
*/
|
||||||
struct semaphore sem;
|
struct semaphore sem;
|
||||||
|
/* Protects from using UMR while the UMR is not active
|
||||||
|
*/
|
||||||
|
struct mutex lock;
|
||||||
|
unsigned int state;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct mlx5_cache_ent {
|
struct mlx5_cache_ent {
|
||||||
|
|
|
@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
|
||||||
dev->umrc.pd = pd;
|
dev->umrc.pd = pd;
|
||||||
|
|
||||||
sema_init(&dev->umrc.sem, MAX_UMR_WR);
|
sema_init(&dev->umrc.sem, MAX_UMR_WR);
|
||||||
|
mutex_init(&dev->umrc.lock);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
|
||||||
ib_dealloc_pd(dev->umrc.pd);
|
ib_dealloc_pd(dev->umrc.pd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * mlx5r_umr_recover - try to bring the UMR QP back into service.
 * @dev: device whose dev->umrc QP should be recovered.
 *
 * Moves the UMR QP to IB_QPS_RESET and then runs mlx5r_umr_qp_rst2rts() on it.
 * On success, umrc->state becomes MLX5_UMR_STATE_ACTIVE and 0 is returned.
 * On any failure, umrc->state becomes MLX5_UMR_STATE_ERR and the error code of
 * the failing step is returned.
 *
 * The caller visible in this change, mlx5r_umr_post_send_wait(), invokes this
 * with umrc->lock held.
 */
static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
|
||||||
|
{
|
||||||
|
struct umr_common *umrc = &dev->umrc;
|
||||||
|
struct ib_qp_attr attr;
|
||||||
|
int err;
|
||||||
|
|
||||||
|
/* Only qp_state is filled in; the IB_QP_STATE mask below selects just that field. */
attr.qp_state = IB_QPS_RESET;
|
||||||
|
err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
|
||||||
|
if (err) {
|
||||||
|
mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* QP is now in reset; re-arm it with the RST2RTS helper. */
err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
|
||||||
|
if (err)
|
||||||
|
goto err;
|
||||||
|
|
||||||
|
umrc->state = MLX5_UMR_STATE_ACTIVE;
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Failure path: record the error state so later posters stop retrying. */
err:
|
||||||
|
umrc->state = MLX5_UMR_STATE_ERR;
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
|
static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
|
||||||
struct mlx5r_umr_wqe *wqe, bool with_data)
|
struct mlx5r_umr_wqe *wqe, bool with_data)
|
||||||
{
|
{
|
||||||
|
@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
|
||||||
|
|
||||||
id.ib_cqe = cqe;
|
id.ib_cqe = cqe;
|
||||||
mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
|
mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
|
||||||
MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR);
|
MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
|
||||||
|
|
||||||
mlx5r_ring_db(qp, 1, ctrl);
|
mlx5r_ring_db(qp, 1, ctrl);
|
||||||
|
|
||||||
|
@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
|
||||||
mlx5r_umr_init_context(&umr_context);
|
mlx5r_umr_init_context(&umr_context);
|
||||||
|
|
||||||
down(&umrc->sem);
|
down(&umrc->sem);
|
||||||
|
while (true) {
|
||||||
|
mutex_lock(&umrc->lock);
|
||||||
|
if (umrc->state == MLX5_UMR_STATE_ERR) {
|
||||||
|
mutex_unlock(&umrc->lock);
|
||||||
|
err = -EFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (umrc->state == MLX5_UMR_STATE_RECOVER) {
|
||||||
|
mutex_unlock(&umrc->lock);
|
||||||
|
usleep_range(3000, 5000);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
|
err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
|
||||||
with_data);
|
with_data);
|
||||||
if (err)
|
mutex_unlock(&umrc->lock);
|
||||||
mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
|
if (err) {
|
||||||
else {
|
mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
|
||||||
wait_for_completion(&umr_context.done);
|
err);
|
||||||
if (umr_context.status != IB_WC_SUCCESS) {
|
break;
|
||||||
mlx5_ib_warn(dev, "reg umr failed (%u)\n",
|
|
||||||
umr_context.status);
|
|
||||||
err = -EFAULT;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wait_for_completion(&umr_context.done);
|
||||||
|
|
||||||
|
if (umr_context.status == IB_WC_SUCCESS)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (umr_context.status == IB_WC_WR_FLUSH_ERR)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
WARN_ON_ONCE(1);
|
||||||
|
mlx5_ib_warn(dev,
|
||||||
|
"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
|
||||||
|
umr_context.status);
|
||||||
|
mutex_lock(&umrc->lock);
|
||||||
|
err = mlx5r_umr_recover(dev);
|
||||||
|
mutex_unlock(&umrc->lock);
|
||||||
|
if (err)
|
||||||
|
mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
|
||||||
|
err);
|
||||||
|
err = -EFAULT;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
up(&umrc->sem);
|
up(&umrc->sem);
|
||||||
return err;
|
return err;
|
||||||
|
|
Loading…
Add table
Reference in a new issue