xprtrdma: Fix recursion into rpcrdma_xprt_disconnect()
Both Dan and I have observed two processes invoking
rpcrdma_xprt_disconnect() concurrently. In my case:
1. The connect worker invokes rpcrdma_xprt_disconnect(), which
drains the QP and waits for the final completion
2. This causes the newly posted Receive to flush and invoke
xprt_force_disconnect()
3. xprt_force_disconnect() sets CLOSE_WAIT and wakes up the RPC task
that is holding the transport lock
4. The RPC task invokes xprt_connect(), which calls ->ops->close
5. xprt_rdma_close() invokes rpcrdma_xprt_disconnect(), which tries
to destroy the QP.
Deadlock.
To prevent xprt_force_disconnect() from waking anything, handle the
cleanup after a failed connection attempt in the xprt's sndtask.
The retry loop is removed from rpcrdma_xprt_connect() to ensure
that the newly allocated ep and id are properly released before
a REJECTED connection attempt can be retried.
Reported-by: Dan Aloni <dan@kernelim.com>
Fixes: e28ce90083 ("xprtrdma: kmalloc rpcrdma_ep separate from rpcrdma_xprt")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
This commit is contained in:
parent
85bfd71bc3
commit
4cf44be6f1
2 changed files with 7 additions and 8 deletions
|
@ -249,6 +249,11 @@ xprt_rdma_connect_worker(struct work_struct *work)
|
||||||
xprt->stat.connect_start;
|
xprt->stat.connect_start;
|
||||||
xprt_set_connected(xprt);
|
xprt_set_connected(xprt);
|
||||||
rc = -EAGAIN;
|
rc = -EAGAIN;
|
||||||
|
} else {
|
||||||
|
/* Force a call to xprt_rdma_close to clean up */
|
||||||
|
spin_lock(&xprt->transport_lock);
|
||||||
|
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
|
||||||
|
spin_unlock(&xprt->transport_lock);
|
||||||
}
|
}
|
||||||
xprt_wake_pending_tasks(xprt, rc);
|
xprt_wake_pending_tasks(xprt, rc);
|
||||||
}
|
}
|
||||||
|
|
|
@ -290,7 +290,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
|
||||||
sap, rdma_reject_msg(id, event->status));
|
sap, rdma_reject_msg(id, event->status));
|
||||||
ep->re_connect_status = -ECONNREFUSED;
|
ep->re_connect_status = -ECONNREFUSED;
|
||||||
if (event->status == IB_CM_REJ_STALE_CONN)
|
if (event->status == IB_CM_REJ_STALE_CONN)
|
||||||
ep->re_connect_status = -EAGAIN;
|
ep->re_connect_status = -ENOTCONN;
|
||||||
goto disconnected;
|
goto disconnected;
|
||||||
case RDMA_CM_EVENT_DISCONNECTED:
|
case RDMA_CM_EVENT_DISCONNECTED:
|
||||||
ep->re_connect_status = -ECONNABORTED;
|
ep->re_connect_status = -ECONNABORTED;
|
||||||
|
@ -521,8 +521,6 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
|
||||||
struct rpcrdma_ep *ep;
|
struct rpcrdma_ep *ep;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
retry:
|
|
||||||
rpcrdma_xprt_disconnect(r_xprt);
|
|
||||||
rc = rpcrdma_ep_create(r_xprt);
|
rc = rpcrdma_ep_create(r_xprt);
|
||||||
if (rc)
|
if (rc)
|
||||||
return rc;
|
return rc;
|
||||||
|
@ -550,17 +548,13 @@ retry:
|
||||||
wait_event_interruptible(ep->re_connect_wait,
|
wait_event_interruptible(ep->re_connect_wait,
|
||||||
ep->re_connect_status != 0);
|
ep->re_connect_status != 0);
|
||||||
if (ep->re_connect_status <= 0) {
|
if (ep->re_connect_status <= 0) {
|
||||||
if (ep->re_connect_status == -EAGAIN)
|
|
||||||
goto retry;
|
|
||||||
rc = ep->re_connect_status;
|
rc = ep->re_connect_status;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
rc = rpcrdma_reqs_setup(r_xprt);
|
rc = rpcrdma_reqs_setup(r_xprt);
|
||||||
if (rc) {
|
if (rc)
|
||||||
rpcrdma_xprt_disconnect(r_xprt);
|
|
||||||
goto out;
|
goto out;
|
||||||
}
|
|
||||||
rpcrdma_mrs_create(r_xprt);
|
rpcrdma_mrs_create(r_xprt);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
|
|
Loading…
Add table
Reference in a new issue