From 05eb06d86685e7d9dac60e6bbb46d7f4c30b056e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 19 Jun 2019 10:32:48 -0400 Subject: xprtrdma: Fix occasional transport deadlock Under high I/O workloads, I've noticed that an RPC/RDMA transport occasionally deadlocks (IOPS goes to zero, and doesn't recover). Diagnosis shows that the sendctx queue is empty, but when sendctxs are returned to the queue, the xprt_write_space wake-up never occurs. The wake-up logic in rpcrdma_sendctx_put_locked is racy. I noticed that both EMPTY_SCQ and XPRT_WRITE_SPACE are implemented via an atomic bit. Just one of those is sufficient. Removing EMPTY_SCQ in favor of the generic bit mechanism makes the deadlock un-reproducible. Without EMPTY_SCQ, rpcrdma_buffer::rb_flags is no longer used and is therefore removed. Unfortunately this patch does not apply cleanly to stable. If needed, someone will have to port it and test it. Fixes: 2fad659209d5 ("xprtrdma: Wait on empty sendctx queue") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 59492a93fe1d..2fb415136f7b 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -539,6 +539,33 @@ TRACE_EVENT(xprtrdma_marshal_failed, ) ); +TRACE_EVENT(xprtrdma_prepsend_failed, + TP_PROTO(const struct rpc_rqst *rqst, + int ret + ), + + TP_ARGS(rqst, ret), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(int, ret) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->ret = ret; + ), + + TP_printk("task:%u@%u xid=0x%08x: ret=%d", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->ret + ) +); + TRACE_EVENT(xprtrdma_post_send, TP_PROTO( const struct rpcrdma_req *req, -- cgit v1.2.3-55-g7522