From 77a5db13153906a7e00740b10b2730e53385c5a8 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Wed, 26 Oct 2016 12:36:40 -0700 Subject: rdma_cm: add rdma_reject_msg() helper function rdma_reject_msg() returns a pointer to a string message associated with the transport reject reason codes. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Steve Wise Reviewed-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'drivers/infiniband/core/cm.c') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c99525512b34..6c64d0ce64b3 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -57,6 +57,54 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); +static const char * const ibcm_rej_reason_strs[] = { + [IB_CM_REJ_NO_QP] = "no QP", + [IB_CM_REJ_NO_EEC] = "no EEC", + [IB_CM_REJ_NO_RESOURCES] = "no resources", + [IB_CM_REJ_TIMEOUT] = "timeout", + [IB_CM_REJ_UNSUPPORTED] = "unsupported", + [IB_CM_REJ_INVALID_COMM_ID] = "invalid comm ID", + [IB_CM_REJ_INVALID_COMM_INSTANCE] = "invalid comm instance", + [IB_CM_REJ_INVALID_SERVICE_ID] = "invalid service ID", + [IB_CM_REJ_INVALID_TRANSPORT_TYPE] = "invalid transport type", + [IB_CM_REJ_STALE_CONN] = "stale conn", + [IB_CM_REJ_RDC_NOT_EXIST] = "RDC not exist", + [IB_CM_REJ_INVALID_GID] = "invalid GID", + [IB_CM_REJ_INVALID_LID] = "invalid LID", + [IB_CM_REJ_INVALID_SL] = "invalid SL", + [IB_CM_REJ_INVALID_TRAFFIC_CLASS] = "invalid traffic class", + [IB_CM_REJ_INVALID_HOP_LIMIT] = "invalid hop limit", + [IB_CM_REJ_INVALID_PACKET_RATE] = "invalid packet rate", + [IB_CM_REJ_INVALID_ALT_GID] = "invalid alt GID", + [IB_CM_REJ_INVALID_ALT_LID] = "invalid alt LID", + [IB_CM_REJ_INVALID_ALT_SL] = "invalid alt SL", + [IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS] = "invalid alt traffic class", + [IB_CM_REJ_INVALID_ALT_HOP_LIMIT] = "invalid alt hop limit", + [IB_CM_REJ_INVALID_ALT_PACKET_RATE] = "invalid alt packet rate", + [IB_CM_REJ_PORT_CM_REDIRECT] = "port CM redirect", + [IB_CM_REJ_PORT_REDIRECT] = "port redirect", + [IB_CM_REJ_INVALID_MTU] = "invalid MTU", + [IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES] = "insufficient resp resources", + [IB_CM_REJ_CONSUMER_DEFINED] = "consumer defined", + [IB_CM_REJ_INVALID_RNR_RETRY] = "invalid RNR retry", + [IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID] = "duplicate local comm ID", + [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version", + [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label", + [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label", +}; + +const char *__attribute_const__ ibcm_reject_msg(int reason) +{ + size_t index = reason; + + if (index < ARRAY_SIZE(ibcm_rej_reason_strs) && + ibcm_rej_reason_strs[index]) + return ibcm_rej_reason_strs[index]; + else + return "unrecognized reason"; +} +EXPORT_SYMBOL(ibcm_reject_msg); + static void cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device, void *client_data); -- cgit v1.2.3-55-g7522 From 9315bc9a133011fdb084f2626b86db3ebb64661f Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Fri, 28 Oct 2016 13:14:29 +0200 Subject: IB/core: Issue DREQ when receiving REQ/REP for stale QP from "InfiBand Architecture Specifications Volume 1": A QP is said to have a stale connection when only one side has connection information. A stale connection may result if the remote CM had dropped the connection and sent a DREQ but the DREQ was never received by the local CM. Alternatively the remote CM may have lost all record of past connections because its node crashed and rebooted, while the local CM did not become aware of the remote node's reboot and therefore did not clean up stale connections. and: A local CM may receive a REQ/REP for a stale connection. It shall abort the connection issuing REJ to the REQ/REP. It shall then issue DREQ with "DREQ:remote QPN” set to the remote QPN from the REQ/REP. This patch solves a problem with reuse of QPN. Current codebase, that is IPoIB, relies on a REAP-mechanism to do cleanup of the structures in CM. A problem with this is the timeconstants governing this mechanism; they are up to 768 seconds and the interface may look inresponsive in that period. Issuing a DREQ (and receiving a DREP) does the necessary cleanup and the interface comes up. Signed-off-by: Hans Westgaard Ry Reviewed-by: Håkon Bugge Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/core/cm.c') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c99525512b34..c97e4d586b44 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1519,6 +1519,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; struct cm_timewait_info *timewait_info; struct cm_req_msg *req_msg; + struct ib_cm_id *cm_id; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1540,10 +1541,18 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { cm_cleanup_timewait(cm_id_priv->timewait_info); + cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, NULL, 0); + if (cur_cm_id_priv) { + cm_id = &cur_cm_id_priv->id; + ib_send_cm_dreq(cm_id, NULL, 0); + cm_deref_id(cur_cm_id_priv); + } return NULL; } @@ -1919,6 +1928,9 @@ static int cm_rep_handler(struct cm_work *work) struct cm_id_private *cm_id_priv; struct cm_rep_msg *rep_msg; int ret; + struct cm_id_private *cur_cm_id_priv; + struct ib_cm_id *cm_id; + struct cm_timewait_info *timewait_info; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); @@ -1953,16 +1965,26 @@ static int cm_rep_handler(struct cm_work *work) goto error; } /* Check for a stale connection. */ - if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) { + timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); + if (timewait_info) { rb_erase(&cm_id_priv->timewait_info->remote_id_node, &cm.remote_id_table); cm_id_priv->timewait_info->inserted_remote_id = 0; + cur_cm_id_priv = cm_get_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; + if (cur_cm_id_priv) { + cm_id = &cur_cm_id_priv->id; + ib_send_cm_dreq(cm_id, NULL, 0); + cm_deref_id(cur_cm_id_priv); + } + goto error; } spin_unlock(&cm.lock); -- cgit v1.2.3-55-g7522