From cb4876e8ce1c6d78306c206df1970748ebb89025 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Mon, 13 Nov 2017 23:29:05 +0200
Subject: nvmet-rdma: removed queue cleanup from module exit

We already do that when we are notified in device removal
which is triggered when unregistering as an ib client.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/rdma.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 0e4c15754c58..454a5dce81f8 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1503,25 +1503,9 @@ err_ib_client:
 
 static void __exit nvmet_rdma_exit(void)
 {
-	struct nvmet_rdma_queue *queue;
-
 	nvmet_unregister_transport(&nvmet_rdma_ops);
-
-	flush_scheduled_work();
-
-	mutex_lock(&nvmet_rdma_queue_mutex);
-	while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
-			struct nvmet_rdma_queue, queue_list))) {
-		list_del_init(&queue->queue_list);
-
-		mutex_unlock(&nvmet_rdma_queue_mutex);
-		__nvmet_rdma_queue_disconnect(queue);
-		mutex_lock(&nvmet_rdma_queue_mutex);
-	}
-	mutex_unlock(&nvmet_rdma_queue_mutex);
-
-	flush_scheduled_work();
 	ib_unregister_client(&nvmet_rdma_ib_client);
+	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
 	ida_destroy(&nvmet_rdma_queue_ida);
 }
 
-- 
cgit v1.2.3-55-g7522


From 424125a09db7a207ab53876db50a6198ca88518f Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Mon, 4 Dec 2017 10:47:10 +0200
Subject: nvmet-rdma: lowering log level for chatty debug messages

It is a bit chatty to report on every deleted queue, so keep it for debug
purposes only.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 454a5dce81f8..978e169c11bf 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -921,7 +921,7 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
 
 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
 {
-	pr_info("freeing queue %d\n", queue->idx);
+	pr_debug("freeing queue %d\n", queue->idx);
 
 	nvmet_sq_destroy(&queue->nvme_sq);
 
-- 
cgit v1.2.3-55-g7522


From 6a1c57acab85e2e7a18827b43710b4e16c11148d Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Mon, 4 Dec 2017 10:47:09 +0200
Subject: nvmet: lower log level for each queue creation

It is a bit chatty to report on each queue, log it only for debug
purposes.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fabrics-cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index db3bf6b8bf9e..19e9e42ae943 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -225,7 +225,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
 		goto out_ctrl_put;
 	}
 
-	pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
+	pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
 
 out:
 	kfree(d);
-- 
cgit v1.2.3-55-g7522


From 278e096063f1914fccfc77a617be9fc8dbb31b0e Mon Sep 17 00:00:00 2001
From: James Smart
Date: Wed, 29 Nov 2017 16:47:30 -0800
Subject: nvme_fcloop: fix abort race condition

A test case revealed a race condition of an i/o completing on a thread
parallel to the delete_association generating the aborts for the
outstanding ios on the controller.  The i/o completion was freeing the
target fcloop context, thus the abort task referenced the just-freed
memory.

Correct by clearing the target/initiator cross pointers in the io
completion and abort tasks before calling the callbacks. On aborts
that detect already finished io's, ensure the complete context is
called.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fcloop.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 7b75d9de55ab..3eb2a0733f46 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -370,6 +370,7 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
 
 	spin_lock(&tfcp_req->reqlock);
 	fcpreq = tfcp_req->fcpreq;
+	tfcp_req->fcpreq = NULL;
 	spin_unlock(&tfcp_req->reqlock);
 
 	if (tport->remoteport && fcpreq) {
@@ -611,11 +612,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
 
 	if (!tfcp_req)
 		/* abort has already been called */
-		return;
-
-	if (rport->targetport)
-		nvmet_fc_rcv_fcp_abort(rport->targetport,
-					&tfcp_req->tgt_fcp_req);
+		goto finish;
 
 	/* break initiator/target relationship for io */
 	spin_lock(&tfcp_req->reqlock);
@@ -623,6 +620,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
 	tfcp_req->fcpreq = NULL;
 	spin_unlock(&tfcp_req->reqlock);
 
+	if (rport->targetport)
+		nvmet_fc_rcv_fcp_abort(rport->targetport,
+					&tfcp_req->tgt_fcp_req);
+
+finish:
 	/* post the aborted io completion */
 	fcpreq->status = -ECANCELED;
 	schedule_work(&inireq->iniwork);
-- 
cgit v1.2.3-55-g7522


From 6fda20283e55b9d288cd56822ce39fc8e64f2208 Mon Sep 17 00:00:00 2001
From: James Smart
Date: Wed, 29 Nov 2017 16:47:31 -0800
Subject: nvme_fcloop: disassocate local port structs

The current fcloop driver gets its lport structure from the private
area co-allocated with the fc_localport. All is fine except the
teardown path, which wants to wait on the completion, which is marked
complete by the delete_localport callback performed after
unregister_localport.  The issue is, the nvme_fc transport frees the
localport structure immediately after delete_localport is called,
meaning the original routine is trying to wait on a complete that
was just freed.

Change such that a lport struct is allocated coincident with the
addition and registration of a localport. The private area of the
localport now contains just a backpointer to the real lport struct.
Now, the completion can be waited for, and after completing, the
new structure can be kfree'd.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fcloop.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 3eb2a0733f46..c0080f6ab2f5 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -204,6 +204,10 @@ struct fcloop_lport {
 	struct completion unreg_done;
 };
 
+struct fcloop_lport_priv {
+	struct fcloop_lport *lport;
+};
+
 struct fcloop_rport {
 	struct nvme_fc_remote_port *remoteport;
 	struct nvmet_fc_target_port *targetport;
@@ -659,7 +663,8 @@ fcloop_nport_get(struct fcloop_nport *nport)
 static void
 fcloop_localport_delete(struct nvme_fc_local_port *localport)
 {
-	struct fcloop_lport *lport = localport->private;
+	struct fcloop_lport_priv *lport_priv = localport->private;
+	struct fcloop_lport *lport = lport_priv->lport;
 
 	/* release any threads waiting for the unreg to complete */
 	complete(&lport->unreg_done);
@@ -699,7 +704,7 @@ static struct nvme_fc_port_template fctemplate = {
 	.max_dif_sgl_segments	= FCLOOP_SGL_SEGS,
 	.dma_boundary		= FCLOOP_DMABOUND_4G,
 	/* sizes of additional private data for data structures */
-	.local_priv_sz		= sizeof(struct fcloop_lport),
+	.local_priv_sz		= sizeof(struct fcloop_lport_priv),
 	.remote_priv_sz		= sizeof(struct fcloop_rport),
 	.lsrqst_priv_sz		= sizeof(struct fcloop_lsreq),
 	.fcprqst_priv_sz	= sizeof(struct fcloop_ini_fcpreq),
@@ -730,11 +735,17 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
 	struct fcloop_ctrl_options *opts;
 	struct nvme_fc_local_port *localport;
 	struct fcloop_lport *lport;
-	int ret;
+	struct fcloop_lport_priv *lport_priv;
+	unsigned long flags;
+	int ret = -ENOMEM;
+
+	lport = kzalloc(sizeof(*lport), GFP_KERNEL);
+	if (!lport)
+		return -ENOMEM;
 
 	opts = kzalloc(sizeof(*opts), GFP_KERNEL);
 	if (!opts)
-		return -ENOMEM;
+		goto out_free_lport;
 
 	ret = fcloop_parse_options(opts, buf);
 	if (ret)
@@ -754,23 +765,25 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
 
 	ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
 	if (!ret) {
-		unsigned long flags;
-
 		/* success */
-		lport = localport->private;
+		lport_priv = localport->private;
+		lport_priv->lport = lport;
+
 		lport->localport = localport;
 		INIT_LIST_HEAD(&lport->lport_list);
 
 		spin_lock_irqsave(&fcloop_lock, flags);
 		list_add_tail(&lport->lport_list, &fcloop_lports);
 		spin_unlock_irqrestore(&fcloop_lock, flags);
-
-		/* mark all of the input buffer consumed */
-		ret = count;
 	}
 
 out_free_opts:
 	kfree(opts);
+out_free_lport:
+	/* free only if we're going to fail */
+	if (ret)
+		kfree(lport);
+
 	return ret ? ret : count;
 }
 
@@ -792,6 +805,8 @@ __wait_localport_unreg(struct fcloop_lport *lport)
 
 	wait_for_completion(&lport->unreg_done);
 
+	kfree(lport);
+
 	return ret;
 }
 
-- 
cgit v1.2.3-55-g7522


From 24431d60d3fbfd4c8c05e1828e5d9b35db4fd81c Mon Sep 17 00:00:00 2001
From: James Smart
Date: Wed, 29 Nov 2017 16:47:32 -0800
Subject: nvme_fcloop: rework to remove xxx_IN_ISR feature flags

The existing fcloop driver expects the target side upcalls to
the transport to context switch, thus the calls into the nvmet layer
are not done in the calling context of the host/initiator down calls.
The xxx_IN_ISR feature flags are used to select this logic.

The xxx_IN_ISR feature flags should go away in the nvmet_fc transport
as no other lldd utilizes them. Both Broadcom and Cavium lldds have their
own non-ISR deferred handlers thus the nvmet calls can be made directly.

This patch converts the paths that make the target upcalls (command
receive, abort receive) such that they schedule a work item rather
than expecting the transport to schedule the work item.

The patch also cleans up the following:
- The completion path from target to host scheduled a host work
  element called "work". Rename it "tio_done_work" for code clarity.
- The abort io path called a iniwork item to call the host side
  io done. This is no longer needed as the abort routine can make
  the same call.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fcloop.c | 98 ++++++++++++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 35 deletions(-)

diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index c0080f6ab2f5..c5015199c031 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -249,14 +249,15 @@ struct fcloop_fcpreq {
 	u16				status;
 	bool				active;
 	bool				aborted;
-	struct work_struct		work;
+	struct work_struct		fcp_rcv_work;
+	struct work_struct		abort_rcv_work;
+	struct work_struct		tio_done_work;
 	struct nvmefc_tgt_fcp_req	tgt_fcp_req;
 };
 
 struct fcloop_ini_fcpreq {
 	struct nvmefc_fcp_req		*fcpreq;
 	struct fcloop_fcpreq		*tfcp_req;
-	struct work_struct		iniwork;
 };
 
 static inline struct fcloop_lsreq *
@@ -347,17 +348,58 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
 	return 0;
 }
 
-/*
- * FCP IO operation done by initiator abort.
- * call back up initiator "done" flows.
- */
 static void
-fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+fcloop_fcp_recv_work(struct work_struct *work)
+{
+	struct fcloop_fcpreq *tfcp_req =
+		container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
+	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+	struct fcloop_ini_fcpreq *inireq = NULL;
+	int ret = 0;
+
+	ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+				&tfcp_req->tgt_fcp_req,
+				fcpreq->cmdaddr, fcpreq->cmdlen);
+	if (ret) {
+		inireq = fcpreq->private;
+		inireq->tfcp_req = NULL;
+
+		fcpreq->status = tfcp_req->status;
+		fcpreq->done(fcpreq);
+	}
+}
+
+static void
+fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
+			struct fcloop_fcpreq *tfcp_req, int status)
+{
+	struct fcloop_ini_fcpreq *inireq = NULL;
+
+	if (fcpreq) {
+		inireq = fcpreq->private;
+		inireq->tfcp_req = NULL;
+
+		fcpreq->status = status;
+		fcpreq->done(fcpreq);
+	}
+}
+
+static void
+fcloop_fcp_abort_recv_work(struct work_struct *work)
 {
-	struct fcloop_ini_fcpreq *inireq =
-		container_of(work, struct fcloop_ini_fcpreq, iniwork);
+	struct fcloop_fcpreq *tfcp_req =
+		container_of(work, struct fcloop_fcpreq, abort_rcv_work);
+	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+
+	if (tfcp_req->tport->targetport)
+		nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
+					&tfcp_req->tgt_fcp_req);
+
+	spin_lock(&tfcp_req->reqlock);
+	tfcp_req->fcpreq = NULL;
+	spin_unlock(&tfcp_req->reqlock);
 
-	inireq->fcpreq->done(inireq->fcpreq);
+	fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
 }
 
 /*
@@ -368,8 +410,7 @@ static void
 fcloop_tgt_fcprqst_done_work(struct work_struct *work)
 {
 	struct fcloop_fcpreq *tfcp_req =
-		container_of(work, struct fcloop_fcpreq, work);
-	struct fcloop_tport *tport = tfcp_req->tport;
+		container_of(work, struct fcloop_fcpreq, tio_done_work);
 	struct nvmefc_fcp_req *fcpreq;
 
 	spin_lock(&tfcp_req->reqlock);
@@ -377,10 +418,7 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
 	tfcp_req->fcpreq = NULL;
 	spin_unlock(&tfcp_req->reqlock);
 
-	if (tport->remoteport && fcpreq) {
-		fcpreq->status = tfcp_req->status;
-		fcpreq->done(fcpreq);
-	}
+	fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
 
 	kfree(tfcp_req);
 }
@@ -395,7 +433,6 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
 	struct fcloop_rport *rport = remoteport->private;
 	struct fcloop_ini_fcpreq *inireq = fcpreq->private;
 	struct fcloop_fcpreq *tfcp_req;
-	int ret = 0;
 
 	if (!rport->targetport)
 		return -ECONNREFUSED;
@@ -406,16 +443,16 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
 
 	inireq->fcpreq = fcpreq;
 	inireq->tfcp_req = tfcp_req;
-	INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
 	tfcp_req->fcpreq = fcpreq;
 	tfcp_req->tport = rport->targetport->private;
 	spin_lock_init(&tfcp_req->reqlock);
-	INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+	INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
+	INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
+	INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
 
-	ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
-				 fcpreq->cmdaddr, fcpreq->cmdlen);
+	schedule_work(&tfcp_req->fcp_rcv_work);
 
-	return ret;
+	return 0;
 }
 
 static void
@@ -594,7 +631,7 @@ fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
 {
 	struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
 
-	schedule_work(&tfcp_req->work);
+	schedule_work(&tfcp_req->tio_done_work);
 }
 
 static void
@@ -610,13 +647,12 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
 			void *hw_queue_handle,
 			struct nvmefc_fcp_req *fcpreq)
 {
-	struct fcloop_rport *rport = remoteport->private;
 	struct fcloop_ini_fcpreq *inireq = fcpreq->private;
 	struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
 
 	if (!tfcp_req)
 		/* abort has already been called */
-		goto finish;
+		return;
 
 	/* break initiator/target relationship for io */
 	spin_lock(&tfcp_req->reqlock);
@@ -624,14 +660,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
 	tfcp_req->fcpreq = NULL;
 	spin_unlock(&tfcp_req->reqlock);
 
-	if (rport->targetport)
-		nvmet_fc_rcv_fcp_abort(rport->targetport,
-					&tfcp_req->tgt_fcp_req);
-
-finish:
-	/* post the aborted io completion */
-	fcpreq->status = -ECANCELED;
-	schedule_work(&inireq->iniwork);
+	WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
 }
 
 static void
@@ -721,8 +750,7 @@ static struct nvmet_fc_target_template tgttemplate = {
 	.max_dif_sgl_segments	= FCLOOP_SGL_SEGS,
 	.dma_boundary		= FCLOOP_DMABOUND_4G,
 	/* optional features */
-	.target_features	= NVMET_FCTGTFEAT_CMD_IN_ISR |
-				  NVMET_FCTGTFEAT_OPDONE_IN_ISR,
+	.target_features	= 0,
 	/* sizes of additional private data for data structures */
 	.target_priv_sz		= sizeof(struct fcloop_tport),
 };
-- 
cgit v1.2.3-55-g7522


From b6f807738b5e3a24eda3ea6864abc18d10279e69 Mon Sep 17 00:00:00 2001
From: James Smart
Date: Wed, 29 Nov 2017 16:47:33 -0800
Subject: nvme_fcloop: refactor host/target io job access

The split between what the host accesses on its flows vs what the
target side accesses was flawed. Abort handling didn't properly
clear initiator vs target structure cross-reference and locks
weren't used for synchronization. Thus, there were issues of
freeing structures too soon and access after free.

A couple of these existed pre the IN_ISR mods, but when the
target upcalls were converted to work items, thus adding delays
between the 2 sides of accesses, the problems became pronounced.

Resolve by:
- tracking io state mainly in the tgt-side io structure.
- make the tgt-side io structure released by reference not by
  code flow.
- when changing initiator structures, use locks for
  synchronization
- aborts are clearly tracked for which side saw the abort, and
  after seeing the abort, cross-references are cleared under lock.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fcloop.c | 147 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 125 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index c5015199c031..9f8a6726df91 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -242,13 +242,22 @@ struct fcloop_lsreq {
 	int				status;
 };
 
+enum {
+	INI_IO_START		= 0,
+	INI_IO_ACTIVE		= 1,
+	INI_IO_ABORTED		= 2,
+	INI_IO_COMPLETED	= 3,
+};
+
 struct fcloop_fcpreq {
 	struct fcloop_tport		*tport;
 	struct nvmefc_fcp_req		*fcpreq;
 	spinlock_t			reqlock;
 	u16				status;
+	u32				inistate;
 	bool				active;
 	bool				aborted;
+	struct kref			ref;
 	struct work_struct		fcp_rcv_work;
 	struct work_struct		abort_rcv_work;
 	struct work_struct		tio_done_work;
@@ -258,6 +267,7 @@ struct fcloop_fcpreq {
 struct fcloop_ini_fcpreq {
 	struct nvmefc_fcp_req		*fcpreq;
 	struct fcloop_fcpreq		*tfcp_req;
+	spinlock_t			inilock;
 };
 
 static inline struct fcloop_lsreq *
@@ -349,24 +359,24 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
 }
 
 static void
-fcloop_fcp_recv_work(struct work_struct *work)
+fcloop_tfcp_req_free(struct kref *ref)
 {
 	struct fcloop_fcpreq *tfcp_req =
-		container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
-	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
-	struct fcloop_ini_fcpreq *inireq = NULL;
-	int ret = 0;
+		container_of(ref, struct fcloop_fcpreq, ref);
 
-	ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
-				&tfcp_req->tgt_fcp_req,
-				fcpreq->cmdaddr, fcpreq->cmdlen);
-	if (ret) {
-		inireq = fcpreq->private;
-		inireq->tfcp_req = NULL;
+	kfree(tfcp_req);
+}
 
-		fcpreq->status = tfcp_req->status;
-		fcpreq->done(fcpreq);
-	}
+static void
+fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
+{
+	kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
+}
+
+static int
+fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
+{
+	return kref_get_unless_zero(&tfcp_req->ref);
 }
 
 static void
@@ -377,11 +387,52 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
 
 	if (fcpreq) {
 		inireq = fcpreq->private;
+		spin_lock(&inireq->inilock);
 		inireq->tfcp_req = NULL;
+		spin_unlock(&inireq->inilock);
 
 		fcpreq->status = status;
 		fcpreq->done(fcpreq);
 	}
+
+	/* release original io reference on tgt struct */
+	fcloop_tfcp_req_put(tfcp_req);
+}
+
+static void
+fcloop_fcp_recv_work(struct work_struct *work)
+{
+	struct fcloop_fcpreq *tfcp_req =
+		container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
+	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+	int ret = 0;
+	bool aborted = false;
+
+	spin_lock(&tfcp_req->reqlock);
+	switch (tfcp_req->inistate) {
+	case INI_IO_START:
+		tfcp_req->inistate = INI_IO_ACTIVE;
+		break;
+	case INI_IO_ABORTED:
+		aborted = true;
+		break;
+	default:
+		spin_unlock(&tfcp_req->reqlock);
+		WARN_ON(1);
+		return;
+	}
+	spin_unlock(&tfcp_req->reqlock);
+
+	if (unlikely(aborted))
+		ret = -ECANCELED;
+	else
+		ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+				&tfcp_req->tgt_fcp_req,
+				fcpreq->cmdaddr, fcpreq->cmdlen);
+	if (ret)
+		fcloop_call_host_done(fcpreq, tfcp_req, ret);
+
+	return;
 }
 
 static void
@@ -389,7 +440,29 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
 {
 	struct fcloop_fcpreq *tfcp_req =
 		container_of(work, struct fcloop_fcpreq, abort_rcv_work);
-	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+	struct nvmefc_fcp_req *fcpreq;
+	bool completed = false;
+
+	spin_lock(&tfcp_req->reqlock);
+	fcpreq = tfcp_req->fcpreq;
+	switch (tfcp_req->inistate) {
+	case INI_IO_ABORTED:
+		break;
+	case INI_IO_COMPLETED:
+		completed = true;
+		break;
+	default:
+		spin_unlock(&tfcp_req->reqlock);
+		WARN_ON(1);
+		return;
+	}
+	spin_unlock(&tfcp_req->reqlock);
+
+	if (unlikely(completed)) {
+		/* remove reference taken in original abort downcall */
+		fcloop_tfcp_req_put(tfcp_req);
+		return;
+	}
 
 	if (tfcp_req->tport->targetport)
 		nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
@@ -400,6 +473,7 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
 	spin_unlock(&tfcp_req->reqlock);
 
 	fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
+	/* call_host_done releases reference for abort downcall */
 }
 
 /*
@@ -415,12 +489,10 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
 
 	spin_lock(&tfcp_req->reqlock);
 	fcpreq = tfcp_req->fcpreq;
-	tfcp_req->fcpreq = NULL;
+	tfcp_req->inistate = INI_IO_COMPLETED;
 	spin_unlock(&tfcp_req->reqlock);
 
 	fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
-
-	kfree(tfcp_req);
 }
 
 
@@ -443,12 +515,16 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
 
 	inireq->fcpreq = fcpreq;
 	inireq->tfcp_req = tfcp_req;
+	spin_lock_init(&inireq->inilock);
+
 	tfcp_req->fcpreq = fcpreq;
 	tfcp_req->tport = rport->targetport->private;
+	tfcp_req->inistate = INI_IO_START;
 	spin_lock_init(&tfcp_req->reqlock);
 	INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
 	INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
 	INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
+	kref_init(&tfcp_req->ref);
 
 	schedule_work(&tfcp_req->fcp_rcv_work);
 
@@ -648,7 +724,14 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
 			struct nvmefc_fcp_req *fcpreq)
 {
 	struct fcloop_ini_fcpreq *inireq = fcpreq->private;
-	struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+	struct fcloop_fcpreq *tfcp_req;
+	bool abortio = true;
+
+	spin_lock(&inireq->inilock);
+	tfcp_req = inireq->tfcp_req;
+	if (tfcp_req)
+		fcloop_tfcp_req_get(tfcp_req);
+	spin_unlock(&inireq->inilock);
 
 	if (!tfcp_req)
 		/* abort has already been called */
@@ -656,11 +739,31 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
 
 	/* break initiator/target relationship for io */
 	spin_lock(&tfcp_req->reqlock);
-	inireq->tfcp_req = NULL;
-	tfcp_req->fcpreq = NULL;
+	switch (tfcp_req->inistate) {
+	case INI_IO_START:
+	case INI_IO_ACTIVE:
+		tfcp_req->inistate = INI_IO_ABORTED;
+		break;
+	case INI_IO_COMPLETED:
+		abortio = false;
+		break;
+	default:
+		spin_unlock(&tfcp_req->reqlock);
+		WARN_ON(1);
+		return;
+	}
 	spin_unlock(&tfcp_req->reqlock);
 
-	WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
+	if (abortio)
+		/* leave the reference while the work item is scheduled */
+		WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
+	else  {
+		/*
+		 * as the io has already had the done callback made,
+		 * nothing more to do. So release the reference taken above
+		 */
+		fcloop_tfcp_req_put(tfcp_req);
+	}
 }
 
 static void
-- 
cgit v1.2.3-55-g7522


From 9ce1f2e12e017607fe17a67cea79ebcf0184e5b3 Mon Sep 17 00:00:00 2001
From: James Smart
Date: Wed, 29 Nov 2017 15:11:55 -0800
Subject: nvmet-fc: cleanup nvmet add_port/remove_port

The current fc transport add_port routine validates that there is a
matching port to the target port config. It then takes a reference
on the targetport. The del_port removes the reference.

Unfortunately, if the LLDD undergoes a hw reset or driver unload and
wants to unreg the targetport, due to the reference, the targetport
effectively can't be removed. It requires the admin to remove the
port from the nvmet config first, which calls the del_port.
Note: it appears nvmetcli clear skips over the del_port call (I'm
not attempting to change that).

There's no real reason to take the reference. With FC, there is nothing
to enable or disable as the presence of the FC targetport implicitly
means its enabled, and removal of the targtport means its disabled.

Change add_port to simply validate and change remove_port to a noop.
No references are taken on the targetport.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fc.c | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 840d1a39de33..9b39a6cb1935 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -2490,14 +2490,8 @@ nvmet_fc_add_port(struct nvmet_port *port)
 	list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
 		if ((tgtport->fc_target_port.node_name == traddr.nn) &&
 		    (tgtport->fc_target_port.port_name == traddr.pn)) {
-			/* a FC port can only be 1 nvmet port id */
-			if (!tgtport->port) {
-				tgtport->port = port;
-				port->priv = tgtport;
-				nvmet_fc_tgtport_get(tgtport);
-				ret = 0;
-			} else
-				ret = -EALREADY;
+			tgtport->port = port;
+			ret = 0;
 			break;
 		}
 	}
@@ -2508,19 +2502,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
 static void
 nvmet_fc_remove_port(struct nvmet_port *port)
 {
-	struct nvmet_fc_tgtport *tgtport = port->priv;
-	unsigned long flags;
-	bool matched = false;
-
-	spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
-	if (tgtport->port == port) {
-		matched = true;
-		tgtport->port = NULL;
-	}
-	spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
-
-	if (matched)
-		nvmet_fc_tgtport_put(tgtport);
+	/* nothing to do */
 }
 
 static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {
-- 
cgit v1.2.3-55-g7522


From 0de5cd367c6aa2a31a1c931628f778f79f8ef22e Mon Sep 17 00:00:00 2001
From: Roy Shterman
Date: Mon, 25 Dec 2017 14:18:30 +0200
Subject: nvme-fabrics: protect against module unload during create_ctrl

NVMe transport driver module unload may (and usually does) trigger
iteration over the active controllers and delete them all (sometimes
under a mutex).  However, a controller can be created concurrently with
module unload which can lead to leakage of resources (most important char
device node leakage) in case the controller creation occured after the
unload delete and drain sequence.  To protect against this, we take a
module reference to guarantee that the nvme transport driver is not
unloaded while creating a controller.

Signed-off-by: Roy Shterman <roys@lightbitslabs.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fabrics.c | 17 +++++++++++++----
 drivers/nvme/host/fabrics.h |  2 ++
 drivers/nvme/host/fc.c      |  1 +
 drivers/nvme/host/rdma.c    |  1 +
 drivers/nvme/target/loop.c  |  1 +
 5 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 76b4fe6816a0..2f68befd31bf 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -492,7 +492,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
  */
 int nvmf_register_transport(struct nvmf_transport_ops *ops)
 {
-	if (!ops->create_ctrl)
+	if (!ops->create_ctrl || !ops->module)
 		return -EINVAL;
 
 	down_write(&nvmf_transports_rwsem);
@@ -868,32 +868,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
 		goto out_unlock;
 	}
 
+	if (!try_module_get(ops->module)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
 	ret = nvmf_check_required_opts(opts, ops->required_opts);
 	if (ret)
-		goto out_unlock;
+		goto out_module_put;
 	ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
 				ops->allowed_opts | ops->required_opts);
 	if (ret)
-		goto out_unlock;
+		goto out_module_put;
 
 	ctrl = ops->create_ctrl(dev, opts);
 	if (IS_ERR(ctrl)) {
 		ret = PTR_ERR(ctrl);
-		goto out_unlock;
+		goto out_module_put;
 	}
 
 	if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
 		dev_warn(ctrl->device,
 			"controller returned incorrect NQN: \"%s\".\n",
 			ctrl->subsys->subnqn);
+		module_put(ops->module);
 		up_read(&nvmf_transports_rwsem);
 		nvme_delete_ctrl_sync(ctrl);
 		return ERR_PTR(-EINVAL);
 	}
 
+	module_put(ops->module);
 	up_read(&nvmf_transports_rwsem);
 	return ctrl;
 
+out_module_put:
+	module_put(ops->module);
 out_unlock:
 	up_read(&nvmf_transports_rwsem);
 out_free_opts:
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 9ba614953607..25b19f722f5b 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -108,6 +108,7 @@ struct nvmf_ctrl_options {
  *			       fabric implementation of NVMe fabrics.
  * @entry:		Used by the fabrics library to add the new
  *			registration entry to its linked-list internal tree.
+ * @module:             Transport module reference
  * @name:		Name of the NVMe fabric driver implementation.
  * @required_opts:	sysfs command-line options that must be specified
  *			when adding a new NVMe controller.
@@ -126,6 +127,7 @@ struct nvmf_ctrl_options {
  */
 struct nvmf_transport_ops {
 	struct list_head	entry;
+	struct module		*module;
 	const char		*name;
 	int			required_opts;
 	int			allowed_opts;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 0a8af4daef89..2a7a9a75105d 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3381,6 +3381,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
 
 static struct nvmf_transport_ops nvme_fc_transport = {
 	.name		= "fc",
+	.module		= THIS_MODULE,
 	.required_opts	= NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
 	.allowed_opts	= NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
 	.create_ctrl	= nvme_fc_create_ctrl,
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 37af56596be6..75d6956eb380 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2006,6 +2006,7 @@ out_free_ctrl:
 
 static struct nvmf_transport_ops nvme_rdma_transport = {
 	.name		= "rdma",
+	.module		= THIS_MODULE,
 	.required_opts	= NVMF_OPT_TRADDR,
 	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
 			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 1e21b286f299..fdfcc961029f 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -686,6 +686,7 @@ static struct nvmet_fabrics_ops nvme_loop_ops = {
 
 static struct nvmf_transport_ops nvme_loop_transport = {
 	.name		= "loop",
+	.module		= THIS_MODULE,
 	.create_ctrl	= nvme_loop_create_ctrl,
 };
 
-- 
cgit v1.2.3-55-g7522


From 6fbcde6691b514faa963c60f5537332530f1bf0a Mon Sep 17 00:00:00 2001
From: Minwoo Im
Date: Tue, 5 Dec 2017 05:23:54 +0900
Subject: nvme-pci: remove an unnecessary initialization in HMB code

The local variable __size__ will be set a bit later in a for-loop.
Remove the explicit initialization at the beginning of this function.

Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f5800c3c9082..35331fa0013c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1770,7 +1770,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 	dma_addr_t descs_dma;
 	int i = 0;
 	void **bufs;
-	u64 size = 0, tmp;
+	u64 size, tmp;
 
 	tmp = (preferred + chunk_size - 1);
 	do_div(tmp, chunk_size);
@@ -1853,7 +1853,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
 	u64 min = (u64)dev->ctrl.hmmin * 4096;
 	u32 enable_bits = NVME_HOST_MEM_ENABLE;
-	int ret = 0;
+	int ret;
 
 	preferred = min(preferred, max);
 	if (min > max) {
-- 
cgit v1.2.3-55-g7522


From eca19dc1d84d924544dda0c8d2fd4bb4131affeb Mon Sep 17 00:00:00 2001
From: Israel Rukshin
Date: Mon, 13 Nov 2017 12:29:40 +0000
Subject: nvmet: fix error flow in nvmet_alloc_ctrl()

Remove the allocated id on error.

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b54748ad5f48..07eb45d32a7a 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -830,7 +830,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 		/* Don't accept keep-alive timeout for discovery controllers */
 		if (kato) {
 			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
-			goto out_free_sqs;
+			goto out_remove_ida;
 		}
 
 		/*
@@ -860,6 +860,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	*ctrlp = ctrl;
 	return 0;
 
+out_remove_ida:
+	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
 out_free_sqs:
 	kfree(ctrl->sqs);
 out_free_cqs:
-- 
cgit v1.2.3-55-g7522


From 6b1943af3f4329c814ec7a651121746d08e6c9ee Mon Sep 17 00:00:00 2001
From: Israel Rukshin
Date: Mon, 13 Nov 2017 12:29:41 +0000
Subject: nvmet: rearrange nvmet_ctrl_free()

Make it symmetric to nvmet_alloc_ctrl().

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 07eb45d32a7a..7282ea8d3b96 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -879,21 +879,22 @@ static void nvmet_ctrl_free(struct kref *ref)
 	struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
 	struct nvmet_subsys *subsys = ctrl->subsys;
 
-	nvmet_stop_keep_alive_timer(ctrl);
-
 	mutex_lock(&subsys->lock);
 	list_del(&ctrl->subsys_entry);
 	mutex_unlock(&subsys->lock);
 
+	nvmet_stop_keep_alive_timer(ctrl);
+
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fatal_err_work);
 
 	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
-	nvmet_subsys_put(subsys);
 
 	kfree(ctrl->sqs);
 	kfree(ctrl->cqs);
 	kfree(ctrl);
+
+	nvmet_subsys_put(subsys);
 }
 
 void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
-- 
cgit v1.2.3-55-g7522


From 4caff8fc19f10ffb06f095a9cf5a9e755377112e Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Sun, 31 Dec 2017 14:01:19 +0200
Subject: nvme-pci: don't open-code nvme_reset_ctrl

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 35331fa0013c..add7b18d825d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2498,10 +2498,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result)
 		goto release_pools;
 
-	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
-	queue_work(nvme_wq, &dev->ctrl.reset_work);
+	nvme_reset_ctrl(&dev->ctrl);
+
 	return 0;
 
  release_pools:
-- 
cgit v1.2.3-55-g7522


From 1a3838d732eaae47385490de88d978d4132d3d84 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Sun, 31 Dec 2017 15:33:27 +0200
Subject: nvme: modify the debug level for setting shutdown timeout

When an NVMe controller reports RTD3 Entry Latency larger than the value
of shutdown_timeout module parameter, we update the shutdown_timeout
accordingly to honor RTD3 Entry Latency. Use an informational debug level
instead of a warning level for it.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f837d666cbd4..2a69d735efbc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2252,7 +2252,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 						 shutdown_timeout, 60);
 
 		if (ctrl->shutdown_timeout != shutdown_timeout)
-			dev_warn(ctrl->device,
+			dev_info(ctrl->device,
 				 "Shutdown timeout set to %u seconds\n",
 				 ctrl->shutdown_timeout);
 	} else
-- 
cgit v1.2.3-55-g7522


From 2b1b7e784a63f5ded4dda804e05e3f34b3880b25 Mon Sep 17 00:00:00 2001
From: Jianchao Wang
Date: Sat, 6 Jan 2018 08:01:58 +0800
Subject: nvme-pci: fix NULL pointer reference in nvme_alloc_ns

When the io queues setup or tagset allocation failed, ctrl.tagset is
NULL.  But the scan work will still be queued and executed, then panic
comes up due to NULL pointer reference of ctrl.tagset.

To fix this, add a new ctrl state NVME_CTRL_ADMIN_ONLY to inidcate only
admin queue is live. When non io queues or tagset allocation failed, ctrl
enters into this state, scan work will not be started.  But async event
work and nvme dev ioctl will be still available.  This will be helpful to
do further investigation and recovery.

Suggested-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 25 ++++++++++++++++++++++---
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 30 +++++++++++++++++++++---------
 3 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2a69d735efbc..609307ca9e4d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -232,6 +232,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 
 	old_state = ctrl->state;
 	switch (new_state) {
+	case NVME_CTRL_ADMIN_ONLY:
+		switch (old_state) {
+		case NVME_CTRL_RESETTING:
+			changed = true;
+			/* FALLTHRU */
+		default:
+			break;
+		}
+		break;
 	case NVME_CTRL_LIVE:
 		switch (old_state) {
 		case NVME_CTRL_NEW:
@@ -247,6 +256,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		switch (old_state) {
 		case NVME_CTRL_NEW:
 		case NVME_CTRL_LIVE:
+		case NVME_CTRL_ADMIN_ONLY:
 			changed = true;
 			/* FALLTHRU */
 		default:
@@ -266,6 +276,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	case NVME_CTRL_DELETING:
 		switch (old_state) {
 		case NVME_CTRL_LIVE:
+		case NVME_CTRL_ADMIN_ONLY:
 		case NVME_CTRL_RESETTING:
 		case NVME_CTRL_RECONNECTING:
 			changed = true;
@@ -2336,8 +2347,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
 	struct nvme_ctrl *ctrl =
 		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
 
-	if (ctrl->state != NVME_CTRL_LIVE)
+	switch (ctrl->state) {
+	case NVME_CTRL_LIVE:
+	case NVME_CTRL_ADMIN_ONLY:
+		break;
+	default:
 		return -EWOULDBLOCK;
+	}
+
 	file->private_data = ctrl;
 	return 0;
 }
@@ -2601,6 +2618,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
 	static const char *const state_name[] = {
 		[NVME_CTRL_NEW]		= "new",
 		[NVME_CTRL_LIVE]	= "live",
+		[NVME_CTRL_ADMIN_ONLY]	= "only-admin",
 		[NVME_CTRL_RESETTING]	= "resetting",
 		[NVME_CTRL_RECONNECTING]= "reconnecting",
 		[NVME_CTRL_DELETING]	= "deleting",
@@ -3073,6 +3091,8 @@ static void nvme_scan_work(struct work_struct *work)
 	if (ctrl->state != NVME_CTRL_LIVE)
 		return;
 
+	WARN_ON_ONCE(!ctrl->tagset);
+
 	if (nvme_identify_ctrl(ctrl, &id))
 		return;
 
@@ -3093,8 +3113,7 @@ static void nvme_scan_work(struct work_struct *work)
 void nvme_queue_scan(struct nvme_ctrl *ctrl)
 {
 	/*
-	 * Do not queue new scan work when a controller is reset during
-	 * removal.
+	 * Only new queue scan work when admin and IO queues are both alive
 	 */
 	if (ctrl->state == NVME_CTRL_LIVE)
 		queue_work(nvme_wq, &ctrl->scan_work);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ea1aa5283e8e..eecf71ce6e75 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -119,6 +119,7 @@ static inline struct nvme_request *nvme_req(struct request *req)
 enum nvme_ctrl_state {
 	NVME_CTRL_NEW,
 	NVME_CTRL_LIVE,
+	NVME_CTRL_ADMIN_ONLY,    /* Only admin queue live */
 	NVME_CTRL_RESETTING,
 	NVME_CTRL_RECONNECTING,
 	NVME_CTRL_DELETING,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index add7b18d825d..62119078c2bf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2035,13 +2035,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
 }
 
 /*
- * Return: error value if an error occurred setting up the queues or calling
- * Identify Device.  0 if these succeeded, even if adding some of the
- * namespaces failed.  At the moment, these failures are silent.  TBD which
- * failures should be reported.
+ * return error value only when tagset allocation failed
  */
 static int nvme_dev_add(struct nvme_dev *dev)
 {
+	int ret;
+
 	if (!dev->ctrl.tagset) {
 		dev->tagset.ops = &nvme_mq_ops;
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
@@ -2057,8 +2056,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
 		dev->tagset.driver_data = dev;
 
-		if (blk_mq_alloc_tag_set(&dev->tagset))
-			return 0;
+		ret = blk_mq_alloc_tag_set(&dev->tagset);
+		if (ret) {
+			dev_warn(dev->ctrl.device,
+				"IO queues tagset allocation failed %d\n", ret);
+			return ret;
+		}
 		dev->ctrl.tagset = &dev->tagset;
 
 		nvme_dbbuf_set(dev);
@@ -2291,6 +2294,7 @@ static void nvme_reset_work(struct work_struct *work)
 		container_of(work, struct nvme_dev, ctrl.reset_work);
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
 	int result = -ENODEV;
+	enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
 
 	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
 		goto out;
@@ -2354,15 +2358,23 @@ static void nvme_reset_work(struct work_struct *work)
 		dev_warn(dev->ctrl.device, "IO queues not created\n");
 		nvme_kill_queues(&dev->ctrl);
 		nvme_remove_namespaces(&dev->ctrl);
+		new_state = NVME_CTRL_ADMIN_ONLY;
 	} else {
 		nvme_start_queues(&dev->ctrl);
 		nvme_wait_freeze(&dev->ctrl);
-		nvme_dev_add(dev);
+		/* hit this only when allocate tagset fails */
+		if (nvme_dev_add(dev))
+			new_state = NVME_CTRL_ADMIN_ONLY;
 		nvme_unfreeze(&dev->ctrl);
 	}
 
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
-		dev_warn(dev->ctrl.device, "failed to mark controller live\n");
+	/*
+	 * If only admin queue live, keep it to do further investigation or
+	 * recovery.
+	 */
+	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
+		dev_warn(dev->ctrl.device,
+			"failed to mark controller state %d\n", new_state);
 		goto out;
 	}
 
-- 
cgit v1.2.3-55-g7522


From 85088c4a0f65f0be25a98164ec6bca02ac5cad04 Mon Sep 17 00:00:00 2001
From: Nitzan Carmi
Date: Thu, 4 Jan 2018 17:56:13 +0200
Subject: nvme: take refcount on transport module

The block device is backed by the transport so we must ensure that the
transport driver will not be removed until all references are released.
Otherwise, we might end up referencing freed memory.

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 609307ca9e4d..c8bcfe64e976 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1228,16 +1228,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode)
 #ifdef CONFIG_NVME_MULTIPATH
 	/* should never be called due to GENHD_FL_HIDDEN */
 	if (WARN_ON_ONCE(ns->head->disk))
-		return -ENXIO;
+		goto fail;
 #endif
 	if (!kref_get_unless_zero(&ns->kref))
-		return -ENXIO;
+		goto fail;
+	if (!try_module_get(ns->ctrl->ops->module))
+		goto fail_put_ns;
+
 	return 0;
+
+fail_put_ns:
+	nvme_put_ns(ns);
+fail:
+	return -ENXIO;
 }
 
 static void nvme_release(struct gendisk *disk, fmode_t mode)
 {
-	nvme_put_ns(disk->private_data);
+	struct nvme_ns *ns = disk->private_data;
+
+	module_put(ns->ctrl->ops->module);
+	nvme_put_ns(ns);
 }
 
 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-- 
cgit v1.2.3-55-g7522


From b837b28394fb76993c28bb242db7061ee0417da6 Mon Sep 17 00:00:00 2001
From: Israel Rukshin
Date: Thu, 4 Jan 2018 17:56:14 +0200
Subject: nvme: fix subsystem multiple controllers support check

There is a problem when another module (e.g. nvmet) takes a reference on
the nvme block device and the physical nvme drive is removed.  In that
case nvme_free_ctrl() will not be called and the controller state will be
"deleting" or "dead" unless nvmet module releases the block device.
Later on, the same nvme drive probes back and nvme_init_subsystem() will
be called and fail due to duplicate subnqn (if the nvme device doesn't
support subsystem with multiple controllers). This will cause a probe
failure.  This commit changes the check of multiple controllers support
at nvme_init_subsystem() by not counting all the controllers at "dead" or
"deleting" state (this is safe because controllers at this state will
never be active again).

Fixes: ab9e00cc72fa ("nvme: track subsystems")
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c8bcfe64e976..2bcd49584f71 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2069,6 +2069,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = {
 	NULL,
 };
 
+static int nvme_active_ctrls(struct nvme_subsystem *subsys)
+{
+	int count = 0;
+	struct nvme_ctrl *ctrl;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (ctrl->state != NVME_CTRL_DELETING &&
+		    ctrl->state != NVME_CTRL_DEAD)
+			count++;
+	}
+	mutex_unlock(&subsys->lock);
+
+	return count;
+}
+
 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 {
 	struct nvme_subsystem *subsys, *found;
@@ -2107,7 +2123,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 		 * Verify that the subsystem actually supports multiple
 		 * controllers, else bail out.
 		 */
-		if (!(id->cmic & (1 << 1))) {
+		if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
 			dev_err(ctrl->device,
 				"ignoring ctrl due to duplicate subnqn (%s).\n",
 				found->subnqn);
-- 
cgit v1.2.3-55-g7522