From fab7772bfbcfe8fb8e3e352a6a8fcaf044cded17 Mon Sep 17 00:00:00 2001
From: Anthony Iliopoulos
Date: Mon, 29 Jul 2019 14:40:40 +0200
Subject: nvme-multipath: revalidate nvme_ns_head gendisk in nvme_validate_ns

When CONFIG_NVME_MULTIPATH is set, only the hidden gendisk associated
with the per-controller ns is run through revalidate_disk when a
rescan is triggered, while the visible blockdev never gets its size
(bdev->bd_inode->i_size) updated to reflect any capacity changes that
may have occurred.

This prevents online resizing of nvme block devices and in extension of
any filesystems atop that will are unable to expand while mounted, as
userspace relies on the blockdev size for obtaining the disk capacity
(via BLKGETSIZE/64 ioctls).

Fix this by explicitly revalidating the actual namespace gendisk in
addition to the per-controller gendisk, when multipath is enabled.

Signed-off-by: Anthony Iliopoulos <ailiopoulos@suse.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8f3fbe5ca937..80c7a7ee240b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1715,6 +1715,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	if (ns->head->disk) {
 		nvme_update_disk_info(ns->head->disk, ns, id);
 		blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
+		revalidate_disk(ns->head->disk);
 	}
 #endif
 }
-- 
cgit v1.2.3-55-g7522


From 3aed86731ee2b23e4dc4d2c6d943d33992cd551b Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe
Date: Wed, 31 Jul 2019 17:35:31 -0600
Subject: nvmet: Fix use-after-free bug when a port is removed

When a port is removed through configfs, any connected controllers
are still active and can still send commands. This causes a
use-after-free bug which is detected by KASAN for any admin command
that dereferences req->port (like in nvmet_execute_identify_ctrl).

To fix this, disconnect all active controllers when a subsystem is
removed from a port. This ensures there are no active controllers
when the port is eventually removed.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by : Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/target/configfs.c |  1 +
 drivers/nvme/target/core.c     | 12 ++++++++++++
 drivers/nvme/target/nvmet.h    |  3 +++
 3 files changed, 16 insertions(+)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index cd52b9f15376..98613a45bd3b 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -675,6 +675,7 @@ static void nvmet_port_subsys_drop_link(struct config_item *parent,
 
 found:
 	list_del(&p->entry);
+	nvmet_port_del_ctrls(port, subsys);
 	nvmet_port_disc_changed(port, subsys);
 
 	if (list_empty(&port->subsystems))
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index dad0243c7c96..b86a23aa9020 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -280,6 +280,18 @@ void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
 }
 EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
 
+void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
+{
+	struct nvmet_ctrl *ctrl;
+
+	mutex_lock(&subsys->lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		if (ctrl->port == port)
+			ctrl->ops->delete_ctrl(ctrl);
+	}
+	mutex_unlock(&subsys->lock);
+}
+
 int nvmet_enable_port(struct nvmet_port *port)
 {
 	const struct nvmet_fabrics_ops *ops;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 6ee66c610739..c51f8dd01dc4 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -418,6 +418,9 @@ void nvmet_port_send_ana_event(struct nvmet_port *port);
 int nvmet_register_transport(const struct nvmet_fabrics_ops *ops);
 void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops);
 
+void nvmet_port_del_ctrls(struct nvmet_port *port,
+			  struct nvmet_subsys *subsys);
+
 int nvmet_enable_port(struct nvmet_port *port);
 void nvmet_disable_port(struct nvmet_port *port);
 
-- 
cgit v1.2.3-55-g7522


From 86b9a63e595ff03f9d0a7b92b6acc231fecefc29 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe
Date: Wed, 31 Jul 2019 17:35:32 -0600
Subject: nvmet-loop: Flush nvme_delete_wq when removing the port

After calling nvme_loop_delete_ctrl(), the controllers will not
yet be deleted because nvme_delete_ctrl() only schedules work
to do the delete.

This means a race can occur if a port is removed but there
are still active controllers trying to access that memory.

To fix this, flush the nvme_delete_wq before returning from
nvme_loop_remove_port() so that any controllers that might
be in the process of being deleted won't access a freed port.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by : Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/target/loop.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index b16dc3981c69..0940c5024a34 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -654,6 +654,14 @@ static void nvme_loop_remove_port(struct nvmet_port *port)
 	mutex_lock(&nvme_loop_ports_mutex);
 	list_del_init(&port->entry);
 	mutex_unlock(&nvme_loop_ports_mutex);
+
+	/*
+	 * Ensure any ctrls that are in the process of being
+	 * deleted are in fact deleted before we return
+	 * and free the port. This is to prevent active
+	 * ctrls from using a port after it's freed.
+	 */
+	flush_workqueue(nvme_delete_wq);
 }
 
 static const struct nvmet_fabrics_ops nvme_loop_ops = {
-- 
cgit v1.2.3-55-g7522


From cfc1a1af56200362d1508b82b9a3cc3acb2eae0c Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe
Date: Wed, 31 Jul 2019 17:35:33 -0600
Subject: nvmet-file: fix nvmet_file_flush() always returning an error

Presently, nvmet_file_flush() always returns a call to
errno_to_nvme_status() but that helper doesn't take into account the
case when errno=0. So nvmet_file_flush() always returns an error code.

All other callers of errno_to_nvme_status() check for success before
calling it.

To fix this, ensure errno_to_nvme_status() returns success if the
errno is zero. This should prevent future mistakes like this from
happening.

Fixes: c6aa3542e010 ("nvmet: add error log support for file backend")
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/target/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b86a23aa9020..3a67e244e568 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -46,6 +46,9 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
 	u16 status;
 
 	switch (errno) {
+	case 0:
+		status = NVME_SC_SUCCESS;
+		break;
 	case -ENOSPC:
 		req->error_loc = offsetof(struct nvme_rw_command, length);
 		status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
-- 
cgit v1.2.3-55-g7522


From 8c36e66fb407ce076535a7db98ab9f6d720b866a Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe
Date: Wed, 31 Jul 2019 17:35:34 -0600
Subject: nvme-core: Fix extra device_put() call on error path

In the error path for nvme_init_subsystem(), nvme_put_subsystem()
will call device_put(), but it will get called again after the
mutex_unlock().

The device_put() only needs to be called if device_add() fails.

This bug caused a KASAN use-after-free error when adding and
removing subsytems in a loop:

  BUG: KASAN: use-after-free in device_del+0x8d9/0x9a0
  Read of size 8 at addr ffff8883cdaf7120 by task multipathd/329

  CPU: 0 PID: 329 Comm: multipathd Not tainted 5.2.0-rc6-vmlocalyes-00019-g70a2b39005fd-dirty #314
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
  Call Trace:
   dump_stack+0x7b/0xb5
   print_address_description+0x6f/0x280
   ? device_del+0x8d9/0x9a0
   __kasan_report+0x148/0x199
   ? device_del+0x8d9/0x9a0
   ? class_release+0x100/0x130
   ? device_del+0x8d9/0x9a0
   kasan_report+0x12/0x20
   __asan_report_load8_noabort+0x14/0x20
   device_del+0x8d9/0x9a0
   ? device_platform_notify+0x70/0x70
   nvme_destroy_subsystem+0xf9/0x150
   nvme_free_ctrl+0x280/0x3a0
   device_release+0x72/0x1d0
   kobject_put+0x144/0x410
   put_device+0x13/0x20
   nvme_free_ns+0xc4/0x100
   nvme_release+0xb3/0xe0
   __blkdev_put+0x549/0x6e0
   ? kasan_check_write+0x14/0x20
   ? bd_set_size+0xb0/0xb0
   ? kasan_check_write+0x14/0x20
   ? mutex_lock+0x8f/0xe0
   ? __mutex_lock_slowpath+0x20/0x20
   ? locks_remove_file+0x239/0x370
   blkdev_put+0x72/0x2c0
   blkdev_close+0x8d/0xd0
   __fput+0x256/0x770
   ? _raw_read_lock_irq+0x40/0x40
   ____fput+0xe/0x10
   task_work_run+0x10c/0x180
   ? filp_close+0xf7/0x140
   exit_to_usermode_loop+0x151/0x170
   do_syscall_64+0x240/0x2e0
   ? prepare_exit_to_usermode+0xd5/0x190
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f5a79af05d7
  Code: 00 00 0f 05 48 3d 00 f0 ff ff 77 3f c3 66 0f 1f 44 00 00 53 89 fb 48 83 ec 10 e8 c4 fb ff ff 89 df 89 c2 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 2b 89 d7 89 44 24 0c e8 06 fc ff ff 8b 44 24
  RSP: 002b:00007f5a7799c810 EFLAGS: 00000293 ORIG_RAX: 0000000000000003
  RAX: 0000000000000000 RBX: 0000000000000008 RCX: 00007f5a79af05d7
  RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000008
  RBP: 00007f5a58000f98 R08: 0000000000000002 R09: 00007f5a7935ee80
  R10: 0000000000000000 R11: 0000000000000293 R12: 000055e432447240
  R13: 0000000000000000 R14: 0000000000000001 R15: 000055e4324a9cf0

  Allocated by task 1236:
   save_stack+0x21/0x80
   __kasan_kmalloc.constprop.6+0xab/0xe0
   kasan_kmalloc+0x9/0x10
   kmem_cache_alloc_trace+0x102/0x210
   nvme_init_identify+0x13c3/0x3820
   nvme_loop_configure_admin_queue+0x4fa/0x5e0
   nvme_loop_create_ctrl+0x469/0xf40
   nvmf_dev_write+0x19a3/0x21ab
   __vfs_write+0x66/0x120
   vfs_write+0x154/0x490
   ksys_write+0x104/0x240
   __x64_sys_write+0x73/0xb0
   do_syscall_64+0xa5/0x2e0
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

  Freed by task 329:
   save_stack+0x21/0x80
   __kasan_slab_free+0x129/0x190
   kasan_slab_free+0xe/0x10
   kfree+0xa7/0x200
   nvme_release_subsystem+0x49/0x60
   device_release+0x72/0x1d0
   kobject_put+0x144/0x410
   put_device+0x13/0x20
   klist_class_dev_put+0x31/0x40
   klist_put+0x8f/0xf0
   klist_del+0xe/0x10
   device_del+0x3a7/0x9a0
   nvme_destroy_subsystem+0xf9/0x150
   nvme_free_ctrl+0x280/0x3a0
   device_release+0x72/0x1d0
   kobject_put+0x144/0x410
   put_device+0x13/0x20
   nvme_free_ns+0xc4/0x100
   nvme_release+0xb3/0xe0
   __blkdev_put+0x549/0x6e0
   blkdev_put+0x72/0x2c0
   blkdev_close+0x8d/0xd0
   __fput+0x256/0x770
   ____fput+0xe/0x10
   task_work_run+0x10c/0x180
   exit_to_usermode_loop+0x151/0x170
   do_syscall_64+0x240/0x2e0
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: 32fd90c40768 ("nvme: change locking for the per-subsystem controller list")
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by : Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 80c7a7ee240b..e35f16b60fc9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2488,6 +2488,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 		if (ret) {
 			dev_err(ctrl->device,
 				"failed to register subsystem device.\n");
+			put_device(&subsys->dev);
 			goto out_unlock;
 		}
 		ida_init(&subsys->ns_ida);
@@ -2510,7 +2511,6 @@ out_put_subsystem:
 	nvme_put_subsystem(subsys);
 out_unlock:
 	mutex_unlock(&nvme_subsystems_lock);
-	put_device(&subsys->dev);
 	return ret;
 }
 
-- 
cgit v1.2.3-55-g7522


From b9156daeb1601d69007b7e50efcf89d69d72ec1d Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Wed, 31 Jul 2019 11:00:26 -0700
Subject: nvme: fix a possible deadlock when passthru commands sent to a
 multipath device

When the user issues a command with side effects, we will end up freezing
the namespace request queue when updating disk info (and the same for
the corresponding mpath disk node).

However, we are not freezing the mpath node request queue,
which means that mpath I/O can still come in and block on blk_queue_enter
(called from nvme_ns_head_make_request -> direct_make_request).

This is a deadlock, because blk_queue_enter will block until the inner
namespace request queue is unfroze, but that process is blocked because
the namespace revalidation is trying to update the mpath disk info
and freeze its request queue (which will never complete because
of the I/O that is blocked on blk_queue_enter).

Fix this by freezing all the subsystem nsheads request queues before
executing the passthru command. Given that these commands are infrequent
we should not worry about this temporary I/O freeze to keep things sane.

Here is the matching hang traces:
--
[ 374.465002] INFO: task systemd-udevd:17994 blocked for more than 122 seconds.
[ 374.472975] Not tainted 5.2.0-rc3-mpdebug+ #42
[ 374.478522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 374.487274] systemd-udevd D 0 17994 1 0x00000000
[ 374.493407] Call Trace:
[ 374.496145] __schedule+0x2ef/0x620
[ 374.500047] schedule+0x38/0xa0
[ 374.503569] blk_queue_enter+0x139/0x220
[ 374.507959] ? remove_wait_queue+0x60/0x60
[ 374.512540] direct_make_request+0x60/0x130
[ 374.517219] nvme_ns_head_make_request+0x11d/0x420 [nvme_core]
[ 374.523740] ? generic_make_request_checks+0x307/0x6f0
[ 374.529484] generic_make_request+0x10d/0x2e0
[ 374.534356] submit_bio+0x75/0x140
[ 374.538163] ? guard_bio_eod+0x32/0xe0
[ 374.542361] submit_bh_wbc+0x171/0x1b0
[ 374.546553] block_read_full_page+0x1ed/0x330
[ 374.551426] ? check_disk_change+0x70/0x70
[ 374.556008] ? scan_shadow_nodes+0x30/0x30
[ 374.560588] blkdev_readpage+0x18/0x20
[ 374.564783] do_read_cache_page+0x301/0x860
[ 374.569463] ? blkdev_writepages+0x10/0x10
[ 374.574037] ? prep_new_page+0x88/0x130
[ 374.578329] ? get_page_from_freelist+0xa2f/0x1280
[ 374.583688] ? __alloc_pages_nodemask+0x179/0x320
[ 374.588947] read_cache_page+0x12/0x20
[ 374.593142] read_dev_sector+0x2d/0xd0
[ 374.597337] read_lba+0x104/0x1f0
[ 374.601046] find_valid_gpt+0xfa/0x720
[ 374.605243] ? string_nocheck+0x58/0x70
[ 374.609534] ? find_valid_gpt+0x720/0x720
[ 374.614016] efi_partition+0x89/0x430
[ 374.618113] ? string+0x48/0x60
[ 374.621632] ? snprintf+0x49/0x70
[ 374.625339] ? find_valid_gpt+0x720/0x720
[ 374.629828] check_partition+0x116/0x210
[ 374.634214] rescan_partitions+0xb6/0x360
[ 374.638699] __blkdev_reread_part+0x64/0x70
[ 374.643377] blkdev_reread_part+0x23/0x40
[ 374.647860] blkdev_ioctl+0x48c/0x990
[ 374.651956] block_ioctl+0x41/0x50
[ 374.655766] do_vfs_ioctl+0xa7/0x600
[ 374.659766] ? locks_lock_inode_wait+0xb1/0x150
[ 374.664832] ksys_ioctl+0x67/0x90
[ 374.668539] __x64_sys_ioctl+0x1a/0x20
[ 374.672732] do_syscall_64+0x5a/0x1c0
[ 374.676828] entry_SYSCALL_64_after_hwframe+0x44/0xa9

[ 374.738474] INFO: task nvmeadm:49141 blocked for more than 123 seconds.
[ 374.745871] Not tainted 5.2.0-rc3-mpdebug+ #42
[ 374.751419] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 374.760170] nvmeadm D 0 49141 36333 0x00004080
[ 374.766301] Call Trace:
[ 374.769038] __schedule+0x2ef/0x620
[ 374.772939] schedule+0x38/0xa0
[ 374.776452] blk_mq_freeze_queue_wait+0x59/0x100
[ 374.781614] ? remove_wait_queue+0x60/0x60
[ 374.786192] blk_mq_freeze_queue+0x1a/0x20
[ 374.790773] nvme_update_disk_info.isra.57+0x5f/0x350 [nvme_core]
[ 374.797582] ? nvme_identify_ns.isra.50+0x71/0xc0 [nvme_core]
[ 374.804006] __nvme_revalidate_disk+0xe5/0x110 [nvme_core]
[ 374.810139] nvme_revalidate_disk+0xa6/0x120 [nvme_core]
[ 374.816078] ? nvme_submit_user_cmd+0x11e/0x320 [nvme_core]
[ 374.822299] nvme_user_cmd+0x264/0x370 [nvme_core]
[ 374.827661] nvme_dev_ioctl+0x112/0x1d0 [nvme_core]
[ 374.833114] do_vfs_ioctl+0xa7/0x600
[ 374.837117] ? __audit_syscall_entry+0xdd/0x130
[ 374.842184] ksys_ioctl+0x67/0x90
[ 374.845891] __x64_sys_ioctl+0x1a/0x20
[ 374.850082] do_syscall_64+0x5a/0x1c0
[ 374.854178] entry_SYSCALL_64_after_hwframe+0x44/0xa9
--

Reported-by: James Puthukattukaran <james.puthukattukaran@oracle.com>
Tested-by: James Puthukattukaran <james.puthukattukaran@oracle.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c      |  5 +++++
 drivers/nvme/host/multipath.c | 30 ++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h      | 12 ++++++++++++
 3 files changed, 47 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e35f16b60fc9..88b8dfd7928a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1286,6 +1286,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	 */
 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
 		mutex_lock(&ctrl->scan_lock);
+		mutex_lock(&ctrl->subsys->lock);
+		nvme_mpath_start_freeze(ctrl->subsys);
+		nvme_mpath_wait_freeze(ctrl->subsys);
 		nvme_start_freeze(ctrl);
 		nvme_wait_freeze(ctrl);
 	}
@@ -1316,6 +1319,8 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
 		nvme_update_formats(ctrl);
 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
 		nvme_unfreeze(ctrl);
+		nvme_mpath_unfreeze(ctrl->subsys);
+		mutex_unlock(&ctrl->subsys->lock);
 		mutex_unlock(&ctrl->scan_lock);
 	}
 	if (effects & NVME_CMD_EFFECTS_CCC)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 4f0d0d12744e..b34dcb2288e7 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -12,6 +12,36 @@ module_param(multipath, bool, 0444);
 MODULE_PARM_DESC(multipath,
 	"turn on native support for multiple controllers per subsystem");
 
+void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
+{
+	struct nvme_ns_head *h;
+
+	lockdep_assert_held(&subsys->lock);
+	list_for_each_entry(h, &subsys->nsheads, entry)
+		if (h->disk)
+			blk_mq_unfreeze_queue(h->disk->queue);
+}
+
+void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
+{
+	struct nvme_ns_head *h;
+
+	lockdep_assert_held(&subsys->lock);
+	list_for_each_entry(h, &subsys->nsheads, entry)
+		if (h->disk)
+			blk_mq_freeze_queue_wait(h->disk->queue);
+}
+
+void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
+{
+	struct nvme_ns_head *h;
+
+	lockdep_assert_held(&subsys->lock);
+	list_for_each_entry(h, &subsys->nsheads, entry)
+		if (h->disk)
+			blk_freeze_queue_start(h->disk->queue);
+}
+
 /*
  * If multipathing is enabled we need to always use the subsystem instance
  * number for numbering our devices to avoid conflicts between subsystems that
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 26b563f9985b..6b4fb67124c6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -490,6 +490,9 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 	return ctrl->ana_log_buf != NULL;
 }
 
+void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
+void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
+void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
@@ -568,6 +571,15 @@ static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
 static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 {
 }
+static inline void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
+{
+}
+static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
+{
+}
+static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
+{
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 #ifdef CONFIG_NVM
-- 
cgit v1.2.3-55-g7522


From d94211b8bad3787e0655a67284105f57db728cb1 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Fri, 26 Jul 2019 10:29:49 -0700
Subject: nvme-rdma: fix possible use-after-free in connect error flow

When start_queue fails, we need to make sure to drain the
queue cq before freeing the rdma resources because we might
still race with the completion path. Have start_queue() error
path safely stop the queue.

--
[30371.808111] nvme nvme1: Failed reconnect attempt 11
[30371.808113] nvme nvme1: Reconnecting in 10 seconds...
[...]
[30382.069315] nvme nvme1: creating 4 I/O queues.
[30382.257058] nvme nvme1: Connect Invalid SQE Parameter, qid 4
[30382.257061] nvme nvme1: failed to connect queue: 4 ret=386
[30382.305001] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
[30382.305022] IP: qedr_poll_cq+0x8a3/0x1170 [qedr]
[30382.305028] PGD 0 P4D 0
[30382.305037] Oops: 0000 [#1] SMP PTI
[...]
[30382.305153] Call Trace:
[30382.305166]  ? __switch_to_asm+0x34/0x70
[30382.305187]  __ib_process_cq+0x56/0xd0 [ib_core]
[30382.305201]  ib_poll_handler+0x26/0x70 [ib_core]
[30382.305213]  irq_poll_softirq+0x88/0x110
[30382.305223]  ? sort_range+0x20/0x20
[30382.305232]  __do_softirq+0xde/0x2c6
[30382.305241]  ? sort_range+0x20/0x20
[30382.305249]  run_ksoftirqd+0x1c/0x60
[30382.305258]  smpboot_thread_fn+0xef/0x160
[30382.305265]  kthread+0x113/0x130
[30382.305273]  ? kthread_create_worker_on_cpu+0x50/0x50
[30382.305281]  ret_from_fork+0x35/0x40
--

Reported-by: Nicolas Morey-Chaisemartin <NMoreyChaisemartin@suse.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/rdma.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index a249db528d54..1a6449bc547b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -562,13 +562,17 @@ out_destroy_cm_id:
 	return ret;
 }
 
+static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
+{
+	rdma_disconnect(queue->cm_id);
+	ib_drain_qp(queue->qp);
+}
+
 static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
 {
 	if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
 		return;
-
-	rdma_disconnect(queue->cm_id);
-	ib_drain_qp(queue->qp);
+	__nvme_rdma_stop_queue(queue);
 }
 
 static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
@@ -607,11 +611,13 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
 	else
 		ret = nvmf_connect_admin_queue(&ctrl->ctrl);
 
-	if (!ret)
+	if (!ret) {
 		set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
-	else
+	} else {
+		__nvme_rdma_stop_queue(queue);
 		dev_info(ctrl->ctrl.device,
 			"failed to connect queue: %d ret=%d\n", idx, ret);
+	}
 	return ret;
 }
 
-- 
cgit v1.2.3-55-g7522


From 0157ec8dad3c8fc9bc9790f76e0831ffdaf2e7f0 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Thu, 25 Jul 2019 11:56:57 -0700
Subject: nvme: fix controller removal race with scan work

With multipath enabled, nvme_scan_work() can read from the device
(through nvme_mpath_add_disk()) and hang [1]. However, with fabrics,
once ctrl->state is set to NVME_CTRL_DELETING, the reads will hang
(see nvmf_check_ready()) and the mpath stack device make_request
will block if head->list is not empty. However, when the head->list
consistst of only DELETING/DEAD controllers, we should actually not
block, but rather fail immediately.

In addition, before we go ahead and remove the namespaces, make sure
to clear the current path and kick the requeue list so that the
request will fast fail upon requeuing.

[1]:
--
  INFO: task kworker/u4:3:166 blocked for more than 120 seconds.
        Not tainted 5.2.0-rc6-vmlocalyes-00005-g808c8c2dc0cf #316
  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  kworker/u4:3    D    0   166      2 0x80004000
  Workqueue: nvme-wq nvme_scan_work
  Call Trace:
   __schedule+0x851/0x1400
   schedule+0x99/0x210
   io_schedule+0x21/0x70
   do_read_cache_page+0xa57/0x1330
   read_cache_page+0x4a/0x70
   read_dev_sector+0xbf/0x380
   amiga_partition+0xc4/0x1230
   check_partition+0x30f/0x630
   rescan_partitions+0x19a/0x980
   __blkdev_get+0x85a/0x12f0
   blkdev_get+0x2a5/0x790
   __device_add_disk+0xe25/0x1250
   device_add_disk+0x13/0x20
   nvme_mpath_set_live+0x172/0x2b0
   nvme_update_ns_ana_state+0x130/0x180
   nvme_set_ns_ana_state+0x9a/0xb0
   nvme_parse_ana_log+0x1c3/0x4a0
   nvme_mpath_add_disk+0x157/0x290
   nvme_validate_ns+0x1017/0x1bd0
   nvme_scan_work+0x44d/0x6a0
   process_one_work+0x7d7/0x1240
   worker_thread+0x8e/0xff0
   kthread+0x2c3/0x3b0
   ret_from_fork+0x35/0x40

   INFO: task kworker/u4:1:1034 blocked for more than 120 seconds.
        Not tainted 5.2.0-rc6-vmlocalyes-00005-g808c8c2dc0cf #316
  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  kworker/u4:1    D    0  1034      2 0x80004000
  Workqueue: nvme-delete-wq nvme_delete_ctrl_work
  Call Trace:
   __schedule+0x851/0x1400
   schedule+0x99/0x210
   schedule_timeout+0x390/0x830
   wait_for_completion+0x1a7/0x310
   __flush_work+0x241/0x5d0
   flush_work+0x10/0x20
   nvme_remove_namespaces+0x85/0x3d0
   nvme_do_delete_ctrl+0xb4/0x1e0
   nvme_delete_ctrl_work+0x15/0x20
   process_one_work+0x7d7/0x1240
   worker_thread+0x8e/0xff0
   kthread+0x2c3/0x3b0
   ret_from_fork+0x35/0x40
--

Reported-by: Logan Gunthorpe <logang@deltatee.com>
Tested-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c      |  7 +++++++
 drivers/nvme/host/multipath.c | 46 +++++++++++++++++++++++++++++++++++++------
 drivers/nvme/host/nvme.h      |  9 +++++++--
 3 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 88b8dfd7928a..c258a1ce4b28 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3577,6 +3577,13 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(ns_list);
 
+	/*
+	 * make sure to requeue I/O to all namespaces as these
+	 * might result from the scan itself and must complete
+	 * for the scan_work to make progress
+	 */
+	nvme_mpath_clear_ctrl_paths(ctrl);
+
 	/* prevent racing with ns scanning */
 	flush_work(&ctrl->scan_work);
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index b34dcb2288e7..888d4543894e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -134,18 +134,34 @@ static const char *nvme_ana_state_names[] = {
 	[NVME_ANA_CHANGE]		= "change",
 };
 
-void nvme_mpath_clear_current_path(struct nvme_ns *ns)
+bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
 	struct nvme_ns_head *head = ns->head;
+	bool changed = false;
 	int node;
 
 	if (!head)
-		return;
+		goto out;
 
 	for_each_node(node) {
-		if (ns == rcu_access_pointer(head->current_path[node]))
+		if (ns == rcu_access_pointer(head->current_path[node])) {
 			rcu_assign_pointer(head->current_path[node], NULL);
+			changed = true;
+		}
 	}
+out:
+	return changed;
+}
+
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->scan_lock);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		if (nvme_mpath_clear_current_path(ns))
+			kblockd_schedule_work(&ns->head->requeue_work);
+	mutex_unlock(&ctrl->scan_lock);
 }
 
 static bool nvme_path_is_disabled(struct nvme_ns *ns)
@@ -256,6 +272,24 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	return ns;
 }
 
+static bool nvme_available_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *ns;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		switch (ns->ctrl->state) {
+		case NVME_CTRL_LIVE:
+		case NVME_CTRL_RESETTING:
+		case NVME_CTRL_CONNECTING:
+			/* fallthru */
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
 static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 		struct bio *bio)
 {
@@ -282,14 +316,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 				      disk_devt(ns->head->disk),
 				      bio->bi_iter.bi_sector);
 		ret = direct_make_request(bio);
-	} else if (!list_empty_careful(&head->list)) {
-		dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");
+	} else if (nvme_available_path(head)) {
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
 
 		spin_lock_irq(&head->requeue_lock);
 		bio_list_add(&head->requeue_list, bio);
 		spin_unlock_irq(&head->requeue_lock);
 	} else {
-		dev_warn_ratelimited(dev, "no path - failing I/O\n");
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
 
 		bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 6b4fb67124c6..778b3a0b6adb 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -503,7 +503,8 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
-void nvme_mpath_clear_current_path(struct nvme_ns *ns);
+bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
 struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
 
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
@@ -551,7 +552,11 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
 static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 }
-static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
+static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
+{
+	return false;
+}
+static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 }
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
-- 
cgit v1.2.3-55-g7522


From bd46a90634302bfe791e93ad5496f98f165f7ae0 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Mon, 29 Jul 2019 16:34:52 -0600
Subject: nvme-pci: Fix async probe remove race

Ensure the controller is not in the NEW state when nvme_probe() exits.
This will always allow a subsequent nvme_remove() to set the state to
DELETING, fixing a potential race between the initial asynchronous probe
and device removal.

Reported-by: Li Zhong <lizhongfs@gmail.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index db160cee42ad..0c2c4b0c6655 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2695,7 +2695,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie)
 {
 	struct nvme_dev *dev = data;
 
-	nvme_reset_ctrl_sync(&dev->ctrl);
+	flush_work(&dev->ctrl.reset_work);
 	flush_work(&dev->ctrl.scan_work);
 	nvme_put_ctrl(&dev->ctrl);
 }
@@ -2761,6 +2761,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	nvme_reset_ctrl(&dev->ctrl);
 	nvme_get_ctrl(&dev->ctrl);
 	async_schedule(nvme_async_probe, dev);
 
-- 
cgit v1.2.3-55-g7522


From e26cc08265dda37d2acc8394604f220ef412299d Mon Sep 17 00:00:00 2001
From: zhengbin
Date: Mon, 12 Aug 2019 20:36:55 +0800
Subject: blk-mq: move cancel of requeue_work to the front of blk_exit_queue

blk_exit_queue will free elevator_data, while blk_mq_requeue_work
will access it. Move cancel of requeue_work to the front of
blk_exit_queue to avoid use-after-free.

blk_exit_queue                blk_mq_requeue_work
  __elevator_exit               blk_mq_run_hw_queues
    blk_mq_exit_sched             blk_mq_run_hw_queue
      dd_exit_queue                 blk_mq_hctx_has_pending
        kfree(elevator_data)          blk_mq_sched_has_work
                                        dd_has_work

Fixes: fbc2a15e3433 ("blk-mq: move cancel of requeue_work into blk_mq_release")
Cc: stable@vger.kernel.org
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: zhengbin <zhengbin13@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c    | 2 --
 block/blk-sysfs.c | 3 +++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f78d3287dd82..a8e6a58f5f28 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2666,8 +2666,6 @@ void blk_mq_release(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx, *next;
 	int i;
 
-	cancel_delayed_work_sync(&q->requeue_work);
-
 	queue_for_each_hw_ctx(q, hctx, i)
 		WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 977c659dcd18..9bfa3ea4ed63 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -892,6 +892,9 @@ static void __blk_release_queue(struct work_struct *work)
 
 	blk_free_queue_stats(q->stats);
 
+	if (queue_is_mq(q))
+		cancel_delayed_work_sync(&q->requeue_work);
+
 	blk_exit_queue(q);
 
 	blk_queue_free_zone_bitmaps(q);
-- 
cgit v1.2.3-55-g7522


From ae78ca3cf3d9e9f914bfcd0bc5c389ff18b9c2e0 Mon Sep 17 00:00:00 2001
From: Wenwen Wang
Date: Sun, 11 Aug 2019 12:23:22 -0500
Subject: xen/blkback: fix memory leaks

In read_per_ring_refs(), after 'req' and related memory regions are
allocated, xen_blkif_map() is invoked to map the shared frame, irq, and
etc. However, if this mapping process fails, no cleanup is performed,
leading to memory leaks. To fix this issue, invoke the cleanup before
returning the error.

Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Wenwen Wang <wenwen@cs.uga.edu>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/xen-blkback/xenbus.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 3ac6a5d18071..b90dbcd99c03 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -965,6 +965,7 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
 		}
 	}
 
+	err = -ENOMEM;
 	for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
 		req = kzalloc(sizeof(*req), GFP_KERNEL);
 		if (!req)
@@ -987,7 +988,7 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
 	err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
 	if (err) {
 		xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
-		return err;
+		goto fail;
 	}
 
 	return 0;
@@ -1007,8 +1008,7 @@ fail:
 		}
 		kfree(req);
 	}
-	return -ENOMEM;
-
+	return err;
 }
 
 static int connect_ring(struct backend_info *be)
-- 
cgit v1.2.3-55-g7522


From 99c79f6692ccdc42e04deea8a36e22bb48168a62 Mon Sep 17 00:00:00 2001
From: Aleix Roca Nonell
Date: Thu, 15 Aug 2019 14:03:22 +0200
Subject: io_uring: fix manual setup of iov_iter for fixed buffers

Commit bd11b3a391e3 ("io_uring: don't use iov_iter_advance() for fixed
buffers") introduced an optimization to avoid using the slow
iov_iter_advance by manually populating the iov_iter iterator in some
cases.

However, the computation of the iterator count field was erroneous: The
first bvec was always accounted for an extent of page size even if the
bvec length was smaller.

In consequence, some I/O operations on fixed buffers were unable to
operate on the full extent of the buffer, consistently skipping some
bytes at the end of it.

Fixes: bd11b3a391e3 ("io_uring: don't use iov_iter_advance() for fixed buffers")
Cc: stable@vger.kernel.org
Signed-off-by: Aleix Roca Nonell <aleix.rocanonell@bsc.es>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d542f1cf4428..aa25b5bbd4ae 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1097,10 +1097,8 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 
 			iter->bvec = bvec + seg_skip;
 			iter->nr_segs -= seg_skip;
-			iter->count -= (seg_skip << PAGE_SHIFT);
+			iter->count -= bvec->bv_len + offset;
 			iter->iov_offset = offset & ~PAGE_MASK;
-			if (iter->iov_offset)
-				iter->count -= iter->iov_offset;
 		}
 	}
 
-- 
cgit v1.2.3-55-g7522


From 7b6620d7db566a46f49b4b9deab9fa061fd4b59b Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 15 Aug 2019 11:09:16 -0600
Subject: block: remove REQ_NOWAIT_INLINE

We had a few issues with this code, and there's still a problem around
how we deal with error handling for chained/split bios. For now, just
revert the code and we'll try again with a thoroug solution. This
reverts commits:

e15c2ffa1091 ("block: fix O_DIRECT error handling for bio fragments")
0eb6ddfb865c ("block: Fix __blkdev_direct_IO() for bio fragments")
6a43074e2f46 ("block: properly handle IOCB_NOWAIT for async O_DIRECT IO")
893a1c97205a ("blk-mq: allow REQ_NOWAIT to return an error inline")

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c            |  8 ++------
 fs/block_dev.c            | 49 +++++------------------------------------------
 include/linux/blk_types.h |  5 +----
 3 files changed, 8 insertions(+), 54 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a8e6a58f5f28..0835f4d8d42e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1958,13 +1958,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	rq = blk_mq_get_request(q, bio, &data);
 	if (unlikely(!rq)) {
 		rq_qos_cleanup(q, bio);
-
-		cookie = BLK_QC_T_NONE;
-		if (bio->bi_opf & REQ_NOWAIT_INLINE)
-			cookie = BLK_QC_T_EAGAIN;
-		else if (bio->bi_opf & REQ_NOWAIT)
+		if (bio->bi_opf & REQ_NOWAIT)
 			bio_wouldblock_error(bio);
-		return cookie;
+		return BLK_QC_T_NONE;
 	}
 
 	trace_block_getrq(q, bio, bio->bi_opf);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index eb657ab94060..677cb364d33f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -345,24 +345,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	struct bio *bio;
 	bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
 	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
-	bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0;
 	loff_t pos = iocb->ki_pos;
 	blk_qc_t qc = BLK_QC_T_NONE;
-	gfp_t gfp;
-	int ret;
+	int ret = 0;
 
 	if ((pos | iov_iter_alignment(iter)) &
 	    (bdev_logical_block_size(bdev) - 1))
 		return -EINVAL;
 
-	if (nowait)
-		gfp = GFP_NOWAIT;
-	else
-		gfp = GFP_KERNEL;
-
-	bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool);
-	if (!bio)
-		return -EAGAIN;
+	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
 
 	dio = container_of(bio, struct blkdev_dio, bio);
 	dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@ -384,7 +375,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	if (!is_poll)
 		blk_start_plug(&plug);
 
-	ret = 0;
 	for (;;) {
 		bio_set_dev(bio, bdev);
 		bio->bi_iter.bi_sector = pos >> 9;
@@ -409,14 +399,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			task_io_account_write(bio->bi_iter.bi_size);
 		}
 
-		/*
-		 * Tell underlying layer to not block for resource shortage.
-		 * And if we would have blocked, return error inline instead
-		 * of through the bio->bi_end_io() callback.
-		 */
-		if (nowait)
-			bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);
-
+		dio->size += bio->bi_iter.bi_size;
 		pos += bio->bi_iter.bi_size;
 
 		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
@@ -428,13 +411,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 				polled = true;
 			}
 
-			dio->size += bio->bi_iter.bi_size;
 			qc = submit_bio(bio);
-			if (qc == BLK_QC_T_EAGAIN) {
-				dio->size -= bio->bi_iter.bi_size;
-				ret = -EAGAIN;
-				goto error;
-			}
 
 			if (polled)
 				WRITE_ONCE(iocb->ki_cookie, qc);
@@ -455,19 +432,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			atomic_inc(&dio->ref);
 		}
 
-		dio->size += bio->bi_iter.bi_size;
-		qc = submit_bio(bio);
-		if (qc == BLK_QC_T_EAGAIN) {
-			dio->size -= bio->bi_iter.bi_size;
-			ret = -EAGAIN;
-			goto error;
-		}
-
-		bio = bio_alloc(gfp, nr_pages);
-		if (!bio) {
-			ret = -EAGAIN;
-			goto error;
-		}
+		submit_bio(bio);
+		bio = bio_alloc(GFP_KERNEL, nr_pages);
 	}
 
 	if (!is_poll)
@@ -487,7 +453,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	}
 	__set_current_state(TASK_RUNNING);
 
-out:
 	if (!ret)
 		ret = blk_status_to_errno(dio->bio.bi_status);
 	if (likely(!ret))
@@ -495,10 +460,6 @@ out:
 
 	bio_put(&dio->bio);
 	return ret;
-error:
-	if (!is_poll)
-		blk_finish_plug(&plug);
-	goto out;
 }
 
 static ssize_t
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1b1fa1557e68..feff3fe4467e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,7 +311,6 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,           /* Don't wait if request will block */
-	__REQ_NOWAIT_INLINE,	/* Return would-block error inline */
 	/*
 	 * When a shared kthread needs to issue a bio for a cgroup, doing
 	 * so synchronously can lead to priority inversions as the kthread
@@ -346,7 +345,6 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
-#define REQ_NOWAIT_INLINE	(1ULL << __REQ_NOWAIT_INLINE)
 #define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
 
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
@@ -420,13 +418,12 @@ static inline int op_stat_group(unsigned int op)
 
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE		-1U
-#define BLK_QC_T_EAGAIN		-2U
 #define BLK_QC_T_SHIFT		16
 #define BLK_QC_T_INTERNAL	(1U << 31)
 
 static inline bool blk_qc_t_valid(blk_qc_t cookie)
 {
-	return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN;
+	return cookie != BLK_QC_T_NONE;
 }
 
 static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)
-- 
cgit v1.2.3-55-g7522


From a982eeb09b6030e567b8b815277c8c9197168040 Mon Sep 17 00:00:00 2001
From: Jackie Liu
Date: Wed, 14 Aug 2019 17:35:22 +0800
Subject: io_uring: fix an issue when IOSQE_IO_LINK is inserted into defer list

This patch may fix two issues:

First, when IOSQE_IO_DRAIN set, the next IOs need to be inserted into
defer list to delay execution, but link io will be actively scheduled to
run by calling io_queue_sqe.

Second, when multiple LINK_IOs are inserted together with defer_list,
the LINK_IO is no longer keep order.

   |-------------|
   |   LINK_IO   |      ----> insert to defer_list  -----------
   |-------------|                                            |
   |   LINK_IO   |      ----> insert to defer_list  ----------|
   |-------------|                                            |
   |   LINK_IO   |      ----> insert to defer_list  ----------|
   |-------------|                                            |
   |   NORMAL_IO |      ----> insert to defer_list  ----------|
   |-------------|                                            |
                                                              |
                              queue_work at same time   <-----|

Fixes: 9e645e1105c ("io_uring: add support for sqe links")
Signed-off-by: Jackie Liu <liuyun01@kylinos.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index aa25b5bbd4ae..24bbe3cb7ad4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2023,6 +2023,15 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	int ret;
 
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED) {
+			io_free_req(req);
+			io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		}
+		return 0;
+	}
+
 	ret = __io_submit_sqe(ctx, req, s, true);
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
@@ -2095,13 +2104,6 @@ err:
 		return;
 	}
 
-	ret = io_req_defer(ctx, req, s->sqe);
-	if (ret) {
-		if (ret != -EIOCBQUEUED)
-			goto err_req;
-		return;
-	}
-
 	/*
 	 * If we already have a head request, queue this one for async
 	 * submittal once the head completes. If we don't have a head but
-- 
cgit v1.2.3-55-g7522