From fab7772bfbcfe8fb8e3e352a6a8fcaf044cded17 Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Mon, 29 Jul 2019 14:40:40 +0200 Subject: nvme-multipath: revalidate nvme_ns_head gendisk in nvme_validate_ns When CONFIG_NVME_MULTIPATH is set, only the hidden gendisk associated with the per-controller ns is run through revalidate_disk when a rescan is triggered, while the visible blockdev never gets its size (bdev->bd_inode->i_size) updated to reflect any capacity changes that may have occurred. This prevents online resizing of nvme block devices and in extension of any filesystems atop that will are unable to expand while mounted, as userspace relies on the blockdev size for obtaining the disk capacity (via BLKGETSIZE/64 ioctls). Fix this by explicitly revalidating the actual namespace gendisk in addition to the per-controller gendisk, when multipath is enabled. Signed-off-by: Anthony Iliopoulos Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8f3fbe5ca937..80c7a7ee240b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1715,6 +1715,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); blk_queue_stack_limits(ns->head->disk->queue, ns->queue); + revalidate_disk(ns->head->disk); } #endif } -- cgit v1.2.3-55-g7522 From 3aed86731ee2b23e4dc4d2c6d943d33992cd551b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 31 Jul 2019 17:35:31 -0600 Subject: nvmet: Fix use-after-free bug when a port is removed When a port is removed through configfs, any connected controllers are still active and can still send commands. This causes a use-after-free bug which is detected by KASAN for any admin command that dereferences req->port (like in nvmet_execute_identify_ctrl). To fix this, disconnect all active controllers when a subsystem is removed from a port. This ensures there are no active controllers when the port is eventually removed. Signed-off-by: Logan Gunthorpe Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by : Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/target/configfs.c | 1 + drivers/nvme/target/core.c | 12 ++++++++++++ drivers/nvme/target/nvmet.h | 3 +++ 3 files changed, 16 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index cd52b9f15376..98613a45bd3b 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -675,6 +675,7 @@ static void nvmet_port_subsys_drop_link(struct config_item *parent, found: list_del(&p->entry); + nvmet_port_del_ctrls(port, subsys); nvmet_port_disc_changed(port, subsys); if (list_empty(&port->subsystems)) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index dad0243c7c96..b86a23aa9020 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -280,6 +280,18 @@ void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops) } EXPORT_SYMBOL_GPL(nvmet_unregister_transport); +void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys) +{ + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->port == port) + ctrl->ops->delete_ctrl(ctrl); + } + mutex_unlock(&subsys->lock); +} + int nvmet_enable_port(struct nvmet_port *port) { const struct nvmet_fabrics_ops *ops; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 6ee66c610739..c51f8dd01dc4 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -418,6 +418,9 @@ void nvmet_port_send_ana_event(struct nvmet_port *port); int nvmet_register_transport(const struct nvmet_fabrics_ops *ops); void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops); +void nvmet_port_del_ctrls(struct nvmet_port *port, + struct nvmet_subsys *subsys); + int nvmet_enable_port(struct nvmet_port *port); void nvmet_disable_port(struct nvmet_port *port); -- cgit v1.2.3-55-g7522 From 86b9a63e595ff03f9d0a7b92b6acc231fecefc29 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 31 Jul 2019 17:35:32 -0600 Subject: nvmet-loop: Flush nvme_delete_wq when removing the port After calling nvme_loop_delete_ctrl(), the controllers will not yet be deleted because nvme_delete_ctrl() only schedules work to do the delete. This means a race can occur if a port is removed but there are still active controllers trying to access that memory. To fix this, flush the nvme_delete_wq before returning from nvme_loop_remove_port() so that any controllers that might be in the process of being deleted won't access a freed port. Signed-off-by: Logan Gunthorpe Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by : Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/target/loop.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index b16dc3981c69..0940c5024a34 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -654,6 +654,14 @@ static void nvme_loop_remove_port(struct nvmet_port *port) mutex_lock(&nvme_loop_ports_mutex); list_del_init(&port->entry); mutex_unlock(&nvme_loop_ports_mutex); + + /* + * Ensure any ctrls that are in the process of being + * deleted are in fact deleted before we return + * and free the port. This is to prevent active + * ctrls from using a port after it's freed. + */ + flush_workqueue(nvme_delete_wq); } static const struct nvmet_fabrics_ops nvme_loop_ops = { -- cgit v1.2.3-55-g7522 From cfc1a1af56200362d1508b82b9a3cc3acb2eae0c Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 31 Jul 2019 17:35:33 -0600 Subject: nvmet-file: fix nvmet_file_flush() always returning an error Presently, nvmet_file_flush() always returns a call to errno_to_nvme_status() but that helper doesn't take into account the case when errno=0. So nvmet_file_flush() always returns an error code. All other callers of errno_to_nvme_status() check for success before calling it. To fix this, ensure errno_to_nvme_status() returns success if the errno is zero. This should prevent future mistakes like this from happening. Fixes: c6aa3542e010 ("nvmet: add error log support for file backend") Signed-off-by: Logan Gunthorpe Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/target/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index b86a23aa9020..3a67e244e568 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -46,6 +46,9 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) u16 status; switch (errno) { + case 0: + status = NVME_SC_SUCCESS; + break; case -ENOSPC: req->error_loc = offsetof(struct nvme_rw_command, length); status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; -- cgit v1.2.3-55-g7522 From 8c36e66fb407ce076535a7db98ab9f6d720b866a Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 31 Jul 2019 17:35:34 -0600 Subject: nvme-core: Fix extra device_put() call on error path In the error path for nvme_init_subsystem(), nvme_put_subsystem() will call device_put(), but it will get called again after the mutex_unlock(). The device_put() only needs to be called if device_add() fails. This bug caused a KASAN use-after-free error when adding and removing subsytems in a loop: BUG: KASAN: use-after-free in device_del+0x8d9/0x9a0 Read of size 8 at addr ffff8883cdaf7120 by task multipathd/329 CPU: 0 PID: 329 Comm: multipathd Not tainted 5.2.0-rc6-vmlocalyes-00019-g70a2b39005fd-dirty #314 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Call Trace: dump_stack+0x7b/0xb5 print_address_description+0x6f/0x280 ? device_del+0x8d9/0x9a0 __kasan_report+0x148/0x199 ? device_del+0x8d9/0x9a0 ? class_release+0x100/0x130 ? device_del+0x8d9/0x9a0 kasan_report+0x12/0x20 __asan_report_load8_noabort+0x14/0x20 device_del+0x8d9/0x9a0 ? device_platform_notify+0x70/0x70 nvme_destroy_subsystem+0xf9/0x150 nvme_free_ctrl+0x280/0x3a0 device_release+0x72/0x1d0 kobject_put+0x144/0x410 put_device+0x13/0x20 nvme_free_ns+0xc4/0x100 nvme_release+0xb3/0xe0 __blkdev_put+0x549/0x6e0 ? kasan_check_write+0x14/0x20 ? bd_set_size+0xb0/0xb0 ? kasan_check_write+0x14/0x20 ? mutex_lock+0x8f/0xe0 ? __mutex_lock_slowpath+0x20/0x20 ? locks_remove_file+0x239/0x370 blkdev_put+0x72/0x2c0 blkdev_close+0x8d/0xd0 __fput+0x256/0x770 ? _raw_read_lock_irq+0x40/0x40 ____fput+0xe/0x10 task_work_run+0x10c/0x180 ? filp_close+0xf7/0x140 exit_to_usermode_loop+0x151/0x170 do_syscall_64+0x240/0x2e0 ? prepare_exit_to_usermode+0xd5/0x190 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f5a79af05d7 Code: 00 00 0f 05 48 3d 00 f0 ff ff 77 3f c3 66 0f 1f 44 00 00 53 89 fb 48 83 ec 10 e8 c4 fb ff ff 89 df 89 c2 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 2b 89 d7 89 44 24 0c e8 06 fc ff ff 8b 44 24 RSP: 002b:00007f5a7799c810 EFLAGS: 00000293 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000000000000008 RCX: 00007f5a79af05d7 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000008 RBP: 00007f5a58000f98 R08: 0000000000000002 R09: 00007f5a7935ee80 R10: 0000000000000000 R11: 0000000000000293 R12: 000055e432447240 R13: 0000000000000000 R14: 0000000000000001 R15: 000055e4324a9cf0 Allocated by task 1236: save_stack+0x21/0x80 __kasan_kmalloc.constprop.6+0xab/0xe0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x102/0x210 nvme_init_identify+0x13c3/0x3820 nvme_loop_configure_admin_queue+0x4fa/0x5e0 nvme_loop_create_ctrl+0x469/0xf40 nvmf_dev_write+0x19a3/0x21ab __vfs_write+0x66/0x120 vfs_write+0x154/0x490 ksys_write+0x104/0x240 __x64_sys_write+0x73/0xb0 do_syscall_64+0xa5/0x2e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 329: save_stack+0x21/0x80 __kasan_slab_free+0x129/0x190 kasan_slab_free+0xe/0x10 kfree+0xa7/0x200 nvme_release_subsystem+0x49/0x60 device_release+0x72/0x1d0 kobject_put+0x144/0x410 put_device+0x13/0x20 klist_class_dev_put+0x31/0x40 klist_put+0x8f/0xf0 klist_del+0xe/0x10 device_del+0x3a7/0x9a0 nvme_destroy_subsystem+0xf9/0x150 nvme_free_ctrl+0x280/0x3a0 device_release+0x72/0x1d0 kobject_put+0x144/0x410 put_device+0x13/0x20 nvme_free_ns+0xc4/0x100 nvme_release+0xb3/0xe0 __blkdev_put+0x549/0x6e0 blkdev_put+0x72/0x2c0 blkdev_close+0x8d/0xd0 __fput+0x256/0x770 ____fput+0xe/0x10 task_work_run+0x10c/0x180 exit_to_usermode_loop+0x151/0x170 do_syscall_64+0x240/0x2e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 32fd90c40768 ("nvme: change locking for the per-subsystem controller list") Signed-off-by: Logan Gunthorpe Reviewed-by: Sagi Grimberg Reviewed-by : Chaitanya Kulkarni Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 80c7a7ee240b..e35f16b60fc9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2488,6 +2488,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) if (ret) { dev_err(ctrl->device, "failed to register subsystem device.\n"); + put_device(&subsys->dev); goto out_unlock; } ida_init(&subsys->ns_ida); @@ -2510,7 +2511,6 @@ out_put_subsystem: nvme_put_subsystem(subsys); out_unlock: mutex_unlock(&nvme_subsystems_lock); - put_device(&subsys->dev); return ret; } -- cgit v1.2.3-55-g7522 From b9156daeb1601d69007b7e50efcf89d69d72ec1d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 31 Jul 2019 11:00:26 -0700 Subject: nvme: fix a possible deadlock when passthru commands sent to a multipath device When the user issues a command with side effects, we will end up freezing the namespace request queue when updating disk info (and the same for the corresponding mpath disk node). However, we are not freezing the mpath node request queue, which means that mpath I/O can still come in and block on blk_queue_enter (called from nvme_ns_head_make_request -> direct_make_request). This is a deadlock, because blk_queue_enter will block until the inner namespace request queue is unfroze, but that process is blocked because the namespace revalidation is trying to update the mpath disk info and freeze its request queue (which will never complete because of the I/O that is blocked on blk_queue_enter). Fix this by freezing all the subsystem nsheads request queues before executing the passthru command. Given that these commands are infrequent we should not worry about this temporary I/O freeze to keep things sane. Here is the matching hang traces: -- [ 374.465002] INFO: task systemd-udevd:17994 blocked for more than 122 seconds. [ 374.472975] Not tainted 5.2.0-rc3-mpdebug+ #42 [ 374.478522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 374.487274] systemd-udevd D 0 17994 1 0x00000000 [ 374.493407] Call Trace: [ 374.496145] __schedule+0x2ef/0x620 [ 374.500047] schedule+0x38/0xa0 [ 374.503569] blk_queue_enter+0x139/0x220 [ 374.507959] ? remove_wait_queue+0x60/0x60 [ 374.512540] direct_make_request+0x60/0x130 [ 374.517219] nvme_ns_head_make_request+0x11d/0x420 [nvme_core] [ 374.523740] ? generic_make_request_checks+0x307/0x6f0 [ 374.529484] generic_make_request+0x10d/0x2e0 [ 374.534356] submit_bio+0x75/0x140 [ 374.538163] ? guard_bio_eod+0x32/0xe0 [ 374.542361] submit_bh_wbc+0x171/0x1b0 [ 374.546553] block_read_full_page+0x1ed/0x330 [ 374.551426] ? check_disk_change+0x70/0x70 [ 374.556008] ? scan_shadow_nodes+0x30/0x30 [ 374.560588] blkdev_readpage+0x18/0x20 [ 374.564783] do_read_cache_page+0x301/0x860 [ 374.569463] ? blkdev_writepages+0x10/0x10 [ 374.574037] ? prep_new_page+0x88/0x130 [ 374.578329] ? get_page_from_freelist+0xa2f/0x1280 [ 374.583688] ? __alloc_pages_nodemask+0x179/0x320 [ 374.588947] read_cache_page+0x12/0x20 [ 374.593142] read_dev_sector+0x2d/0xd0 [ 374.597337] read_lba+0x104/0x1f0 [ 374.601046] find_valid_gpt+0xfa/0x720 [ 374.605243] ? string_nocheck+0x58/0x70 [ 374.609534] ? find_valid_gpt+0x720/0x720 [ 374.614016] efi_partition+0x89/0x430 [ 374.618113] ? string+0x48/0x60 [ 374.621632] ? snprintf+0x49/0x70 [ 374.625339] ? find_valid_gpt+0x720/0x720 [ 374.629828] check_partition+0x116/0x210 [ 374.634214] rescan_partitions+0xb6/0x360 [ 374.638699] __blkdev_reread_part+0x64/0x70 [ 374.643377] blkdev_reread_part+0x23/0x40 [ 374.647860] blkdev_ioctl+0x48c/0x990 [ 374.651956] block_ioctl+0x41/0x50 [ 374.655766] do_vfs_ioctl+0xa7/0x600 [ 374.659766] ? locks_lock_inode_wait+0xb1/0x150 [ 374.664832] ksys_ioctl+0x67/0x90 [ 374.668539] __x64_sys_ioctl+0x1a/0x20 [ 374.672732] do_syscall_64+0x5a/0x1c0 [ 374.676828] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 374.738474] INFO: task nvmeadm:49141 blocked for more than 123 seconds. [ 374.745871] Not tainted 5.2.0-rc3-mpdebug+ #42 [ 374.751419] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 374.760170] nvmeadm D 0 49141 36333 0x00004080 [ 374.766301] Call Trace: [ 374.769038] __schedule+0x2ef/0x620 [ 374.772939] schedule+0x38/0xa0 [ 374.776452] blk_mq_freeze_queue_wait+0x59/0x100 [ 374.781614] ? remove_wait_queue+0x60/0x60 [ 374.786192] blk_mq_freeze_queue+0x1a/0x20 [ 374.790773] nvme_update_disk_info.isra.57+0x5f/0x350 [nvme_core] [ 374.797582] ? nvme_identify_ns.isra.50+0x71/0xc0 [nvme_core] [ 374.804006] __nvme_revalidate_disk+0xe5/0x110 [nvme_core] [ 374.810139] nvme_revalidate_disk+0xa6/0x120 [nvme_core] [ 374.816078] ? nvme_submit_user_cmd+0x11e/0x320 [nvme_core] [ 374.822299] nvme_user_cmd+0x264/0x370 [nvme_core] [ 374.827661] nvme_dev_ioctl+0x112/0x1d0 [nvme_core] [ 374.833114] do_vfs_ioctl+0xa7/0x600 [ 374.837117] ? __audit_syscall_entry+0xdd/0x130 [ 374.842184] ksys_ioctl+0x67/0x90 [ 374.845891] __x64_sys_ioctl+0x1a/0x20 [ 374.850082] do_syscall_64+0x5a/0x1c0 [ 374.854178] entry_SYSCALL_64_after_hwframe+0x44/0xa9 -- Reported-by: James Puthukattukaran Tested-by: James Puthukattukaran Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 5 +++++ drivers/nvme/host/multipath.c | 30 ++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 12 ++++++++++++ 3 files changed, 47 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e35f16b60fc9..88b8dfd7928a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1286,6 +1286,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, */ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { mutex_lock(&ctrl->scan_lock); + mutex_lock(&ctrl->subsys->lock); + nvme_mpath_start_freeze(ctrl->subsys); + nvme_mpath_wait_freeze(ctrl->subsys); nvme_start_freeze(ctrl); nvme_wait_freeze(ctrl); } @@ -1316,6 +1319,8 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) nvme_update_formats(ctrl); if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { nvme_unfreeze(ctrl); + nvme_mpath_unfreeze(ctrl->subsys); + mutex_unlock(&ctrl->subsys->lock); mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 4f0d0d12744e..b34dcb2288e7 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -12,6 +12,36 @@ module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); +void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + list_for_each_entry(h, &subsys->nsheads, entry) + if (h->disk) + blk_mq_unfreeze_queue(h->disk->queue); +} + +void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + list_for_each_entry(h, &subsys->nsheads, entry) + if (h->disk) + blk_mq_freeze_queue_wait(h->disk->queue); +} + +void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + list_for_each_entry(h, &subsys->nsheads, entry) + if (h->disk) + blk_freeze_queue_start(h->disk->queue); +} + /* * If multipathing is enabled we need to always use the subsystem instance * number for numbering our devices to avoid conflicts between subsystems that diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 26b563f9985b..6b4fb67124c6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -490,6 +490,9 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) return ctrl->ana_log_buf != NULL; } +void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); +void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); +void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); void nvme_failover_req(struct request *req); @@ -568,6 +571,15 @@ static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl) { } +static inline void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) +{ +} +static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) +{ +} +static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM -- cgit v1.2.3-55-g7522 From d94211b8bad3787e0655a67284105f57db728cb1 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 26 Jul 2019 10:29:49 -0700 Subject: nvme-rdma: fix possible use-after-free in connect error flow When start_queue fails, we need to make sure to drain the queue cq before freeing the rdma resources because we might still race with the completion path. Have start_queue() error path safely stop the queue. -- [30371.808111] nvme nvme1: Failed reconnect attempt 11 [30371.808113] nvme nvme1: Reconnecting in 10 seconds... [...] [30382.069315] nvme nvme1: creating 4 I/O queues. [30382.257058] nvme nvme1: Connect Invalid SQE Parameter, qid 4 [30382.257061] nvme nvme1: failed to connect queue: 4 ret=386 [30382.305001] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 [30382.305022] IP: qedr_poll_cq+0x8a3/0x1170 [qedr] [30382.305028] PGD 0 P4D 0 [30382.305037] Oops: 0000 [#1] SMP PTI [...] [30382.305153] Call Trace: [30382.305166] ? __switch_to_asm+0x34/0x70 [30382.305187] __ib_process_cq+0x56/0xd0 [ib_core] [30382.305201] ib_poll_handler+0x26/0x70 [ib_core] [30382.305213] irq_poll_softirq+0x88/0x110 [30382.305223] ? sort_range+0x20/0x20 [30382.305232] __do_softirq+0xde/0x2c6 [30382.305241] ? sort_range+0x20/0x20 [30382.305249] run_ksoftirqd+0x1c/0x60 [30382.305258] smpboot_thread_fn+0xef/0x160 [30382.305265] kthread+0x113/0x130 [30382.305273] ? kthread_create_worker_on_cpu+0x50/0x50 [30382.305281] ret_from_fork+0x35/0x40 -- Reported-by: Nicolas Morey-Chaisemartin Reviewed-by: Max Gurtovoy Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a249db528d54..1a6449bc547b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -562,13 +562,17 @@ out_destroy_cm_id: return ret; } +static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) +{ + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->qp); +} + static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) { if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) return; - - rdma_disconnect(queue->cm_id); - ib_drain_qp(queue->qp); + __nvme_rdma_stop_queue(queue); } static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) @@ -607,11 +611,13 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) else ret = nvmf_connect_admin_queue(&ctrl->ctrl); - if (!ret) + if (!ret) { set_bit(NVME_RDMA_Q_LIVE, &queue->flags); - else + } else { + __nvme_rdma_stop_queue(queue); dev_info(ctrl->ctrl.device, "failed to connect queue: %d ret=%d\n", idx, ret); + } return ret; } -- cgit v1.2.3-55-g7522 From 0157ec8dad3c8fc9bc9790f76e0831ffdaf2e7f0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 25 Jul 2019 11:56:57 -0700 Subject: nvme: fix controller removal race with scan work With multipath enabled, nvme_scan_work() can read from the device (through nvme_mpath_add_disk()) and hang [1]. However, with fabrics, once ctrl->state is set to NVME_CTRL_DELETING, the reads will hang (see nvmf_check_ready()) and the mpath stack device make_request will block if head->list is not empty. However, when the head->list consistst of only DELETING/DEAD controllers, we should actually not block, but rather fail immediately. In addition, before we go ahead and remove the namespaces, make sure to clear the current path and kick the requeue list so that the request will fast fail upon requeuing. [1]: -- INFO: task kworker/u4:3:166 blocked for more than 120 seconds. Not tainted 5.2.0-rc6-vmlocalyes-00005-g808c8c2dc0cf #316 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kworker/u4:3 D 0 166 2 0x80004000 Workqueue: nvme-wq nvme_scan_work Call Trace: __schedule+0x851/0x1400 schedule+0x99/0x210 io_schedule+0x21/0x70 do_read_cache_page+0xa57/0x1330 read_cache_page+0x4a/0x70 read_dev_sector+0xbf/0x380 amiga_partition+0xc4/0x1230 check_partition+0x30f/0x630 rescan_partitions+0x19a/0x980 __blkdev_get+0x85a/0x12f0 blkdev_get+0x2a5/0x790 __device_add_disk+0xe25/0x1250 device_add_disk+0x13/0x20 nvme_mpath_set_live+0x172/0x2b0 nvme_update_ns_ana_state+0x130/0x180 nvme_set_ns_ana_state+0x9a/0xb0 nvme_parse_ana_log+0x1c3/0x4a0 nvme_mpath_add_disk+0x157/0x290 nvme_validate_ns+0x1017/0x1bd0 nvme_scan_work+0x44d/0x6a0 process_one_work+0x7d7/0x1240 worker_thread+0x8e/0xff0 kthread+0x2c3/0x3b0 ret_from_fork+0x35/0x40 INFO: task kworker/u4:1:1034 blocked for more than 120 seconds. Not tainted 5.2.0-rc6-vmlocalyes-00005-g808c8c2dc0cf #316 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kworker/u4:1 D 0 1034 2 0x80004000 Workqueue: nvme-delete-wq nvme_delete_ctrl_work Call Trace: __schedule+0x851/0x1400 schedule+0x99/0x210 schedule_timeout+0x390/0x830 wait_for_completion+0x1a7/0x310 __flush_work+0x241/0x5d0 flush_work+0x10/0x20 nvme_remove_namespaces+0x85/0x3d0 nvme_do_delete_ctrl+0xb4/0x1e0 nvme_delete_ctrl_work+0x15/0x20 process_one_work+0x7d7/0x1240 worker_thread+0x8e/0xff0 kthread+0x2c3/0x3b0 ret_from_fork+0x35/0x40 -- Reported-by: Logan Gunthorpe Tested-by: Logan Gunthorpe Reviewed-by: Logan Gunthorpe Reviewed-by: Ming Lei Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 7 +++++++ drivers/nvme/host/multipath.c | 46 +++++++++++++++++++++++++++++++++++++------ drivers/nvme/host/nvme.h | 9 +++++++-- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88b8dfd7928a..c258a1ce4b28 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3577,6 +3577,13 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) struct nvme_ns *ns, *next; LIST_HEAD(ns_list); + /* + * make sure to requeue I/O to all namespaces as these + * might result from the scan itself and must complete + * for the scan_work to make progress + */ + nvme_mpath_clear_ctrl_paths(ctrl); + /* prevent racing with ns scanning */ flush_work(&ctrl->scan_work); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index b34dcb2288e7..888d4543894e 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -134,18 +134,34 @@ static const char *nvme_ana_state_names[] = { [NVME_ANA_CHANGE] = "change", }; -void nvme_mpath_clear_current_path(struct nvme_ns *ns) +bool nvme_mpath_clear_current_path(struct nvme_ns *ns) { struct nvme_ns_head *head = ns->head; + bool changed = false; int node; if (!head) - return; + goto out; for_each_node(node) { - if (ns == rcu_access_pointer(head->current_path[node])) + if (ns == rcu_access_pointer(head->current_path[node])) { rcu_assign_pointer(head->current_path[node], NULL); + changed = true; + } } +out: + return changed; +} + +void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->scan_lock); + list_for_each_entry(ns, &ctrl->namespaces, list) + if (nvme_mpath_clear_current_path(ns)) + kblockd_schedule_work(&ns->head->requeue_work); + mutex_unlock(&ctrl->scan_lock); } static bool nvme_path_is_disabled(struct nvme_ns *ns) @@ -256,6 +272,24 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) return ns; } +static bool nvme_available_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + switch (ns->ctrl->state) { + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + /* fallthru */ + return true; + default: + break; + } + } + return false; +} + static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, struct bio *bio) { @@ -282,14 +316,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); ret = direct_make_request(bio); - } else if (!list_empty_careful(&head->list)) { - dev_warn_ratelimited(dev, "no path available - requeuing I/O\n"); + } else if (nvme_available_path(head)) { + dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); spin_lock_irq(&head->requeue_lock); bio_list_add(&head->requeue_list, bio); spin_unlock_irq(&head->requeue_lock); } else { - dev_warn_ratelimited(dev, "no path - failing I/O\n"); + dev_warn_ratelimited(dev, "no available path - failing I/O\n"); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6b4fb67124c6..778b3a0b6adb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -503,7 +503,8 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head); int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); void nvme_mpath_uninit(struct nvme_ctrl *ctrl); void nvme_mpath_stop(struct nvme_ctrl *ctrl); -void nvme_mpath_clear_current_path(struct nvme_ns *ns); +bool nvme_mpath_clear_current_path(struct nvme_ns *ns); +void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) @@ -551,7 +552,11 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns, static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } -static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ + return false; +} +static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { } static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) -- cgit v1.2.3-55-g7522 From bd46a90634302bfe791e93ad5496f98f165f7ae0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jul 2019 16:34:52 -0600 Subject: nvme-pci: Fix async probe remove race Ensure the controller is not in the NEW state when nvme_probe() exits. This will always allow a subsequent nvme_remove() to set the state to DELETING, fixing a potential race between the initial asynchronous probe and device removal. Reported-by: Li Zhong Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index db160cee42ad..0c2c4b0c6655 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2695,7 +2695,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie) { struct nvme_dev *dev = data; - nvme_reset_ctrl_sync(&dev->ctrl); + flush_work(&dev->ctrl.reset_work); flush_work(&dev->ctrl.scan_work); nvme_put_ctrl(&dev->ctrl); } @@ -2761,6 +2761,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + nvme_reset_ctrl(&dev->ctrl); nvme_get_ctrl(&dev->ctrl); async_schedule(nvme_async_probe, dev); -- cgit v1.2.3-55-g7522 From e26cc08265dda37d2acc8394604f220ef412299d Mon Sep 17 00:00:00 2001 From: zhengbin Date: Mon, 12 Aug 2019 20:36:55 +0800 Subject: blk-mq: move cancel of requeue_work to the front of blk_exit_queue blk_exit_queue will free elevator_data, while blk_mq_requeue_work will access it. Move cancel of requeue_work to the front of blk_exit_queue to avoid use-after-free. blk_exit_queue blk_mq_requeue_work __elevator_exit blk_mq_run_hw_queues blk_mq_exit_sched blk_mq_run_hw_queue dd_exit_queue blk_mq_hctx_has_pending kfree(elevator_data) blk_mq_sched_has_work dd_has_work Fixes: fbc2a15e3433 ("blk-mq: move cancel of requeue_work into blk_mq_release") Cc: stable@vger.kernel.org Reviewed-by: Ming Lei Signed-off-by: zhengbin Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 -- block/blk-sysfs.c | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f78d3287dd82..a8e6a58f5f28 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2666,8 +2666,6 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx, *next; int i; - cancel_delayed_work_sync(&q->requeue_work); - queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 977c659dcd18..9bfa3ea4ed63 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -892,6 +892,9 @@ static void __blk_release_queue(struct work_struct *work) blk_free_queue_stats(q->stats); + if (queue_is_mq(q)) + cancel_delayed_work_sync(&q->requeue_work); + blk_exit_queue(q); blk_queue_free_zone_bitmaps(q); -- cgit v1.2.3-55-g7522 From ae78ca3cf3d9e9f914bfcd0bc5c389ff18b9c2e0 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sun, 11 Aug 2019 12:23:22 -0500 Subject: xen/blkback: fix memory leaks In read_per_ring_refs(), after 'req' and related memory regions are allocated, xen_blkif_map() is invoked to map the shared frame, irq, and etc. However, if this mapping process fails, no cleanup is performed, leading to memory leaks. To fix this issue, invoke the cleanup before returning the error. Acked-by: Roger Pau Monné Reviewed-by: Boris Ostrovsky Signed-off-by: Wenwen Wang Signed-off-by: Jens Axboe --- drivers/block/xen-blkback/xenbus.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 3ac6a5d18071..b90dbcd99c03 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -965,6 +965,7 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) } } + err = -ENOMEM; for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) @@ -987,7 +988,7 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn); if (err) { xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); - return err; + goto fail; } return 0; @@ -1007,8 +1008,7 @@ fail: } kfree(req); } - return -ENOMEM; - + return err; } static int connect_ring(struct backend_info *be) -- cgit v1.2.3-55-g7522 From 99c79f6692ccdc42e04deea8a36e22bb48168a62 Mon Sep 17 00:00:00 2001 From: Aleix Roca Nonell Date: Thu, 15 Aug 2019 14:03:22 +0200 Subject: io_uring: fix manual setup of iov_iter for fixed buffers Commit bd11b3a391e3 ("io_uring: don't use iov_iter_advance() for fixed buffers") introduced an optimization to avoid using the slow iov_iter_advance by manually populating the iov_iter iterator in some cases. However, the computation of the iterator count field was erroneous: The first bvec was always accounted for an extent of page size even if the bvec length was smaller. In consequence, some I/O operations on fixed buffers were unable to operate on the full extent of the buffer, consistently skipping some bytes at the end of it. Fixes: bd11b3a391e3 ("io_uring: don't use iov_iter_advance() for fixed buffers") Cc: stable@vger.kernel.org Signed-off-by: Aleix Roca Nonell Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d542f1cf4428..aa25b5bbd4ae 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1097,10 +1097,8 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, iter->bvec = bvec + seg_skip; iter->nr_segs -= seg_skip; - iter->count -= (seg_skip << PAGE_SHIFT); + iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ~PAGE_MASK; - if (iter->iov_offset) - iter->count -= iter->iov_offset; } } -- cgit v1.2.3-55-g7522 From 7b6620d7db566a46f49b4b9deab9fa061fd4b59b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 15 Aug 2019 11:09:16 -0600 Subject: block: remove REQ_NOWAIT_INLINE We had a few issues with this code, and there's still a problem around how we deal with error handling for chained/split bios. For now, just revert the code and we'll try again with a thoroug solution. This reverts commits: e15c2ffa1091 ("block: fix O_DIRECT error handling for bio fragments") 0eb6ddfb865c ("block: Fix __blkdev_direct_IO() for bio fragments") 6a43074e2f46 ("block: properly handle IOCB_NOWAIT for async O_DIRECT IO") 893a1c97205a ("blk-mq: allow REQ_NOWAIT to return an error inline") Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++------ fs/block_dev.c | 49 +++++------------------------------------------ include/linux/blk_types.h | 5 +---- 3 files changed, 8 insertions(+), 54 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a8e6a58f5f28..0835f4d8d42e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1958,13 +1958,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); - - cookie = BLK_QC_T_NONE; - if (bio->bi_opf & REQ_NOWAIT_INLINE) - cookie = BLK_QC_T_EAGAIN; - else if (bio->bi_opf & REQ_NOWAIT) + if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - return cookie; + return BLK_QC_T_NONE; } trace_block_getrq(q, bio, bio->bi_opf); diff --git a/fs/block_dev.c b/fs/block_dev.c index eb657ab94060..677cb364d33f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -345,24 +345,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct bio *bio; bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; - bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - gfp_t gfp; - int ret; + int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - if (nowait) - gfp = GFP_NOWAIT; - else - gfp = GFP_KERNEL; - - bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool); - if (!bio) - return -EAGAIN; + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); @@ -384,7 +375,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (!is_poll) blk_start_plug(&plug); - ret = 0; for (;;) { bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; @@ -409,14 +399,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) task_io_account_write(bio->bi_iter.bi_size); } - /* - * Tell underlying layer to not block for resource shortage. - * And if we would have blocked, return error inline instead - * of through the bio->bi_end_io() callback. - */ - if (nowait) - bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE); - + dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); @@ -428,13 +411,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) polled = true; } - dio->size += bio->bi_iter.bi_size; qc = submit_bio(bio); - if (qc == BLK_QC_T_EAGAIN) { - dio->size -= bio->bi_iter.bi_size; - ret = -EAGAIN; - goto error; - } if (polled) WRITE_ONCE(iocb->ki_cookie, qc); @@ -455,19 +432,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) atomic_inc(&dio->ref); } - dio->size += bio->bi_iter.bi_size; - qc = submit_bio(bio); - if (qc == BLK_QC_T_EAGAIN) { - dio->size -= bio->bi_iter.bi_size; - ret = -EAGAIN; - goto error; - } - - bio = bio_alloc(gfp, nr_pages); - if (!bio) { - ret = -EAGAIN; - goto error; - } + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); } if (!is_poll) @@ -487,7 +453,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); -out: if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) @@ -495,10 +460,6 @@ out: bio_put(&dio->bio); return ret; -error: - if (!is_poll) - blk_finish_plug(&plug); - goto out; } static ssize_t diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b1fa1557e68..feff3fe4467e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,7 +311,6 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ - __REQ_NOWAIT_INLINE, /* Return would-block error inline */ /* * When a shared kthread needs to issue a bio for a cgroup, doing * so synchronously can lead to priority inversions as the kthread @@ -346,7 +345,6 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) -#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE) #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) @@ -420,13 +418,12 @@ static inline int op_stat_group(unsigned int op) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U -#define BLK_QC_T_EAGAIN -2U #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { - return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN; + return cookie != BLK_QC_T_NONE; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) -- cgit v1.2.3-55-g7522 From a982eeb09b6030e567b8b815277c8c9197168040 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Wed, 14 Aug 2019 17:35:22 +0800 Subject: io_uring: fix an issue when IOSQE_IO_LINK is inserted into defer list This patch may fix two issues: First, when IOSQE_IO_DRAIN set, the next IOs need to be inserted into defer list to delay execution, but link io will be actively scheduled to run by calling io_queue_sqe. Second, when multiple LINK_IOs are inserted together with defer_list, the LINK_IO is no longer keep order. |-------------| | LINK_IO | ----> insert to defer_list ----------- |-------------| | | LINK_IO | ----> insert to defer_list ----------| |-------------| | | LINK_IO | ----> insert to defer_list ----------| |-------------| | | NORMAL_IO | ----> insert to defer_list ----------| |-------------| | | queue_work at same time <-----| Fixes: 9e645e1105c ("io_uring: add support for sqe links") Signed-off-by: Jackie Liu Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index aa25b5bbd4ae..24bbe3cb7ad4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2023,6 +2023,15 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, { int ret; + ret = io_req_defer(ctx, req, s->sqe); + if (ret) { + if (ret != -EIOCBQUEUED) { + io_free_req(req); + io_cqring_add_event(ctx, s->sqe->user_data, ret); + } + return 0; + } + ret = __io_submit_sqe(ctx, req, s, true); if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; @@ -2095,13 +2104,6 @@ err: return; } - ret = io_req_defer(ctx, req, s->sqe); - if (ret) { - if (ret != -EIOCBQUEUED) - goto err_req; - return; - } - /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but -- cgit v1.2.3-55-g7522