diff options
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r-- | drivers/nvme/host/pci.c | 198 |
1 files changed, 145 insertions, 53 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2a8708c9ac18..db160cee42ad 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -18,6 +18,7 @@ #include <linux/mutex.h> #include <linux/once.h> #include <linux/pci.h> +#include <linux/suspend.h> #include <linux/t10-pi.h> #include <linux/types.h> #include <linux/io-64-nonatomic-lo-hi.h> @@ -67,20 +68,14 @@ static int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); -static int queue_count_set(const char *val, const struct kernel_param *kp); -static const struct kernel_param_ops queue_count_ops = { - .set = queue_count_set, - .get = param_get_int, -}; - static int write_queues; -module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644); +module_param(write_queues, int, 0644); MODULE_PARM_DESC(write_queues, "Number of queues to use for writes. If not set, reads and writes " "will share a queue set."); -static int poll_queues = 0; -module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644); +static int poll_queues; +module_param(poll_queues, int, 0644); MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO."); struct nvme_dev; @@ -116,6 +111,7 @@ struct nvme_dev { u32 cmbsz; u32 cmbloc; struct nvme_ctrl ctrl; + u32 last_ps; mempool_t *iod_mempool; @@ -144,19 +140,6 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp) return param_set_int(val, kp); } -static int queue_count_set(const char *val, const struct kernel_param *kp) -{ - int n, ret; - - ret = kstrtoint(val, 10, &n); - if (ret) - return ret; - if (n > num_possible_cpus()) - n = num_possible_cpus(); - - return param_set_int(val, kp); -} - static inline unsigned int sq_idx(unsigned int qid, u32 stride) { return qid * 2 * stride; @@ -464,7 +447,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set) * affinity), so use the regular blk-mq cpu mapping */ map->queue_offset = qoff; - if (i != HCTX_TYPE_POLL) + if (i != HCTX_TYPE_POLL && offset) blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); else blk_mq_map_queues(map); @@ -1257,7 +1240,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; - bool shutdown = false; u32 csts = readl(dev->bar + NVME_REG_CSTS); /* If PCI error recovery process is happening, we cannot reset or @@ -1294,17 +1276,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * shutdown, so we return BLK_EH_DONE. */ switch (dev->ctrl.state) { - case NVME_CTRL_DELETING: - shutdown = true; - /* fall through */ case NVME_CTRL_CONNECTING: - case NVME_CTRL_RESETTING: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + /* fall through */ + case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); - nvme_dev_disable(dev, shutdown); + nvme_dev_disable(dev, true); nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_DONE; + case NVME_CTRL_RESETTING: + return BLK_EH_RESET_TIMER; default: break; } @@ -1456,11 +1439,15 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); - nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, - nvmeq->sq_cmds); - if (nvmeq->sq_dma_addr) { - set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); - return 0; + if (nvmeq->sq_cmds) { + nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, + nvmeq->sq_cmds); + if (nvmeq->sq_dma_addr) { + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); + return 0; + } + + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth)); } } @@ -2068,6 +2055,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) .priv = dev, }; unsigned int irq_queues, this_p_queues; + unsigned int nr_cpus = num_possible_cpus(); /* * Poll queues don't need interrupts, but we need at least one IO @@ -2078,7 +2066,10 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) this_p_queues = nr_io_queues - 1; irq_queues = 1; } else { - irq_queues = nr_io_queues - this_p_queues + 1; + if (nr_cpus < nr_io_queues - this_p_queues) + irq_queues = nr_cpus + 1; + else + irq_queues = nr_io_queues - this_p_queues + 1; } dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; @@ -2302,8 +2293,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) pci_set_master(pdev); - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64))) goto disable; if (readl(dev->bar + NVME_REG_CSTS) == -1) { @@ -2376,7 +2366,7 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - bool dead = true; + bool dead = true, freeze = false; struct pci_dev *pdev = to_pci_dev(dev->dev); mutex_lock(&dev->shutdown_lock); @@ -2384,8 +2374,10 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) u32 csts = readl(dev->bar + NVME_REG_CSTS); if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) + dev->ctrl.state == NVME_CTRL_RESETTING) { + freeze = true; nvme_start_freeze(&dev->ctrl); + } dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || pdev->error_state != pci_channel_io_normal); } @@ -2394,10 +2386,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * Give the controller a chance to complete all entered requests if * doing a safe shutdown. */ - if (!dead) { - if (shutdown) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - } + if (!dead && shutdown && freeze) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); nvme_stop_queues(&dev->ctrl); @@ -2464,10 +2454,8 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) +static void nvme_remove_dead_ctrl(struct nvme_dev *dev) { - dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); - nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); nvme_kill_queues(&dev->ctrl); @@ -2480,11 +2468,13 @@ static void nvme_reset_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); - int result = -ENODEV; + int result; enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; - if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) + if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) { + result = -ENODEV; goto out; + } /* * If we're called to reset a live controller first shut it down before @@ -2492,6 +2482,7 @@ static void nvme_reset_work(struct work_struct *work) */ if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); + nvme_sync_queues(&dev->ctrl); mutex_lock(&dev->shutdown_lock); result = nvme_pci_enable(dev); @@ -2510,8 +2501,15 @@ static void nvme_reset_work(struct work_struct *work) * Limit the max command size to prevent iod->sg allocations going * over a single page. */ - dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); dev->ctrl.max_segments = NVME_MAX_SEGS; + + /* + * Don't limit the IOMMU merged segment size. + */ + dma_set_max_seg_size(dev->dev, 0xffffffff); + mutex_unlock(&dev->shutdown_lock); /* @@ -2521,6 +2519,7 @@ static void nvme_reset_work(struct work_struct *work) if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { dev_warn(dev->ctrl.device, "failed to mark controller CONNECTING\n"); + result = -EBUSY; goto out; } @@ -2581,6 +2580,7 @@ static void nvme_reset_work(struct work_struct *work) if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { dev_warn(dev->ctrl.device, "failed to mark controller state %d\n", new_state); + result = -ENODEV; goto out; } @@ -2590,7 +2590,10 @@ static void nvme_reset_work(struct work_struct *work) out_unlock: mutex_unlock(&dev->shutdown_lock); out: - nvme_remove_dead_ctrl(dev, result); + if (result) + dev_warn(dev->ctrl.device, + "Removing after probe failure status: %d\n", result); + nvme_remove_dead_ctrl(dev); } static void nvme_remove_dead_ctrl_work(struct work_struct *work) @@ -2828,16 +2831,94 @@ static void nvme_remove(struct pci_dev *pdev) } #ifdef CONFIG_PM_SLEEP +static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps) +{ + return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps); +} + +static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps) +{ + return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL); +} + +static int nvme_resume(struct device *dev) +{ + struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); + struct nvme_ctrl *ctrl = &ndev->ctrl; + + if (pm_resume_via_firmware() || !ctrl->npss || + nvme_set_power_state(ctrl, ndev->last_ps) != 0) + nvme_reset_ctrl(ctrl); + return 0; +} + static int nvme_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); + struct nvme_ctrl *ctrl = &ndev->ctrl; + int ret = -EBUSY; + + /* + * The platform does not remove power for a kernel managed suspend so + * use host managed nvme power settings for lowest idle power if + * possible. This should have quicker resume latency than a full device + * shutdown. But if the firmware is involved after the suspend or the + * device does not support any non-default power states, shut down the + * device fully. + */ + if (pm_suspend_via_firmware() || !ctrl->npss) { + nvme_dev_disable(ndev, true); + return 0; + } + + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + nvme_sync_queues(ctrl); + + if (ctrl->state != NVME_CTRL_LIVE && + ctrl->state != NVME_CTRL_ADMIN_ONLY) + goto unfreeze; + + ndev->last_ps = 0; + ret = nvme_get_power_state(ctrl, &ndev->last_ps); + if (ret < 0) + goto unfreeze; + + ret = nvme_set_power_state(ctrl, ctrl->npss); + if (ret < 0) + goto unfreeze; + + if (ret) { + /* + * Clearing npss forces a controller reset on resume. The + * correct value will be resdicovered then. + */ + nvme_dev_disable(ndev, true); + ctrl->npss = 0; + ret = 0; + goto unfreeze; + } + /* + * A saved state prevents pci pm from generically controlling the + * device's power. If we're using protocol specific settings, we don't + * want pci interfering. + */ + pci_save_state(pdev); +unfreeze: + nvme_unfreeze(ctrl); + return ret; +} + +static int nvme_simple_suspend(struct device *dev) +{ + struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); nvme_dev_disable(ndev, true); return 0; } -static int nvme_resume(struct device *dev) +static int nvme_simple_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); @@ -2845,9 +2926,16 @@ static int nvme_resume(struct device *dev) nvme_reset_ctrl(&ndev->ctrl); return 0; } -#endif -static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); +static const struct dev_pm_ops nvme_dev_pm_ops = { + .suspend = nvme_suspend, + .resume = nvme_resume, + .freeze = nvme_simple_suspend, + .thaw = nvme_simple_resume, + .poweroff = nvme_simple_suspend, + .restore = nvme_simple_resume, +}; +#endif /* CONFIG_PM_SLEEP */ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, pci_channel_state_t state) @@ -2939,6 +3027,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ .driver_data = NVME_QUIRK_LIGHTNVM, }, + { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, @@ -2952,9 +3042,11 @@ static struct pci_driver nvme_driver = { .probe = nvme_probe, .remove = nvme_remove, .shutdown = nvme_shutdown, +#ifdef CONFIG_PM_SLEEP .driver = { .pm = &nvme_dev_pm_ops, }, +#endif .sriov_configure = pci_sriov_configure_simple, .err_handler = &nvme_err_handler, }; |