diff options
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r-- | drivers/nvme/host/pci.c | 232 |
1 files changed, 112 insertions, 120 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5a0bf6a24d50..a90cf5d63aac 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NVM Express device driver * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/aer.h> @@ -95,6 +87,7 @@ struct nvme_dev; struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); +static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. @@ -156,6 +149,8 @@ static int queue_count_set(const char *val, const struct kernel_param *kp) int n = 0, ret; ret = kstrtoint(val, 10, &n); + if (ret) + return ret; if (n > num_possible_cpus()) n = num_possible_cpus(); @@ -1019,9 +1014,11 @@ static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) { - if (++nvmeq->cq_head == nvmeq->q_depth) { + if (nvmeq->cq_head == nvmeq->q_depth - 1) { nvmeq->cq_head = 0; nvmeq->cq_phase = !nvmeq->cq_phase; + } else { + nvmeq->cq_head++; } } @@ -1420,6 +1417,14 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) return 0; } +static void nvme_suspend_io_queues(struct nvme_dev *dev) +{ + int i; + + for (i = dev->ctrl.queue_count - 1; i > 0; i--) + nvme_suspend_queue(&dev->queues[i]); +} + static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = &dev->queues[0]; @@ -1485,8 +1490,8 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) if (dev->ctrl.queue_count > qid) return 0; - nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), - &nvmeq->cq_dma_addr, GFP_KERNEL); + nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; @@ -1885,8 +1890,9 @@ static void nvme_free_host_mem(struct nvme_dev *dev) struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; - dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i], - le64_to_cpu(desc->addr)); + dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i], + le64_to_cpu(desc->addr), + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); } kfree(dev->host_mem_desc_bufs); @@ -1915,8 +1921,8 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) max_entries = dev->ctrl.hmmaxd; - descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs), - &descs_dma, GFP_KERNEL); + descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs), + &descs_dma, GFP_KERNEL); if (!descs) goto out; @@ -1952,8 +1958,9 @@ out_free_bufs: while (--i >= 0) { size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; - dma_free_coherent(dev->dev, size, bufs[i], - le64_to_cpu(descs[i].addr)); + dma_free_attrs(dev->dev, size, bufs[i], + le64_to_cpu(descs[i].addr), + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); } kfree(bufs); @@ -2028,49 +2035,52 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } -static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues) +/* + * nirqs is the number of interrupts available for write and read + * queues. The core already reserved an interrupt for the admin queue. + */ +static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs) { - unsigned int this_w_queues = write_queues; + struct nvme_dev *dev = affd->priv; + unsigned int nr_read_queues; /* - * Setup read/write queue split + * If there is no interupt available for queues, ensure that + * the default queue is set to 1. The affinity set size is + * also set to one, but the irq core ignores it for this case. + * + * If only one interrupt is available or 'write_queue' == 0, combine + * write and read queues. + * + * If 'write_queues' > 0, ensure it leaves room for at least one read + * queue. */ - if (irq_queues == 1) { - dev->io_queues[HCTX_TYPE_DEFAULT] = 1; - dev->io_queues[HCTX_TYPE_READ] = 0; - return; - } - - /* - * If 'write_queues' is set, ensure it leaves room for at least - * one read queue - */ - if (this_w_queues >= irq_queues) - this_w_queues = irq_queues - 1; - - /* - * If 'write_queues' is set to zero, reads and writes will share - * a queue set. - */ - if (!this_w_queues) { - dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues; - dev->io_queues[HCTX_TYPE_READ] = 0; + if (!nrirqs) { + nrirqs = 1; + nr_read_queues = 0; + } else if (nrirqs == 1 || !write_queues) { + nr_read_queues = 0; + } else if (write_queues >= nrirqs) { + nr_read_queues = 1; } else { - dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues; - dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues; + nr_read_queues = nrirqs - write_queues; } + + dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; + affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues; + dev->io_queues[HCTX_TYPE_READ] = nr_read_queues; + affd->set_size[HCTX_TYPE_READ] = nr_read_queues; + affd->nr_sets = nr_read_queues ? 2 : 1; } static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) { struct pci_dev *pdev = to_pci_dev(dev->dev); - int irq_sets[2]; struct irq_affinity affd = { - .pre_vectors = 1, - .nr_sets = ARRAY_SIZE(irq_sets), - .sets = irq_sets, + .pre_vectors = 1, + .calc_sets = nvme_calc_irq_sets, + .priv = dev, }; - int result = 0; unsigned int irq_queues, this_p_queues; /* @@ -2082,54 +2092,22 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) this_p_queues = nr_io_queues - 1; irq_queues = 1; } else { - irq_queues = nr_io_queues - this_p_queues; + irq_queues = nr_io_queues - this_p_queues + 1; } dev->io_queues[HCTX_TYPE_POLL] = this_p_queues; - /* - * For irq sets, we have to ask for minvec == maxvec. This passes - * any reduction back to us, so we can adjust our queue counts and - * IRQ vector needs. - */ - do { - nvme_calc_io_queues(dev, irq_queues); - irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT]; - irq_sets[1] = dev->io_queues[HCTX_TYPE_READ]; - if (!irq_sets[1]) - affd.nr_sets = 1; - - /* - * If we got a failure and we're down to asking for just - * 1 + 1 queues, just ask for a single vector. We'll share - * that between the single IO queue and the admin queue. - */ - if (result >= 0 && irq_queues > 1) - irq_queues = irq_sets[0] + irq_sets[1] + 1; + /* Initialize for the single interrupt case */ + dev->io_queues[HCTX_TYPE_DEFAULT] = 1; + dev->io_queues[HCTX_TYPE_READ] = 0; - result = pci_alloc_irq_vectors_affinity(pdev, irq_queues, - irq_queues, - PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); - - /* - * Need to reduce our vec counts. If we get ENOSPC, the - * platform should support mulitple vecs, we just need - * to decrease our ask. If we get EINVAL, the platform - * likely does not. Back down to ask for just one vector. - */ - if (result == -ENOSPC) { - irq_queues--; - if (!irq_queues) - return result; - continue; - } else if (result == -EINVAL) { - irq_queues = 1; - continue; - } else if (result <= 0) - return -EIO; - break; - } while (1); + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, + PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); +} - return result; +static void nvme_disable_io_queues(struct nvme_dev *dev) +{ + if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq)) + __nvme_disable_io_queues(dev, nvme_admin_delete_cq); } static int nvme_setup_io_queues(struct nvme_dev *dev) @@ -2168,6 +2146,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) } while (1); adminq->q_db = dev->dbs; + retry: /* Deregister the admin queue's interrupt */ pci_free_irq(pdev, 0, adminq); @@ -2185,25 +2164,34 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) result = max(result - 1, 1); dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL]; - dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", - dev->io_queues[HCTX_TYPE_DEFAULT], - dev->io_queues[HCTX_TYPE_READ], - dev->io_queues[HCTX_TYPE_POLL]); - /* * Should investigate if there's a performance win from allocating * more queues than interrupt vectors; it might allow the submission * path to scale better, even if the receive path is limited by the * number of interrupts. */ - result = queue_request_irq(adminq); if (result) { adminq->cq_vector = -1; return result; } set_bit(NVMEQ_ENABLED, &adminq->flags); - return nvme_create_io_queues(dev); + + result = nvme_create_io_queues(dev); + if (result || dev->online_queues < 2) + return result; + + if (dev->online_queues - 1 < dev->max_qid) { + nr_io_queues = dev->online_queues - 1; + nvme_disable_io_queues(dev); + nvme_suspend_io_queues(dev); + goto retry; + } + dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n", + dev->io_queues[HCTX_TYPE_DEFAULT], + dev->io_queues[HCTX_TYPE_READ], + dev->io_queues[HCTX_TYPE_POLL]); + return 0; } static void nvme_del_queue_end(struct request *req, blk_status_t error) @@ -2248,7 +2236,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) return 0; } -static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) +static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) { int nr_queues = dev->online_queues - 1, sent = 0; unsigned long timeout; @@ -2294,7 +2282,6 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.nr_maps = 2; /* default + read */ if (dev->io_queues[HCTX_TYPE_POLL]) dev->tagset.nr_maps++; - dev->tagset.nr_maps = HCTX_MAX_TYPES; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = @@ -2410,7 +2397,6 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - int i; bool dead = true; struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -2437,13 +2423,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_stop_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { - if (nvme_disable_io_queues(dev, nvme_admin_delete_sq)) - nvme_disable_io_queues(dev, nvme_admin_delete_cq); + nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); } - for (i = dev->ctrl.queue_count - 1; i >= 0; i--) - nvme_suspend_queue(&dev->queues[i]); - + nvme_suspend_io_queues(dev); + nvme_suspend_queue(&dev->queues[0]); nvme_pci_disable(dev); blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); @@ -2527,27 +2511,18 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); - /* - * Introduce CONNECTING state from nvme-fc/rdma transports to mark the - * initializing procedure here. - */ - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { - dev_warn(dev->ctrl.device, - "failed to mark controller CONNECTING\n"); - goto out; - } - + mutex_lock(&dev->shutdown_lock); result = nvme_pci_enable(dev); if (result) - goto out; + goto out_unlock; result = nvme_pci_configure_admin_queue(dev); if (result) - goto out; + goto out_unlock; result = nvme_alloc_admin_tags(dev); if (result) - goto out; + goto out_unlock; /* * Limit the max command size to prevent iod->sg allocations going @@ -2555,6 +2530,17 @@ static void nvme_reset_work(struct work_struct *work) */ dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; dev->ctrl.max_segments = NVME_MAX_SEGS; + mutex_unlock(&dev->shutdown_lock); + + /* + * Introduce CONNECTING state from nvme-fc/rdma transports to mark the + * initializing procedure here. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller CONNECTING\n"); + goto out; + } result = nvme_init_identify(&dev->ctrl); if (result) @@ -2619,6 +2605,8 @@ static void nvme_reset_work(struct work_struct *work) nvme_start_ctrl(&dev->ctrl); return; + out_unlock: + mutex_unlock(&dev->shutdown_lock); out: nvme_remove_dead_ctrl(dev, result); } @@ -2946,8 +2934,11 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_MEDIUM_PRIO_SQ }, + { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */ + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ - .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, + .driver_data = NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ @@ -2988,6 +2979,7 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { + BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); return pci_register_driver(&nvme_driver); } |