From 195cc354696d75e9625cf303a0791404b3215501 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 1 Oct 2019 13:40:24 +0200 Subject: pci: pass along the return value of dma_memory_rw Some devices might want to know the return value of dma_memory_rw, so pass it along instead of ignoring it. There are no existing users of the return value, so this patch should be safe. Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Acked-by: Keith Busch --- include/hw/pci/pci.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 0a59a06b14..f19ffe6b4f 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -783,8 +783,7 @@ static inline AddressSpace *pci_get_address_space(PCIDevice *dev) static inline int pci_dma_rw(PCIDevice *dev, dma_addr_t addr, void *buf, dma_addr_t len, DMADirection dir) { - dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir); - return 0; + return dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir); } static inline int pci_dma_read(PCIDevice *dev, dma_addr_t addr, -- cgit v1.2.3-55-g7522 From cba0a8a344fea94aa2212e105611b8e099343cb1 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 12 Apr 2019 20:53:16 +0200 Subject: hw/block/nvme: add support for scatter gather lists For now, support the Data Block, Segment and Last Segment descriptor types. See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)"). Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 329 +++++++++++++++++++++++++++++++++++++++++--------- hw/block/trace-events | 4 + include/block/nvme.h | 6 +- 3 files changed, 279 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c0f1f8ccd4..63d0a17bc5 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -413,13 +413,262 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, return NVME_SUCCESS; } -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, - uint64_t prp1, uint64_t prp2, DMADirection dir, +/* + * Map 'nsgld' data descriptors from 'segment'. The function will subtract the + * number of bytes mapped in len. + */ +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, + QEMUIOVector *iov, + NvmeSglDescriptor *segment, uint64_t nsgld, + size_t *len, NvmeRequest *req) +{ + dma_addr_t addr, trans_len; + uint32_t dlen; + uint16_t status; + + for (int i = 0; i < nsgld; i++) { + uint8_t type = NVME_SGL_TYPE(segment[i].type); + + switch (type) { + case NVME_SGL_DESCR_TYPE_DATA_BLOCK: + break; + case NVME_SGL_DESCR_TYPE_SEGMENT: + case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: + return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR; + default: + return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR; + } + + dlen = le32_to_cpu(segment[i].len); + if (!dlen) { + continue; + } + + if (*len == 0) { + /* + * All data has been mapped, but the SGL contains additional + * segments and/or descriptors. The controller might accept + * ignoring the rest of the SGL. + */ + uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls); + if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) { + break; + } + + trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req)); + return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; + } + + trans_len = MIN(*len, dlen); + addr = le64_to_cpu(segment[i].addr); + + if (UINT64_MAX - addr < dlen) { + return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; + } + + status = nvme_map_addr(n, qsg, iov, addr, trans_len); + if (status) { + return status; + } + + *len -= trans_len; + } + + return NVME_SUCCESS; +} + +static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, + NvmeSglDescriptor sgl, size_t len, NvmeRequest *req) +{ + /* + * Read the segment in chunks of 256 descriptors (one 4k page) to avoid + * dynamically allocating a potentially huge SGL. The spec allows the SGL + * to be larger (as in number of bytes required to describe the SGL + * descriptors and segment chain) than the command transfer size, so it is + * not bounded by MDTS. + */ + const int SEG_CHUNK_SIZE = 256; + + NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld; + uint64_t nsgld; + uint32_t seg_len; + uint16_t status; + bool sgl_in_cmb = false; + hwaddr addr; + int ret; + + sgld = &sgl; + addr = le64_to_cpu(sgl.addr); + + trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len); + + /* + * If the entire transfer can be described with a single data block it can + * be mapped directly. + */ + if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { + status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req); + if (status) { + goto unmap; + } + + goto out; + } + + /* + * If the segment is located in the CMB, the submission queue of the + * request must also reside there. + */ + if (nvme_addr_is_cmb(n, addr)) { + if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) { + return NVME_INVALID_USE_OF_CMB | NVME_DNR; + } + + sgl_in_cmb = true; + } + + for (;;) { + switch (NVME_SGL_TYPE(sgld->type)) { + case NVME_SGL_DESCR_TYPE_SEGMENT: + case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: + break; + default: + return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; + } + + seg_len = le32_to_cpu(sgld->len); + + /* check the length of the (Last) Segment descriptor */ + if (!seg_len || seg_len & 0xf) { + return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; + } + + if (UINT64_MAX - addr < seg_len) { + return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; + } + + nsgld = seg_len / sizeof(NvmeSglDescriptor); + + while (nsgld > SEG_CHUNK_SIZE) { + if (nvme_addr_read(n, addr, segment, sizeof(segment))) { + trace_pci_nvme_err_addr_read(addr); + status = NVME_DATA_TRAS_ERROR; + goto unmap; + } + + status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE, + &len, req); + if (status) { + goto unmap; + } + + nsgld -= SEG_CHUNK_SIZE; + addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor); + } + + ret = nvme_addr_read(n, addr, segment, nsgld * + sizeof(NvmeSglDescriptor)); + if (ret) { + trace_pci_nvme_err_addr_read(addr); + status = NVME_DATA_TRAS_ERROR; + goto unmap; + } + + last_sgld = &segment[nsgld - 1]; + + /* if the segment ends with a Data Block, then we are done */ + if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { + status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req); + if (status) { + goto unmap; + } + + goto out; + } + + /* + * If the last descriptor was not a Data Block, then the current + * segment must not be a Last Segment. + */ + if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) { + status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; + goto unmap; + } + + sgld = last_sgld; + addr = le64_to_cpu(sgld->addr); + + /* + * Do not map the last descriptor; it will be a Segment or Last Segment + * descriptor and is handled by the next iteration. + */ + status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req); + if (status) { + goto unmap; + } + + /* + * If the next segment is in the CMB, make sure that the sgl was + * already located there. + */ + if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) { + status = NVME_INVALID_USE_OF_CMB | NVME_DNR; + goto unmap; + } + } + +out: + /* if there is any residual left in len, the SGL was too short */ + if (len) { + status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR; + goto unmap; + } + + return NVME_SUCCESS; + +unmap: + if (iov->iov) { + qemu_iovec_destroy(iov); + } + + if (qsg->sg) { + qemu_sglist_destroy(qsg); + } + + return status; +} + +static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) +{ + uint64_t prp1, prp2; + + switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) { + case NVME_PSDT_PRP: + prp1 = le64_to_cpu(req->cmd.dptr.prp1); + prp2 = le64_to_cpu(req->cmd.dptr.prp2); + + return nvme_map_prp(n, prp1, prp2, len, req); + case NVME_PSDT_SGL_MPTR_CONTIGUOUS: + case NVME_PSDT_SGL_MPTR_SGL: + /* SGLs shall not be used for Admin commands in NVMe over PCIe */ + if (!req->sq->sqid) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len, + req); + default: + return NVME_INVALID_FIELD; + } +} + +static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len, + DMADirection dir, NvmeRequest *req) { uint16_t status = NVME_SUCCESS; - status = nvme_map_prp(n, prp1, prp2, len, req); + status = nvme_map_dptr(n, len, req); if (status) { return status; } @@ -458,15 +707,6 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return status; } -static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) -{ - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - - return nvme_map_prp(n, prp1, prp2, len, req); -} - static void nvme_post_cqes(void *opaque) { NvmeCQueue *cq = opaque; @@ -929,10 +1169,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t off, NvmeRequest *req) { - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - uint32_t nsid = le32_to_cpu(cmd->nsid); + uint32_t nsid = le32_to_cpu(req->cmd.nsid); uint32_t trans_len; time_t current_ms; @@ -981,17 +1218,14 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, nvme_clear_events(n, NVME_AER_TYPE_SMART); } - return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *) &smart + off, trans_len, + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, NvmeRequest *req) { uint32_t trans_len; - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); NvmeFwSlotInfoLog fw_log = { .afi = 0x1, }; @@ -1004,17 +1238,14 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, trans_len = MIN(sizeof(fw_log) - off, buf_len); - return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len, + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t off, NvmeRequest *req) { uint32_t trans_len; - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); NvmeErrorLog errlog; if (!rae) { @@ -1029,8 +1260,8 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, trans_len = MIN(sizeof(errlog) - off, buf_len); - return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *)&errlog, trans_len, + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) @@ -1190,14 +1421,10 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { - NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - uint64_t prp1 = le64_to_cpu(c->prp1); - uint64_t prp2 = le64_to_cpu(c->prp2); - trace_pci_nvme_identify_ctrl(); - return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) @@ -1205,8 +1432,6 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); - uint64_t prp1 = le64_to_cpu(c->prp1); - uint64_t prp2 = le64_to_cpu(c->prp2); trace_pci_nvme_identify_ns(nsid); @@ -1217,8 +1442,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) ns = &n->namespaces[nsid - 1]; - return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) @@ -1226,8 +1451,6 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) NvmeIdentify *c = (NvmeIdentify *)&req->cmd; static const int data_len = NVME_IDENTIFY_DATA_SIZE; uint32_t min_nsid = le32_to_cpu(c->nsid); - uint64_t prp1 = le64_to_cpu(c->prp1); - uint64_t prp2 = le64_to_cpu(c->prp2); uint32_t *list; uint16_t ret; int i, j = 0; @@ -1254,8 +1477,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) break; } } - ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE, + req); g_free(list); return ret; } @@ -1264,8 +1487,6 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); - uint64_t prp1 = le64_to_cpu(c->prp1); - uint64_t prp2 = le64_to_cpu(c->prp2); uint8_t list[NVME_IDENTIFY_DATA_SIZE]; @@ -1297,8 +1518,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN; stl_be_p(&ns_descrs->uuid.v, nsid); - return nvme_dma_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE, + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) @@ -1369,14 +1590,10 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n) static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) { - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - uint64_t timestamp = nvme_get_timestamp(n); - return nvme_dma_prp(n, (uint8_t *)×tamp, sizeof(timestamp), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp), + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) @@ -1505,12 +1722,9 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) { uint16_t ret; uint64_t timestamp; - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - ret = nvme_dma_prp(n, (uint8_t *)×tamp, sizeof(timestamp), prp1, - prp2, DMA_DIRECTION_TO_DEVICE, req); + ret = nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp), + DMA_DIRECTION_TO_DEVICE, req); if (ret != NVME_SUCCESS) { return ret; } @@ -2437,6 +2651,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->nn = cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | NVME_ONCS_FEATURES); + id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN); subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial); strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0'); diff --git a/hw/block/trace-events b/hw/block/trace-events index 9a6c5fb3dd..22ea635144 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -36,6 +36,7 @@ pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d" +pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" '%s' nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" @@ -91,6 +92,9 @@ pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_cfs(void) "controller fatal status" pci_nvme_err_aio(uint16_t cid, const char *errname, uint16_t status) "cid %"PRIu16" err '%s' status 0x%"PRIx16"" +pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" +pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" +pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16"" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 65e68a82c8..58647bcdad 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -412,9 +412,9 @@ typedef union NvmeCmdDptr { } NvmeCmdDptr; enum NvmePsdt { - PSDT_PRP = 0x0, - PSDT_SGL_MPTR_CONTIGUOUS = 0x1, - PSDT_SGL_MPTR_SGL = 0x2, + NVME_PSDT_PRP = 0x0, + NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1, + NVME_PSDT_SGL_MPTR_SGL = 0x2, }; typedef struct QEMU_PACKED NvmeCmd { -- cgit v1.2.3-55-g7522 From c1e18246618b3401ba1769bf88d2bcdf49e947aa Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 13 Jan 2020 19:12:50 +0100 Subject: pci: allocate pci id for nvme The emulated nvme device (hw/block/nvme.c) is currently using an internal Intel device id. Prepare to change that by allocating a device id under the 1b36 (Red Hat, Inc.) vendor id. Signed-off-by: Klaus Jensen Acked-by: Gerd Hoffmann Reviewed-by: Maxim Levitsky Reviewed-by: Keith Busch --- MAINTAINERS | 1 + docs/specs/nvme.txt | 23 +++++++++++++++++++++++ docs/specs/pci-ids.txt | 1 + include/hw/pci/pci.h | 1 + 4 files changed, 26 insertions(+) create mode 100644 docs/specs/nvme.txt (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index ef6f5c7399..9e215088ce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1877,6 +1877,7 @@ L: qemu-block@nongnu.org S: Supported F: hw/block/nvme* F: tests/qtest/nvme-test.c +F: docs/specs/nvme.txt T: git git://git.infradead.org/qemu-nvme.git nvme-next megasas diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt new file mode 100644 index 0000000000..56d393884e --- /dev/null +++ b/docs/specs/nvme.txt @@ -0,0 +1,23 @@ +NVM Express Controller +====================== + +The nvme device (-device nvme) emulates an NVM Express Controller. + + +Reference Specifications +------------------------ + +The device currently implements most mandatory features of NVMe v1.3d, see + + https://nvmexpress.org/resources/specifications/ + +for the specification. + + +Known issues +------------ + +* The accounting numbers in the SMART/Health are reset across power cycles + +* Interrupt Coalescing is not supported and is disabled by default in volation + of the specification. diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt index 4d53e5c7d9..abbdbca6be 100644 --- a/docs/specs/pci-ids.txt +++ b/docs/specs/pci-ids.txt @@ -63,6 +63,7 @@ PCI devices (other than virtio): 1b36:000b PCIe Expander Bridge (-device pxb-pcie) 1b36:000d PCI xhci usb host adapter 1b36:000f mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c +1b36:0010 PCIe NVMe device (-device nvme) All these devices are documented in docs/specs. diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index f19ffe6b4f..72ce649eee 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -106,6 +106,7 @@ extern bool pci_available; #define PCI_DEVICE_ID_REDHAT_XHCI 0x000d #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e #define PCI_DEVICE_ID_REDHAT_MDPY 0x000f +#define PCI_DEVICE_ID_REDHAT_NVME 0x0010 #define PCI_DEVICE_ID_REDHAT_QXL 0x0100 #define FMT_PCIBUS PRIx64 -- cgit v1.2.3-55-g7522 From 2fbbecc5cd90ec00027a155f7044f2f70ed84f30 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 30 Sep 2020 10:15:50 -0700 Subject: hw/block/nvme: support per-namespace smart log Let the user specify a specific namespace if they want to get access stats for a specific namespace. Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 63 ++++++++++++++++++++++++++++++++-------------------- include/block/nvme.h | 1 + 2 files changed, 40 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/hw/block/nvme.c b/hw/block/nvme.c index aa725d1141..5a9ae699af 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1164,48 +1164,63 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) return NVME_SUCCESS; } +struct nvme_stats { + uint64_t units_read; + uint64_t units_written; + uint64_t read_commands; + uint64_t write_commands; +}; + +static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats) +{ + BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); + + stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; + stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; + stats->read_commands += s->nr_ops[BLOCK_ACCT_READ]; + stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; +} + static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t off, NvmeRequest *req) { uint32_t nsid = le32_to_cpu(req->cmd.nsid); - + struct nvme_stats stats = { 0 }; + NvmeSmartLog smart = { 0 }; uint32_t trans_len; + NvmeNamespace *ns; time_t current_ms; - uint64_t units_read = 0, units_written = 0; - uint64_t read_commands = 0, write_commands = 0; - NvmeSmartLog smart; - - if (nsid && nsid != 0xffffffff) { - return NVME_INVALID_FIELD | NVME_DNR; - } if (off >= sizeof(smart)) { return NVME_INVALID_FIELD | NVME_DNR; } - for (int i = 1; i <= n->num_namespaces; i++) { - NvmeNamespace *ns = nvme_ns(n, i); + if (nsid != 0xffffffff) { + ns = nvme_ns(n, nsid); if (!ns) { - continue; + return NVME_INVALID_NSID | NVME_DNR; } + nvme_set_blk_stats(ns, &stats); + } else { + int i; - BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); - - units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; - units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; - read_commands += s->nr_ops[BLOCK_ACCT_READ]; - write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + nvme_set_blk_stats(ns, &stats); + } } trans_len = MIN(sizeof(smart) - off, buf_len); - memset(&smart, 0x0, sizeof(smart)); - - smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(units_read, 1000)); - smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(units_written, + smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, + 1000)); + smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written, 1000)); - smart.host_read_commands[0] = cpu_to_le64(read_commands); - smart.host_write_commands[0] = cpu_to_le64(write_commands); + smart.host_read_commands[0] = cpu_to_le64(stats.read_commands); + smart.host_write_commands[0] = cpu_to_le64(stats.write_commands); smart.temperature = cpu_to_le16(n->temperature); @@ -2703,7 +2718,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->acl = 3; id->aerl = n->params.aerl; id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; - id->lpa = NVME_LPA_EXTENDED; + id->lpa = NVME_LPA_NS_SMART | NVME_LPA_EXTENDED; /* recommended default value (~70 C) */ id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); diff --git a/include/block/nvme.h b/include/block/nvme.h index 58647bcdad..868cf53f0b 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -849,6 +849,7 @@ enum NvmeIdCtrlFrmw { }; enum NvmeIdCtrlLpa { + NVME_LPA_NS_SMART = 1 << 0, NVME_LPA_EXTENDED = 1 << 2, }; -- cgit v1.2.3-55-g7522 From 492f9a8d79f2e815007e985cad8dd73b713722f0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 30 Sep 2020 10:54:05 -0700 Subject: hw/block/nvme: validate command set selected Fail to start the controller if the user requests a command set that the controller does not support. Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 6 +++++- hw/block/trace-events | 1 + include/block/nvme.h | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5a9ae699af..94db06cf72 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2045,6 +2045,10 @@ static int nvme_start_ctrl(NvmeCtrl *n) trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq); return -1; } + if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) { + trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc)); + return -1; + } if (unlikely(NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap))) { trace_pci_nvme_err_startfail_page_too_small( @@ -2746,7 +2750,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); - NVME_CAP_SET_CSS(n->bar.cap, 1); + NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); n->bar.vs = NVME_SPEC_VER; diff --git a/hw/block/trace-events b/hw/block/trace-events index e56d688b88..7b28091bd6 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -132,6 +132,7 @@ pci_nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_ pci_nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u" pci_nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u" pci_nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u" +pci_nvme_err_startfail_css(uint8_t css) "nvme_start_ctrl failed because invalid command set selected:%u" pci_nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero" pci_nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero" pci_nvme_err_startfail(void) "setting controller enable bit failed" diff --git a/include/block/nvme.h b/include/block/nvme.h index 868cf53f0b..bc20a2ba5e 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -82,6 +82,10 @@ enum NvmeCapMask { #define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\ << CAP_PMR_SHIFT) +enum NvmeCapCss { + NVME_CAP_CSS_NVM = 1 << 0, +}; + enum NvmeCcShift { CC_EN_SHIFT = 0, CC_CSS_SHIFT = 4, -- cgit v1.2.3-55-g7522 From 8c5cea85934eb7b580ced14f7f188e19880d4c1c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 30 Sep 2020 10:58:03 -0700 Subject: hw/block/nvme: support for admin-only command set Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 1 + include/block/nvme.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 94db06cf72..c1323ca869 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2751,6 +2751,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM); + NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); n->bar.vs = NVME_SPEC_VER; diff --git a/include/block/nvme.h b/include/block/nvme.h index bc20a2ba5e..521533fd2a 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -83,7 +83,8 @@ enum NvmeCapMask { << CAP_PMR_SHIFT) enum NvmeCapCss { - NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_ADMIN_ONLY = 1 << 7, }; enum NvmeCcShift { -- cgit v1.2.3-55-g7522 From 1b48e4611a7a3ee3065d3bb8428f5f6acb5232fe Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 30 Sep 2020 01:19:07 +0200 Subject: hw/block/nvme: reject io commands if only admin command set selected If the host sets CC.CSS to 111b, all commands submitted to I/O queues should be completed with status Invalid Command Opcode. Note that this is technically a v1.4 feature, but it does not hurt to implement before we finally bump the reported version implemented. Reviewed-by: Dmitry Fomichev Signed-off-by: Klaus Jensen Signed-off-by: Keith Busch --- hw/block/nvme.c | 4 ++++ include/block/nvme.h | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c1323ca869..32c35fe587 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1026,6 +1026,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); + if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) { + return NVME_INVALID_OPCODE | NVME_DNR; + } + if (!nvme_nsid_valid(n, nsid)) { return NVME_INVALID_NSID | NVME_DNR; } diff --git a/include/block/nvme.h b/include/block/nvme.h index 521533fd2a..6de2d5aa75 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -115,6 +115,11 @@ enum NvmeCcMask { #define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) #define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) +enum NvmeCcCss { + NVME_CC_CSS_NVM = 0x0, + NVME_CC_CSS_ADMIN_ONLY = 0x7, +}; + enum NvmeCstsShift { CSTS_RDY_SHIFT = 0, CSTS_CFS_SHIFT = 1, -- cgit v1.2.3-55-g7522 From 28fee5b5d02d59a2b039c71a0a72292b1bc7f75b Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 19 Oct 2020 12:41:31 +0530 Subject: hw/block/nvme: fix prp mapping status codes Address 0 is not an invalid address. Remove those invalikd checks. Unaligned PRP2 and PRP list entries should result in Invalid PRP Offset status code and not Invalid Field. Fix that. See NVMe Express v1.3d, Section 4.3 ("Physical Region Page Entry and List"). Suggested-by: Keith Busch Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 20 +++++--------------- hw/block/trace-events | 4 +--- include/block/nvme.h | 1 + 3 files changed, 7 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b8c6be6318..2896bb49b9 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -327,11 +327,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); - if (unlikely(!prp1)) { - trace_pci_nvme_err_invalid_prp(); - return NVME_INVALID_FIELD | NVME_DNR; - } - if (nvme_addr_is_cmb(n, prp1)) { qemu_iovec_init(iov, num_prps); } else { @@ -345,11 +340,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, len -= trans_len; if (len) { - if (unlikely(!prp2)) { - trace_pci_nvme_err_invalid_prp2_missing(); - return NVME_INVALID_FIELD | NVME_DNR; - } - if (len > n->page_size) { uint64_t prp_list[n->max_prp_ents]; uint32_t nents, prp_trans; @@ -370,9 +360,9 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, uint64_t prp_ent = le64_to_cpu(prp_list[i]); if (i == n->max_prp_ents - 1 && len > n->page_size) { - if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { + if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_PRP_OFFSET | NVME_DNR; } if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) { @@ -391,9 +381,9 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, prp_ent = le64_to_cpu(prp_list[i]); } - if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { + if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_PRP_OFFSET | NVME_DNR; } trans_len = MIN(len, n->page_size); @@ -408,7 +398,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, } else { if (unlikely(prp2 & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prp2_align(prp2); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_PRP_OFFSET | NVME_DNR; } status = nvme_map_addr(n, qsg, iov, prp2, len); if (status) { diff --git a/hw/block/trace-events b/hw/block/trace-events index cab9913b1f..c1537e3ac0 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -97,10 +97,8 @@ pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRI pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16"" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" -pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64"" +pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" -pci_nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred" -pci_nvme_err_invalid_prp(void) "invalid PRP" pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 6de2d5aa75..8a46d9cf01 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -655,6 +655,7 @@ enum NvmeStatusCodes { NVME_MD_SGL_LEN_INVALID = 0x0010, NVME_SGL_DESCR_TYPE_INVALID = 0x0011, NVME_INVALID_USE_OF_CMB = 0x0012, + NVME_INVALID_PRP_OFFSET = 0x0013, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, -- cgit v1.2.3-55-g7522