diff options
author | Peter Maydell | 2021-03-10 21:11:33 +0100 |
---|---|---|
committer | Peter Maydell | 2021-03-10 21:11:33 +0100 |
commit | d689ecad073e0289afa8ca863e45879d719e5c21 (patch) | |
tree | 2191d97d19a56c86b775a165232d9afc727d8850 /hw/block | |
parent | Merge remote-tracking branch 'remotes/thuth-gitlab/tags/pull-request-2021-03-... (diff) | |
parent | hw/block/nvme: support Identify NS Attached Controller List (diff) | |
download | qemu-d689ecad073e0289afa8ca863e45879d719e5c21.tar.gz qemu-d689ecad073e0289afa8ca863e45879d719e5c21.tar.xz qemu-d689ecad073e0289afa8ca863e45879d719e5c21.zip |
Merge remote-tracking branch 'remotes/nvme/tags/nvme-next-pull-request' into staging
hw/block/nvme updates
* NVMe subsystem support (`-device nvme-subsys`) (Minwoo Im)
* Namespace (De|At)tachment support (Minwoo Im)
* Simple Copy command support (Klaus Jensen)
* Flush broadcast support (Gollu Appalanaidu)
* QEMUIOVector/QEMUSGList duality refactoring (Klaus Jensen)
plus various fixes from Minwoo, Gollu, Dmitry and me.
v2:
- add `nqn` nvme-subsys device parameter instead of using `id`.
(Paolo)
# gpg: Signature made Tue 09 Mar 2021 11:44:17 GMT
# gpg: using RSA key 522833AA75E2DCE6A24766C04DE1AF316D4F0DE9
# gpg: Good signature from "Klaus Jensen <its@irrelevant.dk>" [unknown]
# gpg: aka "Klaus Jensen <k.jensen@samsung.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DDCA 4D9C 9EF9 31CC 3468 4272 63D5 6FC5 E55D A838
# Subkey fingerprint: 5228 33AA 75E2 DCE6 A247 66C0 4DE1 AF31 6D4F 0DE9
* remotes/nvme/tags/nvme-next-pull-request: (38 commits)
hw/block/nvme: support Identify NS Attached Controller List
hw/block/nvme: support changed namespace asynchronous event
hw/block/nvme: support namespace attachment command
hw/block/nvme: refactor nvme_select_ns_iocs
hw/block/nvme: support allocated namespace type
hw/block/nvme: fix allocated namespace list to 256
hw/block/nvme: fix namespaces array to 1-based
hw/block/nvme: support namespace detach
hw/block/nvme: refactor nvme_dma
hw/block/nvme: remove the req dependency in map functions
hw/block/nvme: try to deal with the iov/qsg duality
hw/block/nvme: fix strerror printing
hw/block/nvme: remove block accounting for write zeroes
hw/block/nvme: remove redundant len member in compare context
hw/block/nvme: report non-mdts command size limit for dsm
hw/block/nvme: add trace event for zone read check
hw/block/nvme: fix potential compilation error
hw/block/nvme: add identify trace event
hw/block/nvme: remove unnecessary endian conversion
hw/block/nvme: align zoned.zasl with mdts
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'hw/block')
-rw-r--r-- | hw/block/meson.build | 2 | ||||
-rw-r--r-- | hw/block/nvme-ns.c | 38 | ||||
-rw-r--r-- | hw/block/nvme-ns.h | 13 | ||||
-rw-r--r-- | hw/block/nvme-subsys.c | 116 | ||||
-rw-r--r-- | hw/block/nvme-subsys.h | 60 | ||||
-rw-r--r-- | hw/block/nvme.c | 1439 | ||||
-rw-r--r-- | hw/block/nvme.h | 63 | ||||
-rw-r--r-- | hw/block/trace-events | 21 |
8 files changed, 1359 insertions, 393 deletions
diff --git a/hw/block/meson.build b/hw/block/meson.build index 4bf994c64f..5492829155 100644 --- a/hw/block/meson.build +++ b/hw/block/meson.build @@ -13,7 +13,7 @@ softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c')) softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c')) softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c')) softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c')) -softmmu_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('nvme.c', 'nvme-ns.c')) +softmmu_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('nvme.c', 'nvme-ns.c', 'nvme-subsys.c')) specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c')) specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk.c')) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 93ac6e107a..eda6a0c003 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -63,6 +63,15 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->npda = id_ns->npdg = npdg - 1; + if (nvme_ns_shared(ns)) { + id_ns->nmic |= NVME_NMIC_NS_SHARED; + } + + /* simple copy */ + id_ns->mssrl = cpu_to_le16(ns->params.mssrl); + id_ns->mcl = cpu_to_le32(ns->params.mcl); + id_ns->msrc = ns->params.msrc; + return 0; } @@ -154,6 +163,18 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) return -1; } + if (ns->params.max_active_zones) { + if (ns->params.max_open_zones > ns->params.max_active_zones) { + error_setg(errp, "max_open_zones (%u) exceeds max_active_zones (%u)", + ns->params.max_open_zones, ns->params.max_active_zones); + return -1; + } + + if (!ns->params.max_open_zones) { + ns->params.max_open_zones = ns->params.max_active_zones; + } + } + if (ns->params.zd_extension_size) { if (ns->params.zd_extension_size & 0x3f) { error_setg(errp, @@ -363,16 +384,27 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) return; } - if (nvme_register_namespace(n, ns, errp)) { - return; + if (ns->subsys) { + if (nvme_subsys_register_ns(ns, errp)) { + return; + } + } else { + if (nvme_register_namespace(n, ns, errp)) { + return; + } } - } static Property nvme_ns_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), + DEFINE_PROP_LINK("subsys", NvmeNamespace, subsys, TYPE_NVME_SUBSYS, + NvmeSubsystem *), + DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false), DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), + DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128), + DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128), + DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127), DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false), DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs, NVME_DEFAULT_ZONE_SIZE), diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 293ac990e3..318d3aebe1 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -26,9 +26,14 @@ typedef struct NvmeZone { } NvmeZone; typedef struct NvmeNamespaceParams { + bool detached; uint32_t nsid; QemuUUID uuid; + uint16_t mssrl; + uint32_t mcl; + uint8_t msrc; + bool zoned; bool cross_zone_read; uint64_t zone_size_bs; @@ -47,6 +52,9 @@ typedef struct NvmeNamespace { const uint32_t *iocs; uint8_t csi; + NvmeSubsystem *subsys; + QTAILQ_ENTRY(NvmeNamespace) entry; + NvmeIdNsZoned *id_ns_zoned; NvmeZone *zone_array; QTAILQ_HEAD(, NvmeZone) exp_open_zones; @@ -77,6 +85,11 @@ static inline uint32_t nvme_nsid(NvmeNamespace *ns) return -1; } +static inline bool nvme_ns_shared(NvmeNamespace *ns) +{ + return !!ns->subsys; +} + static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) { NvmeIdNs *id_ns = &ns->id_ns; diff --git a/hw/block/nvme-subsys.c b/hw/block/nvme-subsys.c new file mode 100644 index 0000000000..af4804a819 --- /dev/null +++ b/hw/block/nvme-subsys.c @@ -0,0 +1,116 @@ +/* + * QEMU NVM Express Subsystem: nvme-subsys + * + * Copyright (c) 2021 Minwoo Im <minwoo.im.dev@gmail.com> + * + * This code is licensed under the GNU GPL v2. Refer COPYING. + */ + +#include "qemu/units.h" +#include "qemu/osdep.h" +#include "qemu/uuid.h" +#include "qemu/iov.h" +#include "qemu/cutils.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" +#include "hw/qdev-core.h" +#include "hw/block/block.h" +#include "block/aio.h" +#include "block/accounting.h" +#include "sysemu/sysemu.h" +#include "hw/pci/pci.h" +#include "nvme.h" +#include "nvme-subsys.h" + +int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp) +{ + NvmeSubsystem *subsys = n->subsys; + int cntlid; + + for (cntlid = 0; cntlid < ARRAY_SIZE(subsys->ctrls); cntlid++) { + if (!subsys->ctrls[cntlid]) { + break; + } + } + + if (cntlid == ARRAY_SIZE(subsys->ctrls)) { + error_setg(errp, "no more free controller id"); + return -1; + } + + subsys->ctrls[cntlid] = n; + + return cntlid; +} + +int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp) +{ + NvmeSubsystem *subsys = ns->subsys; + NvmeCtrl *n; + int i; + + if (subsys->namespaces[nvme_nsid(ns)]) { + error_setg(errp, "namespace %d already registerd to subsy %s", + nvme_nsid(ns), subsys->parent_obj.id); + return -1; + } + + subsys->namespaces[nvme_nsid(ns)] = ns; + + for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) { + n = subsys->ctrls[i]; + + if (n && nvme_register_namespace(n, ns, errp)) { + return -1; + } + } + + return 0; +} + +static void nvme_subsys_setup(NvmeSubsystem *subsys) +{ + const char *nqn = subsys->params.nqn ? + subsys->params.nqn : subsys->parent_obj.id; + + snprintf((char *)subsys->subnqn, sizeof(subsys->subnqn), + "nqn.2019-08.org.qemu:%s", nqn); +} + +static void nvme_subsys_realize(DeviceState *dev, Error **errp) +{ + NvmeSubsystem *subsys = NVME_SUBSYS(dev); + + nvme_subsys_setup(subsys); +} + +static Property nvme_subsystem_props[] = { + DEFINE_PROP_STRING("nqn", NvmeSubsystem, params.nqn), + DEFINE_PROP_END_OF_LIST(), +}; + +static void nvme_subsys_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + + dc->realize = nvme_subsys_realize; + dc->desc = "Virtual NVMe subsystem"; + + device_class_set_props(dc, nvme_subsystem_props); +} + +static const TypeInfo nvme_subsys_info = { + .name = TYPE_NVME_SUBSYS, + .parent = TYPE_DEVICE, + .class_init = nvme_subsys_class_init, + .instance_size = sizeof(NvmeSubsystem), +}; + +static void nvme_subsys_register_types(void) +{ + type_register_static(&nvme_subsys_info); +} + +type_init(nvme_subsys_register_types) diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h new file mode 100644 index 0000000000..fb66ae752a --- /dev/null +++ b/hw/block/nvme-subsys.h @@ -0,0 +1,60 @@ +/* + * QEMU NVM Express Subsystem: nvme-subsys + * + * Copyright (c) 2021 Minwoo Im <minwoo.im.dev@gmail.com> + * + * This code is licensed under the GNU GPL v2. Refer COPYING. + */ + +#ifndef NVME_SUBSYS_H +#define NVME_SUBSYS_H + +#define TYPE_NVME_SUBSYS "nvme-subsys" +#define NVME_SUBSYS(obj) \ + OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS) + +#define NVME_SUBSYS_MAX_CTRLS 32 +#define NVME_SUBSYS_MAX_NAMESPACES 256 + +typedef struct NvmeCtrl NvmeCtrl; +typedef struct NvmeNamespace NvmeNamespace; +typedef struct NvmeSubsystem { + DeviceState parent_obj; + uint8_t subnqn[256]; + + NvmeCtrl *ctrls[NVME_SUBSYS_MAX_CTRLS]; + /* Allocated namespaces for this subsystem */ + NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES + 1]; + + struct { + char *nqn; + } params; +} NvmeSubsystem; + +int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp); +int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp); + +static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys, + uint32_t cntlid) +{ + if (!subsys) { + return NULL; + } + + return subsys->ctrls[cntlid]; +} + +/* + * Return allocated namespace of the specified nsid in the subsystem. + */ +static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys, + uint32_t nsid) +{ + if (!subsys) { + return NULL; + } + + return subsys->namespaces[nsid]; +} + +#endif /* NVME_SUBSYS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index fb83636abd..d439e44db8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -17,14 +17,17 @@ /** * Usage: add options: * -drive file=<file>,if=none,id=<drive_id> + * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id> * -device nvme,serial=<serial>,id=<bus_name>, \ * cmb_size_mb=<cmb_size_mb[optional]>, \ * [pmrdev=<mem_backend_file_id>,] \ * max_ioqpairs=<N[optional]>, \ - * aerl=<N[optional]>, aer_max_queued=<N[optional]>, \ - * mdts=<N[optional]>,zoned.append_size_limit=<N[optional]> \ + * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \ + * mdts=<N[optional]>,zoned.zasl=<N[optional]>, \ + * subsys=<subsys_id> * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\ - * zoned=<true|false[optional]> + * zoned=<true|false[optional]>, \ + * subsys=<subsys_id>,detached=<true|false[optional]> * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the @@ -38,9 +41,27 @@ * * The PMR will use BAR 4/5 exclusively. * + * To place controller(s) and namespace(s) to a subsystem, then provide + * nvme-subsys device as above. + * + * nvme subsystem device parameters + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * - `nqn` + * This parameter provides the `<nqn_id>` part of the string + * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field + * of subsystem controllers. Note that `<nqn_id>` should be unique per + * subsystem, but this is not enforced by QEMU. If not specified, it will + * default to the value of the `id` parameter (`<subsys_id>`). * * nvme device parameters * ~~~~~~~~~~~~~~~~~~~~~~ + * - `subsys` + * Specifying this parameter attaches the controller to the subsystem and + * the SUBNQN field in the controller will report the NQN of the subsystem + * device. This also enables multi controller capability represented in + * Identify Controller data structure in CMIC (Controller Multi-path I/O and + * Namesapce Sharing Capabilities). + * * - `aerl` * The Asynchronous Event Request Limit (AERL). Indicates the maximum number * of concurrently outstanding Asynchronous Event Request commands support @@ -51,13 +72,31 @@ * completion when there are no outstanding AERs. When the maximum number of * enqueued events are reached, subsequent events will be dropped. * - * - `zoned.append_size_limit` - * The maximum I/O size in bytes that is allowed in Zone Append command. - * The default is 128KiB. Since internally this this value is maintained as - * ZASL = log2(<maximum append size> / <page size>), some values assigned - * to this property may be rounded down and result in a lower maximum ZA - * data size being in effect. By setting this property to 0, users can make - * ZASL to be equal to MDTS. This property only affects zoned namespaces. + * - `mdts` + * Indicates the maximum data transfer size for a command that transfers data + * between host-accessible memory and the controller. The value is specified + * as a power of two (2^n) and is in units of the minimum memory page size + * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB). + * + * - `zoned.zasl` + * Indicates the maximum data transfer size for the Zone Append command. Like + * `mdts`, the value is specified as a power of two (2^n) and is in units of + * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e. + * defaulting to the value of `mdts`). + * + * nvme namespace device parameters + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * - `subsys` + * If given, the namespace will be attached to all controllers in the + * subsystem. Otherwise, `bus` must be given to attach this namespace to a + * specific controller as a non-shared namespace. + * + * - `detached` + * This parameter is only valid together with the `subsys` parameter. If left + * at the default value (`false/off`), the namespace will be attached to all + * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the + * namespace will be be available in the subsystem not not attached to any + * controllers. * * Setting `zoned` to true selects Zoned Command Set at the namespace. * In this case, the following namespace properties are available to configure @@ -157,6 +196,7 @@ static const uint32_t nvme_cse_acs[256] = { [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC, }; static const uint32_t nvme_cse_iocs_none[256]; @@ -167,6 +207,7 @@ static const uint32_t nvme_cse_iocs_nvm[256] = { [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, }; @@ -176,6 +217,7 @@ static const uint32_t nvme_cse_iocs_zoned[256] = { [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, @@ -407,15 +449,31 @@ static void nvme_req_clear(NvmeRequest *req) req->status = NVME_SUCCESS; } -static void nvme_req_exit(NvmeRequest *req) +static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma) { - if (req->qsg.sg) { - qemu_sglist_destroy(&req->qsg); + if (dma) { + pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0); + sg->flags = NVME_SG_DMA; + } else { + qemu_iovec_init(&sg->iov, 0); } - if (req->iov.iov) { - qemu_iovec_destroy(&req->iov); + sg->flags |= NVME_SG_ALLOC; +} + +static inline void nvme_sg_unmap(NvmeSg *sg) +{ + if (!(sg->flags & NVME_SG_ALLOC)) { + return; } + + if (sg->flags & NVME_SG_DMA) { + qemu_sglist_destroy(&sg->qsg); + } else { + qemu_iovec_destroy(&sg->iov); + } + + memset(sg, 0x0, sizeof(*sg)); } static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, @@ -452,8 +510,7 @@ static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, return NVME_SUCCESS; } -static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, - hwaddr addr, size_t len) +static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len) { bool cmb = false, pmr = false; @@ -470,40 +527,33 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, } if (cmb || pmr) { - if (qsg && qsg->sg) { + if (sg->flags & NVME_SG_DMA) { return NVME_INVALID_USE_OF_CMB | NVME_DNR; } - assert(iov); - - if (!iov->iov) { - qemu_iovec_init(iov, 1); - } - if (cmb) { - return nvme_map_addr_cmb(n, iov, addr, len); + return nvme_map_addr_cmb(n, &sg->iov, addr, len); } else { - return nvme_map_addr_pmr(n, iov, addr, len); + return nvme_map_addr_pmr(n, &sg->iov, addr, len); } } - if (iov && iov->iov) { + if (!(sg->flags & NVME_SG_DMA)) { return NVME_INVALID_USE_OF_CMB | NVME_DNR; } - assert(qsg); - - if (!qsg->sg) { - pci_dma_sglist_init(qsg, &n->parent_obj, 1); - } - - qemu_sglist_add(qsg, addr, len); + qemu_sglist_add(&sg->qsg, addr, len); return NVME_SUCCESS; } -static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, - uint32_t len, NvmeRequest *req) +static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr) +{ + return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr)); +} + +static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, + uint64_t prp2, uint32_t len) { hwaddr trans_len = n->page_size - (prp1 % n->page_size); trans_len = MIN(len, trans_len); @@ -511,20 +561,13 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, uint16_t status; int ret; - QEMUSGList *qsg = &req->qsg; - QEMUIOVector *iov = &req->iov; - trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); - if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) { - qemu_iovec_init(iov, num_prps); - } else { - pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); - } + nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1)); - status = nvme_map_addr(n, qsg, iov, prp1, trans_len); + status = nvme_map_addr(n, sg, prp1, trans_len); if (status) { - return status; + goto unmap; } len -= trans_len; @@ -539,7 +582,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); if (ret) { trace_pci_nvme_err_addr_read(prp2); - return NVME_DATA_TRAS_ERROR; + status = NVME_DATA_TRAS_ERROR; + goto unmap; } while (len != 0) { uint64_t prp_ent = le64_to_cpu(prp_list[i]); @@ -547,7 +591,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, if (i == n->max_prp_ents - 1 && len > n->page_size) { if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - return NVME_INVALID_PRP_OFFSET | NVME_DNR; + status = NVME_INVALID_PRP_OFFSET | NVME_DNR; + goto unmap; } i = 0; @@ -557,20 +602,22 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, prp_trans); if (ret) { trace_pci_nvme_err_addr_read(prp_ent); - return NVME_DATA_TRAS_ERROR; + status = NVME_DATA_TRAS_ERROR; + goto unmap; } prp_ent = le64_to_cpu(prp_list[i]); } if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - return NVME_INVALID_PRP_OFFSET | NVME_DNR; + status = NVME_INVALID_PRP_OFFSET | NVME_DNR; + goto unmap; } trans_len = MIN(len, n->page_size); - status = nvme_map_addr(n, qsg, iov, prp_ent, trans_len); + status = nvme_map_addr(n, sg, prp_ent, trans_len); if (status) { - return status; + goto unmap; } len -= trans_len; @@ -579,26 +626,30 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, } else { if (unlikely(prp2 & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prp2_align(prp2); - return NVME_INVALID_PRP_OFFSET | NVME_DNR; + status = NVME_INVALID_PRP_OFFSET | NVME_DNR; + goto unmap; } - status = nvme_map_addr(n, qsg, iov, prp2, len); + status = nvme_map_addr(n, sg, prp2, len); if (status) { - return status; + goto unmap; } } } return NVME_SUCCESS; + +unmap: + nvme_sg_unmap(sg); + return status; } /* * Map 'nsgld' data descriptors from 'segment'. The function will subtract the * number of bytes mapped in len. */ -static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, - QEMUIOVector *iov, +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor *segment, uint64_t nsgld, - size_t *len, NvmeRequest *req) + size_t *len, NvmeCmd *cmd) { dma_addr_t addr, trans_len; uint32_t dlen; @@ -609,7 +660,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, switch (type) { case NVME_SGL_DESCR_TYPE_BIT_BUCKET: - if (req->cmd.opcode == NVME_CMD_WRITE) { + if (cmd->opcode == NVME_CMD_WRITE) { continue; } case NVME_SGL_DESCR_TYPE_DATA_BLOCK: @@ -638,7 +689,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, break; } - trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req)); + trace_pci_nvme_err_invalid_sgl_excess_length(dlen); return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; } @@ -654,7 +705,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; } - status = nvme_map_addr(n, qsg, iov, addr, trans_len); + status = nvme_map_addr(n, sg, addr, trans_len); if (status) { return status; } @@ -666,9 +717,8 @@ next: return NVME_SUCCESS; } -static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, - NvmeSglDescriptor sgl, size_t len, - NvmeRequest *req) +static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, + size_t len, NvmeCmd *cmd) { /* * Read the segment in chunks of 256 descriptors (one 4k page) to avoid @@ -689,14 +739,16 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, sgld = &sgl; addr = le64_to_cpu(sgl.addr); - trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len); + trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len); + + nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr)); /* * If the entire transfer can be described with a single data block it can * be mapped directly. */ if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { - status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req); + status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd); if (status) { goto unmap; } @@ -734,8 +786,8 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, goto unmap; } - status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE, - &len, req); + status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE, + &len, cmd); if (status) { goto unmap; } @@ -761,7 +813,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, switch (NVME_SGL_TYPE(last_sgld->type)) { case NVME_SGL_DESCR_TYPE_DATA_BLOCK: case NVME_SGL_DESCR_TYPE_BIT_BUCKET: - status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req); + status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd); if (status) { goto unmap; } @@ -788,7 +840,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, * Do not map the last descriptor; it will be a Segment or Last Segment * descriptor and is handled by the next iteration. */ - status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req); + status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd); if (status) { goto unmap; } @@ -804,83 +856,120 @@ out: return NVME_SUCCESS; unmap: - if (iov->iov) { - qemu_iovec_destroy(iov); - } - - if (qsg->sg) { - qemu_sglist_destroy(qsg); - } - + nvme_sg_unmap(sg); return status; } -static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) +static uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len, + NvmeCmd *cmd) { uint64_t prp1, prp2; - switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) { + switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) { case NVME_PSDT_PRP: - prp1 = le64_to_cpu(req->cmd.dptr.prp1); - prp2 = le64_to_cpu(req->cmd.dptr.prp2); + prp1 = le64_to_cpu(cmd->dptr.prp1); + prp2 = le64_to_cpu(cmd->dptr.prp2); - return nvme_map_prp(n, prp1, prp2, len, req); + return nvme_map_prp(n, sg, prp1, prp2, len); case NVME_PSDT_SGL_MPTR_CONTIGUOUS: case NVME_PSDT_SGL_MPTR_SGL: - /* SGLs shall not be used for Admin commands in NVMe over PCIe */ - if (!req->sq->sqid) { - return NVME_INVALID_FIELD | NVME_DNR; - } - - return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len, - req); + return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd); default: return NVME_INVALID_FIELD; } } -static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len, - DMADirection dir, NvmeRequest *req) -{ - uint16_t status = NVME_SUCCESS; +typedef enum NvmeTxDirection { + NVME_TX_DIRECTION_TO_DEVICE = 0, + NVME_TX_DIRECTION_FROM_DEVICE = 1, +} NvmeTxDirection; - status = nvme_map_dptr(n, len, req); - if (status) { - return status; - } - - /* assert that only one of qsg and iov carries data */ - assert((req->qsg.nsg > 0) != (req->iov.niov > 0)); +static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len, + NvmeTxDirection dir) +{ + assert(sg->flags & NVME_SG_ALLOC); - if (req->qsg.nsg > 0) { + if (sg->flags & NVME_SG_DMA) { uint64_t residual; - if (dir == DMA_DIRECTION_TO_DEVICE) { - residual = dma_buf_write(ptr, len, &req->qsg); + if (dir == NVME_TX_DIRECTION_TO_DEVICE) { + residual = dma_buf_write(ptr, len, &sg->qsg); } else { - residual = dma_buf_read(ptr, len, &req->qsg); + residual = dma_buf_read(ptr, len, &sg->qsg); } if (unlikely(residual)) { trace_pci_nvme_err_invalid_dma(); - status = NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_FIELD | NVME_DNR; } } else { size_t bytes; - if (dir == DMA_DIRECTION_TO_DEVICE) { - bytes = qemu_iovec_to_buf(&req->iov, 0, ptr, len); + if (dir == NVME_TX_DIRECTION_TO_DEVICE) { + bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len); } else { - bytes = qemu_iovec_from_buf(&req->iov, 0, ptr, len); + bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len); } if (unlikely(bytes != len)) { trace_pci_nvme_err_invalid_dma(); - status = NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_FIELD | NVME_DNR; } } - return status; + return NVME_SUCCESS; +} + +static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len, + NvmeRequest *req) +{ + uint16_t status; + + status = nvme_map_dptr(n, &req->sg, len, &req->cmd); + if (status) { + return status; + } + + return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE); +} + +static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len, + NvmeRequest *req) +{ + uint16_t status; + + status = nvme_map_dptr(n, &req->sg, len, &req->cmd); + if (status) { + return status; + } + + return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE); +} + +static inline void nvme_blk_read(BlockBackend *blk, int64_t offset, + BlockCompletionFunc *cb, NvmeRequest *req) +{ + assert(req->sg.flags & NVME_SG_ALLOC); + + if (req->sg.flags & NVME_SG_DMA) { + req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, + cb, req); + } else { + req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req); + } +} + +static inline void nvme_blk_write(BlockBackend *blk, int64_t offset, + BlockCompletionFunc *cb, NvmeRequest *req) +{ + assert(req->sg.flags & NVME_SG_ALLOC); + + if (req->sg.flags & NVME_SG_DMA) { + req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, + cb, req); + } else { + req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req); + } } static void nvme_post_cqes(void *opaque) @@ -913,7 +1002,7 @@ static void nvme_post_cqes(void *opaque) } QTAILQ_REMOVE(&cq->req_list, req, entry); nvme_inc_cq_tail(cq); - nvme_req_exit(req); + nvme_sg_unmap(&req->sg); QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); } if (cq->tail != cq->head) { @@ -1048,6 +1137,7 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len) uint8_t mdts = n->params.mdts; if (mdts && len > n->page_size << mdts) { + trace_pci_nvme_err_mdts(len); return NVME_INVALID_FIELD | NVME_DNR; } @@ -1129,7 +1219,7 @@ static void nvme_aio_err(NvmeRequest *req, int ret) break; } - trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status); + trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status); error_setg_errno(&local_err, -ret, "aio failed"); error_report_err(local_err); @@ -1185,9 +1275,8 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) return NVME_INTERNAL_DEV_ERROR; } -static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, - NvmeZone *zone, uint64_t slba, - uint32_t nlb) +static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone, + uint64_t slba, uint32_t nlb) { uint64_t zcap = nvme_zone_wr_boundary(zone); uint16_t status; @@ -1212,8 +1301,6 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) { - uint16_t status; - switch (nvme_get_zone_state(zone)) { case NVME_ZONE_STATE_EMPTY: case NVME_ZONE_STATE_IMPLICITLY_OPEN: @@ -1221,16 +1308,15 @@ static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) case NVME_ZONE_STATE_FULL: case NVME_ZONE_STATE_CLOSED: case NVME_ZONE_STATE_READ_ONLY: - status = NVME_SUCCESS; - break; + return NVME_SUCCESS; case NVME_ZONE_STATE_OFFLINE: - status = NVME_ZONE_OFFLINE; - break; + trace_pci_nvme_err_zone_is_offline(zone->d.zslba); + return NVME_ZONE_OFFLINE; default: assert(false); } - return status; + return NVME_INTERNAL_DEV_ERROR; } static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, @@ -1265,7 +1351,45 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, return status; } -static void nvme_auto_transition_zone(NvmeNamespace *ns) +static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone) +{ + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_FULL: + return NVME_SUCCESS; + + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fallthrough */ + case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fallthrough */ + case NVME_ZONE_STATE_EMPTY: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); + return NVME_SUCCESS; + + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone) +{ + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + return NVME_SUCCESS; + + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns) { NvmeZone *zone; @@ -1277,85 +1401,92 @@ static void nvme_auto_transition_zone(NvmeNamespace *ns) * Automatically close this implicitly open zone. */ QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); - nvme_aor_dec_open(ns); - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + nvme_zrm_close(ns, zone); } } } -static uint16_t nvme_auto_open_zone(NvmeNamespace *ns, NvmeZone *zone) +static uint16_t __nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone, + bool implicit) { - uint16_t status = NVME_SUCCESS; - uint8_t zs = nvme_get_zone_state(zone); + int act = 0; + uint16_t status; - if (zs == NVME_ZONE_STATE_EMPTY) { - nvme_auto_transition_zone(ns); - status = nvme_aor_check(ns, 1, 1); - } else if (zs == NVME_ZONE_STATE_CLOSED) { - nvme_auto_transition_zone(ns); - status = nvme_aor_check(ns, 0, 1); - } + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EMPTY: + act = 1; - return status; + /* fallthrough */ + + case NVME_ZONE_STATE_CLOSED: + nvme_zrm_auto_transition_zone(ns); + status = nvme_aor_check(ns, act, 1); + if (status) { + return status; + } + + if (act) { + nvme_aor_inc_active(ns); + } + + nvme_aor_inc_open(ns); + + if (implicit) { + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); + return NVME_SUCCESS; + } + + /* fallthrough */ + + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + if (implicit) { + return NVME_SUCCESS; + } + + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); + + /* fallthrough */ + + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + return NVME_SUCCESS; + + default: + return NVME_ZONE_INVAL_TRANSITION; + } } -static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, - bool failed) +static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone) { - NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; - NvmeZone *zone; - NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; - uint64_t slba; - uint32_t nlb; + return __nvme_zrm_open(ns, zone, true); +} - slba = le64_to_cpu(rw->slba); - nlb = le16_to_cpu(rw->nlb) + 1; - zone = nvme_get_zone_by_slba(ns, slba); +static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone) +{ + return __nvme_zrm_open(ns, zone, false); +} +static void __nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, + uint32_t nlb) +{ zone->d.wp += nlb; - if (failed) { - res->slba = 0; - } - if (zone->d.wp == nvme_zone_wr_boundary(zone)) { - switch (nvme_get_zone_state(zone)) { - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - nvme_aor_dec_open(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - nvme_aor_dec_active(ns); - /* fall through */ - case NVME_ZONE_STATE_EMPTY: - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); - /* fall through */ - case NVME_ZONE_STATE_FULL: - break; - default: - assert(false); - } + nvme_zrm_finish(ns, zone); } } -static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, - uint32_t nlb) +static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) { - uint8_t zs; + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + NvmeZone *zone; + uint64_t slba; + uint32_t nlb; - zone->w_ptr += nlb; + slba = le64_to_cpu(rw->slba); + nlb = le16_to_cpu(rw->nlb) + 1; + zone = nvme_get_zone_by_slba(ns, slba); - if (zone->w_ptr < nvme_zone_wr_boundary(zone)) { - zs = nvme_get_zone_state(zone); - switch (zs) { - case NVME_ZONE_STATE_EMPTY: - nvme_aor_inc_active(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - nvme_aor_inc_open(ns); - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); - } - } + __nvme_advance_zone_wp(ns, zone, nlb); } static inline bool nvme_is_write(NvmeRequest *req) @@ -1379,9 +1510,37 @@ static void nvme_rw_cb(void *opaque, int ret) trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); if (ns->params.zoned && nvme_is_write(req)) { - nvme_finalize_zoned_write(ns, req, ret != 0); + nvme_finalize_zoned_write(ns, req); + } + + if (!ret) { + block_acct_done(stats, acct); + } else { + block_acct_failed(stats, acct); + nvme_aio_err(req, ret); } + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +struct nvme_aio_flush_ctx { + NvmeRequest *req; + NvmeNamespace *ns; + BlockAcctCookie acct; +}; + +static void nvme_aio_flush_cb(void *opaque, int ret) +{ + struct nvme_aio_flush_ctx *ctx = opaque; + NvmeRequest *req = ctx->req; + uintptr_t *num_flushes = (uintptr_t *)&req->opaque; + + BlockBackend *blk = ctx->ns->blkconf.blk; + BlockAcctCookie *acct = &ctx->acct; + BlockAcctStats *stats = blk_get_stats(blk); + + trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk)); + if (!ret) { block_acct_done(stats, acct); } else { @@ -1389,6 +1548,13 @@ static void nvme_rw_cb(void *opaque, int ret) nvme_aio_err(req, ret); } + (*num_flushes)--; + g_free(ctx); + + if (*num_flushes) { + return; + } + nvme_enqueue_req_completion(nvme_cq(req), req); } @@ -1459,10 +1625,139 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret) nvme_enqueue_req_completion(nvme_cq(req), req); } +struct nvme_copy_ctx { + int copies; + uint8_t *bounce; + uint32_t nlb; +}; + +struct nvme_copy_in_ctx { + NvmeRequest *req; + QEMUIOVector iov; +}; + +static void nvme_copy_cb(void *opaque, int ret) +{ + NvmeRequest *req = opaque; + NvmeNamespace *ns = req->ns; + struct nvme_copy_ctx *ctx = req->opaque; + + trace_pci_nvme_copy_cb(nvme_cid(req)); + + if (ns->params.zoned) { + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + uint64_t sdlba = le64_to_cpu(copy->sdlba); + NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); + + __nvme_advance_zone_wp(ns, zone, ctx->nlb); + } + + if (!ret) { + block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); + } else { + block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); + nvme_aio_err(req, ret); + } + + g_free(ctx->bounce); + g_free(ctx); + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +static void nvme_copy_in_complete(NvmeRequest *req) +{ + NvmeNamespace *ns = req->ns; + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + struct nvme_copy_ctx *ctx = req->opaque; + uint64_t sdlba = le64_to_cpu(copy->sdlba); + uint16_t status; + + trace_pci_nvme_copy_in_complete(nvme_cid(req)); + + block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); + + status = nvme_check_bounds(ns, sdlba, ctx->nlb); + if (status) { + trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze); + goto invalid; + } + + if (ns->params.zoned) { + NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); + + status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb); + if (status) { + goto invalid; + } + + status = nvme_zrm_auto(ns, zone); + if (status) { + goto invalid; + } + + zone->w_ptr += ctx->nlb; + } + + qemu_iovec_init(&req->sg.iov, 1); + qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb)); + + block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, + BLOCK_ACCT_WRITE); + + req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba), + &req->sg.iov, 0, nvme_copy_cb, req); + + return; + +invalid: + req->status = status; + + g_free(ctx->bounce); + g_free(ctx); + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +static void nvme_aio_copy_in_cb(void *opaque, int ret) +{ + struct nvme_copy_in_ctx *in_ctx = opaque; + NvmeRequest *req = in_ctx->req; + NvmeNamespace *ns = req->ns; + struct nvme_copy_ctx *ctx = req->opaque; + + qemu_iovec_destroy(&in_ctx->iov); + g_free(in_ctx); + + trace_pci_nvme_aio_copy_in_cb(nvme_cid(req)); + + if (ret) { + nvme_aio_err(req, ret); + } + + ctx->copies--; + + if (ctx->copies) { + return; + } + + if (req->status) { + block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); + + g_free(ctx->bounce); + g_free(ctx); + + nvme_enqueue_req_completion(nvme_cq(req), req); + + return; + } + + nvme_copy_in_complete(req); +} + struct nvme_compare_ctx { QEMUIOVector iov; uint8_t *bounce; - size_t len; }; static void nvme_compare_cb(void *opaque, int ret) @@ -1483,16 +1778,15 @@ static void nvme_compare_cb(void *opaque, int ret) goto out; } - buf = g_malloc(ctx->len); + buf = g_malloc(ctx->iov.size); - status = nvme_dma(nvme_ctrl(req), buf, ctx->len, DMA_DIRECTION_TO_DEVICE, - req); + status = nvme_h2c(nvme_ctrl(req), buf, ctx->iov.size, req); if (status) { req->status = status; goto out; } - if (memcmp(buf, ctx->bounce, ctx->len)) { + if (memcmp(buf, ctx->bounce, ctx->iov.size)) { req->status = NVME_CMP_FAILURE; } @@ -1522,8 +1816,7 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) NvmeDsmRange range[nr]; uintptr_t *discards = (uintptr_t *)&req->opaque; - status = nvme_dma(n, (uint8_t *)range, sizeof(range), - DMA_DIRECTION_TO_DEVICE, req); + status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req); if (status) { return status; } @@ -1548,6 +1841,10 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba, nlb); + if (nlb > n->dmrsl) { + trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl); + } + offset = nvme_l2b(ns, slba); len = nvme_l2b(ns, nlb); @@ -1577,6 +1874,121 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) return status; } +static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns = req->ns; + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + g_autofree NvmeCopySourceRange *range = NULL; + + uint16_t nr = copy->nr + 1; + uint8_t format = copy->control[0] & 0xf; + uint32_t nlb = 0; + + uint8_t *bounce = NULL, *bouncep = NULL; + struct nvme_copy_ctx *ctx; + uint16_t status; + int i; + + trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format); + + if (!(n->id_ctrl.ocfs & (1 << format))) { + trace_pci_nvme_err_copy_invalid_format(format); + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (nr > ns->id_ns.msrc + 1) { + return NVME_CMD_SIZE_LIMIT | NVME_DNR; + } + + range = g_new(NvmeCopySourceRange, nr); + + status = nvme_h2c(n, (uint8_t *)range, nr * sizeof(NvmeCopySourceRange), + req); + if (status) { + return status; + } + + for (i = 0; i < nr; i++) { + uint64_t slba = le64_to_cpu(range[i].slba); + uint32_t _nlb = le16_to_cpu(range[i].nlb) + 1; + + if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) { + return NVME_CMD_SIZE_LIMIT | NVME_DNR; + } + + status = nvme_check_bounds(ns, slba, _nlb); + if (status) { + trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze); + return status; + } + + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { + status = nvme_check_dulbe(ns, slba, _nlb); + if (status) { + return status; + } + } + + if (ns->params.zoned) { + status = nvme_check_zone_read(ns, slba, _nlb); + if (status) { + return status; + } + } + + nlb += _nlb; + } + + if (nlb > le32_to_cpu(ns->id_ns.mcl)) { + return NVME_CMD_SIZE_LIMIT | NVME_DNR; + } + + bounce = bouncep = g_malloc(nvme_l2b(ns, nlb)); + + block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, + BLOCK_ACCT_READ); + + ctx = g_new(struct nvme_copy_ctx, 1); + + ctx->bounce = bounce; + ctx->nlb = nlb; + ctx->copies = 1; + + req->opaque = ctx; + + for (i = 0; i < nr; i++) { + uint64_t slba = le64_to_cpu(range[i].slba); + uint32_t nlb = le16_to_cpu(range[i].nlb) + 1; + + size_t len = nvme_l2b(ns, nlb); + int64_t offset = nvme_l2b(ns, slba); + + trace_pci_nvme_copy_source_range(slba, nlb); + + struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1); + in_ctx->req = req; + + qemu_iovec_init(&in_ctx->iov, 1); + qemu_iovec_add(&in_ctx->iov, bouncep, len); + + ctx->copies++; + + blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0, + nvme_aio_copy_in_cb, in_ctx); + + bouncep += len; + } + + /* account for the 1-initialization */ + ctx->copies--; + + if (!ctx->copies) { + nvme_copy_in_complete(req); + } + + return NVME_NO_COMPLETE; +} + static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; @@ -1594,7 +2006,6 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, len); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), len); return status; } @@ -1615,7 +2026,6 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) ctx = g_new(struct nvme_compare_ctx, 1); ctx->bounce = bounce; - ctx->len = len; req->opaque = ctx; @@ -1630,10 +2040,56 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { - block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, - BLOCK_ACCT_FLUSH); - req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); - return NVME_NO_COMPLETE; + uint32_t nsid = le32_to_cpu(req->cmd.nsid); + uintptr_t *num_flushes = (uintptr_t *)&req->opaque; + uint16_t status; + struct nvme_aio_flush_ctx *ctx; + NvmeNamespace *ns; + + trace_pci_nvme_flush(nvme_cid(req), nsid); + + if (nsid != NVME_NSID_BROADCAST) { + req->ns = nvme_ns(n, nsid); + if (unlikely(!req->ns)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, + BLOCK_ACCT_FLUSH); + req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); + return NVME_NO_COMPLETE; + } + + /* 1-initialize; see comment in nvme_dsm */ + *num_flushes = 1; + + for (int i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + ctx = g_new(struct nvme_aio_flush_ctx, 1); + ctx->req = req; + ctx->ns = ns; + + (*num_flushes)++; + + block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0, + BLOCK_ACCT_FLUSH); + blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx); + } + + /* account for the 1-initialization */ + (*num_flushes)--; + + if (*num_flushes) { + status = NVME_NO_COMPLETE; + } else { + status = req->status; + } + + return status; } static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) @@ -1651,7 +2107,6 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, data_size); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), data_size); goto invalid; } @@ -1669,7 +2124,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) } } - status = nvme_map_dptr(n, data_size, req); + status = nvme_map_dptr(n, &req->sg, data_size, &req->cmd); if (status) { goto invalid; } @@ -1685,13 +2140,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) block_acct_start(blk_get_stats(blk), &req->acct, data_size, BLOCK_ACCT_READ); - if (req->qsg.sg) { - req->aiocb = dma_blk_read(blk, &req->qsg, data_offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); - } else { - req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0, - nvme_rw_cb, req); - } + nvme_blk_read(blk, data_offset, nvme_rw_cb, req); return NVME_NO_COMPLETE; invalid: @@ -1719,7 +2168,6 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, if (!wrz) { status = nvme_check_mdts(n, data_size); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), data_size); goto invalid; } } @@ -1740,48 +2188,40 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, goto invalid; } - if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) { - trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl); - status = NVME_INVALID_FIELD; - goto invalid; + if (n->params.zasl && data_size > n->page_size << n->params.zasl) { + trace_pci_nvme_err_zasl(data_size); + return NVME_INVALID_FIELD | NVME_DNR; } slba = zone->w_ptr; res->slba = cpu_to_le64(slba); } - status = nvme_check_zone_write(n, ns, zone, slba, nlb); + status = nvme_check_zone_write(ns, zone, slba, nlb); if (status) { goto invalid; } - status = nvme_auto_open_zone(ns, zone); + status = nvme_zrm_auto(ns, zone); if (status) { goto invalid; } - nvme_advance_zone_wp(ns, zone, nlb); + zone->w_ptr += nlb; } data_offset = nvme_l2b(ns, slba); if (!wrz) { - status = nvme_map_dptr(n, data_size, req); + status = nvme_map_dptr(n, &req->sg, data_size, &req->cmd); if (status) { goto invalid; } block_acct_start(blk_get_stats(blk), &req->acct, data_size, BLOCK_ACCT_WRITE); - if (req->qsg.sg) { - req->aiocb = dma_blk_write(blk, &req->qsg, data_offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); - } else { - req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0, - nvme_rw_cb, req); - } + nvme_blk_write(blk, data_offset, nvme_rw_cb, req); } else { - block_acct_start(blk_get_stats(blk), &req->acct, 0, BLOCK_ACCT_WRITE); req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size, BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req); @@ -1846,73 +2286,19 @@ enum NvmeZoneProcessingMask { static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { - uint16_t status; - - switch (state) { - case NVME_ZONE_STATE_EMPTY: - status = nvme_aor_check(ns, 1, 0); - if (status) { - return status; - } - nvme_aor_inc_active(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - status = nvme_aor_check(ns, 0, 1); - if (status) { - if (state == NVME_ZONE_STATE_EMPTY) { - nvme_aor_dec_active(ns); - } - return status; - } - nvme_aor_inc_open(ns); - /* fall through */ - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); - /* fall through */ - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - return NVME_SUCCESS; - default: - return NVME_ZONE_INVAL_TRANSITION; - } + return nvme_zrm_open(ns, zone); } static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { - switch (state) { - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - nvme_aor_dec_open(ns); - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - return NVME_SUCCESS; - default: - return NVME_ZONE_INVAL_TRANSITION; - } + return nvme_zrm_close(ns, zone); } static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { - switch (state) { - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - nvme_aor_dec_open(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - nvme_aor_dec_active(ns); - /* fall through */ - case NVME_ZONE_STATE_EMPTY: - zone->w_ptr = nvme_zone_wr_boundary(zone); - zone->d.wp = zone->w_ptr; - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); - /* fall through */ - case NVME_ZONE_STATE_FULL: - return NVME_SUCCESS; - default: - return NVME_ZONE_INVAL_TRANSITION; - } + return nvme_zrm_finish(ns, zone); } static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, @@ -2168,8 +2554,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } zd_ext = nvme_get_zd_extension(ns, zone_idx); - status = nvme_dma(n, zd_ext, ns->params.zd_extension_size, - DMA_DIRECTION_TO_DEVICE, req); + status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req); if (status) { trace_pci_nvme_err_zd_extension_map_error(zone_idx); return status; @@ -2267,7 +2652,6 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, data_size); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), data_size); return status; } @@ -2324,8 +2708,7 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) } } - status = nvme_dma(n, (uint8_t *)buf, data_size, - DMA_DIRECTION_FROM_DEVICE, req); + status = nvme_c2h(n, (uint8_t *)buf, data_size, req); g_free(buf); @@ -2343,6 +2726,29 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_NSID | NVME_DNR; } + /* + * In the base NVM command set, Flush may apply to all namespaces + * (indicated by NSID being set to 0xFFFFFFFF). But if that feature is used + * along with TP 4056 (Namespace Types), it may be pretty screwed up. + * + * If NSID is indeed set to 0xFFFFFFFF, we simply cannot associate the + * opcode with a specific command since we cannot determine a unique I/O + * command set. Opcode 0x0 could have any other meaning than something + * equivalent to flushing and say it DOES have completely different + * semantics in some other command set - does an NSID of 0xFFFFFFFF then + * mean "for all namespaces, apply whatever command set specific command + * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply + * whatever command that uses the 0x0 opcode if, and only if, it allows + * NSID to be 0xFFFFFFFF"? + * + * Anyway (and luckily), for now, we do not care about this since the + * device only supports namespace types that includes the NVM Flush command + * (NVM and Zoned), so always do an NVM Flush. + */ + if (req->cmd.opcode == NVME_CMD_FLUSH) { + return nvme_flush(n, req); + } + req->ns = nvme_ns(n, nsid); if (unlikely(!req->ns)) { return NVME_INVALID_FIELD | NVME_DNR; @@ -2354,8 +2760,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) } switch (req->cmd.opcode) { - case NVME_CMD_FLUSH: - return nvme_flush(n, req); case NVME_CMD_WRITE_ZEROES: return nvme_write_zeroes(n, req); case NVME_CMD_ZONE_APPEND: @@ -2368,6 +2772,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_compare(n, req); case NVME_CMD_DSM: return nvme_dsm(n, req); + case NVME_CMD_COPY: + return nvme_copy(n, req); case NVME_CMD_ZONE_MGMT_SEND: return nvme_zone_mgmt_send(n, req); case NVME_CMD_ZONE_MGMT_RECV: @@ -2568,8 +2974,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, nvme_clear_events(n, NVME_AER_TYPE_SMART); } - return nvme_dma(n, (uint8_t *) &smart + off, trans_len, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req); } static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, @@ -2587,8 +2992,7 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); trans_len = MIN(sizeof(fw_log) - off, buf_len); - return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req); } static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, @@ -2608,8 +3012,49 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, memset(&errlog, 0x0, sizeof(errlog)); trans_len = MIN(sizeof(errlog) - off, buf_len); - return nvme_dma(n, (uint8_t *)&errlog, trans_len, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req); +} + +static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + uint32_t nslist[1024]; + uint32_t trans_len; + int i = 0; + uint32_t nsid; + + memset(nslist, 0x0, sizeof(nslist)); + trans_len = MIN(sizeof(nslist) - off, buf_len); + + while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) != + NVME_CHANGED_NSID_SIZE) { + /* + * If more than 1024 namespaces, the first entry in the log page should + * be set to 0xffffffff and the others to 0 as spec. + */ + if (i == ARRAY_SIZE(nslist)) { + memset(nslist, 0x0, sizeof(nslist)); + nslist[0] = 0xffffffff; + break; + } + + nslist[i++] = nsid; + clear_bit(nsid, n->changed_nsids); + } + + /* + * Remove all the remaining list entries in case returns directly due to + * more than 1024 namespaces. + */ + if (nslist[0] == 0xffffffff) { + bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE); + } + + if (!rae) { + nvme_clear_events(n, NVME_AER_TYPE_NOTICE); + } + + return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req); } static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, @@ -2649,8 +3094,7 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, trans_len = MIN(sizeof(log) - off, buf_len); - return nvme_dma(n, ((uint8_t *)&log) + off, trans_len, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req); } static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) @@ -2686,7 +3130,6 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, len); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), len); return status; } @@ -2697,6 +3140,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) return nvme_smart_info(n, rae, len, off, req); case NVME_LOG_FW_SLOT_INFO: return nvme_fw_log_info(n, len, off, req); + case NVME_LOG_CHANGED_NSLIST: + return nvme_changed_nslist(n, rae, len, off, req); case NVME_LOG_CMD_EFFECTS: return nvme_cmd_effects(n, csi, len, off, req); default: @@ -2819,7 +3264,7 @@ static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) { uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; - return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, id, sizeof(id), req); } static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns) @@ -2836,31 +3281,33 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_identify_ctrl(); - return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req); } static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - NvmeIdCtrlZoned id = {}; + uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; trace_pci_nvme_identify_ctrl_csi(c->csi); - if (c->csi == NVME_CSI_NVM) { - return nvme_rpt_empty_id_struct(n, req); - } else if (c->csi == NVME_CSI_ZONED) { - if (n->params.zasl_bs) { - id.zasl = n->zasl; - } - return nvme_dma(n, (uint8_t *)&id, sizeof(id), - DMA_DIRECTION_FROM_DEVICE, req); + switch (c->csi) { + case NVME_CSI_NVM: + ((NvmeIdCtrlNvm *)&id)->dmrsl = cpu_to_le32(n->dmrsl); + break; + + case NVME_CSI_ZONED: + ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl; + break; + + default: + return NVME_INVALID_FIELD | NVME_DNR; } - return NVME_INVALID_FIELD | NVME_DNR; + return nvme_c2h(n, id, sizeof(id), req); } -static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -2874,18 +3321,64 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) ns = nvme_ns(n, nsid); if (unlikely(!ns)) { - return nvme_rpt_empty_id_struct(n, req); + if (!active) { + ns = nvme_subsys_ns(n->subsys, nsid); + if (!ns) { + return nvme_rpt_empty_id_struct(n, req); + } + } else { + return nvme_rpt_empty_id_struct(n, req); + } } if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { - return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req); } return NVME_INVALID_CMD_SET | NVME_DNR; } -static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + uint16_t min_id = le16_to_cpu(c->ctrlid); + uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; + uint16_t *ids = &list[1]; + NvmeNamespace *ns; + NvmeCtrl *ctrl; + int cntlid, nr_ids = 0; + + trace_pci_nvme_identify_ns_attached_list(min_id); + + if (c->nsid == NVME_NSID_BROADCAST) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ns = nvme_subsys_ns(n->subsys, c->nsid); + if (!ns) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) { + ctrl = nvme_subsys_ctrl(n->subsys, cntlid); + if (!ctrl) { + continue; + } + + if (!nvme_ns_is_attached(ctrl, ns)) { + continue; + } + + ids[nr_ids++] = cntlid; + } + + list[0] = nr_ids; + + return nvme_c2h(n, (uint8_t *)list, sizeof(list), req); +} + +static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -2899,20 +3392,28 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) ns = nvme_ns(n, nsid); if (unlikely(!ns)) { - return nvme_rpt_empty_id_struct(n, req); + if (!active) { + ns = nvme_subsys_ns(n->subsys, nsid); + if (!ns) { + return nvme_rpt_empty_id_struct(n, req); + } + } else { + return nvme_rpt_empty_id_struct(n, req); + } } if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { return nvme_rpt_empty_id_struct(n, req); } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { - return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), + req); } return NVME_INVALID_FIELD | NVME_DNR; } -static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req, + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -2937,7 +3438,14 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { - continue; + if (!active) { + ns = nvme_subsys_ns(n->subsys, i); + if (!ns) { + continue; + } + } else { + continue; + } } if (ns->params.nsid <= min_nsid) { continue; @@ -2948,10 +3456,11 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) } } - return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, list, data_len, req); } -static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req, + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -2977,7 +3486,14 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { - continue; + if (!active) { + ns = nvme_subsys_ns(n->subsys, i); + if (!ns) { + continue; + } + } else { + continue; + } } if (ns->params.nsid <= min_nsid || c->csi != ns->csi) { continue; @@ -2988,7 +3504,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) } } - return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, list, data_len, req); } static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) @@ -3035,7 +3551,7 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI; ns_descrs->csi.v = ns->csi; - return nvme_dma(n, list, sizeof(list), DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, list, sizeof(list), req); } static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) @@ -3048,34 +3564,39 @@ static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) NVME_SET_CSI(*list, NVME_CSI_NVM); NVME_SET_CSI(*list, NVME_CSI_ZONED); - return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, list, data_len, req); } static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - switch (le32_to_cpu(c->cns)) { + trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid), + c->csi); + + switch (c->cns) { case NVME_ID_CNS_NS: - /* fall through */ + return nvme_identify_ns(n, req, true); case NVME_ID_CNS_NS_PRESENT: - return nvme_identify_ns(n, req); + return nvme_identify_ns(n, req, false); + case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST: + return nvme_identify_ns_attached_list(n, req); case NVME_ID_CNS_CS_NS: - /* fall through */ + return nvme_identify_ns_csi(n, req, true); case NVME_ID_CNS_CS_NS_PRESENT: - return nvme_identify_ns_csi(n, req); + return nvme_identify_ns_csi(n, req, false); case NVME_ID_CNS_CTRL: return nvme_identify_ctrl(n, req); case NVME_ID_CNS_CS_CTRL: return nvme_identify_ctrl_csi(n, req); case NVME_ID_CNS_NS_ACTIVE_LIST: - /* fall through */ + return nvme_identify_nslist(n, req, true); case NVME_ID_CNS_NS_PRESENT_LIST: - return nvme_identify_nslist(n, req); + return nvme_identify_nslist(n, req, false); case NVME_ID_CNS_CS_NS_ACTIVE_LIST: - /* fall through */ + return nvme_identify_nslist_csi(n, req, true); case NVME_ID_CNS_CS_NS_PRESENT_LIST: - return nvme_identify_nslist_csi(n, req); + return nvme_identify_nslist_csi(n, req, false); case NVME_ID_CNS_NS_DESCR_LIST: return nvme_identify_ns_descr_list(n, req); case NVME_ID_CNS_IO_COMMAND_SET: @@ -3137,8 +3658,7 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) { uint64_t timestamp = nvme_get_timestamp(n); - return nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp), - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)×tamp, sizeof(timestamp), req); } static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) @@ -3299,8 +3819,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) uint16_t ret; uint64_t timestamp; - ret = nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp), - DMA_DIRECTION_TO_DEVICE, req); + ret = nvme_h2c(n, (uint8_t *)×tamp, sizeof(timestamp), req); if (ret) { return ret; } @@ -3472,6 +3991,71 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) return NVME_NO_COMPLETE; } +static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns); +static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns; + NvmeCtrl *ctrl; + uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; + uint32_t nsid = le32_to_cpu(req->cmd.nsid); + uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); + bool attach = !(dw10 & 0xf); + uint16_t *nr_ids = &list[0]; + uint16_t *ids = &list[1]; + uint16_t ret; + int i; + + trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf); + + ns = nvme_subsys_ns(n->subsys, nsid); + if (!ns) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ret = nvme_h2c(n, (uint8_t *)list, 4096, req); + if (ret) { + return ret; + } + + if (!*nr_ids) { + return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; + } + + for (i = 0; i < *nr_ids; i++) { + ctrl = nvme_subsys_ctrl(n->subsys, ids[i]); + if (!ctrl) { + return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; + } + + if (attach) { + if (nvme_ns_is_attached(ctrl, ns)) { + return NVME_NS_ALREADY_ATTACHED | NVME_DNR; + } + + nvme_ns_attach(ctrl, ns); + __nvme_select_ns_iocs(ctrl, ns); + } else { + if (!nvme_ns_is_attached(ctrl, ns)) { + return NVME_NS_NOT_ATTACHED | NVME_DNR; + } + + nvme_ns_detach(ctrl, ns); + } + + /* + * Add namespace id to the changed namespace id list for event clearing + * via Get Log Page command. + */ + if (!test_and_set_bit(nsid, ctrl->changed_nsids)) { + nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED, + NVME_LOG_CHANGED_NSLIST); + } + } + + return NVME_SUCCESS; +} + static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, @@ -3482,6 +4066,11 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_OPCODE | NVME_DNR; } + /* SGLs shall not be used for Admin commands in NVMe over PCIe */ + if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) { + return NVME_INVALID_FIELD | NVME_DNR; + } + switch (req->cmd.opcode) { case NVME_ADM_CMD_DELETE_SQ: return nvme_del_sq(n, req); @@ -3503,6 +4092,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_get_feature(n, req); case NVME_ADM_CMD_ASYNC_EV_REQ: return nvme_aer(n, req); + case NVME_ADM_CMD_NS_ATTACHMENT: + return nvme_ns_attachment(n, req); default: assert(false); } @@ -3604,6 +4195,25 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n) } } +static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns) +{ + ns->iocs = nvme_cse_iocs_none; + switch (ns->csi) { + case NVME_CSI_NVM: + if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { + ns->iocs = nvme_cse_iocs_nvm; + } + break; + case NVME_CSI_ZONED: + if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) { + ns->iocs = nvme_cse_iocs_zoned; + } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) { + ns->iocs = nvme_cse_iocs_nvm; + } + break; + } +} + static void nvme_select_ns_iocs(NvmeCtrl *n) { NvmeNamespace *ns; @@ -3614,21 +4224,8 @@ static void nvme_select_ns_iocs(NvmeCtrl *n) if (!ns) { continue; } - ns->iocs = nvme_cse_iocs_none; - switch (ns->csi) { - case NVME_CSI_NVM: - if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { - ns->iocs = nvme_cse_iocs_nvm; - } - break; - case NVME_CSI_ZONED: - if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) { - ns->iocs = nvme_cse_iocs_zoned; - } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) { - ns->iocs = nvme_cse_iocs_nvm; - } - break; - } + + __nvme_select_ns_iocs(n, ns); } } @@ -3726,17 +4323,6 @@ static int nvme_start_ctrl(NvmeCtrl *n) nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0, NVME_AQA_ASQS(n->bar.aqa) + 1); - if (!n->params.zasl_bs) { - n->zasl = n->params.mdts; - } else { - if (n->params.zasl_bs < n->page_size) { - trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs, - n->page_size); - return -1; - } - n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size); - } - nvme_set_timestamp(n, 0ULL); QTAILQ_INIT(&n->aer_queue); @@ -4245,11 +4831,10 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) host_memory_backend_set_mapped(n->pmr.dev, true); } - if (n->params.zasl_bs) { - if (!is_power_of_2(n->params.zasl_bs)) { - error_setg(errp, "zone append size limit has to be a power of 2"); - return; - } + if (n->params.zasl > n->params.mdts) { + error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less " + "than or equal to mdts (Maximum Data Transfer Size)"); + return; } } @@ -4267,6 +4852,20 @@ static void nvme_init_state(NvmeCtrl *n) n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); } +static int nvme_attach_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +{ + if (nvme_ns_is_attached(n, ns)) { + error_setg(errp, + "namespace %d is already attached to controller %d", + nvme_nsid(ns), n->cntlid); + return -1; + } + + nvme_ns_attach(n, ns); + + return 0; +} + int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) { uint32_t nsid = nvme_nsid(ns); @@ -4298,7 +4897,26 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) trace_pci_nvme_register_namespace(nsid); - n->namespaces[nsid - 1] = ns; + /* + * If subsys is not given, namespae is always attached to the controller + * because there's no subsystem to manage namespace allocation. + */ + if (!n->subsys) { + if (ns->params.detached) { + error_setg(errp, + "detached needs nvme-subsys specified nvme or nvme-ns"); + return -1; + } + + return nvme_attach_namespace(n, ns, errp); + } else { + if (!ns->params.detached) { + return nvme_attach_namespace(n, ns, errp); + } + } + + n->dmrsl = MIN_NON_ZERO(n->dmrsl, + BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); return 0; } @@ -4405,24 +5023,49 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) return 0; } +static void nvme_init_subnqn(NvmeCtrl *n) +{ + NvmeSubsystem *subsys = n->subsys; + NvmeIdCtrl *id = &n->id_ctrl; + + if (!subsys) { + snprintf((char *)id->subnqn, sizeof(id->subnqn), + "nqn.2019-08.org.qemu:%s", n->params.serial); + } else { + pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn); + } +} + static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) { NvmeIdCtrl *id = &n->id_ctrl; uint8_t *pci_conf = pci_dev->config; - char *subnqn; id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' '); strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' '); strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' '); + + id->cntlid = cpu_to_le16(n->cntlid); + + id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR); + id->rab = 6; - id->ieee[0] = 0x00; - id->ieee[1] = 0x02; - id->ieee[2] = 0xb3; + + if (n->params.use_intel_id) { + id->ieee[0] = 0xb3; + id->ieee[1] = 0x02; + id->ieee[2] = 0x00; + } else { + id->ieee[0] = 0x00; + id->ieee[1] = 0x54; + id->ieee[2] = 0x52; + } + id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); - id->oacs = cpu_to_le16(0); + id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT); id->cntrltype = 0x1; /* @@ -4450,20 +5093,31 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->nn = cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | NVME_ONCS_FEATURES | NVME_ONCS_DSM | - NVME_ONCS_COMPARE); + NVME_ONCS_COMPARE | NVME_ONCS_COPY); - id->vwc = (0x2 << 1) | 0x1; + /* + * NOTE: If this device ever supports a command set that does NOT use 0x0 + * as a Flush-equivalent operation, support for the broadcast NSID in Flush + * should probably be removed. + * + * See comment in nvme_io_cmd. + */ + id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT; + + id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0); id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | NVME_CTRL_SGLS_BITBUCKET); - subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial); - strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0'); - g_free(subnqn); + nvme_init_subnqn(n); id->psd[0].mp = cpu_to_le16(0x9c4); id->psd[0].enlat = cpu_to_le32(0x10); id->psd[0].exlat = cpu_to_le32(0x4); + if (n->subsys) { + id->cmic |= NVME_CMIC_MULTI_CTRL; + } + NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); @@ -4478,6 +5132,24 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) n->bar.intmc = n->bar.intms = 0; } +static int nvme_init_subsys(NvmeCtrl *n, Error **errp) +{ + int cntlid; + + if (!n->subsys) { + return 0; + } + + cntlid = nvme_subsys_register_ctrl(n, errp); + if (cntlid < 0) { + return -1; + } + + n->cntlid = cntlid; + + return 0; +} + static void nvme_realize(PCIDevice *pci_dev, Error **errp) { NvmeCtrl *n = NVME(pci_dev); @@ -4498,6 +5170,10 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) return; } + if (nvme_init_subsys(n, errp)) { + error_propagate(errp, local_err); + return; + } nvme_init_ctrl(n, pci_dev); /* setup a namespace if the controller drive property was given */ @@ -4550,6 +5226,8 @@ static Property nvme_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf), DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND, HostMemoryBackend *), + DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS, + NvmeSubsystem *), DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial), DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0), DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0), @@ -4560,8 +5238,7 @@ static Property nvme_props[] = { DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7), DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), - DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs, - NVME_DEFAULT_MAX_ZA_SIZE), + DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index dee6092bd4..4955d649c7 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -2,6 +2,7 @@ #define HW_NVME_H #include "block/nvme.h" +#include "nvme-subsys.h" #include "nvme-ns.h" #define NVME_MAX_NAMESPACES 256 @@ -9,6 +10,12 @@ #define NVME_DEFAULT_ZONE_SIZE (128 * MiB) #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB) +/* + * Subsystem namespace list for allocated namespaces should be larger than + * attached namespace list in a controller. + */ +QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_SUBSYS_MAX_NAMESPACES); + typedef struct NvmeParams { char *serial; uint32_t num_queues; /* deprecated since 5.1 */ @@ -19,7 +26,7 @@ typedef struct NvmeParams { uint32_t aer_max_queued; uint8_t mdts; bool use_intel_id; - uint32_t zasl_bs; + uint8_t zasl; bool legacy_cmb; } NvmeParams; @@ -28,6 +35,20 @@ typedef struct NvmeAsyncEvent { NvmeAerResult result; } NvmeAsyncEvent; +enum { + NVME_SG_ALLOC = 1 << 0, + NVME_SG_DMA = 1 << 1, +}; + +typedef struct NvmeSg { + int flags; + + union { + QEMUSGList qsg; + QEMUIOVector iov; + }; +} NvmeSg; + typedef struct NvmeRequest { struct NvmeSQueue *sq; struct NvmeNamespace *ns; @@ -37,8 +58,7 @@ typedef struct NvmeRequest { NvmeCqe cqe; NvmeCmd cmd; BlockAcctCookie acct; - QEMUSGList qsg; - QEMUIOVector iov; + NvmeSg sg; QTAILQ_ENTRY(NvmeRequest)entry; } NvmeRequest; @@ -68,6 +88,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; + case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY"; case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND"; case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV"; case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND"; @@ -133,6 +154,7 @@ typedef struct NvmeCtrl { NvmeBus bus; BlockConf conf; + uint16_t cntlid; bool qs_created; uint32_t page_size; uint16_t page_bits; @@ -168,9 +190,19 @@ typedef struct NvmeCtrl { QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue; int aer_queued; - uint8_t zasl; + uint32_t dmrsl; + + /* Namespace ID is started with 1 so bitmap should be 1-based */ +#define NVME_CHANGED_NSID_SIZE (NVME_MAX_NAMESPACES + 1) + DECLARE_BITMAP(changed_nsids, NVME_CHANGED_NSID_SIZE); + + NvmeSubsystem *subsys; NvmeNamespace namespace; + /* + * Attached namespaces to this controller. If subsys is not given, all + * namespaces in this list will always be attached. + */ NvmeNamespace *namespaces[NVME_MAX_NAMESPACES]; NvmeSQueue **sq; NvmeCQueue **cq; @@ -189,6 +221,29 @@ static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid) return n->namespaces[nsid - 1]; } +static inline bool nvme_ns_is_attached(NvmeCtrl *n, NvmeNamespace *ns) +{ + int nsid; + + for (nsid = 1; nsid <= n->num_namespaces; nsid++) { + if (nvme_ns(n, nsid) == ns) { + return true; + } + } + + return false; +} + +static inline void nvme_ns_attach(NvmeCtrl *n, NvmeNamespace *ns) +{ + n->namespaces[nvme_nsid(ns) - 1] = ns; +} + +static inline void nvme_ns_detach(NvmeCtrl *n, NvmeNamespace *ns) +{ + n->namespaces[nvme_nsid(ns) - 1] = NULL; +} + static inline NvmeCQueue *nvme_cq(NvmeRequest *req) { NvmeSQueue *sq = req->sq; diff --git a/hw/block/trace-events b/hw/block/trace-events index d32475c398..ef06d2ea74 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -37,26 +37,36 @@ pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d" -pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64"" +pci_nvme_map_sgl(uint8_t typ, uint64_t len) "type 0x%"PRIx8" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" +pci_nvme_flush(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32"" pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" +pci_nvme_copy(uint16_t cid, uint32_t nsid, uint16_t nr, uint8_t format) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu16" format 0x%"PRIx8"" +pci_nvme_copy_source_range(uint64_t slba, uint32_t nlb) "slba 0x%"PRIx64" nlb %"PRIu32"" +pci_nvme_copy_in_complete(uint16_t cid) "cid %"PRIu16"" +pci_nvme_copy_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32"" pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" +pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32"" pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32"" pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" +pci_nvme_aio_copy_in_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64"" +pci_nvme_aio_flush_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" +pci_nvme_identify(uint16_t cid, uint8_t cns, uint16_t ctrlid, uint8_t csi) "cid %"PRIu16" cns 0x%"PRIx8" ctrlid %"PRIu16" csi 0x%"PRIx8"" pci_nvme_identify_ctrl(void) "identify controller" pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8"" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_identify_ns_attached_list(uint16_t cntid) "cntid=%"PRIu16"" pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", csi=0x%"PRIx8"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8"" @@ -75,6 +85,8 @@ pci_nvme_aer(uint16_t cid) "cid %"PRIu16"" pci_nvme_aer_aerl_exceeded(void) "aerl exceeded" pci_nvme_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 0x%"PRIx8"" pci_nvme_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8"" +pci_nvme_ns_attachment(uint16_t cid, uint8_t sel) "cid %"PRIu16", sel=0x%"PRIx8"" +pci_nvme_ns_attachment_attach(uint16_t cntlid, uint32_t nsid) "cntlid=0x%"PRIx16", nsid=0x%"PRIx32"" pci_nvme_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8"" pci_nvme_enqueue_event_noqueue(int queued) "queued %d" pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8"" @@ -107,15 +119,17 @@ pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", sl pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state" # nvme traces for error conditions -pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu" +pci_nvme_err_mdts(size_t len) "len %zu" +pci_nvme_err_zasl(size_t len) "len %zu" pci_nvme_err_req_status(uint16_t cid, uint32_t nsid, uint16_t status, uint8_t opc) "cid %"PRIu16" nsid %"PRIu32" status 0x%"PRIx16" opc 0x%"PRIx8"" pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_cfs(void) "controller fatal status" pci_nvme_err_aio(uint16_t cid, const char *errname, uint16_t status) "cid %"PRIu16" err '%s' status 0x%"PRIx16"" +pci_nvme_err_copy_invalid_format(uint8_t format) "format 0x%"PRIx8"" pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" -pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16"" +pci_nvme_err_invalid_sgl_excess_length(uint32_t residual) "residual %"PRIu32"" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" @@ -136,7 +150,6 @@ pci_nvme_err_zone_boundary(uint64_t slba, uint32_t nlb, uint64_t zcap) "lba 0x%" pci_nvme_err_zone_invalid_write(uint64_t slba, uint64_t wp) "lba 0x%"PRIx64" wp 0x%"PRIx64"" pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" -pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8"" pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone limit exceeded" pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit exceeded" pci_nvme_err_zd_extension_map_error(uint32_t zone_idx) "can't map descriptor extension for zone_idx=%"PRIu32"" |