From e8cc43d5cc680d348fcc43c46fc2d5e603d0579e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 6 Jul 2020 11:29:22 -0700 Subject: [PATCH 01/39] MAINTAINERS: update nvme entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nvme emulated device development pace has increased recently. Klaus has offered to co-maintain, and since we have many new contributions coming through, we're adding a repository to accumulate and test new features. Cc: Klaus Jensen Acked-by: Klaus Jensen Acked-by: Kevin Wolf Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Keith Busch --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index b5ba553ee0..b233da2a73 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1856,10 +1856,12 @@ F: include/hw/virtio/virtio-mem.h nvme M: Keith Busch +M: Klaus Jensen L: qemu-block@nongnu.org S: Supported F: hw/block/nvme* F: tests/qtest/nvme-test.c +T: git git://git.infradead.org/qemu-nvme.git nvme-next megasas M: Hannes Reinecke From 7a85fb7faa43f9acc22ab507b6038116cf297fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 30 Jun 2020 13:04:26 +0200 Subject: [PATCH 02/39] hw/block/nvme: Update specification URL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At some point the URL changed, update it to avoid other developers to search for it. Reviewed-by: Dmitry Fomichev Reviewed-by: Klaus Jensen Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200630110429.19972-2-philmd@redhat.com> Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 3426e17e65..9e2f9547e5 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -11,7 +11,7 @@ /** * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e * - * http://www.nvmexpress.org/resources/ + * https://nvmexpress.org/developers/nvme-specification/ */ /** From e989738f3ab3283039385e5771e459b3f2ccfbf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 30 Jun 2020 13:04:27 +0200 Subject: [PATCH 03/39] hw/block/nvme: Use QEMU_PACKED on hardware/packet structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These structures either describe hardware registers, or commands ('packets') to send to the hardware. To forbid the compiler to optimize and change fields alignment, mark the structures as packed. Reviewed-by: Dmitry Fomichev Reviewed-by: Klaus Jensen Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200630110429.19972-3-philmd@redhat.com> Signed-off-by: Klaus Jensen --- include/block/nvme.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index 1720ee1d51..71c5681912 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1,7 +1,7 @@ #ifndef BLOCK_NVME_H #define BLOCK_NVME_H -typedef struct NvmeBar { +typedef struct QEMU_PACKED NvmeBar { uint64_t cap; uint32_t vs; uint32_t intms; @@ -377,7 +377,7 @@ enum NvmePmrmscMask { #define NVME_PMRMSC_SET_CBA(pmrmsc, val) \ (pmrmsc |= (uint64_t)(val & PMRMSC_CBA_MASK) << PMRMSC_CBA_SHIFT) -typedef struct NvmeCmd { +typedef struct QEMU_PACKED NvmeCmd { uint8_t opcode; uint8_t fuse; uint16_t cid; @@ -422,7 +422,7 @@ enum NvmeIoCommands { NVME_CMD_DSM = 0x09, }; -typedef struct NvmeDeleteQ { +typedef struct QEMU_PACKED NvmeDeleteQ { uint8_t opcode; uint8_t flags; uint16_t cid; @@ -432,7 +432,7 @@ typedef struct NvmeDeleteQ { uint32_t rsvd11[5]; } NvmeDeleteQ; -typedef struct NvmeCreateCq { +typedef struct QEMU_PACKED NvmeCreateCq { uint8_t opcode; uint8_t flags; uint16_t cid; @@ -449,7 +449,7 @@ typedef struct NvmeCreateCq { #define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) #define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) -typedef struct NvmeCreateSq { +typedef struct QEMU_PACKED NvmeCreateSq { uint8_t opcode; uint8_t flags; uint16_t cid; @@ -474,7 +474,7 @@ enum NvmeQueueFlags { NVME_Q_PRIO_LOW = 3, }; -typedef struct NvmeIdentify { +typedef struct QEMU_PACKED NvmeIdentify { uint8_t opcode; uint8_t flags; uint16_t cid; @@ -486,7 +486,7 @@ typedef struct NvmeIdentify { uint32_t rsvd11[5]; } NvmeIdentify; -typedef struct NvmeRwCmd { +typedef struct QEMU_PACKED NvmeRwCmd { uint8_t opcode; uint8_t flags; uint16_t cid; @@ -528,7 +528,7 @@ enum { NVME_RW_PRINFO_PRCHK_REF = 1 << 10, }; -typedef struct NvmeDsmCmd { +typedef struct QEMU_PACKED NvmeDsmCmd { uint8_t opcode; uint8_t flags; uint16_t cid; @@ -547,7 +547,7 @@ enum { NVME_DSMGMT_AD = 1 << 2, }; -typedef struct NvmeDsmRange { +typedef struct QEMU_PACKED NvmeDsmRange { uint32_t cattr; uint32_t nlb; uint64_t slba; @@ -569,14 +569,14 @@ enum NvmeAsyncEventRequest { NVME_AER_INFO_SMART_SPARE_THRESH = 2, }; -typedef struct NvmeAerResult { +typedef struct QEMU_PACKED NvmeAerResult { uint8_t event_type; uint8_t event_info; uint8_t log_page; uint8_t resv; } NvmeAerResult; -typedef struct NvmeCqe { +typedef struct QEMU_PACKED NvmeCqe { uint32_t result; uint32_t rsvd; uint16_t sq_head; @@ -634,7 +634,7 @@ enum NvmeStatusCodes { NVME_NO_COMPLETE = 0xffff, }; -typedef struct NvmeFwSlotInfoLog { +typedef struct QEMU_PACKED NvmeFwSlotInfoLog { uint8_t afi; uint8_t reserved1[7]; uint8_t frs1[8]; @@ -647,7 +647,7 @@ typedef struct NvmeFwSlotInfoLog { uint8_t reserved2[448]; } NvmeFwSlotInfoLog; -typedef struct NvmeErrorLog { +typedef struct QEMU_PACKED NvmeErrorLog { uint64_t error_count; uint16_t sqid; uint16_t cid; @@ -659,7 +659,7 @@ typedef struct NvmeErrorLog { uint8_t resv[35]; } NvmeErrorLog; -typedef struct NvmeSmartLog { +typedef struct QEMU_PACKED NvmeSmartLog { uint8_t critical_warning; uint8_t temperature[2]; uint8_t available_spare; @@ -693,7 +693,7 @@ enum LogIdentifier { NVME_LOG_FW_SLOT_INFO = 0x03, }; -typedef struct NvmePSD { +typedef struct QEMU_PACKED NvmePSD { uint16_t mp; uint16_t reserved; uint32_t enlat; @@ -713,7 +713,7 @@ enum { NVME_ID_CNS_NS_ACTIVE_LIST = 0x2, }; -typedef struct NvmeIdCtrl { +typedef struct QEMU_PACKED NvmeIdCtrl { uint16_t vid; uint16_t ssvid; uint8_t sn[20]; @@ -807,7 +807,7 @@ enum NvmeFeatureIds { NVME_SOFTWARE_PROGRESS_MARKER = 0x80 }; -typedef struct NvmeRangeType { +typedef struct QEMU_PACKED NvmeRangeType { uint8_t type; uint8_t attributes; uint8_t rsvd2[14]; @@ -817,13 +817,13 @@ typedef struct NvmeRangeType { uint8_t rsvd48[16]; } NvmeRangeType; -typedef struct NvmeLBAF { +typedef struct QEMU_PACKED NvmeLBAF { uint16_t ms; uint8_t ds; uint8_t rp; } NvmeLBAF; -typedef struct NvmeIdNs { +typedef struct QEMU_PACKED NvmeIdNs { uint64_t nsze; uint64_t ncap; uint64_t nuse; From af4a367d15d26c5594b4c38060083aa77a649a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 30 Jun 2020 13:04:28 +0200 Subject: [PATCH 04/39] hw/block/nvme: Fix pmrmsc register size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Persistent Memory Region Controller Memory Space Control register is 64-bit wide. See 'Figure 68: Register Definition' of the 'NVM Express Base Specification Revision 1.4'. Fixes: 6cf9413229 ("introduce PMR support from NVMe 1.4 spec") Reported-by: Klaus Jensen Reviewed-by: Dmitry Fomichev Reviewed-by: Klaus Jensen Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200630110429.19972-4-philmd@redhat.com> Signed-off-by: Klaus Jensen --- include/block/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index 71c5681912..82c384614a 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -21,7 +21,7 @@ typedef struct QEMU_PACKED NvmeBar { uint32_t pmrsts; uint32_t pmrebs; uint32_t pmrswtp; - uint32_t pmrmsc; + uint64_t pmrmsc; } NvmeBar; enum NvmeCapShift { From 74e18435c0eb59004e580f7103b3482fafab2c47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 30 Jun 2020 13:04:29 +0200 Subject: [PATCH 05/39] hw/block/nvme: Align I/O BAR to 4 KiB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify the NVMe emulated device by aligning the I/O BAR to 4 KiB. Reviewed-by: Dmitry Fomichev Reviewed-by: Klaus Jensen Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200630110429.19972-5-philmd@redhat.com> Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 5 ++--- include/block/nvme.h | 2 ++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 9e2f9547e5..8073b0c8e6 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -55,7 +55,6 @@ #include "nvme.h" #define NVME_MAX_IOQPAIRS 0xffff -#define NVME_REG_SIZE 0x1000 #define NVME_DB_SIZE 4 #define NVME_CMB_BIR 2 #define NVME_PMR_BIR 2 @@ -1322,7 +1321,7 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, NvmeCtrl *n = (NvmeCtrl *)opaque; if (addr < sizeof(n->bar)) { nvme_write_bar(n, addr, data, size); - } else if (addr >= 0x1000) { + } else { nvme_process_db(n, addr, data); } } @@ -1415,7 +1414,7 @@ static void nvme_init_state(NvmeCtrl *n) { n->num_namespaces = 1; /* add one to max_ioqpairs to account for the admin queue pair */ - n->reg_size = pow2ceil(NVME_REG_SIZE + + n->reg_size = pow2ceil(sizeof(NvmeBar) + 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE); n->namespaces = g_new0(NvmeNamespace, n->num_namespaces); n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1); diff --git a/include/block/nvme.h b/include/block/nvme.h index 82c384614a..4e1cea576a 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -22,6 +22,7 @@ typedef struct QEMU_PACKED NvmeBar { uint32_t pmrebs; uint32_t pmrswtp; uint64_t pmrmsc; + uint8_t reserved[484]; } NvmeBar; enum NvmeCapShift { @@ -879,6 +880,7 @@ enum NvmeIdNsDps { static inline void _nvme_check_size(void) { + QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); From c26f21737045a6c6ce7fbdea998868d9bcd9b489 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:46 +0200 Subject: [PATCH 06/39] hw/block/nvme: bump spec data structures to v1.3 Add missing fields in the Identify Controller and Identify Namespace data structures to bring them in line with NVMe v1.3. This also adds data structures and defines for SGL support which requires a couple of trivial changes to the nvme block driver as well. Signed-off-by: Klaus Jensen Acked-by: Fam Zheng Reviewed-by: Maxim Levitsky Reviewed-by: Dmitry Fomichev Message-Id: <20200706061303.246057-2-its@irrelevant.dk> --- block/nvme.c | 18 ++--- hw/block/nvme.c | 12 ++-- include/block/nvme.h | 158 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 156 insertions(+), 32 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index 374e268915..c1c4c07ac6 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -518,7 +518,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) error_setg(errp, "Cannot map buffer for DMA"); goto out; } - cmd.prp1 = cpu_to_le64(iova); + cmd.dptr.prp1 = cpu_to_le64(iova); if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { error_setg(errp, "Failed to identify controller"); @@ -629,7 +629,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) } cmd = (NvmeCmd) { .opcode = NVME_ADM_CMD_CREATE_CQ, - .prp1 = cpu_to_le64(q->cq.iova), + .dptr.prp1 = cpu_to_le64(q->cq.iova), .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), .cdw11 = cpu_to_le32(0x3), }; @@ -640,7 +640,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) } cmd = (NvmeCmd) { .opcode = NVME_ADM_CMD_CREATE_SQ, - .prp1 = cpu_to_le64(q->sq.iova), + .dptr.prp1 = cpu_to_le64(q->sq.iova), .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), .cdw11 = cpu_to_le32(0x1 | (n << 16)), }; @@ -988,16 +988,16 @@ try_map: case 0: abort(); case 1: - cmd->prp1 = pagelist[0]; - cmd->prp2 = 0; + cmd->dptr.prp1 = pagelist[0]; + cmd->dptr.prp2 = 0; break; case 2: - cmd->prp1 = pagelist[0]; - cmd->prp2 = pagelist[1]; + cmd->dptr.prp1 = pagelist[0]; + cmd->dptr.prp2 = pagelist[1]; break; default: - cmd->prp1 = pagelist[0]; - cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t)); + cmd->dptr.prp1 = pagelist[0]; + cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t)); break; } trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries); diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 8073b0c8e6..abd901fe1e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -396,8 +396,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRwCmd *rw = (NvmeRwCmd *)cmd; uint32_t nlb = le32_to_cpu(rw->nlb) + 1; uint64_t slba = le64_to_cpu(rw->slba); - uint64_t prp1 = le64_to_cpu(rw->prp1); - uint64_t prp2 = le64_to_cpu(rw->prp2); + uint64_t prp1 = le64_to_cpu(rw->dptr.prp1); + uint64_t prp2 = le64_to_cpu(rw->dptr.prp2); uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds; @@ -794,8 +794,8 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n) static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) { - uint64_t prp1 = le64_to_cpu(cmd->prp1); - uint64_t prp2 = le64_to_cpu(cmd->prp2); + uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); + uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); uint64_t timestamp = nvme_get_timestamp(n); @@ -833,8 +833,8 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) { uint16_t ret; uint64_t timestamp; - uint64_t prp1 = le64_to_cpu(cmd->prp1); - uint64_t prp2 = le64_to_cpu(cmd->prp2); + uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); + uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); ret = nvme_dma_write_prp(n, (uint8_t *)×tamp, sizeof(timestamp), prp1, prp2); diff --git a/include/block/nvme.h b/include/block/nvme.h index 4e1cea576a..58f8e48719 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -378,15 +378,53 @@ enum NvmePmrmscMask { #define NVME_PMRMSC_SET_CBA(pmrmsc, val) \ (pmrmsc |= (uint64_t)(val & PMRMSC_CBA_MASK) << PMRMSC_CBA_SHIFT) +enum NvmeSglDescriptorType { + NVME_SGL_DESCR_TYPE_DATA_BLOCK = 0x0, + NVME_SGL_DESCR_TYPE_BIT_BUCKET = 0x1, + NVME_SGL_DESCR_TYPE_SEGMENT = 0x2, + NVME_SGL_DESCR_TYPE_LAST_SEGMENT = 0x3, + NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK = 0x4, + + NVME_SGL_DESCR_TYPE_VENDOR_SPECIFIC = 0xf, +}; + +enum NvmeSglDescriptorSubtype { + NVME_SGL_DESCR_SUBTYPE_ADDRESS = 0x0, +}; + +typedef struct QEMU_PACKED NvmeSglDescriptor { + uint64_t addr; + uint32_t len; + uint8_t rsvd[3]; + uint8_t type; +} NvmeSglDescriptor; + +#define NVME_SGL_TYPE(type) ((type >> 4) & 0xf) +#define NVME_SGL_SUBTYPE(type) (type & 0xf) + +typedef union NvmeCmdDptr { + struct { + uint64_t prp1; + uint64_t prp2; + }; + + NvmeSglDescriptor sgl; +} NvmeCmdDptr; + +enum NvmePsdt { + PSDT_PRP = 0x0, + PSDT_SGL_MPTR_CONTIGUOUS = 0x1, + PSDT_SGL_MPTR_SGL = 0x2, +}; + typedef struct QEMU_PACKED NvmeCmd { uint8_t opcode; - uint8_t fuse; + uint8_t flags; uint16_t cid; uint32_t nsid; uint64_t res1; uint64_t mptr; - uint64_t prp1; - uint64_t prp2; + NvmeCmdDptr dptr; uint32_t cdw10; uint32_t cdw11; uint32_t cdw12; @@ -395,6 +433,9 @@ typedef struct QEMU_PACKED NvmeCmd { uint32_t cdw15; } NvmeCmd; +#define NVME_CMD_FLAGS_FUSE(flags) (flags & 0x3) +#define NVME_CMD_FLAGS_PSDT(flags) ((flags >> 6) & 0x3) + enum NvmeAdminCommands { NVME_ADM_CMD_DELETE_SQ = 0x00, NVME_ADM_CMD_CREATE_SQ = 0x01, @@ -494,8 +535,7 @@ typedef struct QEMU_PACKED NvmeRwCmd { uint32_t nsid; uint64_t rsvd2; uint64_t mptr; - uint64_t prp1; - uint64_t prp2; + NvmeCmdDptr dptr; uint64_t slba; uint16_t nlb; uint16_t control; @@ -535,8 +575,7 @@ typedef struct QEMU_PACKED NvmeDsmCmd { uint16_t cid; uint32_t nsid; uint64_t rsvd2[2]; - uint64_t prp1; - uint64_t prp2; + NvmeCmdDptr dptr; uint32_t nr; uint32_t attributes; uint32_t rsvd12[4]; @@ -600,6 +639,12 @@ enum NvmeStatusCodes { NVME_CMD_ABORT_MISSING_FUSE = 0x000a, NVME_INVALID_NSID = 0x000b, NVME_CMD_SEQ_ERROR = 0x000c, + NVME_INVALID_SGL_SEG_DESCR = 0x000d, + NVME_INVALID_NUM_SGL_DESCRS = 0x000e, + NVME_DATA_SGL_LEN_INVALID = 0x000f, + NVME_MD_SGL_LEN_INVALID = 0x0010, + NVME_SGL_DESCR_TYPE_INVALID = 0x0011, + NVME_INVALID_USE_OF_CMB = 0x0012, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, @@ -688,7 +733,7 @@ enum NvmeSmartWarn { NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, }; -enum LogIdentifier { +enum NvmeLogIdentifier { NVME_LOG_ERROR_INFO = 0x01, NVME_LOG_SMART_INFO = 0x02, NVME_LOG_FW_SLOT_INFO = 0x03, @@ -712,6 +757,7 @@ enum { NVME_ID_CNS_NS = 0x0, NVME_ID_CNS_CTRL = 0x1, NVME_ID_CNS_NS_ACTIVE_LIST = 0x2, + NVME_ID_CNS_NS_DESCR_LIST = 0x3, }; typedef struct QEMU_PACKED NvmeIdCtrl { @@ -724,7 +770,15 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint8_t ieee[3]; uint8_t cmic; uint8_t mdts; - uint8_t rsvd255[178]; + uint16_t cntlid; + uint32_t ver; + uint32_t rtd3r; + uint32_t rtd3e; + uint32_t oaes; + uint32_t ctratt; + uint8_t rsvd100[12]; + uint8_t fguid[16]; + uint8_t rsvd128[128]; uint16_t oacs; uint8_t acl; uint8_t aerl; @@ -732,10 +786,28 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint8_t lpa; uint8_t elpe; uint8_t npss; - uint8_t rsvd511[248]; + uint8_t avscc; + uint8_t apsta; + uint16_t wctemp; + uint16_t cctemp; + uint16_t mtfa; + uint32_t hmpre; + uint32_t hmmin; + uint8_t tnvmcap[16]; + uint8_t unvmcap[16]; + uint32_t rpmbs; + uint16_t edstt; + uint8_t dsto; + uint8_t fwug; + uint16_t kas; + uint16_t hctma; + uint16_t mntmt; + uint16_t mxtmt; + uint32_t sanicap; + uint8_t rsvd332[180]; uint8_t sqes; uint8_t cqes; - uint16_t rsvd515; + uint16_t maxcmd; uint32_t nn; uint16_t oncs; uint16_t fuses; @@ -743,8 +815,14 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint8_t vwc; uint16_t awun; uint16_t awupf; - uint8_t rsvd703[174]; - uint8_t rsvd2047[1344]; + uint8_t nvscc; + uint8_t rsvd531; + uint16_t acwu; + uint8_t rsvd534[2]; + uint32_t sgls; + uint8_t rsvd540[228]; + uint8_t subnqn[256]; + uint8_t rsvd1024[1024]; NvmePSD psd[32]; uint8_t vs[1024]; } NvmeIdCtrl; @@ -770,6 +848,16 @@ enum NvmeIdCtrlOncs { #define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf) #define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf) +#define NVME_CTRL_SGLS_SUPPORT_MASK (0x3 << 0) +#define NVME_CTRL_SGLS_SUPPORT_NO_ALIGN (0x1 << 0) +#define NVME_CTRL_SGLS_SUPPORT_DWORD_ALIGN (0x1 << 1) +#define NVME_CTRL_SGLS_KEYED (0x1 << 2) +#define NVME_CTRL_SGLS_BITBUCKET (0x1 << 16) +#define NVME_CTRL_SGLS_MPTR_CONTIGUOUS (0x1 << 17) +#define NVME_CTRL_SGLS_EXCESS_LENGTH (0x1 << 18) +#define NVME_CTRL_SGLS_MPTR_SGL (0x1 << 19) +#define NVME_CTRL_SGLS_ADDR_OFFSET (0x1 << 20) + typedef struct NvmeFeatureVal { uint32_t arbitration; uint32_t power_mgmt; @@ -792,6 +880,15 @@ typedef struct NvmeFeatureVal { #define NVME_INTC_THR(intc) (intc & 0xff) #define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff) +#define NVME_TEMP_THSEL(temp) ((temp >> 20) & 0x3) +#define NVME_TEMP_THSEL_OVER 0x0 +#define NVME_TEMP_THSEL_UNDER 0x1 + +#define NVME_TEMP_TMPSEL(temp) ((temp >> 16) & 0xf) +#define NVME_TEMP_TMPSEL_COMPOSITE 0x0 + +#define NVME_TEMP_TMPTH(temp) (temp & 0xffff) + enum NvmeFeatureIds { NVME_ARBITRATION = 0x1, NVME_POWER_MANAGEMENT = 0x2, @@ -834,18 +931,43 @@ typedef struct QEMU_PACKED NvmeIdNs { uint8_t mc; uint8_t dpc; uint8_t dps; - uint8_t nmic; uint8_t rescap; uint8_t fpi; uint8_t dlfeat; - - uint8_t res34[94]; + uint16_t nawun; + uint16_t nawupf; + uint16_t nacwu; + uint16_t nabsn; + uint16_t nabo; + uint16_t nabspf; + uint16_t noiob; + uint8_t nvmcap[16]; + uint8_t rsvd64[40]; + uint8_t nguid[16]; + uint64_t eui64; NvmeLBAF lbaf[16]; - uint8_t res192[192]; + uint8_t rsvd192[192]; uint8_t vs[3712]; } NvmeIdNs; +typedef struct QEMU_PACKED NvmeIdNsDescr { + uint8_t nidt; + uint8_t nidl; + uint8_t rsvd2[2]; +} NvmeIdNsDescr; + +enum { + NVME_NIDT_EUI64_LEN = 8, + NVME_NIDT_NGUID_LEN = 16, + NVME_NIDT_UUID_LEN = 16, +}; + +enum NvmeNsIdentifierType { + NVME_NIDT_EUI64 = 0x1, + NVME_NIDT_NGUID = 0x2, + NVME_NIDT_UUID = 0x3, +}; /*Deallocate Logical Block Features*/ #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10) @@ -897,5 +1019,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4); } #endif From a70174ef2e6cd73b8e071ed7ce3cbef8ccb60089 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:47 +0200 Subject: [PATCH 07/39] hw/block/nvme: fix missing endian conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a missing cpu_to conversion by moving conversion to just before returning instead. Signed-off-by: Klaus Jensen Suggested-by: Philippe Mathieu-Daudé Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-3-its@irrelevant.dk> --- hw/block/nvme.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index abd901fe1e..43b70730e4 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -814,8 +814,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); break; case NVME_NUMBER_OF_QUEUES: - result = cpu_to_le32((n->params.max_ioqpairs - 1) | - ((n->params.max_ioqpairs - 1) << 16)); + result = (n->params.max_ioqpairs - 1) | + ((n->params.max_ioqpairs - 1) << 16); trace_pci_nvme_getfeat_numq(result); break; case NVME_TIMESTAMP: @@ -825,7 +825,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } - req->cqe.result = result; + req->cqe.result = cpu_to_le32(result); return NVME_SUCCESS; } From a04425fb06738b696ba81a5494645965b1c5955b Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:48 +0200 Subject: [PATCH 08/39] hw/block/nvme: additional tracing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add various additional tracing and streamline nvme_identify_ns and nvme_identify_nslist (they do not need to repeat the command, it is already in the trace name). Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-4-its@irrelevant.dk> --- hw/block/nvme.c | 33 +++++++++++++++++++++++++++++++++ hw/block/trace-events | 13 +++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 43b70730e4..7e065353a5 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -68,6 +68,20 @@ static void nvme_process_sq(void *opaque); +static uint16_t nvme_cid(NvmeRequest *req) +{ + if (!req) { + return 0xffff; + } + + return le16_to_cpu(req->cqe.cid); +} + +static uint16_t nvme_sqid(NvmeRequest *req) +{ + return le16_to_cpu(req->sq->sqid); +} + static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { hwaddr low = n->ctrl_mem.addr; @@ -330,6 +344,8 @@ static void nvme_post_cqes(void *opaque) static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) { assert(cq->cqid == req->sq->cqid); + trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid, + req->status); QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); @@ -342,6 +358,8 @@ static void nvme_rw_cb(void *opaque, int ret) NvmeCtrl *n = sq->ctrl; NvmeCQueue *cq = n->cq[sq->cqid]; + trace_pci_nvme_rw_cb(nvme_cid(req)); + if (!ret) { block_acct_done(blk_get_stats(n->conf.blk), &req->acct); req->status = NVME_SUCCESS; @@ -377,6 +395,8 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, uint64_t offset = slba << data_shift; uint32_t count = nlb << data_shift; + trace_pci_nvme_write_zeroes(nvme_cid(req), slba, nlb); + if (unlikely(slba + nlb > ns->id_ns.nsze)) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); return NVME_LBA_RANGE | NVME_DNR; @@ -444,6 +464,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) NvmeNamespace *ns; uint32_t nsid = le32_to_cpu(cmd->nsid); + trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), cmd->opcode); + if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); return NVME_INVALID_NSID | NVME_DNR; @@ -875,6 +897,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) { + trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), cmd->opcode); + switch (cmd->opcode) { case NVME_ADM_CMD_DELETE_SQ: return nvme_del_sq(n, cmd); @@ -1203,6 +1227,8 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) uint8_t *ptr = (uint8_t *)&n->bar; uint64_t val = 0; + trace_pci_nvme_mmio_read(addr); + if (unlikely(addr & (sizeof(uint32_t) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32, "MMIO read not 32-bit aligned," @@ -1272,6 +1298,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) return; } + trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head); + start_sqs = nvme_cq_full(cq) ? 1 : 0; cq->head = new_head; if (start_sqs) { @@ -1310,6 +1338,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) return; } + trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail); + sq->tail = new_tail; timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); } @@ -1319,6 +1349,9 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; + + trace_pci_nvme_mmio_write(addr, data); + if (addr < sizeof(n->bar)) { nvme_write_bar(n, addr, data, size); } else { diff --git a/hw/block/trace-events b/hw/block/trace-events index 958fcc5508..c40c0d2e4b 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -33,19 +33,28 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u" pci_nvme_irq_pin(void) "pulsing IRQ pin" pci_nvme_irq_masked(void) "IRQ is masked" pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64"" +pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8"" +pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8"" pci_nvme_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64"" +pci_nvme_rw_cb(uint16_t cid) "cid %"PRIu16"" +pci_nvme_write_zeroes(uint16_t cid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" slba %"PRIu64" nlb %"PRIu32"" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" pci_nvme_identify_ctrl(void) "identify controller" -pci_nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16"" -pci_nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16"" +pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" pci_nvme_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s" pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d" pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" pci_nvme_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64"" pci_nvme_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64"" +pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16"" +pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64"" +pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64"" +pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16"" +pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "cqid %"PRIu16" new_tail %"PRIu16"" pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64"" pci_nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64"" pci_nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64"" From 1504ede69304ee8588483dd8c5e6434dd2c0faff Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:49 +0200 Subject: [PATCH 09/39] hw/block/nvme: add support for the abort command Required for compliance with NVMe revision 1.3d. See NVM Express 1.3d, Section 5.1 ("Abort command"). The Abort command is a best effort command; for now, the device always fails to abort the given command. Signed-off-by: Klaus Jensen Signed-off-by: Klaus Jensen Acked-by: Keith Busch Reviewed-by: Maxim Levitsky Reviewed-by: Dmitry Fomichev Message-Id: <20200706061303.246057-5-its@irrelevant.dk> --- hw/block/nvme.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7e065353a5..b8d5251e1e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -774,6 +774,18 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd) } } +static uint16_t nvme_abort(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +{ + uint16_t sqid = le32_to_cpu(cmd->cdw10) & 0xffff; + + req->cqe.result = 1; + if (nvme_check_sqid(n, sqid)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + return NVME_SUCCESS; +} + static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts) { trace_pci_nvme_setfeat_timestamp(ts); @@ -910,6 +922,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return nvme_create_cq(n, cmd); case NVME_ADM_CMD_IDENTIFY: return nvme_identify(n, cmd); + case NVME_ADM_CMD_ABORT: + return nvme_abort(n, cmd, req); case NVME_ADM_CMD_SET_FEATURES: return nvme_set_feature(n, cmd, req); case NVME_ADM_CMD_GET_FEATURES: @@ -1594,6 +1608,19 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->ieee[1] = 0x02; id->ieee[2] = 0xb3; id->oacs = cpu_to_le16(0); + + /* + * Because the controller always completes the Abort command immediately, + * there can never be more than one concurrently executing Abort command, + * so this value is never used for anything. Note that there can easily be + * many Abort commands in the queues, but they are not considered + * "executing" until processed by nvme_abort. + * + * The specification recommends a value of 3 for Abort Command Limit (four + * concurrently outstanding Abort commands), so lets use that though it is + * inconsequential. + */ + id->acl = 3; id->frmw = 7 << 1; id->lpa = 1 << 0; id->sqes = (0x6 << 4) | 0x6; From 69ff06c49e9b0768bd68d887c2f29e8ff84dfaf3 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:50 +0200 Subject: [PATCH 10/39] hw/block/nvme: add temperature threshold feature It might seem weird to implement this feature for an emulated device, but it is mandatory to support and the feature is useful for testing asynchronous event request support, which will be added in a later patch. Signed-off-by: Klaus Jensen Acked-by: Keith Busch Reviewed-by: Maxim Levitsky Reviewed-by: Dmitry Fomichev Message-Id: <20200706061303.246057-6-its@irrelevant.dk> --- hw/block/nvme.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ hw/block/nvme.h | 1 + include/block/nvme.h | 5 ++++- 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b8d5251e1e..38c6461fd3 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -58,6 +58,9 @@ #define NVME_DB_SIZE 4 #define NVME_CMB_BIR 2 #define NVME_PMR_BIR 2 +#define NVME_TEMPERATURE 0x143 +#define NVME_TEMPERATURE_WARNING 0x157 +#define NVME_TEMPERATURE_CRITICAL 0x175 #define NVME_GUEST_ERR(trace, fmt, ...) \ do { \ @@ -840,9 +843,31 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) { uint32_t dw10 = le32_to_cpu(cmd->cdw10); + uint32_t dw11 = le32_to_cpu(cmd->cdw11); uint32_t result; switch (dw10) { + case NVME_TEMPERATURE_THRESHOLD: + result = 0; + + /* + * The controller only implements the Composite Temperature sensor, so + * return 0 for all other sensors. + */ + if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { + break; + } + + switch (NVME_TEMP_THSEL(dw11)) { + case NVME_TEMP_THSEL_OVER: + result = n->features.temp_thresh_hi; + break; + case NVME_TEMP_THSEL_UNDER: + result = n->features.temp_thresh_low; + break; + } + + break; case NVME_VOLATILE_WRITE_CACHE: result = blk_enable_write_cache(n->conf.blk); trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); @@ -887,6 +912,23 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) uint32_t dw11 = le32_to_cpu(cmd->cdw11); switch (dw10) { + case NVME_TEMPERATURE_THRESHOLD: + if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { + break; + } + + switch (NVME_TEMP_THSEL(dw11)) { + case NVME_TEMP_THSEL_OVER: + n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11); + break; + case NVME_TEMP_THSEL_UNDER: + n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11); + break; + default: + return NVME_INVALID_FIELD | NVME_DNR; + } + + break; case NVME_VOLATILE_WRITE_CACHE: blk_set_enable_write_cache(n->conf.blk, dw11 & 1); break; @@ -1466,6 +1508,7 @@ static void nvme_init_state(NvmeCtrl *n) n->namespaces = g_new0(NvmeNamespace, n->num_namespaces); n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1); n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1); + n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING; } static void nvme_init_blk(NvmeCtrl *n, Error **errp) @@ -1623,6 +1666,11 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->acl = 3; id->frmw = 7 << 1; id->lpa = 1 << 0; + + /* recommended default value (~70 C) */ + id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); + id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL); + id->sqes = (0x6 << 4) | 0x6; id->cqes = (0x4 << 4) | 0x4; id->nn = cpu_to_le32(n->num_namespaces); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 1d30c0bca2..e3a2c907e2 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -107,6 +107,7 @@ typedef struct NvmeCtrl { NvmeSQueue admin_sq; NvmeCQueue admin_cq; NvmeIdCtrl id_ctrl; + NvmeFeatureVal features; } NvmeCtrl; /* calculate the number of LBAs that the namespace can accomodate */ diff --git a/include/block/nvme.h b/include/block/nvme.h index 58f8e48719..273640c2b1 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -861,7 +861,10 @@ enum NvmeIdCtrlOncs { typedef struct NvmeFeatureVal { uint32_t arbitration; uint32_t power_mgmt; - uint32_t temp_thresh; + struct { + uint16_t temp_thresh_hi; + uint16_t temp_thresh_low; + }; uint32_t err_rec; uint32_t volatile_wc; uint32_t num_queues; From 42a42e4610ab9e4ca0bcc976f4b19f9f612a085e Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:51 +0200 Subject: [PATCH 11/39] hw/block/nvme: mark fw slot 1 as read-only Mark firmware slot 1 as read-only and only support that slot. Signed-off-by: Klaus Jensen Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-7-its@irrelevant.dk> --- hw/block/nvme.c | 3 ++- include/block/nvme.h | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 38c6461fd3..c970e41699 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -61,6 +61,7 @@ #define NVME_TEMPERATURE 0x143 #define NVME_TEMPERATURE_WARNING 0x157 #define NVME_TEMPERATURE_CRITICAL 0x175 +#define NVME_NUM_FW_SLOTS 1 #define NVME_GUEST_ERR(trace, fmt, ...) \ do { \ @@ -1664,7 +1665,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) * inconsequential. */ id->acl = 3; - id->frmw = 7 << 1; + id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; id->lpa = 1 << 0; /* recommended default value (~70 C) */ diff --git a/include/block/nvme.h b/include/block/nvme.h index 273640c2b1..f3fdb5bf11 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -843,6 +843,10 @@ enum NvmeIdCtrlOncs { NVME_ONCS_TIMESTAMP = 1 << 6, }; +enum NvmeIdCtrlFrmw { + NVME_FRMW_SLOT1_RO = 1 << 0, +}; + #define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf) #define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf) #define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf) From 94a7897c41db0596c544af1bd27eedb833eae496 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:52 +0200 Subject: [PATCH 12/39] hw/block/nvme: add support for the get log page command Add support for the Get Log Page command and basic implementations of the mandatory Error Information, SMART / Health Information and Firmware Slot Information log pages. In violation of the specification, the SMART / Health Information log page does not persist information over the lifetime of the controller because the device has no place to store such persistent state. Note that the LPA field in the Identify Controller data structure intentionally has bit 0 cleared because there is no namespace specific information in the SMART / Health information log page. Required for compliance with NVMe revision 1.3d. See NVM Express 1.3d, Section 5.14 ("Get Log Page command"). Signed-off-by: Klaus Jensen Signed-off-by: Klaus Jensen Acked-by: Keith Busch Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-8-its@irrelevant.dk> --- hw/block/nvme.c | 141 +++++++++++++++++++++++++++++++++++++++++- hw/block/nvme.h | 2 + hw/block/trace-events | 2 + include/block/nvme.h | 6 +- 4 files changed, 149 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c970e41699..739581988d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -605,6 +605,141 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd) return NVME_SUCCESS; } +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); + uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); + uint32_t nsid = le32_to_cpu(cmd->nsid); + + uint32_t trans_len; + time_t current_ms; + uint64_t units_read = 0, units_written = 0; + uint64_t read_commands = 0, write_commands = 0; + NvmeSmartLog smart; + BlockAcctStats *s; + + if (nsid && nsid != 0xffffffff) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + s = blk_get_stats(n->conf.blk); + + units_read = s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; + units_written = s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; + read_commands = s->nr_ops[BLOCK_ACCT_READ]; + write_commands = s->nr_ops[BLOCK_ACCT_WRITE]; + + if (off > sizeof(smart)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + trans_len = MIN(sizeof(smart) - off, buf_len); + + memset(&smart, 0x0, sizeof(smart)); + + smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(units_read, 1000)); + smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(units_written, + 1000)); + smart.host_read_commands[0] = cpu_to_le64(read_commands); + smart.host_write_commands[0] = cpu_to_le64(write_commands); + + smart.temperature = cpu_to_le16(n->temperature); + + if ((n->temperature >= n->features.temp_thresh_hi) || + (n->temperature <= n->features.temp_thresh_low)) { + smart.critical_warning |= NVME_SMART_TEMPERATURE; + } + + current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); + smart.power_on_hours[0] = + cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60); + + return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1, + prp2); +} + +static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + uint32_t trans_len; + uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); + uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); + NvmeFwSlotInfoLog fw_log = { + .afi = 0x1, + }; + + strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); + + if (off > sizeof(fw_log)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + trans_len = MIN(sizeof(fw_log) - off, buf_len); + + return nvme_dma_read_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, + prp2); +} + +static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + uint32_t trans_len; + uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); + uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); + NvmeErrorLog errlog; + + if (off > sizeof(errlog)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + memset(&errlog, 0x0, sizeof(errlog)); + + trans_len = MIN(sizeof(errlog) - off, buf_len); + + return nvme_dma_read_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2); +} + +static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +{ + uint32_t dw10 = le32_to_cpu(cmd->cdw10); + uint32_t dw11 = le32_to_cpu(cmd->cdw11); + uint32_t dw12 = le32_to_cpu(cmd->cdw12); + uint32_t dw13 = le32_to_cpu(cmd->cdw13); + uint8_t lid = dw10 & 0xff; + uint8_t lsp = (dw10 >> 8) & 0xf; + uint8_t rae = (dw10 >> 15) & 0x1; + uint32_t numdl, numdu; + uint64_t off, lpol, lpou; + size_t len; + + numdl = (dw10 >> 16); + numdu = (dw11 & 0xffff); + lpol = dw12; + lpou = dw13; + + len = (((numdu << 16) | numdl) + 1) << 2; + off = (lpou << 32ULL) | lpol; + + if (off & 0x3) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off); + + switch (lid) { + case NVME_LOG_ERROR_INFO: + return nvme_error_info(n, cmd, len, off, req); + case NVME_LOG_SMART_INFO: + return nvme_smart_info(n, cmd, len, off, req); + case NVME_LOG_FW_SLOT_INFO: + return nvme_fw_log_info(n, cmd, len, off, req); + default: + trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); + return NVME_INVALID_FIELD | NVME_DNR; + } +} + static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) { n->cq[cq->cqid] = NULL; @@ -959,6 +1094,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return nvme_del_sq(n, cmd); case NVME_ADM_CMD_CREATE_SQ: return nvme_create_sq(n, cmd); + case NVME_ADM_CMD_GET_LOG_PAGE: + return nvme_get_log(n, cmd, req); case NVME_ADM_CMD_DELETE_CQ: return nvme_del_cq(n, cmd); case NVME_ADM_CMD_CREATE_CQ: @@ -1509,7 +1646,9 @@ static void nvme_init_state(NvmeCtrl *n) n->namespaces = g_new0(NvmeNamespace, n->num_namespaces); n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1); n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1); + n->temperature = NVME_TEMPERATURE; n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING; + n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); } static void nvme_init_blk(NvmeCtrl *n, Error **errp) @@ -1666,7 +1805,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) */ id->acl = 3; id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; - id->lpa = 1 << 0; + id->lpa = NVME_LPA_EXTENDED; /* recommended default value (~70 C) */ id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index e3a2c907e2..8228978e93 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -98,6 +98,8 @@ typedef struct NvmeCtrl { uint32_t irq_status; uint64_t host_timestamp; /* Timestamp sent by the host */ uint64_t timestamp_set_qemu_clock_ms; /* QEMU clock time */ + uint64_t starttime_ms; + uint16_t temperature; HostMemoryBackend *pmrdev; diff --git a/hw/block/trace-events b/hw/block/trace-events index c40c0d2e4b..3330d74e48 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -45,6 +45,7 @@ pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" pci_nvme_identify_ctrl(void) "identify controller" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64"" pci_nvme_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s" pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d" pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" @@ -94,6 +95,7 @@ pci_nvme_err_invalid_create_cq_qflags(uint16_t qflags) "failed creating completi pci_nvme_err_invalid_identify_cns(uint16_t cns) "identify, invalid cns=0x%"PRIx16"" pci_nvme_err_invalid_getfeat(int dw10) "invalid get features, dw10=0x%"PRIx32"" pci_nvme_err_invalid_setfeat(uint32_t dw10) "invalid set features, dw10=0x%"PRIx32"" +pci_nvme_err_invalid_log_page(uint16_t cid, uint16_t lid) "cid %"PRIu16" lid 0x%"PRIx16"" pci_nvme_err_startfail_cq(void) "nvme_start_ctrl failed because there are non-admin completion queues" pci_nvme_err_startfail_sq(void) "nvme_start_ctrl failed because there are non-admin submission queues" pci_nvme_err_startfail_nbarasq(void) "nvme_start_ctrl failed because the admin submission queue address is null" diff --git a/include/block/nvme.h b/include/block/nvme.h index f3fdb5bf11..9ec9f7d7a3 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -707,7 +707,7 @@ typedef struct QEMU_PACKED NvmeErrorLog { typedef struct QEMU_PACKED NvmeSmartLog { uint8_t critical_warning; - uint8_t temperature[2]; + uint16_t temperature; uint8_t available_spare; uint8_t available_spare_threshold; uint8_t percentage_used; @@ -847,6 +847,10 @@ enum NvmeIdCtrlFrmw { NVME_FRMW_SLOT1_RO = 1 << 0, }; +enum NvmeIdCtrlLpa { + NVME_LPA_EXTENDED = 1 << 2, +}; + #define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf) #define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf) #define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf) From 5d5a53302b95c50197d007407d24e2da3397926a Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:53 +0200 Subject: [PATCH 13/39] hw/block/nvme: add support for the asynchronous event request command Add support for the Asynchronous Event Request command. Required for compliance with NVMe revision 1.3d. See NVM Express 1.3d, Section 5.2 ("Asynchronous Event Request command"). Mostly imported from Keith's qemu-nvme tree. Modified with a max number of queued events (controllable with the aer_max_queued device parameter). The spec states that the controller *should* retain events, so we do best effort here. Signed-off-by: Klaus Jensen Signed-off-by: Klaus Jensen Acked-by: Keith Busch Reviewed-by: Maxim Levitsky Reviewed-by: Dmitry Fomichev Message-Id: <20200706061303.246057-9-its@irrelevant.dk> --- hw/block/nvme.c | 210 ++++++++++++++++++++++++++++++++++++++++-- hw/block/nvme.h | 10 +- hw/block/trace-events | 9 ++ include/block/nvme.h | 8 +- 4 files changed, 227 insertions(+), 10 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 739581988d..41a15da3f6 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -20,7 +20,8 @@ * -device nvme,drive=,serial=,id=, \ * cmb_size_mb=, \ * [pmrdev=,] \ - * max_ioqpairs= + * max_ioqpairs=, \ + * aerl=, aer_max_queued= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. @@ -32,6 +33,20 @@ * For example: * -object memory-backend-file,id=,share=on,mem-path=, \ * size= .... -device nvme,...,pmrdev= + * + * + * nvme device parameters + * ~~~~~~~~~~~~~~~~~~~~~~ + * - `aerl` + * The Asynchronous Event Request Limit (AERL). Indicates the maximum number + * of concurrently outstanding Asynchronous Event Request commands suppoert + * by the controller. This is a 0's based value. + * + * - `aer_max_queued` + * This is the maximum number of events that the device will enqueue for + * completion when there are no oustanding AERs. When the maximum number of + * enqueued events are reached, subsequent events will be dropped. + * */ #include "qemu/osdep.h" @@ -355,6 +370,85 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); } +static void nvme_process_aers(void *opaque) +{ + NvmeCtrl *n = opaque; + NvmeAsyncEvent *event, *next; + + trace_pci_nvme_process_aers(n->aer_queued); + + QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) { + NvmeRequest *req; + NvmeAerResult *result; + + /* can't post cqe if there is nothing to complete */ + if (!n->outstanding_aers) { + trace_pci_nvme_no_outstanding_aers(); + break; + } + + /* ignore if masked (cqe posted, but event not cleared) */ + if (n->aer_mask & (1 << event->result.event_type)) { + trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask); + continue; + } + + QTAILQ_REMOVE(&n->aer_queue, event, entry); + n->aer_queued--; + + n->aer_mask |= 1 << event->result.event_type; + n->outstanding_aers--; + + req = n->aer_reqs[n->outstanding_aers]; + + result = (NvmeAerResult *) &req->cqe.result; + result->event_type = event->result.event_type; + result->event_info = event->result.event_info; + result->log_page = event->result.log_page; + g_free(event); + + req->status = NVME_SUCCESS; + + trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info, + result->log_page); + + nvme_enqueue_req_completion(&n->admin_cq, req); + } +} + +static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type, + uint8_t event_info, uint8_t log_page) +{ + NvmeAsyncEvent *event; + + trace_pci_nvme_enqueue_event(event_type, event_info, log_page); + + if (n->aer_queued == n->params.aer_max_queued) { + trace_pci_nvme_enqueue_event_noqueue(n->aer_queued); + return; + } + + event = g_new(NvmeAsyncEvent, 1); + event->result = (NvmeAerResult) { + .event_type = event_type, + .event_info = event_info, + .log_page = log_page, + }; + + QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry); + n->aer_queued++; + + nvme_process_aers(n); +} + +static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type) +{ + n->aer_mask &= ~(1 << event_type); + if (!QTAILQ_EMPTY(&n->aer_queue)) { + nvme_process_aers(n); + } +} + static void nvme_rw_cb(void *opaque, int ret) { NvmeRequest *req = opaque; @@ -605,8 +699,9 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd) return NVME_SUCCESS; } -static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, - uint64_t off, NvmeRequest *req) +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, + uint32_t buf_len, uint64_t off, + NvmeRequest *req) { uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); @@ -655,6 +750,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, smart.power_on_hours[0] = cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60); + if (!rae) { + nvme_clear_events(n, NVME_AER_TYPE_SMART); + } + return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2); } @@ -681,14 +780,19 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, prp2); } -static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, - uint64_t off, NvmeRequest *req) +static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, + uint32_t buf_len, uint64_t off, + NvmeRequest *req) { uint32_t trans_len; uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); NvmeErrorLog errlog; + if (!rae) { + nvme_clear_events(n, NVME_AER_TYPE_ERROR); + } + if (off > sizeof(errlog)) { return NVME_INVALID_FIELD | NVME_DNR; } @@ -729,9 +833,9 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) switch (lid) { case NVME_LOG_ERROR_INFO: - return nvme_error_info(n, cmd, len, off, req); + return nvme_error_info(n, cmd, rae, len, off, req); case NVME_LOG_SMART_INFO: - return nvme_smart_info(n, cmd, len, off, req); + return nvme_smart_info(n, cmd, rae, len, off, req); case NVME_LOG_FW_SLOT_INFO: return nvme_fw_log_info(n, cmd, len, off, req); default: @@ -1013,6 +1117,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) ((n->params.max_ioqpairs - 1) << 16); trace_pci_nvme_getfeat_numq(result); break; + case NVME_ASYNCHRONOUS_EVENT_CONF: + result = n->features.async_config; + break; case NVME_TIMESTAMP: return nvme_get_feature_timestamp(n, cmd); default: @@ -1064,6 +1171,14 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } + if (((n->temperature >= n->features.temp_thresh_hi) || + (n->temperature <= n->features.temp_thresh_low)) && + NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) { + nvme_enqueue_event(n, NVME_AER_TYPE_SMART, + NVME_AER_INFO_SMART_TEMP_THRESH, + NVME_LOG_SMART_INFO); + } + break; case NVME_VOLATILE_WRITE_CACHE: blk_set_enable_write_cache(n->conf.blk, dw11 & 1); @@ -1076,6 +1191,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) | ((n->params.max_ioqpairs - 1) << 16)); break; + case NVME_ASYNCHRONOUS_EVENT_CONF: + n->features.async_config = dw11; + break; case NVME_TIMESTAMP: return nvme_set_feature_timestamp(n, cmd); default: @@ -1085,6 +1203,25 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return NVME_SUCCESS; } +static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +{ + trace_pci_nvme_aer(nvme_cid(req)); + + if (n->outstanding_aers > n->params.aerl) { + trace_pci_nvme_aer_aerl_exceeded(); + return NVME_AER_LIMIT_EXCEEDED; + } + + n->aer_reqs[n->outstanding_aers] = req; + n->outstanding_aers++; + + if (!QTAILQ_EMPTY(&n->aer_queue)) { + nvme_process_aers(n); + } + + return NVME_NO_COMPLETE; +} + static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) { trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), cmd->opcode); @@ -1108,6 +1245,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return nvme_set_feature(n, cmd, req); case NVME_ADM_CMD_GET_FEATURES: return nvme_get_feature(n, cmd, req); + case NVME_ADM_CMD_ASYNC_EV_REQ: + return nvme_aer(n, cmd, req); default: trace_pci_nvme_err_invalid_admin_opc(cmd->opcode); return NVME_INVALID_OPCODE | NVME_DNR; @@ -1162,6 +1301,15 @@ static void nvme_clear_ctrl(NvmeCtrl *n) } } + while (!QTAILQ_EMPTY(&n->aer_queue)) { + NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue); + QTAILQ_REMOVE(&n->aer_queue, event, entry); + g_free(event); + } + + n->aer_queued = 0; + n->outstanding_aers = 0; + blk_flush(n->conf.blk); n->bar.cc = 0; } @@ -1258,6 +1406,8 @@ static int nvme_start_ctrl(NvmeCtrl *n) nvme_set_timestamp(n, 0ULL); + QTAILQ_INIT(&n->aer_queue); + return 0; } @@ -1479,6 +1629,26 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) "completion queue doorbell write" " for nonexistent queue," " sqid=%"PRIu32", ignoring", qid); + + /* + * NVM Express v1.3d, Section 4.1 state: "If host software writes + * an invalid value to the Submission Queue Tail Doorbell or + * Completion Queue Head Doorbell regiter and an Asynchronous Event + * Request command is outstanding, then an asynchronous event is + * posted to the Admin Completion Queue with a status code of + * Invalid Doorbell Write Value." + * + * Also note that the spec includes the "Invalid Doorbell Register" + * status code, but nowhere does it specify when to use it. + * However, it seems reasonable to use it here in a similar + * fashion. + */ + if (n->outstanding_aers) { + nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, + NVME_AER_INFO_ERR_INVALID_DB_REGISTER, + NVME_LOG_ERROR_INFO); + } + return; } @@ -1489,6 +1659,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) " beyond queue size, sqid=%"PRIu32"," " new_head=%"PRIu16", ignoring", qid, new_head); + + if (n->outstanding_aers) { + nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, + NVME_AER_INFO_ERR_INVALID_DB_VALUE, + NVME_LOG_ERROR_INFO); + } + return; } @@ -1519,6 +1696,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) "submission queue doorbell write" " for nonexistent queue," " sqid=%"PRIu32", ignoring", qid); + + if (n->outstanding_aers) { + nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, + NVME_AER_INFO_ERR_INVALID_DB_REGISTER, + NVME_LOG_ERROR_INFO); + } + return; } @@ -1529,6 +1713,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) " beyond queue size, sqid=%"PRIu32"," " new_tail=%"PRIu16", ignoring", qid, new_tail); + + if (n->outstanding_aers) { + nvme_enqueue_event(n, NVME_AER_TYPE_ERROR, + NVME_AER_INFO_ERR_INVALID_DB_VALUE, + NVME_LOG_ERROR_INFO); + } + return; } @@ -1649,6 +1840,7 @@ static void nvme_init_state(NvmeCtrl *n) n->temperature = NVME_TEMPERATURE; n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING; n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); + n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); } static void nvme_init_blk(NvmeCtrl *n, Error **errp) @@ -1804,6 +1996,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) * inconsequential. */ id->acl = 3; + id->aerl = n->params.aerl; id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; id->lpa = NVME_LPA_EXTENDED; @@ -1878,6 +2071,7 @@ static void nvme_exit(PCIDevice *pci_dev) g_free(n->namespaces); g_free(n->cq); g_free(n->sq); + g_free(n->aer_reqs); if (n->params.cmb_size_mb) { g_free(n->cmbuf); @@ -1898,6 +2092,8 @@ static Property nvme_props[] = { DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0), DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64), DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65), + DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3), + DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 8228978e93..1837233617 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -9,10 +9,12 @@ typedef struct NvmeParams { uint32_t max_ioqpairs; uint16_t msix_qsize; uint32_t cmb_size_mb; + uint8_t aerl; + uint32_t aer_max_queued; } NvmeParams; typedef struct NvmeAsyncEvent { - QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry; + QTAILQ_ENTRY(NvmeAsyncEvent) entry; NvmeAerResult result; } NvmeAsyncEvent; @@ -94,6 +96,7 @@ typedef struct NvmeCtrl { uint32_t num_namespaces; uint32_t max_q_ents; uint64_t ns_size; + uint8_t outstanding_aers; uint8_t *cmbuf; uint32_t irq_status; uint64_t host_timestamp; /* Timestamp sent by the host */ @@ -103,6 +106,11 @@ typedef struct NvmeCtrl { HostMemoryBackend *pmrdev; + uint8_t aer_mask; + NvmeRequest **aer_reqs; + QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue; + int aer_queued; + NvmeNamespace *namespaces; NvmeSQueue **sq; NvmeCQueue **cq; diff --git a/hw/block/trace-events b/hw/block/trace-events index 3330d74e48..091af16ca7 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -51,6 +51,15 @@ pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d" pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" pci_nvme_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64"" pci_nvme_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64"" +pci_nvme_process_aers(int queued) "queued %d" +pci_nvme_aer(uint16_t cid) "cid %"PRIu16"" +pci_nvme_aer_aerl_exceeded(void) "aerl exceeded" +pci_nvme_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 0x%"PRIx8"" +pci_nvme_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8"" +pci_nvme_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8"" +pci_nvme_enqueue_event_noqueue(int queued) "queued %d" +pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8"" +pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs" pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16"" pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 9ec9f7d7a3..87fa60b8bd 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -598,8 +598,8 @@ enum NvmeAsyncEventRequest { NVME_AER_TYPE_SMART = 1, NVME_AER_TYPE_IO_SPECIFIC = 6, NVME_AER_TYPE_VENDOR_SPECIFIC = 7, - NVME_AER_INFO_ERR_INVALID_SQ = 0, - NVME_AER_INFO_ERR_INVALID_DB = 1, + NVME_AER_INFO_ERR_INVALID_DB_REGISTER = 0, + NVME_AER_INFO_ERR_INVALID_DB_VALUE = 1, NVME_AER_INFO_ERR_DIAG_FAIL = 2, NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3, NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4, @@ -900,6 +900,10 @@ typedef struct NvmeFeatureVal { #define NVME_TEMP_TMPTH(temp) (temp & 0xffff) +#define NVME_AEC_SMART(aec) (aec & 0xff) +#define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1) +#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1) + enum NvmeFeatureIds { NVME_ARBITRATION = 0x1, NVME_POWER_MANAGEMENT = 0x2, From 46ac29c38bb45184db71dfbfbf5a31b3c12dbe29 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:54 +0200 Subject: [PATCH 14/39] hw/block/nvme: move NvmeFeatureVal into hw/block/nvme.h The NvmeFeatureVal does not belong with the spec-related data structures in include/block/nvme.h that is shared between the block-level nvme driver and the emulated nvme device. Move it into the nvme device specific header file as it is the only user of the structure. Also, remove the unused members. Signed-off-by: Klaus Jensen Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-10-its@irrelevant.dk> --- hw/block/nvme.h | 8 ++++++++ include/block/nvme.h | 17 ----------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 1837233617..b93067c9e4 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -79,6 +79,14 @@ static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns) #define NVME(obj) \ OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME) +typedef struct NvmeFeatureVal { + struct { + uint16_t temp_thresh_hi; + uint16_t temp_thresh_low; + }; + uint32_t async_config; +} NvmeFeatureVal; + typedef struct NvmeCtrl { PCIDevice parent_obj; MemoryRegion iomem; diff --git a/include/block/nvme.h b/include/block/nvme.h index 87fa60b8bd..58e0a073f6 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -866,23 +866,6 @@ enum NvmeIdCtrlLpa { #define NVME_CTRL_SGLS_MPTR_SGL (0x1 << 19) #define NVME_CTRL_SGLS_ADDR_OFFSET (0x1 << 20) -typedef struct NvmeFeatureVal { - uint32_t arbitration; - uint32_t power_mgmt; - struct { - uint16_t temp_thresh_hi; - uint16_t temp_thresh_low; - }; - uint32_t err_rec; - uint32_t volatile_wc; - uint32_t num_queues; - uint32_t int_coalescing; - uint32_t *int_vector_config; - uint32_t write_atomicity; - uint32_t async_config; - uint32_t sw_prog_marker; -} NvmeFeatureVal; - #define NVME_ARB_AB(arb) (arb & 0x7) #define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff) #define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff) From d21d37f5ab8c439ea183d8b04c469823d3b0b5ef Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:55 +0200 Subject: [PATCH 15/39] hw/block/nvme: flush write cache when disabled If the write cache is disabled with a Set Features command, flush it if currently enabled. Signed-off-by: Klaus Jensen Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-11-its@irrelevant.dk> --- hw/block/nvme.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 41a15da3f6..298b250199 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1181,6 +1181,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) break; case NVME_VOLATILE_WRITE_CACHE: + if (!(dw11 & 0x1) && blk_enable_write_cache(n->conf.blk)) { + blk_flush(n->conf.blk); + } + blk_set_enable_write_cache(n->conf.blk, dw11 & 1); break; case NVME_NUMBER_OF_QUEUES: From 1302e48e499c526583b3182ad8c9b7e32010ac53 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:56 +0200 Subject: [PATCH 16/39] hw/block/nvme: add remaining mandatory controller parameters Add support for any remaining mandatory controller operating parameters (features). Signed-off-by: Klaus Jensen Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-12-its@irrelevant.dk> --- hw/block/nvme.c | 56 ++++++++++++++++++++++++++++++++++++++----- hw/block/trace-events | 2 ++ include/block/nvme.h | 10 +++++++- 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 298b250199..889a52ba11 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -85,6 +85,20 @@ " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \ } while (0) +static const bool nvme_feature_support[NVME_FID_MAX] = { + [NVME_ARBITRATION] = true, + [NVME_POWER_MANAGEMENT] = true, + [NVME_TEMPERATURE_THRESHOLD] = true, + [NVME_ERROR_RECOVERY] = true, + [NVME_VOLATILE_WRITE_CACHE] = true, + [NVME_NUMBER_OF_QUEUES] = true, + [NVME_INTERRUPT_COALESCING] = true, + [NVME_INTERRUPT_VECTOR_CONF] = true, + [NVME_WRITE_ATOMICITY] = true, + [NVME_ASYNCHRONOUS_EVENT_CONF] = true, + [NVME_TIMESTAMP] = true, +}; + static void nvme_process_sq(void *opaque); static uint16_t nvme_cid(NvmeRequest *req) @@ -1085,8 +1099,20 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); uint32_t result; + uint8_t fid = NVME_GETSETFEAT_FID(dw10); + uint16_t iv; - switch (dw10) { + static const uint32_t nvme_feature_default[NVME_FID_MAX] = { + [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, + }; + + trace_pci_nvme_getfeat(nvme_cid(req), fid, dw11); + + if (!nvme_feature_support[fid]) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + switch (fid) { case NVME_TEMPERATURE_THRESHOLD: result = 0; @@ -1116,6 +1142,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) result = (n->params.max_ioqpairs - 1) | ((n->params.max_ioqpairs - 1) << 16); trace_pci_nvme_getfeat_numq(result); + break; + case NVME_INTERRUPT_VECTOR_CONF: + iv = dw11 & 0xffff; + if (iv >= n->params.max_ioqpairs + 1) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + result = iv; + if (iv == n->admin_cq.vector) { + result |= NVME_INTVC_NOCOALESCING; + } + break; case NVME_ASYNCHRONOUS_EVENT_CONF: result = n->features.async_config; @@ -1123,8 +1161,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) case NVME_TIMESTAMP: return nvme_get_feature_timestamp(n, cmd); default: - trace_pci_nvme_err_invalid_getfeat(dw10); - return NVME_INVALID_FIELD | NVME_DNR; + result = nvme_feature_default[fid]; + break; } req->cqe.result = cpu_to_le32(result); @@ -1153,8 +1191,15 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) { uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); + uint8_t fid = NVME_GETSETFEAT_FID(dw10); - switch (dw10) { + trace_pci_nvme_setfeat(nvme_cid(req), fid, dw11); + + if (!nvme_feature_support[fid]) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + switch (fid) { case NVME_TEMPERATURE_THRESHOLD: if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { break; @@ -1201,8 +1246,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) case NVME_TIMESTAMP: return nvme_set_feature_timestamp(n, cmd); default: - trace_pci_nvme_err_invalid_setfeat(dw10); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; } return NVME_SUCCESS; } diff --git a/hw/block/trace-events b/hw/block/trace-events index 091af16ca7..42e62f4649 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -46,6 +46,8 @@ pci_nvme_identify_ctrl(void) "identify controller" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64"" +pci_nvme_getfeat(uint16_t cid, uint8_t fid, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" cdw11 0x%"PRIx32"" +pci_nvme_setfeat(uint16_t cid, uint8_t fid, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" cdw11 0x%"PRIx32"" pci_nvme_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s" pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d" pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" diff --git a/include/block/nvme.h b/include/block/nvme.h index 58e0a073f6..64d2cab782 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -663,6 +663,7 @@ enum NvmeStatusCodes { NVME_FW_REQ_RESET = 0x010b, NVME_INVALID_QUEUE_DEL = 0x010c, NVME_FID_NOT_SAVEABLE = 0x010d, + NVME_FEAT_NOT_CHANGEABLE = 0x010e, NVME_FID_NOT_NSID_SPEC = 0x010f, NVME_FW_REQ_SUSYSTEM_RESET = 0x0110, NVME_CONFLICTING_ATTRS = 0x0180, @@ -867,6 +868,7 @@ enum NvmeIdCtrlLpa { #define NVME_CTRL_SGLS_ADDR_OFFSET (0x1 << 20) #define NVME_ARB_AB(arb) (arb & 0x7) +#define NVME_ARB_AB_NOLIMIT 0x7 #define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff) #define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff) #define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff) @@ -874,6 +876,8 @@ enum NvmeIdCtrlLpa { #define NVME_INTC_THR(intc) (intc & 0xff) #define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff) +#define NVME_INTVC_NOCOALESCING (0x1 << 16) + #define NVME_TEMP_THSEL(temp) ((temp >> 20) & 0x3) #define NVME_TEMP_THSEL_OVER 0x0 #define NVME_TEMP_THSEL_UNDER 0x1 @@ -900,9 +904,13 @@ enum NvmeFeatureIds { NVME_WRITE_ATOMICITY = 0xa, NVME_ASYNCHRONOUS_EVENT_CONF = 0xb, NVME_TIMESTAMP = 0xe, - NVME_SOFTWARE_PROGRESS_MARKER = 0x80 + NVME_SOFTWARE_PROGRESS_MARKER = 0x80, + NVME_FID_MAX = 0x100, }; +#define NVME_GETSETFEAT_FID_MASK 0xff +#define NVME_GETSETFEAT_FID(dw10) (dw10 & NVME_GETSETFEAT_FID_MASK) + typedef struct QEMU_PACKED NvmeRangeType { uint8_t type; uint8_t attributes; From 7c46310d298d8caa9dd0e4c0846331dd148a575c Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:57 +0200 Subject: [PATCH 17/39] hw/block/nvme: support the get/set features select and save fields Since the device does not have any persistent state storage, no features are "saveable" and setting the Save (SV) field in any Set Features command will result in a Feature Identifier Not Saveable status code. Similarly, if the Select (SEL) field is set to request saved values, the devices will (as it should) return the default values instead. Since this also introduces "Supported Capabilities", the nsid field is now also checked for validity wrt. the feature being get/set'ed. Signed-off-by: Klaus Jensen Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-13-its@irrelevant.dk> --- hw/block/nvme.c | 103 +++++++++++++++++++++++++++++++++++++----- hw/block/trace-events | 4 +- include/block/nvme.h | 27 ++++++++++- 3 files changed, 119 insertions(+), 15 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 889a52ba11..bfc23037c3 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -99,6 +99,14 @@ static const bool nvme_feature_support[NVME_FID_MAX] = { [NVME_TIMESTAMP] = true, }; +static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { + [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE, + [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE, + [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE, + [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE, + [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE, +}; + static void nvme_process_sq(void *opaque); static uint16_t nvme_cid(NvmeRequest *req) @@ -1098,20 +1106,47 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) { uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); + uint32_t nsid = le32_to_cpu(cmd->nsid); uint32_t result; uint8_t fid = NVME_GETSETFEAT_FID(dw10); + NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); uint16_t iv; static const uint32_t nvme_feature_default[NVME_FID_MAX] = { [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, }; - trace_pci_nvme_getfeat(nvme_cid(req), fid, dw11); + trace_pci_nvme_getfeat(nvme_cid(req), fid, sel, dw11); if (!nvme_feature_support[fid]) { return NVME_INVALID_FIELD | NVME_DNR; } + if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { + if (!nsid || nsid > n->num_namespaces) { + /* + * The Reservation Notification Mask and Reservation Persistence + * features require a status code of Invalid Field in Command when + * NSID is 0xFFFFFFFF. Since the device does not support those + * features we can always return Invalid Namespace or Format as we + * should do for all other features. + */ + return NVME_INVALID_NSID | NVME_DNR; + } + } + + switch (sel) { + case NVME_GETFEAT_SELECT_CURRENT: + break; + case NVME_GETFEAT_SELECT_SAVED: + /* no features are saveable by the controller; fallthrough */ + case NVME_GETFEAT_SELECT_DEFAULT: + goto defaults; + case NVME_GETFEAT_SELECT_CAP: + result = nvme_feature_cap[fid]; + goto out; + } + switch (fid) { case NVME_TEMPERATURE_THRESHOLD: result = 0; @@ -1121,22 +1156,45 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) * return 0 for all other sensors. */ if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { - break; + goto out; } switch (NVME_TEMP_THSEL(dw11)) { case NVME_TEMP_THSEL_OVER: result = n->features.temp_thresh_hi; - break; + goto out; case NVME_TEMP_THSEL_UNDER: result = n->features.temp_thresh_low; - break; + goto out; } - break; + return NVME_INVALID_FIELD | NVME_DNR; case NVME_VOLATILE_WRITE_CACHE: result = blk_enable_write_cache(n->conf.blk); trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); + goto out; + case NVME_ASYNCHRONOUS_EVENT_CONF: + result = n->features.async_config; + goto out; + case NVME_TIMESTAMP: + return nvme_get_feature_timestamp(n, cmd); + default: + break; + } + +defaults: + switch (fid) { + case NVME_TEMPERATURE_THRESHOLD: + result = 0; + + if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { + break; + } + + if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) { + result = NVME_TEMPERATURE_WARNING; + } + break; case NVME_NUMBER_OF_QUEUES: result = (n->params.max_ioqpairs - 1) | @@ -1155,16 +1213,12 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) } break; - case NVME_ASYNCHRONOUS_EVENT_CONF: - result = n->features.async_config; - break; - case NVME_TIMESTAMP: - return nvme_get_feature_timestamp(n, cmd); default: result = nvme_feature_default[fid]; break; } +out: req->cqe.result = cpu_to_le32(result); return NVME_SUCCESS; } @@ -1191,14 +1245,37 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) { uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); + uint32_t nsid = le32_to_cpu(cmd->nsid); uint8_t fid = NVME_GETSETFEAT_FID(dw10); + uint8_t save = NVME_SETFEAT_SAVE(dw10); - trace_pci_nvme_setfeat(nvme_cid(req), fid, dw11); + trace_pci_nvme_setfeat(nvme_cid(req), fid, save, dw11); + + if (save) { + return NVME_FID_NOT_SAVEABLE | NVME_DNR; + } if (!nvme_feature_support[fid]) { return NVME_INVALID_FIELD | NVME_DNR; } + if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { + if (!nsid || (nsid != NVME_NSID_BROADCAST && + nsid > n->num_namespaces)) { + return NVME_INVALID_NSID | NVME_DNR; + } + } else if (nsid && nsid != NVME_NSID_BROADCAST) { + if (nsid > n->num_namespaces) { + return NVME_INVALID_NSID | NVME_DNR; + } + + return NVME_FEAT_NOT_NS_SPEC | NVME_DNR; + } + + if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) { + return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; + } + switch (fid) { case NVME_TEMPERATURE_THRESHOLD: if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) { @@ -2055,7 +2132,9 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->sqes = (0x6 << 4) | 0x6; id->cqes = (0x4 << 4) | 0x4; id->nn = cpu_to_le32(n->num_namespaces); - id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP); + id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP | + NVME_ONCS_FEATURES); + id->psd[0].mp = cpu_to_le16(0x9c4); id->psd[0].enlat = cpu_to_le32(0x10); id->psd[0].exlat = cpu_to_le32(0x4); diff --git a/hw/block/trace-events b/hw/block/trace-events index 42e62f4649..4a4ef34071 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -46,8 +46,8 @@ pci_nvme_identify_ctrl(void) "identify controller" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64"" -pci_nvme_getfeat(uint16_t cid, uint8_t fid, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" cdw11 0x%"PRIx32"" -pci_nvme_setfeat(uint16_t cid, uint8_t fid, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" cdw11 0x%"PRIx32"" +pci_nvme_getfeat(uint16_t cid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32"" +pci_nvme_setfeat(uint16_t cid, uint8_t fid, uint8_t save, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" save 0x%"PRIx8" cdw11 0x%"PRIx32"" pci_nvme_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s" pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d" pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" diff --git a/include/block/nvme.h b/include/block/nvme.h index 64d2cab782..370df7fc05 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -664,7 +664,7 @@ enum NvmeStatusCodes { NVME_INVALID_QUEUE_DEL = 0x010c, NVME_FID_NOT_SAVEABLE = 0x010d, NVME_FEAT_NOT_CHANGEABLE = 0x010e, - NVME_FID_NOT_NSID_SPEC = 0x010f, + NVME_FEAT_NOT_NS_SPEC = 0x010f, NVME_FW_REQ_SUSYSTEM_RESET = 0x0110, NVME_CONFLICTING_ATTRS = 0x0180, NVME_INVALID_PROT_INFO = 0x0181, @@ -908,9 +908,32 @@ enum NvmeFeatureIds { NVME_FID_MAX = 0x100, }; +typedef enum NvmeFeatureCap { + NVME_FEAT_CAP_SAVE = 1 << 0, + NVME_FEAT_CAP_NS = 1 << 1, + NVME_FEAT_CAP_CHANGE = 1 << 2, +} NvmeFeatureCap; + +typedef enum NvmeGetFeatureSelect { + NVME_GETFEAT_SELECT_CURRENT = 0x0, + NVME_GETFEAT_SELECT_DEFAULT = 0x1, + NVME_GETFEAT_SELECT_SAVED = 0x2, + NVME_GETFEAT_SELECT_CAP = 0x3, +} NvmeGetFeatureSelect; + #define NVME_GETSETFEAT_FID_MASK 0xff #define NVME_GETSETFEAT_FID(dw10) (dw10 & NVME_GETSETFEAT_FID_MASK) +#define NVME_GETFEAT_SELECT_SHIFT 8 +#define NVME_GETFEAT_SELECT_MASK 0x7 +#define NVME_GETFEAT_SELECT(dw10) \ + ((dw10 >> NVME_GETFEAT_SELECT_SHIFT) & NVME_GETFEAT_SELECT_MASK) + +#define NVME_SETFEAT_SAVE_SHIFT 31 +#define NVME_SETFEAT_SAVE_MASK 0x1 +#define NVME_SETFEAT_SAVE(dw10) \ + ((dw10 >> NVME_SETFEAT_SAVE_SHIFT) & NVME_SETFEAT_SAVE_MASK) + typedef struct QEMU_PACKED NvmeRangeType { uint8_t type; uint8_t attributes; @@ -927,6 +950,8 @@ typedef struct QEMU_PACKED NvmeLBAF { uint8_t rp; } NvmeLBAF; +#define NVME_NSID_BROADCAST 0xffffffff + typedef struct QEMU_PACKED NvmeIdNs { uint64_t nsze; uint64_t ncap; From 9932551154ff38b87e73c0ab209a1b75f702e84d Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:58 +0200 Subject: [PATCH 18/39] hw/block/nvme: make sure ncqr and nsqr is valid 0xffff is not an allowed value for NCQR and NSQR in Set Features on Number of Queues. Signed-off-by: Klaus Jensen Acked-by: Keith Busch Reviewed-by: Maxim Levitsky Reviewed-by: Dmitry Fomichev Message-Id: <20200706061303.246057-14-its@irrelevant.dk> --- hw/block/nvme.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index bfc23037c3..a5f6dc4b8e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1310,6 +1310,14 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) blk_set_enable_write_cache(n->conf.blk, dw11 & 1); break; case NVME_NUMBER_OF_QUEUES: + /* + * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for NCQR + * and NSQR. + */ + if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) { + return NVME_INVALID_FIELD | NVME_DNR; + } + trace_pci_nvme_setfeat_numq((dw11 & 0xFFFF) + 1, ((dw11 >> 16) & 0xFFFF) + 1, n->params.max_ioqpairs, From 464eaa7292ab8b8fc3f09780649f89009ac2c535 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:12:59 +0200 Subject: [PATCH 19/39] hw/block/nvme: support identify namespace descriptor list Since we are not providing the NGUID or EUI64 fields, we must support the Namespace UUID. We do not have any way of storing a persistent unique identifier, so conjure up a UUID that is just the namespace id. Signed-off-by: Klaus Jensen Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-15-its@irrelevant.dk> --- hw/block/nvme.c | 41 +++++++++++++++++++++++++++++++++++++++++ hw/block/trace-events | 1 + 2 files changed, 42 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a5f6dc4b8e..35103dac89 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1022,6 +1022,45 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) return ret; } +static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c) +{ + uint32_t nsid = le32_to_cpu(c->nsid); + uint64_t prp1 = le64_to_cpu(c->prp1); + uint64_t prp2 = le64_to_cpu(c->prp2); + + uint8_t list[NVME_IDENTIFY_DATA_SIZE]; + + struct data { + struct { + NvmeIdNsDescr hdr; + uint8_t v[16]; + } uuid; + }; + + struct data *ns_descrs = (struct data *)list; + + trace_pci_nvme_identify_ns_descr_list(nsid); + + if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { + trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); + return NVME_INVALID_NSID | NVME_DNR; + } + + memset(list, 0x0, sizeof(list)); + + /* + * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data + * structure, a Namespace UUID (nidt = 0x3) must be reported in the + * Namespace Identification Descriptor. Add a very basic Namespace UUID + * here. + */ + ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID; + ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN; + stl_be_p(&ns_descrs->uuid.v, nsid); + + return nvme_dma_read_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2); +} + static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd) { NvmeIdentify *c = (NvmeIdentify *)cmd; @@ -1033,6 +1072,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd) return nvme_identify_ctrl(n, c); case NVME_ID_CNS_NS_ACTIVE_LIST: return nvme_identify_nslist(n, c); + case NVME_ID_CNS_NS_DESCR_LIST: + return nvme_identify_ns_descr_list(n, c); default: trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); return NVME_INVALID_FIELD | NVME_DNR; diff --git a/hw/block/trace-events b/hw/block/trace-events index 4a4ef34071..7b7303cab1 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -45,6 +45,7 @@ pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" pci_nvme_identify_ctrl(void) "identify controller" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32"" pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64"" pci_nvme_getfeat(uint16_t cid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32"" pci_nvme_setfeat(uint16_t cid, uint8_t fid, uint8_t save, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" save 0x%"PRIx8" cdw11 0x%"PRIx32"" From 38a58e7ce38cf73f25e5242f5f1abcfcbd250a6d Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:13:00 +0200 Subject: [PATCH 20/39] hw/block/nvme: reject invalid nsid values in active namespace id list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reject the nsid broadcast value (0xffffffff) and 0xfffffffe in the Active Namespace ID list. Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-16-its@irrelevant.dk> --- hw/block/nvme.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 35103dac89..d884ac14a7 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1007,6 +1007,16 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) trace_pci_nvme_identify_nslist(min_nsid); + /* + * Both 0xffffffff (NVME_NSID_BROADCAST) and 0xfffffffe are invalid values + * since the Active Namespace ID List should return namespaces with ids + * *higher* than the NSID specified in the command. This is also specified + * in the spec (NVM Express v1.3d, Section 5.15.4). + */ + if (min_nsid >= NVME_NSID_BROADCAST - 1) { + return NVME_INVALID_NSID | NVME_DNR; + } + list = g_malloc0(data_len); for (i = 0; i < n->num_namespaces; i++) { if (i < min_nsid) { From 9e7ecdca26c34ffc5bda3d1d49e90bbb6b4f4422 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:13:01 +0200 Subject: [PATCH 21/39] hw/block/nvme: enforce valid queue creation sequence Support returning Command Sequence Error if Set Features on Number of Queues is called after queues have been created. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Dmitry Fomichev Message-Id: <20200706061303.246057-17-its@irrelevant.dk> --- hw/block/nvme.c | 12 ++++++++++++ hw/block/nvme.h | 1 + 2 files changed, 13 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d884ac14a7..41d4e3795d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -961,6 +961,13 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd) cq = g_malloc0(sizeof(*cq)); nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1, NVME_CQ_FLAGS_IEN(qflags)); + + /* + * It is only required to set qs_created when creating a completion queue; + * creating a submission queue without a matching completion queue will + * fail. + */ + n->qs_created = true; return NVME_SUCCESS; } @@ -1361,6 +1368,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) blk_set_enable_write_cache(n->conf.blk, dw11 & 1); break; case NVME_NUMBER_OF_QUEUES: + if (n->qs_created) { + return NVME_CMD_SEQ_ERROR | NVME_DNR; + } + /* * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for NCQR * and NSQR. @@ -1493,6 +1504,7 @@ static void nvme_clear_ctrl(NvmeCtrl *n) n->aer_queued = 0; n->outstanding_aers = 0; + n->qs_created = false; blk_flush(n->conf.blk); n->bar.cc = 0; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b93067c9e4..0b6a8ae665 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -95,6 +95,7 @@ typedef struct NvmeCtrl { BlockConf conf; NvmeParams params; + bool qs_created; uint32_t page_size; uint16_t page_bits; uint16_t max_prp_ents; From ccbefdb51d55b6649cf30541a2d2542dc6f89fab Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:13:02 +0200 Subject: [PATCH 22/39] hw/block/nvme: provide the mandatory subnqn field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SUBNQN field is mandatory in NVM Express 1.3. Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Dmitry Fomichev Reviewed-by: Maxim Levitsky Message-Id: <20200706061303.246057-18-its@irrelevant.dk> --- hw/block/nvme.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 41d4e3795d..a021572de6 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2168,6 +2168,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) { NvmeIdCtrl *id = &n->id_ctrl; uint8_t *pci_conf = pci_dev->config; + char *subnqn; id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); @@ -2206,6 +2207,10 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP | NVME_ONCS_FEATURES); + subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial); + strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0'); + g_free(subnqn); + id->psd[0].mp = cpu_to_le16(0x9c4); id->psd[0].enlat = cpu_to_le32(0x10); id->psd[0].exlat = cpu_to_le32(0x4); From 3c40434e516d1541d3f356dbad8c695447a5a7ad Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 6 Jul 2020 08:13:03 +0200 Subject: [PATCH 23/39] hw/block/nvme: bump supported version to v1.3 Bump the supported NVM Express version to v1.3. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Dmitry Fomichev Message-Id: <20200706061303.246057-19-its@irrelevant.dk> --- hw/block/nvme.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a021572de6..aa57749f43 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -71,6 +71,7 @@ #define NVME_MAX_IOQPAIRS 0xffff #define NVME_DB_SIZE 4 +#define NVME_SPEC_VER 0x00010300 #define NVME_CMB_BIR 2 #define NVME_PMR_BIR 2 #define NVME_TEMPERATURE 0x143 @@ -2179,6 +2180,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->ieee[0] = 0x00; id->ieee[1] = 0x02; id->ieee[2] = 0xb3; + id->ver = cpu_to_le32(NVME_SPEC_VER); id->oacs = cpu_to_le16(0); /* @@ -2225,7 +2227,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_CSS(n->bar.cap, 1); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); - n->bar.vs = 0x00010200; + n->bar.vs = NVME_SPEC_VER; n->bar.intmc = n->bar.intms = 0; } From d1322b46684ef38f78760acfd4861b396a9c0264 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 16:37:49 +0100 Subject: [PATCH 24/39] hw/block/nvme: memset preallocated requests structures This is preparatory to subsequent patches that change how QSGs/IOVs are handled. It is important that the qsg and iov members of the NvmeRequest are initially zeroed. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index aa57749f43..8cd30430e9 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -668,7 +668,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, sq->size = size; sq->cqid = cqid; sq->head = sq->tail = 0; - sq->io_req = g_new(NvmeRequest, sq->size); + sq->io_req = g_new0(NvmeRequest, sq->size); QTAILQ_INIT(&sq->req_list); QTAILQ_INIT(&sq->out_req_list); From a80b2ce68251cf5b11c450173128f29223045b9c Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 14:21:52 +0100 Subject: [PATCH 25/39] hw/block/nvme: add mapping helpers Add nvme_map_addr, nvme_map_addr_cmb and nvme_addr_to_cmb helpers and use them in nvme_map_prp. This fixes a bug where in the case of a CMB transfer, the device would map to the buffer with a wrong length. Fixes: b2b2b67a00574 ("nvme: Add support for Read Data and Write Data in CMBs.") Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im Reviewed-by: Andrzej Jakowski --- hw/block/nvme.c | 111 +++++++++++++++++++++++++++++++++++------- hw/block/trace-events | 2 + 2 files changed, 96 insertions(+), 17 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 8cd30430e9..3a9e25eb0e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -132,10 +132,17 @@ static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) return addr >= low && addr < hi; } +static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) +{ + assert(nvme_addr_is_cmb(n, addr)); + + return &n->cmbuf[addr - n->ctrl_mem.addr]; +} + static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) { if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) { - memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size); + memcpy(buf, nvme_addr_to_cmb(n, addr), size); return; } @@ -218,29 +225,91 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) } } +static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, + size_t len) +{ + if (!len) { + return NVME_SUCCESS; + } + + trace_pci_nvme_map_addr_cmb(addr, len); + + if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) { + return NVME_DATA_TRAS_ERROR; + } + + qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len); + + return NVME_SUCCESS; +} + +static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, + hwaddr addr, size_t len) +{ + if (!len) { + return NVME_SUCCESS; + } + + trace_pci_nvme_map_addr(addr, len); + + if (nvme_addr_is_cmb(n, addr)) { + if (qsg && qsg->sg) { + return NVME_INVALID_USE_OF_CMB | NVME_DNR; + } + + assert(iov); + + if (!iov->iov) { + qemu_iovec_init(iov, 1); + } + + return nvme_map_addr_cmb(n, iov, addr, len); + } + + if (iov && iov->iov) { + return NVME_INVALID_USE_OF_CMB | NVME_DNR; + } + + assert(qsg); + + if (!qsg->sg) { + pci_dma_sglist_init(qsg, &n->parent_obj, 1); + } + + qemu_sglist_add(qsg, addr, len); + + return NVME_SUCCESS; +} + static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, uint64_t prp2, uint32_t len, NvmeCtrl *n) { hwaddr trans_len = n->page_size - (prp1 % n->page_size); trans_len = MIN(len, trans_len); int num_prps = (len >> n->page_bits) + 1; + uint16_t status; if (unlikely(!prp1)) { trace_pci_nvme_err_invalid_prp(); return NVME_INVALID_FIELD | NVME_DNR; - } else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr && - prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) { - qsg->nsg = 0; + } + + if (nvme_addr_is_cmb(n, prp1)) { qemu_iovec_init(iov, num_prps); - qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len); } else { pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); - qemu_sglist_add(qsg, prp1, trans_len); } + + status = nvme_map_addr(n, qsg, iov, prp1, trans_len); + if (status) { + goto unmap; + } + len -= trans_len; if (len) { if (unlikely(!prp2)) { trace_pci_nvme_err_invalid_prp2_missing(); + status = NVME_INVALID_FIELD | NVME_DNR; goto unmap; } if (len > n->page_size) { @@ -257,6 +326,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, if (i == n->max_prp_ents - 1 && len > n->page_size) { if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); + status = NVME_INVALID_FIELD | NVME_DNR; goto unmap; } @@ -270,14 +340,14 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); + status = NVME_INVALID_FIELD | NVME_DNR; goto unmap; } trans_len = MIN(len, n->page_size); - if (qsg->nsg){ - qemu_sglist_add(qsg, prp_ent, trans_len); - } else { - qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len); + status = nvme_map_addr(n, qsg, iov, prp_ent, trans_len); + if (status) { + goto unmap; } len -= trans_len; i++; @@ -285,20 +355,27 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, } else { if (unlikely(prp2 & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prp2_align(prp2); + status = NVME_INVALID_FIELD | NVME_DNR; goto unmap; } - if (qsg->nsg) { - qemu_sglist_add(qsg, prp2, len); - } else { - qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len); + status = nvme_map_addr(n, qsg, iov, prp2, len); + if (status) { + goto unmap; } } } return NVME_SUCCESS; - unmap: - qemu_sglist_destroy(qsg); - return NVME_INVALID_FIELD | NVME_DNR; +unmap: + if (iov && iov->iov) { + qemu_iovec_destroy(iov); + } + + if (qsg && qsg->sg) { + qemu_sglist_destroy(qsg); + } + + return status; } static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, diff --git a/hw/block/trace-events b/hw/block/trace-events index 7b7303cab1..f3b2d004e0 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -33,6 +33,8 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u" pci_nvme_irq_pin(void) "pulsing IRQ pin" pci_nvme_irq_masked(void) "IRQ is masked" pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64"" +pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" +pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8"" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8"" pci_nvme_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64"" From 904248a53f3506fc41d30c6ef63d3ee7692d76d5 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 30 Mar 2020 22:22:38 +0200 Subject: [PATCH 26/39] hw/block/nvme: replace dma_acct with blk_acct equivalent The QSG isn't always initialized, so accounting could be wrong. Issue a call to blk_acct_start instead with the size taken from the QSG or IOV depending on the kind of I/O. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 3a9e25eb0e..81f26fecad 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -637,9 +637,10 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_INVALID_FIELD | NVME_DNR; } - dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct); if (req->qsg.nsg > 0) { req->has_sg = true; + block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size, + acct); req->aiocb = is_write ? dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req) : @@ -647,6 +648,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, nvme_rw_cb, req); } else { req->has_sg = false; + block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size, + acct); req->aiocb = is_write ? blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb, req) : From 13b4463d8b4f46ab69755d1dd74c6a0c957cce08 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 29 Jul 2020 21:08:04 +0200 Subject: [PATCH 27/39] hw/block/nvme: remove redundant has_sg member Remove the has_sg member from NvmeRequest since it's redundant. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Maxim Levitsky --- hw/block/nvme.c | 7 ++----- hw/block/nvme.h | 1 - 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 81f26fecad..c04e244677 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -565,7 +565,8 @@ static void nvme_rw_cb(void *opaque, int ret) block_acct_failed(blk_get_stats(n->conf.blk), &req->acct); req->status = NVME_INTERNAL_DEV_ERROR; } - if (req->has_sg) { + + if (req->qsg.nalloc) { qemu_sglist_destroy(&req->qsg); } nvme_enqueue_req_completion(cq, req); @@ -574,7 +575,6 @@ static void nvme_rw_cb(void *opaque, int ret) static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRequest *req) { - req->has_sg = false; block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, BLOCK_ACCT_FLUSH); req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req); @@ -600,7 +600,6 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_LBA_RANGE | NVME_DNR; } - req->has_sg = false; block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, BLOCK_ACCT_WRITE); req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count, @@ -638,7 +637,6 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, } if (req->qsg.nsg > 0) { - req->has_sg = true; block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size, acct); req->aiocb = is_write ? @@ -647,7 +645,6 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req); } else { - req->has_sg = false; block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size, acct); req->aiocb = is_write ? diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 0b6a8ae665..5519b5cc76 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -22,7 +22,6 @@ typedef struct NvmeRequest { struct NvmeSQueue *sq; BlockAIOCB *aiocb; uint16_t status; - bool has_sg; NvmeCqe cqe; BlockAcctCookie acct; QEMUSGList qsg; From f06a6aa9648c0567fda12b75358630b595834490 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 29 Jul 2020 21:08:33 +0200 Subject: [PATCH 28/39] hw/block/nvme: destroy request iov before reuse Make sure the request iov is destroyed before reuse; fixing a memory leak. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Maxim Levitsky --- hw/block/nvme.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c04e244677..c2aed4d104 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -569,6 +569,10 @@ static void nvme_rw_cb(void *opaque, int ret) if (req->qsg.nalloc) { qemu_sglist_destroy(&req->qsg); } + if (req->iov.nalloc) { + qemu_iovec_destroy(&req->iov); + } + nvme_enqueue_req_completion(cq, req); } From 794ef1b5a7a6053f5dd1ea20c75c2c292d2a1f51 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 16:03:34 +0100 Subject: [PATCH 29/39] hw/block/nvme: refactor dma read/write Refactor the nvme_dma_{read,write}_prp functions into a common function taking a DMADirection parameter. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 91 +++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c2aed4d104..8ec715a29d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -378,55 +378,53 @@ unmap: return status; } -static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, - uint64_t prp1, uint64_t prp2) +static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, + uint64_t prp1, uint64_t prp2, DMADirection dir) { QEMUSGList qsg; QEMUIOVector iov; uint16_t status = NVME_SUCCESS; - if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) { - return NVME_INVALID_FIELD | NVME_DNR; + status = nvme_map_prp(&qsg, &iov, prp1, prp2, len, n); + if (status) { + return status; } + + /* assert that only one of qsg and iov carries data */ + assert((qsg.nsg > 0) != (iov.niov > 0)); + if (qsg.nsg > 0) { - if (dma_buf_write(ptr, len, &qsg)) { - status = NVME_INVALID_FIELD | NVME_DNR; + uint64_t residual; + + if (dir == DMA_DIRECTION_TO_DEVICE) { + residual = dma_buf_write(ptr, len, &qsg); + } else { + residual = dma_buf_read(ptr, len, &qsg); } - qemu_sglist_destroy(&qsg); - } else { - if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) { - status = NVME_INVALID_FIELD | NVME_DNR; - } - qemu_iovec_destroy(&iov); - } - return status; -} -static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, - uint64_t prp1, uint64_t prp2) -{ - QEMUSGList qsg; - QEMUIOVector iov; - uint16_t status = NVME_SUCCESS; - - trace_pci_nvme_dma_read(prp1, prp2); - - if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) { - return NVME_INVALID_FIELD | NVME_DNR; - } - if (qsg.nsg > 0) { - if (unlikely(dma_buf_read(ptr, len, &qsg))) { + if (unlikely(residual)) { trace_pci_nvme_err_invalid_dma(); status = NVME_INVALID_FIELD | NVME_DNR; } + qemu_sglist_destroy(&qsg); } else { - if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) { + size_t bytes; + + if (dir == DMA_DIRECTION_TO_DEVICE) { + bytes = qemu_iovec_to_buf(&iov, 0, ptr, len); + } else { + bytes = qemu_iovec_from_buf(&iov, 0, ptr, len); + } + + if (unlikely(bytes != len)) { trace_pci_nvme_err_invalid_dma(); status = NVME_INVALID_FIELD | NVME_DNR; } + qemu_iovec_destroy(&iov); } + return status; } @@ -858,8 +856,8 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, nvme_clear_events(n, NVME_AER_TYPE_SMART); } - return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1, - prp2); + return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2, + DMA_DIRECTION_FROM_DEVICE); } static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, @@ -880,8 +878,8 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, trans_len = MIN(sizeof(fw_log) - off, buf_len); - return nvme_dma_read_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, - prp2); + return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2, + DMA_DIRECTION_FROM_DEVICE); } static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, @@ -905,7 +903,8 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, trans_len = MIN(sizeof(errlog) - off, buf_len); - return nvme_dma_read_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2); + return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2, + DMA_DIRECTION_FROM_DEVICE); } static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) @@ -1060,8 +1059,8 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c) trace_pci_nvme_identify_ctrl(); - return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), - prp1, prp2); + return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1, + prp2, DMA_DIRECTION_FROM_DEVICE); } static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) @@ -1080,8 +1079,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) ns = &n->namespaces[nsid - 1]; - return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), - prp1, prp2); + return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1, + prp2, DMA_DIRECTION_FROM_DEVICE); } static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) @@ -1116,7 +1115,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) break; } } - ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2); + ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2, + DMA_DIRECTION_FROM_DEVICE); g_free(list); return ret; } @@ -1157,7 +1157,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c) ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN; stl_be_p(&ns_descrs->uuid.v, nsid); - return nvme_dma_read_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2); + return nvme_dma_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2, + DMA_DIRECTION_FROM_DEVICE); } static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd) @@ -1238,8 +1239,8 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) uint64_t timestamp = nvme_get_timestamp(n); - return nvme_dma_read_prp(n, (uint8_t *)×tamp, - sizeof(timestamp), prp1, prp2); + return nvme_dma_prp(n, (uint8_t *)×tamp, sizeof(timestamp), prp1, + prp2, DMA_DIRECTION_FROM_DEVICE); } static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) @@ -1370,8 +1371,8 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - ret = nvme_dma_write_prp(n, (uint8_t *)×tamp, - sizeof(timestamp), prp1, prp2); + ret = nvme_dma_prp(n, (uint8_t *)×tamp, sizeof(timestamp), prp1, + prp2, DMA_DIRECTION_TO_DEVICE); if (ret != NVME_SUCCESS) { return ret; } From 19012fb898d0a4b3e5999212d01923a5e45cb399 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 29 Jul 2020 21:18:34 +0200 Subject: [PATCH 30/39] hw/block/nvme: add tracing to nvme_map_prp Add tracing to nvme_map_prp. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Maxim Levitsky --- hw/block/nvme.c | 2 ++ hw/block/trace-events | 1 + 2 files changed, 3 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 8ec715a29d..2c9fea8ac2 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -289,6 +289,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, int num_prps = (len >> n->page_bits) + 1; uint16_t status; + trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); + if (unlikely(!prp1)) { trace_pci_nvme_err_invalid_prp(); return NVME_INVALID_FIELD | NVME_DNR; diff --git a/hw/block/trace-events b/hw/block/trace-events index f3b2d004e0..f20c59a4b5 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -35,6 +35,7 @@ pci_nvme_irq_masked(void) "IRQ is masked" pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64"" pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" +pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8"" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8"" pci_nvme_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64"" From 6d1257f973e2be5c98650147f6cc89a5a05699e3 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 09:39:58 -0800 Subject: [PATCH 31/39] hw/block/nvme: add request mapping helper Introduce the nvme_map helper to remove some noise in the main nvme_rw function. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 2c9fea8ac2..78b4873710 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -430,6 +430,15 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return status; } +static uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeCmd *cmd, size_t len, + NvmeRequest *req) +{ + uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); + uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); + + return nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, len, n); +} + static void nvme_post_cqes(void *opaque) { NvmeCQueue *cq = opaque; @@ -617,8 +626,6 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRwCmd *rw = (NvmeRwCmd *)cmd; uint32_t nlb = le32_to_cpu(rw->nlb) + 1; uint64_t slba = le64_to_cpu(rw->slba); - uint64_t prp1 = le64_to_cpu(rw->dptr.prp1); - uint64_t prp2 = le64_to_cpu(rw->dptr.prp2); uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds; @@ -635,7 +642,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_LBA_RANGE | NVME_DNR; } - if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) { + if (nvme_map_dptr(n, cmd, data_size, req)) { block_acct_invalid(blk_get_stats(n->conf.blk), acct); return NVME_INVALID_FIELD | NVME_DNR; } From 076c816f4e62824714427a9d193f31ccb693d404 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 16:12:12 +0100 Subject: [PATCH 32/39] hw/block/nvme: verify validity of prp lists in the cmb Before this patch the device already supported PRP lists in the CMB, but it did not check for the validity of it nor announced the support in the Identify Controller data structure LISTS field. If some of the PRPs in a PRP list are in the CMB, then ALL entries must be there. This patch makes sure that requirement is verified as well as properly announcing support for PRP lists in the CMB. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 78b4873710..fc629cd8b2 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -288,6 +288,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, trans_len = MIN(len, trans_len); int num_prps = (len >> n->page_bits) + 1; uint16_t status; + bool prp_list_in_cmb = false; trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); @@ -314,11 +315,16 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, status = NVME_INVALID_FIELD | NVME_DNR; goto unmap; } + if (len > n->page_size) { uint64_t prp_list[n->max_prp_ents]; uint32_t nents, prp_trans; int i = 0; + if (nvme_addr_is_cmb(n, prp2)) { + prp_list_in_cmb = true; + } + nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); @@ -332,6 +338,11 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, goto unmap; } + if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) { + status = NVME_INVALID_USE_OF_CMB | NVME_DNR; + goto unmap; + } + i = 0; nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); @@ -351,6 +362,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, if (status) { goto unmap; } + len -= trans_len; i++; } @@ -2168,7 +2180,7 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0); - NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0); + NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1); NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1); NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1); NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */ From 36c100f530b52cc43ad214c48dcfe4c6752859b6 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 08:32:25 -0800 Subject: [PATCH 33/39] hw/block/nvme: refactor request bounds checking Hoist bounds checking into its own function and check for wrap-around. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index fc629cd8b2..8dade3e223 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -570,6 +570,18 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type) } } +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns, + uint64_t slba, uint32_t nlb) +{ + uint64_t nsze = le64_to_cpu(ns->id_ns.nsze); + + if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) { + return NVME_LBA_RANGE | NVME_DNR; + } + + return NVME_SUCCESS; +} + static void nvme_rw_cb(void *opaque, int ret) { NvmeRequest *req = opaque; @@ -617,12 +629,14 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, uint32_t nlb = le16_to_cpu(rw->nlb) + 1; uint64_t offset = slba << data_shift; uint32_t count = nlb << data_shift; + uint16_t status; trace_pci_nvme_write_zeroes(nvme_cid(req), slba, nlb); - if (unlikely(slba + nlb > ns->id_ns.nsze)) { + status = nvme_check_bounds(n, ns, slba, nlb); + if (status) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); - return NVME_LBA_RANGE | NVME_DNR; + return status; } block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, @@ -645,13 +659,15 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, uint64_t data_offset = slba << data_shift; int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0; enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; + uint16_t status; trace_pci_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba); - if (unlikely((slba + nlb) > ns->id_ns.nsze)) { - block_acct_invalid(blk_get_stats(n->conf.blk), acct); + status = nvme_check_bounds(n, ns, slba, nlb); + if (status) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); - return NVME_LBA_RANGE | NVME_DNR; + block_acct_invalid(blk_get_stats(n->conf.blk), acct); + return status; } if (nvme_map_dptr(n, cmd, data_size, req)) { From e137d20e7dfff00d0662b3d2d256e9c92f7a7f4a Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 08:38:22 -0800 Subject: [PATCH 34/39] hw/block/nvme: add check for mdts Add 'mdts' device parameter to control the Maximum Data Transfer Size of the controller and check that it is respected. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 30 +++++++++++++++++++++++++++++- hw/block/nvme.h | 1 + hw/block/trace-events | 1 + 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 8dade3e223..30b42e2492 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -21,7 +21,8 @@ * cmb_size_mb=, \ * [pmrdev=,] \ * max_ioqpairs=, \ - * aerl=, aer_max_queued= + * aerl=, aer_max_queued=, \ + * mdts= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. @@ -570,6 +571,17 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type) } } +static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len) +{ + uint8_t mdts = n->params.mdts; + + if (mdts && len > n->page_size << mdts) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + return NVME_SUCCESS; +} + static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns, uint64_t slba, uint32_t nlb) { @@ -663,6 +675,13 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, trace_pci_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba); + status = nvme_check_mdts(n, data_size); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), data_size); + block_acct_invalid(blk_get_stats(n->conf.blk), acct); + return status; + } + status = nvme_check_bounds(n, ns, slba, nlb); if (status) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); @@ -956,6 +975,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) uint32_t numdl, numdu; uint64_t off, lpol, lpou; size_t len; + uint16_t status; numdl = (dw10 >> 16); numdu = (dw11 & 0xffff); @@ -971,6 +991,12 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off); + status = nvme_check_mdts(n, len); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), len); + return status; + } + switch (lid) { case NVME_LOG_ERROR_INFO: return nvme_error_info(n, cmd, rae, len, off, req); @@ -2299,6 +2325,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->ieee[0] = 0x00; id->ieee[1] = 0x02; id->ieee[2] = 0xb3; + id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); id->oacs = cpu_to_le16(0); @@ -2418,6 +2445,7 @@ static Property nvme_props[] = { DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65), DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3), DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64), + DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 5519b5cc76..137cd8c2bf 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -11,6 +11,7 @@ typedef struct NvmeParams { uint32_t cmb_size_mb; uint8_t aerl; uint32_t aer_max_queued; + uint8_t mdts; } NvmeParams; typedef struct NvmeAsyncEvent { diff --git a/hw/block/trace-events b/hw/block/trace-events index f20c59a4b5..72cf2d15cb 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -85,6 +85,7 @@ pci_nvme_mmio_shutdown_set(void) "shutdown bit set" pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" # nvme traces for error conditions +pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" From 69265150aa53f0e1dbc85809800df40ffc9e7d67 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 30 Mar 2020 23:10:13 +0200 Subject: [PATCH 35/39] hw/block/nvme: be consistent about zeros vs zeroes The NVM Express specification generally uses 'zeroes' and not 'zeros', so let us align with it. Cc: Fam Zheng Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Maxim Levitsky --- block/nvme.c | 4 ++-- hw/block/nvme.c | 8 ++++---- include/block/nvme.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index c1c4c07ac6..05485fdd11 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -537,7 +537,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) s->page_size / sizeof(uint64_t) * s->page_size); oncs = le16_to_cpu(idctrl->oncs); - s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROS); + s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES); s->supports_discard = !!(oncs & NVME_ONCS_DSM); memset(resp, 0, 4096); @@ -1201,7 +1201,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs, } NvmeCmd cmd = { - .opcode = NVME_CMD_WRITE_ZEROS, + .opcode = NVME_CMD_WRITE_ZEROES, .nsid = cpu_to_le32(s->nsid), .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF), .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF), diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 30b42e2492..20cb174963 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -631,7 +631,7 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_NO_COMPLETE; } -static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, +static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)cmd; @@ -731,8 +731,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) switch (cmd->opcode) { case NVME_CMD_FLUSH: return nvme_flush(n, ns, cmd, req); - case NVME_CMD_WRITE_ZEROS: - return nvme_write_zeros(n, ns, cmd, req); + case NVME_CMD_WRITE_ZEROES: + return nvme_write_zeroes(n, ns, cmd, req); case NVME_CMD_WRITE: case NVME_CMD_READ: return nvme_rw(n, ns, cmd, req); @@ -2352,7 +2352,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->sqes = (0x6 << 4) | 0x6; id->cqes = (0x4 << 4) | 0x4; id->nn = cpu_to_le32(n->num_namespaces); - id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP | + id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | NVME_ONCS_FEATURES); subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial); diff --git a/include/block/nvme.h b/include/block/nvme.h index 370df7fc05..65e68a82c8 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -460,7 +460,7 @@ enum NvmeIoCommands { NVME_CMD_READ = 0x02, NVME_CMD_WRITE_UNCOR = 0x04, NVME_CMD_COMPARE = 0x05, - NVME_CMD_WRITE_ZEROS = 0x08, + NVME_CMD_WRITE_ZEROES = 0x08, NVME_CMD_DSM = 0x09, }; @@ -838,7 +838,7 @@ enum NvmeIdCtrlOncs { NVME_ONCS_COMPARE = 1 << 0, NVME_ONCS_WRITE_UNCORR = 1 << 1, NVME_ONCS_DSM = 1 << 2, - NVME_ONCS_WRITE_ZEROS = 1 << 3, + NVME_ONCS_WRITE_ZEROES = 1 << 3, NVME_ONCS_FEATURES = 1 << 4, NVME_ONCS_RESRVATIONS = 1 << 5, NVME_ONCS_TIMESTAMP = 1 << 6, From 3143df3d568d45740473ef8d0cb73ae41eafad3f Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 20 Jul 2020 12:44:01 +0200 Subject: [PATCH 36/39] hw/block/nvme: add ns/cmd references in NvmeRequest Instead of passing around the NvmeNamespace and the NvmeCmd, add them as members in the NvmeRequest structure. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Maxim Levitsky --- hw/block/nvme.c | 187 ++++++++++++++++++++++++++---------------------- hw/block/nvme.h | 2 + 2 files changed, 104 insertions(+), 85 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 20cb174963..7d5c69c04f 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -226,6 +226,12 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) } } +static void nvme_req_clear(NvmeRequest *req) +{ + req->ns = NULL; + memset(&req->cqe, 0x0, sizeof(req->cqe)); +} + static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, size_t len) { @@ -443,9 +449,9 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return status; } -static uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeCmd *cmd, size_t len, - NvmeRequest *req) +static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) { + NvmeCmd *cmd = &req->cmd; uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); @@ -621,8 +627,7 @@ static void nvme_rw_cb(void *opaque, int ret) nvme_enqueue_req_completion(cq, req); } -static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, - NvmeRequest *req) +static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, BLOCK_ACCT_FLUSH); @@ -631,10 +636,10 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_NO_COMPLETE; } -static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, - NvmeRequest *req) +static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) { - NvmeRwCmd *rw = (NvmeRwCmd *)cmd; + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds; uint64_t slba = le64_to_cpu(rw->slba); @@ -658,10 +663,10 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_NO_COMPLETE; } -static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, - NvmeRequest *req) +static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) { - NvmeRwCmd *rw = (NvmeRwCmd *)cmd; + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; uint32_t nlb = le32_to_cpu(rw->nlb) + 1; uint64_t slba = le64_to_cpu(rw->slba); @@ -689,7 +694,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return status; } - if (nvme_map_dptr(n, cmd, data_size, req)) { + if (nvme_map_dptr(n, data_size, req)) { block_acct_invalid(blk_get_stats(n->conf.blk), acct); return NVME_INVALID_FIELD | NVME_DNR; } @@ -715,29 +720,29 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_NO_COMPLETE; } -static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) { - NvmeNamespace *ns; - uint32_t nsid = le32_to_cpu(cmd->nsid); + uint32_t nsid = le32_to_cpu(req->cmd.nsid); - trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), cmd->opcode); + trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), + req->cmd.opcode); if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); return NVME_INVALID_NSID | NVME_DNR; } - ns = &n->namespaces[nsid - 1]; - switch (cmd->opcode) { + req->ns = &n->namespaces[nsid - 1]; + switch (req->cmd.opcode) { case NVME_CMD_FLUSH: - return nvme_flush(n, ns, cmd, req); + return nvme_flush(n, req); case NVME_CMD_WRITE_ZEROES: - return nvme_write_zeroes(n, ns, cmd, req); + return nvme_write_zeroes(n, req); case NVME_CMD_WRITE: case NVME_CMD_READ: - return nvme_rw(n, ns, cmd, req); + return nvme_rw(n, req); default: - trace_pci_nvme_err_invalid_opc(cmd->opcode); + trace_pci_nvme_err_invalid_opc(req->cmd.opcode); return NVME_INVALID_OPCODE | NVME_DNR; } } @@ -753,10 +758,10 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) } } -static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd) +static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) { - NvmeDeleteQ *c = (NvmeDeleteQ *)cmd; - NvmeRequest *req, *next; + NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; + NvmeRequest *r, *next; NvmeSQueue *sq; NvmeCQueue *cq; uint16_t qid = le16_to_cpu(c->qid); @@ -770,19 +775,19 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd) sq = n->sq[qid]; while (!QTAILQ_EMPTY(&sq->out_req_list)) { - req = QTAILQ_FIRST(&sq->out_req_list); - assert(req->aiocb); - blk_aio_cancel(req->aiocb); + r = QTAILQ_FIRST(&sq->out_req_list); + assert(r->aiocb); + blk_aio_cancel(r->aiocb); } if (!nvme_check_cqid(n, sq->cqid)) { cq = n->cq[sq->cqid]; QTAILQ_REMOVE(&cq->sq_list, sq, entry); nvme_post_cqes(cq); - QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) { - if (req->sq == sq) { - QTAILQ_REMOVE(&cq->req_list, req, entry); - QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); + QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) { + if (r->sq == sq) { + QTAILQ_REMOVE(&cq->req_list, r, entry); + QTAILQ_INSERT_TAIL(&sq->req_list, r, entry); } } } @@ -819,10 +824,10 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, n->sq[sqid] = sq; } -static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd) +static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) { NvmeSQueue *sq; - NvmeCreateSq *c = (NvmeCreateSq *)cmd; + NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd; uint16_t cqid = le16_to_cpu(c->cqid); uint16_t sqid = le16_to_cpu(c->sqid); @@ -857,10 +862,10 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd) return NVME_SUCCESS; } -static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, - uint32_t buf_len, uint64_t off, - NvmeRequest *req) +static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, + uint64_t off, NvmeRequest *req) { + NvmeCmd *cmd = &req->cmd; uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); uint32_t nsid = le32_to_cpu(cmd->nsid); @@ -916,10 +921,11 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, DMA_DIRECTION_FROM_DEVICE); } -static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, - uint64_t off, NvmeRequest *req) +static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, + NvmeRequest *req) { uint32_t trans_len; + NvmeCmd *cmd = &req->cmd; uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); NvmeFwSlotInfoLog fw_log = { @@ -938,11 +944,11 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len, DMA_DIRECTION_FROM_DEVICE); } -static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, - uint32_t buf_len, uint64_t off, - NvmeRequest *req) +static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, + uint64_t off, NvmeRequest *req) { uint32_t trans_len; + NvmeCmd *cmd = &req->cmd; uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); NvmeErrorLog errlog; @@ -963,8 +969,10 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae, DMA_DIRECTION_FROM_DEVICE); } -static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) { + NvmeCmd *cmd = &req->cmd; + uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); uint32_t dw12 = le32_to_cpu(cmd->cdw12); @@ -999,11 +1007,11 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) switch (lid) { case NVME_LOG_ERROR_INFO: - return nvme_error_info(n, cmd, rae, len, off, req); + return nvme_error_info(n, rae, len, off, req); case NVME_LOG_SMART_INFO: - return nvme_smart_info(n, cmd, rae, len, off, req); + return nvme_smart_info(n, rae, len, off, req); case NVME_LOG_FW_SLOT_INFO: - return nvme_fw_log_info(n, cmd, len, off, req); + return nvme_fw_log_info(n, len, off, req); default: trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); return NVME_INVALID_FIELD | NVME_DNR; @@ -1021,9 +1029,9 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) } } -static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd) +static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req) { - NvmeDeleteQ *c = (NvmeDeleteQ *)cmd; + NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd; NvmeCQueue *cq; uint16_t qid = le16_to_cpu(c->qid); @@ -1064,10 +1072,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq); } -static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd) +static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) { NvmeCQueue *cq; - NvmeCreateCq *c = (NvmeCreateCq *)cmd; + NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd; uint16_t cqid = le16_to_cpu(c->cqid); uint16_t vector = le16_to_cpu(c->irq_vector); uint16_t qsize = le16_to_cpu(c->qsize); @@ -1115,8 +1123,9 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd) return NVME_SUCCESS; } -static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c) +static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint64_t prp1 = le64_to_cpu(c->prp1); uint64_t prp2 = le64_to_cpu(c->prp2); @@ -1126,9 +1135,10 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c) prp2, DMA_DIRECTION_FROM_DEVICE); } -static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) +static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) { NvmeNamespace *ns; + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); uint64_t prp1 = le64_to_cpu(c->prp1); uint64_t prp2 = le64_to_cpu(c->prp2); @@ -1146,8 +1156,9 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) prp2, DMA_DIRECTION_FROM_DEVICE); } -static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) +static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) { + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; static const int data_len = NVME_IDENTIFY_DATA_SIZE; uint32_t min_nsid = le32_to_cpu(c->nsid); uint64_t prp1 = le64_to_cpu(c->prp1); @@ -1184,8 +1195,9 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) return ret; } -static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c) +static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) { + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); uint64_t prp1 = le64_to_cpu(c->prp1); uint64_t prp2 = le64_to_cpu(c->prp2); @@ -1224,28 +1236,28 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c) DMA_DIRECTION_FROM_DEVICE); } -static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd) +static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) { - NvmeIdentify *c = (NvmeIdentify *)cmd; + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; switch (le32_to_cpu(c->cns)) { case NVME_ID_CNS_NS: - return nvme_identify_ns(n, c); + return nvme_identify_ns(n, req); case NVME_ID_CNS_CTRL: - return nvme_identify_ctrl(n, c); + return nvme_identify_ctrl(n, req); case NVME_ID_CNS_NS_ACTIVE_LIST: - return nvme_identify_nslist(n, c); + return nvme_identify_nslist(n, req); case NVME_ID_CNS_NS_DESCR_LIST: - return nvme_identify_ns_descr_list(n, c); + return nvme_identify_ns_descr_list(n, req); default: trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); return NVME_INVALID_FIELD | NVME_DNR; } } -static uint16_t nvme_abort(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req) { - uint16_t sqid = le32_to_cpu(cmd->cdw10) & 0xffff; + uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff; req->cqe.result = 1; if (nvme_check_sqid(n, sqid)) { @@ -1295,8 +1307,9 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n) return cpu_to_le64(ts.all); } -static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) +static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) { + NvmeCmd *cmd = &req->cmd; uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); @@ -1306,8 +1319,9 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) prp2, DMA_DIRECTION_FROM_DEVICE); } -static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) { + NvmeCmd *cmd = &req->cmd; uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); uint32_t nsid = le32_to_cpu(cmd->nsid); @@ -1381,7 +1395,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) result = n->features.async_config; goto out; case NVME_TIMESTAMP: - return nvme_get_feature_timestamp(n, cmd); + return nvme_get_feature_timestamp(n, req); default: break; } @@ -1427,10 +1441,11 @@ out: return NVME_SUCCESS; } -static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) +static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) { uint16_t ret; uint64_t timestamp; + NvmeCmd *cmd = &req->cmd; uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); @@ -1445,8 +1460,9 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd) return NVME_SUCCESS; } -static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) { + NvmeCmd *cmd = &req->cmd; uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); uint32_t nsid = le32_to_cpu(cmd->nsid); @@ -1537,14 +1553,14 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) n->features.async_config = dw11; break; case NVME_TIMESTAMP: - return nvme_set_feature_timestamp(n, cmd); + return nvme_set_feature_timestamp(n, req); default: return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; } return NVME_SUCCESS; } -static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_aer(nvme_cid(req)); @@ -1563,33 +1579,33 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return NVME_NO_COMPLETE; } -static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) { - trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), cmd->opcode); + trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode); - switch (cmd->opcode) { + switch (req->cmd.opcode) { case NVME_ADM_CMD_DELETE_SQ: - return nvme_del_sq(n, cmd); + return nvme_del_sq(n, req); case NVME_ADM_CMD_CREATE_SQ: - return nvme_create_sq(n, cmd); + return nvme_create_sq(n, req); case NVME_ADM_CMD_GET_LOG_PAGE: - return nvme_get_log(n, cmd, req); + return nvme_get_log(n, req); case NVME_ADM_CMD_DELETE_CQ: - return nvme_del_cq(n, cmd); + return nvme_del_cq(n, req); case NVME_ADM_CMD_CREATE_CQ: - return nvme_create_cq(n, cmd); + return nvme_create_cq(n, req); case NVME_ADM_CMD_IDENTIFY: - return nvme_identify(n, cmd); + return nvme_identify(n, req); case NVME_ADM_CMD_ABORT: - return nvme_abort(n, cmd, req); + return nvme_abort(n, req); case NVME_ADM_CMD_SET_FEATURES: - return nvme_set_feature(n, cmd, req); + return nvme_set_feature(n, req); case NVME_ADM_CMD_GET_FEATURES: - return nvme_get_feature(n, cmd, req); + return nvme_get_feature(n, req); case NVME_ADM_CMD_ASYNC_EV_REQ: - return nvme_aer(n, cmd, req); + return nvme_aer(n, req); default: - trace_pci_nvme_err_invalid_admin_opc(cmd->opcode); + trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); return NVME_INVALID_OPCODE | NVME_DNR; } } @@ -1613,11 +1629,12 @@ static void nvme_process_sq(void *opaque) req = QTAILQ_FIRST(&sq->req_list); QTAILQ_REMOVE(&sq->req_list, req, entry); QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry); - memset(&req->cqe, 0, sizeof(req->cqe)); + nvme_req_clear(req); req->cqe.cid = cmd.cid; + memcpy(&req->cmd, &cmd, sizeof(NvmeCmd)); - status = sq->sqid ? nvme_io_cmd(n, &cmd, req) : - nvme_admin_cmd(n, &cmd, req); + status = sq->sqid ? nvme_io_cmd(n, req) : + nvme_admin_cmd(n, req); if (status != NVME_NO_COMPLETE) { req->status = status; nvme_enqueue_req_completion(cq, req); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 137cd8c2bf..52ba794f2e 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -21,9 +21,11 @@ typedef struct NvmeAsyncEvent { typedef struct NvmeRequest { struct NvmeSQueue *sq; + struct NvmeNamespace *ns; BlockAIOCB *aiocb; uint16_t status; NvmeCqe cqe; + NvmeCmd cmd; BlockAcctCookie acct; QEMUSGList qsg; QEMUIOVector iov; From c660ad250e04c598330f366a2b612ce60285d199 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 29 Jun 2020 10:04:10 +0200 Subject: [PATCH 37/39] hw/block/nvme: consolidate qsg/iov clearing Always destroy the request qsg/iov at the end of request use. Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 52 ++++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7d5c69c04f..a4fdc8e0a8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -232,6 +232,17 @@ static void nvme_req_clear(NvmeRequest *req) memset(&req->cqe, 0x0, sizeof(req->cqe)); } +static void nvme_req_exit(NvmeRequest *req) +{ + if (req->qsg.sg) { + qemu_sglist_destroy(&req->qsg); + } + + if (req->iov.iov) { + qemu_iovec_destroy(&req->iov); + } +} + static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, size_t len) { @@ -312,15 +323,14 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, status = nvme_map_addr(n, qsg, iov, prp1, trans_len); if (status) { - goto unmap; + return status; } len -= trans_len; if (len) { if (unlikely(!prp2)) { trace_pci_nvme_err_invalid_prp2_missing(); - status = NVME_INVALID_FIELD | NVME_DNR; - goto unmap; + return NVME_INVALID_FIELD | NVME_DNR; } if (len > n->page_size) { @@ -341,13 +351,11 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, if (i == n->max_prp_ents - 1 && len > n->page_size) { if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - status = NVME_INVALID_FIELD | NVME_DNR; - goto unmap; + return NVME_INVALID_FIELD | NVME_DNR; } if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) { - status = NVME_INVALID_USE_OF_CMB | NVME_DNR; - goto unmap; + return NVME_INVALID_USE_OF_CMB | NVME_DNR; } i = 0; @@ -360,14 +368,13 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - status = NVME_INVALID_FIELD | NVME_DNR; - goto unmap; + return NVME_INVALID_FIELD | NVME_DNR; } trans_len = MIN(len, n->page_size); status = nvme_map_addr(n, qsg, iov, prp_ent, trans_len); if (status) { - goto unmap; + return status; } len -= trans_len; @@ -376,27 +383,16 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, } else { if (unlikely(prp2 & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prp2_align(prp2); - status = NVME_INVALID_FIELD | NVME_DNR; - goto unmap; + return NVME_INVALID_FIELD | NVME_DNR; } status = nvme_map_addr(n, qsg, iov, prp2, len); if (status) { - goto unmap; + return status; } } } + return NVME_SUCCESS; - -unmap: - if (iov && iov->iov) { - qemu_iovec_destroy(iov); - } - - if (qsg && qsg->sg) { - qemu_sglist_destroy(qsg); - } - - return status; } static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, @@ -481,6 +477,7 @@ static void nvme_post_cqes(void *opaque) nvme_inc_cq_tail(cq); pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe, sizeof(req->cqe)); + nvme_req_exit(req); QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); } if (cq->tail != cq->head) { @@ -617,13 +614,6 @@ static void nvme_rw_cb(void *opaque, int ret) req->status = NVME_INTERNAL_DEV_ERROR; } - if (req->qsg.nalloc) { - qemu_sglist_destroy(&req->qsg); - } - if (req->iov.nalloc) { - qemu_iovec_destroy(&req->iov); - } - nvme_enqueue_req_completion(cq, req); } From d424d36a496da064de866469a344b07b771fbc12 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 09:34:34 -0800 Subject: [PATCH 38/39] hw/block/nvme: use preallocated qsg/iov in nvme_dma_prp Since clean up of the request qsg/iov is now always done post-use, there is no need to use a stack-allocated qsg/iov in nvme_dma_prp. Signed-off-by: Klaus Jensen Acked-by: Keith Busch Reviewed-by: Maxim Levitsky Reviewed-by: Minwoo Im --- hw/block/nvme.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a4fdc8e0a8..01648cfb4d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -396,50 +396,45 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, } static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, - uint64_t prp1, uint64_t prp2, DMADirection dir) + uint64_t prp1, uint64_t prp2, DMADirection dir, + NvmeRequest *req) { - QEMUSGList qsg; - QEMUIOVector iov; uint16_t status = NVME_SUCCESS; - status = nvme_map_prp(&qsg, &iov, prp1, prp2, len, n); + status = nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, len, n); if (status) { return status; } /* assert that only one of qsg and iov carries data */ - assert((qsg.nsg > 0) != (iov.niov > 0)); + assert((req->qsg.nsg > 0) != (req->iov.niov > 0)); - if (qsg.nsg > 0) { + if (req->qsg.nsg > 0) { uint64_t residual; if (dir == DMA_DIRECTION_TO_DEVICE) { - residual = dma_buf_write(ptr, len, &qsg); + residual = dma_buf_write(ptr, len, &req->qsg); } else { - residual = dma_buf_read(ptr, len, &qsg); + residual = dma_buf_read(ptr, len, &req->qsg); } if (unlikely(residual)) { trace_pci_nvme_err_invalid_dma(); status = NVME_INVALID_FIELD | NVME_DNR; } - - qemu_sglist_destroy(&qsg); } else { size_t bytes; if (dir == DMA_DIRECTION_TO_DEVICE) { - bytes = qemu_iovec_to_buf(&iov, 0, ptr, len); + bytes = qemu_iovec_to_buf(&req->iov, 0, ptr, len); } else { - bytes = qemu_iovec_from_buf(&iov, 0, ptr, len); + bytes = qemu_iovec_from_buf(&req->iov, 0, ptr, len); } if (unlikely(bytes != len)) { trace_pci_nvme_err_invalid_dma(); status = NVME_INVALID_FIELD | NVME_DNR; } - - qemu_iovec_destroy(&iov); } return status; @@ -908,7 +903,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, } return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE); + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, @@ -931,7 +926,7 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, trans_len = MIN(sizeof(fw_log) - off, buf_len); return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE); + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, @@ -956,7 +951,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, trans_len = MIN(sizeof(errlog) - off, buf_len); return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE); + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) @@ -1122,7 +1117,7 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_identify_ctrl(); return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE); + prp2, DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) @@ -1143,7 +1138,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) ns = &n->namespaces[nsid - 1]; return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE); + prp2, DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) @@ -1180,7 +1175,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) } } ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE); + DMA_DIRECTION_FROM_DEVICE, req); g_free(list); return ret; } @@ -1223,7 +1218,7 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) stl_be_p(&ns_descrs->uuid.v, nsid); return nvme_dma_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE); + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) @@ -1306,7 +1301,7 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) uint64_t timestamp = nvme_get_timestamp(n); return nvme_dma_prp(n, (uint8_t *)×tamp, sizeof(timestamp), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE); + prp2, DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) @@ -1440,7 +1435,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); ret = nvme_dma_prp(n, (uint8_t *)×tamp, sizeof(timestamp), prp1, - prp2, DMA_DIRECTION_TO_DEVICE); + prp2, DMA_DIRECTION_TO_DEVICE, req); if (ret != NVME_SUCCESS) { return ret; } From 82386d4a0f19ff1e6a04e29f104da1d12269539e Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 29 Jul 2020 21:55:37 +0200 Subject: [PATCH 39/39] hw/block/nvme: remove explicit qsg/iov parameters Since nvme_map_prp always operate on the request-scoped qsg/iovs, just pass a single pointer to the NvmeRequest instead of two for each of the qsg and iov. Suggested-by: Minwoo Im Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Maxim Levitsky --- hw/block/nvme.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 01648cfb4d..63078f6009 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -299,8 +299,8 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, return NVME_SUCCESS; } -static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, - uint64_t prp2, uint32_t len, NvmeCtrl *n) +static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, + uint32_t len, NvmeRequest *req) { hwaddr trans_len = n->page_size - (prp1 % n->page_size); trans_len = MIN(len, trans_len); @@ -308,6 +308,9 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, uint16_t status; bool prp_list_in_cmb = false; + QEMUSGList *qsg = &req->qsg; + QEMUIOVector *iov = &req->iov; + trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); if (unlikely(!prp1)) { @@ -401,7 +404,7 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, { uint16_t status = NVME_SUCCESS; - status = nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, len, n); + status = nvme_map_prp(n, prp1, prp2, len, req); if (status) { return status; } @@ -446,7 +449,7 @@ static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - return nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, len, n); + return nvme_map_prp(n, prp1, prp2, len, req); } static void nvme_post_cqes(void *opaque)