On Jun 8 09:36, Jinhao Fan wrote: > Implement Doorbel Buffer Config command (Section 5.7 in NVMe Spec 1.3) > and Shadow Doorbel buffer & EventIdx buffer handling logic (Section 7.13 > in NVMe Spec 1.3). For queues created before the Doorbell Buffer Config > command, the nvme_dbbuf_config function tries to associate each existing > SQ and CQ with its Shadow Doorbel buffer and EventIdx buffer address. > Queues created after the Doorbell Buffer Config command will have the > doorbell buffers associated with them when they are initialized. > > In nvme_process_sq and nvme_post_cqe, proactively check for Shadow > Doorbell buffer changes instead of wait for doorbell register changes. > This reduces the number of MMIOs. > > Signed-off-by: Jinhao Fan <fanjinhao...@ict.ac.cn> > --- > hw/nvme/ctrl.c | 95 ++++++++++++++++++++++++++++++++++++++++++-- > hw/nvme/nvme.h | 8 ++++ > include/block/nvme.h | 2 + > 3 files changed, 102 insertions(+), 3 deletions(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index 03760ddeae..d3f6c432df 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -223,6 +223,7 @@ static const uint32_t nvme_cse_acs[256] = { > [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, > [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, > [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC, > + [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP, > [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, > }; > > @@ -1304,6 +1305,12 @@ static inline void nvme_blk_write(BlockBackend *blk, > int64_t offset, > } > } > > +static void nvme_update_cq_head(NvmeCQueue *cq) > +{ > + pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head, > + sizeof(cq->head)); > +} > + > static void nvme_post_cqes(void *opaque) > { > NvmeCQueue *cq = opaque; > @@ -1316,6 +1323,10 @@ static void nvme_post_cqes(void *opaque) > NvmeSQueue *sq; > hwaddr addr; > > + if (cq->cqid && n->dbbuf_enabled) { > + nvme_update_cq_head(cq);
Shouldn't we update the cq head eventidx here (prior to reading the doorbell buffer)? Like we do for the sq tail? > + } > + > if (nvme_cq_full(cq)) { > break; > } > @@ -4237,6 +4248,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest > *req) > static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, > uint16_t sqid, uint16_t cqid, uint16_t size) > { > + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap); > int i; > NvmeCQueue *cq; > > @@ -4256,6 +4268,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, > uint64_t dma_addr, > } > sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq); > > + if (sqid && n->dbbuf_dbs && n->dbbuf_eis) { > + sq->db_addr = n->dbbuf_dbs + 2 * sqid * stride; > + sq->ei_addr = n->dbbuf_eis + 2 * sqid * stride; > + } > + > assert(n->cq[cqid]); > cq = n->cq[cqid]; > QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry); > @@ -4599,6 +4616,7 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, > uint64_t dma_addr, > uint16_t cqid, uint16_t vector, uint16_t size, > uint16_t irq_enabled) > { > + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap); > int ret; > > if (msix_enabled(&n->parent_obj)) { > @@ -4615,6 +4633,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, > uint64_t dma_addr, > cq->head = cq->tail = 0; > QTAILQ_INIT(&cq->req_list); > QTAILQ_INIT(&cq->sq_list); > + if (cqid && n->dbbuf_dbs && n->dbbuf_eis) { > + cq->db_addr = n->dbbuf_dbs + (2 * cqid + 1) * stride; > + cq->ei_addr = n->dbbuf_eis + (2 * cqid + 1) * stride; > + } > n->cq[cqid] = cq; > cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq); > } > @@ -5767,6 +5789,43 @@ out: > return status; > } > > +static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req) > +{ > + uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap); > + uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); > + uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); > + int i; > + > + /* Address should be page aligned */ > + if (dbs_addr & 
(n->page_size - 1) || eis_addr & (n->page_size - 1)) { > + return NVME_INVALID_FIELD | NVME_DNR; > + } > + > + /* Save shadow buffer base addr for use during queue creation */ > + n->dbbuf_dbs = dbs_addr; > + n->dbbuf_eis = eis_addr; > + n->dbbuf_enabled = true; > + > + for (i = 1; i < n->params.max_ioqpairs + 1; i++) { > + NvmeSQueue *sq = n->sq[i]; > + NvmeCQueue *cq = n->cq[i]; > + > + if (sq) { > + /* Submission queue tail pointer location, 2 * QID * stride */ > + sq->db_addr = dbs_addr + 2 * i * stride; > + sq->ei_addr = eis_addr + 2 * i * stride; > + } > + > + if (cq) { > + /* Completion queue head pointer location, (2 * QID + 1) * > stride */ > + cq->db_addr = dbs_addr + (2 * i + 1) * stride; > + cq->ei_addr = eis_addr + (2 * i + 1) * stride; > + } > + } Why no love for the admin queue? :) You are special-casing the admin queue below in process_sq() and process_db(), as well as above in post_cqes(). As I'm reading the spec, I do not see why the Admin queue should be treated differently wrt. doorbell buffer configuration. Could this be a left-over from the behavior in the original Google extension (prior to going into NVMe)? I peeked into the kernel and it looks like it doesn't enable doorbell buffers for admin queue, only for subsequently created I/O queues. Keith, is this a bug in the kernel? If the code here would expect the doorbell buffer to be updated for the admin queue as well, would we break? 
> + > + return NVME_SUCCESS; > +} > + > static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) > { > trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, > @@ -5809,6 +5868,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest > *req) > return nvme_aer(n, req); > case NVME_ADM_CMD_NS_ATTACHMENT: > return nvme_ns_attachment(n, req); > + case NVME_ADM_CMD_DBBUF_CONFIG: > + return nvme_dbbuf_config(n, req); > case NVME_ADM_CMD_FORMAT_NVM: > return nvme_format(n, req); > default: > @@ -5818,6 +5879,18 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, > NvmeRequest *req) > return NVME_INVALID_OPCODE | NVME_DNR; > } > > +static void nvme_update_sq_eventidx(const NvmeSQueue *sq) > +{ > + pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail, > + sizeof(sq->tail)); > +} > + > +static void nvme_update_sq_tail(NvmeSQueue *sq) > +{ > + pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail, > + sizeof(sq->tail)); > +} > + > static void nvme_process_sq(void *opaque) > { > NvmeSQueue *sq = opaque; > @@ -5829,6 +5902,10 @@ static void nvme_process_sq(void *opaque) > NvmeCmd cmd; > NvmeRequest *req; > > + if (sq->sqid && n->dbbuf_enabled) { > + nvme_update_sq_tail(sq); > + } > + > while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) { > addr = sq->dma_addr + sq->head * n->sqe_size; > if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) { > @@ -5852,6 +5929,11 @@ static void nvme_process_sq(void *opaque) > req->status = status; > nvme_enqueue_req_completion(cq, req); > } > + > + if (sq->sqid && n->dbbuf_enabled) { > + nvme_update_sq_eventidx(sq); > + nvme_update_sq_tail(sq); > + } > } > } > > @@ -5889,6 +5971,9 @@ static void nvme_ctrl_reset(NvmeCtrl *n) > n->aer_queued = 0; > n->outstanding_aers = 0; > n->qs_created = false; > + n->dbbuf_dbs = 0; > + n->dbbuf_eis = 0; > + n->dbbuf_enabled = false; > } > > static void nvme_ctrl_shutdown(NvmeCtrl *n) > @@ -6397,7 +6482,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, > 
int val) > trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head); > > start_sqs = nvme_cq_full(cq) ? 1 : 0; > - cq->head = new_head; > + if (!cq->cqid || !n->dbbuf_enabled) { > + cq->head = new_head; > + } > if (start_sqs) { > NvmeSQueue *sq; > QTAILQ_FOREACH(sq, &cq->sq_list, entry) { > @@ -6454,7 +6541,9 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, > int val) > > trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail); > > - sq->tail = new_tail; > + if (!sq->sqid || !n->dbbuf_enabled) { > + sq->tail = new_tail; > + } > timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > } > } > @@ -6733,7 +6822,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice > *pci_dev) > > id->mdts = n->params.mdts; > id->ver = cpu_to_le32(NVME_SPEC_VER); > - id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT); > + id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | > NVME_OACS_DBBUF); > id->cntrltype = 0x1; > > /* > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h > index 6773819325..4452e4b1bf 100644 > --- a/hw/nvme/nvme.h > +++ b/hw/nvme/nvme.h > @@ -334,6 +334,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc) > case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES"; > case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ"; > case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT"; > + case NVME_ADM_CMD_DBBUF_CONFIG: return "NVME_ADM_CMD_DBBUF_CONFIG"; > case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM"; > default: return "NVME_ADM_CMD_UNKNOWN"; > } > @@ -365,6 +366,8 @@ typedef struct NvmeSQueue { > uint32_t tail; > uint32_t size; > uint64_t dma_addr; > + uint64_t db_addr; > + uint64_t ei_addr; > QEMUTimer *timer; > NvmeRequest *io_req; > QTAILQ_HEAD(, NvmeRequest) req_list; > @@ -382,6 +385,8 @@ typedef struct NvmeCQueue { > uint32_t vector; > uint32_t size; > uint64_t dma_addr; > + uint64_t db_addr; > + uint64_t ei_addr; > QEMUTimer *timer; > QTAILQ_HEAD(, NvmeSQueue) sq_list; > 
QTAILQ_HEAD(, NvmeRequest) req_list; > @@ -432,6 +437,9 @@ typedef struct NvmeCtrl { > uint64_t starttime_ms; > uint16_t temperature; > uint8_t smart_critical_warning; > + uint64_t dbbuf_dbs; > + uint64_t dbbuf_eis; > + bool dbbuf_enabled; > > struct { > MemoryRegion mem; > diff --git a/include/block/nvme.h b/include/block/nvme.h > index 3737351cc8..5b522d7b0e 100644 > --- a/include/block/nvme.h > +++ b/include/block/nvme.h > @@ -595,6 +595,7 @@ enum NvmeAdminCommands { > NVME_ADM_CMD_ACTIVATE_FW = 0x10, > NVME_ADM_CMD_DOWNLOAD_FW = 0x11, > NVME_ADM_CMD_NS_ATTACHMENT = 0x15, > + NVME_ADM_CMD_DBBUF_CONFIG = 0x7c, > NVME_ADM_CMD_FORMAT_NVM = 0x80, > NVME_ADM_CMD_SECURITY_SEND = 0x81, > NVME_ADM_CMD_SECURITY_RECV = 0x82, > @@ -1134,6 +1135,7 @@ enum NvmeIdCtrlOacs { > NVME_OACS_FORMAT = 1 << 1, > NVME_OACS_FW = 1 << 2, > NVME_OACS_NS_MGMT = 1 << 3, > + NVME_OACS_DBBUF = 1 << 8, > }; > > enum NvmeIdCtrlOncs { > -- > 2.25.1 > -- One of us - No more doubt, silence or taboo about mental illness.
signature.asc
Description: PGP signature