Re: [PATCH v4 17/24] nvme: allow multiple aios per command
On Jan 9 11:40, Beata Michalska wrote: > Hi Klaus, > > On Thu, 19 Dec 2019 at 13:09, Klaus Jensen wrote: > > +static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset, size_t len, > > +QEMUSGList *qsg, QEMUIOVector *iov, NvmeRequest *req, > > +NvmeAIOCompletionFunc *cb) > > Minor: The indentation here (and in a few other places across the patchset) > does not seem right . And maybe inline ? I tried to follow the style in CODING_STYLE.rst for "Multiline Indent", but how the style is for function definition is a bit underspecified. I can change it to align with the opening parenthesis. I just found the "one indent" more readable for these long function definitions. > Also : seems that there are cases when some of the parameters are > not required (NULL) , maybe having a simplified version for those cases > might be useful ? > True. Actually - at this point in the series there are no users of the NvmeAIOCompletionFunc. It is preparatory for other patches I have in the pipeline. But I'll clean it up. > > +static void nvme_aio_cb(void *opaque, int ret) > > +{ > > +NvmeAIO *aio = opaque; > > +NvmeRequest *req = aio->req; > > + > > +BlockBackend *blk = aio->blk; > > +BlockAcctCookie *acct = &aio->acct; > > +BlockAcctStats *stats = blk_get_stats(blk); > > + > > +Error *local_err = NULL; > > + > > +trace_nvme_dev_aio_cb(nvme_cid(req), aio, blk_name(blk), aio->offset, > > +nvme_aio_opc_str(aio), req); > > + > > +if (req) { > > +QTAILQ_REMOVE(&req->aio_tailq, aio, tailq_entry); > > +} > > + > > if (!ret) { > > -block_acct_done(blk_get_stats(n->conf.blk), &req->acct); > > -req->status = NVME_SUCCESS; > > +block_acct_done(stats, acct); > > + > > +if (aio->cb) { > > +aio->cb(aio, aio->cb_arg); > > We are dropping setting status to SUCCESS here, > is that expected ? Yes, that is on purpose. 
nvme_aio_cb is called for *each* issued AIO and we do not want to overwrite a previously set error status with a success (if one aio in the request fails even though others succeed, it should not go unnoticed). Note that NVME_SUCCESS is the default setting in the request, so if no one sets an error code we are still good. > Also the aio callback will not get > called case failure and it probably should ? > I tried both but ended up with just not calling it on failure, but I think that in the future some AIO callbacks might want to take a different action if the request failed, so I'll add it back in and add the aio return value (ret) to the callback function definition. Thanks, Klaus
Re: [PATCH v4 17/24] nvme: allow multiple aios per command
Hi Klaus, On Thu, 19 Dec 2019 at 13:09, Klaus Jensen wrote: > > This refactors how the device issues asynchronous block backend > requests. The NvmeRequest now holds a queue of NvmeAIOs that are > associated with the command. This allows multiple aios to be issued for > a command. Only when all requests have been completed will the device > post a completion queue entry. > > Because the device is currently guaranteed to only issue a single aio > request per command, the benefit is not immediately obvious. But this > functionality is required to support metadata, the dataset management > command and other features. > > Signed-off-by: Klaus Jensen > Signed-off-by: Klaus Jensen > --- > hw/block/nvme.c | 422 ++ > hw/block/nvme.h | 126 +++-- > hw/block/trace-events | 8 + > 3 files changed, 461 insertions(+), 95 deletions(-) > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c > index be554ae1e94c..56659bbe263a 100644 > --- a/hw/block/nvme.c > +++ b/hw/block/nvme.c > @@ -19,7 +19,8 @@ > * -drive file=,if=none,id= > * -device nvme,drive=,serial=,id=, \ > * cmb_size_mb=, \ > - * num_queues= > + * num_queues=, \ > + * mdts= > * > * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at > * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. 
> @@ -55,6 +56,7 @@ > } while (0) > > static void nvme_process_sq(void *opaque); > +static void nvme_aio_cb(void *opaque, int ret); > > static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) > { > @@ -339,6 +341,116 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, > uint32_t len, > return status; > } > > +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) > +{ > +NvmeNamespace *ns = req->ns; > + > +uint32_t len = req->nlb << nvme_ns_lbads(ns); > +uint64_t prp1 = le64_to_cpu(cmd->prp1); > +uint64_t prp2 = le64_to_cpu(cmd->prp2); > + > +return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req); > +} > + > +static void nvme_aio_destroy(NvmeAIO *aio) > +{ > +g_free(aio); > +} > + > +static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset, size_t len, > +QEMUSGList *qsg, QEMUIOVector *iov, NvmeRequest *req, > +NvmeAIOCompletionFunc *cb) Minor: The indentation here (and in a few other places across the patchset) does not seem right . And maybe inline ? Also : seems that there are cases when some of the parameters are not required (NULL) , maybe having a simplified version for those cases might be useful ? 
> +{ > +NvmeAIO *aio = g_malloc0(sizeof(*aio)); > + > +*aio = (NvmeAIO) { > +.blk = blk, > +.offset = offset, > +.len = len, > +.req = req, > +.qsg = qsg, > +.iov = iov, > +.cb = cb, > +}; > + > +return aio; > +} > + > +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio, > +NvmeAIOOp opc) > +{ > +aio->opc = opc; > + > +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk), > +aio->offset, aio->len, nvme_aio_opc_str(aio), req); > + > +if (req) { > +QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry); > +} > +} > + > +static void nvme_aio(NvmeAIO *aio) > +{ > +BlockBackend *blk = aio->blk; > +BlockAcctCookie *acct = &aio->acct; > +BlockAcctStats *stats = blk_get_stats(blk); > + > +bool is_write, dma; > + > +switch (aio->opc) { > +case NVME_AIO_OPC_NONE: > +break; > + > +case NVME_AIO_OPC_FLUSH: > +block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH); > +aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio); > +break; > + > +case NVME_AIO_OPC_WRITE_ZEROES: > +block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE); > +aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len, > +BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio); > +break; > + > +case NVME_AIO_OPC_READ: > +case NVME_AIO_OPC_WRITE: > +dma = aio->qsg != NULL; > +is_write = (aio->opc == NVME_AIO_OPC_WRITE); > + > +block_acct_start(stats, acct, aio->len, > +is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ); > + > +if (dma) { > +aio->aiocb = is_write ? > +dma_blk_write(blk, aio->qsg, aio->offset, > +BDRV_SECTOR_SIZE, nvme_aio_cb, aio) : > +dma_blk_read(blk, aio->qsg, aio->offset, > +BDRV_SECTOR_SIZE, nvme_aio_cb, aio); > + > +return; > +} > + > +aio->aiocb = is_write ? > +blk_aio_pwritev(blk, aio->offset, aio->iov, 0, > +nvme_aio_cb, aio) : > +blk_aio_preadv(blk, aio->offset, aio->iov, 0, > +nvme_aio_cb, aio); > + > +break; > +} > +} > + > +static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeReque
[PATCH v4 17/24] nvme: allow multiple aios per command
This refactors how the device issues asynchronous block backend requests. The NvmeRequest now holds a queue of NvmeAIOs that are associated with the command. This allows multiple aios to be issued for a command. Only when all requests have been completed will the device post a completion queue entry. Because the device is currently guaranteed to only issue a single aio request per command, the benefit is not immediately obvious. But this functionality is required to support metadata, the dataset management command and other features. Signed-off-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 422 ++ hw/block/nvme.h | 126 +++-- hw/block/trace-events | 8 + 3 files changed, 461 insertions(+), 95 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index be554ae1e94c..56659bbe263a 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -19,7 +19,8 @@ * -drive file=,if=none,id= * -device nvme,drive=,serial=,id=, \ * cmb_size_mb=, \ - * num_queues= + * num_queues=, \ + * mdts= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. 
@@ -55,6 +56,7 @@ } while (0) static void nvme_process_sq(void *opaque); +static void nvme_aio_cb(void *opaque, int ret); static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) { @@ -339,6 +341,116 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return status; } +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +{ +NvmeNamespace *ns = req->ns; + +uint32_t len = req->nlb << nvme_ns_lbads(ns); +uint64_t prp1 = le64_to_cpu(cmd->prp1); +uint64_t prp2 = le64_to_cpu(cmd->prp2); + +return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req); +} + +static void nvme_aio_destroy(NvmeAIO *aio) +{ +g_free(aio); +} + +static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset, size_t len, +QEMUSGList *qsg, QEMUIOVector *iov, NvmeRequest *req, +NvmeAIOCompletionFunc *cb) +{ +NvmeAIO *aio = g_malloc0(sizeof(*aio)); + +*aio = (NvmeAIO) { +.blk = blk, +.offset = offset, +.len = len, +.req = req, +.qsg = qsg, +.iov = iov, +.cb = cb, +}; + +return aio; +} + +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio, +NvmeAIOOp opc) +{ +aio->opc = opc; + +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk), +aio->offset, aio->len, nvme_aio_opc_str(aio), req); + +if (req) { +QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry); +} +} + +static void nvme_aio(NvmeAIO *aio) +{ +BlockBackend *blk = aio->blk; +BlockAcctCookie *acct = &aio->acct; +BlockAcctStats *stats = blk_get_stats(blk); + +bool is_write, dma; + +switch (aio->opc) { +case NVME_AIO_OPC_NONE: +break; + +case NVME_AIO_OPC_FLUSH: +block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH); +aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio); +break; + +case NVME_AIO_OPC_WRITE_ZEROES: +block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE); +aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len, +BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio); +break; + +case NVME_AIO_OPC_READ: +case NVME_AIO_OPC_WRITE: +dma = aio->qsg != NULL; 
+is_write = (aio->opc == NVME_AIO_OPC_WRITE); + +block_acct_start(stats, acct, aio->len, +is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ); + +if (dma) { +aio->aiocb = is_write ? +dma_blk_write(blk, aio->qsg, aio->offset, +BDRV_SECTOR_SIZE, nvme_aio_cb, aio) : +dma_blk_read(blk, aio->qsg, aio->offset, +BDRV_SECTOR_SIZE, nvme_aio_cb, aio); + +return; +} + +aio->aiocb = is_write ? +blk_aio_pwritev(blk, aio->offset, aio->iov, 0, +nvme_aio_cb, aio) : +blk_aio_preadv(blk, aio->offset, aio->iov, 0, +nvme_aio_cb, aio); + +break; +} +} + +static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req) +{ +NvmeAIO *aio; +size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size; + +aio = nvme_aio_new(blk, offset, len, &req->qsg, &req->iov, req, NULL); +nvme_req_register_aio(req, aio, nvme_req_is_write(req) ? +NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ); +nvme_aio(aio); +} + static void nvme_post_cqes(void *opaque) { NvmeCQueue *cq = opaque; @@ -372,8 +484,16 @@ static void nvme_post_cqes(void *opaque) static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) { assert(cq->cqid == req->sq->cqid); -trace_nvme_dev_enqueue_req_completion(nvme_cid(r