Re: [PATCH v4 17/24] nvme: allow multiple aios per command

2020-01-13 Thread Klaus Birkelund Jensen
On Jan  9 11:40, Beata Michalska wrote:
> Hi Klaus,
> 
> On Thu, 19 Dec 2019 at 13:09, Klaus Jensen  wrote:
> > +static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset, size_t len,
> > +    QEMUSGList *qsg, QEMUIOVector *iov, NvmeRequest *req,
> > +    NvmeAIOCompletionFunc *cb)
> 
> Minor: The indentation here (and in a few other places across the patchset)
> does not seem right. And maybe inline?

I tried to follow the style in CODING_STYLE.rst for "Multiline Indent",
but the style for function definitions is a bit underspecified there.

I can change it to align with the opening parenthesis; I just found the
"one indent" style more readable for these long function definitions.

> Also: it seems that there are cases where some of the parameters are
> not required (NULL); maybe having a simplified version for those cases
> might be useful?
> 

True. Actually, at this point in the series there are no users of
NvmeAIOCompletionFunc; it is preparatory for other patches I have in the
pipeline. But I'll clean it up.
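
For the cases that do not need an sglist/iovec or a completion function,
a thin wrapper is probably enough; something like this (sketch only, the
name is made up):

static inline NvmeAIO *nvme_aio_new_simple(BlockBackend *blk, int64_t offset,
    size_t len, NvmeRequest *req)
{
    return nvme_aio_new(blk, offset, len, NULL, NULL, req, NULL);
}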

> > +static void nvme_aio_cb(void *opaque, int ret)
> > +{
> > +    NvmeAIO *aio = opaque;
> > +    NvmeRequest *req = aio->req;
> > +
> > +    BlockBackend *blk = aio->blk;
> > +    BlockAcctCookie *acct = &aio->acct;
> > +    BlockAcctStats *stats = blk_get_stats(blk);
> > +
> > +    Error *local_err = NULL;
> > +
> > +    trace_nvme_dev_aio_cb(nvme_cid(req), aio, blk_name(blk), aio->offset,
> > +        nvme_aio_opc_str(aio), req);
> > +
> > +    if (req) {
> > +        QTAILQ_REMOVE(&req->aio_tailq, aio, tailq_entry);
> > +    }
> > +
> >      if (!ret) {
> > -        block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
> > -        req->status = NVME_SUCCESS;
> > +        block_acct_done(stats, acct);
> > +
> > +        if (aio->cb) {
> > +            aio->cb(aio, aio->cb_arg);
> 
> We are dropping setting the status to SUCCESS here,
> is that expected?

Yes, that is on purpose. nvme_aio_cb is called for *each* issued AIO and
we do not want to overwrite a previously set error status with a success
(if one aio in the request fails even though others succeed, it should
not go unnoticed). Note that NVME_SUCCESS is the default setting in the
request, so if no one sets an error code we are still good.
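
To make that concrete, the overall shape I am aiming for in nvme_aio_cb
is something like the following (just a sketch, not the literal patch;
the error code below is a placeholder):

    if (req) {
        QTAILQ_REMOVE(&req->aio_tailq, aio, tailq_entry);
    }

    if (!ret) {
        block_acct_done(stats, acct);

        /*
         * Deliberately leave req->status alone here; NVME_SUCCESS is the
         * default and an error set by another aio of the same request
         * must not be clobbered.
         */
        if (aio->cb) {
            aio->cb(aio, aio->cb_arg);
        }
    } else {
        block_acct_failed(stats, acct);

        if (req) {
            /* placeholder; only ever overwrite with an error status */
            req->status = NVME_INTERNAL_DEV_ERROR;
        }
    }

    if (req && QTAILQ_EMPTY(&req->aio_tailq)) {
        NvmeSQueue *sq = req->sq;
        NvmeCQueue *cq = sq->ctrl->cq[sq->cqid];

        /* last outstanding aio for the command; post the CQE now */
        nvme_enqueue_req_completion(cq, req);
    }

    nvme_aio_destroy(aio);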

> Also, the aio callback will not get
> called in case of failure, and it probably should?
> 

I tried both, but ended up just not calling it on failure. I think that
in the future some AIO callbacks might want to take a different action
if the request failed, though, so I'll add it back in and add the aio
return value (ret) to the callback function definition.
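
Concretely, something along these lines (sketch; the final typedef may
end up looking slightly different):

typedef void NvmeAIOCompletionFunc(NvmeAIO *aio, void *opaque, int ret);

    /* in nvme_aio_cb(), invoked on both the success and the failure path */
    if (aio->cb) {
        aio->cb(aio, aio->cb_arg, ret);
    }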


Thanks,
Klaus


Re: [PATCH v4 17/24] nvme: allow multiple aios per command

2020-01-09 Thread Beata Michalska
Hi Klaus,

On Thu, 19 Dec 2019 at 13:09, Klaus Jensen  wrote:
>
> This refactors how the device issues asynchronous block backend
> requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> associated with the command. This allows multiple aios to be issued for
> a command. Only when all requests have been completed will the device
> post a completion queue entry.
>
> Because the device is currently guaranteed to only issue a single aio
> request per command, the benefit is not immediately obvious. But this
> functionality is required to support metadata, the dataset management
> command and other features.
>
> Signed-off-by: Klaus Jensen 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 422 ++
>  hw/block/nvme.h   | 126 +++--
>  hw/block/trace-events |   8 +
>  3 files changed, 461 insertions(+), 95 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index be554ae1e94c..56659bbe263a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,7 +19,8 @@
>   *  -drive file=,if=none,id=
>   *  -device nvme,drive=,serial=,id=, \
>   *  cmb_size_mb=, \
> - *  num_queues=
> + *  num_queues=, \
> + *  mdts=
>   *
>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> @@ -55,6 +56,7 @@
>      } while (0)
>
>  static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);
>
>  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
>  {
> @@ -339,6 +341,116 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
>      return status;
>  }
>
> +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> +    NvmeNamespace *ns = req->ns;
> +
> +    uint32_t len = req->nlb << nvme_ns_lbads(ns);
> +    uint64_t prp1 = le64_to_cpu(cmd->prp1);
> +    uint64_t prp2 = le64_to_cpu(cmd->prp2);
> +
> +    return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
> +}
> +
> +static void nvme_aio_destroy(NvmeAIO *aio)
> +{
> +    g_free(aio);
> +}
> +
> +static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset, size_t len,
> +    QEMUSGList *qsg, QEMUIOVector *iov, NvmeRequest *req,
> +    NvmeAIOCompletionFunc *cb)

Minor: The indentation here (and in a few other places across the patchset)
does not seem right. And maybe inline?
Also: it seems that there are cases where some of the parameters are
not required (NULL); maybe having a simplified version for those cases
might be useful?

> +{
> +    NvmeAIO *aio = g_malloc0(sizeof(*aio));
> +
> +    *aio = (NvmeAIO) {
> +        .blk = blk,
> +        .offset = offset,
> +        .len = len,
> +        .req = req,
> +        .qsg = qsg,
> +        .iov = iov,
> +        .cb = cb,
> +    };
> +
> +    return aio;
> +}
> +
> +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> +    NvmeAIOOp opc)
> +{
> +    aio->opc = opc;
> +
> +    trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> +        aio->offset, aio->len, nvme_aio_opc_str(aio), req);
> +
> +    if (req) {
> +        QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> +    }
> +}
> +
> +static void nvme_aio(NvmeAIO *aio)
> +{
> +    BlockBackend *blk = aio->blk;
> +    BlockAcctCookie *acct = &aio->acct;
> +    BlockAcctStats *stats = blk_get_stats(blk);
> +
> +    bool is_write, dma;
> +
> +    switch (aio->opc) {
> +    case NVME_AIO_OPC_NONE:
> +        break;
> +
> +    case NVME_AIO_OPC_FLUSH:
> +        block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
> +        aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
> +        break;
> +
> +    case NVME_AIO_OPC_WRITE_ZEROES:
> +        block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
> +        aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
> +            BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio);
> +        break;
> +
> +    case NVME_AIO_OPC_READ:
> +    case NVME_AIO_OPC_WRITE:
> +        dma = aio->qsg != NULL;
> +        is_write = (aio->opc == NVME_AIO_OPC_WRITE);
> +
> +        block_acct_start(stats, acct, aio->len,
> +            is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
> +
> +        if (dma) {
> +            aio->aiocb = is_write ?
> +                dma_blk_write(blk, aio->qsg, aio->offset,
> +                    BDRV_SECTOR_SIZE, nvme_aio_cb, aio) :
> +                dma_blk_read(blk, aio->qsg, aio->offset,
> +                    BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
> +
> +            return;
> +        }
> +
> +        aio->aiocb = is_write ?
> +            blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
> +                nvme_aio_cb, aio) :
> +            blk_aio_preadv(blk, aio->offset, aio->iov, 0,
> +                nvme_aio_cb, aio);
> +
> +        break;
> +    }
> +}
> +
> +static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeReque

[PATCH v4 17/24] nvme: allow multiple aios per command

2019-12-19 Thread Klaus Jensen
This refactors how the device issues asynchronous block backend
requests. The NvmeRequest now holds a queue of NvmeAIOs that are
associated with the command. This allows multiple aios to be issued for
a command. Only when all requests have been completed will the device
post a completion queue entry.

Because the device is currently guaranteed to only issue a single aio
request per command, the benefit is not immediately obvious. But this
functionality is required to support metadata, the dataset management
command and other features.

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c   | 422 ++
 hw/block/nvme.h   | 126 +++--
 hw/block/trace-events |   8 +
 3 files changed, 461 insertions(+), 95 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index be554ae1e94c..56659bbe263a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,7 +19,8 @@
  *  -drive file=,if=none,id=
  *  -device nvme,drive=,serial=,id=, \
  *  cmb_size_mb=, \
- *  num_queues=
+ *  num_queues=, \
+ *  mdts=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -55,6 +56,7 @@
     } while (0)
 
 static void nvme_process_sq(void *opaque);
+static void nvme_aio_cb(void *opaque, int ret);
 
 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
 {
@@ -339,6 +341,116 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     return status;
 }
 
+static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+    NvmeNamespace *ns = req->ns;
+
+    uint32_t len = req->nlb << nvme_ns_lbads(ns);
+    uint64_t prp1 = le64_to_cpu(cmd->prp1);
+    uint64_t prp2 = le64_to_cpu(cmd->prp2);
+
+    return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
+}
+
+static void nvme_aio_destroy(NvmeAIO *aio)
+{
+    g_free(aio);
+}
+
+static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset, size_t len,
+    QEMUSGList *qsg, QEMUIOVector *iov, NvmeRequest *req,
+    NvmeAIOCompletionFunc *cb)
+{
+    NvmeAIO *aio = g_malloc0(sizeof(*aio));
+
+    *aio = (NvmeAIO) {
+        .blk = blk,
+        .offset = offset,
+        .len = len,
+        .req = req,
+        .qsg = qsg,
+        .iov = iov,
+        .cb = cb,
+    };
+
+    return aio;
+}
+
+static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
+    NvmeAIOOp opc)
+{
+    aio->opc = opc;
+
+    trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
+        aio->offset, aio->len, nvme_aio_opc_str(aio), req);
+
+    if (req) {
+        QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
+    }
+}
+
+static void nvme_aio(NvmeAIO *aio)
+{
+    BlockBackend *blk = aio->blk;
+    BlockAcctCookie *acct = &aio->acct;
+    BlockAcctStats *stats = blk_get_stats(blk);
+
+    bool is_write, dma;
+
+    switch (aio->opc) {
+    case NVME_AIO_OPC_NONE:
+        break;
+
+    case NVME_AIO_OPC_FLUSH:
+        block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+        aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
+        break;
+
+    case NVME_AIO_OPC_WRITE_ZEROES:
+        block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
+        aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
+            BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio);
+        break;
+
+    case NVME_AIO_OPC_READ:
+    case NVME_AIO_OPC_WRITE:
+        dma = aio->qsg != NULL;
+        is_write = (aio->opc == NVME_AIO_OPC_WRITE);
+
+        block_acct_start(stats, acct, aio->len,
+            is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+        if (dma) {
+            aio->aiocb = is_write ?
+                dma_blk_write(blk, aio->qsg, aio->offset,
+                    BDRV_SECTOR_SIZE, nvme_aio_cb, aio) :
+                dma_blk_read(blk, aio->qsg, aio->offset,
+                    BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+
+            return;
+        }
+
+        aio->aiocb = is_write ?
+            blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
+                nvme_aio_cb, aio) :
+            blk_aio_preadv(blk, aio->offset, aio->iov, 0,
+                nvme_aio_cb, aio);
+
+        break;
+    }
+}
+
+static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
+{
+    NvmeAIO *aio;
+    size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size;
+
+    aio = nvme_aio_new(blk, offset, len, &req->qsg, &req->iov, req, NULL);
+    nvme_req_register_aio(req, aio, nvme_req_is_write(req) ?
+        NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ);
+    nvme_aio(aio);
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -372,8 +484,16 @@ static void nvme_post_cqes(void *opaque)
 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
 {
     assert(cq->cqid == req->sq->cqid);
-trace_nvme_dev_enqueue_req_completion(nvme_cid(r