Re: [PATCH v6 32/42] nvme: allow multiple aios per command
On Mar 31 12:10, Maxim Levitsky wrote:
> On Tue, 2020-03-31 at 07:47 +0200, Klaus Birkelund Jensen wrote:
> > On Mar 25 12:57, Maxim Levitsky wrote:
> > > On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > > > @@ -516,10 +613,10 @@ static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > > >      return NVME_SUCCESS;
> > > >  }
> > > >  
> > > > -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > > > -                                         uint64_t slba, uint32_t nlb,
> > > > -                                         NvmeRequest *req)
> > > > +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
> > > > +                                         uint32_t nlb, NvmeRequest *req)
> > > >  {
> > > > +    NvmeNamespace *ns = req->ns;
> > > >      uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
> > >
> > > This should go to the patch that added nvme_check_bounds as well
> >
> > We can't really, because the NvmeRequest does not hold a reference to
> > the namespace as a struct member at that point. This is also an issue
> > with the nvme_check_prinfo function above.
>
> I see it now. The changes to NvmeRequest together with this are a good
> candidate to split from this patch, to get this patch to a size that is
> easy to review.

I'm factoring those changes and other stuff out into separate patches!
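For reference, the hw/block/nvme.h side of the series is not quoted anywhere in this thread. Judging only from the fields the hunks use (req->ns, req->slba, req->nlb, req->qsg, req->iov, req->aio_tailq), the reworked NvmeRequest is roughly the sketch below; member order and any field not referenced in the quoted code are guesses, not the actual header:

    typedef struct NvmeRequest {
        struct NvmeSQueue         *sq;
        struct NvmeNamespace      *ns;   /* the member discussed above */
        NvmeCqe                   cqe;
        NvmeCmd                   cmd;
        uint16_t                  status;
        uint64_t                  slba;
        uint32_t                  nlb;
        QEMUSGList                qsg;
        QEMUIOVector              iov;
        /* all outstanding AIOs issued on behalf of this command */
        QTAILQ_HEAD(, NvmeAIO)    aio_tailq;
        QTAILQ_ENTRY(NvmeRequest) entry;
    } NvmeRequest;

Splitting exactly this struct change into its own patch is what makes the req->ns assignment movable: once the request carries the namespace, nvme_check_bounds can take it from the request in the patch that introduces the check.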
Re: [PATCH v6 32/42] nvme: allow multiple aios per command
On Tue, 2020-03-31 at 07:47 +0200, Klaus Birkelund Jensen wrote:
> On Mar 25 12:57, Maxim Levitsky wrote:
> > On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > > From: Klaus Jensen
> > >
> > > This refactors how the device issues asynchronous block backend
> > > requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> > > associated with the command. This allows multiple aios to be issued for
> > > a command. Only when all requests have been completed will the device
> > > post a completion queue entry.
> > >
> > > Because the device is currently guaranteed to only issue a single aio
> > > request per command, the benefit is not immediately obvious. But this
> > > functionality is required to support metadata, the dataset management
> > > command and other features.
> > >
> > > Signed-off-by: Klaus Jensen
> > > Signed-off-by: Klaus Jensen
> > > Acked-by: Keith Busch
> > > ---
> > >  hw/block/nvme.c       | 377 +++---
> > >  hw/block/nvme.h       | 129 +--
> > >  hw/block/trace-events |   6 +
> > >  3 files changed, 407 insertions(+), 105 deletions(-)
> > >
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 0d2b5b45b0c5..817384e3b1a9 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
> > >      return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
> > >  }
> > >  
> > > +static void nvme_aio_destroy(NvmeAIO *aio)
> > > +{
> > > +    g_free(aio);
> > > +}
> > > +
> > > +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> >
> > I guess I'll call this nvme_req_add_aio,
> > or nvme_add_aio_to_reg.
> > Thoughts?
> > Also you can leave this as is, but add a comment on top explaining this
>
> nvme_req_add_aio it is :) And comment added.

Thanks a lot!

> > > +                                         NvmeAIOOp opc)
> > > +{
> > > +    aio->opc = opc;
> > > +
> > > +    trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> > > +                                    aio->offset, aio->len,
> > > +                                    nvme_aio_opc_str(aio), req);
> > > +
> > > +    if (req) {
> > > +        QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> > > +    }
> > > +}
> > > +
> > > +static void nvme_submit_aio(NvmeAIO *aio)
> >
> > OK, this name makes sense
> > Also please add a comment on top.
>
> Done.

Thanks!

> > > @@ -505,9 +600,11 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
> > >      return NVME_SUCCESS;
> > >  }
> > >  
> > > -static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > > -                                         uint16_t ctrl, NvmeRequest *req)
> > > +static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, uint16_t ctrl,
> > > +                                         NvmeRequest *req)
> > >  {
> > > +    NvmeNamespace *ns = req->ns;
> > > +
> >
> > This should go to the patch that added nvme_check_prinfo
>
> Probably killing that patch.

Yea, I also agree on that. Once we properly support metadata, then we
can add all the checks for its correctness.

> > > @@ -516,10 +613,10 @@ static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > >      return NVME_SUCCESS;
> > >  }
> > >  
> > > -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > > -                                         uint64_t slba, uint32_t nlb,
> > > -                                         NvmeRequest *req)
> > > +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
> > > +                                         uint32_t nlb, NvmeRequest *req)
> > >  {
> > > +    NvmeNamespace *ns = req->ns;
> > >      uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
> >
> > This should go to the patch that added nvme_check_bounds as well
>
> We can't really, because the NvmeRequest does not hold a reference to
> the namespace as a struct member at that point. This is also an issue
> with the nvme_check_prinfo function above.

I see it now. The changes to NvmeRequest together with this are a good
candidate to split from this patch, to get this patch to a size that is
easy to review.

> > >      if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
> > > @@ -530,55 +627,154 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > >      return NVME_SUCCESS;
> > >  }
> > >  
> > > -static void nvme_rw_cb(void *opaque, int ret)
> > > +static uint16_t nvme_check_rw(NvmeCtrl *n, NvmeRequest *req)
> > > +{
> > > +    NvmeNamespace *ns = req->ns;
> > > +    NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
> > > +    uint16_t ctrl = le16_to_cpu(rw->control);
> > > +    size_t len = req->nlb << nvme_ns_lbads(ns);
> > > +    uint16_t status;
> > > +
> > > +    status = nvme_check_mdts(n, len, req);
> > > +    if (status) {
Re: [PATCH v6 32/42] nvme: allow multiple aios per command
On Mar 25 12:57, Maxim Levitsky wrote:
> On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > From: Klaus Jensen
> >
> > This refactors how the device issues asynchronous block backend
> > requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> > associated with the command. This allows multiple aios to be issued for
> > a command. Only when all requests have been completed will the device
> > post a completion queue entry.
> >
> > Because the device is currently guaranteed to only issue a single aio
> > request per command, the benefit is not immediately obvious. But this
> > functionality is required to support metadata, the dataset management
> > command and other features.
> >
> > Signed-off-by: Klaus Jensen
> > Signed-off-by: Klaus Jensen
> > Acked-by: Keith Busch
> > ---
> >  hw/block/nvme.c       | 377 +++---
> >  hw/block/nvme.h       | 129 +--
> >  hw/block/trace-events |   6 +
> >  3 files changed, 407 insertions(+), 105 deletions(-)
> >
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 0d2b5b45b0c5..817384e3b1a9 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
> >      return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
> >  }
> >  
> > +static void nvme_aio_destroy(NvmeAIO *aio)
> > +{
> > +    g_free(aio);
> > +}
> > +
> > +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
>
> I guess I'll call this nvme_req_add_aio,
> or nvme_add_aio_to_reg.
> Thoughts?
> Also you can leave this as is, but add a comment on top explaining this

nvme_req_add_aio it is :) And comment added.

> > +                                         NvmeAIOOp opc)
> > +{
> > +    aio->opc = opc;
> > +
> > +    trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> > +                                    aio->offset, aio->len,
> > +                                    nvme_aio_opc_str(aio), req);
> > +
> > +    if (req) {
> > +        QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> > +    }
> > +}
> > +
> > +static void nvme_submit_aio(NvmeAIO *aio)
>
> OK, this name makes sense
> Also please add a comment on top.

Done.

> > @@ -505,9 +600,11 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
> >      return NVME_SUCCESS;
> >  }
> >  
> > -static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > -                                         uint16_t ctrl, NvmeRequest *req)
> > +static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, uint16_t ctrl,
> > +                                         NvmeRequest *req)
> >  {
> > +    NvmeNamespace *ns = req->ns;
> > +
>
> This should go to the patch that added nvme_check_prinfo

Probably killing that patch.

> > @@ -516,10 +613,10 @@ static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> >      return NVME_SUCCESS;
> >  }
> >  
> > -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > -                                         uint64_t slba, uint32_t nlb,
> > -                                         NvmeRequest *req)
> > +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
> > +                                         uint32_t nlb, NvmeRequest *req)
> >  {
> > +    NvmeNamespace *ns = req->ns;
> >      uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
>
> This should go to the patch that added nvme_check_bounds as well

We can't really, because the NvmeRequest does not hold a reference to
the namespace as a struct member at that point. This is also an issue
with the nvme_check_prinfo function above.

> >      if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
> > @@ -530,55 +627,154 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> >      return NVME_SUCCESS;
> >  }
> >  
> > -static void nvme_rw_cb(void *opaque, int ret)
> > +static uint16_t nvme_check_rw(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +    NvmeNamespace *ns = req->ns;
> > +    NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
> > +    uint16_t ctrl = le16_to_cpu(rw->control);
> > +    size_t len = req->nlb << nvme_ns_lbads(ns);
> > +    uint16_t status;
> > +
> > +    status = nvme_check_mdts(n, len, req);
> > +    if (status) {
> > +        return status;
> > +    }
> > +
> > +    status = nvme_check_prinfo(n, ctrl, req);
> > +    if (status) {
> > +        return status;
> > +    }
> > +
> > +    status = nvme_check_bounds(n, req->slba, req->nlb, req);
> > +    if (status) {
> > +        return status;
> > +    }
> > +
> > +    return NVME_SUCCESS;
> > +}
>
> Nitpick: I hate to say it but nvme_check_rw should be in a separate patch as
> well. It will also make the diff more readable (when adding a function and
> changing a function at the same time, you get a diff between two unrelated
> things)

Done, but had to do it as a follow up patch.
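The nvme_rw caller that ties these checks to the AIO machinery is not part of the quoted hunks. Under the structure implied above, it plausibly looks like the sketch below; the nvme_map argument list, the n->conf.blk backend lookup, and the NVME_NO_COMPLETE return are inferred from the surrounding code and the upstream device, not taken from the literal patch:

    static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
    {
        NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
        NvmeNamespace *ns = req->ns;
        uint16_t status;

        req->nlb  = le16_to_cpu(rw->nlb) + 1;
        req->slba = le64_to_cpu(rw->slba);

        /* mdts, prinfo and bounds checks, all in one place */
        status = nvme_check_rw(n, req);
        if (status) {
            return status;
        }

        /* sets up req->qsg or req->iov from the command PRPs */
        status = nvme_map(n, &req->cmd, &req->qsg, &req->iov,
                          req->nlb << nvme_ns_lbads(ns), req);
        if (status) {
            return status;
        }

        /* queues one NvmeAIO on req->aio_tailq and submits it */
        nvme_rw_aio(n->conf.blk, req->slba << nvme_ns_lbads(ns), req);

        return NVME_NO_COMPLETE;
    }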
Re: [PATCH v6 32/42] nvme: allow multiple aios per command
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen
>
> This refactors how the device issues asynchronous block backend
> requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> associated with the command. This allows multiple aios to be issued for
> a command. Only when all requests have been completed will the device
> post a completion queue entry.
>
> Because the device is currently guaranteed to only issue a single aio
> request per command, the benefit is not immediately obvious. But this
> functionality is required to support metadata, the dataset management
> command and other features.
>
> Signed-off-by: Klaus Jensen
> Signed-off-by: Klaus Jensen
> Acked-by: Keith Busch
> ---
>  hw/block/nvme.c       | 377 +++---
>  hw/block/nvme.h       | 129 +--
>  hw/block/trace-events |   6 +
>  3 files changed, 407 insertions(+), 105 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 0d2b5b45b0c5..817384e3b1a9 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -59,6 +59,7 @@
>  } while (0)
>  
>  static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);
>  
>  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
>  {
> @@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
>      return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
>  }
>  
> +static void nvme_aio_destroy(NvmeAIO *aio)
> +{
> +    g_free(aio);
> +}
> +
> +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,

I guess I'll call this nvme_req_add_aio,
or nvme_add_aio_to_reg.
Thoughts?
Also you can leave this as is, but add a comment on top explaining this

> +                                         NvmeAIOOp opc)
> +{
> +    aio->opc = opc;
> +
> +    trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> +                                    aio->offset, aio->len,
> +                                    nvme_aio_opc_str(aio), req);
> +
> +    if (req) {
> +        QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> +    }
> +}
> +
> +static void nvme_submit_aio(NvmeAIO *aio)

OK, this name makes sense
Also please add a comment on top.

> +{
> +    BlockBackend *blk = aio->blk;
> +    BlockAcctCookie *acct = &aio->acct;
> +    BlockAcctStats *stats = blk_get_stats(blk);
> +
> +    bool is_write;
> +
> +    switch (aio->opc) {
> +    case NVME_AIO_OPC_NONE:
> +        break;
> +
> +    case NVME_AIO_OPC_FLUSH:
> +        block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
> +        aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
> +        break;
> +
> +    case NVME_AIO_OPC_WRITE_ZEROES:
> +        block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
> +        aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
> +                                           BDRV_REQ_MAY_UNMAP, nvme_aio_cb,
> +                                           aio);
> +        break;
> +
> +    case NVME_AIO_OPC_READ:
> +    case NVME_AIO_OPC_WRITE:
> +        is_write = (aio->opc == NVME_AIO_OPC_WRITE);
> +
> +        block_acct_start(stats, acct, aio->len,
> +                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
> +
> +        if (aio->qsg) {
> +            if (is_write) {
> +                aio->aiocb = dma_blk_write(blk, aio->qsg, aio->offset,
> +                                           BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
> +            } else {
> +                aio->aiocb = dma_blk_read(blk, aio->qsg, aio->offset,
> +                                          BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
> +            }
> +        } else {
> +            if (is_write) {
> +                aio->aiocb = blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
> +                                             nvme_aio_cb, aio);
> +            } else {
> +                aio->aiocb = blk_aio_preadv(blk, aio->offset, aio->iov, 0,
> +                                            nvme_aio_cb, aio);
> +            }
> +        }

Looks much better this way than with an early return!

> +
> +        break;
> +    }
> +}
> +
> +static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
> +{
> +    NvmeAIO *aio;
> +    size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size;
> +
> +    aio = g_new0(NvmeAIO, 1);
> +
> +    *aio = (NvmeAIO) {
> +        .blk = blk,
> +        .offset = offset,
> +        .len = len,
> +        .req = req,
> +        .qsg = req->qsg.sg ? &req->qsg : NULL,
> +        .iov = req->iov.iov ? &req->iov : NULL,

OK, this is the fix for the bug I mentioned in V5, looks good.

> +    };
> +
> +    nvme_req_register_aio(req, aio, nvme_req_is_write(req) ?
> +                          NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ);
> +    nvme_submit_aio(aio);
> +}
> +
>  static void nvme_post_cqes(void *opaque)
>  {
>      NvmeCQueue *cq = opaque;
> @@ -396,6 +490,7 @@ static void nvme_post_cqes(void *opaque)
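As with NvmeRequest, the NvmeAIO definition lives in hw/block/nvme.h, which is not quoted in this excerpt. Reconstructed purely from its usage in nvme_submit_aio and nvme_rw_aio above, it is roughly the following (a sketch, not the actual header):

    typedef struct NvmeAIO {
        NvmeRequest           *req;   /* owning command; may be NULL */
        NvmeAIOOp             opc;    /* NONE, FLUSH, READ, WRITE, WRITE_ZEROES */
        BlockBackend          *blk;
        BlockAIOCB            *aiocb;
        BlockAcctCookie       acct;
        uint64_t              offset;
        size_t                len;
        QEMUSGList            *qsg;   /* non-NULL for scatter/gather DMA */
        QEMUIOVector          *iov;   /* non-NULL for a contiguous buffer */
        QTAILQ_ENTRY(NvmeAIO) tailq_entry;
    } NvmeAIO;

Making qsg and iov pointers here (rather than owned copies) lets nvme_rw_aio hand over only whichever of the two the request actually populated.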
[PATCH v6 32/42] nvme: allow multiple aios per command
From: Klaus Jensen

This refactors how the device issues asynchronous block backend
requests. The NvmeRequest now holds a queue of NvmeAIOs that are
associated with the command. This allows multiple aios to be issued for
a command. Only when all requests have been completed will the device
post a completion queue entry.

Because the device is currently guaranteed to only issue a single aio
request per command, the benefit is not immediately obvious. But this
functionality is required to support metadata, the dataset management
command and other features.

Signed-off-by: Klaus Jensen
Signed-off-by: Klaus Jensen
Acked-by: Keith Busch
---
 hw/block/nvme.c       | 377 +++---
 hw/block/nvme.h       | 129 +--
 hw/block/trace-events |   6 +
 3 files changed, 407 insertions(+), 105 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 0d2b5b45b0c5..817384e3b1a9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -59,6 +59,7 @@
 } while (0)
 
 static void nvme_process_sq(void *opaque);
+static void nvme_aio_cb(void *opaque, int ret);
 
 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
 {
@@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
     return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
 }
 
+static void nvme_aio_destroy(NvmeAIO *aio)
+{
+    g_free(aio);
+}
+
+static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
+                                         NvmeAIOOp opc)
+{
+    aio->opc = opc;
+
+    trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
+                                    aio->offset, aio->len,
+                                    nvme_aio_opc_str(aio), req);
+
+    if (req) {
+        QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
+    }
+}
+
+static void nvme_submit_aio(NvmeAIO *aio)
+{
+    BlockBackend *blk = aio->blk;
+    BlockAcctCookie *acct = &aio->acct;
+    BlockAcctStats *stats = blk_get_stats(blk);
+
+    bool is_write;
+
+    switch (aio->opc) {
+    case NVME_AIO_OPC_NONE:
+        break;
+
+    case NVME_AIO_OPC_FLUSH:
+        block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+        aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
+        break;
+
+    case NVME_AIO_OPC_WRITE_ZEROES:
+        block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
+        aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
+                                           BDRV_REQ_MAY_UNMAP, nvme_aio_cb,
+                                           aio);
+        break;
+
+    case NVME_AIO_OPC_READ:
+    case NVME_AIO_OPC_WRITE:
+        is_write = (aio->opc == NVME_AIO_OPC_WRITE);
+
+        block_acct_start(stats, acct, aio->len,
+                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+        if (aio->qsg) {
+            if (is_write) {
+                aio->aiocb = dma_blk_write(blk, aio->qsg, aio->offset,
+                                           BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+            } else {
+                aio->aiocb = dma_blk_read(blk, aio->qsg, aio->offset,
+                                          BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+            }
+        } else {
+            if (is_write) {
+                aio->aiocb = blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
+                                             nvme_aio_cb, aio);
+            } else {
+                aio->aiocb = blk_aio_preadv(blk, aio->offset, aio->iov, 0,
+                                            nvme_aio_cb, aio);
+            }
+        }
+
+        break;
+    }
+}
+
+static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
+{
+    NvmeAIO *aio;
+    size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size;
+
+    aio = g_new0(NvmeAIO, 1);
+
+    *aio = (NvmeAIO) {
+        .blk = blk,
+        .offset = offset,
+        .len = len,
+        .req = req,
+        .qsg = req->qsg.sg ? &req->qsg : NULL,
+        .iov = req->iov.iov ? &req->iov : NULL,
+    };
+
+    nvme_req_register_aio(req, aio, nvme_req_is_write(req) ?
+                          NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ);
+    nvme_submit_aio(aio);
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -396,6 +490,7 @@ static void nvme_post_cqes(void *opaque)
         nvme_inc_cq_tail(cq);
         pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
             sizeof(req->cqe));
+        nvme_req_clear(req);
         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
     }
     if (cq->tail != cq->head) {
@@ -406,8 +501,8 @@ static void nvme_post_cqes(void *opaque)
 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
 {
     assert(cq->cqid == req->sq->cqid);
-    trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid,
-                                          req->status);
+    trace_nvme_dev_enqueue_req_completion(nvme_cid(req),
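The nvme_aio_cb callback that every submission above points at is forward declared at the top of the patch, but its body falls outside this excerpt. The completion rule from the commit message (post a CQE only once every AIO of the command has finished) implies logic along these lines; the error status, the cq lookup, and the exact ordering are assumptions, not the literal patch:

    static void nvme_aio_cb(void *opaque, int ret)
    {
        NvmeAIO *aio = opaque;
        NvmeRequest *req = aio->req;

        if (req) {
            NvmeSQueue *sq = req->sq;
            NvmeCQueue *cq = sq->ctrl->cq[sq->cqid];

            /* this AIO is finished; unlink it from the command */
            QTAILQ_REMOVE(&req->aio_tailq, aio, tailq_entry);

            if (ret) {
                req->status = NVME_INTERNAL_DEV_ERROR;
            }

            /* the CQE goes out only when the last outstanding AIO
             * for the command has returned */
            if (QTAILQ_EMPTY(&req->aio_tailq)) {
                nvme_enqueue_req_completion(cq, req);
            }
        }

        nvme_aio_destroy(aio);
    }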