Re: [PATCH v2 14/20] nvme: allow multiple aios per command
On Thu, 21 Nov 2019 at 11:57, Klaus Birkelund wrote: > > On Tue, Nov 12, 2019 at 03:25:06PM +, Beata Michalska wrote: > > Hi Klaus, > > > > On Tue, 15 Oct 2019 at 11:55, Klaus Jensen wrote: > > > @@ -341,19 +344,18 @@ static uint16_t nvme_dma_write_prp(NvmeCtrl *n, > > > uint8_t *ptr, uint32_t len, > > Any reason why the nvme_dma_write_prp is missing the changes applied > > to nvme_dma_read_prp ? > > > > This was addressed by proxy through changes to the previous patch > (by combining the read/write functions). > > > > +case NVME_AIO_OPC_WRITE_ZEROES: > > > +block_acct_start(stats, acct, aio->iov.size, BLOCK_ACCT_WRITE); > > > +aio->aiocb = blk_aio_pwrite_zeroes(aio->blk, aio->offset, > > > +aio->iov.size, BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio); > > Minor: aio->blk => blk > > > > Thanks. Fixed this in a couple of other places as well. > > > > @@ -621,8 +880,11 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd > > > *cmd) > > > sq = n->sq[qid]; > > > while (!QTAILQ_EMPTY(&sq->out_req_list)) { > > > req = QTAILQ_FIRST(&sq->out_req_list); > > > -assert(req->aiocb); > > > -blk_aio_cancel(req->aiocb); > > > +while (!QTAILQ_EMPTY(&req->aio_tailq)) { > > > +aio = QTAILQ_FIRST(&req->aio_tailq); > > > +assert(aio->aiocb); > > > +blk_aio_cancel(aio->aiocb); > > What about releasing memory associated with given aio ? > > I believe the callback is still called when cancelled? That should take > care of it. Or have I misunderstood that? At least for the DMAAIOCBs it > is. > It seems that the completion callback is supposed to be called. My bad. BR Beata > > > +struct NvmeAIO { > > > +NvmeRequest *req; > > > + > > > +NvmeAIOOp opc; > > > +int64_t offset; > > > +BlockBackend*blk; > > > +BlockAIOCB *aiocb; > > > +BlockAcctCookie acct; > > > + > > > +NvmeAIOCompletionFunc *cb; > > > +void *cb_arg; > > > + > > > +QEMUSGList *qsg; > > > +QEMUIOVector iov; > > > > There is a bit of inconsistency on the ownership of IOVs and SGLs. 
> > SGLs now seem to be owned by request whereas IOVs by the aio. > > Would be good to have that unified or documented at least. > > > > Fixed this. The NvmeAIO only holds pointers now. > > > > +#define NVME_REQ_TRANSFER_DMA 0x1 > > This one does not seem to be used > > > > I have dropped the flags and reverted to a simple req->is_cmb as that is > all that is really needed. >
Re: [PATCH v2 14/20] nvme: allow multiple aios per command
On Tue, Nov 12, 2019 at 03:25:06PM +, Beata Michalska wrote: > Hi Klaus, > > On Tue, 15 Oct 2019 at 11:55, Klaus Jensen wrote: > > @@ -341,19 +344,18 @@ static uint16_t nvme_dma_write_prp(NvmeCtrl *n, > > uint8_t *ptr, uint32_t len, > Any reason why the nvme_dma_write_prp is missing the changes applied > to nvme_dma_read_prp ? > This was addressed by proxy through changes to the previous patch (by combining the read/write functions). > > +case NVME_AIO_OPC_WRITE_ZEROES: > > +block_acct_start(stats, acct, aio->iov.size, BLOCK_ACCT_WRITE); > > +aio->aiocb = blk_aio_pwrite_zeroes(aio->blk, aio->offset, > > +aio->iov.size, BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio); > Minor: aio->blk => blk > Thanks. Fixed this in a couple of other places as well. > > @@ -621,8 +880,11 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd) > > sq = n->sq[qid]; > > while (!QTAILQ_EMPTY(&sq->out_req_list)) { > > req = QTAILQ_FIRST(&sq->out_req_list); > > -assert(req->aiocb); > > -blk_aio_cancel(req->aiocb); > > +while (!QTAILQ_EMPTY(&req->aio_tailq)) { > > +aio = QTAILQ_FIRST(&req->aio_tailq); > > +assert(aio->aiocb); > > +blk_aio_cancel(aio->aiocb); > What about releasing memory associated with given aio ? I believe the callback is still called when cancelled? That should take care of it. Or have I misunderstood that? At least for the DMAAIOCBs it is. > > +struct NvmeAIO { > > +NvmeRequest *req; > > + > > +NvmeAIOOp opc; > > +int64_t offset; > > +BlockBackend*blk; > > +BlockAIOCB *aiocb; > > +BlockAcctCookie acct; > > + > > +NvmeAIOCompletionFunc *cb; > > +void *cb_arg; > > + > > +QEMUSGList *qsg; > > +QEMUIOVector iov; > > There is a bit of inconsistency on the ownership of IOVs and SGLs. > SGLs now seem to be owned by request whereas IOVs by the aio. > Would be good to have that unified or documented at least. > Fixed this. The NvmeAIO only holds pointers now. 
> > +#define NVME_REQ_TRANSFER_DMA 0x1 > This one does not seem to be used > I have dropped the flags and reverted to a simple req->is_cmb as that is all that is really needed.
Re: [PATCH v2 14/20] nvme: allow multiple aios per command
Hi Klaus, On Tue, 15 Oct 2019 at 11:55, Klaus Jensen wrote: > > This refactors how the device issues asynchronous block backend > requests. The NvmeRequest now holds a queue of NvmeAIOs that are > associated with the command. This allows multiple aios to be issued for > a command. Only when all requests have been completed will the device > post a completion queue entry. > > Because the device is currently guaranteed to only issue a single aio > request per command, the benefit is not immediately obvious. But this > functionality is required to support metadata. > > Signed-off-by: Klaus Jensen > Signed-off-by: Klaus Jensen > --- > hw/block/nvme.c | 455 +- > hw/block/nvme.h | 165 --- > hw/block/trace-events | 8 + > 3 files changed, 511 insertions(+), 117 deletions(-) > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c > index cbc0b6a660b6..f4b9bd36a04e 100644 > --- a/hw/block/nvme.c > +++ b/hw/block/nvme.c > @@ -25,6 +25,8 @@ > * Default: 64 > * cmb_size_mb= : Size of Controller Memory Buffer in MBs. 
> * Default: 0 (disabled) > + * mdts= : Maximum Data Transfer Size (power of two) > + * Default: 7 > */ > > #include "qemu/osdep.h" > @@ -56,6 +58,7 @@ > } while (0) > > static void nvme_process_sq(void *opaque); > +static void nvme_aio_cb(void *opaque, int ret); > > static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) > { > @@ -197,7 +200,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList > *qsg, uint64_t prp1, > } > > if (nvme_addr_is_cmb(n, prp1)) { > -req->is_cmb = true; > +nvme_req_set_cmb(req); > } > > pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); > @@ -255,8 +258,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList > *qsg, uint64_t prp1, > } > > addr_is_cmb = nvme_addr_is_cmb(n, prp_ent); > -if ((req->is_cmb && !addr_is_cmb) || > -(!req->is_cmb && addr_is_cmb)) { > +if ((nvme_req_is_cmb(req) && !addr_is_cmb) || > +(!nvme_req_is_cmb(req) && addr_is_cmb)) { > status = NVME_INVALID_USE_OF_CMB | NVME_DNR; > goto unmap; > } > @@ -269,8 +272,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList > *qsg, uint64_t prp1, > } > } else { > bool addr_is_cmb = nvme_addr_is_cmb(n, prp2); > -if ((req->is_cmb && !addr_is_cmb) || > -(!req->is_cmb && addr_is_cmb)) { > +if ((nvme_req_is_cmb(req) && !addr_is_cmb) || > +(!nvme_req_is_cmb(req) && addr_is_cmb)) { > status = NVME_INVALID_USE_OF_CMB | NVME_DNR; > goto unmap; > } > @@ -312,7 +315,7 @@ static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t > *ptr, uint32_t len, > return status; > } > > -if (req->is_cmb) { > +if (nvme_req_is_cmb(req)) { > QEMUIOVector iov; > > qemu_iovec_init(&iov, qsg.nsg); > @@ -341,19 +344,18 @@ static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t > *ptr, uint32_t len, Any reason why the nvme_dma_write_prp is missing the changes applied to nvme_dma_read_prp ? 
> static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, > uint64_t prp1, uint64_t prp2, NvmeRequest *req) > { > -QEMUSGList qsg; > uint16_t status = NVME_SUCCESS; > > -status = nvme_map_prp(n, &qsg, prp1, prp2, len, req); > +status = nvme_map_prp(n, &req->qsg, prp1, prp2, len, req); > if (status) { > return status; > } > > -if (req->is_cmb) { > +if (nvme_req_is_cmb(req)) { > QEMUIOVector iov; > > -qemu_iovec_init(&iov, qsg.nsg); > -dma_to_cmb(n, &qsg, &iov); > +qemu_iovec_init(&iov, req->qsg.nsg); > +dma_to_cmb(n, &req->qsg, &iov); > > if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) { > trace_nvme_err_invalid_dma(); > @@ -365,17 +367,137 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t > *ptr, uint32_t len, > goto out; > } > > -if (unlikely(dma_buf_read(ptr, len, &qsg))) { > +if (unlikely(dma_buf_read(ptr, len, &req->qsg))) { > trace_nvme_err_invalid_dma(); > status = NVME_INVALID_FIELD | NVME_DNR; > } > > out: > -qemu_sglist_destroy(&qsg); > +qemu_sglist_destroy(&req->qsg); > > return status; > } > > +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) > +{ > +NvmeNamespace *ns = req->ns; > + > +uint32_t len = req->nlb << nvme_ns_lbads(ns); > +uint64_t prp1 = le64_to_cpu(cmd->prp1); > +uint64_t prp2 = le64_to_cpu(cmd->prp2); > + > +return nvme_map_prp(n, &req->qsg, prp1, prp2, len, req); > +} > + > +static void nvme_aio_destroy(NvmeAI
[PATCH v2 14/20] nvme: allow multiple aios per command
This refactors how the device issues asynchronous block backend requests. The NvmeRequest now holds a queue of NvmeAIOs that are associated with the command. This allows multiple aios to be issued for a command. Only when all requests have been completed will the device post a completion queue entry. Because the device is currently guaranteed to only issue a single aio request per command, the benefit is not immediately obvious. But this functionality is required to support metadata. Signed-off-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 455 +- hw/block/nvme.h | 165 --- hw/block/trace-events | 8 + 3 files changed, 511 insertions(+), 117 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index cbc0b6a660b6..f4b9bd36a04e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -25,6 +25,8 @@ * Default: 64 * cmb_size_mb= : Size of Controller Memory Buffer in MBs. * Default: 0 (disabled) + * mdts= : Maximum Data Transfer Size (power of two) + * Default: 7 */ #include "qemu/osdep.h" @@ -56,6 +58,7 @@ } while (0) static void nvme_process_sq(void *opaque); +static void nvme_aio_cb(void *opaque, int ret); static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { @@ -197,7 +200,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, uint64_t prp1, } if (nvme_addr_is_cmb(n, prp1)) { -req->is_cmb = true; +nvme_req_set_cmb(req); } pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); @@ -255,8 +258,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, uint64_t prp1, } addr_is_cmb = nvme_addr_is_cmb(n, prp_ent); -if ((req->is_cmb && !addr_is_cmb) || -(!req->is_cmb && addr_is_cmb)) { +if ((nvme_req_is_cmb(req) && !addr_is_cmb) || +(!nvme_req_is_cmb(req) && addr_is_cmb)) { status = NVME_INVALID_USE_OF_CMB | NVME_DNR; goto unmap; } @@ -269,8 +272,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, uint64_t prp1, } } else { bool addr_is_cmb = nvme_addr_is_cmb(n, prp2); -if ((req->is_cmb && !addr_is_cmb) || 
-(!req->is_cmb && addr_is_cmb)) { +if ((nvme_req_is_cmb(req) && !addr_is_cmb) || +(!nvme_req_is_cmb(req) && addr_is_cmb)) { status = NVME_INVALID_USE_OF_CMB | NVME_DNR; goto unmap; } @@ -312,7 +315,7 @@ static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return status; } -if (req->is_cmb) { +if (nvme_req_is_cmb(req)) { QEMUIOVector iov; qemu_iovec_init(&iov, qsg.nsg); @@ -341,19 +344,18 @@ static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, uint64_t prp1, uint64_t prp2, NvmeRequest *req) { -QEMUSGList qsg; uint16_t status = NVME_SUCCESS; -status = nvme_map_prp(n, &qsg, prp1, prp2, len, req); +status = nvme_map_prp(n, &req->qsg, prp1, prp2, len, req); if (status) { return status; } -if (req->is_cmb) { +if (nvme_req_is_cmb(req)) { QEMUIOVector iov; -qemu_iovec_init(&iov, qsg.nsg); -dma_to_cmb(n, &qsg, &iov); +qemu_iovec_init(&iov, req->qsg.nsg); +dma_to_cmb(n, &req->qsg, &iov); if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) { trace_nvme_err_invalid_dma(); @@ -365,17 +367,137 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, goto out; } -if (unlikely(dma_buf_read(ptr, len, &qsg))) { +if (unlikely(dma_buf_read(ptr, len, &req->qsg))) { trace_nvme_err_invalid_dma(); status = NVME_INVALID_FIELD | NVME_DNR; } out: -qemu_sglist_destroy(&qsg); +qemu_sglist_destroy(&req->qsg); return status; } +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) +{ +NvmeNamespace *ns = req->ns; + +uint32_t len = req->nlb << nvme_ns_lbads(ns); +uint64_t prp1 = le64_to_cpu(cmd->prp1); +uint64_t prp2 = le64_to_cpu(cmd->prp2); + +return nvme_map_prp(n, &req->qsg, prp1, prp2, len, req); +} + +static void nvme_aio_destroy(NvmeAIO *aio) +{ +if (aio->iov.nalloc) { +qemu_iovec_destroy(&aio->iov); +} + +g_free(aio); +} + +static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset, +QEMUSGList *qsg, NvmeRequest *req, 
NvmeAIOCompletionFunc *cb) +{ +NvmeAIO *aio = g_malloc0(sizeof(*aio)); + +*aio = (NvmeAIO) { +.blk = blk, +.offset = offset, +.req = req, +.qs