Re: [PATCH v3 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-25 Thread Klaus Jensen
On Jun 25 15:57, Andrzej Jakowski wrote:
> On 6/25/20 4:13 AM, Klaus Jensen wrote:
> > 
> > Come to think of it, the above might not even be sufficient since if just
> > one of the nvme_addr_is_cmb checks fails, we end up issuing an invalid
> > pci_dma_read. But I think that it will error out gracefully on that. But
> > this needs to be checked.
> > 
> > I suggest that you just drop the size check from this patch since it's not
> > needed and might need more work to be safe in the context of SGLs anyway.
> > 
> 
> How about just MMIO access to the CMB region? It can be done to any address.
> What guarantees that this will not go outside of the CMB region?
> 

This was addressed in commit 87ad860c622c ("nvme: fix out-of-bounds
access to the CMB").
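
As a sketch of why plain MMIO cannot run outside the CMB (hypothetical handler,
not the code from that commit): the memory core hands the CMB ops an offset
that is relative to the "nvme-cmb" region, so the handler only ever indexes
cmbuf within that region, e.g.

  static uint64_t nvme_cmb_read_sketch(void *opaque, hwaddr addr, unsigned size)
  {
      NvmeCtrl *n = opaque;

      /* addr is an offset within the "nvme-cmb" region, not a bus address,
       * so aligned accesses stay inside n->cmbuf */
      return ldn_le_p(&n->cmbuf[addr], size);
  }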

> >> @@ -1453,33 +1457,62 @@ static void nvme_init_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> >>  id_ns->nuse = id_ns->ncap;
> >>  }
> >>  
> >> -static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> >> +static void nvme_bar4_init(PCIDevice *pci_dev, Error **errp)
> >>  {
> >> -NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
> >> -NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
> >> -
> >> -NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
> >> -NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
> >> -NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
> >> -NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
> >> -NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
> >> -NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
> >> -NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
> >> -
> >> -n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> >> -memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
> >> -  "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> >> -pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
> >> +NvmeCtrl *n = NVME(pci_dev);
> >> +int status;
> >> +uint64_t bar_size;
> >> +uint32_t msix_vectors;
> >> +uint32_t nvme_pba_offset;
> >> +uint32_t cmb_size_units;
> >> +
> >> +msix_vectors = n->params.max_ioqpairs + 1;
> >> +nvme_pba_offset = PCI_MSIX_ENTRY_SIZE * msix_vectors;
> >> +bar_size = nvme_pba_offset + QEMU_ALIGN_UP(msix_vectors, 64) / 8;
> >> +
> >> +if (n->params.cmb_size_mb) {
> >> +NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
> >> +NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
> >> +NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
> >> +NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
> >> +NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
> >> +NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
> >> +NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
> >> +
> >> +cmb_size_units = NVME_CMBSZ_GETSIZEUNITS(n->bar.cmbsz);
> >> +n->cmb_offset = QEMU_ALIGN_UP(bar_size, cmb_size_units);
> >> +
> >> +NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_MSIX_BIR);
> >> +NVME_CMBLOC_SET_OFST(n->bar.cmbloc, n->cmb_offset / cmb_size_units);
> >> +
> >> +n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> >> +
> >> +bar_size += n->cmb_offset;
> >> +bar_size += NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
> >> +}
> >> +
> >> +bar_size = pow2ceil(bar_size);
> >> +
> >> +memory_region_init_io(&n->bar4, OBJECT(n), &nvme_cmb_ops, n,
> >> +  "nvme-bar4", bar_size);
> >> +
> >> +status = msix_init(pci_dev, n->params.max_ioqpairs + 1,
> >> +   &n->bar4, NVME_MSIX_BIR, 0,
> >> +   &n->bar4, NVME_MSIX_BIR, nvme_pba_offset,
> >> +   0, errp);
> > 
> > This needs to use n->params.msix_qsize instead of
> > n->params.max_ioqpairs.
> 
> Makes sense.
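
For reference, the BAR4 arithmetic above works out like this (standalone
illustration with made-up values, msix_qsize = 65 and cmb_size_mb = 2; not
QEMU code):

  #include <inttypes.h>
  #include <stdio.h>

  #define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

  int main(void)
  {
      uint64_t vectors    = 65;                    /* max_ioqpairs + 1        */
      uint64_t pba_offset = 16 * vectors;          /* MSI-X entry is 16 bytes */
      uint64_t bar_size   = pba_offset + ALIGN_UP(vectors, 64) / 8;

      uint64_t cmb_units  = 1 << 20;               /* SZU = 2 -> 1 MiB        */
      uint64_t cmb_offset = ALIGN_UP(bar_size, cmb_units);
      uint64_t cmb_size   = 2ULL << 20;            /* cmb_size_mb = 2         */

      bar_size += cmb_offset + cmb_size;           /* as in the patch         */

      uint64_t bar4 = 1;                           /* pow2ceil()              */
      while (bar4 < bar_size) {
          bar4 <<= 1;
      }

      /* table at 0, PBA at 1040, CMB at 1 MiB, BAR4 rounded up to 4 MiB */
      printf("pba=%" PRIu64 " cmb=%" PRIu64 " bar4=%" PRIu64 "\n",
             pba_offset, cmb_offset, bar4);
      return 0;
  }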

Another thing here. You are initializing a single memory region for bar4
and using nvme_cmb_ops as the callbacks for it.

msix_init then overlays two memory regions on top of this (for the table
and the pba). I'm not sure this is... correct? Paolo, can you shed any
light on this?

It looks to me like we need to do something more like what
msix_init_exclusive_bar does:

  1. do a memory_region_init for n->bar4
  2. msix_init adds io regions for the msix table and pba.
  3. we should then add an io region for the cmb like msix_init does
 (with a memory_region_init_io and a memory_region_add_subregion
 pair) and keep n->ctrl_mem around.
  4. pci_register_bar on n->bar4

Thus, no "general" mmio_ops are registered for bar4, but only for the
ctrl_mem and the msix_{table,pba}_mmio regions.
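
A rough sketch of what I have in mind (hypothetical helper; names, flags and
error handling are guesses, not a finished patch):

  static void nvme_bar4_init_sketch(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
  {
      uint32_t nvme_pba_offset = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
      uint64_t bar_size = nvme_pba_offset +
                          QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;

      if (n->params.cmb_size_mb) {
          /* cmbsz/cmbloc and n->cmb_offset set up as in the patch */
          bar_size = n->cmb_offset + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
      }
      bar_size = pow2ceil(bar_size);

      /* 1. plain container region, no mmio ops of its own */
      memory_region_init(&n->bar4, OBJECT(n), "nvme-bar4", bar_size);

      /* 2. msix_init() overlays the table and PBA io subregions */
      if (msix_init(pci_dev, n->params.msix_qsize,
                    &n->bar4, NVME_MSIX_BIR, 0,
                    &n->bar4, NVME_MSIX_BIR, nvme_pba_offset, 0, errp)) {
          return;
      }

      /* 3. the CMB keeps its own ops and its own region (n->ctrl_mem) */
      if (n->params.cmb_size_mb) {
          memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
                                "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
          memory_region_add_subregion(&n->bar4, n->cmb_offset, &n->ctrl_mem);
      }

      /* 4. expose the whole container as BAR4 */
      pci_register_bar(pci_dev, NVME_MSIX_BIR,
                       PCI_BASE_ADDRESS_SPACE_MEMORY |
                       PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar4);
  }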



Re: [PATCH v3 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-25 Thread Andrzej Jakowski
On 6/25/20 4:13 AM, Klaus Jensen wrote:
> On Jun 22 11:25, Andrzej Jakowski wrote:
>> So far it was not possible to have CMB and PMR emulated on the same
>> device, because BAR2 was used exclusively for either PMR or CMB. This
>> patch places the CMB at an offset in BAR4 so that it does not conflict
>> with the MSI-X vectors.
>>
>> Signed-off-by: Andrzej Jakowski 
>> ---
>>  hw/block/nvme.c  | 119 +++
>>  hw/block/nvme.h  |   3 +-
>>  include/block/nvme.h |   4 +-
>>  3 files changed, 80 insertions(+), 46 deletions(-)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index 9f11f3e9da..ec97b7af3c 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -22,12 +22,12 @@
>>   *  [pmrdev=,] \
>>   *  max_ioqpairs=
>>   *
>> - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>> - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
>> + * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
>> + * to be resident in BAR4 at certain offset - this is because BAR4 is also
>> + * used for storing MSI-X table that is available at offset 0 in BAR4.
>>   *
>> - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
>> - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
>> - * both provided.
>> + * pmrdev is assumed to be resident in BAR2/BAR3. When configured it consumes
>> + * whole BAR2/BAR3 exclusively.
>>   * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
>>   * For example:
>>   * -object memory-backend-file,id=,share=on,mem-path=, \
>> @@ -57,8 +57,8 @@
>>  #define NVME_MAX_IOQPAIRS 0x
>>  #define NVME_REG_SIZE 0x1000
>>  #define NVME_DB_SIZE  4
>> -#define NVME_CMB_BIR 2
>>  #define NVME_PMR_BIR 2
>> +#define NVME_MSIX_BIR 4
>>  
>>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>>  do { \
>> @@ -69,18 +69,19 @@
>>  
>>  static void nvme_process_sq(void *opaque);
>>  
>> -static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>> +static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr, int size)
>>  {
>> -hwaddr low = n->ctrl_mem.addr;
>> -hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
>> +hwaddr low = n->bar4.addr + n->cmb_offset;
>> +hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
>>  
>> -return addr >= low && addr < hi;
>> +return addr >= low && (addr + size) < hi;
> 
> I think the hi check is off by one? Should be `(addr + size - 1) < hi` (or
> `<=`), right? Otherwise we reject a valid read against the last address in
> the CMB.
> 
> Also, a check for wrap-around is missing (`hi < addr` post-addition).
> 
> Anyway, I would really prefer that we do not change nvme_addr_is_cmb
> since it is a useful function on its own. I use it a lot in my PRP
> mapping refactor and SGL patches, so I would need to re-add another
> function that does what nvme_addr_is_cmb used to do.
> 
> But, see my next comment.

Hi Klaus,
Thanks for your review. Yep, I confirm this problem.

> 
>>  }
>>  
>>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>>  {
>> -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
>> -memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
>> +hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
>> +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr, size)) {
>> +memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
>>  return;
>>  }
> 
> I would prefer that we do the range check here (nvme_addr_read already has the
> size parameter) and have nvme_addr_read return a success/fail value. E.g.
> 
>  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> +hwaddr hi = addr + size - 1;
> +if (hi < addr) {
> +return 1;
> +}
> +
> +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
>  memcpy(buf, nvme_addr_to_cmb(n, addr), size);
>  return 0;
>  }
> 
> Actually, since the controller only supports PRPs at this stage, it is
> not required to check the ending address (addr + len - 1) of the CMB access
> for validity since it is guaranteed to be in range of the CMB (nvme_addr_read
> is only used for reading PRPs and they never cross a page boundary so size is
> always <= page_size and the CMB is always at least 4k aligned).
> 
> This change is really only needed when the controller adds support for SGLs,
> which is why I have the above in my tree that supports SGLs.
> 
> Come to think of it, the above might not even be sufficient since if just one
> of the nvme_addr_is_cmb checks fails, we end up issuing an invalid
> pci_dma_read. But I think that it will error out gracefully on that. But
> this needs to be checked.
> 
> I suggest that you just drop the size check from this patch since it's not
> needed and might need more work to be safe in the context of SGLs anyway.

Re: [PATCH v3 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-25 Thread Klaus Jensen
On Jun 22 11:25, Andrzej Jakowski wrote:
> So far it was not possible to have CMB and PMR emulated on the same
> device, because BAR2 was used exclusively for either PMR or CMB. This
> patch places the CMB at an offset in BAR4 so that it does not conflict with
> the MSI-X vectors.
> 
> Signed-off-by: Andrzej Jakowski 
> ---
>  hw/block/nvme.c  | 119 +++
>  hw/block/nvme.h  |   3 +-
>  include/block/nvme.h |   4 +-
>  3 files changed, 80 insertions(+), 46 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 9f11f3e9da..ec97b7af3c 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -22,12 +22,12 @@
>   *  [pmrdev=,] \
>   *  max_ioqpairs=
>   *
> - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
> - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> + * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
> + * to be resident in BAR4 at certain offset - this is because BAR4 is also
> + * used for storing MSI-X table that is available at offset 0 in BAR4.
>   *
> - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
> - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
> - * both provided.
> + * pmrdev is assumed to be resident in BAR2/BAR3. When configured it consumes
> + * whole BAR2/BAR3 exclusively.
>   * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
>   * For example:
>   * -object memory-backend-file,id=,share=on,mem-path=, \
> @@ -57,8 +57,8 @@
>  #define NVME_MAX_IOQPAIRS 0x
>  #define NVME_REG_SIZE 0x1000
>  #define NVME_DB_SIZE  4
> -#define NVME_CMB_BIR 2
>  #define NVME_PMR_BIR 2
> +#define NVME_MSIX_BIR 4
>  
>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>  do { \
> @@ -69,18 +69,19 @@
>  
>  static void nvme_process_sq(void *opaque);
>  
> -static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> +static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr, int size)
>  {
> -hwaddr low = n->ctrl_mem.addr;
> -hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
> +hwaddr low = n->bar4.addr + n->cmb_offset;
> +hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
>  
> -return addr >= low && addr < hi;
> +return addr >= low && (addr + size) < hi;

I think the hi check is off by one? Should be `(addr + size - 1) < hi` (or
`<=`), right? Otherwise we reject a valid read against the last address in
the CMB.

Also, a check for wrap-around is missing (`hi < addr` post-addition).
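
Purely to illustrate the two issues (a hypothetical helper, not something I am
asking you to add), a check that handles both could look like:

  static bool nvme_addr_range_in_cmb(NvmeCtrl *n, hwaddr addr, int size)
  {
      hwaddr low = n->bar4.addr + n->cmb_offset;
      hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
      hwaddr end = addr + size;

      if (end < addr) {
          return false;                 /* wrap-around */
      }

      return addr >= low && end <= hi;  /* inclusive of the last CMB byte */
  }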

Anyway, I would really prefer that we do not change nvme_addr_is_cmb
since it is a useful function on its own. I use it a lot in my PRP
mapping refactor and SGL patches, so I would need to re-add another
function that does what nvme_addr_is_cmb used to do.

But, see my next comment.

>  }
>  
>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> -memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
> +hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
> +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr, size)) {
> +memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
>  return;
>  }

I would prefer that we do the range check here (nvme_addr_read already has the
size parameter) and have nvme_addr_read return a success/fail value. E.g.

 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
+hwaddr hi = addr + size - 1;
+if (hi < addr) {
+return 1;
+}
+
+if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
 return 0;
 }

Actually, since the controller only supports PRPs at this stage, it is
not required to check the ending address (addr + len - 1) of the CMB access for
validity since it is guaranteed to be in range of the CMB (nvme_addr_read is
only used for reading PRPs and they never cross a page boundary so size is
always <= page_size and the CMB is always at least 4k aligned).
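
A tiny standalone illustration (made-up numbers) of that argument: a PRP
transfer never crosses a page boundary, and the CMB is at least 4k aligned
and a multiple of 4k in size, so if the start address is inside the CMB the
whole transfer is:

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
      const uint64_t page = 4096;
      const uint64_t low  = 0xfe000000;          /* CMB base, 4k aligned      */
      const uint64_t hi   = low + (2ULL << 20);  /* 2 MiB CMB                 */

      uint64_t addr = hi - 8;                    /* PRP read in the last page */
      uint64_t size = page - (addr % page);      /* cannot cross the page     */

      assert(addr >= low && addr < hi);          /* the start-address check   */
      assert(addr + size <= hi);                 /* ...covers the whole range */
      return 0;
  }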

This change is really only needed when the controller adds support for SGLs,
which is why I have the above in my tree that supports SGLs.

Come to think of it, the above might not even be sufficient since if just one
of the nvme_addr_is_cmb checks fails, we end up issuing an invalid
pci_dma_read. But I think that it will error out gracefully on that. But
this needs to be checked.

I suggest that you just drop the size check from this patch since it's not
needed and might need more work to be safe in the context of SGLs anyway.

>  
> @@ -167,17 +168,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
>   uint64_t prp2, uint32_t len, NvmeCtrl *n)
>  {
>  hwaddr trans_len = n->page_size - (prp1 % n->page_size);

[PATCH v3 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-22 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively for either PMR or CMB. This
patch places the CMB at an offset in BAR4 so that it does not conflict with
the MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 119 +++
 hw/block/nvme.h  |   3 +-
 include/block/nvme.h |   4 +-
 3 files changed, 80 insertions(+), 46 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 9f11f3e9da..ec97b7af3c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,12 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
+ * to be resident in BAR4 at certain offset - this is because BAR4 is also
+ * used for storing MSI-X table that is available at offset 0 in BAR4.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2/BAR3. When configured it consumes
+ * whole BAR2/BAR3 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -57,8 +57,8 @@
 #define NVME_MAX_IOQPAIRS 0x
 #define NVME_REG_SIZE 0x1000
 #define NVME_DB_SIZE  4
-#define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 2
+#define NVME_MSIX_BIR 4
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
 do { \
@@ -69,18 +69,19 @@
 
 static void nvme_process_sq(void *opaque);
 
-static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
+static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr, int size)
 {
-hwaddr low = n->ctrl_mem.addr;
-hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+hwaddr low = n->bar4.addr + n->cmb_offset;
+hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
 
-return addr >= low && addr < hi;
+return addr >= low && (addr + size) < hi;
 }
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
+if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr, size)) {
+memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
 return;
 }
 
@@ -167,17 +168,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
 trace_pci_nvme_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+} else if (n->bar.cmbsz && prp1 >= cmb_addr &&
+   prp1 < cmb_addr + int128_get64(n->bar4.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
 qemu_sglist_add(qsg, prp1, trans_len);
@@ -222,7 +224,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
 if (qsg->nsg){
 qemu_sglist_add(qsg, prp_ent, trans_len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr],
+   trans_len);
 }
 len -= trans_len;
 i++;
@@ -235,7 +238,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
 if (qsg->nsg) {
 qemu_sglist_add(qsg, prp2, len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr],
+   trans_len);
 }
 }
 }
@@ -1395,7 +1399,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
 return;
 }
 
-if (!n->params.cmb_size_mb && n->pmrdev) {
+if (n->pmrdev) {
 if (host_memory_backend_is_mapped(n->pmrdev)) {
char *path = object_get_canonical_path_component(OBJECT(n->pmrdev));