[PATCH v1 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-05 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively by either PMR or CMB. This
patch places the CMB at an offset in BAR4 so that it does not conflict
with the MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 127 +--
 hw/block/nvme.h  |   3 +-
 include/block/nvme.h |   4 +-
 3 files changed, 91 insertions(+), 43 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f0b45704be..353cf20e0a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,12 @@
  *  [pmrdev=,] \
  *  num_queues=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
+ * to be resident in BAR4 at certain offset - this is because BAR4 is also
+ * used for storing MSI-X table that is available at offset 0 in BAR4.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2. When configured it consumes whole
+ * BAR2 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -64,9 +64,10 @@ static void nvme_process_sq(void *opaque);
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->cmbsz && addr >= n->ctrl_mem.addr &&
-addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
+if (n->cmbsz && addr >= cmb_addr &&
+(addr + size) <= (cmb_addr + NVME_CMBSZ_GETSIZE(n->bar.cmbsz))) {
+memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
 } else {
 pci_dma_read(&n->parent_obj, addr, buf, size);
 }
@@ -152,17 +153,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
 trace_nvme_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+} else if (n->cmbsz && prp1 >= cmb_addr &&
+   prp1 < cmb_addr + int128_get64(n->bar4.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
 qemu_sglist_add(qsg, prp1, trans_len);
@@ -207,7 +209,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
 if (qsg->nsg){
 qemu_sglist_add(qsg, prp_ent, trans_len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr],
+   trans_len);
 }
 len -= trans_len;
 i++;
@@ -220,7 +223,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
 if (qsg->nsg) {
 qemu_sglist_add(qsg, prp2, len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr],
+   trans_len);
 }
 }
 }
@@ -1342,6 +1346,71 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
+#define NVME_MSIX_BIR (4)
+static void nvme_bar4_init(PCIDevice *pci_dev)
+{
+NvmeCtrl *n = NVME(pci_dev);
+int status;
+uint64_t bar_size = 4096;
+uint32_t nvme_pba_offset = bar_size / 2;
+uint32_t nvme_pba_size = QEMU_ALIGN_UP(n->num_queues, 64) / 8;
+uint32_t cmb_size_units;
+
+if (n->num_queues * PCI_MSIX_ENTRY_SIZE > nvme_pba_offset) {
+nvme_pba_offset = n->num_queues * PCI_MSIX_ENTRY_SIZE;
+}
+
+if (nvme_pba_offset + nvme_pba_size > 4096) {
+bar_size = nvme_pba_offset + nvme_pba_size;
+}
+
+if (n->cmb_size_mb) {
+/* Controller capabilities */
+NVME_CAP_SET_CMBS(n->bar.cap, 1);
+
+NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
+ 

Re: [PATCH v1 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-08 Thread Klaus Jensen
On Jun  5 11:10, Andrzej Jakowski wrote:
> So far it was not possible to have CMB and PMR emulated on the same
> device, because BAR2 was used exclusively by either PMR or CMB. This
> patch places the CMB at an offset in BAR4 so that it does not conflict
> with the MSI-X vectors.
> 

Hi Andrzej,

Thanks for doing this, it's a nice addition!

Though, I would prefer that the table and PBA were located in BAR0,
keeping BAR4 for exclusive CMB use. I'm no expert on this, but is it OK
to have the table and PBA in prefetchable memory? Having it "together"
with the other controller-level configuration memory just feels more
natural to me, but I'm not gonna put my foot down.

Using BAR0 would also slightly simplify the patch since no changes would
be required for the CMB path.
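
Something along these lines is what I have in mind (just a rough, untested
sketch: the "iomem" MemoryRegion name and the offsets are placeholders, and
BAR0 would of course have to be sized to cover them):

#include "qemu/osdep.h"
#include "hw/pci/msix.h"

/* Sketch: put both the MSI-X table and the PBA into the existing BAR0
 * register region, leaving BAR4 entirely to the CMB. */
static int nvme_init_msix_in_bar0(PCIDevice *pci_dev, MemoryRegion *iomem,
                                  unsigned short nentries, Error **errp)
{
    return msix_init(pci_dev, nentries,
                     iomem, 0, 0x2000,   /* table: BAR0, offset 0x2000 */
                     iomem, 0, 0x3000,   /* PBA:   BAR0, offset 0x3000 */
                     0, errp);           /* cap offset 0: let the core pick */
}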

Also, can you rebase this on Kevin's block branch? There are a bunch of
refactoring patches that change the realization code, so this patch
doesn't apply at all.

> Signed-off-by: Andrzej Jakowski 
> ---
>  hw/block/nvme.c  | 127 +--
>  hw/block/nvme.h  |   3 +-
>  include/block/nvme.h |   4 +-
>  3 files changed, 91 insertions(+), 43 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index f0b45704be..353cf20e0a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -22,12 +22,12 @@
>   *  [pmrdev=,] \
>   *  num_queues=
>   *
> - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
> - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> + * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
> + * to be resident in BAR4 at certain offset - this is because BAR4 is also
> + * used for storing MSI-X table that is available at offset 0 in BAR4.
>   *
> - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
> - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
> - * both provided.
> + * pmrdev is assumed to be resident in BAR2. When configured it consumes whole
> + * BAR2 exclusively.

Actually it uses both BAR2 and BAR3, since it's a 64-bit BAR.

> @@ -1342,6 +1346,71 @@ static const MemoryRegionOps nvme_cmb_ops = {
>  },
>  };
>  
> +#define NVME_MSIX_BIR (4)
> +static void nvme_bar4_init(PCIDevice *pci_dev)
> +{
> +NvmeCtrl *n = NVME(pci_dev);
> +int status;
> +uint64_t bar_size = 4096;
> +uint32_t nvme_pba_offset = bar_size / 2;
> +uint32_t nvme_pba_size = QEMU_ALIGN_UP(n->num_queues, 64) / 8;
> +uint32_t cmb_size_units;
> +
> +if (n->num_queues * PCI_MSIX_ENTRY_SIZE > nvme_pba_offset) {
> +nvme_pba_offset = n->num_queues * PCI_MSIX_ENTRY_SIZE;
> +}
> +
> +if (nvme_pba_offset + nvme_pba_size > 4096) {
> +bar_size = nvme_pba_offset + nvme_pba_size;
> +}
> +

This is migration compatibility stuff that is not needed because the
nvme device is unmigratable anyway.
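
Without that compatibility dance, the sizing could boil down to something
like the following (a standalone sketch with made-up names, not a drop-in
replacement for nvme_bar4_init()): MSI-X table at offset 0, PBA right
behind it, CMB aligned up behind the PBA.

#include <stdint.h>

#define MSIX_ENTRY_SIZE 16                     /* bytes per MSI-X table entry */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) / (a) * (a))

/* Hypothetical helper: returns the total BAR4 size and, via *cmb_offset,
 * where the CMB starts when the MSI-X table sits at offset 0 and the PBA
 * immediately follows it. */
static uint64_t nvme_bar4_layout(uint32_t num_queues, uint64_t cmb_bytes,
                                 uint64_t cmb_unit, uint64_t *cmb_offset)
{
    uint64_t table_size = (uint64_t)num_queues * MSIX_ENTRY_SIZE;
    uint64_t pba_size = ALIGN_UP((uint64_t)num_queues, 64) / 8; /* one pending bit per vector */

    *cmb_offset = ALIGN_UP(table_size + pba_size, cmb_unit);
    return *cmb_offset + cmb_bytes;
}

With num_queues = 64, for example, that is 1024 bytes of table plus 8 bytes
of PBA, so the CMB simply starts at the next cmb_unit boundary.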




Re: [PATCH v1 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-08 Thread Andrzej Jakowski
On 6/8/20 1:08 AM, Klaus Jensen wrote:
> On Jun  5 11:10, Andrzej Jakowski wrote:
>> So far it was not possible to have CMB and PMR emulated on the same
>> device, because BAR2 was used exclusively by either PMR or CMB. This
>> patch places the CMB at an offset in BAR4 so that it does not conflict
>> with the MSI-X vectors.
>>
> 
> Hi Andrzej,
> 
> Thanks for doing this, it's a nice addition!
> 
> Though, I would prefer that the table and PBA were located in BAR0,
> keeping BAR4 for exclusive CMB use. I'm no expert on this, but is it OK
> to have the table and PBA in prefetchable memory? Having it "together"
> with the other controller-level configuration memory just feels more
> natural to me, but I'm not gonna put my foot down.
Hi Klaus,

Thx for your feedback!
I don't think it matters whether the MSI-X table is in prefetchable or
non-prefetchable memory.
My understanding is that the spec allows the MSI-X table and PBA to be in
any BAR and at any offset. I understand your preference and at the same
time think that, since it is not in violation of the spec, why don't we
leave it as-is?
Does anybody know what the typical approach is for real devices?
> 
> Using BAR0 would also slightly simplify the patch since no changes would
> be required for the CMB path.
> 
> Also, can you rebase this on Kevin's block branch? There are a bunch of
> refactoring patches that change the realization code, so this patch
> doesn't apply at all.
Yep, will rebase it.
> 
>> Signed-off-by: Andrzej Jakowski 
>> ---
>>  hw/block/nvme.c  | 127 +--
>>  hw/block/nvme.h  |   3 +-
>>  include/block/nvme.h |   4 +-
>>  3 files changed, 91 insertions(+), 43 deletions(-)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index f0b45704be..353cf20e0a 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -22,12 +22,12 @@
>>   *  [pmrdev=,] \
>>   *  num_queues=
>>   *
>> - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>> - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
>> + * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
>> + * to be resident in BAR4 at certain offset - this is because BAR4 is also
>> + * used for storing MSI-X table that is available at offset 0 in BAR4.
>>   *
>> - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
>> - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
>> - * both provided.
>> + * pmrdev is assumed to be resident in BAR2. When configured it consumes whole
>> + * BAR2 exclusively.
> 
> Actually it uses both BAR2 and BAR3, since it's a 64-bit BAR.
Correct. That's what I implied here without spelling it out explicitly. I can
extend the comment to add that information.
> 
>> @@ -1342,6 +1346,71 @@ static const MemoryRegionOps nvme_cmb_ops = {
>>  },
>>  };
>>  
>> +#define NVME_MSIX_BIR (4)
>> +static void nvme_bar4_init(PCIDevice *pci_dev)
>> +{
>> +NvmeCtrl *n = NVME(pci_dev);
>> +int status;
>> +uint64_t bar_size = 4096;
>> +uint32_t nvme_pba_offset = bar_size / 2;
>> +uint32_t nvme_pba_size = QEMU_ALIGN_UP(n->num_queues, 64) / 8;
>> +uint32_t cmb_size_units;
>> +
>> +if (n->num_queues * PCI_MSIX_ENTRY_SIZE > nvme_pba_offset) {
>> +nvme_pba_offset = n->num_queues * PCI_MSIX_ENTRY_SIZE;
>> +}
>> +
>> +if (nvme_pba_offset + nvme_pba_size > 4096) {
>> +bar_size = nvme_pba_offset + nvme_pba_size;
>> +}
>> +
> 
> This is migration compatibility stuff that is not needed because the
> nvme device is unmigratable anyway.
I don't understand that comment. Could you please explain more?
 




Re: [PATCH v1 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-09 Thread Klaus Jensen
On Jun  8 12:44, Andrzej Jakowski wrote:
> On 6/8/20 1:08 AM, Klaus Jensen wrote:
> > On Jun  5 11:10, Andrzej Jakowski wrote:
> >> So far it was not possible to have CMB and PMR emulated on the same
> >> device, because BAR2 was used exclusively by either PMR or CMB. This
> >> patch places the CMB at an offset in BAR4 so that it does not conflict
> >> with the MSI-X vectors.
> >>
> > 
> > Hi Andrzej,
> > 
> > Thanks for doing this, it's a nice addition!
> > 
> > Though, I would prefer that the table and PBA were located in BAR0,
> > keeping BAR4 for exclusive CMB use. I'm no expert on this, but is it OK
> > to have the table and PBA in prefetchable memory? Having it "together"
> > with the other controller-level configuration memory just feels more
> > natural to me, but I'm not gonna put my foot down.
> Hi Klaus,
> 
> Thx for your feedback!
> I don't think it matters whether the MSI-X table is in prefetchable or
> non-prefetchable memory.
> My understanding is that the spec allows the MSI-X table and PBA to be in
> any BAR and at any offset. I understand your preference and at the same
> time think that, since it is not in violation of the spec, why don't we
> leave it as-is?
> Does anybody know what the typical approach is for real devices?

On the SSD in my system, lspci -vv shows:

Capabilities: [b0] MSI-X: Enable+ Count=33 Masked-
Vector table: BAR=0 offset=2000
PBA: BAR=4 offset=

So, the table is in BAR0 but the PBA is in BAR4. Oh well.

> >> @@ -1342,6 +1346,71 @@ static const MemoryRegionOps nvme_cmb_ops = {
> >>  },
> >>  };
> >>  
> >> +#define NVME_MSIX_BIR (4)
> >> +static void nvme_bar4_init(PCIDevice *pci_dev)
> >> +{
> >> +NvmeCtrl *n = NVME(pci_dev);
> >> +int status;
> >> +uint64_t bar_size = 4096;
> >> +uint32_t nvme_pba_offset = bar_size / 2;
> >> +uint32_t nvme_pba_size = QEMU_ALIGN_UP(n->num_queues, 64) / 8;
> >> +uint32_t cmb_size_units;
> >> +
> >> +if (n->num_queues * PCI_MSIX_ENTRY_SIZE > nvme_pba_offset) {
> >> +nvme_pba_offset = n->num_queues * PCI_MSIX_ENTRY_SIZE;
> >> +}
> >> +
> >> +if (nvme_pba_offset + nvme_pba_size > 4096) {
> >> +bar_size = nvme_pba_offset + nvme_pba_size;
> >> +}
> >> +
> > 
> > This is migration compatibility stuff that is not needed because the
> > nvme device is unmigratable anyway.
> I don't understand that comment. Could you please explain more?
>  
> 

It looks like you cribbed this code from msix_init_exclusive_bar(). All
that fuss about the PBA starting in the upper half (nvme_pba_offset =
bar_size / 2) when nentries is less than or equal to 128 is there for
migration compatibility (migrating a VM from an old version of QEMU to a
newer one). It was added when the location of the MSI-X table and PBA
became dynamic (it used to be static). But, since the nvme device is
marked as unmigratable (VMStateDescription.unmigratable = 1), I believe
these special cases are irrelevant.
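
For reference, roughly what that marking looks like (a sketch along the
lines of what hw/block/nvme.c already contains, not something new to add):

#include "qemu/osdep.h"
#include "migration/vmstate.h"

/* A device whose VMStateDescription sets .unmigratable = 1 blocks
 * migration altogether, so layout decisions kept purely for cross-version
 * migration compatibility buy it nothing. */
static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};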