[PATCH v6] nvme: allow cmb and pmr emulation on same device

2020-07-29 Thread Andrzej Jakowski
Resending the series recently posted on the mailing list, related to the nvme
device extension, with a couple of fixes after review.

This patch series does the following:
 - Fixes a problem where the CMBS bit was not set in the controller
   capabilities register, so support for CMB was not correctly advertised
   to the guest. This is a resend of a patch that was submitted and
   reviewed by Klaus [1]
 - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows
   CMB and PMR to be emulated on the same device, an extension
   suggested by Keith [2]; see the BAR4 layout sketch below
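For orientation, the BAR4 layout introduced by this series can be sketched as
follows. This is an illustrative snippet only, derived from the
nvme_bar4_init() code quoted later in this digest; that quote truncates before
the CMB offset computation, so the 2 MiB alignment below is inferred from the
patch comments rather than taken from the code:

#include <stdint.h>

#define PCI_MSIX_ENTRY_SIZE 16                  /* per the PCI spec */
#define MIB (1024ULL * 1024ULL)
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

/* BAR4: MSI-X vector table at offset 0, PBA right after it, CMB at the
 * next 2 MiB boundary. Mirrors the layout of nvme_bar4_init(). */
static uint64_t nvme_bar4_size(uint32_t max_ioqpairs, uint32_t cmb_size_mb,
                               uint64_t *cmb_offset)
{
    uint32_t vectors = max_ioqpairs + 1;        /* io queue pairs + admin */
    uint64_t pba_off = (uint64_t)PCI_MSIX_ENTRY_SIZE * vectors;
    /* the PBA is a bitmap, one bit per vector, padded to 64 bits */
    uint64_t msix_end = pba_off + ALIGN_UP(vectors, 64) / 8;

    *cmb_offset = ALIGN_UP(msix_end, 2 * MIB);
    return *cmb_offset + (uint64_t)cmb_size_mb * MIB;
}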

v6:
 - instead of using the memory_region_to_absolute_addr() function, a local
   helper (nvme_cmb_to_absolute_addr()) has been defined to calculate the
   absolute address of the CMB in a simplified way. A number of code style
   changes have also been made (function rename, use of a literal instead of
   a macro definition, etc.)

v5:
 - fixed a problem introduced in v4 where the CMB buffer was represented as
   a subregion of the BAR4 memory region. In that case the CMB address was
   used incorrectly, as it was relative to BAR4 and not absolute. Appropriate
   changes were added in v5 to calculate the CMB address properly ([6])

v4:
 - modified BAR4 initialization, so that it now consists of CMB, MSI-X and
   PBA memory regions overlapping on top of it. This reduces patch
   complexity significantly (Klaus [5])

v3:
 - code style fixes including: removal of a spurious line break, moving a
   define into the define section, and code alignment (Klaus [4])
 - removed unintended code reintroduction (Klaus [4])

v2:
 - rebase on Kevin's latest block branch (Klaus [3])
 - improved comments section (Klaus [3])
 - simplified calculation of BAR4 size (Klaus [3])

v1:
 - initial push of the patch

[1]: 
https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/
[2]: 
https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/
[3]: 
https://lore.kernel.org/qemu-devel/20200605181043.28782-1-andrzej.jakow...@linux.intel.com/
[4]: 
https://lore.kernel.org/qemu-devel/20200618092524.posxi5mysb3jjtpn@apples.localdomain/
[5]: 
https://lore.kernel.org/qemu-devel/20200626055033.6vxqvi4s5pll7som@apples.localdomain/
[6]: 
https://lore.kernel.org/qemu-devel/9143a543-d32d-f3e7-c37b-b3df7f853...@linux.intel.com/





[PATCH v6 1/2] nvme: indicate CMB support through controller capabilities register

2020-07-29 Thread Andrzej Jakowski
This patch sets the CMBS bit in the controller capabilities register when the
user configures the NVMe device with CMB support, so capabilities are
correctly reported to the guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c  |  1 +
 include/block/nvme.h | 10 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 841c18920c..43866b744f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2198,6 +2198,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_TO(n->bar.cap, 0xf);
 NVME_CAP_SET_CSS(n->bar.cap, 1);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
 n->bar.vs = NVME_SPEC_VER;
 n->bar.intmc = n->bar.intms = 0;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 370df7fc05..d641ca6649 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -36,6 +36,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -49,6 +50,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -78,9 +80,11 @@ enum NvmeCapMask {
 #define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
                                                << CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
-                                               << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
-                                               << CAP_PMR_SHIFT)
+                                               << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
+                                               << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
+                                               << CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.25.4




[PATCH v6 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-29 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively for either PMR or CMB. This
patch places the CMB at a BAR4 offset so it does not conflict with the
MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 124 +--
 hw/block/nvme.h  |   1 +
 include/block/nvme.h |   4 +-
 3 files changed, 89 insertions(+), 40 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 43866b744f..292bca445f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,13 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
+ * to be resident in BAR4 at offset that is 2MiB aligned. When CMB is emulated
+ * on Linux guest it is recommended to make cmb_size_mb multiple of 2. Both
+ * size and alignment restrictions are imposed by Linux guest.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2/BAR3. When configured it consumes
+ * whole BAR2/BAR3 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -57,7 +58,6 @@
 #define NVME_MAX_IOQPAIRS 0xffff
 #define NVME_DB_SIZE  4
 #define NVME_SPEC_VER 0x00010300
-#define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 2
 #define NVME_TEMPERATURE 0x143
 #define NVME_TEMPERATURE_WARNING 0x157
@@ -109,18 +109,25 @@ static uint16_t nvme_sqid(NvmeRequest *req)
 return le16_to_cpu(req->sq->sqid);
 }
 
+static inline hwaddr nvme_cmb_to_absolute_addr(NvmeCtrl *n)
+{
+return n->bar4.addr + n->ctrl_mem.addr;
+}
+
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
-hwaddr low = n->ctrl_mem.addr;
-hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+hwaddr low = nvme_cmb_to_absolute_addr(n);
+hwaddr hi  = low + int128_get64(n->ctrl_mem.size);
 
 return addr >= low && addr < hi;
 }
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
+hwaddr cmb_addr = nvme_cmb_to_absolute_addr(n);
+
 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
 return;
 }
 
@@ -207,17 +214,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
+hwaddr cmb_addr = nvme_cmb_to_absolute_addr(n);
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
 trace_pci_nvme_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+} else if (n->bar.cmbsz && prp1 >= cmb_addr &&
+   prp1 < cmb_addr + int128_get64(n->ctrl_mem.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
 qemu_sglist_add(qsg, prp1, trans_len);
@@ -262,7 +270,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg){
 qemu_sglist_add(qsg, prp_ent, trans_len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr], trans_len);
 }
 len -= trans_len;
 i++;
@@ -275,7 +283,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg) {
 qemu_sglist_add(qsg, prp2, len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr], trans_len);
 }
 }
 }
@@ -1980,7 +1988,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 return;
 }
 
-if (!n->params.cmb_size_mb && n->pmrdev) {
+if (n->pmrdev) {

Re: [PATCH 02/16] hw/block/nvme: add mapping helpers

2020-07-29 Thread Andrzej Jakowski
On 7/29/20 2:24 PM, Klaus Jensen wrote:
> On Jul 29 13:40, Andrzej Jakowski wrote:
>> On 7/20/20 4:37 AM, Klaus Jensen wrote:
>>> From: Klaus Jensen 
>>>
>>> Add nvme_map_addr, nvme_map_addr_cmb and nvme_addr_to_cmb helpers and
>>> use them in nvme_map_prp.
>>>
>>> This fixes a bug where in the case of a CMB transfer, the device would
>>> map to the buffer with a wrong length.
>>>
>>> Fixes: b2b2b67a00574 ("nvme: Add support for Read Data and Write Data in 
>>> CMBs.")
>>> Signed-off-by: Klaus Jensen 
>>> ---
>>>  hw/block/nvme.c   | 109 +++---
>>>  hw/block/trace-events |   2 +
>>>  2 files changed, 94 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>>> index 4d7b730a62b6..9b1a080cdc70 100644
>>> --- a/hw/block/nvme.c
>>> +++ b/hw/block/nvme.c
>>> @@ -270,20 +338,27 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
>>> QEMUIOVector *iov, uint64_t prp1,
>>>  } else {
>>>  if (unlikely(prp2 & (n->page_size - 1))) {
>>>  trace_pci_nvme_err_invalid_prp2_align(prp2);
>>> +status = NVME_INVALID_FIELD | NVME_DNR;
>>>  goto unmap;
>>>  }
>>> -if (qsg->nsg) {
>>> -qemu_sglist_add(qsg, prp2, len);
>>> -} else {
>>> -qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
>>> +status = nvme_map_addr(n, qsg, iov, prp2, len);
>>> +if (status) {
>>> +goto unmap;
>>>  }
>>>  }
>>>  }
>>>  return NVME_SUCCESS;
>>>  
>>> - unmap:
>>> -qemu_sglist_destroy(qsg);
>>> -return NVME_INVALID_FIELD | NVME_DNR;
>>> +unmap:
>>> +if (iov && iov->iov) {
>>> +qemu_iovec_destroy(iov);
>>> +}
>>> +
>>> +if (qsg && qsg->sg) {
>>> +qemu_sglist_destroy(qsg);
>>> +}
>>> +
>>> +return status;
>>
>> I think it would make sense to move whole unmap block to a separate function.
>> That function could be called from here and after completing IO and would 
>> contain
>> unified deinitialization block - so no code repetitions would be necessary.
>> Other than that it looks good to me. Thx!
>>
>> Reviewed-by: Andrzej Jakowski 
>>
>  
> Hi Andrzej,
> 
> Thanks for the review :)
> 
> Yes, this is done in a later patch ("hw/block/nvme: consolidate qsg/iov
> clearing"), but kept here to reduce churn.
> 
Yep, I noticed that after sending the email :)
Do you plan to submit a second version of these patches incorporating some
of the feedback?




Re: [PATCH 02/16] hw/block/nvme: add mapping helpers

2020-07-29 Thread Andrzej Jakowski
>  if (i == n->max_prp_ents - 1 && len > n->page_size) {
>  if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
>  trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
> +status = NVME_INVALID_FIELD | NVME_DNR;
>  goto unmap;
>  }
>  
> @@ -255,14 +323,14 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  
>  if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
>  trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
> +status = NVME_INVALID_FIELD | NVME_DNR;
>  goto unmap;
>  }
>  
>  trans_len = MIN(len, n->page_size);
> -if (qsg->nsg){
> -qemu_sglist_add(qsg, prp_ent, trans_len);
> -} else {
> -qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
> +status = nvme_map_addr(n, qsg, iov, prp_ent, trans_len);
> +if (status) {
> +goto unmap;
>  }
>  len -= trans_len;
>  i++;
> @@ -270,20 +338,27 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  } else {
>  if (unlikely(prp2 & (n->page_size - 1))) {
>  trace_pci_nvme_err_invalid_prp2_align(prp2);
> +status = NVME_INVALID_FIELD | NVME_DNR;
>  goto unmap;
>  }
> -if (qsg->nsg) {
> -qemu_sglist_add(qsg, prp2, len);
> -} else {
> -qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
> +    status = nvme_map_addr(n, qsg, iov, prp2, len);
> +if (status) {
> +goto unmap;
>  }
>  }
>  }
>  return NVME_SUCCESS;
>  
> - unmap:
> -qemu_sglist_destroy(qsg);
> -return NVME_INVALID_FIELD | NVME_DNR;
> +unmap:
> +if (iov && iov->iov) {
> +qemu_iovec_destroy(iov);
> +}
> +
> +if (qsg && qsg->sg) {
> +qemu_sglist_destroy(qsg);
> +}
> +
> +return status;

I think it would make sense to move the whole unmap block to a separate
function. That function could be called from here and after completing IO,
and would contain a unified deinitialization block, so no code repetition
would be necessary.
Other than that it looks good to me. Thx!

Reviewed-by: Andrzej Jakowski 

>  }
>  
>  static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 7b7303cab1dd..f3b2d004e078 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -33,6 +33,8 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ 
> vector %u"
>  pci_nvme_irq_pin(void) "pulsing IRQ pin"
>  pci_nvme_irq_masked(void) "IRQ is masked"
>  pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" 
> prp2=0x%"PRIx64""
> +pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len 
> %"PRIu64""
> +pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len 
> %"PRIu64""
>  pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) 
> "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
>  pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode) "cid 
> %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8""
>  pci_nvme_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, 
> uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
> 




Re: [PATCH v5 3/3] nvme: allow cmb and pmr to be enabled on same device

2020-07-27 Thread Andrzej Jakowski
On 7/27/20 2:06 AM, Klaus Jensen wrote:
> On Jul 23 09:03, Andrzej Jakowski wrote:
>> So far it was not possible to have CMB and PMR emulated on the same
>> device, because BAR2 was used exclusively either of PMR or CMB. This
>> patch places CMB at BAR4 offset so it not conflicts with MSI-X vectors.
>>
>> Signed-off-by: Andrzej Jakowski 
>> ---
>>  hw/block/nvme.c  | 120 +--
>>  hw/block/nvme.h  |   1 +
>>  include/block/nvme.h |   4 +-
>>  3 files changed, 85 insertions(+), 40 deletions(-)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index 43866b744f..d55a71a346 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -22,12 +22,13 @@
>>   *  [pmrdev=,] \
>>   *  max_ioqpairs=
>>   *
>> - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>> - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
>> + * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is 
>> assumed
>> + * to be resident in BAR4 at offset that is 2MiB aligned. When CMB is 
>> emulated
>> + * on Linux guest it is recommended to make cmb_size_mb multiple of 2. Both
>> + * size and alignment restrictions are imposed by Linux guest.
>>   *
>> - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
>> - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
>> - * both provided.
>> + * pmrdev is assumed to be resident in BAR2/BAR3. When configured it 
>> consumes
>> + * whole BAR2/BAR3 exclusively.
>>   * Enabling pmr emulation can be achieved by pointing to 
>> memory-backend-file.
>>   * For example:
>>   * -object memory-backend-file,id=,share=on,mem-path=, \
>> @@ -57,8 +58,8 @@
>>  #define NVME_MAX_IOQPAIRS 0xffff
>>  #define NVME_DB_SIZE  4
>>  #define NVME_SPEC_VER 0x00010300
>> -#define NVME_CMB_BIR 2
>>  #define NVME_PMR_BIR 2
>> +#define NVME_MSIX_BIR 4
> 
> I think that either we keep the CMB constant (but updated with '4' of
> course) or we just get rid of both NVME_{CMB,MSIX}_BIR and use a literal
> '4' in nvme_bar4_init. It is very clear that is only BAR 4 we use.
> 
>>  #define NVME_TEMPERATURE 0x143
>>  #define NVME_TEMPERATURE_WARNING 0x157
>>  #define NVME_TEMPERATURE_CRITICAL 0x175
>> @@ -111,16 +112,18 @@ static uint16_t nvme_sqid(NvmeRequest *req)
>>  
>>  static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>>  {
>> -hwaddr low = n->ctrl_mem.addr;
>> -hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
>> +hwaddr low = memory_region_to_absolute_addr(&n->ctrl_mem, 0);
>> +hwaddr hi  = low + int128_get64(n->ctrl_mem.size);
> 
> Are we really really sure we want to use a global helper like this? What
> are the chances/risk that we ever introduce another overlay? I'd say
> zero. We are not even using a *real* overlay, it's just an io memory
> region (ctrl_mem) on top of a pure container (bar4). Can't we live with
> an internal helper doing `n->bar4.addr + n->ctrl_mem.addr` and be done
> with it? It also removes a data structure walk on each invocation of
> nvme_addr_is_cmb (which is done for **all** addresses in PRP lists and
> SGLs).

Thx!
My understanding of the memory_region_to_absolute_addr() function ([1]) is
that it walks the memory hierarchy up to the root while accumulating the
absolute address. It is very similar to the n->bar4.addr + n->ctrl_mem.addr
approach, with the following differences:
 * n->bar4.addr + n->ctrl_mem.addr assumes a single-level hierarchy. Updates
   would be needed when another memory level is added.
 * memory_region_to_absolute_addr() works for a hierarchy of any depth, at
   the tradeoff of walking the data structure on each call.

I don't have data on the likelihood of adding a new memory level, nor on how
much more memory_region_to_absolute_addr() costs versus
n->bar4.addr + n->ctrl_mem.addr.

Please let me know which approach is preferred.

[1]
hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset)
{
MemoryRegion *root;
hwaddr abs_addr = offset;

abs_addr += mr->addr;
for (root = mr; root->container; ) {
root = root->container;
abs_addr += root->addr;
}

return abs_addr;
}
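
For comparison, the local helper that v6 of this series ultimately adopted
(quoted in full in the v6 patch earlier in this digest) reduces to a single
addition, at the cost of assuming the CMB is a direct child of the BAR4
container:

static inline hwaddr nvme_cmb_to_absolute_addr(NvmeCtrl *n)
{
    /* bar4 is the top-level container, so its addr is absolute;
     * ctrl_mem is a direct subregion, so its addr is a BAR4 offset. */
    return n->bar4.addr + n->ctrl_mem.addr;
}

The tradeoff is exactly the one described above: one addition instead of a
container walk, but the helper must be revisited if another container level
is ever introduced.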

> 
>>  
>>  return addr >= low && addr < hi;
>>  }
>>  
>>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>>  {
>> +hwaddr cmb_addr = memory_region_to_absolute_addr(&n->ctrl_mem, 0);
>> +
>>  if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
>> -memcpy(buf, (void *)&n->

[PATCH v5 2/3] nvme: indicate CMB support through controller capabilities register

2020-07-23 Thread Andrzej Jakowski
This patch sets the CMBS bit in the controller capabilities register when the
user configures the NVMe device with CMB support, so capabilities are
correctly reported to the guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c  |  1 +
 include/block/nvme.h | 10 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 841c18920c..43866b744f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2198,6 +2198,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_TO(n->bar.cap, 0xf);
 NVME_CAP_SET_CSS(n->bar.cap, 1);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
 n->bar.vs = NVME_SPEC_VER;
 n->bar.intmc = n->bar.intms = 0;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 370df7fc05..d641ca6649 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -36,6 +36,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -49,6 +50,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -78,9 +80,11 @@ enum NvmeCapMask {
 #define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
                                                << CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
-                                               << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
-                                               << CAP_PMR_SHIFT)
+                                               << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
+                                               << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
+                                               << CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.21.1




[PATCH v5 1/3] memory: export memory_region_to_absolute_addr() function

2020-07-23 Thread Andrzej Jakowski
This change exports the memory_region_to_absolute_addr() function so it can
be used by drivers that need to calculate the absolute address of memory
subregions when a memory hierarchy is used.

Signed-off-by: Andrzej Jakowski 
---
 include/exec/memory.h | 9 +
 softmmu/memory.c  | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 307e527835..6e5bba602e 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2017,6 +2017,15 @@ MemTxResult memory_region_dispatch_write(MemoryRegion 
*mr,
  MemOp op,
  MemTxAttrs attrs);
 
+/**
+ * memory_region_to_absolute_addr: walk through memory hierarchy to retrieve
+ * absolute address for given MemoryRegion.
+ *
+ * @mr: #MemoryRegion to scan through
+ * @offset: starting offset within mr
+ */
+hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset);
+
 /**
  * address_space_init: initializes an address space
  *
diff --git a/softmmu/memory.c b/softmmu/memory.c
index 9200b20130..deff3739ff 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -399,7 +399,7 @@ static inline uint64_t 
memory_region_shift_write_access(uint64_t *value,
 return tmp;
 }
 
-static hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset)
+hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset)
 {
 MemoryRegion *root;
 hwaddr abs_addr = offset;
-- 
2.21.1




[PATCH v5] nvme: allow cmb and pmr emulation on same device

2020-07-23 Thread Andrzej Jakowski
Resending the series recently posted on the mailing list, related to the nvme
device extension, with a couple of fixes after review.

This patch series does the following:
 - Exports the memory_region_to_absolute_addr() function so it is available
   for use by drivers. This function is needed by the 3rd patch in this
   series
 - Fixes a problem where the CMBS bit was not set in the controller
   capabilities register, so support for CMB was not correctly advertised
   to the guest. This is a resend of a patch that was submitted and
   reviewed by Klaus [1]
 - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows
   CMB and PMR to be emulated on the same device, an extension
   suggested by Keith [2]

v5:
 - fixed a problem introduced in v4 where the CMB buffer was represented as
   a subregion of the BAR4 memory region. In that case the CMB address was
   used incorrectly, as it was relative to BAR4 and not absolute. Appropriate
   changes were added in v5 to calculate the CMB address properly ([6])

v4:
 - modified BAR4 initialization, so that it now consists of CMB, MSI-X and
   PBA memory regions overlapping on top of it. This reduces patch
   complexity significantly (Klaus [5])

v3:
 - code style fixes including: removal of a spurious line break, moving a
   define into the define section, and code alignment (Klaus [4])
 - removed unintended code reintroduction (Klaus [4])

v2:
 - rebase on Kevin's latest block branch (Klaus [3])
 - improved comments section (Klaus [3])
 - simplified calculation of BAR4 size (Klaus [3])

v1:
 - initial push of the patch

[1]: 
https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/
[2]: 
https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/
[3]: 
https://lore.kernel.org/qemu-devel/20200605181043.28782-1-andrzej.jakow...@linux.intel.com/
[4]: 
https://lore.kernel.org/qemu-devel/20200618092524.posxi5mysb3jjtpn@apples.localdomain/
[5]: 
https://lore.kernel.org/qemu-devel/20200626055033.6vxqvi4s5pll7som@apples.localdomain/
[6]: 
https://lore.kernel.org/qemu-devel/9143a543-d32d-f3e7-c37b-b3df7f853...@linux.intel.com/





[PATCH v5 3/3] nvme: allow cmb and pmr to be enabled on same device

2020-07-23 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively for either PMR or CMB. This
patch places the CMB at a BAR4 offset so it does not conflict with the
MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 120 +--
 hw/block/nvme.h  |   1 +
 include/block/nvme.h |   4 +-
 3 files changed, 85 insertions(+), 40 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 43866b744f..d55a71a346 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,13 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed
+ * to be resident in BAR4 at offset that is 2MiB aligned. When CMB is emulated
+ * on Linux guest it is recommended to make cmb_size_mb multiple of 2. Both
+ * size and alignment restrictions are imposed by Linux guest.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2/BAR3. When configured it consumes
+ * whole BAR2/BAR3 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -57,8 +58,8 @@
 #define NVME_MAX_IOQPAIRS 0xffff
 #define NVME_DB_SIZE  4
 #define NVME_SPEC_VER 0x00010300
-#define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 2
+#define NVME_MSIX_BIR 4
 #define NVME_TEMPERATURE 0x143
 #define NVME_TEMPERATURE_WARNING 0x157
 #define NVME_TEMPERATURE_CRITICAL 0x175
@@ -111,16 +112,18 @@ static uint16_t nvme_sqid(NvmeRequest *req)
 
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
-hwaddr low = n->ctrl_mem.addr;
-hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+hwaddr low = memory_region_to_absolute_addr(&n->ctrl_mem, 0);
+hwaddr hi  = low + int128_get64(n->ctrl_mem.size);
 
 return addr >= low && addr < hi;
 }
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
+hwaddr cmb_addr = memory_region_to_absolute_addr(&n->ctrl_mem, 0);
+
 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
 return;
 }
 
@@ -207,17 +210,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
+hwaddr cmb_addr = memory_region_to_absolute_addr(&n->ctrl_mem, 0);
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
 trace_pci_nvme_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+} else if (n->bar.cmbsz && prp1 >= cmb_addr &&
+   prp1 < cmb_addr + int128_get64(n->ctrl_mem.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
 qemu_sglist_add(qsg, prp1, trans_len);
@@ -262,7 +266,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg){
 qemu_sglist_add(qsg, prp_ent, trans_len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr], trans_len);
 }
 len -= trans_len;
 i++;
@@ -275,7 +279,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg) {
 qemu_sglist_add(qsg, prp2, len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr], trans_len);
 }
 }
 }
@@ -1980,7 +1984,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 return;
 }
 
-if (!n->params.cmb_size_mb && n->pmrdev) {
+if (n->pmrdev) {
 if (host_memory_backend_is_m

Re: [PATCH v4 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-22 Thread Andrzej Jakowski
On 7/22/20 10:21 AM, Klaus Jensen wrote:
> On Jul 22 10:00, Andrzej Jakowski wrote:
>> On 7/22/20 12:43 AM, Klaus Jensen wrote:
>>> @keith, please see below - can you comment on the Linux kernel 2 MB
>>> boundary requirement for the CMB? Or should we hail Stephen (or Logan
>>> maybe) since this seems to be related to p2pdma?
>>>
>>> On Jul 21 14:54, Andrzej Jakowski wrote:
>>>> On 7/15/20 1:06 AM, Klaus Jensen wrote:
>>>>> Hi Andrzej,
>>>>>
>>>>> I've not been ignoring this, but sorry for not following up earlier.
>>>>>
>>>>> I'm hesitant to merge anything that very obviously breaks an OS that we
>>>>> know is used a lot with this device. Also because the issue has
>>>>> not been analyzed well enough to actually know if this is a QEMU or
>>>>> kernel issue.
>>>>
>>>> Hi Klaus,
>>>>
>>>> Thx for your response! I understand your hesitance on merging stuff that
>>>> obviously breaks guest OS. 
>>>>
>>>>>
>>>>> Now, as far as I can test, having the MSI-X vector table and PBA in BAR
>>>>> 0, PMR in BAR 2 and CMB in BAR 4 seems to make everyone happy
>>>>> (regardless of IOMMU on/off).
>>>>>
>>>>> Later, when the issue is better understood, we can add options to set
>>>>> offsets, BIRs etc.
>>>>>
>>>>> The patch below replaces your "[PATCH v4 2/2] nvme: allow cmb and pmr to
>>>>> be enabled" (but still requires "[PATCH v4 1/2] ...") and applies to
>>>>> git://git.infradead.org/qemu-nvme.git nvme-next branch.
>>>>>
>>>>> Can you reproduce the issues with that patch? I can't on a stock Arch
>>>>> Linux 5.7.5-arch1-1 kernel.
>>>>
>>>> While I'm happy that approach with MSIX and PBA in BAR0 works fine, I
>>>> feel that investigation part why it works while mine doesn't is
>>>> missing. It looks to me that both patches are basically following same 
>>>> approach: create memory subregion and overlay on top of other memory
>>>> region. Why one works and the other doesn't then?
>>>>
>>>> Having in mind that, I have recently focused on understanding problem.
>>>> I observed that when guest assigns address to BAR4, addr field in
>>>> nvme-bar4 memory region gets populated, but it doesn't get automatically
>>>> populated in ctrl_mem (cmb) memory subregion, so later when 
>>>> nvme_addr_is_cmb() 
>>>> is called address check works incorrectly and as a consequence vmm does 
>>>> dma 
>>>> read instead of memcpy.
>>>> I created a patch that sets correct address on ctrl_mem subregion and 
>>>> guest 
>>>> OS boots up correctly.
>>>>
>>>> When I looked into pci and memory region code I noticed that indeed address
>>>> is only assigned to top level memory region but not to contained 
>>>> subregions.
>>>> I think that because in your approach cmb grabs whole bar exclusively it 
>>>> works
>>>> fine.
>>>>
>>>> Here is my question (perhaps pci folks can help answer :)): if we consider 
>>>> memory region overlapping for pci devices as valid use case should pci 
>>>> code on configuration cycles walk through all contained subregion and
>>>> update addr field accordingly?
>>>>
>>>> Thx!
>>>>
>>>
>>> Hi Andrzej,
>>>
>>> Thanks for looking into this. I think your analysis helped me nail this.
>>> The problem is that we added the use of a subregion and have some
>>> assumptions that no longer hold.
>>>
>>> nvme_addr_is_cmb() assumes that n->ctrl_mem.addr is an absolute address.
>>> But when the memory region is a subregion, addr holds an offset into the
>>> parent container instead. Thus, changing all occurrences of
>>> n->ctrl_mem.addr to (n->bar0.addr + n->ctrl_mem.addr) fixes the issue
>>> (this is required in nvme_addr_is_cmb and nvme_map_prp). I patched that
>>> in your original patch[1]. The reason my version worked is because there
>>> was no subregion involved for the CMB, so the existing address
>>> validation calculations were still correct.
>>
>> I'm a little bit concerned with this approach:
>> (n->bar0.addr + n->ctrl_mem.addr) and hoping to have some debate. Let me 
>>

Re: [PATCH v4 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-22 Thread Andrzej Jakowski
On 7/22/20 12:43 AM, Klaus Jensen wrote:
> @keith, please see below - can you comment on the Linux kernel 2 MB
> boundary requirement for the CMB? Or should we hail Stephen (or Logan
> maybe) since this seems to be related to p2pdma?
> 
> On Jul 21 14:54, Andrzej Jakowski wrote:
>> On 7/15/20 1:06 AM, Klaus Jensen wrote:
>>> Hi Andrzej,
>>>
>>> I've not been ignoring this, but sorry for not following up earlier.
>>>
>>> I'm hesitant to merge anything that very obviously breaks an OS that we
>>> know is used a lot with this device. Also because the issue has
>>> not been analyzed well enough to actually know if this is a QEMU or
>>> kernel issue.
>>
>> Hi Klaus,
>>
>> Thx for your response! I understand your hesitance on merging stuff that
>> obviously breaks guest OS. 
>>
>>>
>>> Now, as far as I can test, having the MSI-X vector table and PBA in BAR
>>> 0, PMR in BAR 2 and CMB in BAR 4 seems to make everyone happy
>>> (regardless of IOMMU on/off).
>>>
>>> Later, when the issue is better understood, we can add options to set
>>> offsets, BIRs etc.
>>>
>>> The patch below replaces your "[PATCH v4 2/2] nvme: allow cmb and pmr to
>>> be enabled" (but still requires "[PATCH v4 1/2] ...") and applies to
>>> git://git.infradead.org/qemu-nvme.git nvme-next branch.
>>>
>>> Can you reproduce the issues with that patch? I can't on a stock Arch
>>> Linux 5.7.5-arch1-1 kernel.
>>
>> While I'm happy that approach with MSIX and PBA in BAR0 works fine, I
>> feel that investigation part why it works while mine doesn't is
>> missing. It looks to me that both patches are basically following same 
>> approach: create memory subregion and overlay on top of other memory
>> region. Why one works and the other doesn't then?
>>
>> Having in mind that, I have recently focused on understanding problem.
>> I observed that when guest assigns address to BAR4, addr field in
>> nvme-bar4 memory region gets populated, but it doesn't get automatically
>> populated in ctrl_mem (cmb) memory subregion, so later when 
>> nvme_addr_is_cmb() 
>> is called address check works incorrectly and as a consequence vmm does dma 
>> read instead of memcpy.
>> I created a patch that sets correct address on ctrl_mem subregion and guest 
>> OS boots up correctly.
>>
>> When I looked into pci and memory region code I noticed that indeed address
>> is only assigned to top level memory region but not to contained subregions.
>> I think that because in your approach cmb grabs whole bar exclusively it 
>> works
>> fine.
>>
>> Here is my question (perhaps pci folks can help answer :)): if we consider 
>> memory region overlapping for pci devices as valid use case should pci 
>> code on configuration cycles walk through all contained subregion and
>> update addr field accordingly?
>>
>> Thx!
>>
> 
> Hi Andrzej,
> 
> Thanks for looking into this. I think your analysis helped me nail this.
> The problem is that we added the use of a subregion and have some
> assumptions that no longer hold.
> 
> nvme_addr_is_cmb() assumes that n->ctrl_mem.addr is an absolute address.
> But when the memory region is a subregion, addr holds an offset into the
> parent container instead. Thus, changing all occurrences of
> n->ctrl_mem.addr to (n->bar0.addr + n->ctrl_mem.addr) fixes the issue
> (this is required in nvme_addr_is_cmb and nvme_map_prp). I patched that
> in your original patch[1]. The reason my version worked is because there
> was no subregion involved for the CMB, so the existing address
> validation calculations were still correct.

I'm a little bit concerned with this approach
(n->bar0.addr + n->ctrl_mem.addr) and hoping to have some debate. Let me
describe my understanding of the problem.
It looks to me that the addr field sometimes contains an *absolute* address
(when no hierarchy is used) and other times a *relative* address (when a
hierarchy is created). From my perspective the use of this field is
inconsistent and thus error-prone.
Because of that I think that doing n->bar0.addr + n->ctrl_mem.addr doesn't
solve the root problem and is still prone to the same issue if in the future
we build an even more complicated hierarchy.
I think we could solve it by introducing a helper function like

hwaddr memory_region_get_abs_addr(MemoryRegion *mr)

to retrieve the absolute address, and in the documentation indicate that the
addr field can be relative or absolute and that it is recommended to use the
above function to retrieve it.
What do you think?
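
A minimal sketch of the helper proposed here, assuming it simply generalizes
the container walk from memory_region_to_absolute_addr() quoted earlier in
this digest (hypothetical; no such public function exists at the time of this
thread):

/* Hypothetical: resolve a region's guest-physical base regardless of
 * whether mr->addr is absolute (top level) or relative (subregion). */
hwaddr memory_region_get_abs_addr(MemoryRegion *mr)
{
    MemoryRegion *root;
    hwaddr abs_addr = mr->addr;

    for (root = mr; root->container; ) {
        root = root->container;
        abs_addr += root->addr;     /* add each ancestor's offset */
    }

    return abs_addr;
}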

Re: [PATCH v4 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-21 Thread Andrzej Jakowski
On 7/15/20 1:06 AM, Klaus Jensen wrote:
> Hi Andrzej,
> 
> I've not been ignoring this, but sorry for not following up earlier.
> 
> I'm hesitant to merge anything that very obviously breaks an OS that we
> know is used a lot with this device. Also because the issue has
> not been analyzed well enough to actually know if this is a QEMU or
> kernel issue.

Hi Klaus,

Thx for your response! I understand your hesitance to merge stuff that
obviously breaks the guest OS.

> 
> Now, as far as I can test, having the MSI-X vector table and PBA in BAR
> 0, PMR in BAR 2 and CMB in BAR 4 seems to make everyone happy
> (regardless of IOMMU on/off).
> 
> Later, when the issue is better understood, we can add options to set
> offsets, BIRs etc.
> 
> The patch below replaces your "[PATCH v4 2/2] nvme: allow cmb and pmr to
> be enabled" (but still requires "[PATCH v4 1/2] ...") and applies to
> git://git.infradead.org/qemu-nvme.git nvme-next branch.
> 
> Can you reproduce the issues with that patch? I can't on a stock Arch
> Linux 5.7.5-arch1-1 kernel.

While I'm happy that the approach with MSI-X and PBA in BAR0 works fine, I
feel that the investigation of why it works while mine doesn't is missing.
It looks to me that both patches follow basically the same approach: create
a memory subregion and overlay it on top of another memory region. Why does
one work and the other doesn't, then?

With that in mind, I have recently focused on understanding the problem.
I observed that when the guest assigns an address to BAR4, the addr field in
the nvme-bar4 memory region gets populated, but it doesn't get automatically
populated in the ctrl_mem (CMB) memory subregion, so later when
nvme_addr_is_cmb() is called the address check works incorrectly, and as a
consequence the VMM does a DMA read instead of a memcpy.
I created a patch that sets the correct address on the ctrl_mem subregion,
and the guest OS boots up correctly.

When I looked into the PCI and memory region code I noticed that indeed the
address is only assigned to the top-level memory region, but not to the
contained subregions. I think that because in your approach the CMB grabs
the whole BAR exclusively, it works fine.

Here is my question (perhaps the PCI folks can help answer :)): if we
consider memory region overlapping for PCI devices a valid use case, should
the PCI code, on configuration cycles, walk through all contained subregions
and update their addr fields accordingly?

Thx!
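
[To make the failure mode described above concrete, here is the address check
in question, simplified from the pre-patch nvme_addr_is_cmb() quoted
elsewhere in this digest and annotated with the subregion pitfall; the
example addresses are hypothetical:

static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    /* With ctrl_mem as a BAR4 subregion, its addr stays a BAR4-relative
     * offset (e.g. 0x800000) and is never updated on configuration cycles,
     * while the incoming addr is an absolute guest physical address
     * (e.g. 0xfe800000). */
    hwaddr low = n->ctrl_mem.addr;                  /* offset, not absolute */
    hwaddr hi  = low + int128_get64(n->ctrl_mem.size);

    return addr >= low && addr < hi;                /* almost always false */
}

So a CMB-resident address falls through to the DMA-read path instead of the
memcpy() from n->cmbuf, which is exactly the misbehavior reported above.]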



Re: [PATCH v4 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-07 Thread Andrzej Jakowski
On 7/6/20 12:15 AM, Klaus Jensen wrote:
> On Jul  2 16:33, Andrzej Jakowski wrote:
>> On 7/2/20 10:51 AM, Klaus Jensen wrote:
>>> On Jul  2 08:07, Andrzej Jakowski wrote:
>>>> On 7/2/20 3:31 AM, Klaus Jensen wrote:
>>>>> Aight, an update here. This only happens when QEMU is run with a virtual
>>>>> IOMMU. Otherwise, the kernel is happy.
>>>>>
>>>>> With the vIOMMU, qemu also craps out a bit:
>>>>>
>>>>> qemu-system-x86_64: vtd_iova_to_slpte: detected slpte permission error 
>>>>> (iova=0xfd20, level=0x2, slpte=0x0, write=0)
>>>>> qemu-system-x86_64: vtd_iommu_translate: detected translation failure 
>>>>> (dev=03:00:00, iova=0xfd20)
>>>>>
>>>>> So I think we are back in QEMU land for the bug.
>>>>
>>>> Can you share command line for that?
>>>>
>>>>
>>>
>>> qemu-system-x86_64 \
>>>   -nodefaults \
>>>   -display none \
>>>   -device intel-iommu,pt,intremap=on,device-iotlb=on \
>>>   -machine type=q35,accel=kvm,kernel_irqchip=split \
>>>   -cpu host \
>>>   -smp 4 \
>>>   -m 8G \
>>>   -nic user,model=virtio-net-pci,hostfwd=tcp::-:22 \
>>>   -device virtio-rng-pci \
>>>   -drive 
>>> id=boot,file=/home/kbj/work/src/vmctl/state/pmr/boot.qcow2,format=qcow2,if=virtio,discard=on,detect-zeroes=unmap
>>>  \
>>>   -device pcie-root-port,id=pcie_root_port1,chassis=1,slot=0 \
>>>   -device x3130-upstream,id=pcie_upstream1,bus=pcie_root_port1 \
>>>   -device 
>>> xio3130-downstream,id=pcie_downstream1,bus=pcie_upstream1,chassis=1,slot=1 \
>>>   -drive 
>>> id=nvme0n1,file=/home/kbj/work/src/vmctl/state/pmr/nvme0n1.img,format=raw,if=none,discard=on,detect-zeroes=unmap
>>>  \
>>>   -object memory-backend-file,id=pmr,share=on,mem-path=pmr.bin,size=1M \
>>>   -device 
>>> nvme,id=nvme0,serial=deadbeef,bus=pcie_downstream1,drive=nvme0n1,msix_qsize=1,pmrdev=pmr,cmb_size_mb=2
>>>  \
>>>   -pidfile /home/kbj/work/src/vmctl/run/pmr/pidfile \
>>>   -kernel /home/kbj/work/src/kernel/linux/arch/x86_64/boot/bzImage \
>>>   -append root=/dev/vda1 console=ttyS0,115200 audit=0 nokaslr \
>>>   -virtfs 
>>> local,path=/home/kbj/work/src/kernel/linux,security_model=none,readonly,mount_tag=modules
>>>  \
>>>   -serial mon:stdio \
>>>   -trace pci_nvme*
>>>
>>>
>>
>> I focused on reproduction and it looks to me that my patch doesn't 
>> necessarily introduce regression. I run it w/ and w/o patch in both cases
>> getting error while registering. Here is kernel guest log:
>>
>> [   87.606482] nvme nvme0: pci function :00:04.0
>> [   87.635577] dev=95b0a83b bar=2 size=134217728 offset=0
>> [   87.636593] nvme nvme0: failed to register the CMB ret=-95
>> [   87.643262] nvme nvme0: 12/0/0 default/read/poll queues
>>
>> Any thoughts?
>>
> 
> Hmm, that's not what I am seeing.
> 
> With kernel v5.8-rc4, I'm not seeing any issues with CMB with and
> without IOMMU on QEMU master. With your patches, my kernel (v5.8-rc4)
> pukes both with and without iommu.
> 
> BUT! This doesn't mean that your patch is bad, it looks more like an
> issue in the kernel. I still think the BAR configuration looks sane, but
> I am not expert on this.
> 
> To satisfy my curiosity I tried mending your patch to put the CMB on
> offset 0 and move the MSI-X vector table and PBA to BAR 0 (like I
> suggested back in the day). That works. With and without IOMMU. So, I
> think it is an issue with the Linux kernel not being too happy about the
> CMB being at an offset. It doesn't directly look like an issue in the
> nvme driver since the issue shows up far lower in the memory subsystem,
> but it would be nice to have the linux nvme gang at least acknowledge
> the issue.
> 

I have managed to reproduce the problem and played with the patch to see
when the problem occurs and when it doesn't.
When I put MSI-X back in BAR2 (no PMR at all) and left the CMB in BAR4 but
starting at offset 0, I was still able to reproduce the issue.
So then I played with the memory region API and, interestingly, observed
that the problem occurs when region overlaying is used via:

memory_region_init(&n->bar4, OBJECT(n), "nvme-bar4", bar_size);

if (n->params.cmb_size_mb) {
    memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
                          "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));

    memory_region_add_subregion_overlap(&n->bar4, cmb_offset, &n->ctrl_mem, 1);
}

On the other hand, when the CMB memory region is initialized without region
overlaying, that is:

memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
                      "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));

I get no reproduction.

I also observed qemu complaining about a failed translation:
qemu-system-x86_64: vtd_iova_to_slpte: detected slpte permission error 
(iova=0xfe40, level=0x2, slpte=0x0, write=0)
qemu-system-x86_64: vtd_iommu_translate: detected translation failure 
(dev=03:00:00, iova=0xfe40)

Not sure how we want to proceed. Any suggestions?



Re: [PATCH v4 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-02 Thread Andrzej Jakowski
On 7/2/20 10:51 AM, Klaus Jensen wrote:
> On Jul  2 08:07, Andrzej Jakowski wrote:
>> On 7/2/20 3:31 AM, Klaus Jensen wrote:
>>> Aight, an update here. This only happens when QEMU is run with a virtual
>>> IOMMU. Otherwise, the kernel is happy.
>>>
>>> With the vIOMMU, qemu also craps out a bit:
>>>
>>> qemu-system-x86_64: vtd_iova_to_slpte: detected slpte permission error 
>>> (iova=0xfd20, level=0x2, slpte=0x0, write=0)
>>> qemu-system-x86_64: vtd_iommu_translate: detected translation failure 
>>> (dev=03:00:00, iova=0xfd20)
>>>
>>> So I think we are back in QEMU land for the bug.
>>
>> Can you share command line for that?
>>
>>
> 
> qemu-system-x86_64 \
>   -nodefaults \
>   -display none \
>   -device intel-iommu,pt,intremap=on,device-iotlb=on \
>   -machine type=q35,accel=kvm,kernel_irqchip=split \
>   -cpu host \
>   -smp 4 \
>   -m 8G \
>   -nic user,model=virtio-net-pci,hostfwd=tcp::-:22 \
>   -device virtio-rng-pci \
>   -drive 
> id=boot,file=/home/kbj/work/src/vmctl/state/pmr/boot.qcow2,format=qcow2,if=virtio,discard=on,detect-zeroes=unmap
>  \
>   -device pcie-root-port,id=pcie_root_port1,chassis=1,slot=0 \
>   -device x3130-upstream,id=pcie_upstream1,bus=pcie_root_port1 \
>   -device 
> xio3130-downstream,id=pcie_downstream1,bus=pcie_upstream1,chassis=1,slot=1 \
>   -drive 
> id=nvme0n1,file=/home/kbj/work/src/vmctl/state/pmr/nvme0n1.img,format=raw,if=none,discard=on,detect-zeroes=unmap
>  \
>   -object memory-backend-file,id=pmr,share=on,mem-path=pmr.bin,size=1M \
>   -device 
> nvme,id=nvme0,serial=deadbeef,bus=pcie_downstream1,drive=nvme0n1,msix_qsize=1,pmrdev=pmr,cmb_size_mb=2
>  \
>   -pidfile /home/kbj/work/src/vmctl/run/pmr/pidfile \
>   -kernel /home/kbj/work/src/kernel/linux/arch/x86_64/boot/bzImage \
>   -append root=/dev/vda1 console=ttyS0,115200 audit=0 nokaslr \
>   -virtfs 
> local,path=/home/kbj/work/src/kernel/linux,security_model=none,readonly,mount_tag=modules
>  \
>   -serial mon:stdio \
>   -trace pci_nvme*
> 
> 

I focused on reproduction and it looks to me that my patch doesn't
necessarily introduce a regression. I ran it with and without the patch, in
both cases getting an error while registering. Here is the guest kernel log:

[   87.606482] nvme nvme0: pci function :00:04.0
[   87.635577] dev=95b0a83b bar=2 size=134217728 offset=0
[   87.636593] nvme nvme0: failed to register the CMB ret=-95
[   87.643262] nvme nvme0: 12/0/0 default/read/poll queues

Any thoughts?



Re: nvme emulation merge process

2020-07-02 Thread Andrzej Jakowski
On 7/1/20 6:57 AM, Philippe Mathieu-Daudé wrote:
> On 7/1/20 3:18 PM, Klaus Jensen wrote:
>> On Jul  1 12:34, Kevin Wolf wrote:
>>> Am 30.06.2020 um 22:36 hat Klaus Jensen geschrieben:
>>>> On Jun 30 08:42, Keith Busch wrote:
>>>>> On Tue, Jun 30, 2020 at 04:09:46PM +0200, Philippe Mathieu-Daudé wrote:
>>>>>> What I see doable for the following days is:
>>>>>> - hw/block/nvme: Fix I/O BAR structure [3]
>>>>>> - hw/block/nvme: handle transient dma errors
>>>>>> - hw/block/nvme: bump to v1.3
>>>>>
>>>>>
>>>>> These look like sensible patches to rebase future work on, IMO. The 1.3
>>>>> updates had been prepared a while ago, at least.
>>>>
>>>> I think Philippe's "hw/block/nvme: Fix I/O BAR structure" series is a
>>>> no-brainer. It just needs to get in asap.
>>>
>>> I think we need to talk about how nvme patches are supposed to get
>>> merged. I'm not familiar with the hardware nor the code, so the model
>>> was that I just blindly merge patches that Keith has reviewed/acked,
>>> just to spare him the work to prepare a pull request. But obviously, we
>>> started doing things this way when there was a lot less activity around
>>> the nvme emulation.
>>>
>>> If we find that this doesn't scale any more, maybe we need to change
>>> something.
>>
>> Honestly, I do not think the current model has worked very well for some
>> time; especially for larger series where I, for one, has felt that my
>> work was largely ignored due to a lack of designated reviewers. Things
>> only picked up when Beata, Maxim and Philippe started reviewing my
>> series - maybe out of pity or because I was bombing the list, I don't
>> know ;)
> 
> I have no interest in the NVMe device emulation, but one of the first
> thing I notice when I look at the wiki the time I wanted to send my
> first patch, is the "Return the favor" paragraph:
> https://wiki.qemu.org/Contribute/SubmitAPatch#Return_the_favor
> 
>  "Peer review only works if everyone chips in a bit of review time.
>   If everyone submitted more patches than they reviewed, we would
>   have a patch backlog. A good goal is to try to review at least as
>   many patches from others as what you submit. Don't worry if you
>   don't know the code base as well as a maintainer; it's perfectly
>   fine to admit when your review is weak because you are unfamiliar
>   with the code."
> 
> So as some reviewed my patches, I try to return the favor to the
> community, in particular when I see someone is stuck waiting for
> review, and the patch topic is some area I can understand.
> 
> I don't see that as an "out of pity" reaction.
> 
> Note, it is true that bomb series scare reviewers. You learned it the
> hard way. But you can see, after resending the first part of your
> "bomb", even if it took 10 versions, the result is a great
> improvement!
> 
>> We've also seen good patches from Andrzej linger on the list for quite a
>> while, prompting a number of RESENDs. I only recently allocated more
>> time and upped my review game, but I hope that contributors feel that
>> stuff gets reviewed in a timely fashion by now.
>>
>> Please understand that this is in NO WAY a criticism of Keith who
>> already made it very clear to me that he did not have a lot time to
>> review, but only ack the odd patch.
>>
>>> Depending on how much time Keith can spend on review in the
>>> near future and how much control he wants to keep over the development,
>>> I could imagine adding Klaus to MAINTAINERS, either as a co-maintainer
>>> or as a reviewer. Then I could rely on reviews/acks from either of you
>>> for merging series.
>>>
>>
>> I would be happy to step up (officially) to help maintain the device
>> with Keith and review on a daily basis, and my position can support
>> this.
> 
> Sounds good to me, but it is up to Keith Busch to accept.
> 
> It would be nice to have at least one developer from WDC listed as
> designated reviewer too.
> 
> Maxim is candidate for designated reviewer but I think he doesn't
> have the time.
> 
> It would also nice to have Andrzej Jakowski listed, if he is interested.

Thx! Of course I am interested in helping, and I think it is actually a great
idea to have a couple of designated maintainers/reviewers, as it would be
easier for folks to receive feedback than requesting it in a polling manner :)
And please don't get me wrong -- I'm not complaining about anything -- 

Re: [PATCH v4 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-02 Thread Andrzej Jakowski
On 7/2/20 3:31 AM, Klaus Jensen wrote:
> Aight, an update here. This only happens when QEMU is run with a virtual
> IOMMU. Otherwise, the kernel is happy.
> 
> With the vIOMMU, qemu also craps out a bit:
> 
> qemu-system-x86_64: vtd_iova_to_slpte: detected slpte permission error 
> (iova=0xfd20, level=0x2, slpte=0x0, write=0)
> qemu-system-x86_64: vtd_iommu_translate: detected translation failure 
> (dev=03:00:00, iova=0xfd20)
> 
> So I think we are back in QEMU land for the bug.

Can you share command line for that?




Re: [PATCH v3 3/4] hw/block/nvme: Fix pmrmsc register size

2020-07-01 Thread Andrzej Jakowski
On 6/30/20 9:45 AM, Klaus Jensen wrote:
> On Jun 30 17:16, Philippe Mathieu-Daudé wrote:
>> On 6/30/20 5:10 PM, Andrzej Jakowski wrote:
>>> On 6/30/20 4:04 AM, Philippe Mathieu-Daudé wrote:
>>>> The Persistent Memory Region Controller Memory Space Control
>>>> register is 64-bit wide. See 'Figure 68: Register Definition'
>>>> of the 'NVM Express Base Specification Revision 1.4'.
>>>>
>>>> Fixes: 6cf9413229 ("introduce PMR support from NVMe 1.4 spec")
>>>> Reported-by: Klaus Jensen 
>>>> Reviewed-by: Klaus Jensen 
>>>> Signed-off-by: Philippe Mathieu-Daudé 
>>>> ---
>>>> Cc: Andrzej Jakowski 
>>>> ---
>>>>  include/block/nvme.h | 2 +-
>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/block/nvme.h b/include/block/nvme.h
>>>> index 71c5681912..82c384614a 100644
>>>> --- a/include/block/nvme.h
>>>> +++ b/include/block/nvme.h
>>>> @@ -21,7 +21,7 @@ typedef struct QEMU_PACKED NvmeBar {
>>>>  uint32_tpmrsts;
>>>>  uint32_tpmrebs;
>>>>  uint32_tpmrswtp;
>>>> -uint32_tpmrmsc;
>>>> +uint64_tpmrmsc;
>>>>  } NvmeBar;
>>>>  
>>>>  enum NvmeCapShift {
>>>> -- 2.21.3
>>>
>>> This is a good catch, though I wanted to highlight that this will still
>>> need to change, as this register is not aligned properly and thus not in
>>> compliance with the spec.
>>
>> I was wondering the odd alignment too. So you are saying at some time
>> in the future the spec will be updated to correct the alignment?
Yep, I think so.
PMRMSC currently is a 64-bit register defined at offset E14h.
That is in conflict with the spec, because the spec requires 64-bit
registers to be 64-bit aligned.
I anticipate that changes will be needed.

>>
>> Should we use this instead?
>>
>>   uint32_t    pmrmsc;
>>  +uint32_t    pmrmsc_upper32; /* the spec defines this, but   *
>>  +                             * only the low 32 bits are used */
>>
>> Or eventually an unnamed struct:
>>
>>  -uint32_t    pmrmsc;
>>  +struct {
>>  +    uint32_t pmrmsc;
>>  +    uint32_t pmrmsc_upper32; /* the spec defines this, but   *
>>  +                              * only the low 32 bits are used */
>>  +};
>>
>>>
>>> Reviewed-by Andrzej Jakowski 
>>>
>>
> 
> I'm also not sure what you mean Andrzej. The odd alignment is exactly
> what the spec (v1.4) says?
> 
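
[To make the alignment point concrete: in the NVMe 1.4 register map the PMR
registers start at offset 0xE00, which lands PMRMSC at offset 0xE14, a
64-bit field on a 4-byte boundary. A sketch of just the tail of the map
(offsets per NVMe 1.4; not the full NvmeBar struct):

/* Tail of the controller register map, offsets per NVMe 1.4. */
typedef struct QEMU_PACKED NvmeBarPmrTail {
    uint32_t pmrcap;     /* 0xE00 */
    uint32_t pmrctl;     /* 0xE04 */
    uint32_t pmrsts;     /* 0xE08 */
    uint32_t pmrebs;     /* 0xE0C */
    uint32_t pmrswtp;    /* 0xE10 */
    uint64_t pmrmsc;     /* 0xE14: 64-bit wide, only 4-byte aligned */
} NvmeBarPmrTail;]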




[PATCH v4 1/2] nvme: indicate CMB support through controller capabilities register

2020-07-01 Thread Andrzej Jakowski
This patch sets the CMBS bit in the controller capabilities register when the
user configures the NVMe device with CMB support, so capabilities are
correctly reported to the guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c  | 2 +-
 include/block/nvme.h | 6 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1aee042d4c..9f11f3e9da 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1582,6 +1582,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_TO(n->bar.cap, 0xf);
 NVME_CAP_SET_CSS(n->bar.cap, 1);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
 n->bar.vs = 0x00010200;
 n->bar.intmc = n->bar.intms = 0;
@@ -1591,7 +1592,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
 Error *local_err = NULL;
-
 int i;
 
 nvme_check_constraints(n, &local_err);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 1720ee1d51..14cf398dfa 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -35,6 +35,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -48,6 +49,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -78,8 +80,10 @@ enum NvmeCapMask {
                                                << CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                 << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
                                                 << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
+                                               << CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.21.1




[PATCH v4 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-07-01 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively by either PMR or CMB. This
patch places the CMB at an offset in BAR4 so that it does not conflict
with the MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 101 +--
 hw/block/nvme.h  |   1 +
 include/block/nvme.h |   4 +-
 3 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 9f11f3e9da..f5156bcfe5 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,12 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB, when configured, is assumed
+ * to be resident in BAR4 at a certain offset - this is because BAR4 is also
+ * used for storing the MSI-X table, which is available at offset 0 in BAR4.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2/BAR3. When configured, it consumes
+ * the whole of BAR2/BAR3 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -57,8 +57,8 @@
 #define NVME_MAX_IOQPAIRS 0x
 #define NVME_REG_SIZE 0x1000
 #define NVME_DB_SIZE  4
-#define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 2
+#define NVME_MSIX_BIR 4
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
 do { \
@@ -1395,7 +1395,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 return;
 }
 
-if (!n->params.cmb_size_mb && n->pmrdev) {
+if (n->pmrdev) {
 if (host_memory_backend_is_mapped(n->pmrdev)) {
 char *path = 
object_get_canonical_path_component(OBJECT(n->pmrdev));
 error_setg(errp, "can't use already busy memdev: %s", path);
@@ -1453,33 +1453,70 @@ static void nvme_init_namespace(NvmeCtrl *n, 
NvmeNamespace *ns, Error **errp)
 id_ns->nuse = id_ns->ncap;
 }
 
-static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
+static void nvme_bar4_init(PCIDevice *pci_dev, Error **errp)
 {
-NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
-NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
-
-NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
-NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
-NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
-NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
-NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
-NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
-NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
-
-n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
-memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
-  "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
-pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
+NvmeCtrl *n = NVME(pci_dev);
+int status;
+uint64_t bar_size, cmb_offset = 0;
+uint32_t msix_vectors;
+uint32_t nvme_pba_offset;
+uint32_t cmb_size_units;
+
+msix_vectors = n->params.max_ioqpairs + 1;
+nvme_pba_offset = PCI_MSIX_ENTRY_SIZE * msix_vectors;
+bar_size = nvme_pba_offset + QEMU_ALIGN_UP(msix_vectors, 64) / 8;
+
+if (n->params.cmb_size_mb) {
+NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
+NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
+NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
+NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
+NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
+
+cmb_size_units = NVME_CMBSZ_GETSIZEUNITS(n->bar.cmbsz);
+cmb_offset = QEMU_ALIGN_UP(bar_size, cmb_size_units);
+
+NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_MSIX_BIR);
+NVME_CMBLOC_SET_OFST(n->bar.cmbloc, cmb_offset / cmb_size_units);
+
+n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
+
+bar_size += cmb_offset;
+bar_size += NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
+}
+
+bar_size = pow2ceil(bar_size);
+
+/* Create memory region for BAR4, then overlap cmb, msix and pba
+ * tables on top of it. */
+memory_region_init(&n->bar4, OBJECT(n), "nvme-bar4", bar_size);
+
+if (n->params.cmb_size_mb) {
+memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
+  "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
+
+memory_region_add_subregion(&n->bar4, cmb_offset, &n->ctrl_mem);
+}
+
+status = msix_init(pci_dev, n->params.msix_qsize,
+   &n->bar4, NVME_MSIX_BIR, 0,
+ 
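
To make the BAR4 layout above concrete, the arithmetic works out as follows
for a hypothetical max_ioqpairs=64, cmb_size_mb=2 configuration (pseudo-C,
illustrative values only):

    msix_vectors    = 64 + 1;                            /* 65 vectors         */
    nvme_pba_offset = 16 * 65;                           /* 1040 B, 16 B/entry */
    bar_size        = 1040 + QEMU_ALIGN_UP(65, 64) / 8;  /* 1040 + 16 = 1056 B */
    cmb_size_units  = 1 MiB;                             /* CMBSZ.SZU == 2     */
    cmb_offset      = QEMU_ALIGN_UP(1056, 1 MiB);        /* 1 MiB              */
    bar_size       += cmb_offset + 2 MiB;                /* a bit over 3 MiB   */
    bar_size        = pow2ceil(bar_size);                /* 4 MiB BAR4         */

so the MSI-X table and PBA fit in the first page and the CMB starts at the
first cmb_size_units boundary above them.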

[PATCH v4] nvme: allow cmb and pmr emulation on same device

2020-07-01 Thread Andrzej Jakowski
Hi All, 

Resending the series recently posted on the mailing list, related to the nvme
device extension, with a couple of fixes after review.

This patch series does the following:
 - Fixes problem where CMBS bit was not set in controller capabilities
   register, so support for CMB was not correctly advertised to guest.
   This is resend of patch that has been submitted and reviewed by
   Klaus [1]
 - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows
   to have CMB and PMR emulated on the same device. This extension
   was indicated by Keith [2]

v4:
 - modified BAR4 initialization, so now it consists of CMB, MSIX and
   PBA memory regions overlapping on top of it. This reduces patch
   complexity significantly (Klaus [5])

v3:
 - code style fixes including: removal of spurious line break, moving
   define into define section and code alignment (Klaus [4])
 - removed unintended code reintroduction (Klaus [4])

v2:
 - rebase on Kevin's latest block branch (Klaus [3])
 - improved comments section (Klaus [3])
 - simplified calculation of BAR4 size (Klaus [3])

v1:
 - initial push of the patch

[1]: 
https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/
[2]: 
https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/
[3]: 
https://lore.kernel.org/qemu-devel/20200605181043.28782-1-andrzej.jakow...@linux.intel.com/
[4]: 
https://lore.kernel.org/qemu-devel/20200618092524.posxi5mysb3jjtpn@apples.localdomain/
[5]: 
https://lore.kernel.org/qemu-devel/20200626055033.6vxqvi4s5pll7som@apples.localdomain/





Re: [PATCH v3 3/4] hw/block/nvme: Fix pmrmsc register size

2020-06-30 Thread Andrzej Jakowski
On 6/30/20 4:04 AM, Philippe Mathieu-Daudé wrote:
> The Persistent Memory Region Controller Memory Space Control
> register is 64-bit wide. See 'Figure 68: Register Definition'
> of the 'NVM Express Base Specification Revision 1.4'.
> 
> Fixes: 6cf9413229 ("introduce PMR support from NVMe 1.4 spec")
> Reported-by: Klaus Jensen 
> Reviewed-by: Klaus Jensen 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
> Cc: Andrzej Jakowski 
> ---
>  include/block/nvme.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 71c5681912..82c384614a 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -21,7 +21,7 @@ typedef struct QEMU_PACKED NvmeBar {
>  uint32_tpmrsts;
>  uint32_tpmrebs;
>  uint32_tpmrswtp;
> -uint32_tpmrmsc;
> +uint64_tpmrmsc;
>  } NvmeBar;
>  
>  enum NvmeCapShift {
> -- 2.21.3

This is a good catch, though I wanted to highlight that this will still
need to change, as this register is not aligned properly and thus is not
in compliance with the spec.

Reviewed-by: Andrzej Jakowski 



Re: [PATCH v3 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-25 Thread Andrzej Jakowski
On 6/25/20 4:13 AM, Klaus Jensen wrote:
> On Jun 22 11:25, Andrzej Jakowski wrote:
>> So far it was not possible to have CMB and PMR emulated on the same
>> device, because BAR2 was used exclusively by either PMR or CMB. This
>> patch places the CMB at an offset in BAR4 so that it does not conflict
>> with the MSI-X vectors.
>>
>> Signed-off-by: Andrzej Jakowski 
>> ---
>>  hw/block/nvme.c  | 119 +++
>>  hw/block/nvme.h  |   3 +-
>>  include/block/nvme.h |   4 +-
>>  3 files changed, 80 insertions(+), 46 deletions(-)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index 9f11f3e9da..ec97b7af3c 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -22,12 +22,12 @@
>>   *  [pmrdev=,] \
>>   *  max_ioqpairs=
>>   *
>> - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>> - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
>> + * Note cmb_size_mb denotes size of CMB in MB. CMB, when configured, is assumed
>> + * to be resident in BAR4 at a certain offset - this is because BAR4 is also
>> + * used for storing the MSI-X table, which is available at offset 0 in BAR4.
>>   *
>> - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
>> - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
>> - * both provided.
>> + * pmrdev is assumed to be resident in BAR2/BAR3. When configured, it consumes
>> + * the whole of BAR2/BAR3 exclusively.
>>   * Enabling pmr emulation can be achieved by pointing to 
>> memory-backend-file.
>>   * For example:
>>   * -object memory-backend-file,id=,share=on,mem-path=, \
>> @@ -57,8 +57,8 @@
>>  #define NVME_MAX_IOQPAIRS 0x
>>  #define NVME_REG_SIZE 0x1000
>>  #define NVME_DB_SIZE  4
>> -#define NVME_CMB_BIR 2
>>  #define NVME_PMR_BIR 2
>> +#define NVME_MSIX_BIR 4
>>  
>>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>>  do { \
>> @@ -69,18 +69,19 @@
>>  
>>  static void nvme_process_sq(void *opaque);
>>  
>> -static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>> +static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr, int size)
>>  {
>> -hwaddr low = n->ctrl_mem.addr;
>> -hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
>> +hwaddr low = n->bar4.addr + n->cmb_offset;
>> +hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
>>  
>> -return addr >= low && addr < hi;
>> +return addr >= low && (addr + size) < hi;
> 
> I think the hi check is off by one? Should be `(addr + size - 1) < hi`
> (or `(addr + size) <= hi`), right? Otherwise we reject a valid read
> against the last address in the CMB.
> 
> Also, a check for wrap-around is missing (`hi < addr` post-addition).
> 
> Anyway, I would really prefer that we do not change nvme_addr_is_cmb
> since it is a useful function on its own. I use it a lot in my PRP
> mapping refactor and SGL patches, so I would need to re-add another
> function that does what nvme_addr_is_cmb used to do.
> 
> But, see my next comment.

Hi Klaus,
Thx for your review. Yep, I confirm this problem.

> 
>>  }
>>  
>>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>>  {
>> -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
>> -memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
>> +hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
>> +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr, size)) {
>> +memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
>>  return;
>>  }
> 
> I would prefer that we do the range check here (nvme_addr_read already has the
> size parameter) and have nvme_addr_read return a success/fail value. E.g.
> 
>  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> +hwaddr hi = addr + size - 1;
> +if (hi < addr) {
> +return 1;
> +}
> +
> +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
>  memcpy(buf, nvme_addr_to_cmb(n, addr), size);
>  return 0;
>  }
> 
> Actually, since the controller only supports PRPs at this stage, it is
> not required to check the ending address (addr + len - 1) of the CMB access
> for validity, since it is guaranteed to be in range of the CMB (nvme_addr_re

[PATCH v3] nvme: allow cmb and pmr emulation on same device

2020-06-22 Thread Andrzej Jakowski
Hi All, 

Resending the series recently posted on the mailing list, related to the nvme
device extension, with a couple of fixes after review.


This patch series does the following:
 - Fixes problem where CMBS bit was not set in controller capabilities
   register, so support for CMB was not correctly advertised to guest.
   This is resend of patch that has been submitted and reviewed by
   Klaus [1]
 - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows
   to have CMB and PMR emulated on the same device. This extension
   was indicated by Keith [2]

v3:
 - Code style fixes including: removal of spurious line break, moving
   define into define section and code alignment (Klaus [4])
 - Removed unintended code reintroduction (Klaus [4])

v2:
 - rebase on Kevin's latest block branch (Klaus [3])
 - improved comments section (Klaus [3])
 - simplified calculation of BAR4 size (Klaus [3])

v1:
 - initial push of the patch

[1]: 
https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/
[2]: 
https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/
[3]: 
https://lore.kernel.org/qemu-devel/20200605181043.28782-1-andrzej.jakow...@linux.intel.com/
[4]: 
https://lore.kernel.org/qemu-devel/20200618092524.posxi5mysb3jjtpn@apples.localdomain/





[PATCH v3 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-22 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively by either PMR or CMB. This
patch places the CMB at an offset in BAR4 so that it does not conflict
with the MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 119 +++
 hw/block/nvme.h  |   3 +-
 include/block/nvme.h |   4 +-
 3 files changed, 80 insertions(+), 46 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 9f11f3e9da..ec97b7af3c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,12 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB, when configured, is assumed
+ * to be resident in BAR4 at a certain offset - this is because BAR4 is also
+ * used for storing the MSI-X table, which is available at offset 0 in BAR4.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2/BAR3. When configured, it consumes
+ * the whole of BAR2/BAR3 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -57,8 +57,8 @@
 #define NVME_MAX_IOQPAIRS 0x
 #define NVME_REG_SIZE 0x1000
 #define NVME_DB_SIZE  4
-#define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 2
+#define NVME_MSIX_BIR 4
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
 do { \
@@ -69,18 +69,19 @@
 
 static void nvme_process_sq(void *opaque);
 
-static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
+static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr, int size)
 {
-hwaddr low = n->ctrl_mem.addr;
-hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+hwaddr low = n->bar4.addr + n->cmb_offset;
+hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
 
-return addr >= low && addr < hi;
+return addr >= low && (addr + size) < hi;
 }
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
+if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr, size)) {
+memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
 return;
 }
 
@@ -167,17 +168,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
 trace_pci_nvme_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+} else if (n->bar.cmbsz && prp1 >= cmb_addr &&
+   prp1 < cmb_addr + int128_get64(n->bar4.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
 qemu_sglist_add(qsg, prp1, trans_len);
@@ -222,7 +224,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg){
 qemu_sglist_add(qsg, prp_ent, trans_len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr],
+   trans_len);
 }
 len -= trans_len;
 i++;
@@ -235,7 +238,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg) {
 qemu_sglist_add(qsg, prp2, len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr],
+   trans_len);
 }
 }
 }
@@ -1395,7 +1399,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 return;
 }
 
-if (!n->params.cmb_size_mb &&

[PATCH v3 1/2] nvme: indicate CMB support through controller capabilities register

2020-06-22 Thread Andrzej Jakowski
This patch sets the CMBS bit in the controller capabilities register when the
user configures the NVMe driver with CMB support, so capabilities are correctly
reported to the guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c  | 2 +-
 include/block/nvme.h | 6 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1aee042d4c..9f11f3e9da 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1582,6 +1582,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_TO(n->bar.cap, 0xf);
 NVME_CAP_SET_CSS(n->bar.cap, 1);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
 n->bar.vs = 0x00010200;
 n->bar.intmc = n->bar.intms = 0;
@@ -1591,7 +1592,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
 Error *local_err = NULL;
-
 int i;
 
 nvme_check_constraints(n, &local_err);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 1720ee1d51..14cf398dfa 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -35,6 +35,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -48,6 +49,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -78,8 +80,10 @@ enum NvmeCapMask {
<< CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
                                                << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
+                                               << CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.21.1




Re: [PATCH RESEND v2 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-19 Thread Andrzej Jakowski
On 6/18/20 2:25 AM, Klaus Jensen wrote:
> On Jun 16 22:18, Andrzej Jakowski wrote:
>> So far it was not possible to have CMB and PMR emulated on the same
>> device, because BAR2 was used exclusively by either PMR or CMB. This
>> patch places the CMB at an offset in BAR4 so that it does not conflict
>> with the MSI-X vectors.
>>
>> Signed-off-by: Andrzej Jakowski 
>> ---
>>  hw/block/nvme.c  | 122 ---
>>  hw/block/nvme.h  |   5 +-
>>  include/block/nvme.h |   4 +-
>>  3 files changed, 86 insertions(+), 45 deletions(-)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index 9f11f3e9da..62681966b9 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -22,12 +22,12 @@
>>   *  [pmrdev=,] \
>>   *  max_ioqpairs=
>>   *
>> - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>> - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
>> + * Note cmb_size_mb denotes size of CMB in MB. CMB, when configured, is assumed
>> + * to be resident in BAR4 at a certain offset - this is because BAR4 is also
>> + * used for storing the MSI-X table, which is available at offset 0 in BAR4.
> 
> I would still really like a R-b by a more PCI-competent reviewer to
> ensure that it is sane to have the MSI-X table here in prefetchable
> 64-bit address space.

Having a Reviewed-by for that makes sense to me. And let me offer some
evidence of real devices having MSI-X in a prefetchable region and in a
non-prefetchable region. Based on those examples I don't think it matters
where you place your MSI-X table.

Why do you think it may not be sane to place the MSI-X table in a
prefetchable region?

Device with MSI-X in non-prefetchable region:
Region 0: Memory at fb42c000 (64-bit, non-prefetchable) [size=16K]
Capabilities: [80] MSI-X: Enable+ Count=1 Masked-
Vector table: BAR=0 offset=2000
PBA: BAR=0 offset=3000

Device with MSI-X in prefetchable region
Region 0: Memory at fbc0 (64-bit, prefetchable) [size=2M]
Region 2: I/O ports at e020 [size=32]
Region 4: Memory at fbe04000 (64-bit, prefetchable) [size=16K]
Capabilities: [40] Power Management version 3
Flags: PMEClk- DSI+ D1- D2- AuxCurrent=0mA 
PME(D0+,D1-,D2-,D3hot+,D3cold+)
Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=1 PME-
Capabilities: [50] MSI: Enable- Count=1/1 Maskable+ 64bit+
Address:   Data: 
Masking:   Pending: 
Capabilities: [70] MSI-X: Enable+ Count=64 Masked-
Vector table: BAR=4 offset=
PBA: BAR=4 offset=2000



> 
>>   *
>> - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
>> - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
>> - * both provided.
>> + * pmrdev is assumed to be resident in BAR2/BAR3. When configured, it consumes
>> + * the whole of BAR2/BAR3 exclusively.
>>   * Enabling pmr emulation can be achieved by pointing to 
>> memory-backend-file.
>>   * For example:
>>   * -object memory-backend-file,id=,share=on,mem-path=, \
>> @@ -69,18 +69,19 @@
>>  
>>  static void nvme_process_sq(void *opaque);
>>  
>> -static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>> +static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr, int size)
>>  {
>> -hwaddr low = n->ctrl_mem.addr;
>> -hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
>> +hwaddr low = n->bar4.addr + n->cmb_offset;
>> +hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
> 
> Isn't it better to use n->bar4.size?

My understanding is that the CMB doesn't necessarily need to occupy the whole
BAR, which is required to be a power of two in size.

> 
>>  
>> -return addr >= low && addr < hi;
>> +return addr >= low && (addr + size) < hi;
>>  }
> 
> I think nvme_addr_is_cmb should do what it says on the tin - that is,
> check that addr is within the CMB. The size check belongs in
> nvme_addr_read.

Yep, that was my intention. The new check confirms that both the start and the
end of the requested range are within the CMB, while the previous code only
checked the start address. You could end up with a situation where the end
address falls outside the CMB range while the start address is inside it.
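
For completeness, a combined check with the wrap-around guard Klaus mentioned
could look like this (a sketch only; the helper name is made up here):

    static bool nvme_range_in_cmb(NvmeCtrl *n, hwaddr addr, int size)
    {
        hwaddr low = n->bar4.addr + n->cmb_offset;
        hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz); /* exclusive */
        hwaddr end = addr + size - 1;                        /* last byte */

        if (end < addr) {   /* addr + size wrapped around */
            return false;
        }
        return addr >= low && end < hi;
    }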

> 
>>  
>>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>>  {
>> -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
>> -memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
>> +hwaddr c

[PATCH RESEND v2 1/2] nvme: indicate CMB support through controller capabilities register

2020-06-16 Thread Andrzej Jakowski
This patch sets the CMBS bit in the controller capabilities register when the
user configures the NVMe driver with CMB support, so capabilities are correctly
reported to the guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c  | 2 +-
 include/block/nvme.h | 6 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1aee042d4c..9f11f3e9da 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1582,6 +1582,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_TO(n->bar.cap, 0xf);
 NVME_CAP_SET_CSS(n->bar.cap, 1);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
 n->bar.vs = 0x00010200;
 n->bar.intmc = n->bar.intms = 0;
@@ -1591,7 +1592,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
 Error *local_err = NULL;
-
 int i;
 
 nvme_check_constraints(n, &local_err);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 1720ee1d51..14cf398dfa 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -35,6 +35,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -48,6 +49,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -78,8 +80,10 @@ enum NvmeCapMask {
<< CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
                                                << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
+                                               << CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.21.1




[PATCH RESEND v2 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-16 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively by either PMR or CMB. This
patch places the CMB at an offset in BAR4 so that it does not conflict
with the MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 122 ---
 hw/block/nvme.h  |   5 +-
 include/block/nvme.h |   4 +-
 3 files changed, 86 insertions(+), 45 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 9f11f3e9da..62681966b9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,12 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB, when configured, is assumed
+ * to be resident in BAR4 at a certain offset - this is because BAR4 is also
+ * used for storing the MSI-X table, which is available at offset 0 in BAR4.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2/BAR3. When configured, it consumes
+ * the whole of BAR2/BAR3 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -69,18 +69,19 @@
 
 static void nvme_process_sq(void *opaque);
 
-static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
+static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr, int size)
 {
-hwaddr low = n->ctrl_mem.addr;
-hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+hwaddr low = n->bar4.addr + n->cmb_offset;
+hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
 
-return addr >= low && addr < hi;
+return addr >= low && (addr + size) < hi;
 }
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
+if (n->cmbsz && nvme_addr_is_cmb(n, addr, size)) {
+memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
 return;
 }
 
@@ -167,17 +168,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
 trace_pci_nvme_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+} else if (n->cmbsz && prp1 >= cmb_addr &&
+   prp1 < cmb_addr + int128_get64(n->bar4.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
 qemu_sglist_add(qsg, prp1, trans_len);
@@ -222,7 +224,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg){
 qemu_sglist_add(qsg, prp_ent, trans_len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr],
+   trans_len);
 }
 len -= trans_len;
 i++;
@@ -235,7 +238,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg) {
 qemu_sglist_add(qsg, prp2, len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr],
+   trans_len);
 }
 }
 }
@@ -1360,6 +1364,7 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
+
 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
 {
 NvmeParams *params = &n->params;
@@ -1395,7 +1400,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 return;
 }
 
-if (!n->params.cmb_size_mb && n->pmrdev) {
+if (n->pmrdev) {
 

[PATCH RESEND v2] nvme: allow cmb and pmr emulation on same device

2020-06-16 Thread Andrzej Jakowski
Hi All, 

Resending the series recently posted on the mailing list, related to the nvme
device extension.


This patch series does the following:
 - Fixes problem where CMBS bit was not set in controller capabilities
   register, so support for CMB was not correctly advertised to guest.
   This is resend of patch that has been submitted and reviewed by
   Klaus [1]
 - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows
   to have CMB and PMR emulated on the same device. This extension
   was indicated by Keith [2]

v2:
 - rebase on Kevin's latest block branch (Klaus [3])
 - improved comments section (Klaus [3])
 - simplified calculation of BAR4 size (Klaus [3])

v1:
 - initial push of the patch

[1]: 
https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/
[2]: 
https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/
[3]: 
https://lore.kernel.org/qemu-devel/20200605181043.28782-1-andrzej.jakow...@linux.intel.com/





[PATCH v2 1/2] nvme: indicate CMB support through controller capabilities register

2020-06-09 Thread Andrzej Jakowski
This patch sets the CMBS bit in the controller capabilities register when the
user configures the NVMe driver with CMB support, so capabilities are correctly
reported to the guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c  | 2 +-
 include/block/nvme.h | 6 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 567bce7519..54d4201daa 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1570,6 +1570,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_TO(n->bar.cap, 0xf);
 NVME_CAP_SET_CSS(n->bar.cap, 1);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
 n->bar.vs = 0x00010200;
 n->bar.intmc = n->bar.intms = 0;
@@ -1579,7 +1580,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
 Error *local_err = NULL;
-
 int i;
 
 nvme_check_constraints(n, &local_err);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 1720ee1d51..14cf398dfa 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -35,6 +35,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -48,6 +49,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -78,8 +80,10 @@ enum NvmeCapMask {
<< CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
                                                << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
+                                               << CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.21.1




[PATCH v2 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-09 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively by either PMR or CMB. This
patch places the CMB at an offset in BAR4 so that it does not conflict
with the MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 120 ---
 hw/block/nvme.h  |   5 +-
 include/block/nvme.h |   4 +-
 3 files changed, 86 insertions(+), 43 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 54d4201daa..c96ee5678d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,12 @@
  *  [pmrdev=,] \
  *  max_ioqpairs=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB, when configured, is assumed
+ * to be resident in BAR4 at a certain offset - this is because BAR4 is also
+ * used for storing the MSI-X table, which is available at offset 0 in BAR4.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2/BAR3. When configured, it consumes
+ * the whole of BAR2/BAR3 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -68,18 +68,19 @@
 
 static void nvme_process_sq(void *opaque);
 
-static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
+static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr, int size)
 {
-hwaddr low = n->ctrl_mem.addr;
-hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
+hwaddr low = n->bar4.addr + n->cmb_offset;
+hwaddr hi  = low + NVME_CMBSZ_GETSIZE(n->bar.cmbsz);
 
-return addr >= low && addr < hi;
+return addr >= low && (addr + size) < hi;
 }
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
+if (n->cmbsz && nvme_addr_is_cmb(n, addr, size)) {
+memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
 return;
 }
 
@@ -166,17 +167,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
 trace_pci_nvme_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+} else if (n->cmbsz && prp1 >= cmb_addr &&
+   prp1 < cmb_addr + int128_get64(n->bar4.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
 qemu_sglist_add(qsg, prp1, trans_len);
@@ -221,7 +223,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg){
 qemu_sglist_add(qsg, prp_ent, trans_len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr],
+   trans_len);
 }
 len -= trans_len;
 i++;
@@ -234,7 +237,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg) {
 qemu_sglist_add(qsg, prp2, len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr],
+   trans_len);
 }
 }
 }
@@ -1359,6 +1363,7 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
+
 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
 {
 NvmeParams *params = &n->params;
@@ -1387,7 +1392,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 return;
 }
 
-if (!n->params.cmb_size_mb && n->pmrdev) {
+if (n->pmrdev) {
 

[PATCH v2] nvme: allow cmb and pmr emulation on same device

2020-06-09 Thread Andrzej Jakowski
This patch series does the following:
 - Fixes problem where CMBS bit was not set in controller capabilities
   register, so support for CMB was not correctly advertised to guest.
   This is resend of patch that has been submitted and reviewed by
   Klaus [1]
 - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows
   to have CMB and PMR emulated on the same device. This extension
   was indicated by Keith [2]

v2:
 - rebase on Kevin's latest block branch (Klaus [3])
 - improved comments section (Klaus [3])
 - simplified calculation of BAR4 size (Klaus [4])

v1:
 - initial push of the patch

[1]: 
https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/
[2]: 
https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/
[3]: 
https://lore.kernel.org/qemu-devel/20200605181043.28782-1-andrzej.jakow...@linux.intel.com/a





Re: [PATCH v1 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-08 Thread Andrzej Jakowski
On 6/8/20 1:08 AM, Klaus Jensen wrote:
> On Jun  5 11:10, Andrzej Jakowski wrote:
>> So far it was not possible to have CMB and PMR emulated on the same
>> device, because BAR2 was used exclusively by either PMR or CMB. This
>> patch places the CMB at an offset in BAR4 so that it does not conflict
>> with the MSI-X vectors.
>>
> 
> Hi Andrzej,
> 
> Thanks for doing this, it's a nice addition!
> 
> Though, I would prefer that the table and pba were located in BAR0,
> keeping BAR4 for exclusive CMB use. I'm no expert on this, but is it ok
> to have the table and pba in prefetchable memory? Having it "together"
> with the other controller-level configuration memory just feels more
> natural to me, but I'm not gonna put my foot down.
Hi Klaus,

Thx for your feedback!
I don't think it matters whether the MSI-X table is in prefetchable or
non-prefetchable memory.
My understanding is that the spec allows the MSI-X table and PBA to be in any
BAR, at any offset. I understand your preference, and at the same time I think
that since it is not in violation of the spec, why don't we leave it as-is?
Does anybody know what the typical approach is for real devices?
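
For what it's worth, QEMU's msix_init() takes the BAR and offset for the
table and for the PBA separately, so either placement comes down to the
arguments passed in. A sketch (nentries and pba_offset are placeholders,
not the exact patch code):

    Error *err = NULL;
    /* MSI-X table at offset 0 of BAR4, PBA right behind it */
    int ret = msix_init(pci_dev, nentries,
                        &n->bar4, NVME_MSIX_BIR, 0,          /* table */
                        &n->bar4, NVME_MSIX_BIR, pba_offset, /* PBA   */
                        0, &err);
    if (ret) {
        /* report err and bail out */
    }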
> 
> Using BAR0 would also slightly simplify the patch since no changes would
> be required for the CMB path.
> 
> Also, can you rebase this on Kevin's block branch? There are a bunch of
> refactoring patches that change the realization code, so this patch
> doesn't apply at all.
Yep, will rebase it.
> 
>> Signed-off-by: Andrzej Jakowski 
>> ---
>>  hw/block/nvme.c  | 127 +--
>>  hw/block/nvme.h  |   3 +-
>>  include/block/nvme.h |   4 +-
>>  3 files changed, 91 insertions(+), 43 deletions(-)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index f0b45704be..353cf20e0a 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -22,12 +22,12 @@
>>   *  [pmrdev=,] \
>>   *  num_queues=
>>   *
>> - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>> - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
>> + * Note cmb_size_mb denotes size of CMB in MB. CMB, when configured, is assumed
>> + * to be resident in BAR4 at a certain offset - this is because BAR4 is also
>> + * used for storing the MSI-X table, which is available at offset 0 in BAR4.
>>   *
>> - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
>> - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
>> - * both provided.
>> + * pmrdev is assumed to be resident in BAR2. When configured, it consumes
>> + * the whole of BAR2 exclusively.
> 
> Actually it uses both BAR2 and BAR3 since it's 64 bits.
Correct. That's what I implied here w/o actual verbiage. I can extend it
to add that information.
> 
>> @@ -1342,6 +1346,71 @@ static const MemoryRegionOps nvme_cmb_ops = {
>>  },
>>  };
>>  
>> +#define NVME_MSIX_BIR (4)
>> +static void nvme_bar4_init(PCIDevice *pci_dev)
>> +{
>> +NvmeCtrl *n = NVME(pci_dev);
>> +int status;
>> +uint64_t bar_size = 4096;
>> +uint32_t nvme_pba_offset = bar_size / 2;
>> +uint32_t nvme_pba_size = QEMU_ALIGN_UP(n->num_queues, 64) / 8;
>> +uint32_t cmb_size_units;
>> +
>> +if (n->num_queues * PCI_MSIX_ENTRY_SIZE > nvme_pba_offset) {
>> +nvme_pba_offset = n->num_queues * PCI_MSIX_ENTRY_SIZE;
>> +}
>> +
>> +if (nvme_pba_offset + nvme_pba_size > 4096) {
>> +bar_size = nvme_pba_offset + nvme_pba_size;
>> +}
>> +
> 
> This is migration compatibility stuff that is not needed because the
> nvme device is unmigratable anyway.
I don't understand that comment. Could you please explain more?
 




[PATCH v1 2/2] nvme: allow cmb and pmr to be enabled on same device

2020-06-05 Thread Andrzej Jakowski
So far it was not possible to have CMB and PMR emulated on the same
device, because BAR2 was used exclusively by either PMR or CMB. This
patch places the CMB at an offset in BAR4 so that it does not conflict
with the MSI-X vectors.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 127 +--
 hw/block/nvme.h  |   3 +-
 include/block/nvme.h |   4 +-
 3 files changed, 91 insertions(+), 43 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f0b45704be..353cf20e0a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -22,12 +22,12 @@
  *  [pmrdev=,] \
  *  num_queues=
  *
- * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ * Note cmb_size_mb denotes size of CMB in MB. CMB, when configured, is assumed
+ * to be resident in BAR4 at a certain offset - this is because BAR4 is also
+ * used for storing the MSI-X table, which is available at offset 0 in BAR4.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
+ * pmrdev is assumed to be resident in BAR2. When configured, it consumes the
+ * whole of BAR2 exclusively.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
@@ -64,9 +64,10 @@ static void nvme_process_sq(void *opaque);
 
 static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->cmbsz && addr >= n->ctrl_mem.addr &&
-addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
-memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
+if (n->cmbsz && addr >= cmb_addr &&
+(addr + size) <= (cmb_addr + NVME_CMBSZ_GETSIZE(n->bar.cmbsz))) {
+memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size);
 } else {
 pci_dma_read(>parent_obj, addr, buf, size);
 }
@@ -152,17 +153,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
QEMUIOVector *iov, uint64_t prp1,
  uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
+hwaddr cmb_addr = n->bar4.addr + n->cmb_offset;
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 
 if (unlikely(!prp1)) {
 trace_nvme_err_invalid_prp();
 return NVME_INVALID_FIELD | NVME_DNR;
-} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
-   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+} else if (n->cmbsz && prp1 >= cmb_addr &&
+   prp1 < cmb_addr + int128_get64(n->bar4.size)) {
 qsg->nsg = 0;
 qemu_iovec_init(iov, num_prps);
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
 qemu_sglist_add(qsg, prp1, trans_len);
@@ -207,7 +209,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg){
 qemu_sglist_add(qsg, prp_ent, trans_len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr],
+   trans_len);
 }
 len -= trans_len;
 i++;
@@ -220,7 +223,8 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector 
*iov, uint64_t prp1,
 if (qsg->nsg) {
 qemu_sglist_add(qsg, prp2, len);
 } else {
-qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr],
+   trans_len);
 }
 }
 }
@@ -1342,6 +1346,71 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
+#define NVME_MSIX_BIR (4)
+static void nvme_bar4_init(PCIDevice *pci_dev)
+{
+NvmeCtrl *n = NVME(pci_dev);
+int status;
+uint64_t bar_size = 4096;
+uint32_t nvme_pba_offset = bar_size / 2;
+uint32_t nvme_pba_size = QEMU_ALIGN_UP(n->num_queues, 64) / 8;
+uint32_t cmb_size_units;
+
+if (n->num_queues * PCI_MSIX_ENTRY_SIZE > nvme_pba_offset) {
+nvme_pba_offset = n->num_queues * PCI_MSIX_ENTRY_SIZE;
+}
+
+if (nvme_pba_offset + nvme_pba_size > 4096) {
+bar_size = nvme_pba_offset + nvme_pba_size;
+}
+
+if (n->c

[PATCH v1] nvme: allow cmb and pmr emulation on same device

2020-06-05 Thread Andrzej Jakowski
This patch series does the following:
 - Fixes problem where CMBS bit was not set in controller capabilities
   register, so support for CMB was not correctly advertised to guest.
   This is resend of patch that has been submitted and reviewed by
   Klaus [1]
 - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows
   to have CMB and PMR emulated on the same device. This extension
   was indicated by Keith [2]

[1]: 
https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/
[2]: 
https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/




[PATCH v1 1/2] nvme: indicate CMB support through controller capabilities register

2020-06-05 Thread Andrzej Jakowski
This patch sets the CMBS bit in the controller capabilities register when the
user configures the NVMe driver with CMB support, so capabilities are correctly
reported to the guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c  | 2 ++
 include/block/nvme.h | 6 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a21eeca2fb..f0b45704be 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1449,6 +1449,8 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 n->bar.intmc = n->bar.intms = 0;
 
 if (n->cmb_size_mb) {
+/* Controller capabilities */
+NVME_CAP_SET_CMBS(n->bar.cap, 1);
 
 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
 NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 5525c8e343..b48349dbd0 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -35,6 +35,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -48,6 +49,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -78,8 +80,10 @@ enum NvmeCapMask {
<< CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   \
                                                << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   \
+                                               << CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.21.1




Re: [PATCH for QEMU] hw/vfio: Add VMD Passthrough Quirk

2020-04-22 Thread Andrzej Jakowski
> +pci_byte_test_and_set_mask(pdev->config + VMD_VMLOCK, 0x2);
> +pci_set_byte(pdev->wmask + VMD_VMLOCK, 0);
> +pci_set_byte(vdev->emulated_config_bits + VMD_VMLOCK, 0x2);
> +
> +trace_vfio_pci_vmd_quirk_vmlock(vdev->vbasedev.name,
> +pci_get_byte(pdev->config + VMD_VMLOCK));
> +
> +/* Drivers can match the subsystem vendor/device id */
> +pci_set_word(pdev->config + PCI_SUBSYSTEM_VENDOR_ID,
> + PCI_SUBVENDOR_ID_REDHAT_QUMRANET);
> +pci_set_word(vdev->emulated_config_bits + PCI_SUBSYSTEM_VENDOR_ID, ~0);
> +
> +pci_set_word(pdev->config + PCI_SUBSYSTEM_ID, PCI_SUBDEVICE_ID_QEMU);
> +pci_set_word(vdev->emulated_config_bits + PCI_SUBSYSTEM_ID, ~0);
> +
> +trace_vfio_pci_vmd_quirk_subsystem(vdev->vbasedev.name,
> +   vdev->sub_vendor_id, vdev->sub_device_id,
> +   pci_get_word(pdev->config + 
> PCI_SUBSYSTEM_VENDOR_ID),
> +   pci_get_word(pdev->config + PCI_SUBSYSTEM_ID));
> +
> +return 0;
> +}
> +
> +int vfio_pci_vmd_init(VFIOPCIDevice *vdev)
> +{
> +int ret = 0;
> +
> +switch (vdev->device_id) {
> +case 0x28C0: /* Native passthrough support */
> +break;
> +/* Emulates Native passthrough support */
> +case 0x201D:
> +case 0x467F:
> +case 0x4C3D:
> +case 0x9A0B:
> +ret = vfio_vmd_emulate_shadow_registers(vdev);
> +break;
> +}
> +
> +return ret;
> +}
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 5e75a95129..85425a1a6f 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -3024,6 +3024,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>  }
>  }
>  
> +if (vdev->vendor_id == PCI_VENDOR_ID_INTEL) {
> +ret = vfio_pci_vmd_init(vdev);
> +if (ret) {
> +error_report("Failed to setup VMD");
> +}
> +}
> +
>  vfio_register_err_notifier(vdev);
>  vfio_register_req_notifier(vdev);
>  vfio_setup_resetfn_quirk(vdev);
> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> index 0da7a20a7e..e8632d806b 100644
> --- a/hw/vfio/pci.h
> +++ b/hw/vfio/pci.h
> @@ -217,6 +217,8 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
>  int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp);
>  int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp);
>  
> +int vfio_pci_vmd_init(VFIOPCIDevice *vdev);
> +
>  void vfio_display_reset(VFIOPCIDevice *vdev);
>  int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
>  void vfio_display_finalize(VFIOPCIDevice *vdev);
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index b1ef55a33f..aabbd2693a 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -90,6 +90,10 @@ vfio_pci_nvidia_gpu_setup_quirk(const char *name, uint64_t 
> tgt, uint64_t size) "
>  vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, uint64_t 
> size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64
>  vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t link_speed) 
> "%s link_speed=0x%x"
>  
> +vfio_pci_vmd_quirk_shadow_regs(const char *name, uint64_t mb1, uint64_t mb2) 
> "%s membar1_phys=0x%"PRIx64" membar2_phys=0x%"PRIx64"
> +vfio_pci_vmd_quirk_vmlock(const char *name, uint8_t vmlock) "%s vmlock=0x%x"
> +vfio_pci_vmd_quirk_subsystem(const char *name, uint16_t old_svid, uint16_t 
> old_sdid, uint16_t new_svid, uint16_t new_sdid) "%s subsystem id 
> 0x%04x:0x%04x -> 0x%04x:0x%04x"
> +
>  # common.c
>  vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, 
> unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
>  vfio_region_read(char *name, int index, uint64_t addr, unsigned size, 
> uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64

Reviewed-by: Andrzej Jakowski 




[PATCH v1] nvme: indicate CMB support through controller capabilities register

2020-04-01 Thread Andrzej Jakowski
This patch sets the CMBS bit in the controller capabilities register when the
user configures the NVMe driver with CMB support, so capabilities are correctly
reported to the guest OS.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c  | 2 ++
 include/block/nvme.h | 4 
 2 files changed, 6 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..986803398f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1393,6 +1393,8 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 n->bar.intmc = n->bar.intms = 0;
 
 if (n->cmb_size_mb) {
+/* Controller capabilities */
+NVME_CAP_SET_CMBS(n->bar.cap, 1);
 
 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
 NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8fb941c653..561891b140 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -27,6 +27,7 @@ enum NvmeCapShift {
 CAP_CSS_SHIFT  = 37,
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -39,6 +40,7 @@ enum NvmeCapMask {
 CAP_CSS_MASK   = 0xff,
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -69,6 +71,8 @@ enum NvmeCapMask {
<< CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
                                                << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMB_MASK)\
+<< CAP_CMB_SHIFT)
 
 enum NvmeCcShift {
 CC_EN_SHIFT = 0,
-- 
2.21.1




[PATCH RESEND v4] nvme: introduce PMR support from NVMe 1.4 spec

2020-03-30 Thread Andrzej Jakowski
This patch introduces support for the PMR that has been defined as part of the
NVMe 1.4 spec. The user can now specify a pmrdev option that should point to a
HostMemoryBackend. The pmrdev memory region will subsequently be exposed as
PCI BAR 2 of the emulated NVMe device. The guest OS can perform MMIO reads and
writes to the PMR region, and they will stay persistent across a system reboot.
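
A hypothetical invocation, with placeholder names and a placeholder 1G size
(the PMR backing size must be a power of two):

    -object memory-backend-file,id=pmr0,share=on,mem-path=/var/tmp/pmr.raw,size=1G \
    -device nvme,drive=nvme0,serial=pmrtest,pmrdev=pmr0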

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
Reviewed-by: Stefan Hajnoczi 
---
Changelog:

v4:
 - replaced qemu_msync() use with qemu_ram_writeback() to allow pmem_persist()
   or qemu_msync() to be called depending on configuration [4] (Stefan)

 - rephrased comments to improve clarity and fixed code style issues [4]
   (Stefan, Klaus)

v3:
 - reworked PMR to use HostMemoryBackend instead of directly mapping PMR
   backend file into qemu [1] (Stefan)

v2:
 - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
   improved performance in virtualized environment [2] (Stefan)

 - added a check that the pmr size is a power of two [3] (David)

 - addressed cross compilation build problems reported by CI environment

v1:
 - inital push of PMR emulation

[1]: 
https://lore.kernel.org/qemu-devel/20200306223853.37958-1-andrzej.jakow...@linux.intel.com/
[2]: 
https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
[3]: 
https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakow...@linux.intel.com/
[4]: 
https://lore.kernel.org/qemu-devel/20200318200303.11322-1-andrzej.jakow...@linux.intel.com/
 
---
Persistent Memory Region (PMR) is a new optional feature provided in the
NVMe 1.4 specification. This patch implements initial support for it in the
NVMe driver.
---
 hw/block/Makefile.objs |   2 +-
 hw/block/nvme.c| 109 ++
 hw/block/nvme.h|   2 +
 hw/block/trace-events  |   4 +
 include/block/nvme.h   | 172 +
 5 files changed, 288 insertions(+), 1 deletion(-)

diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
index 4b4a2b338d..47960b5f0d 100644
--- a/hw/block/Makefile.objs
+++ b/hw/block/Makefile.objs
@@ -7,12 +7,12 @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
 common-obj-$(CONFIG_XEN) += xen-block.o
 common-obj-$(CONFIG_ECC) += ecc.o
 common-obj-$(CONFIG_ONENAND) += onenand.o
-common-obj-$(CONFIG_NVME_PCI) += nvme.o
 common-obj-$(CONFIG_SWIM) += swim.o
 
 common-obj-$(CONFIG_SH4) += tc58128.o
 
 obj-$(CONFIG_VIRTIO_BLK) += virtio-blk.o
 obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk.o
+obj-$(CONFIG_NVME_PCI) += nvme.o
 
 obj-y += dataplane/
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..9b453423cf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,10 +19,19 @@
 *  -drive file=<file>,if=none,id=<drive_id>
 *  -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
 *  cmb_size_mb=<cmb_size_mb[optional]>, \
+ *  [pmrdev=<mem_backend_file_id>,] \
 *  num_queues=<N[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
+ * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
+ * both provided.
+ * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
+ * For example:
+ * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
+ *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
  */
 
 #include "qemu/osdep.h"
@@ -35,7 +44,9 @@
 #include "sysemu/sysemu.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
+#include "sysemu/hostmem.h"
 #include "sysemu/block-backend.h"
+#include "exec/ram_addr.h"
 
 #include "qemu/log.h"
 #include "qemu/module.h"
@@ -1141,6 +1152,26 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
"invalid write to read only CMBSZ, ignored");
 return;
+case 0xE00: /* PMRCAP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+   "invalid write to PMRCAP register, ignored");
+return;
+case 0xE04: /* TODO PMRCTL */
+break;
+case 0xE08: /* PMRSTS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+   "invalid write to PMRSTS register, ignored");
+return;
+case 0xE0C: /* PMREBS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+   "invalid write to PMREBS register, ignored");
+return;
+case 0xE10: /* PMRSWTP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+   "invalid write to PMRSWTP register, ignored");
+return;
+case 0xE14: /* TODO PMRMSC */
+ break;
 default:
NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
   "invalid MMIO write,"

Re: [PATCH v4] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-24 Thread Andrzej Jakowski
On 3/23/20 6:28 AM, Stefan Hajnoczi wrote:
> Excellent, thank you!
> 
> Reviewed-by: Stefan Hajnoczi 

Awesome, thx! I'm not sure about the process...
Is this patch now staged for inclusion in QEMU?



[PATCH v4] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-20 Thread Andrzej Jakowski
This patch introduces support for PMR, which has been defined as part of the
NVMe 1.4 spec. The user can now specify a pmrdev option that should point to a
HostMemoryBackend. The pmrdev memory region will subsequently be exposed as
PCI BAR 2 in the emulated NVMe device. The guest OS can perform MMIO reads and
writes to the PMR region, which will stay persistent across system reboots.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Klaus Jensen 
---
v3:
 - replaced qemu_msync() use with qemu_ram_writeback() to allow pmem_persist()
   or qemu_msync() to be called depending on the configuration [4] (Stefan)
 - rephrased comments to improve clarity and fixed code style issues [4]
   (Stefan, Klaus)

v2:
 - reworked PMR to use HostMemoryBackend instead of directly mapping PMR
   backend file into qemu [1] (Stefan)

v1:
 - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
   improved performance in virtualized environment [2] (Stefan)

 - added a check that the pmr size is a power of two [3] (David)

 - addressed cross compilation build problems reported by CI environment

[1]: 
https://lore.kernel.org/qemu-devel/20200306223853.37958-1-andrzej.jakow...@linux.intel.com/
[2]: 
https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
[3]: 
https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakow...@linux.intel.com/
[4]: 
https://lore.kernel.org/qemu-devel/20200318200303.11322-1-andrzej.jakow...@linux.intel.com/
---
Persistent Memory Region (PMR) is a new optional feature provided in the
NVMe 1.4 specification. This patch implements initial support for it in the
NVMe driver.
---
 hw/block/Makefile.objs |   2 +-
 hw/block/nvme.c| 109 ++
 hw/block/nvme.h|   2 +
 hw/block/trace-events  |   4 +
 include/block/nvme.h   | 172 +
 5 files changed, 288 insertions(+), 1 deletion(-)

diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
index 4b4a2b338d..47960b5f0d 100644
--- a/hw/block/Makefile.objs
+++ b/hw/block/Makefile.objs
@@ -7,12 +7,12 @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
 common-obj-$(CONFIG_XEN) += xen-block.o
 common-obj-$(CONFIG_ECC) += ecc.o
 common-obj-$(CONFIG_ONENAND) += onenand.o
-common-obj-$(CONFIG_NVME_PCI) += nvme.o
 common-obj-$(CONFIG_SWIM) += swim.o
 
 common-obj-$(CONFIG_SH4) += tc58128.o
 
 obj-$(CONFIG_VIRTIO_BLK) += virtio-blk.o
 obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk.o
+obj-$(CONFIG_NVME_PCI) += nvme.o
 
 obj-y += dataplane/
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..9b453423cf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,10 +19,19 @@
 *  -drive file=<file>,if=none,id=<drive_id>
 *  -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
 *  cmb_size_mb=<cmb_size_mb[optional]>, \
+ *  [pmrdev=<mem_backend_file_id>,] \
 *  num_queues=<N[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
+ * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
+ * both provided.
+ * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
+ * For example:
+ * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
+ *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
  */
 
 #include "qemu/osdep.h"
@@ -35,7 +44,9 @@
 #include "sysemu/sysemu.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
+#include "sysemu/hostmem.h"
 #include "sysemu/block-backend.h"
+#include "exec/ram_addr.h"
 
 #include "qemu/log.h"
 #include "qemu/module.h"
@@ -1141,6 +1152,26 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
"invalid write to read only CMBSZ, ignored");
 return;
+case 0xE00: /* PMRCAP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+   "invalid write to PMRCAP register, ignored");
+return;
+case 0xE04: /* TODO PMRCTL */
+break;
+case 0xE08: /* PMRSTS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+   "invalid write to PMRSTS register, ignored");
+return;
+case 0xE0C: /* PMREBS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+   "invalid write to PMREBS register, ignored");
+return;
+case 0xE10: /* PMRSWTP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+   "invalid write to PMRSWTP register, ignored");
+return;
+case 0xE14: /* TODO PMRMSC */
+ break;
 default:
 NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
"invalid MMIO write,"
@@ -1169,6 +1200,16 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)

Re: [PATCH v3] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-20 Thread Andrzej Jakowski
On 3/20/20 8:45 AM, Stefan Hajnoczi wrote:
> Please use qemu_ram_writeback() so that pmem_persist() and qemu_msync()
> are used as appropriate.

Thx!
qemu_ram_writeback() doesn't return any status. How can I know that the actual
msync succeeds?

Also, qemu_ram_writeback() requires me to #include "exec/ram_addr.h".
After including it, when I compile the code I get the following error:

In file included from hw/block/nvme.c:49:
/root/sources/pmr/qemu/include/exec/ram_addr.h:23:10: fatal error: cpu.h: No 
such file or directory
   23 | #include "cpu.h"
  |  ^~~
compilation terminated.
make: *** [/root/sources/pmr/qemu/rules.mak:69: hw/block/nvme.o] Error 1

Why is this happening, and what should be changed?



[PATCH v3] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-18 Thread Andrzej Jakowski
This patch introduces support for PMR, which has been defined as part of the
NVMe 1.4 spec. The user can now specify a pmrdev option that should point to a
HostMemoryBackend. The pmrdev memory region will subsequently be exposed as
PCI BAR 2 in the emulated NVMe device. The guest OS can perform MMIO reads and
writes to the PMR region, which will stay persistent across system reboots.

Signed-off-by: Andrzej Jakowski 
---
v2:
 - reworked PMR to use HostMemoryBackend instead of directly mapping PMR
   backend file into qemu [1] (Stefan)

v1:
 - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
   improved performance in virtualized environment [2] (Stefan)

 - added a check that the pmr size is a power of two [3] (David)

 - addressed cross compilation build problems reported by CI environment

[1]: 
https://lore.kernel.org/qemu-devel/20200306223853.37958-1-andrzej.jakow...@linux.intel.com/
[2]: 
https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
[3]: 
https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakow...@linux.intel.com/
---
Persistent Memory Region (PMR) is a new optional feature provided in the
NVMe 1.4 specification. This patch implements initial support for it in the
NVMe driver.
---
 hw/block/nvme.c   | 117 +++-
 hw/block/nvme.h   |   2 +
 hw/block/trace-events |   5 ++
 include/block/nvme.h  | 172 ++
 4 files changed, 294 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..70fd09d293 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,10 +19,18 @@
 *  -drive file=<file>,if=none,id=<drive_id>
 *  -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
 *  cmb_size_mb=<cmb_size_mb[optional]>, \
+ *  [pmrdev=<mem_backend_file_id>,] \
 *  num_queues=<N[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * Either cmb or pmr - due to limitation in available BAR indexes.
+ * pmr_file file needs to be power of two in size.
+ * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
+ * For example:
+ * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
+ *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
  */
 
 #include "qemu/osdep.h"
@@ -35,7 +43,9 @@
 #include "sysemu/sysemu.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
+#include "sysemu/hostmem.h"
 #include "sysemu/block-backend.h"
+#include "exec/ramblock.h"
 
 #include "qemu/log.h"
 #include "qemu/module.h"
@@ -1141,6 +1151,26 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
"invalid write to read only CMBSZ, ignored");
 return;
+case 0xE00: /* PMRCAP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+   "invalid write to PMRCAP register, ignored");
+return;
+case 0xE04: /* TODO PMRCTL */
+break;
+case 0xE08: /* PMRSTS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+   "invalid write to PMRSTS register, ignored");
+return;
+case 0xE0C: /* PMREBS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+   "invalid write to PMREBS register, ignored");
+return;
+case 0xE10: /* PMRSWTP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+   "invalid write to PMRSWTP register, ignored");
+return;
+case 0xE14: /* TODO PMRMSC */
+ break;
 default:
 NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
"invalid MMIO write,"
@@ -1169,6 +1199,23 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
 }
 
 if (addr < sizeof(n->bar)) {
+/*
+ * When PMRWBM bit 1 is set then a read from
+ * PMRSTS should ensure prior writes
+ * made it to persistent media
+ */
+if (addr == 0xE08 &&
+(NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02) >> 1) {
+int status;
+
+status = qemu_msync((void *)n->pmrdev->mr.ram_block->host,
+n->pmrdev->size,
+n->pmrdev->mr.ram_block->fd);
+if (status) {
+NVME_GUEST_ERR(nvme_ub_mmiord_pmrread_barrier,
+   "error while persisting data");
+}
+}
memcpy(&val, ptr + addr, size);
 } else {
 NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
@@ -1332,6 +1379,23 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 error_setg(errp, "serial property not set");
 return;
 }
+
+if (!n->cmb_size

Re: [PATCH RESEND v2] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-17 Thread Andrzej Jakowski
On 3/17/20 4:23 AM, Stefan Hajnoczi wrote:
>> Code is posted here
>> https://github.com/AndrzejJakowski/qemu/commit/3a7762a1d13ff1543d1da430748eb24e38faab6f
>>
>> QEMU command line:
>>
>> # below are just relevant pieces of configuration, other stuff omitted
>> # tried different setting (e.g. pmem=on and pmem=off)
>>
>> ./x86_64-softmmu/qemu-system-x86_64 ... \
>> -object 
>> memory-backend-file,id=mem1,share=off,pmem=on,mem-path=../nvme_pmr.bin,size=$((1*1024*1024))
>>  \
> share=off is MAP_PRIVATE.  If persistence is desired then share=on
> should be used.
> 
> However, this shouldn't affect "system_reset" behavior since the QEMU
> process still has the same mapped file open.
> 

Hi Stefan,

Thx!! The share=off setting was the problem. I confirmed with my simple test
that persistence is achieved.
I didn't find an API to perform the flush (msync). Any suggestion on what
function to use?
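
A minimal sketch of the flush I can do today -- a plain msync() of the mapped
region (nvme_flush_pmr() is a made-up name; pmrbuf and pmr_size stand for the
mapped pointer and size):

#include <sys/mman.h>

/* Sketch only: flush the whole mapped PMR region to its backing file.
 * msync() returns 0 on success and -1 on error. */
static int nvme_flush_pmr(void *pmrbuf, size_t pmr_size)
{
    return msync(pmrbuf, pmr_size, MS_SYNC);
}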

Given that the host memory backend is working, I think my patch is almost ready
for resubmission -- let me know if there are any other comments.

Andrzej

>> -drive file=../nvme.bin,format=raw,if=none,id=nvme_emulated \
>> -device nvme,drive=nvme_emulated,serial="test serial",pmrdev=mem1
>>
>> In VM:
>> My persistent memory region is exposed as a PCI BAR:
>> Region 2: Memory at fe000000 (64-bit, prefetchable) [size=1M]
>>
>> So I perform reads/writes from/to the following address 0xfe000000 (decimal
>> 4261412864):
>>
>> dd if=test.bin of=/dev/mem bs=1 count=30 seek=4261412864
>> dd if=/dev/mem of=test1.bin bs=1 count=30 skip=4261412864
> Did you verify that the guest kernel is really accessing the BAR?  I
> remember that distro kernels often ship with options that make
> /dev/mem of limited use because it's considered insecure.
> 
>> On the VMM side I didn't observe that the backing file was updated, and after
>> power cycling the VM I see old junk when reading the PMR region.
> Did you check that the pmrdev mmap region contains the data the guest
> wrote before power cycling?
> 
>> Also, from include/qemu/pmem.h it looks like pmem_persist() will cause qemu
>> to exit if libpmem is not installed:
> The libpmem support only needs to be used when the pmem=on option was
> given.  If there isn't a physical pmem device then it doesn't need to
> be used.
> 
> Stefan




Re: [PATCH RESEND v2] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-16 Thread Andrzej Jakowski
On 3/16/20 4:32 AM, Stefan Hajnoczi wrote:
> On Wed, Mar 11, 2020 at 11:08:27PM -0700, Klaus Birkelund Jensen wrote:
>> On Mar 11 15:54, Andrzej Jakowski wrote:
>>> On 3/11/20 2:20 AM, Stefan Hajnoczi wrote:
>>>> Please try:
>>>>
>>>>   $ git grep pmem
>>>>
>>>> backends/hostmem-file.c is the backend that can be used and the
>>>> pmem_persist() API can be used to flush writes.
>>> I've reworked this patch into hostmem-file type of backend.
>>> From simple tests in virtual machine: writing to PMR region
>>> and then reading from it after VM power cycle I have observed that
>>> there is no persistency.
> Sounds like an integration bug.  QEMU's NVDIMM emulation uses
> HostMemoryBackend and file contents survive guest reboot.
> 
> If you would like help debugging this, please post a link to the code
> and the command-line that you are using.
> 

Code is posted here
https://github.com/AndrzejJakowski/qemu/commit/3a7762a1d13ff1543d1da430748eb24e38faab6f

QEMU command line:

# below are just relevant pieces of configuration, other stuff omitted
# tried different setting (e.g. pmem=on and pmem=off)

./x86_64-softmmu/qemu-system-x86_64 ... \
-object 
memory-backend-file,id=mem1,share=off,pmem=on,mem-path=../nvme_pmr.bin,size=$((1*1024*1024))
 \
-drive file=../nvme.bin,format=raw,if=none,id=nvme_emulated \
-device nvme,drive=nvme_emulated,serial="test serial",pmrdev=mem1 

In VM:
My persistent memory region is exposed as a PCI BAR:
Region 2: Memory at fe000000 (64-bit, prefetchable) [size=1M]

So I perform reads/writes from/to the following address 0xfe000000 (decimal
4261412864):

dd if=test.bin of=/dev/mem bs=1 count=30 seek=4261412864
dd if=/dev/mem of=test1.bin bs=1 count=30 skip=4261412864

On the VMM side I didn't observe that the backing file was updated, and after
power cycling the VM I see old junk when reading the PMR region.

Also, from include/qemu/pmem.h it looks like pmem_persist() will cause qemu to
exit if libpmem is not installed:

#ifndef QEMU_PMEM_H
#define QEMU_PMEM_H

#ifdef CONFIG_LIBPMEM
#include <libpmem.h>
#else  /* !CONFIG_LIBPMEM */

static inline void *
pmem_memcpy_persist(void *pmemdest, const void *src, size_t len)
{
/* If 'pmem' option is 'on', we should always have libpmem support,
   or qemu will report a error and exit, never come here. */
g_assert_not_reached();
return NULL;
}

static inline void
pmem_persist(const void *addr, size_t len)
{
g_assert_not_reached();
}

#endif /* CONFIG_LIBPMEM */

#endif /* QEMU_PMEM_H */ 



Re: [PATCH RESEND v2] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-11 Thread Andrzej Jakowski
On 3/11/20 2:20 AM, Stefan Hajnoczi wrote:
> Please try:
> 
>   $ git grep pmem
> 
> backends/hostmem-file.c is the backend that can be used and the
> pmem_persist() API can be used to flush writes.

I've reworked this patch into a hostmem-file type of backend.
From simple tests in a virtual machine (writing to the PMR region and then
reading from it after a VM power cycle) I have observed that there is no
persistence.

I guess that persistent behavior can be achieved if the memory backend file
resides on actual persistent memory in the VMM. I haven't found a mechanism to
persist the memory backend file when it resides in a file system on block
storage. My original mmap + msync based solution worked well there.
I believe the main problem with mmap was the "ifdef _WIN32" that made it
platform specific, and without it the patchew CI complained.
Is there a way that I could rework the mmap + msync solution so it would fit
into the QEMU design?
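
For reference, in the reworked version the device reaches the backing memory
roughly like this (a sketch; pmrdev is the HostMemoryBackend property and
memory_region_get_ram_ptr() returns its host mapping):

/* Sketch only: with a HostMemoryBackend the host pointer comes from the
 * backend's MemoryRegion, so the device no longer mmap()s anything itself. */
HostMemoryBackend *backend = n->pmrdev;
uint8_t *host = memory_region_get_ram_ptr(&backend->mr);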




Re: [PATCH RESEND v2] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-11 Thread Andrzej Jakowski
On 3/11/20 2:20 AM, Stefan Hajnoczi wrote:
> Oh, I think I see what you mean.  That is not how the term
> "preallocated" is usually used in POSIX file systems.  File systems
> have sparse files by default and the term preallocation is used in the
> context of fadvise(2) for reserving space.
> 
> In this case I think you're saying the file cannot grow.  That is
> implicit since the BAR can't grow either so you could drop the comment
> about preallocation.

Yes, there is no need to have the file preallocated in the POSIX sense. The
actual requirement is to have a file that is a multiple of MiB and a power of
two in size. The user may (but need not) use fallocate/fadvise to fulfill this
requirement.
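
Expressed as code, the constraint is simply (a sketch; the helper name is made
up, is_power_of_2() comes from "qemu/host-utils.h" and MiB from "qemu/units.h"):

/* Sketch only: non-zero, a multiple of MiB, and a power of two. */
static bool nvme_pmr_size_is_valid(uint64_t size)
{
    return size != 0 && QEMU_IS_ALIGNED(size, MiB) && is_power_of_2(size);
}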



Re: [PATCH RESEND v2] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-10 Thread Andrzej Jakowski
On 3/10/20 2:51 AM, Stefan Hajnoczi wrote:
> On Fri, Mar 06, 2020 at 03:38:53PM -0700, Andrzej Jakowski wrote:
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index d28335cbf3..ff7e74d765 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -19,10 +19,14 @@
>>   *  -drive file=,if=none,id=
>>   *  -device nvme,drive=,serial=,id=, \
>>   *  cmb_size_mb=, \
>> + *  [pmr_file=,] \
>>   *  num_queues=
>>   *
>>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
>> + *
>> + * Either cmb or pmr - due to limitation in avaialbe BAR indexes.
> 
> s/avaialbe/available/
> 
>> + * pmr_file file needs to be preallocated and power of two in size.
> 
> Why does it need to be preallocated?

The PMR file is mmap'ed into the address space. If memory accesses are made
outside of the file then a SIGBUS signal is raised. The preallocation
requirement was introduced to prevent this situation.
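
The guard I have in mind looks roughly like this (a sketch; fd and
expected_size are illustrative names):

/* Sketch only: an mmap'ed access past EOF raises SIGBUS, so refuse a
 * backing file shorter than the region we are about to expose. */
struct stat st;

if (fstat(fd, &st) < 0 || (uint64_t)st.st_size < expected_size) {
    error_setg(errp, "pmr backend file smaller than requested PMR size");
    return;
}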

> 
>>   */
>>  
>>  #include "qemu/osdep.h"
>> @@ -1141,6 +1145,28 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr 
>> offset, uint64_t data,
>>  NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
>> "invalid write to read only CMBSZ, ignored");
>>  return;
>> +#ifndef _WIN32
> 
> This ifdef is a hint that the layering is not right.  QEMU device models
> usually only implement the "frontend" device registers, interrupts, and
> request processing logic.  The platform-specific host "backend"
> (mmapping files, sending network packets, audio/graphics APIs, etc) is
> implemented separately.

Agreed. I couldn't find a QEMU backend ensuring persistence - thus I decided to
go with mmap.

> 
> In the previous version I asked NVDIMM folks to review this patch and
> suggest how to use the same HostMemoryBackend (see
> include/sysemu/hostmem.h) that is already used for NVDIMM emulation.
> 
> That seems cleaner than baking platform-specific memory mapped file I/O
> into hw/block/nvme.c, and it will also add a few features that this
> patch does not have.
> 
> If NVDIMM folks don't respond to this email, would you be able to
> research backends/hostmem*.c and try to integrate it?  If you feel lost
> I can help but it will require me to spend time investigating how that
> stuff works again :).
> 

Yes, I can research this topic. Does HostMemoryBackend provide persistence?
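
From a first glance at backends/hostmem*.c, I would expect the wiring to be a
link property on the device, roughly like this (a sketch; pmrdev is the name I
would give the option):

static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND,
                     HostMemoryBackend *),
    DEFINE_PROP_END_OF_LIST(),
};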




[PATCH RESEND v2] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-06 Thread Andrzej Jakowski
This patch introduces support for PMR, which has been defined as part of the
NVMe 1.4 spec. The user can now specify a pmr_file which will be mmap'ed into
the QEMU address space and subsequently exposed as PCI BAR 2. The guest OS can
perform MMIO reads and writes to the PMR region, which will stay persistent
across system reboots.

Signed-off-by: Andrzej Jakowski 
---
Changes since v1:
 - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
   improved performance in virtualized environment [1] (Stefan)

 - added a check that the pmr size is a power of two (David)

 - addressed cross compilation build problems reported by CI environment

[1]: 
https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
[2]: 
https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakow...@linux.intel.com/
 
---

Persistent Memory Region (PMR) is a new optional feature provided in the
NVMe 1.4 specification. This patch implements initial support for it in the
NVMe driver.

 hw/block/nvme.c   | 165 +++-
 hw/block/nvme.h   |   5 ++
 hw/block/trace-events |   5 ++
 include/block/nvme.h  | 172 ++
 4 files changed, 346 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..ff7e74d765 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,10 +19,14 @@
 *  -drive file=<file>,if=none,id=<drive_id>
 *  -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
 *  cmb_size_mb=<cmb_size_mb[optional]>, \
+ *  [pmr_file=<file_name>,] \
 *  num_queues=<N[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * Either cmb or pmr - due to limitation in avaialbe BAR indexes.
+ * pmr_file file needs to be preallocated and power of two in size.
  */
 
 #include "qemu/osdep.h"
@@ -1141,6 +1145,28 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
"invalid write to read only CMBSZ, ignored");
 return;
+#ifndef _WIN32
+case 0xE00: /* PMRCAP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+   "invalid write to PMRCAP register, ignored");
+return;
+case 0xE04: /* TODO PMRCTL */
+break;
+case 0xE08: /* PMRSTS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+   "invalid write to PMRSTS register, ignored");
+return;
+case 0xE0C: /* PMREBS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+   "invalid write to PMREBS register, ignored");
+return;
+case 0xE10: /* PMRSWTP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+   "invalid write to PMRSWTP register, ignored");
+return;
+case 0xE14: /* TODO PMRMSC */
+ break;
+#endif /* !_WIN32 */
 default:
 NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
"invalid MMIO write,"
@@ -1169,6 +1195,22 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
 }
 
 if (addr < sizeof(n->bar)) {
+#ifndef _WIN32
+/*
+ * When PMRWBM bit 1 is set then a read from
+ * PMRSTS should ensure prior writes
+ * made it to persistent media
+ */
+if (addr == 0xE08 &&
+(NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02) >> 1) {
+int ret;
+ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
+if (ret) {
+NVME_GUEST_ERR(nvme_ub_mmiord_pmrread_barrier,
+   "error while persisting data");
+}
+}
+#endif /* !_WIN32 */
memcpy(&val, ptr + addr, size);
 } else {
 NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
@@ -1303,6 +1345,31 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
+#ifndef _WIN32
+static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data,
+unsigned size)
+{
+NvmeCtrl *n = (NvmeCtrl *)opaque;
+stn_le_p(&n->pmrbuf[addr], size, data);
+}
+
+static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size)
+{
+NvmeCtrl *n = (NvmeCtrl *)opaque;
+return ldn_le_p(&n->pmrbuf[addr], size);
+}
+
+static const MemoryRegionOps nvme_pmr_ops = {
+.read = nvme_pmr_read,
+.write = nvme_pmr_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.impl = {
+.min_access_size = 1,
+.max_access_size = 8,
+},
+};
+#endif /* !_WIN32 */
+
 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
@@ -1332,6 +1399,39 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 error_setg(errp, "serial property not set");
 return;
 }
+
+#ifndef _WIN32
+if (!n->cmb_size_mb && n

Re: [PATCH v2 1/1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-03-03 Thread Andrzej Jakowski
On 2/21/20 2:23 PM, Andrzej Jakowski wrote:
> This patch introduces support for PMR, which has been defined as part of the
> NVMe 1.4 spec. The user can now specify a pmr_file which will be mmap'ed into
> the QEMU address space and subsequently exposed as PCI BAR 2. The guest OS can
> perform MMIO reads and writes to the PMR region, which will stay persistent
> across system reboots.
> 
> Signed-off-by: Andrzej Jakowski 
> ---
>  hw/block/nvme.c   | 165 +++-
>  hw/block/nvme.h   |   5 ++
>  hw/block/trace-events |   5 ++
>  include/block/nvme.h  | 172 ++
>  4 files changed, 346 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index d28335cbf3..ff7e74d765 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,10 +19,14 @@
>   *  -drive file=<file>,if=none,id=<drive_id>
>   *  -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
>   *  cmb_size_mb=<cmb_size_mb[optional]>, \
> + *  [pmr_file=<file_name>,] \
>   *  num_queues=<N[optional]>
>   *
>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> + *
> + * Either cmb or pmr - due to limitation in avaialbe BAR indexes.
> + * pmr_file file needs to be preallocated and power of two in size.
>   */
>  
>  #include "qemu/osdep.h"
> @@ -1141,6 +1145,28 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
>  NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
> "invalid write to read only CMBSZ, ignored");
>  return;
> +#ifndef _WIN32
> +case 0xE00: /* PMRCAP */
> +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
> +   "invalid write to PMRCAP register, ignored");
> +return;
> +case 0xE04: /* TODO PMRCTL */
> +break;
> +case 0xE08: /* PMRSTS */
> +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
> +   "invalid write to PMRSTS register, ignored");
> +return;
> +case 0xE0C: /* PMREBS */
> +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
> +   "invalid write to PMREBS register, ignored");
> +return;
> +case 0xE10: /* PMRSWTP */
> +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
> +   "invalid write to PMRSWTP register, ignored");
> +return;
> +case 0xE14: /* TODO PMRMSC */
> + break;
> +#endif /* !_WIN32 */
>  default:
>  NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
> "invalid MMIO write,"
> @@ -1169,6 +1195,22 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
>  }
>  
>  if (addr < sizeof(n->bar)) {
> +#ifndef _WIN32
> +/*
> + * When PMRWBM bit 1 is set then a read from
> + * PMRSTS should ensure prior writes
> + * made it to persistent media
> + */
> +if (addr == 0xE08 &&
> +(NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02) >> 1) {
> +int ret;
> +ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
> +if (ret) {
> +NVME_GUEST_ERR(nvme_ub_mmiord_pmrread_barrier,
> +   "error while persisting data");
> +}
> +}
> +#endif /* !_WIN32 */
>  memcpy(&val, ptr + addr, size);
>  } else {
>  NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
> @@ -1303,6 +1345,31 @@ static const MemoryRegionOps nvme_cmb_ops = {
>  },
>  };
>  
> +#ifndef _WIN32
> +static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data,
> +unsigned size)
> +{
> +NvmeCtrl *n = (NvmeCtrl *)opaque;
> +stn_le_p(&n->pmrbuf[addr], size, data);
> +}
> +
> +static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +NvmeCtrl *n = (NvmeCtrl *)opaque;
> +return ldn_le_p(&n->pmrbuf[addr], size);
> +}
> +
> +static const MemoryRegionOps nvme_pmr_ops = {
> +.read = nvme_pmr_read,
> +.write = nvme_pmr_write,
> +.endianness = DEVICE_LITTLE_ENDIAN,
> +.impl = {
> +.min_access_size = 1,
> +.max_access_size = 8,
> +},
> +};
> +#endif /* !_WIN32 */
> +
>  static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>  {
>  NvmeCtrl *n = NVME(pci_dev);
> @@ -1332,6 +1399,39 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>  error_setg(errp, "serial property not set");
>  return;
>  }
> +
> +#ifndef _WIN

Re: [PATCH v2 1/1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-26 Thread Andrzej Jakowski
On 2/21/20 2:23 PM, Andrzej Jakowski wrote:
> This patch introduces support for PMR, which has been defined as part of the
> NVMe 1.4 spec. The user can now specify a pmr_file which will be mmap'ed into
> the QEMU address space and subsequently exposed as PCI BAR 2. The guest OS can
> perform MMIO reads and writes to the PMR region, which will stay persistent
> across system reboots.
> 
> Signed-off-by: Andrzej Jakowski 

Hi,

v2 addresses the feedback received on v1, and so far I haven't seen any other
comments.
Is there anything else needed for inclusion of this patch in the tree?

Thank you,
Andrzej



[PATCH v2 0/1] Enable PMR feature from NVMe 1.4 spec to NVMe driver

2020-02-21 Thread Andrzej Jakowski
Changes since v1:
 - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
   improved performance in virtualized environment [1] (Stefan)

 - added a check that the pmr size is a power of two (David)

 - addressed cross compilation build problems reported by CI environment

[1]: 
https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
[2]: 
https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakow...@linux.intel.com/
 
---

Persistent Memory Region (PMR) is a new optional feature provided in the
NVMe 1.4 specification. This patch implements initial support for it in the
NVMe driver.

Andrzej Jakowski (1):
  block/nvme: introduce PMR support from NVMe 1.4 spec

 hw/block/nvme.c   | 165 +++-
 hw/block/nvme.h   |   5 ++
 hw/block/trace-events |   5 ++
 include/block/nvme.h  | 172 ++
 4 files changed, 346 insertions(+), 1 deletion(-)

-- 
2.21.1




[PATCH v2 1/1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Andrzej Jakowski
This patch introduces support for PMR, which has been defined as part of the
NVMe 1.4 spec. The user can now specify a pmr_file which will be mmap'ed into
the QEMU address space and subsequently exposed as PCI BAR 2. The guest OS can
perform MMIO reads and writes to the PMR region, which will stay persistent
across system reboots.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c   | 165 +++-
 hw/block/nvme.h   |   5 ++
 hw/block/trace-events |   5 ++
 include/block/nvme.h  | 172 ++
 4 files changed, 346 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..ff7e74d765 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,10 +19,14 @@
 *  -drive file=<file>,if=none,id=<drive_id>
 *  -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
 *  cmb_size_mb=<cmb_size_mb[optional]>, \
+ *  [pmr_file=<file_name>,] \
 *  num_queues=<N[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * Either cmb or pmr - due to limitation in avaialbe BAR indexes.
+ * pmr_file file needs to be preallocated and power of two in size.
  */
 
 #include "qemu/osdep.h"
@@ -1141,6 +1145,28 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
"invalid write to read only CMBSZ, ignored");
 return;
+#ifndef _WIN32
+case 0xE00: /* PMRCAP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+   "invalid write to PMRCAP register, ignored");
+return;
+case 0xE04: /* TODO PMRCTL */
+break;
+case 0xE08: /* PMRSTS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+   "invalid write to PMRSTS register, ignored");
+return;
+case 0xE0C: /* PMREBS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+   "invalid write to PMREBS register, ignored");
+return;
+case 0xE10: /* PMRSWTP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+   "invalid write to PMRSWTP register, ignored");
+return;
+case 0xE14: /* TODO PMRMSC */
+ break;
+#endif /* !_WIN32 */
 default:
 NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
"invalid MMIO write,"
@@ -1169,6 +1195,22 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
 }
 
 if (addr < sizeof(n->bar)) {
+#ifndef _WIN32
+/*
+ * When PMRWBM bit 1 is set then a read from
+ * PMRSTS should ensure prior writes
+ * made it to persistent media
+ */
+if (addr == 0xE08 &&
+(NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02) >> 1) {
+int ret;
+ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
+if (ret) {
+NVME_GUEST_ERR(nvme_ub_mmiord_pmrread_barrier,
+   "error while persisting data");
+}
+}
+#endif /* !_WIN32 */
 memcpy(&val, ptr + addr, size);
 } else {
 NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
@@ -1303,6 +1345,31 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
+#ifndef _WIN32
+static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data,
+unsigned size)
+{
+NvmeCtrl *n = (NvmeCtrl *)opaque;
+stn_le_p(&n->pmrbuf[addr], size, data);
+}
+
+static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size)
+{
+NvmeCtrl *n = (NvmeCtrl *)opaque;
+return ldn_le_p(&n->pmrbuf[addr], size);
+}
+
+static const MemoryRegionOps nvme_pmr_ops = {
+.read = nvme_pmr_read,
+.write = nvme_pmr_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.impl = {
+.min_access_size = 1,
+.max_access_size = 8,
+},
+};
+#endif /* !_WIN32 */
+
 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
@@ -1332,6 +1399,39 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 error_setg(errp, "serial property not set");
 return;
 }
+
+#ifndef _WIN32
+if (!n->cmb_size_mb && n->pmr_file) {
+int fd;
+
+n->f_pmr = fopen(n->pmr_file, "r+b");
+if (!n->f_pmr) {
+error_setg(errp, "pmr backend file open error");
+return;
+}
+
+fseek(n->f_pmr, 0L, SEEK_END);
+n->f_pmr_size = ftell(n->f_pmr);
+fseek(n->f_pmr, 0L, SEEK_SET);
+
+/* pmr file size needs to be power of 2 in size */
+if (!n->f_pmr_size || !is_power_of_2(n->f_pmr_size)) {
+error_setg(errp, "pmr backend file size needs to be greater than 0"
+ "and

Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Andrzej Jakowski
On 2/21/20 12:32 PM, Stefan Hajnoczi wrote:
> Hi Andrzej,
> After having looked at the PMRWBM part of the spec, I think that the
> Bit 1 mode should be implemented for slightly better performance.  Bit
> 0 mode is not well-suited to virtualization for the reasons I
> mentioned in the previous email.
> 
> The spec describes Bit 1 mode as "The completion of a read to the
> PMRSTS register shall ensure that all prior writes to the Persistent
> Memory Region have completed and are persistent".
> 
> Stefan

Makes sense -- I will incorporate that feedback in the second version of the patch.
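
Concretely, I expect that to amount to advertising Bit 1 in the PMRWBM field
and syncing on PMRSTS reads, along these lines (a sketch; it assumes an
NVME_PMRCAP_SET_PMRWBM setter alongside the NVME_PMRCAP_PMRWBM getter):

/* Sketch only: Bit 1 = a read of PMRSTS persists all prior PMR writes. */
NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);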



Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Andrzej Jakowski
On 2/21/20 11:45 AM, Dr. David Alan Gilbert wrote:
> Isn't there also a requirement that BARs are powers of two? Wouldn't
> you need to ensure the PMR file is a power of 2 in size?
> 
> Dave

Agreed, I need to make sure it is a power of 2.
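
For context, the constraint comes from how the region is exposed (a sketch of
the BAR registration; the pmr_mr field name is illustrative):

/* Sketch only: PCI BAR sizes must be powers of two, so the memory region
 * registered as the 64-bit prefetchable BAR 2 must be sized accordingly. */
pci_register_bar(pci_dev, 2,
                 PCI_BASE_ADDRESS_SPACE_MEMORY |
                 PCI_BASE_ADDRESS_MEM_TYPE_64 |
                 PCI_BASE_ADDRESS_MEM_PREFETCH,
                 &n->pmr_mr);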



Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Andrzej Jakowski
On 2/21/20 6:45 AM, Stefan Hajnoczi wrote:
> Why is msync(2) done on memory loads instead of stores?

This is my interpretation of the NVMe spec wording with regard to the PMRWBM
field, which says:

"The completion of a memory read from any Persistent
Memory Region address ensures that all prior writes to the
Persistent Memory Region have completed and are
persistent."
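
In code form, that interpretation is what the msync-on-read path implements,
roughly (a sketch; pmrbuf and f_pmr_size are the fields from this patch, and
the function name is made up):

/* Sketch only: a read from any PMR address first persists all prior
 * PMR writes, then returns the requested bytes. */
static uint64_t nvme_pmr_read_persist(NvmeCtrl *n, hwaddr addr, unsigned size)
{
    if (msync(n->pmrbuf, n->f_pmr_size, MS_SYNC) != 0) {
        /* failed to persist prior writes; report via trace/log */
    }
    return ldn_le_p(&n->pmrbuf[addr], size);
}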



Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-19 Thread Andrzej Jakowski
On 2/18/20 6:07 PM, no-re...@patchew.org wrote:
> === TEST SCRIPT BEGIN ===
> #! /bin/bash
> export ARCH=x86_64
> make docker-image-fedora V=1 NETWORK=1
> time make docker-test-mingw@fedora J=14 NETWORK=1
> === TEST SCRIPT END ===
> 
>   CC  hw/display/sii9022.o
>   CC  hw/display/ssd0303.o
> /tmp/qemu-test/src/hw/block/nvme.c: In function 'nvme_pmr_read':
> /tmp/qemu-test/src/hw/block/nvme.c:1342:15: error: implicit declaration of 
> function 'msync'; did you mean 'fsync'? 
> [-Werror=implicit-function-declaration]
>  ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
>^
>fsync
> /tmp/qemu-test/src/hw/block/nvme.c:1342:15: error: nested extern declaration 
> of 'msync' [-Werror=nested-externs]
> /tmp/qemu-test/src/hw/block/nvme.c:1342:47: error: 'MS_SYNC' undeclared 
> (first use in this function)
>  ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
>^~~
> /tmp/qemu-test/src/hw/block/nvme.c:1342:47: note: each undeclared identifier 
> is reported only once for each function it appears in
> /tmp/qemu-test/src/hw/block/nvme.c: In function 'nvme_realize':
> /tmp/qemu-test/src/hw/block/nvme.c:1413:21: error: implicit declaration of 
> function 'mmap'; did you mean 'max'? [-Werror=implicit-function-declaration]
>  n->pmrbuf = mmap(NULL, n->f_pmr_size,
>  ^~~~

This patch seems to fail cross-compilation for the Windows environment.
I plan to submit a second version of this patch which will conditionally
support PMR for the Linux environment only. It should take care of this problem.

Do you see any better fix for that?



[PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-18 Thread Andrzej Jakowski
This patch introduces support for PMR, which has been defined as part of the
NVMe 1.4 spec. The user can now specify a pmr_file which will be mmap'ed into
the QEMU address space and subsequently exposed as PCI BAR 2. The guest OS can
perform MMIO reads and writes to the PMR region, which will stay persistent
across system reboots.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c   | 145 ++-
 hw/block/nvme.h   |   5 ++
 hw/block/trace-events |   5 ++
 include/block/nvme.h  | 172 ++
 4 files changed, 326 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..836cf8d426 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,10 +19,14 @@
 *  -drive file=<file>,if=none,id=<drive_id>
 *  -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
 *  cmb_size_mb=<cmb_size_mb[optional]>, \
+ *  [pmr_file=<file_name>,] \
 *  num_queues=<N[optional]>
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * Either cmb or pmr - due to limitation in avaialbe BAR indexes.
+ * pmr_file file needs to be preallocated and be multiple of MiB in size.
  */
 
 #include "qemu/osdep.h"
@@ -1141,6 +1145,26 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
 NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
"invalid write to read only CMBSZ, ignored");
 return;
+case 0xE00: /* PMRCAP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+   "invalid write to PMRCAP register, ignored");
+return;
+case 0xE04: /* TODO PMRCTL */
+break;
+case 0xE08: /* PMRSTS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+   "invalid write to PMRSTS register, ignored");
+return;
+case 0xE0C: /* PMREBS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+   "invalid write to PMREBS register, ignored");
+return;
+case 0xE10: /* PMRSWTP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+   "invalid write to PMRSWTP register, ignored");
+return;
+case 0xE14: /* TODO PMRMSC */
+ break;
 default:
 NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
"invalid MMIO write,"
@@ -1303,6 +1327,38 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
+static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data,
+unsigned size)
+{
+NvmeCtrl *n = (NvmeCtrl *)opaque;
+stn_le_p(&n->pmrbuf[addr], size, data);
+}
+
+static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size)
+{
+NvmeCtrl *n = (NvmeCtrl *)opaque;
+if (!NVME_PMRCAP_PMRWBM(n->bar.pmrcap)) {
+int ret;
+ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
+if (ret) {
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrread_barrier,
+   "error while persisting data");
+}
+}
+return ldn_le_p(&n->pmrbuf[addr], size);
+}
+
+static const MemoryRegionOps nvme_pmr_ops = {
+.read = nvme_pmr_read,
+.write = nvme_pmr_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.impl = {
+.min_access_size = 1,
+.max_access_size = 8,
+},
+};
+
+
 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
@@ -1332,6 +1388,37 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 error_setg(errp, "serial property not set");
 return;
 }
+
+if (!n->cmb_size_mb && n->pmr_file) {
+int fd;
+
+n->f_pmr = fopen(n->pmr_file, "r+b");
+if (!n->f_pmr) {
+error_setg(errp, "pmr backend file open error");
+return;
+}
+
+fseek(n->f_pmr, 0L, SEEK_END);
+n->f_pmr_size = ftell(n->f_pmr);
+fseek(n->f_pmr, 0L, SEEK_SET);
+
+/* pmr file size needs to be multiple of MiB in size */
+if (!n->f_pmr_size || n->f_pmr_size % (1 << 20)) {
+error_setg(errp, "pmr backend file size needs to be greater than 0"
+ " and multiple of MiB in size");
+return;
+}
+
+fd = fileno(n->f_pmr);
+n->pmrbuf = mmap(NULL, n->f_pmr_size,
+ (PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0);
+
+if (n->pmrbuf == MAP_FAILED) {
+error_setg(errp, "pmr backend file mmap error");
+return;
+}
+}
+
blkconf_blocksizes(&n->conf);
 if (!blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
false, errp)) {
@@ -1393,7 +1480,6 @@ static void nvme_realize(PCIDevice *pci_dev, Error