Re: [PATCH v4 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Knut Omang
On Wed, 2022-01-26 at 18:11 +0100, Lukasz Maniak wrote:
> From: Knut Omang 
> 
> This patch provides the building blocks for creating an SR/IOV
> PCIe Extended Capability header and register/unregister
> SR/IOV Virtual Functions.
> 
> Signed-off-by: Knut Omang 
> ---
>  hw/pci/meson.build  |   1 +
>  hw/pci/pci.c    | 100 +---
>  hw/pci/pcie.c   |   5 +
>  hw/pci/pcie_sriov.c | 294 
>  hw/pci/trace-events |   5 +
>  include/hw/pci/pci.h    |  12 +-
>  include/hw/pci/pcie.h   |   6 +
>  include/hw/pci/pcie_sriov.h |  71 +
>  include/qemu/typedefs.h |   2 +
>  9 files changed, 470 insertions(+), 26 deletions(-)
>  create mode 100644 hw/pci/pcie_sriov.c
>  create mode 100644 include/hw/pci/pcie_sriov.h
> 
> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
> index 5c4bbac817..bcc9c75919 100644
> --- a/hw/pci/meson.build
> +++ b/hw/pci/meson.build
> @@ -5,6 +5,7 @@ pci_ss.add(files(
>    'pci.c',
>    'pci_bridge.c',
>    'pci_host.c',
> +  'pcie_sriov.c',
>    'shpc.c',
>    'slotid_cap.c'
>  ))
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 5d30f9ca60..ba8fb92efc 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg)
>  {
>  uint8_t type;
>  
> +    /* PCIe virtual functions do not have their own BARs */
> +    assert(!pci_is_vf(d));
> +
>  if (reg != PCI_ROM_SLOT)
>  return PCI_BASE_ADDRESS_0 + reg * 4;
>  
> @@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev)
>  }
>  }
>  
> -static void pci_do_device_reset(PCIDevice *dev)
> +static void pci_reset_regions(PCIDevice *dev)
>  {
>  int r;
> +    if (pci_is_vf(dev)) {
> +    return;
> +    }
> +
> +    for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> +    PCIIORegion *region = &dev->io_regions[r];
> +    if (!region->size) {
> +    continue;
> +    }
>  
> +    if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
> +    region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +    pci_set_quad(dev->config + pci_bar(dev, r), region->type);
> +    } else {
> +    pci_set_long(dev->config + pci_bar(dev, r), region->type);
> +    }
> +    }
> +}
> +
> +static void pci_do_device_reset(PCIDevice *dev)
> +{
>  pci_device_deassert_intx(dev);
>  assert(dev->irq_state == 0);
>  
> @@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev)
>    pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
>    pci_get_word(dev->w1cmask + 
> PCI_INTERRUPT_LINE));
>  dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> -    for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> -    PCIIORegion *region = &dev->io_regions[r];
> -    if (!region->size) {
> -    continue;
> -    }
> -
> -    if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
> -    region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> -    pci_set_quad(dev->config + pci_bar(dev, r), region->type);
> -    } else {
> -    pci_set_long(dev->config + pci_bar(dev, r), region->type);
> -    }
> -    }
> +    pci_reset_regions(dev);
>  pci_update_mappings(dev);
>  
>  msi_reset(dev);
> @@ -884,6 +895,16 @@ static void pci_init_multifunction(PCIBus *bus, 
> PCIDevice *dev,
> Error **errp)
>  dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
>  }
>  
> +    /*
> + * With SR/IOV and ARI, a device at function 0 need not be a 
> multifunction
> + * device, as it may just be a VF that ended up with function 0 in
> + * the legacy PCI interpretation. Avoid failing in such cases:
> + */
> +    if (pci_is_vf(dev) &&
> +    dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> +    return;
> +    }
> +
>  /*
>   * multifunction bit is interpreted in two ways as follows.
>   *   - all functions must set the bit to 1.
> @@ -1083,6 +1104,7 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> *pci_dev,
>     bus->devices[devfn]->name);
>  return NULL;
>  } else if (dev->hotplugged &&
> +   !pci_is_vf(pci_dev) &&
>     pci_get_function_0(pci_dev)) {
>  error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
>     " new func %s cannot be exposed to guest.",
> @@ -1191,6 +1213,7 @@ void pci_register_bar(PCIDevice *pci_dev, int 
> region_num,
>  pcibus_t size = memory_region_size(memory);
>  uint8_t hdr_type;
>  
> +    assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar 
> */
>  assert(region_num >= 0);
>  assert(region_num < PCI_NUM_REGIONS);
>  assert(is_power_of_2(size));
> @@ -1294,11 +1317,45 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int 
> region_num)
>  return pci_dev->io_regions[region_num].addr;
>  }
>  
> -static pcibus_t pci_bar_address(PCIDevice *d,
> -

Re: [PATCH v4 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Lukasz Maniak
On Wed, Jan 26, 2022 at 06:11:06PM +0100, Lukasz Maniak wrote:
> From: Knut Omang 
> 
> This patch provides the building blocks for creating an SR/IOV
> PCIe Extended Capability header and register/unregister
> SR/IOV Virtual Functions.
> 
> Signed-off-by: Knut Omang 

Hi Knut,

We have edited the comments to which Michael drew attention.
I also resolved the issues reported by the checkpatch script for this
patch.

Please kindly check and confirm that you agree with these changes.

Thanks,
Lukasz

> ---
>  hw/pci/meson.build  |   1 +
>  hw/pci/pci.c| 100 +---
>  hw/pci/pcie.c   |   5 +
>  hw/pci/pcie_sriov.c | 294 
>  hw/pci/trace-events |   5 +
>  include/hw/pci/pci.h|  12 +-
>  include/hw/pci/pcie.h   |   6 +
>  include/hw/pci/pcie_sriov.h |  71 +
>  include/qemu/typedefs.h |   2 +
>  9 files changed, 470 insertions(+), 26 deletions(-)
>  create mode 100644 hw/pci/pcie_sriov.c
>  create mode 100644 include/hw/pci/pcie_sriov.h
> 
> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
> index 5c4bbac817..bcc9c75919 100644
> --- a/hw/pci/meson.build
> +++ b/hw/pci/meson.build
> @@ -5,6 +5,7 @@ pci_ss.add(files(
>'pci.c',
>'pci_bridge.c',
>'pci_host.c',
> +  'pcie_sriov.c',
>'shpc.c',
>'slotid_cap.c'
>  ))
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 5d30f9ca60..ba8fb92efc 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg)
>  {
>  uint8_t type;
>  
> +/* PCIe virtual functions do not have their own BARs */
> +assert(!pci_is_vf(d));
> +
>  if (reg != PCI_ROM_SLOT)
>  return PCI_BASE_ADDRESS_0 + reg * 4;
>  
> @@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev)
>  }
>  }
>  
> -static void pci_do_device_reset(PCIDevice *dev)
> +static void pci_reset_regions(PCIDevice *dev)
>  {
>  int r;
> +if (pci_is_vf(dev)) {
> +return;
> +}
> +
> +for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> +PCIIORegion *region = &dev->io_regions[r];
> +if (!region->size) {
> +continue;
> +}
>  
> +if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
> +region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +pci_set_quad(dev->config + pci_bar(dev, r), region->type);
> +} else {
> +pci_set_long(dev->config + pci_bar(dev, r), region->type);
> +}
> +}
> +}
> +
> +static void pci_do_device_reset(PCIDevice *dev)
> +{
>  pci_device_deassert_intx(dev);
>  assert(dev->irq_state == 0);
>  
> @@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev)
>pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
>pci_get_word(dev->w1cmask + 
> PCI_INTERRUPT_LINE));
>  dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> -for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> -PCIIORegion *region = &dev->io_regions[r];
> -if (!region->size) {
> -continue;
> -}
> -
> -if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
> -region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> -pci_set_quad(dev->config + pci_bar(dev, r), region->type);
> -} else {
> -pci_set_long(dev->config + pci_bar(dev, r), region->type);
> -}
> -}
> +pci_reset_regions(dev);
>  pci_update_mappings(dev);
>  
>  msi_reset(dev);
> @@ -884,6 +895,16 @@ static void pci_init_multifunction(PCIBus *bus, 
> PCIDevice *dev, Error **errp)
>  dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
>  }
>  
> +/*
> + * With SR/IOV and ARI, a device at function 0 need not be a 
> multifunction
> + * device, as it may just be a VF that ended up with function 0 in
> + * the legacy PCI interpretation. Avoid failing in such cases:
> + */
> +if (pci_is_vf(dev) &&
> +dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> +return;
> +}
> +
>  /*
>   * multifunction bit is interpreted in two ways as follows.
>   *   - all functions must set the bit to 1.
> @@ -1083,6 +1104,7 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> *pci_dev,
> bus->devices[devfn]->name);
>  return NULL;
>  } else if (dev->hotplugged &&
> +   !pci_is_vf(pci_dev) &&
> pci_get_function_0(pci_dev)) {
>  error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
> " new func %s cannot be exposed to guest.",
> @@ -1191,6 +1213,7 @@ void pci_register_bar(PCIDevice *pci_dev, int 
> region_num,
>  pcibus_t size = memory_region_size(memory);
>  uint8_t hdr_type;
>  
> +assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar 
> */
>  assert(region_num >= 0);
>  assert(region_num < PCI_NUM_REGIONS);
>  

[PATCH v4 12/15] hw/nvme: Initialize capability structures for primary/secondary controllers

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

With four new properties:
 - sriov_v{i,q}_flexible,
 - sriov_max_v{i,q}_per_vf,
one can configure the number of available flexible resources, as well as
the limits. The primary and secondary controller capability structures
are initialized accordingly.

Since the number of available queues (interrupts) now varies between
VF/PF, BAR size calculation is also adjusted.
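
For illustration only (not part of the patch), a configuration that
satisfies the constraints introduced below could look like:

  -device nvme-subsys,id=subsys0
  -device nvme,serial=deadbeef,subsys=subsys0,max_ioqpairs=26, \
      msix_qsize=17,sriov_max_vfs=2,sriov_vq_flexible=8, \
      sriov_vi_flexible=4

Here the PF implicitly keeps 26 - 8 = 18 private queue pairs and
17 - 4 = 13 private interrupt vectors, and each VF may by default be
assigned up to sriov_vq_flexible / sriov_max_vfs = 4 queue and 2
interrupt resources.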

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c   | 142 ---
 hw/nvme/nvme.h   |   4 ++
 include/block/nvme.h |   5 ++
 3 files changed, 144 insertions(+), 7 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index e101cb7d7c..551c8795f2 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -36,6 +36,10 @@
  *  zoned.zasl=, \
  *  zoned.auto_transition=, \
  *  sriov_max_vfs= \
+ *  sriov_vq_flexible= \
+ *  sriov_vi_flexible= \
+ *  sriov_max_vi_per_vf= \
+ *  sriov_max_vq_per_vf= \
  *  subsys=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
@@ -113,6 +117,29 @@
  *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
  *   Virtual function controllers will not report SR-IOV capability.
  *
+ *   NOTE: Single Root I/O Virtualization support is experimental.
+ *   All the related parameters may be subject to change.
+ *
+ * - `sriov_vq_flexible`
+ *   Indicates the total number of flexible queue resources assignable to all
+ *   the secondary controllers. Implicitly sets the number of primary
+ *   controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
+ *
+ * - `sriov_vi_flexible`
+ *   Indicates the total number of flexible interrupt resources assignable to
+ *   all the secondary controllers. Implicitly sets the number of primary
+ *   controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
+ *
+ * - `sriov_max_vi_per_vf`
+ *   Indicates the maximum number of virtual interrupt resources assignable
+ *   to a secondary controller. The default 0 resolves to
+ *   `(sriov_vi_flexible / sriov_max_vfs)`.
+ *
+ * - `sriov_max_vq_per_vf`
+ *   Indicates the maximum number of virtual queue resources assignable to
+ *   a secondary controller. The default 0 resolves to
+ *   `(sriov_vq_flexible / sriov_max_vfs)`.
+ *
  * nvme namespace device parameters
  * 
  * - `shared`
@@ -184,6 +211,7 @@
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
 #define NVME_MAX_VFS 127
+#define NVME_VF_RES_GRANULARITY 1
 #define NVME_VF_OFFSET 0x1
 #define NVME_VF_STRIDE 1
 
@@ -6359,6 +6387,54 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 error_setg(errp, "PMR is not supported with SR-IOV");
 return;
 }
+
+if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
+error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
+   " must be set for the use of SR-IOV");
+return;
+}
+
+if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
+error_setg(errp, "sriov_vq_flexible must be greater than or equal"
+   " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 
2);
+return;
+}
+
+if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
+error_setg(errp, "sriov_vq_flexible - max_ioqpairs (PF-private"
+   " queue resources) must be greater than or equal to 2");
+return;
+}
+
+if (params->sriov_vi_flexible < params->sriov_max_vfs) {
+error_setg(errp, "sriov_vi_flexible must be greater than or equal"
+   " to %d (sriov_max_vfs)", params->sriov_max_vfs);
+return;
+}
+
+if (params->msix_qsize < params->sriov_vi_flexible + 1) {
+error_setg(errp, "sriov_vi_flexible - msix_qsize (PF-private"
+   " interrupt resources) must be greater than or equal"
+   " to 1");
+return;
+}
+
+if (params->sriov_max_vi_per_vf &&
+(params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
+error_setg(errp, "sriov_max_vi_per_vf must meet:"
+   " (X - 1) %% %d == 0 and X >= 1",
+   NVME_VF_RES_GRANULARITY);
+return;
+}
+
+if (params->sriov_max_vq_per_vf &&
+(params->sriov_max_vq_per_vf < 2 ||
+ (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
+error_setg(errp, "sriov_max_vq_per_vf must meet:"
+   " (X - 1) %% %d == 0 and X >= 2",
+   NVME_VF_RES_GRANULARITY);
+return;
+}
 }
 }
 
@@ -6367,10 +6443,19 @@ static void nvme_init_state(NvmeCtrl *n)
 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
 

[PATCH v4 15/15] hw/nvme: Update the initialization place for the AER queue

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

This patch updates the initialization place for the AER queue, so it’s
initialized once, at controller initialization, and not every time
controller is enabled.

While the original version works for a non-SR-IOV device, as it’s hard
to interact with the controller if it’s not enabled, the repeated
reinitialization is not necessarily correct.

With the SR/IOV feature enabled a segfault can happen: a VF can have its
controller disabled, while a namespace can still be attached to the
controller through the parent PF. An event generated in such a case ends
up on an uninitialized queue.

While it’s an interesting question whether a VF should support AER in
the first place, I don’t think it must be answered today.

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 624db2f9c6..b2228e960f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6029,8 +6029,6 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 
 nvme_set_timestamp(n, 0ULL);
 
-QTAILQ_INIT(&n->aer_queue);
-
 nvme_select_iocs(n);
 
 return 0;
@@ -7007,6 +7005,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->cmic |= NVME_CMIC_MULTI_CTRL;
 }
 
+QTAILQ_INIT(&n->aer_queue);
+
 NVME_CAP_SET_MQES(cap, 0x7ff);
 NVME_CAP_SET_CQR(cap, 1);
 NVME_CAP_SET_TO(cap, 0xf);
-- 
2.25.1




[PATCH v4 11/15] hw/nvme: Calculate BAR attributes in a function

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

An NVMe device with SR-IOV capability calculates the BAR size
differently for PF and VF, so it makes sense to extract the common code
to a separate function.
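
As a worked example (illustrative only, assuming sizeof(NvmeBar) is 4 KiB,
NVME_DB_SIZE is 4 and PCI_MSIX_ENTRY_SIZE is 16), a PF with max_ioqpairs=64
and msix_qsize=65 would get:

  4096 + 2 * 65 * 4 = 4616  -> align to 4 KiB -> 8192   (MSI-X table offset)
  8192 + 16 * 65    = 9232  -> align to 4 KiB -> 12288  (MSI-X PBA offset)
  12288 + 128 / 8   = 12304 -> pow2ceil       -> 16384  (final BAR0 size)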

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c | 45 +++--
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 40eb6bd1a8..e101cb7d7c 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6431,6 +6431,34 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice 
*pci_dev)
 memory_region_set_enabled(&n->pmr.dev->mr, false);
 }
 
+static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs,
+  unsigned *msix_table_offset,
+  unsigned *msix_pba_offset)
+{
+uint64_t bar_size, msix_table_size, msix_pba_size;
+
+bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
+bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+
+if (msix_table_offset) {
+*msix_table_offset = bar_size;
+}
+
+msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
+bar_size += msix_table_size;
+bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+
+if (msix_pba_offset) {
+*msix_pba_offset = bar_size;
+}
+
+msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8;
+bar_size += msix_pba_size;
+
+bar_size = pow2ceil(bar_size);
+return bar_size;
+}
+
 static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
 uint64_t bar_size)
 {
@@ -6470,7 +6498,7 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, 
uint8_t offset)
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
 uint8_t *pci_conf = pci_dev->config;
-uint64_t bar_size, msix_table_size, msix_pba_size;
+uint64_t bar_size;
 unsigned msix_table_offset, msix_pba_offset;
 int ret;
 
@@ -6496,19 +6524,8 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
 }
 
 /* add one to max_ioqpairs to account for the admin queue pair */
-bar_size = sizeof(NvmeBar) +
-   2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE;
-bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
-msix_table_offset = bar_size;
-msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
-
-bar_size += msix_table_size;
-bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
-msix_pba_offset = bar_size;
-msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
-
-bar_size += msix_pba_size;
-bar_size = pow2ceil(bar_size);
+bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize,
+ &msix_table_offset, &msix_pba_offset);
 
 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
-- 
2.25.1




[PATCH v4 13/15] hw/nvme: Add support for the Virtualization Management command

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

With the new command one can:
 - assign flexible resources (queues, interrupts) to primary and
   secondary controllers,
 - toggle the online/offline state of a given controller (an illustrative
   encoding of the command is sketched below).
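
Illustrative only (not part of the patch): per the NVMe spec dword layout,
a host-side encoding of the Virtualization Management command could be
sketched as follows; the helper name below is hypothetical.

  /* Virtualization Management (admin opcode 0x1c); illustrative sketch. */
  static void build_virt_mgmt(NvmeCmd *cmd, uint8_t act, uint8_t rt,
                              uint16_t cntlid, uint16_t nr)
  {
      memset(cmd, 0, sizeof(*cmd));
      cmd->opcode = 0x1c;
      /* cdw10: bits 3:0 action (e.g. 0x8 assign, 0x9 online, 0x7 offline,
       * 0x1 primary flexible allocation), bits 10:8 resource type
       * (0 = VQ, 1 = VI), bits 31:16 controller id */
      cmd->cdw10 = cpu_to_le32(act | (rt << 8) | ((uint32_t)cntlid << 16));
      /* cdw11: bits 15:0 number of resources */
      cmd->cdw11 = cpu_to_le32(nr);
  }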

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c   | 257 ++-
 hw/nvme/nvme.h   |  20 
 hw/nvme/trace-events |   3 +
 include/block/nvme.h |  17 +++
 4 files changed, 295 insertions(+), 2 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 551c8795f2..624db2f9c6 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -188,6 +188,7 @@
 #include "qemu/error-report.h"
 #include "qemu/log.h"
 #include "qemu/units.h"
+#include "qemu/range.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
 #include "sysemu/sysemu.h"
@@ -259,6 +260,7 @@ static const uint32_t nvme_cse_acs[256] = {
 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_NS_ATTACHMENT]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
+[NVME_ADM_CMD_VIRT_MNGMT]   = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_FORMAT_NVM]   = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 };
 
@@ -290,6 +292,7 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
 };
 
 static void nvme_process_sq(void *opaque);
+static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
 
 static uint16_t nvme_sqid(NvmeRequest *req)
 {
@@ -5541,6 +5544,167 @@ out:
 return status;
 }
 
+static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
+  int *num_prim, int *num_sec)
+{
+*num_total = le32_to_cpu(rt ?
+ n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
+*num_prim = le16_to_cpu(rt ?
+n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
+*num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
+}
+
+static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
+ uint16_t cntlid, uint8_t rt,
+ int nr)
+{
+int num_total, num_prim, num_sec;
+
+if (cntlid != n->cntlid) {
+return NVME_INVALID_CTRL_ID | NVME_DNR;
+}
+
+nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
+
+if (nr > num_total) {
+return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
+}
+
+if (nr > num_total - num_sec) {
+return NVME_INVALID_RESOURCE_ID | NVME_DNR;
+}
+
+if (rt) {
+n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
+} else {
+n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
+}
+
+req->cqe.result = cpu_to_le32(nr);
+return req->status;
+}
+
+static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
+ uint8_t rt, int nr)
+{
+int prev_nr, prev_total;
+
+if (rt) {
+prev_nr = le16_to_cpu(sctrl->nvi);
+prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
+sctrl->nvi = cpu_to_le16(nr);
+n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
+} else {
+prev_nr = le16_to_cpu(sctrl->nvq);
+prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
+sctrl->nvq = cpu_to_le16(nr);
+n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
+}
+}
+
+static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
+uint16_t cntlid, uint8_t rt, int 
nr)
+{
+int num_total, num_prim, num_sec, num_free, diff, limit;
+NvmeSecCtrlEntry *sctrl;
+
+sctrl = nvme_sctrl_for_cntlid(n, cntlid);
+if (!sctrl) {
+return NVME_INVALID_CTRL_ID | NVME_DNR;
+}
+
+if (sctrl->scs) {
+return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
+}
+
+limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
+if (nr > limit) {
+return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
+}
+
+nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
+num_free = num_total - num_prim - num_sec;
+diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
+
+if (diff > num_free) {
+return NVME_INVALID_RESOURCE_ID | NVME_DNR;
+}
+
+nvme_update_virt_res(n, sctrl, rt, nr);
+req->cqe.result = cpu_to_le32(nr);
+
+return req->status;
+}
+
+static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
+{
+NvmeCtrl *sn = NULL;
+NvmeSecCtrlEntry *sctrl;
+int vf_index;
+
+sctrl = nvme_sctrl_for_cntlid(n, cntlid);
+if (!sctrl) {
+return NVME_INVALID_CTRL_ID | NVME_DNR;
+}
+
+if (!pci_is_vf(&n->parent_obj)) {
+vf_index = le16_to_cpu(sctrl->vfn) - 1;
+sn = NVME(pcie_sriov_get_vf_at_index(&n->parent_obj, vf_index));
+}
+
+if (online) {
+if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
+return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
+}
+
+if 

[PATCH v4 14/15] docs: Add documentation for SR-IOV and Virtualization Enhancements

2022-01-26 Thread Lukasz Maniak
Signed-off-by: Lukasz Maniak 
---
 docs/system/devices/nvme.rst | 36 
 1 file changed, 36 insertions(+)

diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index b5acb2a9c1..166a11abc6 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -239,3 +239,39 @@ The virtual namespace device supports DIF- and DIX-based 
protection information
   to ``1`` to transfer protection information as the first eight bytes of
   metadata. Otherwise, the protection information is transferred as the last
   eight bytes.
+
+Virtualization Enhancements and SR-IOV (Experimental Support)
+--------------------------------------------------------------
+
+The ``nvme`` device supports Single Root I/O Virtualization and Sharing
+along with Virtualization Enhancements. The controller has to be linked to
+an NVM Subsystem device (``nvme-subsys``) for use with SR-IOV.
+
+A number of parameters are present (**please note that they may be
+subject to change**):
+
+``sriov_max_vfs`` (default: ``0``)
+  Indicates the maximum number of PCIe virtual functions supported
+  by the controller. Specifying a non-zero value enables reporting of both
+  SR-IOV and ARI (Alternative Routing-ID Interpretation) capabilities
+  by the NVMe device. Virtual function controllers will not report SR-IOV.
+
+``sriov_vq_flexible``
+  Indicates the total number of flexible queue resources assignable to all
+  the secondary controllers. Implicitly sets the number of primary
+  controller's private resources to ``(max_ioqpairs - sriov_vq_flexible)``.
+
+``sriov_vi_flexible``
+  Indicates the total number of flexible interrupt resources assignable to
+  all the secondary controllers. Implicitly sets the number of primary
+  controller's private resources to ``(msix_qsize - sriov_vi_flexible)``.
+
+``sriov_max_vi_per_vf`` (default: ``0``)
+  Indicates the maximum number of virtual interrupt resources assignable
+  to a secondary controller. The default ``0`` resolves to
+  ``(sriov_vi_flexible / sriov_max_vfs)``
+
+``sriov_max_vq_per_vf`` (default: ``0``)
+  Indicates the maximum number of virtual queue resources assignable to
+  a secondary controller. The default ``0`` resolves to
+  ``(sriov_vq_flexible / sriov_max_vfs)``
-- 
2.25.1




[PATCH v4 09/15] hw/nvme: Make max_ioqpairs and msix_qsize configurable in runtime

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

The NVMe device defines two properties: max_ioqpairs, msix_qsize. Having
them as constants is problematic for SR-IOV support.

SR-IOV introduces virtual resources (queues, interrupts) that can be
assigned to PF and its dependent VFs. Each device, following a reset,
should work with the configured number of queues. A single constant is
no longer sufficient to hold the whole state.

This patch tries to solve the problem by introducing additional
variables in NvmeCtrl’s state. The variables for, e.g., managing queues
are therefore organized as:
 - n->params.max_ioqpairs – no changes, constant set by the user
 - n->(mutable_state) – (not a part of this patch) user-configurable,
specifies number of queues available _after_
reset
 - n->conf_ioqpairs - (new) used in all the places instead of the ‘old’
  n->params.max_ioqpairs; initialized in realize()
  and updated during reset() to reflect user’s
  changes to the mutable state

Since the number of available I/O queues and interrupts can change at
runtime, buffers for the sq/cq pointers and the MSI-X-related structures
are allocated large enough to handle the limits, which avoids complicated
reallocation. A helper function (nvme_update_msixcap_ts) updates the
corresponding capability register to signal configuration changes.
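
A minimal sketch of what such a helper could look like (illustrative only,
not the patch's actual implementation; it assumes the standard QEMU PCI and
MSI-X helpers):

  static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
  {
      uint8_t *config = pci_dev->config + pci_dev->msix_cap;
      uint16_t ctrl;

      if (!msix_present(pci_dev)) {
          return;
      }

      /* MSI-X Table Size is encoded as N-1 in the Message Control word */
      ctrl = pci_get_word(config + PCI_MSIX_FLAGS);
      ctrl &= ~PCI_MSIX_FLAGS_QSIZE;
      ctrl |= (table_size - 1) & PCI_MSIX_FLAGS_QSIZE;
      pci_set_word(config + PCI_MSIX_FLAGS, ctrl);
  }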

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c | 52 ++
 hw/nvme/nvme.h |  2 ++
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index b816b377c3..426507ca8a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -416,12 +416,12 @@ static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
 
 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
 {
-return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
+return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
 }
 
 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
 {
-return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
+return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
 }
 
 static void nvme_inc_cq_tail(NvmeCQueue *cq)
@@ -4035,8 +4035,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
 return NVME_INVALID_CQID | NVME_DNR;
 }
-if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
-n->sq[sqid] != NULL)) {
+if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
 return NVME_INVALID_QID | NVME_DNR;
 }
@@ -4388,8 +4387,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
  NVME_CQ_FLAGS_IEN(qflags) != 0);
 
-if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
-n->cq[cqid] != NULL)) {
+if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
 return NVME_INVALID_QID | NVME_DNR;
 }
@@ -4405,7 +4403,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_err_invalid_create_cq_vector(vector);
 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
 }
-if (unlikely(vector >= n->params.msix_qsize)) {
+if (unlikely(vector >= n->conf_msix_qsize)) {
 trace_pci_nvme_err_invalid_create_cq_vector(vector);
 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
 }
@@ -5002,13 +5000,12 @@ defaults:
 
 break;
 case NVME_NUMBER_OF_QUEUES:
-result = (n->params.max_ioqpairs - 1) |
-((n->params.max_ioqpairs - 1) << 16);
+result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
 trace_pci_nvme_getfeat_numq(result);
 break;
 case NVME_INTERRUPT_VECTOR_CONF:
 iv = dw11 & 0xffff;
-if (iv >= n->params.max_ioqpairs + 1) {
+if (iv >= n->conf_ioqpairs + 1) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
@@ -5163,10 +5160,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
NvmeRequest *req)
 
 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
 ((dw11 >> 16) & 0xffff) + 1,
-n->params.max_ioqpairs,
-n->params.max_ioqpairs);
-req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
-  ((n->params.max_ioqpairs - 1) << 16));
+n->conf_ioqpairs,
+n->conf_ioqpairs);
+req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
+  ((n->conf_ioqpairs - 1) << 16));
 break;
 case 

[PATCH v4 07/15] hw/nvme: Add support for Secondary Controller List

2022-01-26 Thread Lukasz Maniak
Introduce handling for Secondary Controller List (Identify command with
CNS value of 15h).

Secondary controller ids are unique in the subsystem, hence they are
reserved by it upon initialization of the primary controller to the
number of sriov_max_vfs.

ID reservation requires the addition of an intermediate controller slot
state, so the reserved controller has the address 0xFFFF.
A secondary controller is in the reserved state when it has no virtual
function assigned, but its primary controller is realized.
Secondary controller reservations are released to NULL when their primary
controller is unregistered.
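
For reference (spec layout, not this patch's diff): each entry returned by
the Secondary Controller List is a 32-byte structure along these lines;
field names below may differ slightly from the patch.

  typedef struct QEMU_PACKED NvmeSecCtrlEntry {
      uint16_t    scid;       /* secondary controller identifier */
      uint16_t    pcid;       /* primary controller identifier */
      uint8_t     scs;        /* secondary controller state (bit 0: online) */
      uint8_t     rsvd5[3];
      uint16_t    vfn;        /* virtual function number (1-based) */
      uint16_t    nvq;        /* number of VQ resources assigned */
      uint16_t    nvi;        /* number of VI resources assigned */
      uint8_t     rsvd14[18];
  } NvmeSecCtrlEntry;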

Signed-off-by: Lukasz Maniak 
---
 hw/nvme/ctrl.c   | 35 +
 hw/nvme/ns.c |  2 +-
 hw/nvme/nvme.h   | 18 +++
 hw/nvme/subsys.c | 75 ++--
 hw/nvme/trace-events |  1 +
 include/block/nvme.h | 20 
 6 files changed, 141 insertions(+), 10 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 1eb1c3df03..9ee5f83aa1 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4552,6 +4552,29 @@ static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, 
NvmeRequest *req)
 sizeof(NvmePriCtrlCap), req);
 }
 
+static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
+uint16_t min_id = le16_to_cpu(c->ctrlid);
+uint8_t num_sec_ctrl = n->sec_ctrl_list.numcntl;
+NvmeSecCtrlList list = {0};
+uint8_t i;
+
+for (i = 0; i < num_sec_ctrl; i++) {
+if (n->sec_ctrl_list.sec[i].scid >= min_id) {
+list.numcntl = num_sec_ctrl - i;
+memcpy(&list, n->sec_ctrl_list.sec + i,
+   list.numcntl * sizeof(NvmeSecCtrlEntry));
+break;
+}
+}
+
+trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
+
+return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
+}
+
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
  bool active)
 {
@@ -4772,6 +4795,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_ctrl_list(n, req, false);
 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
 return nvme_identify_pri_ctrl_cap(n, req);
+case NVME_ID_CNS_SECONDARY_CTRL_LIST:
+return nvme_identify_sec_ctrl_list(n, req);
 case NVME_ID_CNS_CS_NS:
 return nvme_identify_ns_csi(n, req, true);
 case NVME_ID_CNS_CS_NS_PRESENT:
@@ -6323,6 +6348,9 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 static void nvme_init_state(NvmeCtrl *n)
 {
 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
+NvmeSecCtrlList *list = &n->sec_ctrl_list;
+NvmeSecCtrlEntry *sctrl;
+int i;
 
 /* add one to max_ioqpairs to account for the admin queue pair */
 n->reg_size = pow2ceil(sizeof(NvmeBar) +
@@ -6334,6 +6362,13 @@ static void nvme_init_state(NvmeCtrl *n)
 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 
+list->numcntl = cpu_to_le16(n->params.sriov_max_vfs);
+for (i = 0; i < n->params.sriov_max_vfs; i++) {
+sctrl = &list->sec[i];
+sctrl->pcid = cpu_to_le16(n->cntlid);
+sctrl->vfn = cpu_to_le16(i + 1);
+}
+
 cap->cntlid = cpu_to_le16(n->cntlid);
 }
 
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 8b5f98c761..e7a54ac572 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -511,7 +511,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) {
 NvmeCtrl *ctrl = subsys->ctrls[i];
 
-if (ctrl) {
+if (ctrl && ctrl != SUBSYS_SLOT_RSVD) {
 nvme_attach_ns(ctrl, ns);
 }
 }
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 81deb45dfb..2157a7b95f 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -43,6 +43,7 @@ typedef struct NvmeBus {
 #define TYPE_NVME_SUBSYS "nvme-subsys"
 #define NVME_SUBSYS(obj) \
 OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
+#define SUBSYS_SLOT_RSVD (void *)0xFFFF
 
 typedef struct NvmeSubsystem {
 DeviceState parent_obj;
@@ -67,6 +68,10 @@ static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem 
*subsys,
 return NULL;
 }
 
+if (subsys->ctrls[cntlid] == SUBSYS_SLOT_RSVD) {
+return NULL;
+}
+
 return subsys->ctrls[cntlid];
 }
 
@@ -463,6 +468,7 @@ typedef struct NvmeCtrl {
 } features;
 
 NvmePriCtrlCap  pri_ctrl_cap;
+NvmeSecCtrlList sec_ctrl_list;
 } NvmeCtrl;
 
 static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
@@ -497,6 +503,18 @@ static inline uint16_t nvme_cid(NvmeRequest *req)
 return le16_to_cpu(req->cqe.cid);
 }
 
+static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
+{
+PCIDevice *pci_dev = &n->parent_obj;
+NvmeCtrl *pf = 

[PATCH v4 08/15] hw/nvme: Implement the Function Level Reset

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

This patch implements the Function Level Reset, a feature currently not
implemented for the NVMe device, while listed as mandatory ("shall")
in the 1.4 spec.

The implementation reuses FLR-related building blocks defined for the
pci-bridge module, and follows the same logic:
- FLR capability is advertised in the PCIE config,
- custom pci_write_config callback detects a write to the trigger
  register and performs the PCI reset,
- which, eventually, calls the custom dc->reset handler.

Depending on reset type, parts of the state should (or should not) be
cleared. To distinguish the type of reset, an additional parameter is
passed to the reset function.

This patch also enables advertisement of the Power Management PCI
capability. The main reason behind it is to announce the no_soft_reset=1
bit, to signal SR-IOV support where each VF can be reset individually.

The implementation purposely ignores writes to the PMCS.PS register,
as even such naïve behavior is enough to correctly handle the D3->D0
transition.

It’s worth noting that the power state transition back to D3, with
all the corresponding side effects, wasn't and still isn't handled
properly.
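
(Illustrative, not part of the patch.) With FLR advertised, a Linux guest
can typically exercise the path described above through the generic PCI
sysfs reset attribute, e.g. for a function at 0000:01:00.1:

  echo 1 > /sys/bus/pci/devices/0000:01:00.1/reset

which should land in the custom config-write handler and, in turn, in the
dc->reset callback.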

Signed-off-by: Łukasz Gieryk 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 52 
 hw/nvme/nvme.h   |  5 +
 hw/nvme/trace-events |  1 +
 3 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 9ee5f83aa1..b816b377c3 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5604,7 +5604,7 @@ static void nvme_process_sq(void *opaque)
 }
 }
 
-static void nvme_ctrl_reset(NvmeCtrl *n)
+static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
 {
 NvmeNamespace *ns;
 int i;
@@ -5636,7 +5636,9 @@ static void nvme_ctrl_reset(NvmeCtrl *n)
 }
 
 if (!pci_is_vf(&n->parent_obj) && n->params.sriov_max_vfs) {
-pcie_sriov_pf_disable_vfs(&n->parent_obj);
+if (rst != NVME_RESET_CONTROLLER) {
+pcie_sriov_pf_disable_vfs(&n->parent_obj);
+}
 }
 
 n->aer_queued = 0;
@@ -5870,7 +5872,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 }
 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
 trace_pci_nvme_mmio_stopped();
-nvme_ctrl_reset(n);
+nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
 cc = 0;
 csts &= ~NVME_CSTS_READY;
 }
@@ -6428,6 +6430,28 @@ static void nvme_init_sriov(NvmeCtrl *n, PCIDevice 
*pci_dev, uint16_t offset,
   PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
 }
 
+static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
+{
+Error *err = NULL;
+int ret;
+
+ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
+ PCI_PM_SIZEOF, &err);
+if (err) {
+error_report_err(err);
+return ret;
+}
+
+pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
+ PCI_PM_CAP_VER_1_2);
+pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
+ PCI_PM_CTRL_NO_SOFT_RESET);
+pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
+ PCI_PM_CTRL_STATE_MASK);
+
+return 0;
+}
+
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
 uint8_t *pci_conf = pci_dev->config;
@@ -6449,7 +6473,9 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 }
 
 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
+nvme_add_pm_capability(pci_dev, 0x60);
 pcie_endpoint_cap_init(pci_dev, 0x80);
+pcie_cap_flr_init(pci_dev);
 if (n->params.sriov_max_vfs) {
 pcie_ari_init(pci_dev, 0x100, 1);
 }
@@ -6699,7 +6725,7 @@ static void nvme_exit(PCIDevice *pci_dev)
 NvmeNamespace *ns;
 int i;
 
-nvme_ctrl_reset(n);
+nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
 
 if (n->subsys) {
 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
@@ -6798,6 +6824,22 @@ static void nvme_set_smart_warning(Object *obj, Visitor 
*v, const char *name,
 }
 }
 
+static void nvme_pci_reset(DeviceState *qdev)
+{
+PCIDevice *pci_dev = PCI_DEVICE(qdev);
+NvmeCtrl *n = NVME(pci_dev);
+
+trace_pci_nvme_pci_reset();
+nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
+}
+
+static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
+  uint32_t val, int len)
+{
+pci_default_write_config(dev, address, val, len);
+pcie_cap_flr_write_config(dev, address, val, len);
+}
+
 static const VMStateDescription nvme_vmstate = {
 .name = "nvme",
 .unmigratable = 1,
@@ -6809,6 +6851,7 @@ static void nvme_class_init(ObjectClass *oc, void *data)
 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
 
 pc->realize = nvme_realize;
+pc->config_write = nvme_pci_write_config;
 pc->exit = nvme_exit;
 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
  

[PATCH v4 05/15] hw/nvme: Add support for SR-IOV

2022-01-26 Thread Lukasz Maniak
This patch implements initial support for Single Root I/O Virtualization
on an NVMe device.

Essentially, it allows defining the maximum number of virtual functions
supported by the NVMe controller via the sriov_max_vfs parameter.

Passing a non-zero value to sriov_max_vfs triggers reporting of SR-IOV
capability by a physical controller and ARI capability by both the
physical and virtual function devices.

NVMe controllers created via virtual functions functionally mirror
the physical controller, which may not entirely be the desired behavior,
so some consideration is needed on how to limit the capabilities of
the VF.

NVMe subsystem is required for the use of SR-IOV.
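
An illustrative invocation (not part of the patch) could therefore look
like:

  -device nvme-subsys,id=subsys0
  -device nvme,serial=deadbeef,subsys=subsys0,sriov_max_vfs=4

after which the guest can enable the VFs through the usual sysfs knob
described in docs/pcie_sriov.txt (echo 4 > .../sriov_numvfs).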

Signed-off-by: Lukasz Maniak 
---
 hw/nvme/ctrl.c   | 85 ++--
 hw/nvme/nvme.h   |  3 +-
 include/hw/pci/pci_ids.h |  1 +
 3 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 1f62116af9..cdfd554da0 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -35,6 +35,7 @@
  *  mdts=,vsl=, \
  *  zoned.zasl=, \
  *  zoned.auto_transition=, \
+ *  sriov_max_vfs= \
  *  subsys=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
@@ -106,6 +107,12 @@
  *   transitioned to zone state closed for resource management purposes.
  *   Defaults to 'on'.
  *
+ * - `sriov_max_vfs`
+ *   Indicates the maximum number of PCIe virtual functions supported
+ *   by the controller. The default value is 0. Specifying a non-zero value
+ *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
+ *   Virtual function controllers will not report SR-IOV capability.
+ *
  * nvme namespace device parameters
  * 
  * - `shared`
@@ -160,6 +167,7 @@
 #include "sysemu/block-backend.h"
 #include "sysemu/hostmem.h"
 #include "hw/pci/msix.h"
+#include "hw/pci/pcie_sriov.h"
 #include "migration/vmstate.h"
 
 #include "nvme.h"
@@ -175,6 +183,9 @@
 #define NVME_TEMPERATURE_CRITICAL 0x175
 #define NVME_NUM_FW_SLOTS 1
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+#define NVME_MAX_VFS 127
+#define NVME_VF_OFFSET 0x1
+#define NVME_VF_STRIDE 1
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
 do { \
@@ -5589,6 +5600,10 @@ static void nvme_ctrl_reset(NvmeCtrl *n)
 g_free(event);
 }
 
+if (!pci_is_vf(&n->parent_obj) && n->params.sriov_max_vfs) {
+pcie_sriov_pf_disable_vfs(&n->parent_obj);
+}
+
 n->aer_queued = 0;
 n->outstanding_aers = 0;
 n->qs_created = false;
@@ -6270,6 +6285,29 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 error_setg(errp, "vsl must be non-zero");
 return;
 }
+
+if (params->sriov_max_vfs) {
+if (!n->subsys) {
+error_setg(errp, "subsystem is required for the use of SR-IOV");
+return;
+}
+
+if (params->sriov_max_vfs > NVME_MAX_VFS) {
+error_setg(errp, "sriov_max_vfs must be between 0 and %d",
+   NVME_MAX_VFS);
+return;
+}
+
+if (params->cmb_size_mb) {
+error_setg(errp, "CMB is not supported with SR-IOV");
+return;
+}
+
+if (n->pmr.dev) {
+error_setg(errp, "PMR is not supported with SR-IOV");
+return;
+}
+}
 }
 
 static void nvme_init_state(NvmeCtrl *n)
@@ -6327,6 +6365,20 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice 
*pci_dev)
 memory_region_set_enabled(&n->pmr.dev->mr, false);
 }
 
+static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
+uint64_t bar_size)
+{
+uint16_t vf_dev_id = n->params.use_intel_id ?
+ PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
+
+pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
+   n->params.sriov_max_vfs, n->params.sriov_max_vfs,
+   NVME_VF_OFFSET, NVME_VF_STRIDE);
+
+pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+  PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
+}
+
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
 uint8_t *pci_conf = pci_dev->config;
@@ -6341,7 +6393,7 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 
 if (n->params.use_intel_id) {
 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
-pci_config_set_device_id(pci_conf, 0x5845);
+pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
 } else {
 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
@@ -6349,6 +6401,9 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 
 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
 pcie_endpoint_cap_init(pci_dev, 0x80);
+if 

[PATCH v4 10/15] hw/nvme: Remove reg_size variable and update BAR0 size calculation

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

The n->reg_size parameter unnecessarily splits the BAR0 size calculation
in two phases; removed to simplify the code.

With all the calculations done in one place, it seems the pow2ceil,
applied originally to reg_size, is unnecessary. The rounding should
happen as the last step, when BAR size includes Nvme registers, queue
registers, and MSIX-related space.

Finally, the size of the mmio memory region is extended to cover the 1st
4KiB padding (see the map below). Access to this range is handled as
interaction with a non-existing queue and generates an error trace, so
actually nothing changes, while the reg_size variable is no longer needed.


|  BAR0|

[Nvme Registers]
[Queues]
[power-of-2 padding] - removed in this patch
[4KiB padding (1)  ]
[MSIX TABLE]
[4KiB padding (2)  ]
[MSIX PBA  ]
[power-of-2 padding]

Signed-off-by: Łukasz Gieryk 
---
 hw/nvme/ctrl.c | 10 +-
 hw/nvme/nvme.h |  1 -
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 426507ca8a..40eb6bd1a8 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6372,9 +6372,6 @@ static void nvme_init_state(NvmeCtrl *n)
 n->conf_ioqpairs = n->params.max_ioqpairs;
 n->conf_msix_qsize = n->params.msix_qsize;
 
-/* add one to max_ioqpairs to account for the admin queue pair */
-n->reg_size = pow2ceil(sizeof(NvmeBar) +
-   2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
 n->temperature = NVME_TEMPERATURE;
@@ -6498,7 +6495,10 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
 pcie_ari_init(pci_dev, 0x100, 1);
 }
 
-bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
+/* add one to max_ioqpairs to account for the admin queue pair */
+bar_size = sizeof(NvmeBar) +
+   2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE;
+bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
 msix_table_offset = bar_size;
 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
 
@@ -6512,7 +6512,7 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 
 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
-  n->reg_size);
+  msix_table_offset);
 memory_region_add_subregion(>bar0, 0, >iomem);
 
 if (pci_is_vf(pci_dev)) {
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 927890b490..1401ac3904 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -414,7 +414,6 @@ typedef struct NvmeCtrl {
 uint16_t    max_prp_ents;
 uint16_t    cqe_size;
 uint16_t    sqe_size;
-uint32_t    reg_size;
 uint32_t    max_q_ents;
 uint8_t     outstanding_aers;
 uint32_t    irq_status;
-- 
2.25.1




[PATCH v4 06/15] hw/nvme: Add support for Primary Controller Capabilities

2022-01-26 Thread Lukasz Maniak
Implementation of Primary Controller Capabilities data
structure (Identify command with CNS value of 14h).

Currently, the command returns only the ID of the primary controller.
Handling of the remaining fields is added in subsequent patches
implementing the virtualization enhancements.
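
For reference (per the NVMe spec, not part of this patch's changes), the
structure will later carry, among others:

  vqfrt/vifrt     - total VQ/VI flexible resources in the subsystem
  vqrfa/virfa     - flexible resources currently assigned to secondaries
  vqrfap/virfap   - flexible resources allocated to the primary
  vqprt/viprt     - private resources of the primary
  vqfrsm/vifrsm   - maximum flexible resources assignable to one secondary
  vqgran/vigran   - preferred allocation granularity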

Signed-off-by: Lukasz Maniak 
Reviewed-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 23 ++-
 hw/nvme/nvme.h   |  2 ++
 hw/nvme/trace-events |  1 +
 include/block/nvme.h | 23 +++
 4 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index cdfd554da0..1eb1c3df03 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4544,6 +4544,14 @@ static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, 
NvmeRequest *req,
 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
 }
 
+static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
+{
+trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
+
+return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
+sizeof(NvmePriCtrlCap), req);
+}
+
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
  bool active)
 {
@@ -4762,6 +4770,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_identify_ctrl_list(n, req, true);
 case NVME_ID_CNS_CTRL_LIST:
 return nvme_identify_ctrl_list(n, req, false);
+case NVME_ID_CNS_PRIMARY_CTRL_CAP:
+return nvme_identify_pri_ctrl_cap(n, req);
 case NVME_ID_CNS_CS_NS:
 return nvme_identify_ns_csi(n, req, true);
 case NVME_ID_CNS_CS_NS_PRESENT:
@@ -6312,6 +6322,8 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 
 static void nvme_init_state(NvmeCtrl *n)
 {
+NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
+
 /* add one to max_ioqpairs to account for the admin queue pair */
 n->reg_size = pow2ceil(sizeof(NvmeBar) +
2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
@@ -6321,6 +6333,8 @@ static void nvme_init_state(NvmeCtrl *n)
 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
+
+cap->cntlid = cpu_to_le16(n->cntlid);
 }
 
 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -6621,15 +6635,14 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
   &pci_dev->qdev, n->parent_obj.qdev.id);
 
-nvme_init_state(n);
-if (nvme_init_pci(n, pci_dev, errp)) {
-return;
-}
-
 if (nvme_init_subsys(n, errp)) {
 error_propagate(errp, local_err);
 return;
 }
+nvme_init_state(n);
+if (nvme_init_pci(n, pci_dev, errp)) {
+return;
+}
 nvme_init_ctrl(n, pci_dev);
 
 /* setup a namespace if the controller drive property was given */
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 4c8af34b28..81deb45dfb 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -461,6 +461,8 @@ typedef struct NvmeCtrl {
 };
 uint32_t    async_config;
 } features;
+
+NvmePriCtrlCap  pri_ctrl_cap;
 } NvmeCtrl;
 
 static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index ff6cafd520..1014ebceb6 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -52,6 +52,7 @@ pci_nvme_identify_ctrl(void) "identify controller"
 pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
 pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_identify_ctrl_list(uint8_t cns, uint16_t cntid) "cns 0x%"PRIx8" cntid 
%"PRIu16""
+pci_nvme_identify_pri_ctrl_cap(uint16_t cntlid) "identify primary controller 
capabilities cntlid=%"PRIu16""
 pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", 
csi=0x%"PRIx8""
 pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", 
csi=0x%"PRIx8""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e3bd47bf76..f69bd1d14f 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1017,6 +1017,7 @@ enum NvmeIdCns {
 NVME_ID_CNS_NS_PRESENT= 0x11,
 NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12,
 NVME_ID_CNS_CTRL_LIST = 0x13,
+NVME_ID_CNS_PRIMARY_CTRL_CAP  = 0x14,
 NVME_ID_CNS_CS_NS_PRESENT_LIST= 0x1a,
 NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
 NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
@@ -1465,6 +1466,27 @@ typedef enum NvmeZoneState {
 NVME_ZONE_STATE_OFFLINE  = 0x0f,
 } NvmeZoneState;
 
+typedef struct QEMU_PACKED NvmePriCtrlCap {
+uint16_t    cntlid;
+uint16_t    portid;
+uint8_t     crt;
+uint8_t     rsvd5[27];
+uint32_t    vqfrt;
+uint32_t    vqrfa;
+uint16_t    vqrfap;
+uint16_t    vqprt;
+

[PATCH v4 04/15] pcie: Add 1.2 version token for the Power Management Capability

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

Signed-off-by: Łukasz Gieryk 
---
 include/hw/pci/pci_regs.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/hw/pci/pci_regs.h b/include/hw/pci/pci_regs.h
index 77ba64b931..a590140962 100644
--- a/include/hw/pci/pci_regs.h
+++ b/include/hw/pci/pci_regs.h
@@ -4,5 +4,6 @@
 #include "standard-headers/linux/pci_regs.h"
 
 #define  PCI_PM_CAP_VER_1_1 0x0002  /* PCI PM spec ver. 1.1 */
+#define  PCI_PM_CAP_VER_1_2 0x0003  /* PCI PM spec ver. 1.2 */
 
 #endif
-- 
2.25.1




Re: [PATCH for-7.0 4/4] hw/nvme: add support for zoned random write area

2022-01-26 Thread Keith Busch
On Thu, Nov 25, 2021 at 08:37:35AM +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add support for TP 4076 ("Zoned Random Write Area"), v2021.08.23
> ("Ratified").
> 
> This adds three new namespace parameters: "zoned.numzrwa" (number of
> zrwa resources, i.e. number of zones that can have a zrwa),
> "zoned.zrwas" (zrwa size in LBAs), "zoned.zrwafg" (granularity in LBAs
> for flushes).
> 
> Signed-off-by: Klaus Jensen 

Looks good, and will just need a minor update if you choose to take the
feedback from patch 2 onboard.

Reviewed-by: Keith Busch 



[PATCH v4 00/15] hw/nvme: SR-IOV with Virtualization Enhancements

2022-01-26 Thread Lukasz Maniak
Changes since v3:
- Addressed review comments on pcie: Add support for Single Root I/O
  Virtualization (SR/IOV)
- Fixed issues reported by checkpatch.pl

Knut Omang (2):
  pcie: Add support for Single Root I/O Virtualization (SR/IOV)
  pcie: Add some SR/IOV API documentation in docs/pcie_sriov.txt

Lukasz Maniak (4):
  hw/nvme: Add support for SR-IOV
  hw/nvme: Add support for Primary Controller Capabilities
  hw/nvme: Add support for Secondary Controller List
  docs: Add documentation for SR-IOV and Virtualization Enhancements

Łukasz Gieryk (9):
  pcie: Add a helper to the SR/IOV API
  pcie: Add 1.2 version token for the Power Management Capability
  hw/nvme: Implement the Function Level Reset
  hw/nvme: Make max_ioqpairs and msix_qsize configurable in runtime
  hw/nvme: Remove reg_size variable and update BAR0 size calculation
  hw/nvme: Calculate BAR attributes in a function
  hw/nvme: Initialize capability structures for primary/secondary
controllers
  hw/nvme: Add support for the Virtualization Management command
  hw/nvme: Update the initialization place for the AER queue

 docs/pcie_sriov.txt  | 115 ++
 docs/system/devices/nvme.rst |  36 ++
 hw/nvme/ctrl.c   | 675 ---
 hw/nvme/ns.c |   2 +-
 hw/nvme/nvme.h   |  55 ++-
 hw/nvme/subsys.c |  75 +++-
 hw/nvme/trace-events |   6 +
 hw/pci/meson.build   |   1 +
 hw/pci/pci.c | 100 --
 hw/pci/pcie.c|   5 +
 hw/pci/pcie_sriov.c  | 302 
 hw/pci/trace-events  |   5 +
 include/block/nvme.h |  65 
 include/hw/pci/pci.h |  12 +-
 include/hw/pci/pci_ids.h |   1 +
 include/hw/pci/pci_regs.h|   1 +
 include/hw/pci/pcie.h|   6 +
 include/hw/pci/pcie_sriov.h  |  77 
 include/qemu/typedefs.h  |   2 +
 19 files changed, 1460 insertions(+), 81 deletions(-)
 create mode 100644 docs/pcie_sriov.txt
 create mode 100644 hw/pci/pcie_sriov.c
 create mode 100644 include/hw/pci/pcie_sriov.h

-- 
2.25.1




[PATCH v4 03/15] pcie: Add a helper to the SR/IOV API

2022-01-26 Thread Lukasz Maniak
From: Łukasz Gieryk 

Convenience function for retrieving the PCIDevice object of the N-th VF.
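
Illustrative usage from a PF's code (not part of the patch):

  for (int i = 0; i < pf_dev->exp.sriov_pf.num_vfs; i++) {
      PCIDevice *vf = pcie_sriov_get_vf_at_index(pf_dev, i);
      if (vf) {
          /* per-VF handling goes here */
      }
  }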

Signed-off-by: Łukasz Gieryk 
Reviewed-by: Knut Omang 
---
 hw/pci/pcie_sriov.c | 10 +-
 include/hw/pci/pcie_sriov.h |  6 ++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
index 3f256d483f..87abad6ac8 100644
--- a/hw/pci/pcie_sriov.c
+++ b/hw/pci/pcie_sriov.c
@@ -287,8 +287,16 @@ uint16_t pcie_sriov_vf_number(PCIDevice *dev)
 return dev->exp.sriov_vf.vf_number;
 }
 
-
 PCIDevice *pcie_sriov_get_pf(PCIDevice *dev)
 {
 return dev->exp.sriov_vf.pf;
 }
+
+PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
+{
+assert(!pci_is_vf(dev));
+if (n < dev->exp.sriov_pf.num_vfs) {
+return dev->exp.sriov_pf.vf[n];
+}
+return NULL;
+}
diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
index 990cff0a1c..80f5c84e75 100644
--- a/include/hw/pci/pcie_sriov.h
+++ b/include/hw/pci/pcie_sriov.h
@@ -68,4 +68,10 @@ uint16_t pcie_sriov_vf_number(PCIDevice *dev);
  */
 PCIDevice *pcie_sriov_get_pf(PCIDevice *dev);
 
+/*
+ * Get the n-th VF of this physical function - only valid for PF.
+ * Returns NULL if index is invalid
+ */
+PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n);
+
 #endif /* QEMU_PCIE_SRIOV_H */
-- 
2.25.1




[PATCH v4 02/15] pcie: Add some SR/IOV API documentation in docs/pcie_sriov.txt

2022-01-26 Thread Lukasz Maniak
From: Knut Omang 

Add a small intro + minimal documentation for how to
implement SR/IOV support for an emulated device.

Signed-off-by: Knut Omang 
---
 docs/pcie_sriov.txt | 115 
 1 file changed, 115 insertions(+)
 create mode 100644 docs/pcie_sriov.txt

diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt
new file mode 100644
index 00..f5e891e1d4
--- /dev/null
+++ b/docs/pcie_sriov.txt
@@ -0,0 +1,115 @@
+PCI SR/IOV EMULATION SUPPORT
+
+
+Description
+===
+SR/IOV (Single Root I/O Virtualization) is an optional extended capability
+of a PCI Express device. It allows a single physical function (PF) to appear 
as multiple
+virtual functions (VFs) for the main purpose of eliminating software
+overhead in I/O from virtual machines.
+
+Qemu now implements the basic common functionality to enable an emulated device
+to support SR/IOV. Yet no fully implemented devices exist in Qemu, but a
+proof-of-concept hack of the Intel igb can be found here:
+
+git://github.com/knuto/qemu.git sriov_patches_v5
+
+Implementation
+==
+Implementing emulation of an SR/IOV capable device typically consists of
+implementing support for two types of device classes: the "normal" physical
device
+(PF) and the virtual device (VF). From Qemu's perspective, the VFs are just
+like other devices, except that some of their properties are derived from
+the PF.
+
+A virtual function is different from a physical function in that the BAR
+space for all VFs is defined by the BAR registers in the PF's SR/IOV
+capability. All VFs have the same BARs and BAR sizes.
+
+Accesses to these virtual BARs are then computed as
+
+   <VF BAR start> + <VF number> * <BAR size> + <offset>
+
+From our emulation perspective this means that there is a separate call for
+setting up a BAR for a VF.
+
+1) To enable SR/IOV support in the PF, it must be a PCI Express device so
+   you would need to add a PCI Express capability in the normal PCI
+   capability list. You might also want to add an ARI (Alternative
+   Routing-ID Interpretation) capability to indicate that your device
+   supports functions beyond its "own" function space (0-7),
+   which is necessary to support more than 7 functions, or
+   if functions extend beyond offset 7 because they are placed at an
+   offset > 1 or have stride > 1.
+
+   ...
+   #include "hw/pci/pcie.h"
+   #include "hw/pci/pcie_sriov.h"
+
+   pci_your_pf_dev_realize( ... )
+   {
+  ...
+  int ret = pcie_endpoint_cap_init(d, 0x70);
+  ...
+  pcie_ari_init(d, 0x100, 1);
+  ...
+
+  /* Add and initialize the SR/IOV capability */
+  pcie_sriov_pf_init(d, 0x200, "your_virtual_dev",
+   vf_devid, initial_vfs, total_vfs,
+   fun_offset, stride);
+
+  /* Set up individual VF BARs (parameters as for normal BARs) */
+  pcie_sriov_pf_init_vf_bar( ... )
+  ...
+   }
+
+   For cleanup, you simply call:
+
+  pcie_sriov_pf_exit(device);
+
+   which will delete all the virtual functions and associated resources.
+
+2) Similarly in the implementation of the virtual function, you need to
+   make it a PCI Express device and add a similar set of capabilities
+   except for the SR/IOV capability. Then you need to set up the VF BARs as
+   subregions of the PF's SR/IOV VF BARs by calling
+   pcie_sriov_vf_register_bar() instead of the normal pci_register_bar() call:
+
+   pci_your_vf_dev_realize( ... )
+   {
+  ...
+  int ret = pcie_endpoint_cap_init(d, 0x60);
+  ...
+  pcie_ari_init(d, 0x100, 1);
+  ...
+  memory_region_init(mr, ... )
+  pcie_sriov_vf_register_bar(d, bar_nr, mr);
+  ...
+   }
+
+Testing on Linux guest
+======================
+The easiest way is if your device driver supports sysfs-based SR/IOV
+enabling. Support for this was added in kernel v3.8, so not all drivers
+support it yet.
+
+To enable 4 VFs for a device at 01:00.0:
+
+   modprobe yourdriver
+   echo 4 > /sys/bus/pci/devices/0000:01:00.0/sriov_numvfs
+
+You should now see 4 VFs with lspci.
+To turn SR/IOV off again - the standard requires you to turn it off before you can enable
+another VF count, and the emulation enforces this:
+
+   echo 0 > /sys/bus/pci/devices/0000:01:00.0/sriov_numvfs
+
+Older drivers typically provide a max_vfs module parameter
+to enable it at load time:
+
+   modprobe yourdriver max_vfs=4
+
+To disable the VFs again, you simply have to unload the driver:
+
+   rmmod yourdriver
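
To make the VF BAR formula in the documentation above concrete, the address
computation boils down to the following (a self-contained sketch; names are
illustrative and not taken from the patch):

   #include <stdint.h>

   /* Resolve an access at @offset into the BAR of VF number @vf_num,
    * given the VF BAR start from the PF's SR/IOV capability and the
    * per-VF BAR size. */
   static uint64_t vf_bar_addr(uint64_t vf_bar_start, uint32_t vf_num,
                               uint64_t bar_size, uint64_t offset)
   {
       return vf_bar_start + (uint64_t)vf_num * bar_size + offset;
   }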
-- 
2.25.1




[PATCH v4 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Lukasz Maniak
From: Knut Omang 

This patch provides the building blocks for creating an SR/IOV
PCIe Extended Capability header and register/unregister
SR/IOV Virtual Functions.

Signed-off-by: Knut Omang 
---
 hw/pci/meson.build  |   1 +
 hw/pci/pci.c| 100 +---
 hw/pci/pcie.c   |   5 +
 hw/pci/pcie_sriov.c | 294 
 hw/pci/trace-events |   5 +
 include/hw/pci/pci.h|  12 +-
 include/hw/pci/pcie.h   |   6 +
 include/hw/pci/pcie_sriov.h |  71 +
 include/qemu/typedefs.h |   2 +
 9 files changed, 470 insertions(+), 26 deletions(-)
 create mode 100644 hw/pci/pcie_sriov.c
 create mode 100644 include/hw/pci/pcie_sriov.h

diff --git a/hw/pci/meson.build b/hw/pci/meson.build
index 5c4bbac817..bcc9c75919 100644
--- a/hw/pci/meson.build
+++ b/hw/pci/meson.build
@@ -5,6 +5,7 @@ pci_ss.add(files(
   'pci.c',
   'pci_bridge.c',
   'pci_host.c',
+  'pcie_sriov.c',
   'shpc.c',
   'slotid_cap.c'
 ))
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 5d30f9ca60..ba8fb92efc 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg)
 {
 uint8_t type;
 
+/* PCIe virtual functions do not have their own BARs */
+assert(!pci_is_vf(d));
+
 if (reg != PCI_ROM_SLOT)
 return PCI_BASE_ADDRESS_0 + reg * 4;
 
@@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev)
 }
 }
 
-static void pci_do_device_reset(PCIDevice *dev)
+static void pci_reset_regions(PCIDevice *dev)
 {
 int r;
+if (pci_is_vf(dev)) {
+return;
+}
+
+for (r = 0; r < PCI_NUM_REGIONS; ++r) {
+PCIIORegion *region = &dev->io_regions[r];
+if (!region->size) {
+continue;
+}
 
+if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
+region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+pci_set_quad(dev->config + pci_bar(dev, r), region->type);
+} else {
+pci_set_long(dev->config + pci_bar(dev, r), region->type);
+}
+}
+}
+
+static void pci_do_device_reset(PCIDevice *dev)
+{
 pci_device_deassert_intx(dev);
 assert(dev->irq_state == 0);
 
@@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev)
   pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
   pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE));
 dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
-for (r = 0; r < PCI_NUM_REGIONS; ++r) {
-PCIIORegion *region = &dev->io_regions[r];
-if (!region->size) {
-continue;
-}
-
-if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
-region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
-pci_set_quad(dev->config + pci_bar(dev, r), region->type);
-} else {
-pci_set_long(dev->config + pci_bar(dev, r), region->type);
-}
-}
+pci_reset_regions(dev);
 pci_update_mappings(dev);
 
 msi_reset(dev);
@@ -884,6 +895,16 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice 
*dev, Error **errp)
 dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
 }
 
+/*
+ * With SR/IOV and ARI, a device at function 0 need not be a multifunction
+ * device, as it may just be a VF that ended up with function 0 in
+ * the legacy PCI interpretation. Avoid failing in such cases:
+ */
+if (pci_is_vf(dev) &&
+dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+return;
+}
+
 /*
  * multifunction bit is interpreted in two ways as follows.
  *   - all functions must set the bit to 1.
@@ -1083,6 +1104,7 @@ static PCIDevice *do_pci_register_device(PCIDevice 
*pci_dev,
bus->devices[devfn]->name);
 return NULL;
 } else if (dev->hotplugged &&
+   !pci_is_vf(pci_dev) &&
pci_get_function_0(pci_dev)) {
 error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
" new func %s cannot be exposed to guest.",
@@ -1191,6 +1213,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
 pcibus_t size = memory_region_size(memory);
 uint8_t hdr_type;
 
+assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */
 assert(region_num >= 0);
 assert(region_num < PCI_NUM_REGIONS);
 assert(is_power_of_2(size));
@@ -1294,11 +1317,45 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int 
region_num)
 return pci_dev->io_regions[region_num].addr;
 }
 
-static pcibus_t pci_bar_address(PCIDevice *d,
-int reg, uint8_t type, pcibus_t size)
+static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
+uint8_t type, pcibus_t size)
+{
+pcibus_t new_addr;
+if (!pci_is_vf(d)) {
+int bar = pci_bar(d, reg);
+if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+new_addr = 

Re: [PATCH for-7.0 3/4] hw/nvme: add ozcs enum

2022-01-26 Thread Keith Busch
On Thu, Nov 25, 2021 at 08:37:34AM +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add enumeration for OZCS values.

Looks good. 

Reviewed-by: Keith Busch 



Re: [PATCH for-7.0 2/4] hw/nvme: add zone attribute get/set helpers

2022-01-26 Thread Keith Busch
On Thu, Nov 25, 2021 at 08:37:33AM +0100, Klaus Jensen wrote:
> @@ -295,7 +295,7 @@ static void nvme_assign_zone_state(NvmeNamespace *ns, 
> NvmeZone *zone,
>  case NVME_ZONE_STATE_READ_ONLY:
>  break;
>  default:
> -zone->d.za = 0;
> +NVME_ZA_CLEAR_ALL(zone->d.za);
>  }
>  }
>  
> @@ -3356,7 +3356,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, 
> NvmeZone *zone)
>  return status;
>  }
>  nvme_aor_inc_active(ns);
> -zone->d.za |= NVME_ZA_ZD_EXT_VALID;
> +NVME_ZA_SET(zone->d.za, NVME_ZA_ZD_EXT_VALID);
>  nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
>  return NVME_SUCCESS;
>  }
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 2ee227760265..2b8b906466ab 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -1407,6 +1407,10 @@ enum NvmeZoneAttr {
>  NVME_ZA_ZD_EXT_VALID = 1 << 7,
>  };
>  
> +#define NVME_ZA_SET(za, attrs)   ((za) |= (attrs))
> +#define NVME_ZA_CLEAR(za, attrs) ((za) &= ~(attrs))
> +#define NVME_ZA_CLEAR_ALL(za)((za) = 0x0)

This doesn't really look any more helpful than open coding it. I think
it would appear better to take a "struct NvmeZone" type parameter
instead, and use inline functions instead of macros.
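
Something along these lines, perhaps (a sketch based on the quoted patch;
the helper names are only illustrative):

   static inline void nvme_zone_set_attr(NvmeZone *zone, uint8_t attrs)
   {
       zone->d.za |= attrs;
   }

   static inline void nvme_zone_clear_attr(NvmeZone *zone, uint8_t attrs)
   {
       zone->d.za &= ~attrs;
   }

   static inline void nvme_zone_clear_all_attrs(NvmeZone *zone)
   {
       zone->d.za = 0;
   }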



Re: [PATCH for-7.0 1/4] hw/nvme: add struct for zone management send

2022-01-26 Thread Keith Busch
On Thu, Nov 25, 2021 at 08:37:32AM +0100, Klaus Jensen wrote:
> +typedef struct QEMU_PACKED NvmeZoneSendCmd {
> +uint8_t opcode;
> +uint8_t flags;
> +uint16_tcid;
> +uint32_tnsid;
> +uint32_trsvd2[4];
> +NvmeCmdDptr dptr;
> +uint64_tslba;
> +uint32_trsvd12;
> +uint8_t zsa;
> +uint8_t zsflags[3];

This should be just a single uint8_t for zsflags, followed by 
'uint8_t rsvd[2]'.

Otherwise, looks good.

Reviewed-by: Keith Busch 



Re: [PATCH] qemu-storage-daemon: Fix typo in vhost-user-blk help

2022-01-26 Thread Hanna Reitz

On 25.01.22 16:15, Kevin Wolf wrote:

The syntax of the fd passing case misses the "addr.type=" key. Add it.

Signed-off-by: Kevin Wolf 
---
  storage-daemon/qemu-storage-daemon.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Hanna Reitz 




Re: [PATCH] fdc: check for illegal dma length calculation

2022-01-26 Thread Philippe Mathieu-Daudé via

Hi Jon,

On 26/1/22 16:44, Jon Maloy wrote:

On 1/13/22 20:33, Jon Maloy wrote:

The function fdctrl_start_transfer() calculates the dma data length
wrongly when certain boundary conditions are fulfilled. We have
noticed that if ((fdctrl->fifo[5] - fdctrl->fifo[6]) > 1) we get
a dma length that will be interpreted as negative by the next function
in the chain, fdctrl_transfer_handler(). This leads to a crash.


If a security issue is reproducible (like the ones found by fuzzers),
please include the reproducer along.


Rather than trying to fix this obscure calculation, we just check if
the harmful condition is fulfilled, and return without action if that
is the case. Since this is a condition that can only be created by a
malicious user we deem this solution safe.

This fix is intended to address CVE-2021-3507.


Ah, this is similar to:
https://lore.kernel.org/qemu-devel/2028115733.4038610-1-phi...@redhat.com/
which already contains the reproducer...
You might want to review it ;)


Signed-off-by: Jon Maloy 
---
  hw/block/fdc.c | 5 +
  1 file changed, 5 insertions(+)

diff --git a/hw/block/fdc.c b/hw/block/fdc.c
index 21d18ac2e3..80a1f1750a 100644
--- a/hw/block/fdc.c
+++ b/hw/block/fdc.c
@@ -1532,6 +1532,11 @@ static void fdctrl_start_transfer(FDCtrl 
*fdctrl, int direction)

  if (fdctrl->fifo[0] & 0x80)
  tmp += fdctrl->fifo[6];
  fdctrl->data_len *= tmp;
+    if (tmp < 0) {
+    FLOPPY_DPRINTF("calculated illegal data_len=%u, tmp=%i\n",
+   fdctrl->data_len, tmp);
+    return;
+    }
  }
  fdctrl->eot = fdctrl->fifo[6];
  if (fdctrl->dor & FD_DOR_DMAEN) {

I never received any feedback on this one.
Is there any?


Probably none so far because you forgot to Cc the maintainers:

$ ./scripts/get_maintainer.pl -f hw/block/fdc.c
John Snow  (supporter:Floppy)
Kevin Wolf  (supporter:Block layer core)
Hanna Reitz  (supporter:Block layer core)
qemu-block@nongnu.org (open list:Floppy)
qemu-de...@nongnu.org (open list:All patches CC here)

(now Cc'ed).



Re: [PATCH v3 09/16] jobs: remove aiocontext locks since the functions are under BQL

2022-01-26 Thread Emanuele Giuseppe Esposito



On 19/01/2022 12:09, Paolo Bonzini wrote:
>> @@ -3707,15 +3707,11 @@ BlockJobInfoList *qmp_query_block_jobs(Error
>> **errp)
>>     for (job = block_job_next(NULL); job; job =
>> block_job_next(job)) {
>>   BlockJobInfo *value;
>> -    AioContext *aio_context;
>>     if (block_job_is_internal(job)) {
>>   continue;
>>   }
> 
> block_job_next, block_job_query, etc. do not have the _locked suffix. Is
> this because all block_job_ functions need the job_mutex held, or just
> laziness? :)
> 

I wasn't really sure whether to touch that API naming or not (+ laziness
:D )

But it makes sense to add _locked also there. Will do.

Thank you,
Emanuele




Re: [PATCH v6 32/33] crypto: delegate permission functions to JobDriver .pre_run

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

block_crypto_amend_options_generic_luks uses the block layer
permission API, therefore it should be called with the BQL held.

However, the same function is being called by two BlockDriver
callbacks: bdrv_amend_options (under BQL) and bdrv_co_amend (I/O).

The latter is I/O because it is invoked by block/amend.c's
blockdev_amend_run(), a .run callback of the amend JobDriver

Therefore we want to 1) change block_crypto driver
to use the permission API only when the BQL is held, and
2) use the .pre_run JobDriver callback to check for
permissions before switching to the job aiocontext. This has also
the benefit of applying the same permission operation to all
amend implementations, not only luks.

Remove the permission check in block_crypto_amend_options_generic_luks()
and:
- Add helper functions block_crypto_amend_options_{prepare/cleanup}
   that take care of checking permissions in
   block_crypto_amend_options_luks(), so when it is under BQL, and

- Use job->pre_run() and job->clean() to do the same thing when
   we are in an iothread, by performing these checks before the
   job runs in its aiocontext. So far job->pre_run() is only defined
   but not called in job_start(), now it is the moment to use it.

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block/crypto.c | 57 --
  job.c  | 13 
  2 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/block/crypto.c b/block/crypto.c
index f5e0c7b7c0..bdb4ba5664 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -791,6 +791,28 @@ block_crypto_amend_cleanup(BlockDriverState *bs)
  crypto->updating_keys = false;
  }
  
+static int

+block_crypto_amend_options_prepare(BlockDriverState *bs,
+   Error **errp)
+{
+BlockCrypto *crypto = bs->opaque;
+
+/* apply for exclusive read/write permissions to the underlying file*/
+crypto->updating_keys = true;
+return bdrv_child_refresh_perms(bs, bs->file, errp);
+}
+
+static int
+block_crypto_amend_options_cleanup(BlockDriverState *bs,
+   Error **errp)
+{
+BlockCrypto *crypto = bs->opaque;
+
+/* release exclusive read/write permissions to the underlying file*/
+crypto->updating_keys = false;
+return bdrv_child_refresh_perms(bs, bs->file, errp);
+}
+


Now that I have this patch applied, it does look like it would be nicer 
if we could skip adding these functions and just reuse 
block_crypto_amend_{pre_run,cleanup}() (which would require them to call 
bdrv_child_refresh_perms()).



  static int
  block_crypto_amend_options_generic_luks(BlockDriverState *bs,
  QCryptoBlockAmendOptions 
*amend_options,
@@ -798,30 +820,17 @@ block_crypto_amend_options_generic_luks(BlockDriverState 
*bs,
  Error **errp)
  {
  BlockCrypto *crypto = bs->opaque;
-int ret;
  
  assert(crypto);

  assert(crypto->block);
  
-/* apply for exclusive read/write permissions to the underlying file*/

-crypto->updating_keys = true;
-ret = bdrv_child_refresh_perms(bs, bs->file, errp);
-if (ret) {
-goto cleanup;
-}
-
-ret = qcrypto_block_amend_options(crypto->block,
-  block_crypto_read_func,
-  block_crypto_write_func,
-  bs,
-  amend_options,
-  force,
-  errp);
-cleanup:
-/* release exclusive read/write permissions to the underlying file*/
-crypto->updating_keys = false;
-bdrv_child_refresh_perms(bs, bs->file, errp);
-return ret;
+return qcrypto_block_amend_options(crypto->block,
+   block_crypto_read_func,
+   block_crypto_write_func,
+   bs,
+   amend_options,
+   force,
+   errp);
  }
  
  static int

@@ -847,8 +856,16 @@ block_crypto_amend_options_luks(BlockDriverState *bs,
  if (!amend_options) {
  goto cleanup;
  }
+
+ret = block_crypto_amend_options_prepare(bs, errp);
+if (ret) {
+goto perm_cleanup;
+}
  ret = block_crypto_amend_options_generic_luks(bs, amend_options,
force, errp);
+
+perm_cleanup:
+block_crypto_amend_options_cleanup(bs, errp);


Uh, pre-existing but still dangerous.  We must not pass @errp here, 
because it may (and if we come from ..._prepare() failing, s/may/will/) 
already contain some error, and then, if this fails (which it very 
likely will not), we will get an assertion failure in error_setv().


We could decide that this must not 

Re: [PATCH v3 03/16] job.h: define locked functions

2022-01-26 Thread Emanuele Giuseppe Esposito



On 24/01/2022 15:26, Paolo Bonzini wrote:
> On 1/21/22 17:04, Vladimir Sementsov-Ogievskiy wrote:
>>>
>>> The split was proposed in previous versions, but Vladimir did not
>>> really like it and suggested to send it as a separate series:
>>
>> I didn't really like it as it seemed unusual and unobvious to me. But
>> if we already accepted similar split for generic block layer, no way
>> for me to resist :) And if we follow new logic of generic block layer
>> in jobs, it's not "unusual" any more.
> 
> Either way I think it's okay to have it as a follow-up.  The explicit
> naming in the API is a bit verbose but definitely clearer, so it's okay
> to order different than the graph/IO split.  In that case we weren't
> even sure, until you went through all the testcase failures, that a
> _locked or rather "_drained" API was possible.
> 
> Paolo
> 

Ok, I will send the split in a separate series.

Thank you,
Emanuele




Re: [PATCH v3 14/16] job.c: use job_get_aio_context()

2022-01-26 Thread Emanuele Giuseppe Esposito



On 24/01/2022 15:22, Paolo Bonzini wrote:
> On 1/21/22 16:18, Emanuele Giuseppe Esposito wrote:

>>>
>>> Better to use aio_co_schedule here, too, and move it under the
>>> previous WITH_JOB_LOCK_GUARD.
>>
>> Unfortunately this does not work straightforward: aio_co_enter invokes
>> aio_co_schedule only if the context is different from the main loop,
>> otherwise it can directly enter the coroutine with
>> qemu_aio_coroutine_enter. So always replacing it with aio_co_schedule
>> breaks the unit tests assumptions, as they expect that when control is
>> returned the job has already executed.
>>
>> A possible solution is to aio_poll() on the condition we want to
>> assert, waiting for the bh to be scheduled. But I don't know if this
>> is then useful to test something.
> 
> I think you sorted that out, based on IRC conversation?
> 

Yes.

Thank you,
Emanuele




Re: [PATCH v6 31/33] include/qemu/job.h: introduce job->pre_run() and use it in amend

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

Introduce .pre_run() job callback. This cb will run in job_start,
before the coroutine is created and runs run() in the job aiocontext.


I presume this means “before the coroutine is created that will run 
run() in the job aiocontext”?


(The way it’s written here sounds a bit like .pre_run() will run run())


Therefore, .pre_run() always runs in the main loop.
We can use this function together with clean() cb to replace
bdrv_child_refresh_perms in block_crypto_amend_options_generic_luks(),
since that function can also be called from an iothread via
.bdrv_co_amend().

In addition, doing so we check for permissions in all bdrv
in amend, not only crypto.


As I’ve written in my reply to patch 30, I’m not quite sold on this 
part, but disregarding that part (and one typo below), this patch looks 
good to me.



.pre_run() and .clean() take care of calling bdrv_amend_pre_run()
and bdrv_amend_clean() respectively, to set up driver-specific flags
and allow the crypto driver to temporarily provide the WRITE
perm to qcrypto_block_amend_options().

.pre_run() is not yet invoked by job_start, but .clean() is.
This is not a problem, since it will just be a redundant check
and crypto will have the updating_keys flag == false anyway.

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block/amend.c  | 33 +
  include/qemu/job.h |  9 +
  2 files changed, 42 insertions(+)


[...]


diff --git a/include/qemu/job.h b/include/qemu/job.h
index 4ea7a4a0cd..915ceff425 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -223,6 +223,15 @@ struct JobDriver {
   * the GS API.
   */
  
+/**

+ * Called in the Main Loop context, before the job coroutine
+ * is started in the job aiocontext. Useful to perform operations
+ * that needs to be done under BQL (like permissions setup).


s/needs/need/


+ * On success, it returns 0. Otherwise the job failed to initialize
+ * and won't start.
+ */
+int (*pre_run)(Job *job, Error **errp);
+
  /**
   * Called when the job is resumed by the user (i.e. user_paused becomes
   * false). .user_resume is called before .resume.





Re: [PATCH 0/1 v2] Patch to adjust coroutine pool size adaptively

2022-01-26 Thread Stefan Hajnoczi
On Mon, Jan 24, 2022 at 10:01:30AM +, Hiroki Narukawa wrote:
> ping, how is the status of this patch?
> 
> Link for this patch v2 on patchew is this one: 
> https://patchew.org/QEMU/20220111091950.840-1-hnaru...@yahoo-corp.jp/
> The last message on patch v1 is this one: 
> https://lore.kernel.org/qemu-devel/tycpr01mb8357e8d13d661265cdbb442c80...@tycpr01mb8357.jpnprd01.prod.outlook.com/T/#u
> 
> The difference from v1 is use of qatomic_read().

Hi,
Sorry for the slow response. I will review this tomorrow (Thursday)!

Stefan


signature.asc
Description: PGP signature


Re: [PATCH v6 30/33] include/block/block_int-common.h: introduce bdrv_amend_pre_run and bdrv_amend_clean

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

These two callbacks will be invoked by job callbacks to execute
driver-specific code while still being in BQL.
In this example, we want the amend JobDriver to execute the
permission check (bdrv_child_refresh_perms) currently only
done in block/crypto.c block_crypto_amend_options_generic_luks()
to all its bdrv.
This is achieved by introducing callbacks in the JobDriver, but
we also need to make sure that crypto->updating_keys is true
before refreshing the permissions the first time, so that
WRITE perm is temporarily given to qcrypto_block_amend_options(),
and set it to false when permissions are restored.

Therefore bdrv_amend_pre_run() and bdrv_amend_clean() will take care of
just temporarily setting the crypto-specific updating_keys flag.

Note that at this stage, they are not yet invoked.

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block/crypto.c   | 16 
  include/block/block_int-common.h | 13 +
  2 files changed, 29 insertions(+)

diff --git a/block/crypto.c b/block/crypto.c
index c8ba4681e2..f5e0c7b7c0 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -777,6 +777,20 @@ block_crypto_get_specific_info_luks(BlockDriverState *bs, 
Error **errp)
  return spec_info;
  }
  
+static void

+block_crypto_amend_pre_run(BlockDriverState *bs)
+{
+BlockCrypto *crypto = bs->opaque;
+crypto->updating_keys = true;
+}


So I see you decided to leave actually updating the permissions to the 
caller, i.e. blockdev_amend_pre_run().  Why?


I’m asking because:

(1) If the .bdrv_amend_pre_run() isn’t implemented, I don’t think 
refreshing the permissions in blockdev_amend_pre_run() is necessary.  So 
if it is implemented, the implementation might as well do it itself.


(2) The way you implemented it, it’s actually kind of hard to see why 
this is even necessary.  Both of these functions 
(block_crypto_amend_{pre_run,cleanup}()) don’t require the BQL, so the 
explanation for .bdrv_amend_pre_run() (“useful to perform 
driver-specific initialization code under BQL”) doesn’t really apply.  
If you want to explain (and we should) why this is necessary, then the 
.bdrv_amend_pre_run() documentation needs to state that we will refresh 
the permissions after this function has run and before .bdrv_co_amend() 
will run, and so it’s also useful to put code here that will change the 
permissions on permission update, but that just really gets complicated 
to explain.  Naively, I find it simpler to just say “Put BQL code here, 
this will run before .bdrv_co_amend()” and have every implementation 
that needs it refresh the permissions itself.


(3) In patch 32, you add 
block_crypto_amend_options_{prepare,cleanup}().  If the functions added 
here (block_crypto_amend_{pre_run,cleanup}()) would refresh the 
permissions by themselves, they’d be exactly the same functions, so you 
could reuse the ones from here in patch 32.
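
For illustration, such a merged helper could look roughly like this (a
sketch only; note the return type would have to change from the patch's
void so the permission result can be propagated):

   static int block_crypto_amend_pre_run(BlockDriverState *bs, Error **errp)
   {
       BlockCrypto *crypto = bs->opaque;

       /* apply for exclusive read/write permissions to the underlying file */
       crypto->updating_keys = true;
       return bdrv_child_refresh_perms(bs, bs->file, errp);
   }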



+
+static void
+block_crypto_amend_cleanup(BlockDriverState *bs)
+{
+BlockCrypto *crypto = bs->opaque;
+crypto->updating_keys = false;
+}
+
  static int
  block_crypto_amend_options_generic_luks(BlockDriverState *bs,
  QCryptoBlockAmendOptions 
*amend_options,
@@ -931,6 +945,8 @@ static BlockDriver bdrv_crypto_luks = {
  .bdrv_get_specific_info = block_crypto_get_specific_info_luks,
  .bdrv_amend_options = block_crypto_amend_options_luks,
  .bdrv_co_amend  = block_crypto_co_amend_luks,
+.bdrv_amend_pre_run   = block_crypto_amend_pre_run,
+.bdrv_amend_clean = block_crypto_amend_cleanup,
  
  .is_format  = true,
  
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h

index cc8c8835ba..9d28396978 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -189,6 +189,19 @@ struct BlockDriver {
   * the GS API.
   */
  
+/*

+ * Called inside job->pre_run() callback, it is useful
+ * to perform driver-specific initialization code under
+ * BQL, like setting up specific permission flags.


I wouldn’t necessarily state that this function is called by 
`job->pre_run()`, but rather paint the picture of how it’s used together 
with `.bdrv_co_amend()`, e.g. something like:


“This function is invoked under the BQL before .bdrv_co_amend() (which 
in contrast does not necessarily run under the BQL) to allow 
driver-specific initialization code that requires the BQL, like setting 
up specific permission flags.”


(Though this comment should be much more specific if we keep updating 
the permissions in the caller, because then the comment also needs to 
explain that we do that.)



+ */
+void (*bdrv_amend_pre_run)(BlockDriverState *bs);
+/*
+ * Called inside job->clean() callback, it undoes
+ * the driver-specific initialization code done in amend_pre_run.
+ * Also this function is under BQL.


Here too I’d omit the job 

Re: [PATCH v6 29/33] job.h: assertions in the callers of JobDriver funcion pointers

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

Signed-off-by: Emanuele Giuseppe Esposito 
---
  job.c | 9 +
  1 file changed, 9 insertions(+)


Just curious, why did you remove the assertion in job_co_entry()? 
(Looking at it again, it might have been nicer to swap it with the 
assertion below it, so that `job != NULL` is asserted first, but other 
than that...)


(And since I’m already replying to this patch, might as well point out 
s/funcion/function/ in the subject)


Hanna




Re: [PATCH] block/export: Fix vhost-user-blk shutdown with requests in flight

2022-01-26 Thread Stefan Hajnoczi
On Tue, Jan 25, 2022 at 04:14:35PM +0100, Kevin Wolf wrote:
> The vhost-user-blk export runs requests asynchronously in their own
> coroutine. When the vhost connection goes away and we want to stop the
> vhost-user server, we need to wait for these coroutines to stop before
> we can unmap the shared memory. Otherwise, they would still access the
> unmapped memory and crash.
> 
> This introduces a refcount to VuServer which is increased when spawning
> a new request coroutine and decreased before the coroutine exits. The
> memory is only unmapped when the refcount reaches zero.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  include/qemu/vhost-user-server.h |  5 +
>  block/export/vhost-user-blk-server.c |  5 +
>  util/vhost-user-server.c | 22 ++
>  3 files changed, 32 insertions(+)
> 
> diff --git a/include/qemu/vhost-user-server.h 
> b/include/qemu/vhost-user-server.h
> index 121ea1dedf..cd43193b80 100644
> --- a/include/qemu/vhost-user-server.h
> +++ b/include/qemu/vhost-user-server.h
> @@ -42,6 +42,8 @@ typedef struct {
>  const VuDevIface *vu_iface;
>  
>  /* Protected by ctx lock */
> +unsigned int refcount;
> +bool wait_idle;
>  VuDev vu_dev;
>  QIOChannel *ioc; /* The I/O channel with the client */
>  QIOChannelSocket *sioc; /* The underlying data channel with the client */
> @@ -59,6 +61,9 @@ bool vhost_user_server_start(VuServer *server,
>  
>  void vhost_user_server_stop(VuServer *server);
>  
> +void vhost_user_server_ref(VuServer *server);
> +void vhost_user_server_unref(VuServer *server);
> +
>  void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx);
>  void vhost_user_server_detach_aio_context(VuServer *server);
>  
> diff --git a/block/export/vhost-user-blk-server.c 
> b/block/export/vhost-user-blk-server.c
> index 1862563336..a129204c44 100644
> --- a/block/export/vhost-user-blk-server.c
> +++ b/block/export/vhost-user-blk-server.c
> @@ -172,6 +172,7 @@ vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct 
> iovec *iov,
>  return VIRTIO_BLK_S_IOERR;
>  }
>  
> +/* Called with server refcount increased, must decrease before returning */
>  static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
>  {
>  VuBlkReq *req = opaque;
> @@ -286,10 +287,12 @@ static void coroutine_fn vu_blk_virtio_process_req(void 
> *opaque)
>  }
>  
>  vu_blk_req_complete(req);
> +vhost_user_server_unref(server);
>  return;
>  
>  err:
>  free(req);
> +vhost_user_server_unref(server);
>  }
>  
>  static void vu_blk_process_vq(VuDev *vu_dev, int idx)
> @@ -310,6 +313,8 @@ static void vu_blk_process_vq(VuDev *vu_dev, int idx)
>  
>  Coroutine *co =
>  qemu_coroutine_create(vu_blk_virtio_process_req, req);
> +
> +vhost_user_server_ref(server);
>  qemu_coroutine_enter(co);

Why not increment inside vu_blk_virtio_process_req()? My understanding
is the coroutine is entered immediately so there is no race that needs
to be protected against by incrementing the refcount early.
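
I.e. roughly this (a sketch; it assumes the request keeps a pointer back to
the server, as the unref call in the patch suggests):

   static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
   {
       VuBlkReq *req = opaque;
       VuServer *server = req->server;

       /* take the reference in the coroutine itself */
       vhost_user_server_ref(server);

       /* ... request processing unchanged, unref before returning ... */
   }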

>  }
>  }
> diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
> index f68287e811..f66fbba710 100644
> --- a/util/vhost-user-server.c
> +++ b/util/vhost-user-server.c
> @@ -74,6 +74,20 @@ static void panic_cb(VuDev *vu_dev, const char *buf)
>  error_report("vu_panic: %s", buf);
>  }
>  
> +void vhost_user_server_ref(VuServer *server)
> +{
> +assert(!server->wait_idle);
> +server->refcount++;
> +}
> +
> +void vhost_user_server_unref(VuServer *server)
> +{
> +server->refcount--;
> +if (server->wait_idle && !server->refcount) {
> +aio_co_wake(server->co_trip);
> +}
> +}
> +
>  static bool coroutine_fn
>  vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
>  {
> @@ -177,6 +191,14 @@ static coroutine_fn void vu_client_trip(void *opaque)
>  /* Keep running */
>  }
>  
> +if (server->refcount) {
> +/* Wait for requests to complete before we can unmap the memory */
> +server->wait_idle = true;
> +qemu_coroutine_yield();
> +server->wait_idle = false;
> +}
> +assert(server->refcount == 0);
> +
>  vu_deinit(vu_dev);
>  
>  /* vu_deinit() should have called remove_watch() */
> -- 
> 2.31.1
> 


signature.asc
Description: PGP signature


Re: [PATCH for-7.0 0/4] hw/nvme: zoned random write area

2022-01-26 Thread Klaus Jensen
+CC Dmitry

On Jan 26 13:15, Stefan Hajnoczi wrote:
> On Wed, Jan 26, 2022 at 09:58:59AM +0100, Klaus Jensen wrote:
> > On Nov 25 08:37, Klaus Jensen wrote:
> > > From: Klaus Jensen 
> > > 
> > > This series adds support for a zoned random write area as standardized
> > > in TP 4076 ("Zoned Random Write Area").
> > > 
> > > Klaus Jensen (4):
> > >   hw/nvme: add struct for zone management send
> > >   hw/nvme: add zone attribute get/set helpers
> > >   hw/nvme: add ozcs enum
> > >   hw/nvme: add support for zoned random write area
> > > 
> > >  hw/nvme/ctrl.c   | 185 ---
> > >  hw/nvme/ns.c |  61 +-
> > >  hw/nvme/nvme.h   |  10 +++
> > >  hw/nvme/trace-events |   1 +
> > >  include/block/nvme.h |  43 +-
> > >  5 files changed, 271 insertions(+), 29 deletions(-)
> > > 
> > > -- 
> > > 2.34.0
> > > 
> > 
> > Bz ping :)
> 
> Hi Klaus,
> Are you pinging Keith? It's not clear from the "To:" header and I want
> to check that I'm not holding up your patches.
> 

Oh, I've never actually considered moving those that I ping to "To:",
makes sense!

And no - you are not holding up anything. I'm just pinging anyone
interested in hw/nvme (and zns).


signature.asc
Description: PGP signature


Re: [PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Knut Omang
On Wed, 2022-01-26 at 14:23 +0100, Łukasz Gieryk wrote:
> I'm sorry for the delayed response. We (I and the other Lukasz) somehow
> had hoped that Knut, the original author of this patch, would have
> responded.

Yes, sorry - this one flushed past me here for some reason,

> 
> Let me address your questions, up to my best knowledge.
>   
> > > -static pcibus_t pci_bar_address(PCIDevice *d,
> > > -    int reg, uint8_t type, pcibus_t size)
> > > +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
> > > +    uint8_t type, pcibus_t size)
> > > +{
> > > +    pcibus_t new_addr;
> > > +    if (!pci_is_vf(d)) {
> > > +    int bar = pci_bar(d, reg);
> > > +    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > +    new_addr = pci_get_quad(d->config + bar);
> > > +    } else {
> > > +    new_addr = pci_get_long(d->config + bar);
> > > +    }
> > > +    } else {
> > > +    PCIDevice *pf = d->exp.sriov_vf.pf;
> > > +    uint16_t sriov_cap = pf->exp.sriov_cap;
> > > +    int bar = sriov_cap + PCI_SRIOV_BAR + reg * 4;
> > > +    uint16_t vf_offset = pci_get_word(pf->config + sriov_cap +
> > > PCI_SRIOV_VF_OFFSET);
> > > +    uint16_t vf_stride = pci_get_word(pf->config + sriov_cap +
> > > PCI_SRIOV_VF_STRIDE);
> > > +    uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / 
> > > vf_stride;
> > > +
> > > +    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > +    new_addr = pci_get_quad(pf->config + bar);
> > > +    } else {
> > > +    new_addr = pci_get_long(pf->config + bar);
> > > +    }
> > > +    new_addr += vf_num * size;
> > > +    }
> > > +    if (reg != PCI_ROM_SLOT) {
> > > +    /* Preserve the rom enable bit */
> > > +    new_addr &= ~(size - 1);
> > 
> > This comment puzzles me. How does clearing low bits preserve
> > any bits? Looks like this will clear low bits if any.
> > 
> 
> I think the comment applies to (reg != PCI_ROM_SLOT), i.e., the bits are
> cleared for BARs, but not for expansion ROM. I agree the placement of this
> comment is slightly misleading. We will move it up and rephrase slightly.

I agree - it's maybe better to just put the comment above the if(...).
Other than that I believe it is correct.

Knut

>  
> > > +pcibus_t pci_bar_address(PCIDevice *d,
> > > + int reg, uint8_t type, pcibus_t size)
> > >   {
> > >   pcibus_t new_addr, last_addr;
> > > -    int bar = pci_bar(d, reg);
> > >   uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
> > >   Object *machine = qdev_get_machine();
> > >   ObjectClass *oc = object_get_class(machine);
> > > @@ -1309,7 +1363,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> > >   if (!(cmd & PCI_COMMAND_IO)) {
> > >   return PCI_BAR_UNMAPPED;
> > >   }
> > > -    new_addr = pci_get_long(d->config + bar) & ~(size - 1);
> > > +    new_addr = pci_config_get_bar_addr(d, reg, type, size);
> > >   last_addr = new_addr + size - 1;
> > >   /* Check if 32 bit BAR wraps around explicitly.
> > >    * TODO: make priorities correct and remove this work around.
> > > @@ -1324,11 +1378,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> > >   if (!(cmd & PCI_COMMAND_MEMORY)) {
> > >   return PCI_BAR_UNMAPPED;
> > >   }
> > > -    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > -    new_addr = pci_get_quad(d->config + bar);
> > > -    } else {
> > > -    new_addr = pci_get_long(d->config + bar);
> > > -    }
> > > +    new_addr = pci_config_get_bar_addr(d, reg, type, size);
> > >   /* the ROM slot has a specific enable bit */
> > >   if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
> > 
> > And in fact here we check the low bit and handle it specially.
> 
> The code seems correct for me. The bit is preserved for ROM case.
> 
> > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > > index d7d73a31e4..182a225054 100644
> > > --- a/hw/pci/pcie.c
> > > +++ b/hw/pci/pcie.c
> > > @@ -446,6 +446,11 @@ void pcie_cap_slot_plug_cb(HotplugHandler 
> > > *hotplug_dev,
> > > DeviceState *dev,
> > >   PCIDevice *pci_dev = PCI_DEVICE(dev);
> > >   uint32_t lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> > >   
> > > +    if(pci_is_vf(pci_dev)) {
> > > +    /* We don't want to change any state in hotplug_dev for SR/IOV 
> > > virtual
> > > functions */
> > > +    return;
> > > +    }
> > > +
> > 
> > Coding style violation here.  And pls document the why not the what.
> > E.g. IIRC the reason is that VFs don't have an express capability,
> > right?
> 
> I think the reason is that virtual functions don’t exist physically, so
> they cannot be individually disconnected. Only PF should respond to
> hotplug events, implicitly disconnecting (thus: destroying) all child
> VFs.
> 
> Anyway, we will update this comment to state *why* and add the missing
> 

Re: [PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Łukasz Gieryk
I'm sorry for the delayed response. We (I and the other Lukasz) somehow
had hoped that Knut, the original author of this patch, would have
responded.

Let me address your questions, up to my best knowledge.
  
> > -static pcibus_t pci_bar_address(PCIDevice *d,
> > -int reg, uint8_t type, pcibus_t size)
> > +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
> > +uint8_t type, pcibus_t size)
> > +{
> > +pcibus_t new_addr;
> > +if (!pci_is_vf(d)) {
> > +int bar = pci_bar(d, reg);
> > +if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +new_addr = pci_get_quad(d->config + bar);
> > +} else {
> > +new_addr = pci_get_long(d->config + bar);
> > +}
> > +} else {
> > +PCIDevice *pf = d->exp.sriov_vf.pf;
> > +uint16_t sriov_cap = pf->exp.sriov_cap;
> > +int bar = sriov_cap + PCI_SRIOV_BAR + reg * 4;
> > +uint16_t vf_offset = pci_get_word(pf->config + sriov_cap + 
> > PCI_SRIOV_VF_OFFSET);
> > +uint16_t vf_stride = pci_get_word(pf->config + sriov_cap + 
> > PCI_SRIOV_VF_STRIDE);
> > +uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / vf_stride;
> > +
> > +if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +new_addr = pci_get_quad(pf->config + bar);
> > +} else {
> > +new_addr = pci_get_long(pf->config + bar);
> > +}
> > +new_addr += vf_num * size;
> > +}
> > +if (reg != PCI_ROM_SLOT) {
> > +/* Preserve the rom enable bit */
> > +new_addr &= ~(size - 1);
> 
> This comment puzzles me. How does clearing low bits preserve
> any bits? Looks like this will clear low bits if any.
> 

I think the comment applies to (reg != PCI_ROM_SLOT), i.e., the bits are
cleared for BARs, but not for expansion ROM. I agree the placement of this
comment is slightly misleading. We will move it up and rephrase slightly.
 
> > +pcibus_t pci_bar_address(PCIDevice *d,
> > + int reg, uint8_t type, pcibus_t size)
> >  {
> >  pcibus_t new_addr, last_addr;
> > -int bar = pci_bar(d, reg);
> >  uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
> >  Object *machine = qdev_get_machine();
> >  ObjectClass *oc = object_get_class(machine);
> > @@ -1309,7 +1363,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> >  if (!(cmd & PCI_COMMAND_IO)) {
> >  return PCI_BAR_UNMAPPED;
> >  }
> > -new_addr = pci_get_long(d->config + bar) & ~(size - 1);
> > +new_addr = pci_config_get_bar_addr(d, reg, type, size);
> >  last_addr = new_addr + size - 1;
> >  /* Check if 32 bit BAR wraps around explicitly.
> >   * TODO: make priorities correct and remove this work around.
> > @@ -1324,11 +1378,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> >  if (!(cmd & PCI_COMMAND_MEMORY)) {
> >  return PCI_BAR_UNMAPPED;
> >  }
> > -if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > -new_addr = pci_get_quad(d->config + bar);
> > -} else {
> > -new_addr = pci_get_long(d->config + bar);
> > -}
> > +new_addr = pci_config_get_bar_addr(d, reg, type, size);
> >  /* the ROM slot has a specific enable bit */
> >  if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
> 
> And in fact here we check the low bit and handle it specially.

The code seems correct for me. The bit is preserved for ROM case.

> > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > index d7d73a31e4..182a225054 100644
> > --- a/hw/pci/pcie.c
> > +++ b/hw/pci/pcie.c
> > @@ -446,6 +446,11 @@ void pcie_cap_slot_plug_cb(HotplugHandler 
> > *hotplug_dev, DeviceState *dev,
> >  PCIDevice *pci_dev = PCI_DEVICE(dev);
> >  uint32_t lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> >  
> > +if(pci_is_vf(pci_dev)) {
> > +/* We don't want to change any state in hotplug_dev for SR/IOV 
> > virtual functions */
> > +return;
> > +}
> > +
> 
> Coding style violation here.  And pls document the why not the what.
> E.g. IIRC the reason is that VFs don't have an express capability,
> right?

I think the reason is that virtual functions don’t exist physically, so
they cannot be individually disconnected. Only PF should respond to
hotplug events, implicitly disconnecting (thus: destroying) all child
VFs.

Anyway, we will update this comment to state *why* and add the missing
space.

V4 with the mentioned changes will happen really soon.




Re: [PATCH for-7.0 0/4] hw/nvme: zoned random write area

2022-01-26 Thread Stefan Hajnoczi
On Wed, Jan 26, 2022 at 09:58:59AM +0100, Klaus Jensen wrote:
> On Nov 25 08:37, Klaus Jensen wrote:
> > From: Klaus Jensen 
> > 
> > This series adds support for a zoned random write area as standardized
> > in TP 4076 ("Zoned Random Write Area").
> > 
> > Klaus Jensen (4):
> >   hw/nvme: add struct for zone management send
> >   hw/nvme: add zone attribute get/set helpers
> >   hw/nvme: add ozcs enum
> >   hw/nvme: add support for zoned random write area
> > 
> >  hw/nvme/ctrl.c   | 185 ---
> >  hw/nvme/ns.c |  61 +-
> >  hw/nvme/nvme.h   |  10 +++
> >  hw/nvme/trace-events |   1 +
> >  include/block/nvme.h |  43 +-
> >  5 files changed, 271 insertions(+), 29 deletions(-)
> > 
> > -- 
> > 2.34.0
> > 
> 
> Bz ping :)

Hi Klaus,
Are you pinging Keith? It's not clear from the "To:" header and I want
to check that I'm not holding up your patches.

Thanks,
Stefan


signature.asc
Description: PGP signature


Re: [PATCH v6 26/33] block_int-common.h: assertions in the callers of BdrvChildClass function pointers

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block.c | 9 +
  1 file changed, 9 insertions(+)

diff --git a/block.c b/block.c
index 448fb9d76f..ca16d90627 100644
--- a/block.c
+++ b/block.c


[...]


@@ -2120,6 +2121,7 @@ bool bdrv_is_writable(BlockDriverState *bs)
  
  static char *bdrv_child_user_desc(BdrvChild *c)

  {
+assert(qemu_in_main_thread());
  return c->klass->get_parent_desc(c);
  }


Quick note: Whether we really want this depends on whether we find that 
`.get_parent_desc()` really should be a GS-only function.


Since I leave that up to you, though (and this patch interestingly (and 
correctly) doesn’t add an assertion to bdrv_get_parent_name(), even 
though that calls `.get_name()`, which the previous patch did classify 
as GS):


Reviewed-by: Hanna Reitz 




Re: [PATCH v6 25/33] block_int-common.h: split function pointers in BdrvChildClass

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

Signed-off-by: Emanuele Giuseppe Esposito 
---
  include/block/block_int-common.h | 67 +++-
  1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index e007dbf768..cc8c8835ba 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -815,12 +815,16 @@ struct BdrvChildClass {
   */
  bool parent_is_bds;
  
+/*

+ * Global state (GS) API. These functions run under the BQL lock.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
  void (*inherit_options)(BdrvChildRole role, bool parent_is_format,
  int *child_flags, QDict *child_options,
  int parent_flags, QDict *parent_options);
-
  void (*change_media)(BdrvChild *child, bool load);
-void (*resize)(BdrvChild *child);
  
  /*

   * Returns a name that is supposedly more useful for human users than the


The method this comment belongs to is `.get_name()`.  It’s exposed 
through `bdrv_get_parent_name()`, which is called by 
`bdrv_get_device_name()` and `bdrv_get_device_or_node_name()` – so I 
think it should be classified as I/O.



@@ -837,6 +841,40 @@ struct BdrvChildClass {
   */
  char *(*get_parent_desc)(BdrvChild *child);


This function is very similar, so we might also want to reconsider 
classifying it as I/O.  There’s no need, because all of its callers do 
run in the main thread, but at the same time I don’t believe there’s 
anything stopping us (and it starts to sound to me like all functions of 
the “get name” kind perhaps should ideally be I/O, in that they 
shouldn’t require the GS context).


Up to you. O:)

(Rest of this patch looks good!)

Hanna




Re: [PATCH v6 23/33] block_int-common.h: split function pointers in BlockDriver

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

Similar to the header split, also the function pointers in BlockDriver
can be split in I/O and global state.

Signed-off-by: Emanuele Giuseppe Esposito 
---
  include/block/block_int-common.h | 434 ---
  1 file changed, 231 insertions(+), 203 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 70534f94ae..e007dbf768 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h


[...]


@@ -159,7 +148,66 @@ struct BlockDriver {


[...]


+/*
+ * Global state (GS) API. These functions run under the BQL lock.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
+/*
+ * Return true if @to_replace can be replaced by a BDS with the
+ * same data as @bs without it affecting @bs's behavior (that is,
+ * without it being visible to @bs's parents).
+ */
+bool (*bdrv_recurse_can_replace)(BlockDriverState *bs,
+ BlockDriverState *to_replace);
+
+int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);


Considering that its only caller (bdrv_probe_all()) is now an I/O 
function, shouldn’t this be, too?


Hanna




Re: [PATCH v6 21/33] block: move BQL logic of bdrv_co_invalidate_cache in bdrv_activate

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

Split bdrv_co_invalidate_cache in two: the GS code that takes
care of permissions and running GS callbacks, and leave only the
I/O code (->bdrv_co_invalidate_cache) running in the I/O coroutine.

The only side effect is that bdrv_co_invalidate_cache is not
recursive anymore, and so is every direct call to
bdrv_invalidate_cache().

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block.c | 36 +---
  1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/block.c b/block.c
index 7ab5031027..bad834c86e 100644
--- a/block.c
+++ b/block.c


[...]


@@ -6579,7 +6576,7 @@ int coroutine_fn 
bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
   * Note that the required permissions of inactive images are always a
   * subset of the permissions required after activating the image. This
   * allows us to just get the permissions upfront without restricting
- * drv->bdrv_invalidate_cache().
+ * drv->bdrv_co_invalidate_cache().


Perhaps just `bdrv_invalidate_cache()`, without the `drv->`?


   *
   * It also means that in error cases, we don't have to try and revert to
   * the old permissions (which is an operation that could fail, too). We 
can
@@ -6594,14 +6591,7 @@ int coroutine_fn 
bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
  return ret;
  }
  
-if (bs->drv->bdrv_co_invalidate_cache) {

-bs->drv->bdrv_co_invalidate_cache(bs, &local_err);
-if (local_err) {
-bs->open_flags |= BDRV_O_INACTIVE;
-error_propagate(errp, local_err);
-return -EINVAL;
-}
-}
+bdrv_invalidate_cache(bs, errp);


We should check the returned value here, and return on error.

Other than that, looks good and makes sense.

Hanna


  FOR_EACH_DIRTY_BITMAP(bs, bm) {
  bdrv_dirty_bitmap_skip_store(bm, false);





Re: [PATCH v6 20/33] block: rename bdrv_invalidate_cache_all, blk_invalidate_cache and test_sync_op_invalidate_cache

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

Following the bdrv_activate renaming, change also the name
of the respective callers.

bdrv_invalidate_cache_all -> bdrv_activate_all
blk_invalidate_cache -> blk_activate
test_sync_op_invalidate_cache -> test_sync_op_activate

No functional change intended.

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block.c|  2 +-
  block/block-backend.c  |  2 +-
  hw/block/pflash_cfi01.c|  2 +-
  hw/nvram/spapr_nvram.c |  2 +-
  include/block/block-global-state.h |  2 +-
  include/sysemu/block-backend-io.h  |  2 +-
  migration/block.c  |  2 +-
  migration/migration.c  | 10 +-
  migration/savevm.c |  4 ++--
  monitor/qmp-cmds.c |  2 +-
  tests/unit/test-block-iothread.c   |  6 +++---
  11 files changed, 18 insertions(+), 18 deletions(-)


Reviewed-by: Hanna Reitz 




Re: [PATCH v6 19/33] block: introduce bdrv_activate

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

This function is currently just a wrapper for bdrv_invalidate_cache(),
but in future will contain the code of bdrv_co_invalidate_cache() that
has to always be protected by BQL, and leave the rest in the I/O
coroutine.

Replace all bdrv_invalidate_cache() invocations with bdrv_activate().

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block.c| 7 ++-
  block/block-backend.c  | 2 +-
  block/export/export.c  | 2 +-
  block/parallels.c  | 2 +-
  include/block/block-global-state.h | 2 +-
  tests/unit/test-block-iothread.c   | 2 +-
  6 files changed, 11 insertions(+), 6 deletions(-)


Reviewed-by: Hanna Reitz 




Re: [PATCH 00/12] Removal of Aiocontext lock through drains: protect bdrv_replace_child_noperm.

2022-01-26 Thread Stefan Hajnoczi
If I understand correctly aio_context_acquire() calls are not explicitly
removed in this patch series but switching to BDRV_POLL_WHILE_UNLOCKED()
means that we no longer need to run under the AioContext lock?

Still, I'm a bit surprised I didn't notice any
aio_context_acquire/release() removals in this patch series because I
thought callers need that before they switch to
BDRV_POLL_WHILE_UNLOCKED()?


signature.asc
Description: PGP signature


Re: [PATCH 09/12] jobs: ensure sleep in job_sleep_ns is fully performed

2022-01-26 Thread Stefan Hajnoczi
On Tue, Jan 18, 2022 at 11:27:35AM -0500, Emanuele Giuseppe Esposito wrote:
> If a drain happens while a job is sleeping, the timeout
> gets cancelled and the job continues once the drain ends.
> This is especially bad for the sleep performed in commit and stream
> jobs, since that is dictated by ratelimit to maintain a certain speed.
> 
> Basically the execution path is the followig:

s/followig/following/

> 1. job calls job_sleep_ns, and yield with a timer in @ns ns.
> 2. meanwhile, a drain is executed, and
>child_job_drained_{begin/end} could be executed as ->drained_begin()
>and ->drained_end() callbacks.
>Therefore child_job_drained_begin() enters the job, that continues
>execution in job_sleep_ns() and calls job_pause_point_locked().
> 3. job_pause_point_locked() detects that we are in the middle of a
>drain, and firstly deletes any existing timer and then yields again,
>waiting for ->drained_end().
> 4. Once draining is finished, child_job_drained_end() runs and resumes
>the job. At this point, the timer has been lost and we just resume
>without checking if enough time has passed.
> 
> This fix implies that from now onwards, job_sleep_ns will force the job
> to sleep @ns, even if it is woken up (purposefully or not) in the middle
> of the sleep. Therefore qemu-iotests test might run a little bit slower,
> depending on the speed of the job. Setting a job speed to values like "1"
> is not allowed anymore (unless you want to wait forever).
> 
> Because of this fix, test_stream_parallel() in tests/qemu-iotests/030
> takes too long, since speed of stream job is just 1024 and before
> it was skipping all the wait thanks to the drains. Increase the
> speed to 256 * 1024. Exactly the same happens for test 151.
> 
> Instead we need to sleep less in test_cancel_ready() test-blockjob.c,
> so that the job will be able to exit the sleep and transition to ready
> before the main loop asserts.

I remember seeing Hanna and Kevin use carefully rate-limited jobs in
qemu-iotests. They might have thoughts on whether this patch is
acceptable or not.


signature.asc
Description: PGP signature


Re: [PATCH 08/12] reopen: add a transaction to drain_end nodes picked in bdrv_reopen_parse_file_or_backing

2022-01-26 Thread Stefan Hajnoczi
On Tue, Jan 18, 2022 at 11:27:34AM -0500, Emanuele Giuseppe Esposito wrote:
> Depending on the options given to reopen_state,
> bdrv_reopen_parse_file_or_backing could pick another bs
> that could be from another graph, and thus not protected
> by subtree_drained_begin called by the callers of this
> function.
> 
> We can't simply drain-undrain here, because of transactions.
> To simplify the logic, transactions always assume that they
> are run under drain, so the various subtree_drain introduced
> so far always take care of covering tran_commit().
> 
> And since we cannot directly do it, as the transaction is
> created/committed higher above, we can just add a new
> transaction to the list that just executes subtree_drained_end
> to match the drained_begin done in this function.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> ---
>  block.c | 25 -
>  1 file changed, 20 insertions(+), 5 deletions(-)
> 
> diff --git a/block.c b/block.c
> index fb5bc3077a..fcc44a49a0 100644
> --- a/block.c
> +++ b/block.c
> @@ -4522,6 +4522,10 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, 
> bool read_only,
>  return bdrv_reopen(bs, opts, true, errp);
>  }
>  
> +TransactionActionDrv bdrv_drv_subtree_end = {
> +.clean = (void (*)(void *)) bdrv_subtree_drained_end_unlocked,

Please don't cast function pointers. If the types don't match please
define a wrapper function so the compiler can check the types.
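
For example (a sketch; this assumes bdrv_subtree_drained_end_unlocked()
takes the BlockDriverState pointer that is passed as the transaction's
opaque value):

   static void bdrv_subtree_drained_end_unlocked_clean(void *opaque)
   {
       BlockDriverState *bs = opaque;

       bdrv_subtree_drained_end_unlocked(bs);
   }

   TransactionActionDrv bdrv_drv_subtree_end = {
       .clean = bdrv_subtree_drained_end_unlocked_clean,
   };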


signature.asc
Description: PGP signature


Re: [PATCH 07/12] block/io.c: introduce bdrv_subtree_drained_{begin/end}_unlocked

2022-01-26 Thread Stefan Hajnoczi
On Tue, Jan 18, 2022 at 11:27:33AM -0500, Emanuele Giuseppe Esposito wrote:
> diff --git a/block/io.c b/block/io.c
> index 5123b7b713..9d5167f64a 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -244,6 +244,7 @@ typedef struct {
>  bool begin;
>  bool recursive;
>  bool poll;
> +bool unlock;
>  BdrvChild *parent;
>  bool ignore_bds_parents;
>  int *drained_end_counter;
...
> @@ -473,23 +477,35 @@ static void bdrv_do_drained_begin(BlockDriverState *bs, 
> bool recursive,
>   */
>  if (poll) {
>  assert(!ignore_bds_parents);
> -BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, 
> parent));
> +if (unlock) {

"Unlock" is a verb that suggests we'll perform an unlock operation.
Please call it "unlocked" instead.

> +BDRV_POLL_WHILE_UNLOCKED(bs,
> + bdrv_drain_poll_top_level(bs, recursive,
> +   parent));
> +} else {
> +BDRV_POLL_WHILE(bs,
> +bdrv_drain_poll_top_level(bs, recursive, 
> parent));
> +}
>  }
>  }


signature.asc
Description: PGP signature


Re: [PATCH v3 07/19] block/dirty-bitmap: introduce bdrv_dirty_bitmap_status()

2022-01-26 Thread Vladimir Sementsov-Ogievskiy

18.01.2022 16:31, Hanna Reitz wrote:

+/*
+ * bdrv_dirty_bitmap_status:
+ * @hb: The HBitmap to operate on
+ * @start: the offset to start from
+ * @end: end of requested area
+ * @is_dirty: is bitmap dirty at @offset
+ * @pnum: how many bits has same value starting from @offset
+ */
+void hbitmap_status(const HBitmap *hb, int64_t offset, int64_t bytes,


In addition to the comment not fitting the parameter names, I also don’t find 
it ideal that the parameter names here don’t match the ones in the function’s 
definition.

I don’t have a preference between `start` or `offset` (although most other 
bitmap functions seem to prefer `start`), but I do prefer `count` over `bytes`, 
because...  Well, it’s a bit count, not a byte count, right?  (And from the 
bitmap user’s perspective, those bits might stand for any arbitrary unit.)

Apart from that, looks nice to me.  I am wondering a bit why this function 
doesn’t simply return the dirty bit status (like, well, the block-status 
functions do it), but I presume you simply found this interface to be better 
suited for its callers.


Hmm, seems, no reason for it actually. Will change to use normal return value.
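
So the interface would become something like (sketch):

   /*
    * Returns true if the bitmap is dirty at @start; *pnum is set to the
    * number of bits that have the same value, starting from @start.
    */
   bool hbitmap_status(const HBitmap *hb, int64_t start, int64_t count,
                       int64_t *pnum);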

--
Best regards,
Vladimir



Re: [PATCH 01/12] introduce BDRV_POLL_WHILE_UNLOCKED

2022-01-26 Thread Stefan Hajnoczi
On Tue, Jan 18, 2022 at 11:27:27AM -0500, Emanuele Giuseppe Esposito wrote:
> Same as BDRV_POLL_WHILE, but uses AIO_WAIT_WHILE_UNLOCKED.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> ---
>  include/block/block-global-state.h | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/include/block/block-global-state.h 
> b/include/block/block-global-state.h
> index 419fe8427f..7ad9496f56 100644
> --- a/include/block/block-global-state.h
> +++ b/include/block/block-global-state.h
> @@ -158,6 +158,11 @@ void bdrv_drain_all(void);
>  AIO_WAIT_WHILE(bdrv_get_aio_context(bs_),  \
> cond); })
>  
> +#define BDRV_POLL_WHILE_UNLOCKED(bs, cond) ({  \
> +BlockDriverState *bs_ = (bs);  \
> +AIO_WAIT_WHILE_UNLOCKED(bdrv_get_aio_context(bs_), \
> +cond); })

No doc comments? When and why is this API useful? Are there any
preconditions or assumptions (e.g. cond must be thread-safe)?

Stefan
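
For reference, a sketch of the kind of doc comment being asked for; the
thread-safety wording is an assumption drawn from the question above, not
confirmed semantics:

/*
 * BDRV_POLL_WHILE_UNLOCKED:
 * Like BDRV_POLL_WHILE(), but polls the AioContext of @bs without
 * holding its lock.  @cond is assumed to be safe to evaluate without
 * that lock held.
 */
#define BDRV_POLL_WHILE_UNLOCKED(bs, cond) ({              \
        BlockDriverState *bs_ = (bs);                      \
        AIO_WAIT_WHILE_UNLOCKED(bdrv_get_aio_context(bs_), \
                                cond); })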




Re: [PATCH-for-7.0 0/2] hw/nvme/ctrl: Buffer types cleanups

2022-01-26 Thread Klaus Jensen
On Nov 11 19:09, Klaus Jensen wrote:
> On Nov 11 16:45, Philippe Mathieu-Daudé wrote:
> > Some trivial notes I took while reviewing CVE-2021-3947:
> > https://lore.kernel.org/qemu-devel/2021153125.2258176-1-phi...@redhat.com/
> > 
> > Based-on: <2021153125.2258176-1-phi...@redhat.com>
> > 
> > *** BLURB HERE ***
> > 
> > Philippe Mathieu-Daudé (2):
> >   hw/nvme/ctrl: Have nvme_addr_write() take const buffer
> >   hw/nvme/ctrl: Pass buffers as 'void *' types
> > 
> >  hw/nvme/nvme.h |  4 ++--
> >  hw/nvme/ctrl.c | 12 ++--
> >  2 files changed, 8 insertions(+), 8 deletions(-)
> > 
> 
> Thanks Philippe, LGTM.
> 
> Reviewed-by: Klaus Jensen 

Applied to nvme-next. Thanks!
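
For illustration, the kind of prototype change the two patch titles describe;
the parameter list and return type here are assumptions for the sketch, not
the actual hw/nvme prototype:

/*
 * A helper that only reads from the buffer takes it as 'const void *',
 * so callers can pass any buffer type without casts and accidental
 * writes inside the helper are rejected by the compiler.
 */
static int nvme_addr_write(NvmeCtrl *n, hwaddr addr,
                           const void *buf, int size);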




Re: [PATCH v6 06/33] block/block-backend.c: assertions for block-backend

2022-01-26 Thread Hanna Reitz

On 21.01.22 18:05, Emanuele Giuseppe Esposito wrote:

All the global state (GS) API functions will check that
qemu_in_main_thread() returns true. If not, it means
that the safety of BQL cannot be guaranteed, and
they need to be moved to I/O.

Signed-off-by: Emanuele Giuseppe Esposito 
---
  block/block-backend.c  | 79 ++
  softmmu/qdev-monitor.c |  2 ++
  2 files changed, 81 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 6f91dcc85d..6c80ae54cc 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c


[...]


@@ -1908,6 +1958,7 @@ void blk_lock_medium(BlockBackend *blk, bool locked)
  void blk_eject(BlockBackend *blk, bool eject_flag)
  {
  BlockDriverState *bs = blk_bs(blk);
+
  char *id;
  
  if (bs) {


Leftover hunk from when there was an assertion added here; it should
be dropped, I believe.


Hanna
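
For context, a minimal sketch of the assertion pattern the quoted commit
message describes (blk_example_gs_op is a hypothetical name; the real hunks
are in the patch):

void blk_example_gs_op(BlockBackend *blk)
{
    /* GS API: must run in the main thread, under the BQL. */
    assert(qemu_in_main_thread());

    /* ... global-state work on @blk ... */
}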




Re: [PATCH for-7.0 0/4] hw/nvme: zoned random write area

2022-01-26 Thread Klaus Jensen
On Nov 25 08:37, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> This series adds support for a zoned random write area as standardized
> in TP 4076 ("Zoned Random Write Area").
> 
> Klaus Jensen (4):
>   hw/nvme: add struct for zone management send
>   hw/nvme: add zone attribute get/set helpers
>   hw/nvme: add ozcs enum
>   hw/nvme: add support for zoned random write area
> 
>  hw/nvme/ctrl.c   | 185 ---
>  hw/nvme/ns.c |  61 +-
>  hw/nvme/nvme.h   |  10 +++
>  hw/nvme/trace-events |   1 +
>  include/block/nvme.h |  43 +-
>  5 files changed, 271 insertions(+), 29 deletions(-)
> 
> -- 
> 2.34.0
> 

Bz ping :)




Re: [PATCH v3 00/15] hw/nvme: SR-IOV with Virtualization Enhancements

2022-01-26 Thread Klaus Jensen
On Dec 21 15:32, Lukasz Maniak wrote:
> This is the version of the patch series that we consider ready for
> staging. We do not intend to work on the v4 unless there are major
> issues.
> 
> Changes since v2:
> - The documentation mentions that SR-IOV support is still an
>   experimental feature.
> - The default value activates properly when sriov_max_v{i,q}_per_vf == 0.
> - Secondary Controller List (CNS 15h) handles the CDW10.CNTID field.
> - Virtual Function Number ("VFN") in Secondary Controller Entry is not
>   cleared to zero as the controller goes offline.
> - Removed no longer used helper pcie_sriov_vf_number_total.
> - Reset other than Controller Reset is necessary to activate (or
>   deactivate) flexible resources.
> - The v{i,q}rfap fields in Primary Controller Capabilities store the
>   currently active number of bound resources, not the number active
>   after reset.
> - Secondary controller cannot be set online unless the corresponding VF
>   is enabled (sriov_numvfs set to at least the secondary controller's VF
>   number)
> 
> The list of opens and known gaps remains the same as for v2:
> https://lists.gnu.org/archive/html/qemu-block/2021-11/msg00423.html
> 

hw/nvme parts look fine.

I expect some changes in response to Michael's comments before we
coordinate merging this?

