[PULL 12/16] vfio/migration: Reset bytes_transferred properly

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

Currently, VFIO bytes_transferred is not reset properly:
1. bytes_transferred is not reset after a VM snapshot (so a migration
   following a snapshot will report incorrect value).
2. bytes_transferred is a single counter for all VFIO devices, however
   upon migration failure it is reset multiple times, by each VFIO
   device.

Fix it by introducing a new function vfio_reset_bytes_transferred() and
calling it during migration and snapshot start.

Remove existing bytes_transferred reset in VFIO migration state
notifier, which is not needed anymore.

Fixes: 3710586caa5d ("qapi: Add VFIO devices migration stats in Migration stats")
Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h |  1 +
 migration/migration.h |  1 +
 hw/vfio/migration.c   |  6 +-
 migration/migration.c |  1 +
 migration/savevm.c|  1 +
 migration/target.c| 17 +++--
 6 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 6d1b8487c374..1d19c6f251c1 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -229,6 +229,7 @@ int vfio_block_multiple_devices_migration(Error **errp);
 void vfio_unblock_multiple_devices_migration(void);
 int vfio_block_giommu_migration(Error **errp);
 int64_t vfio_mig_bytes_transferred(void);
+void vfio_reset_bytes_transferred(void);
 
 #ifdef CONFIG_LINUX
 int vfio_get_region_info(VFIODevice *vbasedev, int index,
diff --git a/migration/migration.h b/migration/migration.h
index c859a0d35eb7..a80b22b703cd 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -514,6 +514,7 @@ bool migration_rate_limit(void);
 void migration_cancel(const Error *error);
 
 void populate_vfio_info(MigrationInfo *info);
+void reset_vfio_bytes_transferred(void);
 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
 
 #endif
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index acbf0bb7ab3c..7cf143926ce9 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -697,7 +697,6 @@ static void vfio_migration_state_notifier(Notifier 
*notifier, void *data)
 case MIGRATION_STATUS_CANCELLING:
 case MIGRATION_STATUS_CANCELLED:
 case MIGRATION_STATUS_FAILED:
-bytes_transferred = 0;
 /*
  * If setting the device in RUNNING state fails, the device should
  * be reset. To do so, use ERROR state as a recover state.
@@ -818,6 +817,11 @@ int64_t vfio_mig_bytes_transferred(void)
 return bytes_transferred;
 }
 
+void vfio_reset_bytes_transferred(void)
+{
+bytes_transferred = 0;
+}
+
 int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
 {
 int ret = -ENOTSUP;
diff --git a/migration/migration.c b/migration/migration.c
index 7653787f745d..096e8191d15c 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1628,6 +1628,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, 
bool blk_inc,
  */
 memset(&mig_stats, 0, sizeof(mig_stats));
 memset(&compression_counters, 0, sizeof(compression_counters));
+reset_vfio_bytes_transferred();
 
 return true;
 }
diff --git a/migration/savevm.c b/migration/savevm.c
index cdf47939244d..95c2abf47c10 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1622,6 +1622,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
 migrate_init(ms);
 memset(&mig_stats, 0, sizeof(mig_stats));
 memset(&compression_counters, 0, sizeof(compression_counters));
+reset_vfio_bytes_transferred();
 ms->to_dst_file = f;
 
 qemu_mutex_unlock_iothread();
diff --git a/migration/target.c b/migration/target.c
index 00ca007f9784..f39c9a8d8877 100644
--- a/migration/target.c
+++ b/migration/target.c
@@ -14,12 +14,25 @@
 #include "hw/vfio/vfio-common.h"
 #endif
 
+#ifdef CONFIG_VFIO
 void populate_vfio_info(MigrationInfo *info)
 {
-#ifdef CONFIG_VFIO
 if (vfio_mig_active()) {
 info->vfio = g_malloc0(sizeof(*info->vfio));
 info->vfio->transferred = vfio_mig_bytes_transferred();
 }
-#endif
 }
+
+void reset_vfio_bytes_transferred(void)
+{
+vfio_reset_bytes_transferred();
+}
+#else
+void populate_vfio_info(MigrationInfo *info)
+{
+}
+
+void reset_vfio_bytes_transferred(void)
+{
+}
+#endif
-- 
2.41.0




Re: [PATCH 3/4] ppc/pnv: Add P10 quad ops

2023-06-29 Thread Cédric Le Goater

On 6/30/23 05:55, Joel Stanley wrote:

Add a PnvQuad class for the P10 powernv machine. No xscoms are
implemented yet, but this allows them to be added.

Signed-off-by: Joel Stanley 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/ppc/pnv.c  |  2 +-
  hw/ppc/pnv_core.c | 53 +++
  2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index c77fdb6747a4..5f25fe985ab2 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1669,7 +1669,7 @@ static void pnv_chip_power10_quad_realize(Pnv10Chip 
*chip10, Error **errp)
  PnvQuad *eq = &chip10->quads[i];
  
  pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4],

-  PNV_QUAD_TYPE_NAME("power9"));
+  PNV_QUAD_TYPE_NAME("power10"));
  
  pnv_xscom_add_subregion(chip, PNV10_XSCOM_EQ_BASE(eq->quad_id),

  &eq->xscom_regs);
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index b9a57463aec4..7fff2fd9e298 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -404,6 +404,47 @@ static const MemoryRegionOps pnv_quad_power9_xscom_ops = {
  .endianness = DEVICE_BIG_ENDIAN,
  };
  
+/*

+ * POWER10 Quads
+ */
+
+static uint64_t pnv_quad_power10_xscom_read(void *opaque, hwaddr addr,
+unsigned int width)
+{
+uint32_t offset = addr >> 3;
+uint64_t val = -1;
+
+switch (offset) {
+default:
+qemu_log_mask(LOG_UNIMP, "%s: reading @0x%08x\n", __func__,
+  offset);
+}
+
+return val;
+}
+
+static void pnv_quad_power10_xscom_write(void *opaque, hwaddr addr, uint64_t 
val,
+ unsigned int width)
+{
+uint32_t offset = addr >> 3;
+
+switch (offset) {
+default:
+qemu_log_mask(LOG_UNIMP, "%s: writing @0x%08x\n", __func__,
+  offset);
+}
+}
+
+static const MemoryRegionOps pnv_quad_power10_xscom_ops = {
+.read = pnv_quad_power10_xscom_read,
+.write = pnv_quad_power10_xscom_write,
+.valid.min_access_size = 8,
+.valid.max_access_size = 8,
+.impl.min_access_size = 8,
+.impl.max_access_size = 8,
+.endianness = DEVICE_BIG_ENDIAN,
+};
+
  static void pnv_quad_realize(DeviceState *dev, Error **errp)
  {
  PnvQuad *eq = PNV_QUAD(dev);
@@ -428,6 +469,13 @@ static void pnv_quad_power9_class_init(ObjectClass *oc, 
void *data)
  pqc->xscom_ops = &pnv_quad_power9_xscom_ops;
  }
  
+static void pnv_quad_power10_class_init(ObjectClass *oc, void *data)

+{
+PnvQuadClass *pqc = PNV_QUAD_CLASS(oc);
+
+pqc->xscom_ops = &pnv_quad_power10_xscom_ops;
+}
+
  static void pnv_quad_class_init(ObjectClass *oc, void *data)
  {
  DeviceClass *dc = DEVICE_CLASS(oc);
@@ -451,6 +499,11 @@ static const TypeInfo pnv_quad_infos[] = {
  .name = PNV_QUAD_TYPE_NAME("power9"),
  .class_init = pnv_quad_power9_class_init,
  },
+{
+.parent = TYPE_PNV_QUAD,
+.name = PNV_QUAD_TYPE_NAME("power10"),
+.class_init = pnv_quad_power10_class_init,
+},
  };
  
  DEFINE_TYPES(pnv_quad_infos);





[PULL 10/16] hw/vfio/pci-quirks: Support alternate offset for GPUDirect Cliques

2023-06-29 Thread Cédric Le Goater
From: Alex Williamson 

NVIDIA Turing and newer GPUs implement the MSI-X capability at the offset
previously reserved for use by hypervisors to implement the GPUDirect
Cliques capability.  A revised specification provides an alternate
location.  Add a config space walk to the quirk to check for conflicts,
allowing us to fall back to the new location or generate an error at the
quirk setup rather than when the real conflicting capability is added
should there be no available location.

Signed-off-by: Alex Williamson 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/pci-quirks.c | 41 -
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index f0147a050aaa..0ed2fcd53152 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -1490,6 +1490,9 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
  * +-+-+
  *
  * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
+ *
+ * Specification for Turing and later GPU architectures:
+ * https://lists.gnu.org/archive/html/qemu-devel/2023-06/pdf142OR4O4c2.pdf
  */
 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
const char *name, void *opaque,
@@ -1530,7 +1533,9 @@ const PropertyInfo qdev_prop_nv_gpudirect_clique = {
 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
 {
 PCIDevice *pdev = &vdev->pdev;
-int ret, pos = 0xC8;
+int ret, pos;
+bool c8_conflict = false, d4_conflict = false;
+uint8_t tmp;
 
 if (vdev->nv_gpudirect_clique == 0xFF) {
 return 0;
@@ -1547,6 +1552,40 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice 
*vdev, Error **errp)
 return -EINVAL;
 }
 
+/*
+ * Per the updated specification above, it's recommended to use offset
+ * D4h for Turing and later GPU architectures due to a conflict of the
+ * MSI-X capability at C8h.  We don't know how to determine the GPU
+ * architecture, instead we walk the capability chain to mark conflicts
+ * and choose one or error based on the result.
+ *
+ * NB. Cap list head in pdev->config is already cleared, read from device.
+ */
+ret = pread(vdev->vbasedev.fd, &tmp, 1,
+vdev->config_offset + PCI_CAPABILITY_LIST);
+if (ret != 1 || !tmp) {
+error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list");
+return -EINVAL;
+}
+
+do {
+if (tmp == 0xC8) {
+c8_conflict = true;
+} else if (tmp == 0xD4) {
+d4_conflict = true;
+}
+tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT];
+} while (tmp);
+
+if (!c8_conflict) {
+pos = 0xC8;
+} else if (!d4_conflict) {
+pos = 0xD4;
+} else {
+error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space");
+return -EINVAL;
+}
+
 ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
 if (ret < 0) {
 error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");
-- 
2.41.0




Re: [PATCH 4/4] ppc/pnv: Return zero for core thread state xscom

2023-06-29 Thread Cédric Le Goater

On 6/30/23 05:55, Joel Stanley wrote:

Firmware now warns if booting in LPAR per core mode (PPC bit 62). So that
this warning doesn't trigger, report the core thread state as 0.

Signed-off-by: Joel Stanley 



Reviewed-by: Cédric Le Goater 

Thanks,

C.


---
  hw/ppc/pnv_core.c | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 7fff2fd9e298..98356d7f6538 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -116,6 +116,8 @@ static const MemoryRegionOps pnv_core_power8_xscom_ops = {
  #define PNV9_XSCOM_EC_PPM_SPECIAL_WKUP_HYP 0xf010d
  #define PNV9_XSCOM_EC_PPM_SPECIAL_WKUP_OTR 0xf010a
  
+#define PNV9_XSCOM_EC_CORE_THREAD_STATE0x10ab3

+
  static uint64_t pnv_core_power9_xscom_read(void *opaque, hwaddr addr,
 unsigned int width)
  {
@@ -134,6 +136,9 @@ static uint64_t pnv_core_power9_xscom_read(void *opaque, 
hwaddr addr,
  case PNV9_XSCOM_EC_PPM_SPECIAL_WKUP_OTR:
  val = 0x0;
  break;
+case PNV9_XSCOM_EC_CORE_THREAD_STATE:
+val = 0;
+break;
  default:
  qemu_log_mask(LOG_UNIMP, "Warning: reading reg=0x%" HWADDR_PRIx "\n",
addr);
@@ -408,6 +413,8 @@ static const MemoryRegionOps pnv_quad_power9_xscom_ops = {
   * POWER10 Quads
   */
  
+#define PNV10_XSCOM_EC_PC_PMC_CORE_THREAD_STATE 0x28412

+
  static uint64_t pnv_quad_power10_xscom_read(void *opaque, hwaddr addr,
  unsigned int width)
  {
@@ -415,6 +422,9 @@ static uint64_t pnv_quad_power10_xscom_read(void *opaque, 
hwaddr addr,
  uint64_t val = -1;
  
  switch (offset) {

+case PNV10_XSCOM_EC_PC_PMC_CORE_THREAD_STATE:
+val = 0;
+break;
  default:
  qemu_log_mask(LOG_UNIMP, "%s: reading @0x%08x\n", __func__,
offset);





[PULL 15/16] vfio/pci: Fix a segfault in vfio_realize

2023-06-29 Thread Cédric Le Goater
From: Zhenzhong Duan 

The kvm irqchip notifier is only registered if the device supports
INTx, however it's unconditionally removed in vfio realize error
path. If the assigned device does not support INTx, this will cause
QEMU to crash when vfio realize fails. Change it to conditionally
remove the notifier only if the notify hook is set up.

Before fix:
(qemu) device_add vfio-pci,host=81:11.1,id=vfio1,bus=root1,xres=1
Connection closed by foreign host.

After fix:
(qemu) device_add vfio-pci,host=81:11.1,id=vfio1,bus=root1,xres=1
Error: vfio :81:11.1: xres and yres properties require display=on
(qemu)

Fixes: c5478fea27ac ("vfio/pci: Respond to KVM irqchip change notifier")
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Joao Martins 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/pci.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 73e19a04b2bf..48df517f79ee 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3221,7 +3221,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 out_deregister:
 pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
-kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+if (vdev->irqchip_change_notifier.notify) {
+kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+}
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
-- 
2.41.0




Re: [PATCH 2/4] ppc/pnv: Subclass quad xscom callbacks

2023-06-29 Thread Cédric Le Goater

On 6/30/23 05:55, Joel Stanley wrote:

Make the existing pnv_quad_xscom_read/write be P9 specific, in
preparation for a different P10 callback.

Signed-off-by: Joel Stanley 



Reviewed-by: Cédric Le Goater 

Thanks,

C.


---
  include/hw/ppc/pnv_core.h | 12 +++-
  hw/ppc/pnv.c  | 11 +++
  hw/ppc/pnv_core.c | 36 
  3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/include/hw/ppc/pnv_core.h b/include/hw/ppc/pnv_core.h
index 3d75706e95da..ab3f6d6c2843 100644
--- a/include/hw/ppc/pnv_core.h
+++ b/include/hw/ppc/pnv_core.h
@@ -60,8 +60,18 @@ static inline PnvCPUState *pnv_cpu_state(PowerPCCPU *cpu)
  return (PnvCPUState *)cpu->machine_data;
  }
  
+struct PnvQuadClass {

+DeviceClass parent_class;
+
+const MemoryRegionOps *xscom_ops;
+};
+
  #define TYPE_PNV_QUAD "powernv-cpu-quad"
-OBJECT_DECLARE_SIMPLE_TYPE(PnvQuad, PNV_QUAD)
+
+#define PNV_QUAD_TYPE_SUFFIX "-" TYPE_PNV_QUAD
+#define PNV_QUAD_TYPE_NAME(cpu_model) cpu_model PNV_QUAD_TYPE_SUFFIX
+
+OBJECT_DECLARE_TYPE(PnvQuad, PnvQuadClass, PNV_QUAD)
  
  struct PnvQuad {

  DeviceState parent_obj;
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index fc083173f346..c77fdb6747a4 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1429,14 +1429,15 @@ static void pnv_chip_power9_instance_init(Object *obj)
  }
  
  static void pnv_chip_quad_realize_one(PnvChip *chip, PnvQuad *eq,

-  PnvCore *pnv_core)
+  PnvCore *pnv_core,
+  const char *type)
  {
  char eq_name[32];
  int core_id = CPU_CORE(pnv_core)->core_id;
  
  snprintf(eq_name, sizeof(eq_name), "eq[%d]", core_id);

  object_initialize_child_with_props(OBJECT(chip), eq_name, eq,
-   sizeof(*eq), TYPE_PNV_QUAD,
+   sizeof(*eq), type,
 &error_fatal, NULL);
  
  object_property_set_int(OBJECT(eq), "quad-id", core_id, &error_fatal);

@@ -1454,7 +1455,8 @@ static void pnv_chip_quad_realize(Pnv9Chip *chip9, Error 
**errp)
  for (i = 0; i < chip9->nr_quads; i++) {
  PnvQuad *eq = &chip9->quads[i];
  
-pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4]);

+pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4],
+  PNV_QUAD_TYPE_NAME("power9"));
  
  pnv_xscom_add_subregion(chip, PNV9_XSCOM_EQ_BASE(eq->quad_id),

  &eq->xscom_regs);
@@ -1666,7 +1668,8 @@ static void pnv_chip_power10_quad_realize(Pnv10Chip 
*chip10, Error **errp)
  for (i = 0; i < chip10->nr_quads; i++) {
  PnvQuad *eq = &chip10->quads[i];
  
-pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4]);

+pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4],
+  PNV_QUAD_TYPE_NAME("power9"));
  
  pnv_xscom_add_subregion(chip, PNV10_XSCOM_EQ_BASE(eq->quad_id),

  &eq->xscom_regs);
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 0b1c3cccfebc..b9a57463aec4 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -407,11 +407,12 @@ static const MemoryRegionOps pnv_quad_power9_xscom_ops = {
  static void pnv_quad_realize(DeviceState *dev, Error **errp)
  {
  PnvQuad *eq = PNV_QUAD(dev);
+PnvQuadClass *pqc = PNV_QUAD_GET_CLASS(eq);
  char name[32];
  
  snprintf(name, sizeof(name), "xscom-quad.%d", eq->quad_id);

  pnv_xscom_region_init(&eq->xscom_regs, OBJECT(dev),
-  &pnv_quad_power9_xscom_ops,
+  pqc->xscom_ops,
eq, name, PNV9_XSCOM_EQ_SIZE);
  }
  
@@ -420,6 +421,13 @@ static Property pnv_quad_properties[] = {

  DEFINE_PROP_END_OF_LIST(),
  };
  
+static void pnv_quad_power9_class_init(ObjectClass *oc, void *data)

+{
+PnvQuadClass *pqc = PNV_QUAD_CLASS(oc);
+
+pqc->xscom_ops = &pnv_quad_power9_xscom_ops;
+}
+
  static void pnv_quad_class_init(ObjectClass *oc, void *data)
  {
  DeviceClass *dc = DEVICE_CLASS(oc);
@@ -429,16 +437,20 @@ static void pnv_quad_class_init(ObjectClass *oc, void 
*data)
  dc->user_creatable = false;
  }
  
-static const TypeInfo pnv_quad_info = {

-.name  = TYPE_PNV_QUAD,
-.parent= TYPE_DEVICE,
-.instance_size = sizeof(PnvQuad),
-.class_init= pnv_quad_class_init,
+static const TypeInfo pnv_quad_infos[] = {
+{
+.name  = TYPE_PNV_QUAD,
+.parent= TYPE_DEVICE,
+.instance_size = sizeof(PnvQuad),
+.class_size= sizeof(PnvQuadClass),
+.class_init= pnv_quad_class_init,
+.abstract  = true,
+},
+{
+.parent = TYPE_PNV_QUAD,
+.name = PNV_QUAD_TYPE_NAME("power9"),
+.class_init = pnv_quad_power9_class_init,
+},
  };
  
-static void pnv_core_register_types(void)

-{
-

[PULL 16/16] vfio/pci: Free leaked timer in vfio_realize error path

2023-06-29 Thread Cédric Le Goater
From: Zhenzhong Duan 

When vfio_realize fails, the mmap_timer used for INTx optimization
isn't freed. As this timer isn't activated yet, the potential impact
is just a piece of leaked memory.

Fixes: ea486926b07d ("vfio-pci: Update slow path INTx algorithm timer related")
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Joao Martins 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 48df517f79ee..ab6645ba60af 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3224,6 +3224,9 @@ out_deregister:
 if (vdev->irqchip_change_notifier.notify) {
 kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
 }
+if (vdev->intx.mmap_timer) {
+timer_free(vdev->intx.mmap_timer);
+}
 out_teardown:
 vfio_teardown_msi(vdev);
 vfio_bars_exit(vdev);
-- 
2.41.0




[PULL 13/16] vfio/migration: Make VFIO migration non-experimental

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

The major parts of VFIO migration are supported today in QEMU. This
includes basic VFIO migration, device dirty page tracking and precopy
support.

Thus, at this point in time, it seems appropriate to make VFIO migration
non-experimental: remove the x prefix from enable_migration property,
change it to ON_OFF_AUTO and let the default value be AUTO.

In addition, make the following adjustments:
1. When enable_migration is ON and migration is not supported, fail VFIO
   device realization.
2. When enable_migration is AUTO (i.e., not explicitly enabled), require
   device dirty tracking support. This is because device dirty tracking
   is currently the only method to do dirty page tracking, which is
   essential for migrating in a reasonable downtime. Setting
   enable_migration to ON will not require device dirty tracking.
3. Make migration error and blocker messages more elaborate.
4. Remove error prints in vfio_migration_query_flags().
5. Rename trace_vfio_migration_probe() to
   trace_vfio_migration_realize().

Signed-off-by: Avihai Horon 
Reviewed-by: Joao Martins 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h |  6 +--
 hw/vfio/common.c  | 16 ++-
 hw/vfio/migration.c   | 79 +++
 hw/vfio/pci.c |  4 +-
 hw/vfio/trace-events  |  2 +-
 5 files changed, 73 insertions(+), 34 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 1d19c6f251c1..93429b9abba0 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -139,7 +139,7 @@ typedef struct VFIODevice {
 bool needs_reset;
 bool no_mmap;
 bool ram_block_discard_allowed;
-bool enable_migration;
+OnOffAuto enable_migration;
 VFIODeviceOps *ops;
 unsigned int num_irqs;
 unsigned int num_regions;
@@ -225,9 +225,9 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
 extern VFIOGroupList vfio_group_list;
 
 bool vfio_mig_active(void);
-int vfio_block_multiple_devices_migration(Error **errp);
+int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp);
 void vfio_unblock_multiple_devices_migration(void);
-int vfio_block_giommu_migration(Error **errp);
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp);
 int64_t vfio_mig_bytes_transferred(void);
 void vfio_reset_bytes_transferred(void);
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 28ec9e999c09..77e2ee0e5c6e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -381,7 +381,7 @@ static unsigned int vfio_migratable_device_num(void)
 return device_num;
 }
 
-int vfio_block_multiple_devices_migration(Error **errp)
+int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
 {
 int ret;
 
@@ -390,6 +390,12 @@ int vfio_block_multiple_devices_migration(Error **errp)
 return 0;
 }
 
+if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+error_setg(errp, "Migration is currently not supported with multiple "
+ "VFIO devices");
+return -EINVAL;
+}
+
 error_setg(&multiple_devices_migration_blocker,
"Migration is currently not supported with multiple "
"VFIO devices");
@@ -427,7 +433,7 @@ static bool vfio_viommu_preset(void)
 return false;
 }
 
-int vfio_block_giommu_migration(Error **errp)
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
 {
 int ret;
 
@@ -436,6 +442,12 @@ int vfio_block_giommu_migration(Error **errp)
 return 0;
 }
 
+if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+error_setg(errp,
+   "Migration is currently not supported with vIOMMU enabled");
+return -EINVAL;
+}
+
 error_setg(&giommu_migration_blocker,
"Migration is currently not supported with vIOMMU enabled");
 ret = migrate_add_blocker(giommu_migration_blocker, errp);
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 7cf143926ce9..1db7d52ab2c1 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -724,14 +724,6 @@ static int vfio_migration_query_flags(VFIODevice 
*vbasedev, uint64_t *mig_flags)
 feature->argsz = sizeof(buf);
 feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
-if (errno == ENOTTY) {
-error_report("%s: VFIO migration is not supported in kernel",
- vbasedev->name);
-} else {
-error_report("%s: Failed to query VFIO migration support, err: %s",
- vbasedev->name, strerror(errno));
-}
-
 return -errno;
 }
 
@@ -810,6 +802,27 @@ static int vfio_migration_init(VFIODevice *vbasedev)
 return 0;
 }
 
+static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
+{

[PULL 07/16] vfio/migration: Add VFIO migration pre-copy support

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

Pre-copy support allows the VFIO device data to be transferred while the
VM is running. This helps to accommodate VFIO devices that have a large
amount of data that needs to be transferred, and it can reduce migration
downtime.

Pre-copy support is optional in VFIO migration protocol v2.
Implement pre-copy of VFIO migration protocol v2 and use it for devices
that support it. Full description of it can be found in the following
Linux commit: 4db52602a607 ("vfio: Extend the device migration protocol
with PRE_COPY").

Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Tested-by: YangHang Liu 
Acked-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 docs/devel/vfio-migration.rst |  35 +---
 include/hw/vfio/vfio-common.h |   2 +
 hw/vfio/common.c  |   6 +-
 hw/vfio/migration.c   | 165 --
 hw/vfio/trace-events  |   4 +-
 5 files changed, 190 insertions(+), 22 deletions(-)

diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst
index 1b68ccf11529..e896b2a6734b 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/vfio-migration.rst
@@ -7,12 +7,14 @@ the guest is running on source host and restoring this saved 
state on the
 destination host. This document details how saving and restoring of VFIO
 devices is done in QEMU.
 
-Migration of VFIO devices currently consists of a single stop-and-copy phase.
-During the stop-and-copy phase the guest is stopped and the entire VFIO device
-data is transferred to the destination.
-
-The pre-copy phase of migration is currently not supported for VFIO devices.
-Support for VFIO pre-copy will be added later on.
+Migration of VFIO devices consists of two phases: the optional pre-copy phase,
+and the stop-and-copy phase. The pre-copy phase is iterative and allows to
+accommodate VFIO devices that have a large amount of data that needs to be
+transferred. The iterative pre-copy phase of migration allows for the guest to
+continue whilst the VFIO device state is transferred to the destination, this
+helps to reduce the total downtime of the VM. VFIO devices opt-in to pre-copy
+support by reporting the VFIO_MIGRATION_PRE_COPY flag in the
+VFIO_DEVICE_FEATURE_MIGRATION ioctl.
 
 Note that currently VFIO migration is supported only for a single device. This
 is due to VFIO migration's lack of P2P support. However, P2P support is planned
@@ -29,10 +31,20 @@ VFIO implements the device hooks for the iterative approach 
as follows:
 * A ``load_setup`` function that sets the VFIO device on the destination in
   _RESUMING state.
 
+* A ``state_pending_estimate`` function that reports an estimate of the
+  remaining pre-copy data that the vendor driver has yet to save for the VFIO
+  device.
+
 * A ``state_pending_exact`` function that reads pending_bytes from the vendor
   driver, which indicates the amount of data that the vendor driver has yet to
   save for the VFIO device.
 
+* An ``is_active_iterate`` function that indicates ``save_live_iterate`` is
+  active only when the VFIO device is in pre-copy states.
+
+* A ``save_live_iterate`` function that reads the VFIO device's data from the
+  vendor driver during iterative pre-copy phase.
+
 * A ``save_state`` function to save the device config space if it is present.
 
 * A ``save_live_complete_precopy`` function that sets the VFIO device in
@@ -111,8 +123,10 @@ Flow of state changes during Live migration
 ===
 
 Below is the flow of state change during live migration.
-The values in the brackets represent the VM state, the migration state, and
+The values in the parentheses represent the VM state, the migration state, and
 the VFIO device state, respectively.
+The text in the square brackets represents the flow if the VFIO device supports
+pre-copy.
 
 Live migration save path
 
@@ -124,11 +138,12 @@ Live migration save path
   |
  migrate_init spawns migration_thread
 Migration thread then calls each device's .save_setup()
-   (RUNNING, _SETUP, _RUNNING)
+  (RUNNING, _SETUP, _RUNNING [_PRE_COPY])
   |
-  (RUNNING, _ACTIVE, _RUNNING)
- If device is active, get pending_bytes by .state_pending_exact()
+  (RUNNING, _ACTIVE, _RUNNING [_PRE_COPY])
+  If device is active, get pending_bytes by 
.state_pending_{estimate,exact}()
   If total pending_bytes >= threshold_size, call .save_live_iterate()
+  [Data of VFIO device for pre-copy phase is copied]
 Iterate till total pending bytes converge and are less than threshold
   |
   On migration completion, vCPU stops and calls .save_live_complete_precopy for
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 

[PULL 11/16] vfio/pci: Call vfio_prepare_kvm_msi_virq_batch() in MSI retry path

2023-06-29 Thread Cédric Le Goater
From: Shameer Kolothum 

When vfio_enable_vectors() returns with less than requested nr_vectors
we retry with what kernel reported back. But the retry path doesn't
call vfio_prepare_kvm_msi_virq_batch() and this results in,

qemu-system-aarch64: vfio: Error: Failed to enable 4 MSI vectors, retry with 1
qemu-system-aarch64: ../hw/vfio/pci.c:602: vfio_commit_kvm_msi_virq_batch: 
Assertion `vdev->defer_kvm_irq_routing' failed

Fixes: dc580d51f7dd ("vfio: defer to commit kvm irq routing when enable msi/msix")
Reviewed-by: Longpeng 
Signed-off-by: Shameer Kolothum 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/pci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 73874a94de12..8fb2c53a63bf 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -663,6 +663,8 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
 
 vfio_disable_interrupts(vdev);
 
+vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+retry:
 /*
  * Setting vector notifiers needs to enable route for each vector.
  * Deferring to commit the KVM routes once rather than per vector
@@ -670,8 +672,6 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
  */
 vfio_prepare_kvm_msi_virq_batch(vdev);
 
-vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
-retry:
 vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
 
 for (i = 0; i < vdev->nr_vectors; i++) {
-- 
2.41.0




[PULL 09/16] vfio: Implement a common device info helper

2023-06-29 Thread Cédric Le Goater
From: Alex Williamson 

A common helper implementing the realloc algorithm for handling
capabilities.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Cédric Le Goater 
Signed-off-by: Alex Williamson 
Reviewed-by: Robin Voetter 
Signed-off-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h |  1 +
 hw/s390x/s390-pci-vfio.c  | 37 
 hw/vfio/common.c  | 46 ++-
 3 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 3dc5f2104c86..6d1b8487c374 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -216,6 +216,7 @@ void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
 void vfio_put_group(VFIOGroup *group);
+struct vfio_device_info *vfio_get_device_info(int fd);
 int vfio_get_device(VFIOGroup *group, const char *name,
 VFIODevice *vbasedev, Error **errp);
 
diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c
index f51190d4662f..59a2e03873bd 100644
--- a/hw/s390x/s390-pci-vfio.c
+++ b/hw/s390x/s390-pci-vfio.c
@@ -289,38 +289,11 @@ static void s390_pci_read_pfip(S390PCIBusDevice *pbdev,
 memcpy(pbdev->zpci_fn.pfip, cap->pfip, CLP_PFIP_NR_SEGMENTS);
 }
 
-static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev,
-uint32_t argsz)
+static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev)
 {
-struct vfio_device_info *info = g_malloc0(argsz);
-VFIOPCIDevice *vfio_pci;
-int fd;
+VFIOPCIDevice *vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
 
-vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
-fd = vfio_pci->vbasedev.fd;
-
-/*
- * If the specified argsz is not large enough to contain all capabilities
- * it will be updated upon return from the ioctl.  Retry until we have
- * a big enough buffer to hold the entire capability chain.  On error,
- * just exit and rely on CLP defaults.
- */
-retry:
-info->argsz = argsz;
-
-if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
-trace_s390_pci_clp_dev_info(vfio_pci->vbasedev.name);
-g_free(info);
-return NULL;
-}
-
-if (info->argsz > argsz) {
-argsz = info->argsz;
-info = g_realloc(info, argsz);
-goto retry;
-}
-
-return info;
+return vfio_get_device_info(vfio_pci->vbasedev.fd);
 }
 
 /*
@@ -335,7 +308,7 @@ bool s390_pci_get_host_fh(S390PCIBusDevice *pbdev, uint32_t 
*fh)
 
 assert(fh);
 
-info = get_device_info(pbdev, sizeof(*info));
+info = get_device_info(pbdev);
 if (!info) {
 return false;
 }
@@ -356,7 +329,7 @@ void s390_pci_get_clp_info(S390PCIBusDevice *pbdev)
 {
 g_autofree struct vfio_device_info *info = NULL;
 
-info = get_device_info(pbdev, sizeof(*info));
+info = get_device_info(pbdev);
 if (!info) {
 return;
 }
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 25801de173c4..28ec9e999c09 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -2846,11 +2846,35 @@ void vfio_put_group(VFIOGroup *group)
 }
 }
 
+struct vfio_device_info *vfio_get_device_info(int fd)
+{
+struct vfio_device_info *info;
+uint32_t argsz = sizeof(*info);
+
+info = g_malloc0(argsz);
+
+retry:
+info->argsz = argsz;
+
+if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
+g_free(info);
+return NULL;
+}
+
+if (info->argsz > argsz) {
+argsz = info->argsz;
+info = g_realloc(info, argsz);
+goto retry;
+}
+
+return info;
+}
+
 int vfio_get_device(VFIOGroup *group, const char *name,
 VFIODevice *vbasedev, Error **errp)
 {
-struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
-int ret, fd;
+g_autofree struct vfio_device_info *info = NULL;
+int fd;
 
 fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
 if (fd < 0) {
@@ -2862,11 +2886,11 @@ int vfio_get_device(VFIOGroup *group, const char *name,
 return fd;
 }
 
-ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
-if (ret) {
+info = vfio_get_device_info(fd);
+if (!info) {
 error_setg_errno(errp, errno, "error getting device info");
 close(fd);
-return ret;
+return -1;
 }
 
 /*
@@ -2894,14 +2918,14 @@ int vfio_get_device(VFIOGroup *group, const char *name,
 vbasedev->group = group;
 QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
 
-vbasedev->num_irqs = dev_info.num_irqs;
-vbasedev->num_regions = dev_info.num_regions;
-vbasedev->flags = dev_info.flags;
+vbasedev->num_irqs = info->num_irqs;
+vbasedev->num_regions = info->num_regions;
+vbasedev->flags = info->flags;
+
+trace_vfio_get_device(name, info->flags, info->num_regions, 

[PULL 00/16] vfio queue

2023-06-29 Thread Cédric Le Goater
The following changes since commit 4d541f63e90c81112c298cbb35ed53e9c79deb00:

  Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging 
(2023-06-29 13:16:06 +0200)

are available in the Git repository at:

  https://github.com/legoater/qemu/ tags/pull-vfio-20230630

for you to fetch changes up to 0cc889c8826cefa5b80110d31a62273b56aa1832:

  vfio/pci: Free leaked timer in vfio_realize error path (2023-06-30 06:02:51 
+0200)


vfio queue:

* migration: New switchover ack to reduce downtime
* VFIO migration pre-copy support
* Removal of the VFIO migration experimental flag
* Alternate offset for GPUDirect Cliques
* Misc fixes


Alex Williamson (3):
  vfio: Implement a common device info helper
  hw/vfio/pci-quirks: Support alternate offset for GPUDirect Cliques
  MAINTAINERS: Promote Cédric to VFIO co-maintainer

Avihai Horon (10):
  migration: Add switchover ack capability
  migration: Implement switchover ack logic
  migration: Enable switchover ack capability
  tests: Add migration switchover ack capability test
  vfio/migration: Refactor vfio_save_block() to return saved data size
  vfio/migration: Store VFIO migration flags in VFIOMigration
  vfio/migration: Add VFIO migration pre-copy support
  vfio/migration: Add support for switchover ack capability
  vfio/migration: Reset bytes_transferred properly
  vfio/migration: Make VFIO migration non-experimental

Shameer Kolothum (1):
  vfio/pci: Call vfio_prepare_kvm_msi_virq_batch() in MSI retry path

Zhenzhong Duan (2):
  vfio/pci: Fix a segfault in vfio_realize
  vfio/pci: Free leaked timer in vfio_realize error path

 MAINTAINERS   |   2 +-
 docs/devel/vfio-migration.rst |  45 +--
 qapi/migration.json   |  12 +-
 include/hw/vfio/vfio-common.h |  12 +-
 include/migration/register.h  |   2 +
 migration/migration.h |  15 +++
 migration/options.h   |   1 +
 migration/savevm.h|   1 +
 hw/s390x/s390-pci-vfio.c  |  37 +
 hw/vfio/common.c  |  68 +++---
 hw/vfio/migration.c   | 305 --
 hw/vfio/pci-quirks.c  |  41 +-
 hw/vfio/pci.c |  15 ++-
 migration/migration.c |  33 -
 migration/options.c   |  17 +++
 migration/savevm.c|  55 
 migration/target.c|  17 ++-
 tests/qtest/migration-test.c  |  31 +
 hw/vfio/trace-events  |   6 +-
 migration/trace-events|   3 +
 20 files changed, 600 insertions(+), 118 deletions(-)



[PULL 02/16] migration: Implement switchover ack logic

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

Implement switchover ack logic. This prevents the source from stopping
the VM and completing the migration until an ACK is received from the
destination that it's OK to do so.

To achieve this, a new SaveVMHandlers handler switchover_ack_needed()
and a new return path message MIG_RP_MSG_SWITCHOVER_ACK are added.

The switchover_ack_needed() handler is called during migration setup in
the destination to check if switchover ack is used by the migrated
device.

When switchover is approved by all migrated devices in the destination
that support this capability, the MIG_RP_MSG_SWITCHOVER_ACK return path
message is sent to the source to notify it that it's OK to do
switchover.

Signed-off-by: Avihai Horon 
Reviewed-by: Peter Xu 
Tested-by: YangHang Liu 
Acked-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 include/migration/register.h |  2 ++
 migration/migration.h| 14 ++
 migration/savevm.h   |  1 +
 migration/migration.c| 32 +++--
 migration/savevm.c   | 54 
 migration/trace-events   |  3 ++
 6 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/include/migration/register.h b/include/migration/register.h
index a8dfd8fefd0a..90914f32f50c 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -71,6 +71,8 @@ typedef struct SaveVMHandlers {
 int (*load_cleanup)(void *opaque);
 /* Called when postcopy migration wants to resume from failure */
 int (*resume_prepare)(MigrationState *s, void *opaque);
+/* Checks if switchover ack should be used. Called only in dest */
+bool (*switchover_ack_needed)(void *opaque);
 } SaveVMHandlers;
 
 int register_savevm_live(const char *idstr,
diff --git a/migration/migration.h b/migration/migration.h
index 30c3e97635b1..c859a0d35eb7 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -210,6 +210,13 @@ struct MigrationIncomingState {
  * contains valid information.
  */
 QemuMutex page_request_mutex;
+
+/*
+ * Number of devices that have yet to approve switchover. When this reaches
+ * zero an ACK that it's OK to do switchover is sent to the source. No lock
+ * is needed as this field is updated serially.
+ */
+unsigned int switchover_ack_pending_num;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
@@ -440,6 +447,12 @@ struct MigrationState {
 
 /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
 JSONWriter *vmdesc;
+
+/*
+ * Indicates whether an ACK from the destination that it's OK to do
+ * switchover has been received.
+ */
+bool switchover_acked;
 };
 
 void migrate_set_state(int *state, int old_state, int new_state);
@@ -480,6 +493,7 @@ int 
migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
 void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
  char *block_name);
 void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
+int migrate_send_rp_switchover_ack(MigrationIncomingState *mis);
 
 void dirty_bitmap_mig_before_vm_start(void);
 void dirty_bitmap_mig_cancel_outgoing(void);
diff --git a/migration/savevm.h b/migration/savevm.h
index fb636735f0af..e894bbc14331 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -65,6 +65,7 @@ int qemu_loadvm_state(QEMUFile *f);
 void qemu_loadvm_state_cleanup(void);
 int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 int qemu_load_device_state(QEMUFile *f);
+int qemu_loadvm_approve_switchover(void);
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
 bool in_postcopy, bool inactivate_disks);
 
diff --git a/migration/migration.c b/migration/migration.c
index dc05c6f6ea94..7653787f745d 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -78,6 +78,7 @@ enum mig_rp_message_type {
 MIG_RP_MSG_REQ_PAGES,/* data (start: be64, len: be32) */
 MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
 MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
+MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */
 
 MIG_RP_MSG_MAX
 };
@@ -760,6 +761,11 @@ bool migration_has_all_channels(void)
 return true;
 }
 
+int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
+{
+return migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK, 0, NULL);
+}
+
 /*
  * Send a 'SHUT' message on the return channel with the given value
  * to indicate that we've finished with the RP.  Non-0 value indicates
@@ -1405,6 +1411,7 @@ void migrate_init(MigrationState *s)
 s->vm_old_state = -1;
 s->iteration_initial_bytes = 0;
 s->threshold_size = 0;
+s->switchover_acked = false;
 }
 
 int migrate_add_blocker_internal(Error *reason, Error **errp)
@@ -1721,6 +1728,7 @@ static struct rp_cmd_args {
 [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = 

[PULL 04/16] tests: Add migration switchover ack capability test

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

Add migration switchover ack capability test. The test runs without
devices that support this capability, but is still useful to make sure
it didn't break anything.

Signed-off-by: Avihai Horon 
Reviewed-by: Juan Quintela 
Reviewed-by: Peter Xu 
Tested-by: YangHang Liu 
Acked-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 tests/qtest/migration-test.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index b0c355bbd9ca..b9cc194100b5 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1693,6 +1693,33 @@ static void test_precopy_tcp_plain(void)
 test_precopy_common(&args);
 }
 
+static void *test_migrate_switchover_ack_start(QTestState *from, QTestState 
*to)
+{
+
+migrate_set_capability(from, "return-path", true);
+migrate_set_capability(to, "return-path", true);
+
+migrate_set_capability(from, "switchover-ack", true);
+migrate_set_capability(to, "switchover-ack", true);
+
+return NULL;
+}
+
+static void test_precopy_tcp_switchover_ack(void)
+{
+MigrateCommon args = {
+.listen_uri = "tcp:127.0.0.1:0",
+.start_hook = test_migrate_switchover_ack_start,
+/*
+ * Source VM must be running in order to consider the switchover ACK
+ * when deciding to do switchover or not.
+ */
+.live = true,
+};
+
+test_precopy_common(&args);
+}
+
 #ifdef CONFIG_GNUTLS
 static void test_precopy_tcp_tls_psk_match(void)
 {
@@ -2737,6 +2764,10 @@ int main(int argc, char **argv)
 #endif /* CONFIG_GNUTLS */
 
 qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain);
+
+qtest_add_func("/migration/precopy/tcp/plain/switchover-ack",
+   test_precopy_tcp_switchover_ack);
+
 #ifdef CONFIG_GNUTLS
 qtest_add_func("/migration/precopy/tcp/tls/psk/match",
test_precopy_tcp_tls_psk_match);
-- 
2.41.0




[PULL 08/16] vfio/migration: Add support for switchover ack capability

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

Loading of a VFIO device's data can take a substantial amount of time as
the device may need to allocate resources, prepare internal data
structures, etc. This can increase migration downtime, especially for
VFIO devices with a lot of resources.

To solve this, VFIO migration uAPI defines "initial bytes" as part of
its precopy data stream. Initial bytes can be used in various ways to
improve VFIO migration performance. For example, it can be used to
transfer device metadata to pre-allocate resources in the destination.
However, for this to work we need to make sure that all initial bytes
are sent and loaded in the destination before the source VM is stopped.

Use migration switchover ack capability to make sure a VFIO device's
initial bytes are sent and loaded in the destination before the source
stops the VM and attempts to complete the migration.
This can significantly reduce migration downtime for some devices.

Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Tested-by: YangHang Liu 
Acked-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 docs/devel/vfio-migration.rst | 10 +
 include/hw/vfio/vfio-common.h |  1 +
 hw/vfio/migration.c   | 39 ++-
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst
index e896b2a6734b..b433cb5bb2c8 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/vfio-migration.rst
@@ -16,6 +16,13 @@ helps to reduce the total downtime of the VM. VFIO devices 
opt-in to pre-copy
 support by reporting the VFIO_MIGRATION_PRE_COPY flag in the
 VFIO_DEVICE_FEATURE_MIGRATION ioctl.
 
+When pre-copy is supported, it's possible to further reduce downtime by
+enabling "switchover-ack" migration capability.
+VFIO migration uAPI defines "initial bytes" as part of its pre-copy data stream
+and recommends that the initial bytes are sent and loaded in the destination
+before stopping the source VM. Enabling this migration capability will
+guarantee that and thus, can potentially reduce downtime even further.
+
 Note that currently VFIO migration is supported only for a single device. This
 is due to VFIO migration's lack of P2P support. However, P2P support is planned
 to be added later on.
@@ -45,6 +52,9 @@ VFIO implements the device hooks for the iterative approach 
as follows:
 * A ``save_live_iterate`` function that reads the VFIO device's data from the
   vendor driver during iterative pre-copy phase.
 
+* A ``switchover_ack_needed`` function that checks if the VFIO device uses
+  "switchover-ack" migration capability when this capability is enabled.
+
 * A ``save_state`` function to save the device config space if it is present.
 
 * A ``save_live_complete_precopy`` function that sets the VFIO device in
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 1db901c1941f..3dc5f2104c86 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -69,6 +69,7 @@ typedef struct VFIOMigration {
 uint64_t mig_flags;
 uint64_t precopy_init_size;
 uint64_t precopy_dirty_size;
+bool initial_data_sent;
 } VFIOMigration;
 
 typedef struct VFIOAddressSpace {
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index d8f6a22ae14e..acbf0bb7ab3c 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -18,6 +18,8 @@
 #include "sysemu/runstate.h"
 #include "hw/vfio/vfio-common.h"
 #include "migration/migration.h"
+#include "migration/options.h"
+#include "migration/savevm.h"
 #include "migration/vmstate.h"
 #include "migration/qemu-file.h"
 #include "migration/register.h"
@@ -45,6 +47,7 @@
 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xef12ULL)
 #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xef13ULL)
 #define VFIO_MIG_FLAG_DEV_DATA_STATE(0xef14ULL)
+#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xef15ULL)
 
 /*
  * This is an arbitrary size based on migration of mlx5 devices, where 
typically
@@ -384,6 +387,7 @@ static void vfio_save_cleanup(void *opaque)
 migration->data_buffer = NULL;
 migration->precopy_init_size = 0;
 migration->precopy_dirty_size = 0;
+migration->initial_data_sent = false;
 vfio_migration_cleanup(vbasedev);
 trace_vfio_save_cleanup(vbasedev->name);
 }
@@ -457,10 +461,17 @@ static int vfio_save_iterate(QEMUFile *f, void *opaque)
 if (data_size < 0) {
 return data_size;
 }
-qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 
 vfio_update_estimated_pending_data(migration, data_size);
 
+if (migrate_switchover_ack() && !migration->precopy_init_size &&
+!migration->initial_data_sent) {
+qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
+migration->initial_data_sent = true;
+} else {
+qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+}
+
 trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
   

[PULL 05/16] vfio/migration: Refactor vfio_save_block() to return saved data size

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

Refactor vfio_save_block() to return the size of saved data on success
and -errno on error.

This will be used in next patch to implement VFIO migration pre-copy
support.

Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Juan Quintela 
Tested-by: YangHang Liu 
Acked-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 hw/vfio/migration.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 6b58dddb8859..235978fd6805 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -241,8 +241,8 @@ static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
 return 0;
 }
 
-/* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */
-static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
+/* Returns the size of saved data on success and -errno on error */
+static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
 {
 ssize_t data_size;
 
@@ -252,7 +252,7 @@ static int vfio_save_block(QEMUFile *f, VFIOMigration 
*migration)
 return -errno;
 }
 if (data_size == 0) {
-return 1;
+return 0;
 }
 
 qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
@@ -262,7 +262,7 @@ static int vfio_save_block(QEMUFile *f, VFIOMigration 
*migration)
 
 trace_vfio_save_block(migration->vbasedev->name, data_size);
 
-return qemu_file_get_error(f);
+return qemu_file_get_error(f) ?: data_size;
 }
 
 /* -- */
@@ -335,6 +335,7 @@ static void vfio_state_pending_exact(void *opaque, uint64_t 
*must_precopy,
 static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
 {
 VFIODevice *vbasedev = opaque;
+ssize_t data_size;
 int ret;
 
 /* We reach here with device state STOP only */
@@ -345,11 +346,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void 
*opaque)
 }
 
 do {
-ret = vfio_save_block(f, vbasedev->migration);
-if (ret < 0) {
-return ret;
+data_size = vfio_save_block(f, vbasedev->migration);
+if (data_size < 0) {
+return data_size;
 }
-} while (!ret);
+} while (data_size);
 
 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
 ret = qemu_file_get_error(f);
-- 
2.41.0




Re: [PATCH 1/4] ppc/pnv: quad xscom callbacks are P9 specific

2023-06-29 Thread Cédric Le Goater

On 6/30/23 05:55, Joel Stanley wrote:

Rename them to include P9 in the name in preparation for adding P10
versions.

Signed-off-by: Joel Stanley 


Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/ppc/pnv_core.c | 17 +
  1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 0bc3ad41c81c..0b1c3cccfebc 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -360,8 +360,8 @@ DEFINE_TYPES(pnv_core_infos)
  
  #define P9X_EX_NCU_SPEC_BAR 0x11010
  
-static uint64_t pnv_quad_xscom_read(void *opaque, hwaddr addr,

-unsigned int width)
+static uint64_t pnv_quad_power9_xscom_read(void *opaque, hwaddr addr,
+   unsigned int width)
  {
  uint32_t offset = addr >> 3;
  uint64_t val = -1;
@@ -379,8 +379,8 @@ static uint64_t pnv_quad_xscom_read(void *opaque, hwaddr 
addr,
  return val;
  }
  
-static void pnv_quad_xscom_write(void *opaque, hwaddr addr, uint64_t val,

- unsigned int width)
+static void pnv_quad_power9_xscom_write(void *opaque, hwaddr addr, uint64_t 
val,
+unsigned int width)
  {
  uint32_t offset = addr >> 3;
  
@@ -394,9 +394,9 @@ static void pnv_quad_xscom_write(void *opaque, hwaddr addr, uint64_t val,

  }
  }
  
-static const MemoryRegionOps pnv_quad_xscom_ops = {

-.read = pnv_quad_xscom_read,
-.write = pnv_quad_xscom_write,
+static const MemoryRegionOps pnv_quad_power9_xscom_ops = {
+.read = pnv_quad_power9_xscom_read,
+.write = pnv_quad_power9_xscom_write,
  .valid.min_access_size = 8,
  .valid.max_access_size = 8,
  .impl.min_access_size = 8,
@@ -410,7 +410,8 @@ static void pnv_quad_realize(DeviceState *dev, Error **errp)
  char name[32];
  
  snprintf(name, sizeof(name), "xscom-quad.%d", eq->quad_id);

-pnv_xscom_region_init(&eq->xscom_regs, OBJECT(dev), &pnv_quad_xscom_ops,
+pnv_xscom_region_init(&eq->xscom_regs, OBJECT(dev),
+  &pnv_quad_power9_xscom_ops,
eq, name, PNV9_XSCOM_EQ_SIZE);
  }
  





[PULL 01/16] migration: Add switchover ack capability

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

Migration downtime estimation is calculated based on bandwidth and
remaining migration data. This assumes that loading of migration data in
the destination takes a negligible amount of time and that downtime
depends only on network speed.

While this may be true for RAM, it's not necessarily true for other
migrated devices. For example, loading the data of a VFIO device in the
destination might require from the device to allocate resources, prepare
internal data structures and so on. These operations can take a
significant amount of time which can increase migration downtime.

This patch adds a new capability "switchover ack" that prevents the
source from stopping the VM and completing the migration until an ACK
is received from the destination that it's OK to do so.

This can be used by migrated devices in various ways to reduce downtime.
For example, a device can send initial precopy metadata to pre-allocate
resources in the destination and use this capability to make sure that
the pre-allocation is completed before the source VM is stopped, so it
will have full effect.

This new capability relies on the return path capability to communicate
from the destination back to the source.

The actual implementation of the capability will be added in the
following patches.

Signed-off-by: Avihai Horon 
Reviewed-by: Peter Xu 
Acked-by: Markus Armbruster 
Tested-by: YangHang Liu 
Acked-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 qapi/migration.json | 12 +++-
 migration/options.h |  1 +
 migration/options.c | 21 +
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 5bb5ab82a0cf..47dfef02780f 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -487,6 +487,16 @@
 # and should not affect the correctness of postcopy migration.
 # (since 7.1)
 #
+# @switchover-ack: If enabled, migration will not stop the source VM
+# and complete the migration until an ACK is received from the
+# destination that it's OK to do so.  Exactly when this ACK is
+# sent depends on the migrated devices that use this feature.
+# For example, a device can use it to make sure some of its data
+# is sent and loaded in the destination before doing switchover.
+# This can reduce downtime if devices that support this capability
+# are present.  'return-path' capability must be enabled to use
+# it.  (since 8.1)
+#
 # Features:
 #
 # @unstable: Members @x-colo and @x-ignore-shared are experimental.
@@ -502,7 +512,7 @@
'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
{ 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
'validate-uuid', 'background-snapshot',
-   'zero-copy-send', 'postcopy-preempt'] }
+   'zero-copy-send', 'postcopy-preempt', 'switchover-ack'] }
 
 ##
 # @MigrationCapabilityStatus:
diff --git a/migration/options.h b/migration/options.h
index 45991af3c208..9aaf363322b4 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -40,6 +40,7 @@ bool migrate_postcopy_ram(void);
 bool migrate_rdma_pin_all(void);
 bool migrate_release_ram(void);
 bool migrate_return_path(void);
+bool migrate_switchover_ack(void);
 bool migrate_validate_uuid(void);
 bool migrate_xbzrle(void);
 bool migrate_zero_blocks(void);
diff --git a/migration/options.c b/migration/options.c
index b62ab30cd585..16007afca662 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -185,6 +185,8 @@ Property migration_properties[] = {
 DEFINE_PROP_MIG_CAP("x-zero-copy-send",
 MIGRATION_CAPABILITY_ZERO_COPY_SEND),
 #endif
+DEFINE_PROP_MIG_CAP("x-switchover-ack",
+MIGRATION_CAPABILITY_SWITCHOVER_ACK),
 
 DEFINE_PROP_END_OF_LIST(),
 };
@@ -308,6 +310,13 @@ bool migrate_return_path(void)
 return s->capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
 }
 
+bool migrate_switchover_ack(void)
+{
+MigrationState *s = migrate_get_current();
+
+return s->capabilities[MIGRATION_CAPABILITY_SWITCHOVER_ACK];
+}
+
 bool migrate_validate_uuid(void)
 {
 MigrationState *s = migrate_get_current();
@@ -547,6 +556,18 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, 
Error **errp)
 }
 }
 
+if (new_caps[MIGRATION_CAPABILITY_SWITCHOVER_ACK]) {
+if (!new_caps[MIGRATION_CAPABILITY_RETURN_PATH]) {
+error_setg(errp, "Capability 'switchover-ack' requires capability "
+ "'return-path'");
+return false;
+}
+
+/* Disable this capability until it's implemented */
+error_setg(errp, "'switchover-ack' is not implemented yet");
+return false;
+}
+
 return true;
 }
 
-- 
2.41.0




[PULL 03/16] migration: Enable switchover ack capability

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

Now that switchover ack logic has been implemented, enable the
capability.

Signed-off-by: Avihai Horon 
Reviewed-by: Juan Quintela 
Reviewed-by: Peter Xu 
Tested-by: YangHang Liu 
Acked-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 migration/options.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/migration/options.c b/migration/options.c
index 16007afca662..5a9505adf7a8 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -562,10 +562,6 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, 
Error **errp)
  "'return-path'");
 return false;
 }
-
-/* Disable this capability until it's implemented */
-error_setg(errp, "'switchover-ack' is not implemented yet");
-return false;
 }
 
 return true;
-- 
2.41.0




[PULL 14/16] MAINTAINERS: Promote Cédric to VFIO co-maintainer

2023-06-29 Thread Cédric Le Goater
From: Alex Williamson 

Cédric has stepped up involvement in vfio, reviewing and managing
patches, as well as pull requests.  This work deserves gratitude and
punishment with a promotion to co-maintainer ;)

Signed-off-by: Alex Williamson 
Reviewed-by: Philippe Mathieu-Daudé 
Acked-by: Cédric Le Goater 
Signed-off-by: Cédric Le Goater 
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index aba07722f64f..4feea49a6e95 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2051,7 +2051,7 @@ F: hw/usb/dev-serial.c
 
 VFIO
 M: Alex Williamson 
-R: Cédric Le Goater 
+M: Cédric Le Goater 
 S: Supported
 F: hw/vfio/*
 F: include/hw/vfio/
-- 
2.41.0




[PULL 06/16] vfio/migration: Store VFIO migration flags in VFIOMigration

2023-06-29 Thread Cédric Le Goater
From: Avihai Horon 

VFIO migration flags are queried once in vfio_migration_init(). Store
them in VFIOMigration so they can be used later to check the device's
migration capabilities without re-querying them.

This will be used in the next patch to check if the device supports
precopy migration.

Signed-off-by: Avihai Horon 
Reviewed-by: Cédric Le Goater 
Tested-by: YangHang Liu 
Acked-by: Alex Williamson 
Signed-off-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-common.h | 1 +
 hw/vfio/migration.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index eed244f25f34..5f29dab83913 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -66,6 +66,7 @@ typedef struct VFIOMigration {
 int data_fd;
 void *data_buffer;
 size_t data_buffer_size;
+uint64_t mig_flags;
 } VFIOMigration;
 
 typedef struct VFIOAddressSpace {
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 235978fd6805..8d3341437926 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -603,6 +603,7 @@ static int vfio_migration_init(VFIODevice *vbasedev)
 migration->vbasedev = vbasedev;
 migration->device_state = VFIO_DEVICE_STATE_RUNNING;
 migration->data_fd = -1;
+migration->mig_flags = mig_flags;
 
 vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
 
-- 
2.41.0




Re: [PATCH v3 0/6] eBPF RSS through QMP support.

2023-06-29 Thread Jason Wang
On Thu, Jun 15, 2023 at 6:29 AM Andrew Melnychenko  wrote:
>
> This series of patches provides the ability to retrieve eBPF program
> through qmp, so management application may load bpf blob with proper 
> capabilities.
> Now, virtio-net devices can accept eBPF programs and maps through properties
> as external file descriptors. Access to the eBPF map is direct through mmap()
> call, so it should not require additional capabilities to bpf* calls.
> eBPF file descriptors can be passed to QEMU from parent process or by unix
> socket with sendfd() qmp command.
>
> Possible solution for libvirt may look like this: 
> https://github.com/daynix/libvirt/tree/RSS_eBPF (WIP)
>
> Changes since v2:
>  * moved/refactored QMP command
>  * refactored virtio-net

I've queued this series, but a question left:

mmap() support for eBPF maps was not available from day 0; should we
fall back to the syscall on OSes that don't support it?

Thanks

>
> Changes since v1:
>  * refactored virtio-net
>  * moved hunks for ebpf mmap()
>  * added qmp enum for eBPF id.
>
> Andrew Melnychenko (6):
>   ebpf: Added eBPF map update through mmap.
>   ebpf: Added eBPF initialization by fds.
>   virtio-net: Added property to load eBPF RSS with fds.
>   ebpf: Added declaration/initialization routines.
>   qmp: Added new command to retrieve eBPF blob.
>   ebpf: Updated eBPF program and skeleton.
>
>  ebpf/ebpf.c|   70 ++
>  ebpf/ebpf.h|   31 +
>  ebpf/ebpf_rss-stub.c   |6 +
>  ebpf/ebpf_rss.c|  150 +++-
>  ebpf/ebpf_rss.h|   10 +
>  ebpf/meson.build   |2 +-
>  ebpf/rss.bpf.skeleton.h| 1469 
>  hw/net/virtio-net.c|   55 +-
>  include/hw/virtio/virtio-net.h |1 +
>  qapi/ebpf.json |   55 ++
>  qapi/meson.build   |1 +
>  qapi/qapi-schema.json  |1 +
>  tools/ebpf/rss.bpf.c   |2 +-
>  13 files changed, 1093 insertions(+), 760 deletions(-)
>  create mode 100644 ebpf/ebpf.c
>  create mode 100644 ebpf/ebpf.h
>  create mode 100644 qapi/ebpf.json
>
> --
> 2.39.1
>




Re: [PATCH v4 6/6] net: tap: Use qemu_close_range() to close fds

2023-06-29 Thread Jason Wang
On Wed, Jun 28, 2023 at 11:29 PM Bin Meng  wrote:
>
> From: Zhangjin Wu 
>
> Current codes using a brute-force traversal of all file descriptors
> do not scale on a system where the maximum number of file descriptors
> is set to a very large value (e.g.: in a Docker container of Manjaro
> distribution it is set to 1073741816). QEMU just looks frozen during
> start-up.
>
> The close-on-exec flag (O_CLOEXEC) was introduced since Linux kernel
> 2.6.23, FreeBSD 8.3, OpenBSD 5.0, Solaris 11. While it's true QEMU
> doesn't need to manually close the fds for child process as the proper
> O_CLOEXEC flag should have been set properly on files with its own
> codes, QEMU uses a huge number of 3rd party libraries and we don't
> trust them to reliably be using O_CLOEXEC on everything they open.
>
> Modern Linux and BSDs have the close_range() call we can use to do the
> job, and on Linux we have one more way to walk through /proc/self/fd
> to complete the task efficiently, which is what qemu_close_range() does.
>
> Reported-by: Zhangjin Wu 
> Co-developed-by: Bin Meng 
> Signed-off-by: Zhangjin Wu 
> Signed-off-by: Bin Meng 
> Reviewed-by: Richard Henderson 

Patch looks good but I'm not sure using helper scripts is good for the
production environment since it increases attack surfaces. Passing TAP
fd should be a better way.

Thanks

>
> ---
>
> Changes in v4:
> - put fd on its own line
>
> Changes in v2:
> - Change to use qemu_close_range() to close fds for child process efficiently
> - v1 link: 
> https://lore.kernel.org/qemu-devel/20230406112041.798585-1-bm...@tinylab.org/
>
>  net/tap.c | 24 
>  1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/net/tap.c b/net/tap.c
> index 1bf085d422..9f080215f0 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -446,13 +446,13 @@ static void launch_script(const char *setup_script, 
> const char *ifname,
>  return;
>  }
>  if (pid == 0) {
> -int open_max = sysconf(_SC_OPEN_MAX), i;
> +unsigned int last_fd = sysconf(_SC_OPEN_MAX) - 1;
> +
> +/* skip stdin, stdout and stderr */
> +qemu_close_range(3, fd - 1);
> +/* skip the currently used fd */
> +qemu_close_range(fd + 1, last_fd);
>
> -for (i = 3; i < open_max; i++) {
> -if (i != fd) {
> -close(i);
> -}
> -}
>  parg = args;
>  *parg++ = (char *)setup_script;
>  *parg++ = (char *)ifname;
> @@ -536,16 +536,16 @@ static int net_bridge_run_helper(const char *helper, 
> const char *bridge,
>  return -1;
>  }
>  if (pid == 0) {
> -int open_max = sysconf(_SC_OPEN_MAX), i;
> +unsigned int last_fd = sysconf(_SC_OPEN_MAX) - 1;
> +unsigned int fd = sv[1];
>  char *fd_buf = NULL;
>  char *br_buf = NULL;
>  char *helper_cmd = NULL;
>
> -for (i = 3; i < open_max; i++) {
> -if (i != sv[1]) {
> -close(i);
> -}
> -}
> +/* skip stdin, stdout and stderr */
> +qemu_close_range(3, fd - 1);
> +/* skip the currently used fd */
> +qemu_close_range(fd + 1, last_fd);
>
>  fd_buf = g_strdup_printf("%s%d", "--fd=", sv[1]);
>
> --
> 2.34.1
>




[PATCH 2/4] ppc/pnv: Subclass quad xscom callbacks

2023-06-29 Thread Joel Stanley
Make the existing pnv_quad_xscom_read/write be P9 specific, in
preparation for a different P10 callback.

Signed-off-by: Joel Stanley 
---
 include/hw/ppc/pnv_core.h | 12 +++-
 hw/ppc/pnv.c  | 11 +++
 hw/ppc/pnv_core.c | 36 
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/include/hw/ppc/pnv_core.h b/include/hw/ppc/pnv_core.h
index 3d75706e95da..ab3f6d6c2843 100644
--- a/include/hw/ppc/pnv_core.h
+++ b/include/hw/ppc/pnv_core.h
@@ -60,8 +60,18 @@ static inline PnvCPUState *pnv_cpu_state(PowerPCCPU *cpu)
 return (PnvCPUState *)cpu->machine_data;
 }
 
+struct PnvQuadClass {
+DeviceClass parent_class;
+
+const MemoryRegionOps *xscom_ops;
+};
+
 #define TYPE_PNV_QUAD "powernv-cpu-quad"
-OBJECT_DECLARE_SIMPLE_TYPE(PnvQuad, PNV_QUAD)
+
+#define PNV_QUAD_TYPE_SUFFIX "-" TYPE_PNV_QUAD
+#define PNV_QUAD_TYPE_NAME(cpu_model) cpu_model PNV_QUAD_TYPE_SUFFIX
+
+OBJECT_DECLARE_TYPE(PnvQuad, PnvQuadClass, PNV_QUAD)
 
 struct PnvQuad {
 DeviceState parent_obj;
diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index fc083173f346..c77fdb6747a4 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1429,14 +1429,15 @@ static void pnv_chip_power9_instance_init(Object *obj)
 }
 
 static void pnv_chip_quad_realize_one(PnvChip *chip, PnvQuad *eq,
-  PnvCore *pnv_core)
+  PnvCore *pnv_core,
+  const char *type)
 {
 char eq_name[32];
 int core_id = CPU_CORE(pnv_core)->core_id;
 
 snprintf(eq_name, sizeof(eq_name), "eq[%d]", core_id);
 object_initialize_child_with_props(OBJECT(chip), eq_name, eq,
-   sizeof(*eq), TYPE_PNV_QUAD,
+   sizeof(*eq), type,
_fatal, NULL);
 
 object_property_set_int(OBJECT(eq), "quad-id", core_id, _fatal);
@@ -1454,7 +1455,8 @@ static void pnv_chip_quad_realize(Pnv9Chip *chip9, Error 
**errp)
 for (i = 0; i < chip9->nr_quads; i++) {
 PnvQuad *eq = >quads[i];
 
-pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4]);
+pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4],
+  PNV_QUAD_TYPE_NAME("power9"));
 
 pnv_xscom_add_subregion(chip, PNV9_XSCOM_EQ_BASE(eq->quad_id),
 >xscom_regs);
@@ -1666,7 +1668,8 @@ static void pnv_chip_power10_quad_realize(Pnv10Chip 
*chip10, Error **errp)
 for (i = 0; i < chip10->nr_quads; i++) {
 PnvQuad *eq = >quads[i];
 
-pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4]);
+pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4],
+  PNV_QUAD_TYPE_NAME("power9"));
 
 pnv_xscom_add_subregion(chip, PNV10_XSCOM_EQ_BASE(eq->quad_id),
 >xscom_regs);
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 0b1c3cccfebc..b9a57463aec4 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -407,11 +407,12 @@ static const MemoryRegionOps pnv_quad_power9_xscom_ops = {
 static void pnv_quad_realize(DeviceState *dev, Error **errp)
 {
 PnvQuad *eq = PNV_QUAD(dev);
+PnvQuadClass *pqc = PNV_QUAD_GET_CLASS(eq);
 char name[32];
 
 snprintf(name, sizeof(name), "xscom-quad.%d", eq->quad_id);
 pnv_xscom_region_init(>xscom_regs, OBJECT(dev),
-  _quad_power9_xscom_ops,
+  pqc->xscom_ops,
   eq, name, PNV9_XSCOM_EQ_SIZE);
 }
 
@@ -420,6 +421,13 @@ static Property pnv_quad_properties[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
+static void pnv_quad_power9_class_init(ObjectClass *oc, void *data)
+{
+PnvQuadClass *pqc = PNV_QUAD_CLASS(oc);
+
+pqc->xscom_ops = _quad_power9_xscom_ops;
+}
+
 static void pnv_quad_class_init(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
@@ -429,16 +437,20 @@ static void pnv_quad_class_init(ObjectClass *oc, void 
*data)
 dc->user_creatable = false;
 }
 
-static const TypeInfo pnv_quad_info = {
-.name  = TYPE_PNV_QUAD,
-.parent= TYPE_DEVICE,
-.instance_size = sizeof(PnvQuad),
-.class_init= pnv_quad_class_init,
+static const TypeInfo pnv_quad_infos[] = {
+{
+.name  = TYPE_PNV_QUAD,
+.parent= TYPE_DEVICE,
+.instance_size = sizeof(PnvQuad),
+.class_size= sizeof(PnvQuadClass),
+.class_init= pnv_quad_class_init,
+.abstract  = true,
+},
+{
+.parent = TYPE_PNV_QUAD,
+.name = PNV_QUAD_TYPE_NAME("power9"),
+.class_init = pnv_quad_power9_class_init,
+},
 };
 
-static void pnv_core_register_types(void)
-{
-type_register_static(_quad_info);
-}
-
-type_init(pnv_core_register_types)
+DEFINE_TYPES(pnv_quad_infos);
-- 
2.40.1




[PATCH 0/4] ppc/pnv: Extend "quad" model for p10

2023-06-29 Thread Joel Stanley
The quad model implements the EC xscoms for the p9 machine, reusing the
same model for p10 which isn't quite correct. This series adds a PnvQuad
class and subclasses it for P9 and P10. Implement the core thread state
xscom as an example. I expect more function to be implemented in future
patches.

There's one outstanding question. Skiboot has this for the p10 scom:

 #define P10_EC_CORE_THREAD_STATE0x412

However the read that comes is for 0x28412. I suspect the upper 0x28000
are addressing bits, so we're really reporting the core thread state for
the given core. Should the model instead be wired so one is created for
each chiplet? Or should we report the value for all possible cores, like
the P9 code does for P9X_EX_NCU_SPEC_BAR?

switch (offset) {
case P9X_EX_NCU_SPEC_BAR:
case P9X_EX_NCU_SPEC_BAR + 0x400: /* Second EX */

Joel Stanley (4):
  ppc/pnv: quad xscom callbacks are P9 specific
  ppc/pnv: Subclass quad xscom callbacks
  ppc/pnv: Add P10 quad ops
  ppc/pnv: Return zero for core thread state xscom

 include/hw/ppc/pnv_core.h |  12 +++-
 hw/ppc/pnv.c  |  11 ++--
 hw/ppc/pnv_core.c | 114 +++---
 3 files changed, 113 insertions(+), 24 deletions(-)

-- 
2.40.1




[PATCH 4/4] ppc/pnv: Return zero for core thread state xscom

2023-06-29 Thread Joel Stanley
Firmware now warns if booting in LPAR per core mode (PPC bit 62). So that
this warning doesn't trigger, report the core thread state as 0.

Signed-off-by: Joel Stanley 
---
 hw/ppc/pnv_core.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 7fff2fd9e298..98356d7f6538 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -116,6 +116,8 @@ static const MemoryRegionOps pnv_core_power8_xscom_ops = {
 #define PNV9_XSCOM_EC_PPM_SPECIAL_WKUP_HYP 0xf010d
 #define PNV9_XSCOM_EC_PPM_SPECIAL_WKUP_OTR 0xf010a
 
+#define PNV9_XSCOM_EC_CORE_THREAD_STATE0x10ab3
+
 static uint64_t pnv_core_power9_xscom_read(void *opaque, hwaddr addr,
unsigned int width)
 {
@@ -134,6 +136,9 @@ static uint64_t pnv_core_power9_xscom_read(void *opaque, 
hwaddr addr,
 case PNV9_XSCOM_EC_PPM_SPECIAL_WKUP_OTR:
 val = 0x0;
 break;
+case PNV9_XSCOM_EC_CORE_THREAD_STATE:
+val = 0;
+break;
 default:
 qemu_log_mask(LOG_UNIMP, "Warning: reading reg=0x%" HWADDR_PRIx "\n",
   addr);
@@ -408,6 +413,8 @@ static const MemoryRegionOps pnv_quad_power9_xscom_ops = {
  * POWER10 Quads
  */
 
+#define PNV10_XSCOM_EC_PC_PMC_CORE_THREAD_STATE 0x28412
+
 static uint64_t pnv_quad_power10_xscom_read(void *opaque, hwaddr addr,
 unsigned int width)
 {
@@ -415,6 +422,9 @@ static uint64_t pnv_quad_power10_xscom_read(void *opaque, 
hwaddr addr,
 uint64_t val = -1;
 
 switch (offset) {
+case PNV10_XSCOM_EC_PC_PMC_CORE_THREAD_STATE:
+val = 0;
+break;
 default:
 qemu_log_mask(LOG_UNIMP, "%s: writing @0x%08x\n", __func__,
   offset);
-- 
2.40.1




[PATCH 1/4] ppc/pnv: quad xscom callbacks are P9 specific

2023-06-29 Thread Joel Stanley
Rename the quad xscom callbacks to include P9 in the name in preparation for adding P10
versions.

Signed-off-by: Joel Stanley 
---
 hw/ppc/pnv_core.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 0bc3ad41c81c..0b1c3cccfebc 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -360,8 +360,8 @@ DEFINE_TYPES(pnv_core_infos)
 
 #define P9X_EX_NCU_SPEC_BAR 0x11010
 
-static uint64_t pnv_quad_xscom_read(void *opaque, hwaddr addr,
-unsigned int width)
+static uint64_t pnv_quad_power9_xscom_read(void *opaque, hwaddr addr,
+   unsigned int width)
 {
 uint32_t offset = addr >> 3;
 uint64_t val = -1;
@@ -379,8 +379,8 @@ static uint64_t pnv_quad_xscom_read(void *opaque, hwaddr 
addr,
 return val;
 }
 
-static void pnv_quad_xscom_write(void *opaque, hwaddr addr, uint64_t val,
- unsigned int width)
+static void pnv_quad_power9_xscom_write(void *opaque, hwaddr addr, uint64_t 
val,
+unsigned int width)
 {
 uint32_t offset = addr >> 3;
 
@@ -394,9 +394,9 @@ static void pnv_quad_xscom_write(void *opaque, hwaddr addr, 
uint64_t val,
 }
 }
 
-static const MemoryRegionOps pnv_quad_xscom_ops = {
-.read = pnv_quad_xscom_read,
-.write = pnv_quad_xscom_write,
+static const MemoryRegionOps pnv_quad_power9_xscom_ops = {
+.read = pnv_quad_power9_xscom_read,
+.write = pnv_quad_power9_xscom_write,
 .valid.min_access_size = 8,
 .valid.max_access_size = 8,
 .impl.min_access_size = 8,
@@ -410,7 +410,8 @@ static void pnv_quad_realize(DeviceState *dev, Error **errp)
 char name[32];
 
 snprintf(name, sizeof(name), "xscom-quad.%d", eq->quad_id);
-pnv_xscom_region_init(>xscom_regs, OBJECT(dev), _quad_xscom_ops,
+pnv_xscom_region_init(>xscom_regs, OBJECT(dev),
+  _quad_power9_xscom_ops,
   eq, name, PNV9_XSCOM_EQ_SIZE);
 }
 
-- 
2.40.1




[PATCH 3/4] ppc/pnv: Add P10 quad ops

2023-06-29 Thread Joel Stanley
Add a PnvQuad class for the P10 powernv machine. No xscoms are
implemented yet, but this allows them to be added.

Signed-off-by: Joel Stanley 
---
 hw/ppc/pnv.c  |  2 +-
 hw/ppc/pnv_core.c | 53 +++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index c77fdb6747a4..5f25fe985ab2 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -1669,7 +1669,7 @@ static void pnv_chip_power10_quad_realize(Pnv10Chip 
*chip10, Error **errp)
 PnvQuad *eq = >quads[i];
 
 pnv_chip_quad_realize_one(chip, eq, chip->cores[i * 4],
-  PNV_QUAD_TYPE_NAME("power9"));
+  PNV_QUAD_TYPE_NAME("power10"));
 
 pnv_xscom_add_subregion(chip, PNV10_XSCOM_EQ_BASE(eq->quad_id),
 >xscom_regs);
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index b9a57463aec4..7fff2fd9e298 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -404,6 +404,47 @@ static const MemoryRegionOps pnv_quad_power9_xscom_ops = {
 .endianness = DEVICE_BIG_ENDIAN,
 };
 
+/*
+ * POWER10 Quads
+ */
+
+static uint64_t pnv_quad_power10_xscom_read(void *opaque, hwaddr addr,
+unsigned int width)
+{
+uint32_t offset = addr >> 3;
+uint64_t val = -1;
+
+switch (offset) {
+default:
+qemu_log_mask(LOG_UNIMP, "%s: writing @0x%08x\n", __func__,
+  offset);
+}
+
+return val;
+}
+
+static void pnv_quad_power10_xscom_write(void *opaque, hwaddr addr, uint64_t 
val,
+ unsigned int width)
+{
+uint32_t offset = addr >> 3;
+
+switch (offset) {
+default:
+qemu_log_mask(LOG_UNIMP, "%s: writing @0x%08x\n", __func__,
+  offset);
+}
+}
+
+static const MemoryRegionOps pnv_quad_power10_xscom_ops = {
+.read = pnv_quad_power10_xscom_read,
+.write = pnv_quad_power10_xscom_write,
+.valid.min_access_size = 8,
+.valid.max_access_size = 8,
+.impl.min_access_size = 8,
+.impl.max_access_size = 8,
+.endianness = DEVICE_BIG_ENDIAN,
+};
+
 static void pnv_quad_realize(DeviceState *dev, Error **errp)
 {
 PnvQuad *eq = PNV_QUAD(dev);
@@ -428,6 +469,13 @@ static void pnv_quad_power9_class_init(ObjectClass *oc, 
void *data)
 pqc->xscom_ops = _quad_power9_xscom_ops;
 }
 
+static void pnv_quad_power10_class_init(ObjectClass *oc, void *data)
+{
+PnvQuadClass *pqc = PNV_QUAD_CLASS(oc);
+
+pqc->xscom_ops = _quad_power10_xscom_ops;
+}
+
 static void pnv_quad_class_init(ObjectClass *oc, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(oc);
@@ -451,6 +499,11 @@ static const TypeInfo pnv_quad_infos[] = {
 .name = PNV_QUAD_TYPE_NAME("power9"),
 .class_init = pnv_quad_power9_class_init,
 },
+{
+.parent = TYPE_PNV_QUAD,
+.name = PNV_QUAD_TYPE_NAME("power10"),
+.class_init = pnv_quad_power10_class_init,
+},
 };
 
 DEFINE_TYPES(pnv_quad_infos);
-- 
2.40.1




Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-29 Thread Akihiko Odaki

On 2023/06/29 23:18, Ani Sinha wrote:




On 29-Jun-2023, at 2:19 PM, Akihiko Odaki  wrote:

On 2023/06/29 17:05, Ani Sinha wrote:

On Thu, 29 Jun, 2023, 12:17 pm Akihiko Odaki, mailto:akihiko.od...@daynix.com>> wrote:
On 2023/06/29 13:07, Ani Sinha wrote:
 > PCI Express ports only have one slot, so PCI Express devices can
only be
 > plugged into slot 0 on a PCIE port. Enforce it.
 >
 > The change has been tested to not break ARI by instantiating
seven vfs on an
 > emulated igb device (the maximum number of vfs the linux igb
driver supports).
 > The vfs are seen to have non-zero device/slot numbers in the
conventional
 > PCI BDF representation.
 >
 > CC: jus...@redhat.com 
 > CC: imamm...@redhat.com 
 > CC: akihiko.od...@daynix.com 
 >
 > Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929

 > Signed-off-by: Ani Sinha mailto:anisi...@redhat.com>>
 > Reviewed-by: Julia Suvorova mailto:jus...@redhat.com>>
 > ---
 >   hw/pci/pci.c | 15 +++
 >   1 file changed, 15 insertions(+)
 >
 > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
 > index e2eb4c3b4a..0320ac2bb3 100644
 > --- a/hw/pci/pci.c
 > +++ b/hw/pci/pci.c
 > @@ -65,6 +65,7 @@ bool pci_available = true;
 >   static char *pcibus_get_dev_path(DeviceState *dev);
 >   static char *pcibus_get_fw_dev_path(DeviceState *dev);
 >   static void pcibus_reset(BusState *qbus);
 > +static bool pcie_has_upstream_port(PCIDevice *dev);
 >
 >   static Property pci_props[] = {
 >   DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
 > @@ -1190,6 +1191,20 @@ static PCIDevice
*do_pci_register_device(PCIDevice *pci_dev,
 >  name);
 >
 >  return NULL;
 > +} /*
 > +   * With SRIOV and ARI, vfs can have non-zero slot in the
conventional
 > +   * PCI interpretation as all five bits reserved for slot
addresses are
 > +   * also used for function bits for the various vfs. Ignore
that case.
 > +   * It is too early here to check for ARI capabilities in
the PCI config
 > +   * space. Hence, we check for a vf device instead.
 > +   */
Why not just perform this check after the capabilities are set?
We don't want to allocate resources for wrong device parameters. We want to 
error out early. Other checks are also performed at the same place.


It is indeed better to raise an error as early as possible so that we can avoid 
allocation and other operations that will be reverted and may go wrong due to 
the invalid condition. That should be the reason why other checks for the 
address are performed here.

However, in this particular case, we cannot confidently perform the check here 
because it is unknown if the ARI capability will be advertised until the device 
realization code runs. This can justify delaying the check after the device 
realization, unlike the other checks.


Ok so are you proposing that the check we have right before (the check for 
unoccupied function 0) also be moved? It also uses the same vf approximation, 
seemingly to support ARI.


No, I don't think the check for function 0 is required to be disabled 
because of the change of addressing caused by ARI, but it is required 
because SR-IOV VF can be added and removed while the PF (function 0) 
remains. I think this check should be performed also when SR-IOV is 
disabled and ARI is enabled.


Thus the check for unoccupied function 0 needs to use pci_is_vf() 
instead of checking ARI capability, and that can happen in 
do_pci_register_device().



Also where do you propose we move the check?


In pci_qdev_realize(), somewhere after pc->realize() and before option 
ROM loading. See the check for failover pair as an example. I guess it's 
also placed in this region because it needs capability information.







Show quoted text
Regards,
Akihiko Odaki
 > +else if (!pci_is_vf(pci_dev) &&
 > + pcie_has_upstream_port(pci_dev) &&
 > + PCI_SLOT(devfn)) {
 > +error_setg(errp, "PCI: slot %d is not valid for %s,"
 > +   " parent device only allows plugging into
slot 0.",
 > +   PCI_SLOT(devfn), name);
 > +return NULL;
 >   }
 >
 >   pci_dev->devfn = devfn;






RE: [PATCH v4 5/5] vfio/migration: Refactor and fix print of "Migration disabled"

2023-06-29 Thread Duan, Zhenzhong
>-Original Message-
>From: Cédric Le Goater 
>Sent: Friday, June 30, 2023 12:40 AM
>Subject: Re: [PATCH v4 5/5] vfio/migration: Refactor and fix print of 
>"Migration
>disabled"
>
>Hello Zhenzhong,
>
>On 6/29/23 10:40, Zhenzhong Duan wrote:
>> This patch refactors vfio_migration_realize() and its dependent code
>> as follows:
>>
>> 1. It's redundant in vfio_migration_realize() to register multiple blockers,
>> e.g: vIOMMU blocker can be refactored as per device blocker.
>> 2. Change vfio_viommu_preset() to be only a per device checker.
>> 3. Remove global vIOMMU blocker related stuff, e.g:
>> giommu_migration_blocker, vfio_[block|unblock]_giommu_migration()
>> and vfio_migration_finalize()
>> 4. Change vfio_migration_realize(), vfio_block_multiple_devices_migration()
>> vfio_block_migration() and vfio_viommu_preset() to return bool type.
>> 5. Print "Migration disabled" depending on enable_migration property
>> and print it as warning instead of error which is overkill.
>
>
>We are close to soft freeze and these combo patches adding various fixes all
>at once are difficult to evaluate.
>
>Please split this patch into multiple ones to ease the review.  Maybe start with
>the int -> bool conversion of the return values. It should remove some noise.
Good suggestion! Will do.

Thanks
Zhenzhong


RE: [PATCH v4 5/5] vfio/migration: Refactor and fix print of "Migration disabled"

2023-06-29 Thread Duan, Zhenzhong
>-Original Message-
>From: Alex Williamson 
>Subject: Re: [PATCH v4 5/5] vfio/migration: Refactor and fix print of 
>"Migration
>disabled"
>
>On Thu, 29 Jun 2023 16:42:23 +0100
>Joao Martins  wrote:
>
>> On 29/06/2023 16:20, Avihai Horon wrote:
>> > On 29/06/2023 15:44, Joao Martins wrote:
>> >> On 29/06/2023 09:40, Zhenzhong Duan wrote:
...
>> >>> @@ -403,9 +402,15 @@ int
>> >>> vfio_block_multiple_devices_migration(VFIODevice
>> >>> *vbasedev, Error **errp)
>> >>>   if (ret < 0) {
>> >>>   error_free(multiple_devices_migration_blocker);
>> >>>   multiple_devices_migration_blocker = NULL;
>> >>> +    } else {
>> >>> +    /*
>> >>> + * Only ON_OFF_AUTO_AUTO case, ON_OFF_AUTO_OFF is checked
>> >>> + * in vfio_migration_realize().
>> >>> + */
>> >>> +    warn_report("Migration disabled, not support multiple
>> >>> +VFIO devices");
>> >>>   }
>> >>>
>> >> Perhaps you could stash the previous error message and use it in
>> >> the warn_report_error to consolidate the error messages e.g.
>> >>
>> >> bool vfio_block_multiple_devices_migration(VFIODevice *vbasedev,
>> >> Error **errp) {
>> >>  Error *err = NULL;
>> >>
>> >>  if (multiple_devices_migration_blocker ||
>> >>  vfio_migratable_device_num() <= 1) {
>> >>  return true;
>> >>  }
>> >>
>> >>  error_setg(, "%s: Migration is currently not supported with
>multiple "
>> >>   "VFIO devices", vbasedev->name);
>> >>
>> >>  if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
>> >>  error_propagate(errp, err);
>> >>  return -EINVAL;
>> >>  }
>> >>
>> >>  ...
>> >>  if (ret < 0) {
>> >>  } else {
>> >>  /* Warns only on ON_OFF_AUTO_AUTO case */
>> >>  warn_report_err(err);
>> >
>> > I'm not sure this warning is needed.
>> > If I remember correctly, I think Alex didn't want migration
>> > error/warning messages to be logged in the AUTO case.
>
>Correct.
>
>> Hmm, ok, I missed this from the previous discussions.
>>
>> So today there are migration warnings in the current code. (even in
>> the AUTO case). So if we want them removed, then this patch would then
>> just remove the "Migration disabled" altogether (in the two places we
>commented).
>>
>> The rest of the cases already propagate the error I think. And the
>> AUTO case will always be blocked migration and see the same printed
>messages elsewhere.
>
>I tested this with Avihai's series and saw the correct logging, at least for a
>device that does not support migration.
>
>In AUTO mode, we should only ever see errors or warnings if the device
>supports migration and an error or incompatibility occurs while further
>probing or configuring it.  Lack of support for migration should only ever
>generate an error or warning when using enable_migration=on or the global -
>only-migratable flag.
Will remove the two places of "Migration disabled" print.

>
>As I understood Avihai's patch, we're populating the Error pointer, but we
>only ever propagate that error in the above cases.  Thanks,
>
>Alex
>
...
>> >>> +818,11 @@ static int vfio_block_migration(VFIODevice *vbasedev,
>> >>> Error *err, Error **errp)
>> >>>   if (ret < 0) {
>> >>>   error_free(vbasedev->migration_blocker);
>> >>>   vbasedev->migration_blocker = NULL;
>> >>> +    } else if (vbasedev->enable_migration != ON_OFF_AUTO_OFF) {
>> >>> +    warn_report("%s: Migration disabled", vbasedev->name);
>> >>>   }
>> >>>
>> >> Perhaps you can use the local error to expand on why migration
>> >> was disabled e.g.
>> >>
>> >>  warn_report_err(err);
>> >
>> > Same here.
>> >
>> > Thanks.
>> >
>> >>
>> >>> -    return ret;
>> >>> +    return !ret;
>> >>>   }
>> >>>
>> >>>   /*
>> >>> --
>> >>>  */ @@ -835,7 +837,12 @@ void
>> >>> vfio_reset_bytes_transferred(void)
>> >>>   bytes_transferred = 0;
>> >>>   }
>> >>>
>> >>> -int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
>> >>> +/*
>> >>> + * Return true when either migration initialized or blocker registered.
>> >>> + * Currently only return false when adding blocker fails which
>> >>> +will
>> >>> + * de-register vfio device.
>> >>> + */
>> >>> +bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
>> >>>   {
>> >>>   Error *err = NULL;
>> >>>   int ret;
>> >>> @@ -873,18 +880,17 @@ int vfio_migration_realize(VFIODevice
>> >>> *vbasedev, Error
>> >>> **errp)
>> >>>   vbasedev->name);
>> >>>   }
>> >>>
>> >>> -    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
>> >>> -    if (ret) {
>> >>> -    return ret;
>> >>> +    if (!vfio_block_multiple_devices_migration(vbasedev, errp)) {
>> >>> +    return false;
>> >>>   }
>> >>>
>> >>> -    ret = vfio_block_giommu_migration(vbasedev, errp);
>> >>> -    if (ret) {
>> >>> -    return ret;
>> >>> +    if 

RE: [PATCH v4 4/5] vfio/pci: Free resources when vfio_migration_realize fails

2023-06-29 Thread Duan, Zhenzhong


>-Original Message-
>From: Joao Martins 
>Subject: Re: [PATCH v4 4/5] vfio/pci: Free resources when
>vfio_migration_realize fails
>
>On 29/06/2023 09:40, Zhenzhong Duan wrote:
>> When vfio_realize() succeeds, hot unplug will call vfio_exitfn() to
>> free resources allocated in vfio_realize(); when vfio_realize() fails,
>> vfio_exitfn() is never called and we need to free resources in
>> vfio_realize().
>>
>> In the case that vfio_migration_realize() fails,
>> e.g: with -only-migratable & enable-migration=off, we see below:
>>
>> (qemu) device_add
>> vfio-pci,host=81:11.1,id=vfio1,bus=root1,enable-migration=off
>> :81:11.1: Migration disabled
>> Error: disallowing migration blocker (--only-migratable) for:
>> :81:11.1: Migration is disabled for VFIO device
>>
>> If we hotplug again we should see same log as above, but we see:
>> (qemu) device_add
>> vfio-pci,host=81:11.1,id=vfio1,bus=root1,enable-migration=off
>> Error: vfio :81:11.1: device is already attached
>>
>> That's because some references to the VFIO device aren't released. We
>> should check the return value of vfio_migration_realize() and release the
>> references; then the VFIO device will be truly released when hotplug
>> fails.
>>
>> Fixes: a22651053b59 ("vfio: Make vfio-pci device migration capable")
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>  hw/vfio/pci.c | 3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index
>> 54a8179d1c64..dc69d3031b24 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -3210,6 +3210,7 @@ static void vfio_realize(PCIDevice *pdev, Error
>**errp)
>>  ret = vfio_migration_realize(vbasedev, errp);
>>  if (ret) {
>>  error_report("%s: Migration disabled", vbasedev->name);
>> +goto out_vfio_migration;
>>  }
>>  }
>>
>> @@ -3219,6 +3220,8 @@ static void vfio_realize(PCIDevice *pdev, Error
>> **errp)
>>
>>  return;
>>
>> +out_vfio_migration:
>> +vfio_migration_exit(vbasedev);
>>  out_deregister:
>>  vfio_disable_interrupts(vdev);
>>  out_intx_disable:
>
>I agree with the general sentiment behind the change.
>Clearly vfio::migration and vfio::migration_blocker are leaking from inside the
>migration_realize() function.
>
>But it is kinda awkward semantic that vfio_migration_realize() (or any realize)
>failures need to be accompanied with a vfio_migration_exit() that tears down
>state *leaked* by its realize() failure.
>
>It sounds to me that this should be inside the vfio_migration_realize() not on
>the caller? Unless QEMU ::realize() is expected to do this.
Good suggestion, will fix.

Thanks
Zhenzhong


RE: [PATCH v4 3/5] vfio/pci: Disable INTx in vfio_realize error path

2023-06-29 Thread Duan, Zhenzhong
>-Original Message-
>From: Joao Martins 
>Subject: Re: [PATCH v4 3/5] vfio/pci: Disable INTx in vfio_realize error path
>
>
>
>On 29/06/2023 16:13, Cédric Le Goater wrote:
>> On 6/29/23 13:24, Joao Martins wrote:
>>> On 29/06/2023 09:40, Zhenzhong Duan wrote:
 When vfio realize fails, INTx isn't disabled if it has been enabled.
 This may confuse host side with unhandled interrupt report.

 Add a new label to be used for vfio_intx_enable() failed case.

 Fixes: a9994687cb9b ("vfio/display: core & wireup")
 Fixes: b290659fc3dd ("hw/vfio/display: add ramfb support")
 Fixes: c62a0c7ce34e ("vfio/display: add xres + yres properties")
>>>
>>> Sounds to me the correct Fixes tag is the same as first patch i.e.:
>>>
>>> Fixes: c5478fea27ac ("vfio/pci: Respond to KVM irqchip change
>>> notifier")

OK, will use it.
Previously I thought I should pick commit a9994687cb9b which firstly
introduced the timer leak with a jump label out_teardown, then
b290659fc3dd and c62a0c7ce34e which used out_teardown.

>>>
 Signed-off-by: Zhenzhong Duan 
>>>
>>> Looks good, but see some clarifications below.
>>>
 ---
   hw/vfio/pci.c | 4 +++-
   1 file changed, 3 insertions(+), 1 deletion(-)

 diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index
 ab6645ba60af..54a8179d1c64 100644
 --- a/hw/vfio/pci.c
 +++ b/hw/vfio/pci.c
 @@ -3167,7 +3167,7 @@ static void vfio_realize(PCIDevice *pdev,
 Error **errp)

 kvm_irqchip_add_change_notifier(>irqchip_change_notifier);
   ret = vfio_intx_enable(vdev, errp);
   if (ret) {
 -    goto out_deregister;
 +    goto out_intx_disable;
   }
   }
   @@ -3220,6 +3220,8 @@ static void vfio_realize(PCIDevice *pdev,
 Error **errp)
   return;
     out_deregister:
 +    vfio_disable_interrupts(vdev);
>>>
>>> You are calling vfio_disable_interrupts() when what you want is
>>> vfio_intx_disable() ? But I guess your thinking was to call
>>> vfio_disable_interrupt() which eventually calls vfio_intx_disable()
>>> in case INTx was really setup, thus saving the duplicated check. The
>>> MSIx/MSI in realize() I don't think they will be enabled at this point.
Yes.

>>> Let me know if I misunderstood.
>>>
 +out_intx_disable:
>>>
>>> Maybe 'out_intx_teardown' or 'out_intx_deregister' because you are
>>> not really disabling INTx.
>>
>> or simply extract from vfio_disable_interrupts() :
>>
>>     if (vdev->interrupt == VFIO_INT_INTx) {
>>     vfio_intx_disable(vdev);
>>     }
>>
>> and add the above code before cleaning up the intx routing notifier
>> without any new goto labels.
>>
>An even better option indeed.
Will do.

Thanks
Zhenzhong



Memory region endianness

2023-06-29 Thread BALATON Zoltan

Hello,

Some devices have bits that allow the guest to change endianness of memory 
mapped resources, e.g. ati-vga should allow switching the regs BAR into 
big endian on writing a bit. What's the best way to emulate this?


The naive way could be to just test for the bit in the memory ops call 
backs and do the swap there, but that would add overhead when it's not 
needed (most guests don't need it) and there are two BARs to access the 
same registers (one is in an IO BAR that aliases part of the MEM space 
BAR) and these may need to have different endianness so I'd rather have 
the memory layer handle it.


Now the question is how can the endianness be changed from the memory ops 
call back? Is it allowed to overwrite ops.endianness or replace ops with 
another one that has DEVICE_BIG_ENDIAN? In MemoryRegion the ops field is 
declared const and nothing seems to try to change it so I guess it might 
not be changed.


Then do I need to define two memory regions one with little and another 
with big endian and unmap/map those when the bit is written? Can this be 
done when a write to the bit happens with LE ops then is it possible from 
the callback to unmap the memory region being written and replace it with 
another? Is there any other easy simple way that I'm missing?


Regards,
BALATON Zoltan



Re: [RESEND][PATCH v1 2/2] xen_arm: Initialize RAM and add hi/low memory regions

2023-06-29 Thread Stefano Stabellini
On Thu, 29 Jun 2023, Vikram Garhwal wrote:
> From: Oleksandr Tyshchenko 
> 
> In order to use virtio backends we need to initialize RAM for the
> xen-mapcache (which is responsible for mapping guest memory using foreign
> mapping) to work. Calculate and add hi/low memory regions based on
> machine->ram_size.
> 
> Use the constants defined in public header arch-arm.h to be aligned with the 
> xen
> toolstack.
> 
> While using this machine, the toolstack should then pass real ram_size using
> "-m" arg. If "-m" is not given, create a QEMU machine without IOREQ, TPM and
> VIRTIO to keep it usable for /etc/init.d/xencommons.
> 
> Signed-off-by: Oleksandr Tyshchenko 
> Signed-off-by: Vikram Garhwal 
> ---
>  hw/arm/xen_arm.c | 45 +
>  1 file changed, 45 insertions(+)
> 
> diff --git a/hw/arm/xen_arm.c b/hw/arm/xen_arm.c
> index c0a93f2c9d..cc4dffee70 100644
> --- a/hw/arm/xen_arm.c
> +++ b/hw/arm/xen_arm.c
> @@ -60,6 +60,8 @@ struct XenArmState {
>  } cfg;
>  };
>  
> +static MemoryRegion ram_lo, ram_hi;
> +
>  #define VIRTIO_MMIO_DEV_SIZE   0x200
>  
>  #define NR_VIRTIO_MMIO_DEVICES   \
> @@ -86,6 +88,39 @@ static void xen_create_virtio_mmio_devices(XenArmState 
> *xam)
>  }
>  }
>  
> +static void xen_init_ram(MachineState *machine)
> +{
> +MemoryRegion *sysmem = get_system_memory();
> +ram_addr_t block_len, ram_size[GUEST_RAM_BANKS];
> +
> +if (machine->ram_size <= GUEST_RAM0_SIZE) {
> +ram_size[0] = machine->ram_size;
> +ram_size[1] = 0;
> +block_len = GUEST_RAM0_BASE + ram_size[0];
> +} else {
> +ram_size[0] = GUEST_RAM0_SIZE;
> +ram_size[1] = machine->ram_size - GUEST_RAM0_SIZE;
> +block_len = GUEST_RAM1_BASE + ram_size[1];
> +}
> +
> +memory_region_init_ram(_memory, NULL, "xen.ram", block_len,
> +   _fatal);
> +
> +memory_region_init_alias(_lo, NULL, "xen.ram.lo", _memory,
> + GUEST_RAM0_BASE, ram_size[0]);
> +memory_region_add_subregion(sysmem, GUEST_RAM0_BASE, _lo);
> +DPRINTF("Initialized region xen.ram.lo: base 0x%llx size 0x%lx\n",
> +GUEST_RAM0_BASE, ram_size[0]);
> +
> +if (ram_size[1] > 0) {
> +memory_region_init_alias(_hi, NULL, "xen.ram.hi", _memory,
> + GUEST_RAM1_BASE, ram_size[1]);
> +memory_region_add_subregion(sysmem, GUEST_RAM1_BASE, _hi);
> +DPRINTF("Initialized region xen.ram.hi: base 0x%llx size 0x%lx\n",
> +GUEST_RAM1_BASE, ram_size[1]);
> +}
> +}
> +
>  void arch_handle_ioreq(XenIOState *state, ioreq_t *req)
>  {
>  hw_error("Invalid ioreq type 0x%x\n", req->type);
> @@ -135,6 +170,14 @@ static void xen_arm_init(MachineState *machine)
>  
>  xam->state =  g_new0(XenIOState, 1);
>  
> +if (machine->ram_size == 0) {
> +DPRINTF("ram_size not specified. QEMU machine will be started 
> without"
> +" TPM, IOREQ and Virtio-MMIO backends\n");
> +return;
> +}

I would say "ram_size not specified. QEMU machine started without IOREQ
(no emulated devices including Virtio)."

We might add more devices in the future beyond Virtio and TPM. I don't
think we want to call out the whole list here.



> +xen_init_ram(machine);
> +
>  xen_register_ioreq(xam->state, machine->smp.cpus, xen_memory_listener);
>  
>  xen_create_virtio_mmio_devices(xam);
> @@ -182,6 +225,8 @@ static void xen_arm_machine_class_init(ObjectClass *oc, 
> void *data)
>  mc->init = xen_arm_init;
>  mc->max_cpus = 1;
>  mc->default_machine_opts = "accel=xen";
> +/* Set explicitly here to make sure that real ram_size is passed */
> +mc->default_ram_size = 0;
>  
>  printf("CHECK for NEW BUILD\n");
>  #ifdef CONFIG_TPM
> -- 
> 2.25.1
> 



Re: [RESEND][PATCH v1 1/2] xen_arm: Create virtio-mmio devices during initialization

2023-06-29 Thread Stefano Stabellini
On Thu, 29 Jun 2023, Vikram Garhwal wrote:
> From: Oleksandr Tyshchenko 
> 
> In order to use virtio backends we need to allocate virtio-mmio
> parameters (irq and base) and register corresponding buses.
> 
> Use the constants defined in public header arch-arm.h to be
> aligned with the toolstack. So the number of current supported
> virtio-mmio devices is 10.
> 
> For the interrupts triggering use already existing on Arm
> device-model hypercall.
> 
> The toolstack should then insert the same amount of device nodes
> into guest device-tree.
> 
> Signed-off-by: Oleksandr Tyshchenko 
> Signed-off-by: Vikram Garhwal 
> ---
>  hw/arm/xen_arm.c | 29 +
>  1 file changed, 29 insertions(+)
> 
> diff --git a/hw/arm/xen_arm.c b/hw/arm/xen_arm.c
> index 60dcd1bcc7..c0a93f2c9d 100644
> --- a/hw/arm/xen_arm.c
> +++ b/hw/arm/xen_arm.c
> @@ -26,6 +26,7 @@
>  #include "qapi/qapi-commands-migration.h"
>  #include "qapi/visitor.h"
>  #include "hw/boards.h"
> +#include "hw/irq.h"
>  #include "hw/sysbus.h"
>  #include "sysemu/block-backend.h"
>  #include "sysemu/tpm_backend.h"
> @@ -59,6 +60,32 @@ struct XenArmState {
>  } cfg;
>  };
>  
> +#define VIRTIO_MMIO_DEV_SIZE   0x200

Is this coming from QEMU? Or is it standard virtio?

Just asking to make sure that we don't run into a virtio device that
needs more than 0x200 of MMIO size.


> +#define NR_VIRTIO_MMIO_DEVICES   \
> +   (GUEST_VIRTIO_MMIO_SPI_LAST - GUEST_VIRTIO_MMIO_SPI_FIRST)
> +
> +static void xen_set_irq(void *opaque, int irq, int level)
> +{
> +xendevicemodel_set_irq_level(xen_dmod, xen_domid, irq, level);
> +}

Just a note: likely the xendevicemodel_set_irq_level call needs
privileges. Just something to keep in mind for when we try to run QEMU
in a domain other than Dom0. No need to do anything for now.

Everything looks good. If we can be sure 0x200 is the right MMIO size
for virtio devices then I would provide my Ack.


> +static void xen_create_virtio_mmio_devices(XenArmState *xam)
> +{
> +int i;
> +
> +for (i = 0; i < NR_VIRTIO_MMIO_DEVICES; i++) {
> +hwaddr base = GUEST_VIRTIO_MMIO_BASE + i * VIRTIO_MMIO_DEV_SIZE;
> +qemu_irq irq = qemu_allocate_irq(xen_set_irq, NULL,
> + GUEST_VIRTIO_MMIO_SPI_FIRST + i);
> +
> +sysbus_create_simple("virtio-mmio", base, irq);
> +
> +DPRINTF("Created virtio-mmio device %d: irq %d base 0x%lx\n",
> +i, GUEST_VIRTIO_MMIO_SPI_FIRST + i, base);
> +}
> +}
> +
>  void arch_handle_ioreq(XenIOState *state, ioreq_t *req)
>  {
>  hw_error("Invalid ioreq type 0x%x\n", req->type);
> @@ -110,6 +137,8 @@ static void xen_arm_init(MachineState *machine)
>  
>  xen_register_ioreq(xam->state, machine->smp.cpus, xen_memory_listener);
>  
> +xen_create_virtio_mmio_devices(xam);
> +
>  #ifdef CONFIG_TPM
>  if (xam->cfg.tpm_base_addr) {
>  xen_enable_tpm(xam);
> -- 
> 2.25.1
> 



Re: [PATCH v4 5/5] vfio/migration: Refactor and fix print of "Migration disabled"

2023-06-29 Thread Alex Williamson
On Thu, 29 Jun 2023 16:42:23 +0100
Joao Martins  wrote:

> On 29/06/2023 16:20, Avihai Horon wrote:
> > On 29/06/2023 15:44, Joao Martins wrote:  
> >> On 29/06/2023 09:40, Zhenzhong Duan wrote:  
> >>> This patch refactors vfio_migration_realize() and its dependend code
> >>> as follows:
> >>>
> >>> 1. It's redundant in vfio_migration_realize() to registers multiple 
> >>> blockers,
> >>>     e.g: vIOMMU blocker can be refactored as per device blocker.
> >>> 2. Change vfio_viommu_preset() to be only a per device checker.
> >>> 3. Remove global vIOMMU blocker related stuff, e.g:
> >>>     giommu_migration_blocker, vfio_[block|unblock]_giommu_migration()
> >>>     and vfio_migration_finalize()
> >>> 4. Change vfio_migration_realize(), 
> >>> vfio_block_multiple_devices_migration()
> >>>     vfio_block_migration() and vfio_viommu_preset() to return bool type.
> >>> 5. Print "Migration disabled" depending on enable_migration property
> >>>     and print it as warning instead of error which is overkill.
> >>>  
> >> I am not enterily sure we need to keep "Migration disabled". Perhaps we 
> >> should
> >> just derisk from error to warning and use always the same error messages.
> >>  
> >>> migrate_add_blocker() returns 0 when successfully adding the migration 
> >>> blocker.
> >>> However, the caller of vfio_migration_realize() considers that migration 
> >>> was
> >>> blocked when the latter returned an error. What matters for migration is 
> >>> that
> >>> the blocker is added in core migration, so this cleans up usability such 
> >>> that
> >>> user sees "Migrate disabled" when any of the vfio migration blockers are 
> >>> active
> >>> and it's not intentionally forced by user with enable-migration=off.
> >>>
> >>> Signed-off-by: Zhenzhong Duan 
> >>> ---
> >>>   hw/vfio/common.c  | 66 +++
> >>>   hw/vfio/migration.c   | 30 +---
> >>>   hw/vfio/pci.c |  4 +--
> >>>   include/hw/vfio/vfio-common.h |  7 ++--
> >>>   4 files changed, 36 insertions(+), 71 deletions(-)
> >>>
> >>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >>> index 77e2ee0e5c6e..c80ecb1da53f 100644
> >>> --- a/hw/vfio/common.c
> >>> +++ b/hw/vfio/common.c
> >>> @@ -362,7 +362,6 @@ bool vfio_mig_active(void)
> >>>   }
> >>>
> >>>   static Error *multiple_devices_migration_blocker;
> >>> -static Error *giommu_migration_blocker;
> >>>
> >>>   static unsigned int vfio_migratable_device_num(void)
> >>>   {
> >>> @@ -381,19 +380,19 @@ static unsigned int vfio_migratable_device_num(void)
> >>>   return device_num;
> >>>   }
> >>>
> >>> -int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error 
> >>> **errp)
> >>> +bool vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error 
> >>> **errp)
> >>>   {
> >>>   int ret;
> >>>
> >>>   if (multiple_devices_migration_blocker ||
> >>>   vfio_migratable_device_num() <= 1) {
> >>> -    return 0;
> >>> +    return true;
> >>>   }
> >>>
> >>>   if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
> >>>   error_setg(errp, "Migration is currently not supported with 
> >>> multiple "
> >>>    "VFIO devices");
> >>> -    return -EINVAL;
> >>> +    return false;
> >>>   }
> >>>
> >>>   error_setg(_devices_migration_blocker,
> >>> @@ -403,9 +402,15 @@ int vfio_block_multiple_devices_migration(VFIODevice
> >>> *vbasedev, Error **errp)
> >>>   if (ret < 0) {
> >>>   error_free(multiple_devices_migration_blocker);
> >>>   multiple_devices_migration_blocker = NULL;
> >>> +    } else {
> >>> +    /*
> >>> + * Only ON_OFF_AUTO_AUTO case, ON_OFF_AUTO_OFF is checked
> >>> + * in vfio_migration_realize().
> >>> + */
> >>> +    warn_report("Migration disabled, not support multiple VFIO 
> >>> devices");
> >>>   }
> >>>  
> >> Perhaps you could stash the previous error message and use it in the
> >> warn_report_error to consolidate the error messages e.g.
> >>
> >> bool vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error 
> >> **errp)
> >> {
> >>  Error *err = NULL;
> >>
> >>  if (multiple_devices_migration_blocker ||
> >>  vfio_migratable_device_num() <= 1) {
> >>  return true;
> >>  }
> >>
> >>  error_setg(, "%s: Migration is currently not supported with 
> >> multiple "
> >>   "VFIO devices", vbasedev->name);
> >>
> >>  if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
> >>  error_propagate(errp, err);
> >>  return -EINVAL;
> >>  }
> >>
> >>  ...
> >>  if (ret < 0) {
> >>  } else {
> >>  /* Warns only on ON_OFF_AUTO_AUTO case */
> >>  warn_report_err(err);  
> > 
> > I'm not sure this warning is needed.
> > If I remember correctly, I think Alex didn't want migration error/warning
> > messages to be logged in the AUTO case.

Correct.

> Hmm, ok, I missed this from 

Re: [PATCH v6 11/20] target/riscv/cpu: add misa_ext_info_arr[]

2023-06-29 Thread Daniel Henrique Barboza

Drew,

On 6/29/23 05:59, Andrew Jones wrote:

On Wed, Jun 28, 2023 at 06:30:24PM -0300, Daniel Henrique Barboza wrote:

Next patch will add KVM specific user properties for both MISA and
multi-letter extensions. For MISA extensions we want to make use of what
is already available in misa_ext_cfgs[] to avoid code repetition.

misa_ext_info_arr[] array will hold name and description for each MISA
extension that misa_ext_cfgs[] is declaring. We'll then use this new
array in KVM code to avoid duplicating strings.

There's nothing holding us back from doing the same with multi-letter
extensions. For now doing just with MISA extensions is enough.

Suggested-by: Andrew Jones 
Signed-off-by: Daniel Henrique Barboza 
---
  target/riscv/cpu.c | 83 ++
  target/riscv/cpu.h |  7 +++-
  2 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 2485e820f8..90dd2078ae 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1558,33 +1558,57 @@ static void cpu_get_misa_ext_cfg(Object *obj, Visitor 
*v, const char *name,
  visit_type_bool(v, name, , errp);
  }
  
-static const RISCVCPUMisaExtConfig misa_ext_cfgs[] = {

-{.name = "a", .description = "Atomic instructions",
- .misa_bit = RVA, .enabled = true},
-{.name = "c", .description = "Compressed instructions",
- .misa_bit = RVC, .enabled = true},
-{.name = "d", .description = "Double-precision float point",
- .misa_bit = RVD, .enabled = true},
-{.name = "f", .description = "Single-precision float point",
- .misa_bit = RVF, .enabled = true},
-{.name = "i", .description = "Base integer instruction set",
- .misa_bit = RVI, .enabled = true},
-{.name = "e", .description = "Base integer instruction set (embedded)",
- .misa_bit = RVE, .enabled = false},
-{.name = "m", .description = "Integer multiplication and division",
- .misa_bit = RVM, .enabled = true},
-{.name = "s", .description = "Supervisor-level instructions",
- .misa_bit = RVS, .enabled = true},
-{.name = "u", .description = "User-level instructions",
- .misa_bit = RVU, .enabled = true},
-{.name = "h", .description = "Hypervisor",
- .misa_bit = RVH, .enabled = true},
-{.name = "x-j", .description = "Dynamic translated languages",
- .misa_bit = RVJ, .enabled = false},
-{.name = "v", .description = "Vector operations",
- .misa_bit = RVV, .enabled = false},
-{.name = "g", .description = "General purpose (IMAFD_Zicsr_Zifencei)",
- .misa_bit = RVG, .enabled = false},
+typedef struct misa_ext_info {
+const char *name;
+const char *description;
+} MISAExtInfo;
+
+#define MISA_EXT_INFO(_idx, _propname, _descr) \
+[(_idx - 'A')] = {.name = _propname, .description = _descr}


We don't have to give up on passing RV* to this macro. Directly
using __builtin_ctz() with a constant should work, i.e.

  #define MISA_EXT_INFO(_bit, _propname, _descr) \
  [__builtin_ctz(_bit)] = {.name = _propname, .description = _descr}

and then

  MISA_EXT_INFO(RVA, "a", "Atomic instructions"),
  MISA_EXT_INFO(RVD, "d", "Double-precision float point"),
  ...

(We don't need the ctz32() wrapper since we know we'll never input zero to
__builtin_ctz().)


I ran the series through gitlab because I got worried about this change in
different compilers and so on. And in fact I fear that we break 'clang-user'
with it:

https://gitlab.com/danielhb/qemu/-/jobs/4569265199

u.c.o -c ../target/riscv/cpu.c
../target/riscv/cpu.c:1624:5: error: initializer element is not a compile-time 
constant
MISA_CFG(RVA, true),
^~~
../target/riscv/cpu.c:1619:53: note: expanded from macro 'MISA_CFG'
{.name = misa_ext_info_arr[MISA_INFO_IDX(_bit)].name, \
 ~~~^~~~
1 error generated.
[1503/2619] Compiling C object 
libqemu-ppc64le-linux-user.fa.p/linux-user_syscall.c.o


Which is a shame because gcc and everyone else is okay with it, but 
'clang-user' and
'tsan-build' runners are complaining about it.

Unless there's a directive to make clang accept this code (I didn't find any) 
we'll
need to keep updating name and description during runtime, and we'll have to 
keep
removing 'const' from misa_ext_cfgs[].


Thanks,


Daniel





+
+static const MISAExtInfo misa_ext_info_arr[] = {
+MISA_EXT_INFO('A', "a", "Atomic instructions"),
+MISA_EXT_INFO('C', "c", "Compressed instructions"),
+MISA_EXT_INFO('D', "d", "Double-precision float point"),
+MISA_EXT_INFO('F', "f", "Single-precision float point"),
+MISA_EXT_INFO('I', "i", "Base integer instruction set"),
+MISA_EXT_INFO('E', "e", "Base integer instruction set (embedded)"),
+MISA_EXT_INFO('M', "m", "Integer multiplication and division"),
+MISA_EXT_INFO('S', "s", "Supervisor-level instructions"),
+MISA_EXT_INFO('U', "u", "User-level instructions"),
+MISA_EXT_INFO('H', "h", "Hypervisor"),
+

Re: [PATCH 1/1] linux-user: add support for big endian variants of riscv

2023-06-29 Thread Palmer Dabbelt

On Fri, 30 Jun 2023 04:14:09 PDT (-0700), rory.opensou...@gmail.com wrote:

RISCV architecture supports an optional big endian mode of operation.
In this mode, data accesses are treated as big endian, while code is
always in little endian format. This is similar to how the ARM
architecture treats its optional bi-endian support. This patch adds
support for big endian RISCV operation to linux-user.


We don't have BE support in Linux yet.  IIRC we've had some other 
linux-user stuff go in with a "we'll change it to match whatever uABI 
Linux ends up with" sort of caveat, but I might be mistaken.  I'm not 
opposed to doing that sort of thing for BE as well.  I don't remember 
what the right way to indicate that is, though.



Signed-off-by: rory.opensou...@gmail.com
---
 configs/targets/riscv64be-linux-user.mak|  7 +++
 configure   |  1 +
 linux-user/elfload.c| 10 ++
 linux-user/include/host/riscv/host-signal.h |  3 +++
 linux-user/riscv/signal.c   |  5 +
 linux-user/riscv/target_syscall.h   |  8 
 scripts/probe-gdb-support.py|  4 ++--
 scripts/qemu-binfmt-conf.sh | 12 ++--
 target/riscv/cpu.c  |  5 +
 target/riscv/translate.c| 13 +
 10 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 configs/targets/riscv64be-linux-user.mak

diff --git a/configs/targets/riscv64be-linux-user.mak 
b/configs/targets/riscv64be-linux-user.mak
new file mode 100644
index 00..f22f5f0971
--- /dev/null
+++ b/configs/targets/riscv64be-linux-user.mak
@@ -0,0 +1,7 @@
+TARGET_ARCH=riscv64
+TARGET_BASE_ARCH=riscv
+TARGET_ABI_DIR=riscv
+TARGET_BIG_ENDIAN=y
+TARGET_XML_FILES= gdb-xml/riscv-64bit-cpu.xml gdb-xml/riscv-32bit-fpu.xml 
gdb-xml/riscv-64bit-fpu.xml gdb-xml/riscv-64bit-virtual.xml
+CONFIG_SEMIHOSTING=y
+CONFIG_ARM_COMPATIBLE_SEMIHOSTING=y
diff --git a/configure b/configure
index 2b41c49c0d..90795a0e9f 100755
--- a/configure
+++ b/configure
@@ -1190,6 +1190,7 @@ fi
 : ${cross_prefix_ppc64="powerpc64-linux-gnu-"}
 : ${cross_prefix_ppc64le="$cross_prefix_ppc64"}
 : ${cross_prefix_riscv64="riscv64-linux-gnu-"}
+: ${cross_prefix_riscv64be="riscv64be-linux-gnu-"}
 : ${cross_prefix_s390x="s390x-linux-gnu-"}
 : ${cross_prefix_sh4="sh4-linux-gnu-"}
 : ${cross_prefix_sparc64="sparc64-linux-gnu-"}
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 9a2ec568b0..e0204c7069 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1681,8 +1681,18 @@ static void elf_core_copy_regs(target_elf_gregset_t 
*regs,

 #ifdef TARGET_RISCV32
 #define ELF_CLASS ELFCLASS32
+#if TARGET_BIG_ENDIAN
+#define ELF_PLATFORM "riscv32be"
+#else
+#define ELF_PLATFORM "riscv32"
+#endif
 #else
 #define ELF_CLASS ELFCLASS64
+#if TARGET_BIG_ENDIAN
+#define ELF_PLATFORM "riscv64be"
+#else
+#define ELF_PLATFORM "riscv64"
+#endif
 #endif

 #define ELF_HWCAP get_elf_hwcap()
diff --git a/linux-user/include/host/riscv/host-signal.h 
b/linux-user/include/host/riscv/host-signal.h
index decacb2325..b3f2735261 100644
--- a/linux-user/include/host/riscv/host-signal.h
+++ b/linux-user/include/host/riscv/host-signal.h
@@ -38,6 +38,9 @@ static inline bool host_signal_write(siginfo_t *info, 
host_sigcontext *uc)
  */
 const uint16_t *pinsn = (const uint16_t *)host_signal_pc(uc);
 uint16_t insn = pinsn[0];
+#if TARGET_BIG_ENDIAN
+insn = (insn << 8) | (insn >> 8);
+#endif

 /* 16-bit instructions */
 switch (insn & 0xe003) {
diff --git a/linux-user/riscv/signal.c b/linux-user/riscv/signal.c
index eaa168199a..1d9e3413fb 100644
--- a/linux-user/riscv/signal.c
+++ b/linux-user/riscv/signal.c
@@ -199,8 +199,13 @@ void setup_sigtramp(abi_ulong sigtramp_page)
 uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 8, 0);
 assert(tramp != NULL);

+#if TARGET_BIG_ENDIAN
+__put_user(0x9308b008, tramp + 0);  /* li a7, 139 = __NR_rt_sigreturn */
+__put_user(0x7300, tramp + 1);  /* ecall */
+#else
 __put_user(0x08b00893, tramp + 0);  /* li a7, 139 = __NR_rt_sigreturn */
 __put_user(0x0073, tramp + 1);  /* ecall */
+#endif

 default_rt_sigreturn = sigtramp_page;
 unlock_user(tramp, sigtramp_page, 8);
diff --git a/linux-user/riscv/target_syscall.h 
b/linux-user/riscv/target_syscall.h
index 7601f10c28..88c0ac1351 100644
--- a/linux-user/riscv/target_syscall.h
+++ b/linux-user/riscv/target_syscall.h
@@ -44,10 +44,18 @@ struct target_pt_regs {
 };

 #ifdef TARGET_RISCV32
+#if TARGET_BIG_ENDIAN
+#define UNAME_MACHINE "riscv32be"
+#else
 #define UNAME_MACHINE "riscv32"
+#endif
 #define UNAME_MINIMUM_RELEASE "5.4.0"
 #else
+#if TARGET_BIG_ENDIAN
+#define UNAME_MACHINE "riscv64be"
+#else
 #define UNAME_MACHINE "riscv64"
+#endif
 #define UNAME_MINIMUM_RELEASE "4.15.0"
 #endif

diff --git a/scripts/probe-gdb-support.py b/scripts/probe-gdb-support.py
index 5755255966..a1e0905a10 100644
--- 

Re: [PATCH V3 2/2] migration: file URI offset

2023-06-29 Thread Peter Xu
On Thu, Jun 22, 2023 at 01:37:31PM -0700, Steve Sistare wrote:
> +static int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp)
> +{
> +char *option = strstr(filespec, OFFSET_OPTION);
> +int ret;
> +
> +if (option) {
> +*option = 0;
> +option += sizeof(OFFSET_OPTION) - 1;
> +ret = qemu_strtosz(option, NULL, offsetp);
> +if (ret) {
> +error_setg_errno(errp, ret, "file URI has bad offset %s", 
> option);

Probably "-ret" here.

> +return -1;
> +}
> +}
> +return 0;
> +}

-- 
Peter Xu




Re: [PATCH 6/6] tests/qtest: migration-test: Add tests for file-based migration

2023-06-29 Thread Peter Xu
On Wed, Jun 28, 2023 at 01:55:42PM -0300, Fabiano Rosas wrote:
> Add basic tests for file-based migration.
> 
> Signed-off-by: Fabiano Rosas 
> ---
>  tests/qtest/migration-test.c | 104 +++
>  1 file changed, 104 insertions(+)
> 
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index acb778a8cd..b3019f54de 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -52,6 +52,10 @@ static bool got_dst_resume;
>   */
>  #define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
>  
> +#define QEMU_VM_FILE_MAGIC 0x5145564d
> +#define FILE_TEST_FILENAME "migfile"
> +#define FILE_TEST_OFFSET 0x1000
> +
>  #if defined(__linux__)
>  #include 
>  #include 
> @@ -763,6 +767,7 @@ static void test_migrate_end(QTestState *from, QTestState 
> *to, bool test_dest)
>  cleanup("migsocket");
>  cleanup("src_serial");
>  cleanup("dest_serial");
> +cleanup(FILE_TEST_FILENAME);
>  }
>  
>  #ifdef CONFIG_GNUTLS
> @@ -1460,11 +1465,28 @@ static void test_precopy_common(MigrateCommon *args)
>   */
>  wait_for_migration_complete(from);
>  
> +/*
> + * For file based migration the target must begin its
> + * migration after the source has finished.
> + */
> +if (strstr(connect_uri, "file:")) {
> +migrate_incoming_qmp(to, connect_uri, "{}");
> +}
> +
>  if (!got_src_stop) {
>  qtest_qmp_eventwait(from, "STOP");
>  }
>  } else {
>  wait_for_migration_complete(from);
> +
> +/*
> + * For file based migration the target must begin its
> + * migration after the source has finished.
> + */
> +if (strstr(connect_uri, "file:")) {
> +migrate_incoming_qmp(to, connect_uri, "{}");
> +}
> +
>  /*
>   * Must wait for dst to finish reading all incoming
>   * data on the socket before issuing 'cont' otherwise
> @@ -1682,6 +1704,78 @@ static void test_precopy_unix_compress_nowait(void)
>  test_precopy_common();
>  }
>  
> +static void test_precopy_file(void)
> +{
> +g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs,
> +   FILE_TEST_FILENAME);
> +MigrateCommon args = {
> +.connect_uri = uri,
> +.listen_uri = "defer",
> +};
> +
> +test_precopy_common();
> +}
> +
> +#if defined(__linux__)
> +static void file_offset_finish_hook(QTestState *from, QTestState *to, void 
> *opaque)
> +{
> +g_autofree char *path = g_strdup_printf("%s/%s", tmpfs, 
> FILE_TEST_FILENAME);
> +size_t size = FILE_TEST_OFFSET + sizeof(QEMU_VM_FILE_MAGIC);
> +uintptr_t *addr, *p;
> +int fd;
> +
> +fd = open(path, O_RDONLY);
> +g_assert(fd != -1);
> +addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);

Not something that matters a lot, but RO mapping a file with private is a
bit weird.  Maybe just use MAP_SHARED?

> +g_assert(addr != MAP_FAILED);
> +
> +/*
> + * Ensure the skipped offset contains zeros and the migration
> + * stream starts at the right place.
> + */
> +p = addr;
> +while (p < addr + FILE_TEST_OFFSET / sizeof(uintptr_t)) {
> +g_assert(*p == 0);
> +p++;
> +}
> +g_assert_cmpint(cpu_to_be32(*p), ==, QEMU_VM_FILE_MAGIC);
> +
> +munmap(addr, size);
> +close(fd);
> +}
> +
> +static void test_precopy_file_offset(void)
> +{
> +g_autofree char *uri = g_strdup_printf("file:%s/%s,offset=%d", tmpfs,
> +   FILE_TEST_FILENAME,
> +   FILE_TEST_OFFSET);

Is it intended to also only run this test on Linux?  IIUC it should also
work for others. Maybe only file_offset_finish_hook() is optional?  Or am I
wrong?

> +MigrateCommon args = {
> +.connect_uri = uri,
> +.listen_uri = "defer",
> +.finish_hook = file_offset_finish_hook,
> +};
> +
> +test_precopy_common();
> +}
> +#endif
> +
> +static void test_precopy_file_offset_bad(void)
> +{
> +/* using a value not supported by qemu_strtosz() */
> +g_autofree char *uri = g_strdup_printf("file:%s/migfile,offset=0x20M",
> +   tmpfs);
> +MigrateCommon args = {
> +.connect_uri = uri,
> +.listen_uri = "defer",
> +.error_str = g_strdup(
> +"file URI has bad offset 0x20M: Unknown error -22"),

"Unknown error" may imply that in Steve's patch the errno is inverted..

Shall we not rely on the string in the test?  It might be too strict, I
worry, because error strings should be defined for human readers, and we
may not want some e.g. grammar / trivial change to break a test.

> +.result = MIG_TEST_QMP_ERROR,
> +};
> +
> +test_precopy_common();
> +g_free(args.error_str);

Re: [PATCH] linux-user: Avoid mmap of the last byte of the reserved_va

2023-06-29 Thread Michael Tokarev

29.06.2023 11:08, Richard Henderson wrote:

There is an overflow problem in mmap_find_vma_reserved:
when reserved_va == UINT32_MAX, end may overflow to 0.
Rather than a larger rewrite at this time, simply avoid
the final byte of the VA, which avoids searching the
final page, which avoids the overflow.

Cc: qemu-sta...@nongnu.org
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1741
Fixes: 95059f9c ("include/exec: Change reserved_va semantics to last byte")
Signed-off-by: Richard Henderson 


So, I pushed this to debian (where we've seen multiple failures),
let's see how it goes..

/mjt



[PATCH 1/1] linux-user: add support for big endian variants of riscv

2023-06-29 Thread Rory Bolt
RISCV architecture supports an optional big endian mode of operation.
In this mode, data accesses are treated as big endian, while code is
always in little endian format. This is similar to how the ARM
architecture treats its optional bi-endian support. This patch adds
support for big endian RISCV operation to linux-user.

Signed-off-by: rory.opensou...@gmail.com
---
 configs/targets/riscv64be-linux-user.mak|  7 +++
 configure   |  1 +
 linux-user/elfload.c| 10 ++
 linux-user/include/host/riscv/host-signal.h |  3 +++
 linux-user/riscv/signal.c   |  5 +
 linux-user/riscv/target_syscall.h   |  8 
 scripts/probe-gdb-support.py|  4 ++--
 scripts/qemu-binfmt-conf.sh | 12 ++--
 target/riscv/cpu.c  |  5 +
 target/riscv/translate.c| 13 +
 10 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 configs/targets/riscv64be-linux-user.mak

diff --git a/configs/targets/riscv64be-linux-user.mak 
b/configs/targets/riscv64be-linux-user.mak
new file mode 100644
index 00..f22f5f0971
--- /dev/null
+++ b/configs/targets/riscv64be-linux-user.mak
@@ -0,0 +1,7 @@
+TARGET_ARCH=riscv64
+TARGET_BASE_ARCH=riscv
+TARGET_ABI_DIR=riscv
+TARGET_BIG_ENDIAN=y
+TARGET_XML_FILES= gdb-xml/riscv-64bit-cpu.xml gdb-xml/riscv-32bit-fpu.xml 
gdb-xml/riscv-64bit-fpu.xml gdb-xml/riscv-64bit-virtual.xml
+CONFIG_SEMIHOSTING=y
+CONFIG_ARM_COMPATIBLE_SEMIHOSTING=y
diff --git a/configure b/configure
index 2b41c49c0d..90795a0e9f 100755
--- a/configure
+++ b/configure
@@ -1190,6 +1190,7 @@ fi
 : ${cross_prefix_ppc64="powerpc64-linux-gnu-"}
 : ${cross_prefix_ppc64le="$cross_prefix_ppc64"}
 : ${cross_prefix_riscv64="riscv64-linux-gnu-"}
+: ${cross_prefix_riscv64be="riscv64be-linux-gnu-"}
 : ${cross_prefix_s390x="s390x-linux-gnu-"}
 : ${cross_prefix_sh4="sh4-linux-gnu-"}
 : ${cross_prefix_sparc64="sparc64-linux-gnu-"}
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 9a2ec568b0..e0204c7069 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1681,8 +1681,18 @@ static void elf_core_copy_regs(target_elf_gregset_t 
*regs,
 
 #ifdef TARGET_RISCV32
 #define ELF_CLASS ELFCLASS32
+#if TARGET_BIG_ENDIAN
+#define ELF_PLATFORM "riscv32be"
+#else
+#define ELF_PLATFORM "riscv32"
+#endif
 #else
 #define ELF_CLASS ELFCLASS64
+#if TARGET_BIG_ENDIAN
+#define ELF_PLATFORM "riscv64be"
+#else
+#define ELF_PLATFORM "riscv64"
+#endif
 #endif
 
 #define ELF_HWCAP get_elf_hwcap()
diff --git a/linux-user/include/host/riscv/host-signal.h 
b/linux-user/include/host/riscv/host-signal.h
index decacb2325..b3f2735261 100644
--- a/linux-user/include/host/riscv/host-signal.h
+++ b/linux-user/include/host/riscv/host-signal.h
@@ -38,6 +38,9 @@ static inline bool host_signal_write(siginfo_t *info, 
host_sigcontext *uc)
  */
 const uint16_t *pinsn = (const uint16_t *)host_signal_pc(uc);
 uint16_t insn = pinsn[0];
+#if TARGET_BIG_ENDIAN
+insn = (insn << 8) | (insn >> 8);
+#endif
 
 /* 16-bit instructions */
 switch (insn & 0xe003) {
diff --git a/linux-user/riscv/signal.c b/linux-user/riscv/signal.c
index eaa168199a..1d9e3413fb 100644
--- a/linux-user/riscv/signal.c
+++ b/linux-user/riscv/signal.c
@@ -199,8 +199,13 @@ void setup_sigtramp(abi_ulong sigtramp_page)
 uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 8, 0);
 assert(tramp != NULL);
 
+#if TARGET_BIG_ENDIAN
+__put_user(0x9308b008, tramp + 0);  /* li a7, 139 = __NR_rt_sigreturn */
+__put_user(0x7300, tramp + 1);  /* ecall */
+#else
 __put_user(0x08b00893, tramp + 0);  /* li a7, 139 = __NR_rt_sigreturn */
 __put_user(0x0073, tramp + 1);  /* ecall */
+#endif
 
 default_rt_sigreturn = sigtramp_page;
 unlock_user(tramp, sigtramp_page, 8);
diff --git a/linux-user/riscv/target_syscall.h 
b/linux-user/riscv/target_syscall.h
index 7601f10c28..88c0ac1351 100644
--- a/linux-user/riscv/target_syscall.h
+++ b/linux-user/riscv/target_syscall.h
@@ -44,10 +44,18 @@ struct target_pt_regs {
 };
 
 #ifdef TARGET_RISCV32
+#if TARGET_BIG_ENDIAN
+#define UNAME_MACHINE "riscv32be"
+#else
 #define UNAME_MACHINE "riscv32"
+#endif
 #define UNAME_MINIMUM_RELEASE "5.4.0"
 #else
+#if TARGET_BIG_ENDIAN
+#define UNAME_MACHINE "riscv64be"
+#else
 #define UNAME_MACHINE "riscv64"
+#endif
 #define UNAME_MINIMUM_RELEASE "4.15.0"
 #endif
 
diff --git a/scripts/probe-gdb-support.py b/scripts/probe-gdb-support.py
index 5755255966..a1e0905a10 100644
--- a/scripts/probe-gdb-support.py
+++ b/scripts/probe-gdb-support.py
@@ -41,8 +41,8 @@
 "or1k" : "or1k",
 "powerpc:common" : "ppc",
 "powerpc:common64" : ["ppc64", "ppc64le"],
-"riscv:rv32" : "riscv32",
-"riscv:rv64" : "riscv64",
+"riscv:rv32" : ["riscv32", "riscv32be"],
+"riscv:rv64" : ["riscv64", "riscv64be"],
 "s390:64-bit" : "s390x",
 "sh4" : ["sh4", "sh4eb"],
 

[PATCH 0/1] linux-user: add support for big endian variants of riscv

2023-06-29 Thread Rory Bolt
Rory Bolt (1):
  linux-user: add support for big endian variants of riscv

 configs/targets/riscv64be-linux-user.mak|  7 +++
 configure   |  1 +
 linux-user/elfload.c| 10 ++
 linux-user/include/host/riscv/host-signal.h |  3 +++
 linux-user/riscv/signal.c   |  5 +
 linux-user/riscv/target_syscall.h   |  8 
 scripts/probe-gdb-support.py|  4 ++--
 scripts/qemu-binfmt-conf.sh | 12 ++--
 target/riscv/cpu.c  |  5 +
 target/riscv/translate.c| 13 +
 10 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 configs/targets/riscv64be-linux-user.mak

-- 
2.39.3




Re: [PATCH 3/6] tests/qtest: migration: Add migrate_incoming_qmp helper

2023-06-29 Thread Peter Xu
On Wed, Jun 28, 2023 at 01:55:39PM -0300, Fabiano Rosas wrote:
> file-based migration requires the target to initiate its migration after
> the source has finished writing out the data in the file. Currently
> there's no easy way to initiate 'migrate-incoming', allow this by
> introducing migrate_incoming_qmp helper, similarly to migrate_qmp.
> 
> Also make sure migration events are enabled and wait for the incoming
> migration to start before returning. This avoid a race when querying
> the migration status too soon after issuing the command.
> 
> Signed-off-by: Fabiano Rosas 
> ---
>  tests/qtest/migration-helpers.c | 28 
>  tests/qtest/migration-helpers.h |  4 
>  2 files changed, 32 insertions(+)
> 
> diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
> index 2df198c99e..bc54b29184 100644
> --- a/tests/qtest/migration-helpers.c
> +++ b/tests/qtest/migration-helpers.c
> @@ -81,6 +81,34 @@ void migrate_set_capability(QTestState *who, const char 
> *capability,
>   capability, value);
>  }
>  
> +void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, 
> ...)
> +{
> +va_list ap;
> +QDict *args, *rsp, *data;
> +
> +va_start(ap, fmt);
> +args = qdict_from_vjsonf_nofail(fmt, ap);
> +va_end(ap);
> +
> +g_assert(!qdict_haskey(args, "uri"));
> +qdict_put_str(args, "uri", uri);
> +
> +migrate_set_capability(to, "events", true);
> +
> +rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}",
> +args);
> +g_assert(qdict_haskey(rsp, "return"));

rsp leaked?

> +
> +rsp = qtest_qmp_eventwait_ref(to, "MIGRATION");
> +g_assert(qdict_haskey(rsp, "data"));
> +
> +data = qdict_get_qdict(rsp, "data");
> +g_assert(qdict_haskey(data, "status"));
> +g_assert_cmpstr(qdict_get_str(data, "status"), ==, "setup");
> +
> +qobject_unref(rsp);
> +}
> +
>  /*
>   * Note: caller is responsible to free the returned object via
>   * qobject_unref() after use
> diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h
> index 484d7c960f..57d295a4fe 100644
> --- a/tests/qtest/migration-helpers.h
> +++ b/tests/qtest/migration-helpers.h
> @@ -23,6 +23,10 @@ bool migrate_watch_for_resume(QTestState *who, const char 
> *name,
>  G_GNUC_PRINTF(3, 4)
>  void migrate_qmp(QTestState *who, const char *uri, const char *fmt, ...);
>  
> +G_GNUC_PRINTF(3, 4)
> +void migrate_incoming_qmp(QTestState *who, const char *uri,
> +  const char *fmt, ...);
> +
>  void migrate_set_capability(QTestState *who, const char *capability,
>  bool value);
>  
> -- 
> 2.35.3
> 

-- 
Peter Xu




Re: [PATCH] hw: Simplify calls to pci_nic_init_nofail()

2023-06-29 Thread Philippe Mathieu-Daudé

On 29/6/23 16:58, Thomas Huth wrote:

On 29/06/2023 15.47, Philippe Mathieu-Daudé wrote:

Hi Thomas,

On 29/6/23 14:54, Thomas Huth wrote:

pci_nic_init_nofail() calls qemu_find_nic_model(), and this function
sets nd->model = g_strdup(default_model) if it has not been initialized
yet. So we don't have to set nd->model to the default_nic in the
calling sites.

Signed-off-by: Thomas Huth 
---
  hw/arm/sbsa-ref.c    | 8 +---
  hw/arm/virt.c    | 8 +---
  hw/loongarch/virt.c  | 8 +---
  hw/mips/loongson3_virt.c | 8 +---
  hw/xtensa/virt.c | 8 +---
  5 files changed, 5 insertions(+), 35 deletions(-)

...

This reminds me of a branch from the end of April with this
unfinished patch, did we already discuss this together?


No, I haven't seen your patch yet, nor have we talked about it. I came up 
with the idea for my patch on my own after looking at certain spots in 
the code. But I guess you could easily rebase your patch on top of mine 
in case you want to finish it ;-)


Yeah sure, I was just wondering :)

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 1/2] target/riscv: Check for CF_PARALLEL instead of qemu_tcg_mttcg_enabled

2023-06-29 Thread Philippe Mathieu-Daudé

On 29/6/23 18:26, Alex Bennée wrote:


Philippe Mathieu-Daudé  writes:


A CPU knows whether MTTCG is enabled or not because it is
reflected in its TCG flags via the CF_PARALLEL bit.

Suggested-by: Alex Bennée 
Signed-off-by: Philippe Mathieu-Daudé 
---
  target/riscv/cpu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 4035fe0e62..4dfa64af6a 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -473,7 +473,7 @@ static void rv64_veyron_v1_cpu_init(Object *obj)
  
  static void rv128_base_cpu_init(Object *obj)

  {
-if (qemu_tcg_mttcg_enabled()) {
+if (CPU(obj)->tcg_cflags & CF_PARALLEL) {


Hmm have you checked that tcg_cpu_init_cflags() has executed by this point?


$arch_cpu_realize
 -> qemu_init_vcpu
 -> mttcg_start_vcpu_thread
   -> tcg_cpu_init_cflags

I'll document in the commit description.


  /* Missing 128-bit aligned atomics */
  error_report("128-bit RISC-V currently does not work with Multi "
   "Threaded TCG. Please use: -accel tcg,thread=single");


Not that we can do anything about it but in linux-user we start with
CF_PARALLEL unset and only set it at the point we spawn a new thread.


Hmm, I'll give it more thought then.

Thanks,

Phil.




Re: [PATCH 1/1] pcie: Add hotplug detect state register to w1cmask

2023-06-29 Thread Peter Xu
On Thu, Jun 29, 2023 at 04:06:53PM -0400, Michael S. Tsirkin wrote:
> On Thu, Jun 29, 2023 at 04:01:41PM -0400, Peter Xu wrote:
> > On Thu, Jun 29, 2023 at 03:33:06PM -0400, Michael S. Tsirkin wrote:
> > > On Thu, Jun 29, 2023 at 01:01:53PM -0400, Peter Xu wrote:
> > > > Hi, Leo,
> > > > 
> > > > Thanks for figuring this out.  Let me copy a few more potential 
> > > > reviewers
> > > > from commit 17858a1695 ("hw/acpi/ich9: Set ACPI PCI hot-plug as default 
> > > > on
> > > > Q35").
> > > > 
> > > > On Thu, Jun 29, 2023 at 06:05:00AM -0300, Leonardo Bras wrote:
> > > > > When trying to migrate a machine type pc-q35-6.0 or lower, with this
> > > > > cmdline options:
> > > > > 
> > > > > -device 
> > > > > driver=pcie-root-port,port=18,chassis=19,id=pcie-root-port18,bus=pcie.0,addr=0x12
> > > > >  \
> > > > > -device 
> > > > > driver=nec-usb-xhci,p2=4,p3=4,id=nex-usb-xhci0,bus=pcie-root-port18,addr=0x12.0x1
> > > > > 
> > > > > the following bug happens after all ram pages were sent:
> > > > > 
> > > > > qemu-kvm: get_pci_config_device: Bad config data: i=0x6e read: 0 
> > > > > device: 40 cmask: ff wmask: 0 w1cmask:19
> > > > > qemu-kvm: Failed to load PCIDevice:config
> > > > > qemu-kvm: Failed to load 
> > > > > pcie-root-port:parent_obj.parent_obj.parent_obj
> > > > > qemu-kvm: error while loading state for instance 0x0 of device 
> > > > > ':00:12.0/pcie-root-port'
> > > > > qemu-kvm: load of migration failed: Invalid argument
> > > > > 
> > > > > This happens on pc-q35-6.0 or lower because of:
> > > > > { "ICH9-LPC", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }
> > > > > 
> > > > > In this scenario, hotplug_handler_plug() calls 
> > > > > pcie_cap_slot_plug_cb(),
> > > > > which sets the bus dev->config byte 0x6e with bit PCI_EXP_SLTSTA_PDS 
> > > > > to 
> > > > > signal PCI hotplug for the guest. After a while the guest will deal 
> > > > > with
> > > > > this hotplug and qemu will clear the above bit.
> > > 
> > > Presence Detect State – This bit indicates the presence of an
> > > adapter in the slot, reflected by the logical “OR” of the Physical
> > > Layer in-band presence detect mechanism and, if present, any
> > > out-of-band presence detect mechanism defined for the slot’s
> > > corresponding form factor. Note that the in-band presence
> > > detect mechanism requires that power be applied to an adapter
> > > for its presence to be detected. Consequently, form factors that
> > > require a power controller for hot-plug must implement a
> > > physical pin presence detect mechanism.
> > > RO
> > > Defined encodings are:
> > > 0b Slot Empty
> > > 1b Card Present in slot
> > > This bit must be implemented on all Downstream Ports that
> > > implement slots. For Downstream Ports not connected to slots
> > > (where the Slot Implemented bit of the PCI Express Capabilities
> > > register is 0b), this bit must be hardwired to 1b.
> > > 
> > > 
> > > And this seems to match what QEMU is doing: it clears on unplug
> > > not after guest deals with hotplug.
> > > 
> > > 
> > > > > Then, during migration, get_pci_config_device() will compare the
> > > > > configs of both the freshly created device and the one that is being
> > > > > received via migration, which will differ due to the 
> > > > > PCI_EXP_SLTSTA_PDS bit
> > > > > and cause the bug to reproduce.
> > > 
> > > So bit is set on source.
> > > But why is the bit cleared on destination? This is the part I don't get.
> > 
> > My understanding is that when ACPI_PM_PROP_ACPI_PCIHP_BRIDGE is off for the
> > > device, we just won't ever set the PCI_EXP_SLTSTA_PDS bit?
> 
> Why?

Never mind, spoke too soon, sorry. :(

I thought pcie_cap_slot_plug_cb() can skip the set, but then I just found
that dev->hotplugged is not what I imagined there.

Leo should know better.

> 
> 
> > > 
> > > 
> > > > > To avoid this fake incompatibility, there are two fields in PCIDevice 
> > > > > that
> > > > > can help:
> > > > > 
> > > > > .wmask: Used to implement R/W bytes, and
> > > > > .w1cmask: Used to implement RW1C(Write 1 to Clear) bytes
> > > > 
> > > > Is there one more option to clear the bit in cmask?
> > > > 
> > > > IIUC w1cmask means the guest can now write to this bit, but afaiu from 
> > > > the
> > > > pcie spec it's RO.
> > > 
> > > Yes this bit must be RO.
> > > 
> > > > > 
> > > > > According to pcie_cap_slot_init() the slot status register
> > > > > (PCI_EXP_SLTSTA), in which PCI_EXP_SLTSTA_PDS is a flag, seems to fall
> > > > > > under w1cmask field, which makes sense due to the way signaling the 
> > > > > hotplug
> > > > > works.
> > > > > 
> > > > > So, add PCI_EXP_SLTSTA_PDS bit to w1cmask, so the fake 
> > > > > incompatibility on
> > > > > get_pci_config_device() does not abort the migration.
> > > > > 
> > > > > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2215819
> > > > > Signed-off-by: Leonardo Bras 
> > > > 
> > > > Do we need a Fixes: and also the need to copy stable?
> > > > 
> > > > > ---
> > > > >  hw/pci/pcie.c | 2 +-
> > > > >  1 file changed, 

Re: [PATCH 1/1] pcie: Add hotplug detect state register to w1cmask

2023-06-29 Thread Michael S. Tsirkin
On Thu, Jun 29, 2023 at 04:01:41PM -0400, Peter Xu wrote:
> On Thu, Jun 29, 2023 at 03:33:06PM -0400, Michael S. Tsirkin wrote:
> > On Thu, Jun 29, 2023 at 01:01:53PM -0400, Peter Xu wrote:
> > > Hi, Leo,
> > > 
> > > Thanks for figuring this out.  Let me copy a few more potential reviewers
> > > from commit 17858a1695 ("hw/acpi/ich9: Set ACPI PCI hot-plug as default on
> > > Q35").
> > > 
> > > On Thu, Jun 29, 2023 at 06:05:00AM -0300, Leonardo Bras wrote:
> > > > When trying to migrate a machine type pc-q35-6.0 or lower, with this
> > > > cmdline options:
> > > > 
> > > > -device 
> > > > driver=pcie-root-port,port=18,chassis=19,id=pcie-root-port18,bus=pcie.0,addr=0x12
> > > >  \
> > > > -device 
> > > > driver=nec-usb-xhci,p2=4,p3=4,id=nex-usb-xhci0,bus=pcie-root-port18,addr=0x12.0x1
> > > > 
> > > > the following bug happens after all ram pages were sent:
> > > > 
> > > > qemu-kvm: get_pci_config_device: Bad config data: i=0x6e read: 0 
> > > > device: 40 cmask: ff wmask: 0 w1cmask:19
> > > > qemu-kvm: Failed to load PCIDevice:config
> > > > qemu-kvm: Failed to load pcie-root-port:parent_obj.parent_obj.parent_obj
> > > > qemu-kvm: error while loading state for instance 0x0 of device 
> > > > '0000:00:12.0/pcie-root-port'
> > > > qemu-kvm: load of migration failed: Invalid argument
> > > > 
> > > > This happens on pc-q35-6.0 or lower because of:
> > > > { "ICH9-LPC", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }
> > > > 
> > > > In this scenario, hotplug_handler_plug() calls pcie_cap_slot_plug_cb(),
> > > > which sets the bus dev->config byte 0x6e with bit PCI_EXP_SLTSTA_PDS to 
> > > > signal PCI hotplug for the guest. After a while the guest will deal with
> > > > this hotplug and qemu will clear the above bit.
> > 
> > Presence Detect State – This bit indicates the presence of an
> > adapter in the slot, reflected by the logical “OR” of the Physical
> > Layer in-band presence detect mechanism and, if present, any
> > out-of-band presence detect mechanism defined for the slot’s
> > corresponding form factor. Note that the in-band presence
> > detect mechanism requires that power be applied to an adapter
> > for its presence to be detected. Consequently, form factors that
> > require a power controller for hot-plug must implement a
> > physical pin presence detect mechanism.
> > RO
> > Defined encodings are:
> > 0b Slot Empty
> > 1b Card Present in slot
> > This bit must be implemented on all Downstream Ports that
> > implement slots. For Downstream Ports not connected to slots
> > (where the Slot Implemented bit of the PCI Express Capabilities
> > register is 0b), this bit must be hardwired to 1b.
> > 
> > 
> > And this seems to match what QEMU is doing: it clears on unplug
> > not after guest deals with hotplug.
> > 
> > 
> > > > Then, during migration, get_pci_config_device() will compare the
> > > > configs of both the freshly created device and the one that is being
> > > > received via migration, which will differ due to the PCI_EXP_SLTSTA_PDS 
> > > > bit
> > > > and cause the bug to reproduce.
> > 
> > So bit is set on source.
> > But why is the bit cleared on destination? This is the part I don't get.
> 
> My understanding is that when ACPI_PM_PROP_ACPI_PCIHP_BRIDGE is off for the
> > device, we just won't ever set the PCI_EXP_SLTSTA_PDS bit?

Why?


> > 
> > 
> > > > To avoid this fake incompatibility, there are two fields in PCIDevice 
> > > > that
> > > > can help:
> > > > 
> > > > .wmask: Used to implement R/W bytes, and
> > > > .w1cmask: Used to implement RW1C(Write 1 to Clear) bytes
> > > 
> > > Is there one more option to clear the bit in cmask?
> > > 
> > > IIUC w1cmask means the guest can now write to this bit, but afaiu from the
> > > pcie spec it's RO.
> > 
> > Yes this bit must be RO.
> > 
> > > > 
> > > > According to pcie_cap_slot_init() the slot status register
> > > > (PCI_EXP_SLTSTA), in which PCI_EXP_SLTSTA_PDS is a flag, seems to fall
> > > > > under w1cmask field, which makes sense due to the way signaling the 
> > > > hotplug
> > > > works.
> > > > 
> > > > So, add PCI_EXP_SLTSTA_PDS bit to w1cmask, so the fake incompatibility 
> > > > on
> > > > get_pci_config_device() does not abort the migration.
> > > > 
> > > > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2215819
> > > > Signed-off-by: Leonardo Bras 
> > > 
> > > Do we need a Fixes: and also the need to copy stable?
> > > 
> > > > ---
> > > >  hw/pci/pcie.c | 2 +-
> > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > 
> > > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > > > index b8c24cf45f..2def1765a5 100644
> > > > --- a/hw/pci/pcie.c
> > > > +++ b/hw/pci/pcie.c
> > > > @@ -657,7 +657,7 @@ void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s)
> > > > PCI_EXP_SLTCTL_EIC);
> > > >  
> > > >  pci_word_test_and_set_mask(dev->w1cmask + pos + PCI_EXP_SLTSTA,
> > > > -   PCI_EXP_HP_EV_SUPPORTED);
> > > > +   PCI_EXP_HP_EV_SUPPORTED 

Re: [PATCH 1/1] pcie: Add hotplug detect state register to w1cmask

2023-06-29 Thread Peter Xu
On Thu, Jun 29, 2023 at 03:33:06PM -0400, Michael S. Tsirkin wrote:
> On Thu, Jun 29, 2023 at 01:01:53PM -0400, Peter Xu wrote:
> > Hi, Leo,
> > 
> > Thanks for figuring this out.  Let me copy a few more potential reviewers
> > from commit 17858a1695 ("hw/acpi/ich9: Set ACPI PCI hot-plug as default on
> > Q35").
> > 
> > On Thu, Jun 29, 2023 at 06:05:00AM -0300, Leonardo Bras wrote:
> > > When trying to migrate a machine type pc-q35-6.0 or lower, with this
> > > cmdline options:
> > > 
> > > -device 
> > > driver=pcie-root-port,port=18,chassis=19,id=pcie-root-port18,bus=pcie.0,addr=0x12
> > >  \
> > > -device 
> > > driver=nec-usb-xhci,p2=4,p3=4,id=nex-usb-xhci0,bus=pcie-root-port18,addr=0x12.0x1
> > > 
> > > the following bug happens after all ram pages were sent:
> > > 
> > > qemu-kvm: get_pci_config_device: Bad config data: i=0x6e read: 0 device: 
> > > 40 cmask: ff wmask: 0 w1cmask:19
> > > qemu-kvm: Failed to load PCIDevice:config
> > > qemu-kvm: Failed to load pcie-root-port:parent_obj.parent_obj.parent_obj
> > > qemu-kvm: error while loading state for instance 0x0 of device 
> > > ':00:12.0/pcie-root-port'
> > > qemu-kvm: load of migration failed: Invalid argument
> > > 
> > > This happens on pc-q35-6.0 or lower because of:
> > > { "ICH9-LPC", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }
> > > 
> > > In this scenario, hotplug_handler_plug() calls pcie_cap_slot_plug_cb(),
> > > which sets the bus dev->config byte 0x6e with bit PCI_EXP_SLTSTA_PDS to 
> > > signal PCI hotplug for the guest. After a while the guest will deal with
> > > this hotplug and qemu will clear the above bit.
> 
> Presence Detect State – This bit indicates the presence of an
> adapter in the slot, reflected by the logical “OR” of the Physical
> Layer in-band presence detect mechanism and, if present, any
> out-of-band presence detect mechanism defined for the slot’s
> corresponding form factor. Note that the in-band presence
> detect mechanism requires that power be applied to an adapter
> for its presence to be detected. Consequently, form factors that
> require a power controller for hot-plug must implement a
> physical pin presence detect mechanism.
> RO
> Defined encodings are:
> 0b Slot Empty
> 1b Card Present in slot
> This bit must be implemented on all Downstream Ports that
> implement slots. For Downstream Ports not connected to slots
> (where the Slot Implemented bit of the PCI Express Capabilities
> register is 0b), this bit must be hardwired to 1b.
> 
> 
> And this seems to match what QEMU is doing: it clears on unplug
> not after guest deals with hotplug.
> 
> 
> > > Then, during migration, get_pci_config_device() will compare the
> > > configs of both the freshly created device and the one that is being
> > > received via migration, which will differ due to the PCI_EXP_SLTSTA_PDS 
> > > bit
> > > and cause the bug to reproduce.
> 
> So bit is set on source.
> But why is the bit cleared on destination? This is the part I don't get.

My understanding is that when ACPI_PM_PROP_ACPI_PCIHP_BRIDGE is off for the
device, we just won't ever set the PCI_EXP_SLTSTA_PDS bit?

> 
> 
> > > To avoid this fake incompatibility, there are two fields in PCIDevice that
> > > can help:
> > > 
> > > .wmask: Used to implement R/W bytes, and
> > > .w1cmask: Used to implement RW1C(Write 1 to Clear) bytes
> > 
> > Is there one more option to clear the bit in cmask?
> > 
> > IIUC w1cmask means the guest can now write to this bit, but afaiu from the
> > pcie spec it's RO.
> 
> Yes this bit must be RO.
> 
> > > 
> > > According to pcie_cap_slot_init() the slot status register
> > > (PCI_EXP_SLTSTA), in which PCI_EXP_SLTSTA_PDS is a flag, seems to fall
> > > > under w1cmask field, which makes sense due to the way signaling the hotplug
> > > works.
> > > 
> > > So, add PCI_EXP_SLTSTA_PDS bit to w1cmask, so the fake incompatibility on
> > > get_pci_config_device() does not abort the migration.
> > > 
> > > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2215819
> > > Signed-off-by: Leonardo Bras 
> > 
> > Do we need a Fixes: and also the need to copy stable?
> > 
> > > ---
> > >  hw/pci/pcie.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > > index b8c24cf45f..2def1765a5 100644
> > > --- a/hw/pci/pcie.c
> > > +++ b/hw/pci/pcie.c
> > > @@ -657,7 +657,7 @@ void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s)
> > > PCI_EXP_SLTCTL_EIC);
> > >  
> > >  pci_word_test_and_set_mask(dev->w1cmask + pos + PCI_EXP_SLTSTA,
> > > -   PCI_EXP_HP_EV_SUPPORTED);
> > > +   PCI_EXP_HP_EV_SUPPORTED | 
> > > PCI_EXP_SLTSTA_PDS);
> > >  
> > >  dev->exp.hpev_notified = false;
> > >  
> > > -- 
> > > 2.41.0
> > > 
> > 
> > -- 
> > Peter Xu
> 

-- 
Peter Xu




Re: [PATCH 5/7] migration: Display error in query-migrate irrelevant of status

2023-06-29 Thread Peter Xu
On Wed, Jun 28, 2023 at 08:01:22PM -0300, Fabiano Rosas wrote:
> Peter Xu  writes:
> 
> > Display it as long as being set, irrelevant of FAILED status.  E.g., it may
> > also be applicable to PAUSED stage of postcopy, to provide hint on what has
> > gone wrong.
> 
> This might have made the documentation slightly inaccurate:

Hmm yes, maybe I should touch that up so as to include "postcopy-paused",
or just remove the statement that it must be in a "failed" stage.

> 
> # @error-desc: the human readable error description string, when
> # @status is 'failed'. Clients should not attempt to parse the
> # error strings.  (Since 2.7)
> 
> But it's not wrong, so:
> 
> Reviewed-by: Fabiano Rosas 

Thanks for taking a look.

-- 
Peter Xu




Re: [PATCH 1/1] pcie: Add hotplug detect state register to w1cmask

2023-06-29 Thread Michael S. Tsirkin
On Thu, Jun 29, 2023 at 01:01:53PM -0400, Peter Xu wrote:
> Hi, Leo,
> 
> Thanks for figuring this out.  Let me copy a few more potential reviewers
> from commit 17858a1695 ("hw/acpi/ich9: Set ACPI PCI hot-plug as default on
> Q35").
> 
> On Thu, Jun 29, 2023 at 06:05:00AM -0300, Leonardo Bras wrote:
> > When trying to migrate a machine type pc-q35-6.0 or lower, with this
> > cmdline options:
> > 
> > -device 
> > driver=pcie-root-port,port=18,chassis=19,id=pcie-root-port18,bus=pcie.0,addr=0x12
> >  \
> > -device 
> > driver=nec-usb-xhci,p2=4,p3=4,id=nex-usb-xhci0,bus=pcie-root-port18,addr=0x12.0x1
> > 
> > the following bug happens after all ram pages were sent:
> > 
> > qemu-kvm: get_pci_config_device: Bad config data: i=0x6e read: 0 device: 40 
> > cmask: ff wmask: 0 w1cmask:19
> > qemu-kvm: Failed to load PCIDevice:config
> > qemu-kvm: Failed to load pcie-root-port:parent_obj.parent_obj.parent_obj
> > qemu-kvm: error while loading state for instance 0x0 of device 
> > ':00:12.0/pcie-root-port'
> > qemu-kvm: load of migration failed: Invalid argument
> > 
> > This happens on pc-q35-6.0 or lower because of:
> > { "ICH9-LPC", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }
> > 
> > In this scenario, hotplug_handler_plug() calls pcie_cap_slot_plug_cb(),
> > which sets the bus dev->config byte 0x6e with bit PCI_EXP_SLTSTA_PDS to 
> > signal PCI hotplug for the guest. After a while the guest will deal with
> > this hotplug and qemu will clear the above bit.

Presence Detect State – This bit indicates the presence of an
adapter in the slot, reflected by the logical “OR” of the Physical
Layer in-band presence detect mechanism and, if present, any
out-of-band presence detect mechanism defined for the slot’s
corresponding form factor. Note that the in-band presence
detect mechanism requires that power be applied to an adapter
for its presence to be detected. Consequently, form factors that
require a power controller for hot-plug must implement a
physical pin presence detect mechanism.
RO
Defined encodings are:
0b Slot Empty
1b Card Present in slot
This bit must be implemented on all Downstream Ports that
implement slots. For Downstream Ports not connected to slots
(where the Slot Implemented bit of the PCI Express Capabilities
register is 0b), this bit must be hardwired to 1b.


And this seems to match what QEMU is doing: it clears on unplug
not after guest deals with hotplug.


> > Then, during migration, get_pci_config_device() will compare the
> > configs of both the freshly created device and the one that is being
> > received via migration, which will differ due to the PCI_EXP_SLTSTA_PDS bit
> > and cause the bug to reproduce.

So bit is set on source.
But why is the bit cleared on destination? This is the part I don't get.


> > To avoid this fake incompatibility, there are two fields in PCIDevice that
> > can help:
> > 
> > .wmask: Used to implement R/W bytes, and
> > .w1cmask: Used to implement RW1C(Write 1 to Clear) bytes
> 
> Is there one more option to clear the bit in cmask?
> 
> IIUC w1cmask means the guest can now write to this bit, but afaiu from the
> pcie spec it's RO.

Yes this bit must be RO.

> > 
> > According to pcie_cap_slot_init() the slot status register
> > (PCI_EXP_SLTSTA), in which PCI_EXP_SLTSTA_PDS is a flag, seems to fall
> > under w1cmask field, which makes sense due to the way signaling the hotplug
> > works.
> > 
> > So, add PCI_EXP_SLTSTA_PDS bit to w1cmask, so the fake incompatibility on
> > get_pci_config_device() does not abort the migration.
> > 
> > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2215819
> > Signed-off-by: Leonardo Bras 
> 
> Do we need a Fixes: and also the need to copy stable?
> 
> > ---
> >  hw/pci/pcie.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > index b8c24cf45f..2def1765a5 100644
> > --- a/hw/pci/pcie.c
> > +++ b/hw/pci/pcie.c
> > @@ -657,7 +657,7 @@ void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s)
> > PCI_EXP_SLTCTL_EIC);
> >  
> >  pci_word_test_and_set_mask(dev->w1cmask + pos + PCI_EXP_SLTSTA,
> > -   PCI_EXP_HP_EV_SUPPORTED);
> > +   PCI_EXP_HP_EV_SUPPORTED | 
> > PCI_EXP_SLTSTA_PDS);
> >  
> >  dev->exp.hpev_notified = false;
> >  
> > -- 
> > 2.41.0
> > 
> 
> -- 
> Peter Xu




Re: [PATCH 1/6] migration: Set migration status early in incoming side

2023-06-29 Thread Peter Xu
On Wed, Jun 28, 2023 at 01:55:37PM -0300, Fabiano Rosas wrote:
> We are sending a migration event of MIGRATION_STATUS_SETUP at
> qemu_start_incoming_migration but never actually setting the state.
> 
> This creates a window between qmp_migrate_incoming and
> process_incoming_migration_co where the migration status is still
> MIGRATION_STATUS_NONE. Calling query-migrate during this time will
> return an empty response even though the incoming migration command
> has already been issued.
> 
> Commit 7cf1fe6d68 ("migration: Add migration events on target side")
> has added support to the 'events' capability to the incoming part of
> migration, but chose to send the SETUP event without setting the
> state. I'm assuming this was a mistake.
> 
> To avoid introducing a change in behavior, we need to keep sending the
> SETUP event, even if the 'events' capability is not set. Add the
> force-emit-setup-event migration property to enable it.

This is so unfortunate... since qemu 2.4.

Does it mean that when cap-events is set we can send duplicated events?

The fix makes sense to me in general, but I'm curious whether we can fix
it without having a compat bit doing the wrong thing, even at the risk of
breaking someone, with the hope that the only thing he/she needs to do is
to enable cap-events if he/she hadn't already.  I'd consider that as long
as, e.g., libvirt is fine.  Does anyone know how libvirt handles this?

The worst case is if there's major breakage we can apply a patch adding the
compat bit and copy stable, which should cover all the recent releases. And
if there are no reports after a few releases, that probably means we're fine anyway.

It just feels so unfortunate that migration needs to carry over so many legacy
issues along the way.  So I hope to avoid it if at all possible.

> 
> Signed-off-by: Fabiano Rosas 
> ---
>  migration/migration.c | 17 +++--
>  migration/migration.h | 11 +++
>  migration/options.c   | 13 +
>  migration/options.h   |  1 +
>  4 files changed, 40 insertions(+), 2 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 7c8292d4d4..6da1865e80 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -424,13 +424,26 @@ void migrate_add_address(SocketAddress *address)
>  static void qemu_start_incoming_migration(const char *uri, Error **errp)
>  {
>  const char *p = NULL;
> +MigrationIncomingState *mis = migration_incoming_get_current();
>  
>  /* URI is not suitable for migration? */
>  if (!migration_channels_and_uri_compatible(uri, errp)) {
>  return;
>  }
>  
> -qapi_event_send_migration(MIGRATION_STATUS_SETUP);
> +migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
> +  MIGRATION_STATUS_SETUP);
> +/*
> + * QMP clients should have set the 'events' migration capability
> + * if they want to receive this event, in which case the
> + * migrate_set_state() call above will have already sent the
> + * event. We still need to send the event for compatibility even
> + * if migration events are disabled.
> + */
> +if (migrate_emit_setup_event()) {
> +qapi_event_send_migration(MIGRATION_STATUS_SETUP);
> +}
> +
>  if (strstart(uri, "tcp:", &p) ||
>  strstart(uri, "unix:", NULL) ||
>  strstart(uri, "vsock:", NULL)) {
> @@ -524,7 +537,7 @@ process_incoming_migration_co(void *opaque)
>  
>  mis->largest_page_size = qemu_ram_pagesize_largest();
>  postcopy_state_set(POSTCOPY_INCOMING_NONE);
> -migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
> +migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
>MIGRATION_STATUS_ACTIVE);
>  
>  mis->loadvm_co = qemu_coroutine_self();
> diff --git a/migration/migration.h b/migration/migration.h
> index 30c3e97635..05e1e19e4f 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -433,6 +433,17 @@ struct MigrationState {
>   */
>  uint8_t clear_bitmap_shift;
>  
> +/*
> + * Always emit the incoming migration's SETUP event, even when the
> + * 'events' capability is not enabled.
> + *
> + * QMP clients that wish to receive migration events should always
> + * enable the 'events' capability. This property is for
> + * compatibility with clients that rely on the older QEMU behavior
> + * of unconditionally emitting the SETUP event.
> + */
> +bool force_emit_setup_event;
> +
>  /*
>   * This save hostname when out-going migration starts
>   */
> diff --git a/migration/options.c b/migration/options.c
> index b62ab30cd5..b0eda7cb05 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -95,6 +95,8 @@ Property migration_properties[] = {
>clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
>  DEFINE_PROP_BOOL("x-preempt-pre-7-2", MigrationState,
>   preempt_pre_7_2, false),
> +DEFINE_PROP_BOOL("force-emit-setup-event", 

[PULL 0/8] s390-ccw bios updates

2023-06-29 Thread Thomas Huth
 Hi Richard!

The following changes since commit 0eb8f90edebc11022a42abb211b026fac2e276f5:

  Merge tag 'for-upstream' of https://repo.or.cz/qemu/kevin into staging 
(2023-06-28 17:29:53 +0200)

are available in the Git repository at:

  https://gitlab.com/thuth/qemu.git tags/pull-request-2023-06-29

for you to fetch changes up to b806bc8d9cc16172f0cf2c9e42ca1d75b72f6555:

  pc-bios: Update the s390 bios images with the recent changes (2023-06-29 
20:47:45 +0200)


* Fix a compilation issue in the s390-ccw bios with Clang + binutils 2.40
* Create an initial stack frame for the main() function of the s390-ccw bios
* Clean up type definitions in the s390-ccw bios


Juan Quintela (1):
  s390-ccw: Getting rid of ulong

Thomas Huth (7):
  pc-bios/s390-ccw: Get rid of the the __u* types
  pc-bios/s390-ccw/Makefile: Use -z noexecstack to silence linker warning
  pc-bios/s390-ccw: Fix indentation in start.S
  pc-bios/s390-ccw: Provide space for initial stack frame in start.S
  pc-bios/s390-ccw: Move the stack array into start.S
  pc-bios/s390-ccw: Don't use __bss_start with the "larl" instruction
  pc-bios: Update the s390 bios images with the recent changes

 pc-bios/s390-ccw/cio.h   | 232 +++
 pc-bios/s390-ccw/helper.h|   2 +-
 pc-bios/s390-ccw/s390-ccw.h  |  12 +-
 pc-bios/s390-ccw/virtio-scsi.h   |   2 +-
 pc-bios/s390-ccw/virtio.h|   4 +-
 pc-bios/s390-ccw/main.c  |   1 -
 pc-bios/s390-ccw/netmain.c   |   1 -
 pc-bios/s390-ccw/virtio-blkdev.c |  12 +-
 pc-bios/s390-ccw/virtio-scsi.c   |   4 +-
 pc-bios/s390-ccw/virtio.c|  12 +-
 pc-bios/s390-ccw.img | Bin 42608 -> 42608 bytes
 pc-bios/s390-ccw/Makefile|   2 +-
 pc-bios/s390-ccw/start.S | 149 +
 pc-bios/s390-netboot.img | Bin 67232 -> 67232 bytes
 tests/tcg/s390x/head64.S |   7 +-
 15 files changed, 220 insertions(+), 220 deletions(-)




[PULL 2/8] pc-bios/s390-ccw: Get rid of the the __u* types

2023-06-29 Thread Thomas Huth
The types starting with double underscores have likely been
introduced into the s390-ccw bios to be able to re-use structs
from the Linux kernel in the past, but the corresponding structs
in cio.h have been changed there a long time ago already to not
use the variants with the double underscores anymore:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/diff/drivers/s390/cio/cio.h?id=cd6b4f27b9bb2a

So it would be good to replace these in the s390-ccw bios now, too.

Message-Id: <20230627114101.122231-1-th...@redhat.com>
Reviewed-by: Claudio Imbrenda 
Reviewed-by: Eric Farman 
Reviewed-by: Juan Quintela 
Signed-off-by: Thomas Huth 
---
 pc-bios/s390-ccw/cio.h  | 232 ++--
 pc-bios/s390-ccw/s390-ccw.h |   4 -
 2 files changed, 116 insertions(+), 120 deletions(-)

diff --git a/pc-bios/s390-ccw/cio.h b/pc-bios/s390-ccw/cio.h
index 88a88adfd2..8b18153deb 100644
--- a/pc-bios/s390-ccw/cio.h
+++ b/pc-bios/s390-ccw/cio.h
@@ -17,32 +17,32 @@
  * path management control word
  */
 struct pmcw {
-__u32 intparm;  /* interruption parameter */
-__u32 qf:1; /* qdio facility */
-__u32 w:1;
-__u32 isc:3;/* interruption subclass */
-__u32 res5:3;   /* reserved zeros */
-__u32 ena:1;/* enabled */
-__u32 lm:2; /* limit mode */
-__u32 mme:2;/* measurement-mode enable */
-__u32 mp:1; /* multipath mode */
-__u32 tf:1; /* timing facility */
-__u32 dnv:1;/* device number valid */
-__u32 dev:16;   /* device number */
-__u8  lpm;  /* logical path mask */
-__u8  pnom; /* path not operational mask */
-__u8  lpum; /* last path used mask */
-__u8  pim;  /* path installed mask */
-__u16 mbi;  /* measurement-block index */
-__u8  pom;  /* path operational mask */
-__u8  pam;  /* path available mask */
-__u8  chpid[8]; /* CHPID 0-7 (if available) */
-__u32 unused1:8;/* reserved zeros */
-__u32 st:3; /* subchannel type */
-__u32 unused2:18;   /* reserved zeros */
-__u32 mbfc:1;   /* measurement block format control */
-__u32 xmwme:1;  /* extended measurement word mode enable */
-__u32 csense:1; /* concurrent sense; can be enabled ...*/
+u32 intparm;/* interruption parameter */
+u32 qf:1;   /* qdio facility */
+u32 w:1;
+u32 isc:3;  /* interruption subclass */
+u32 res5:3; /* reserved zeros */
+u32 ena:1;  /* enabled */
+u32 lm:2;   /* limit mode */
+u32 mme:2;  /* measurement-mode enable */
+u32 mp:1;   /* multipath mode */
+u32 tf:1;   /* timing facility */
+u32 dnv:1;  /* device number valid */
+u32 dev:16; /* device number */
+u8  lpm;/* logical path mask */
+u8  pnom;   /* path not operational mask */
+u8  lpum;   /* last path used mask */
+u8  pim;/* path installed mask */
+u16 mbi;/* measurement-block index */
+u8  pom;/* path operational mask */
+u8  pam;/* path available mask */
+u8  chpid[8];   /* CHPID 0-7 (if available) */
+u32 unused1:8;  /* reserved zeros */
+u32 st:3;   /* subchannel type */
+u32 unused2:18; /* reserved zeros */
+u32 mbfc:1; /* measurement block format control */
+u32 xmwme:1;/* extended measurement word mode enable */
+u32 csense:1;   /* concurrent sense; can be enabled ...*/
 /*  ... per MSCH, however, if facility */
 /*  ... is not installed, this results */
 /*  ... in an operand exception.   */
@@ -50,24 +50,24 @@ struct pmcw {
 
 /* Target SCHIB configuration. */
 struct schib_config {
-__u64 mba;
-__u32 intparm;
-__u16 mbi;
-__u32 isc:3;
-__u32 ena:1;
-__u32 mme:2;
-__u32 mp:1;
-__u32 csense:1;
-__u32 mbfc:1;
+u64 mba;
+u32 intparm;
+u16 mbi;
+u32 isc:3;
+u32 ena:1;
+u32 mme:2;
+u32 mp:1;
+u32 csense:1;
+u32 mbfc:1;
 } __attribute__ ((packed));
 
 struct scsw {
-__u16 flags;
-__u16 ctrl;
-__u32 cpa;
-__u8 dstat;
-__u8 cstat;
-__u16 count;
+u16 flags;
+u16 ctrl;
+u32 cpa;
+u8 dstat;
+u8 cstat;
+u16 count;
 } __attribute__ ((packed));
 
 /* Function Control */
@@ -117,42 +117,42 @@ struct scsw {
 typedef struct schib {
 struct pmcw pmcw; /* path management control word */
 struct scsw scsw; /* subchannel status word */
-__u64 mba;/* measurement block address */
-__u8 mda[4];  /* model dependent area */
+u64 mba;  /* measurement block address */
+u8 mda[4];/* model dependent area */
 } __attribute__ ((packed, aligned(4))) 

[PULL 3/8] pc-bios/s390-ccw/Makefile: Use -z noexecstack to silence linker warning

2023-06-29 Thread Thomas Huth
Recent versions of ld complain when linking the s390-ccw bios:

 /usr/bin/ld: warning: start.o: missing .note.GNU-stack section implies
  executable stack
 /usr/bin/ld: NOTE: This behaviour is deprecated and will be removed in
  a future version of the linker

We can silence the warning by telling the linker to mark the stack
as not executable.

Message-Id: <20230622130822.396793-1-th...@redhat.com>
Acked-by: Christian Borntraeger 
Signed-off-by: Thomas Huth 
---
 pc-bios/s390-ccw/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pc-bios/s390-ccw/Makefile b/pc-bios/s390-ccw/Makefile
index 2e8cc015aa..acfcd1e71a 100644
--- a/pc-bios/s390-ccw/Makefile
+++ b/pc-bios/s390-ccw/Makefile
@@ -55,7 +55,7 @@ config-cc.mak: Makefile
$(call cc-option,-march=z900,-march=z10)) 3> config-cc.mak
 -include config-cc.mak
 
-LDFLAGS += -Wl,-pie -nostdlib
+LDFLAGS += -Wl,-pie -nostdlib -z noexecstack
 
 build-all: s390-ccw.img s390-netboot.img
 
-- 
2.39.3




[PULL 4/8] pc-bios/s390-ccw: Fix indentation in start.S

2023-06-29 Thread Thomas Huth
start.S is currently indented with a mixture of spaces and tabs, which
is quite ugly. QEMU coding style says indentation should be 4 spaces,
and this is also what we are using in the assembler files in the
tests/tcg/s390x/ folder already, so let's adjust start.S accordingly.

Reviewed-by: Cédric Le Goater 
Message-Id: <20230627074703.99608-2-th...@redhat.com>
Reviewed-by: Claudio Imbrenda 
Reviewed-by: Eric Farman 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Thomas Huth 
---
 pc-bios/s390-ccw/start.S | 136 +++
 1 file changed, 68 insertions(+), 68 deletions(-)

diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
index 6072906df4..d29de09cc6 100644
--- a/pc-bios/s390-ccw/start.S
+++ b/pc-bios/s390-ccw/start.S
@@ -10,37 +10,37 @@
  * directory.
  */
 
-.globl _start
+.globl _start
 _start:
 
-   larl   %r15, stack + 0x8000 /* Set up stack */
+larl%r15,stack + 0x8000 /* Set up stack */
 
-   /* clear bss */
-   larl %r2, __bss_start
-   larl %r3, _end
-   slgr %r3, %r2   /* get sizeof bss */
-   ltgr%r3,%r3 /* bss empty? */
-   jz  done
-   aghi%r3,-1
-   srlg%r4,%r3,8   /* how many 256 byte chunks? */
-   ltgr%r4,%r4
-   lgr %r1,%r2
-   jz  remainder
+/* clear bss */
+larl%r2,__bss_start
+larl%r3,_end
+slgr%r3,%r2/* get sizeof bss */
+ltgr%r3,%r3/* bss empty? */
+jz  done
+aghi%r3,-1
+srlg%r4,%r3,8  /* how many 256 byte chunks? */
+ltgr%r4,%r4
+lgr %r1,%r2
+jz  remainder
 loop:
-   xc  0(256,%r1),0(%r1)
-   la  %r1,256(%r1)
-   brctg   %r4,loop
+xc  0(256,%r1),0(%r1)
+la  %r1,256(%r1)
+brctg   %r4,loop
 remainder:
-   larl%r2,memsetxc
-   ex  %r3,0(%r2)
+larl%r2,memsetxc
+ex  %r3,0(%r2)
 done:
-/* set up a pgm exception disabled wait psw */
-larl   %r2, disabled_wait_psw
-mvc0x01d0(16), 0(%r2)
-j  main/* And call C */
+/* set up a pgm exception disabled wait psw */
+larl%r2,disabled_wait_psw
+mvc 0x01d0(16),0(%r2)
+j   main   /* And call C */
 
 memsetxc:
-   xc  0(1,%r1),0(%r1)
+xc  0(1,%r1),0(%r1)
 
 
 /*
@@ -48,11 +48,11 @@ memsetxc:
  *
  * stops the current guest cpu.
  */
-   .globl disabled_wait
+.globl disabled_wait
 disabled_wait:
-   larl%r1,disabled_wait_psw
-   lpswe   0(%r1)
-1: j   1b
+larl%r1,disabled_wait_psw
+lpswe   0(%r1)
+1:  j   1b
 
 
 /*
@@ -60,61 +60,61 @@ disabled_wait:
  *
  * eats one sclp interrupt
  */
-.globl consume_sclp_int
+.globl consume_sclp_int
 consume_sclp_int:
-/* enable service interrupts in cr0 */
-stctg   %c0,%c0,0(%r15)
-oi  6(%r15),0x2
-lctlg   %c0,%c0,0(%r15)
-/* prepare external call handler */
-larl %r1, external_new_code
-stg %r1, 0x1b8
-larl %r1, external_new_mask
-mvc 0x1b0(8),0(%r1)
-/* load enabled wait PSW */
-larl %r1, enabled_wait_psw
-lpswe 0(%r1)
+/* enable service interrupts in cr0 */
+stctg   %c0,%c0,0(%r15)
+oi  6(%r15),0x2
+lctlg   %c0,%c0,0(%r15)
+/* prepare external call handler */
+larl%r1,external_new_code
+stg %r1,0x1b8
+larl%r1,external_new_mask
+mvc 0x1b0(8),0(%r1)
+/* load enabled wait PSW */
+larl%r1,enabled_wait_psw
+lpswe   0(%r1)
 
 /*
  * void consume_io_int(void)
  *
  * eats one I/O interrupt
  */
-.globl consume_io_int
+.globl consume_io_int
 consume_io_int:
-/* enable I/O interrupts in cr6 */
-stctg %c6,%c6,0(%r15)
-oi4(%r15), 0xff
-lctlg %c6,%c6,0(%r15)
-/* prepare i/o call handler */
-larl  %r1, io_new_code
-stg   %r1, 0x1f8
-larl  %r1, io_new_mask
-mvc   0x1f0(8),0(%r1)
-/* load enabled wait PSW */
-larl  %r1, enabled_wait_psw
-lpswe 0(%r1)
+/* enable I/O interrupts in cr6 */
+stctg   %c6,%c6,0(%r15)
+oi  4(%r15), 0xff
+lctlg   %c6,%c6,0(%r15)
+/* prepare i/o call handler */
+larl%r1,io_new_code
+stg %r1,0x1f8
+larl%r1,io_new_mask
+mvc 0x1f0(8),0(%r1)
+/* load enabled wait PSW */
+larl%r1,enabled_wait_psw
+lpswe   0(%r1)
 
 external_new_code:
-/* disable service interrupts in cr0 */
-stctg   %c0,%c0,0(%r15)
-ni  6(%r15),0xfd
-lctlg   %c0,%c0,0(%r15)
-br  %r14
+/* disable service interrupts in cr0 */
+stctg   %c0,%c0,0(%r15)
+ni  6(%r15),0xfd
+lctlg   %c0,%c0,0(%r15)
+br  %r14
 
 io_new_code:
-/* disable I/O interrupts in cr6 */
-stctg %c6,%c6,0(%r15)
-ni4(%r15), 0x00
-lctlg 

[PULL 5/8] pc-bios/s390-ccw: Provide space for initial stack frame in start.S

2023-06-29 Thread Thomas Huth
Providing the space of a stack frame is the duty of the caller,
so we should reserve 160 bytes before jumping into the main function.
Otherwise the main() function might write past the stack array.

While we're at it, add a proper STACK_SIZE macro for the stack size
instead of using magic numbers (this is also required for the following
patch).

Reviewed-by: Christian Borntraeger 
Reviewed-by: Cédric Le Goater 
Message-Id: <20230627074703.99608-3-th...@redhat.com>
Reviewed-by: Eric Farman 
Reviewed-by: Claudio Imbrenda 
Reviewed-by: Marc Hartmayer 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Thomas Huth 
---
 pc-bios/s390-ccw/start.S | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
index d29de09cc6..abd6fe6639 100644
--- a/pc-bios/s390-ccw/start.S
+++ b/pc-bios/s390-ccw/start.S
@@ -10,10 +10,13 @@
  * directory.
  */
 
+#define STACK_SIZE0x8000
+#define STACK_FRAME_SIZE  160
+
 .globl _start
 _start:
 
-larl%r15,stack + 0x8000 /* Set up stack */
+larl%r15,stack + STACK_SIZE - STACK_FRAME_SIZE   /* Set up stack */
 
 /* clear bss */
 larl%r2,__bss_start
-- 
2.39.3




[PULL 8/8] pc-bios: Update the s390 bios images with the recent changes

2023-06-29 Thread Thomas Huth
The startup code of the BIOS has been changed slightly; apart
from that, there should not be any functional changes this time.

Signed-off-by: Thomas Huth 
---
 pc-bios/s390-ccw.img | Bin 42608 -> 42608 bytes
 pc-bios/s390-netboot.img | Bin 67232 -> 67232 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/pc-bios/s390-ccw.img b/pc-bios/s390-ccw.img
index 
c9a5a21c50b9e78a7048d456d842ddbfdf9e6ba3..f0d9ef6d4d86eb8d8ace19cac3929b9c06ae826f
 100644
GIT binary patch
delta 2507
zcmZ`*dr(tX8b9YtOu|FB;SmA`ykHQ5x^2ks_Tts@eZ@^IX)yJ=^wN;fFASd{EJH}nrXvnP}L
zo%8#C=bZ0+k8^ME3Iwk}?L?{^5I>E4-%Ql@zWKoIq}57zaPntTvslG7<~!lR
z7alh#VfpiUR3#EZ1uUbBs166~bMYb-(*Gad5nhKnaF}K%xDNj-C$#u8mu2rr`6vF0
z9q;_`@9mfmYiS$SLXvs&7@3O*BO%f1n9c0~6VNj3g{%qJlpu$!SA)eCp1#_}GDO`kuEexf|1)%Z5t+aGigtHSif-%BH*
z#mYB?#3k@=>i0RNSkGgh(%6j}0~pFj{TL>**;sUKGN~U`mq?kmPiFP7QWW;S`J|~*Ae7lyvVvTk-
zJ%Z=7(B6Q9sAXUjiZ%TE$N2(
z*JG()B+}4mXD<=hk6TR2P|(@NFTJj(o-w2lUAoNivltQ_L#+5GU3R2i&*-e3HejQ!
zn7)a(bsM3jUS6V)rxD$3bMAm>e96$I-^lyR%W+=BtTeY#vJs2ZX(!i-g9t=*Z
zr?}t1+K5Rs5zjBZ74)0rrug5GCLTWv7GtfqBMt;@!)Z=Q$
zQ~@-08M-a>XzHM=bDvCwZLlcZI|sCWqJwPTfWEA%|V2gWtfa|ui1W@i3<{
zM(EO^6P}68+QE)^4=_2vt-jfyG!oMD5q=OioxY1V<1*l8C630K;pWR|j(-Vyae;m%LK{5-y|C{^Wc(
z&0L*w5WL6vX5c-DpQg;D@8JEE{d6Jjm|@M(k%VhYl2lFgW`UP*ytH-
zuxx^Bn=mrf0@v1Jc4{?TOTiPVhZDU#MFM4PXRXr{Y~;z}zl0TOdkp?6ZjXOEHyHfP
zscFM$T@M3-XPKHn4wgVmSpq#^vXocrqJB^P%esE)iG8*0G3vUY8eitE#urBW{JR
z->3$3m10TyF6fHJ3+Y!PueO+*dbbDNK{v*vn!tSH=an$}(ZnAE9#yA5F4C=eKCF
zcG7)#*E)?e)|%)o+s{|#0$2eBj=D8coaOKAe0%*+=DXuwW*K?|W!NDkQ98LR=p
zPsmPwocrG)OrB1^#r(+!VQ3Ftp8O)+iuSBh(@-w6*r#VrDI+AWj4wQti3hW$#oIVK
zhUcsJIPA{Kr3TbyZ-j?!_-gjdiEYw}(5Pf4H1W7}o+PQ2#}=A?k-PL^J$kd(z{7Qz
zW=p4=&|&+XhKyLS)iy%UVpGnRsGVO@vfFQ9d3_nLPDzssrmUioe6Pn6OO*~hI(3SM
zJZ1(@T}CV9lxeGB1{t2i3B%J%LOUpm7gw
zm7DukL|d3!d`7Oa7Xm#bpUZoL>igI|ND2pVUVeJiEq2*|&7D@*A#co2rR=@7d*)LO
zdGfKmdzO#VuW-p68_%#gzlG0V!R|SgaDEhB+^3x`Grqk7y~FO}+T5fnQm2m|rM@Zz@(49!oap_4l>}?JSgjLTm%l$kRKlcBuC*#uS-E?|D|V)cdxx
za?s3fG{VZ_60x!u(j9hEs=iaZoLp5UY6{j>%2#vdIPP!6HlIS-U7tZ){A
z;=*^GqCuI>@{uZe*zU}Rju75)TA(9{x{_U>)X95Fj!-&?)&+N{U4oa$0yJ0(3uO)X~txg1K)G!Mhu71ox|vN(OIQNx%{q5c9spjEl2(b
D-K0M$

delta 2326
zcmZ`*4Nz3q6~5=}>Mr=h?g9(TuCj~E}@HK@kJ&5`?IXqK?K;Yb-&>A*~w5q0`zj
zkkB-faRPE2VxkqI;2L8#m^bOjB$Z7~aRX>JU;rB`AewQYNnR|Zc0laC59~}b)BEP#
z_wG68e&@UAoO|DsyHM`J+Gsd;)cWV(w`8WYx8$Qk5zhu@2Hu}MRq2Xc^`dh|uM-dC
z_uQHBsh`+Co)0r>28;?JA2eh;kwRBNXZwHlv-seo2api<^ZGd{FCQLvZTvndyMFun
zWAMV3qm+t2gGQ>wh{)U-bj3-Kp0W8eV;{Al5m|uxFrJJm{bRfL@NvZK
z6?g%r*#=6U>sXN2tpOfVMUqY>b7L1Q;}UsR5>54U6UaX|
z8;G=;eRyG)RHg}qpqtzm$9OkDuWLWo{7SrU-9sfoE3qV$;XbCuIAZVQ<
zNHAJ6Ay|eOg>6aZh)$Ji<=T4v`f`@4{Ck!fRa6rTT@N`s5BxP`9s(erm`GPUGvtVNw3k24a)mwa-G%cnfz|Im^sWS
zdEo;bdYKYJoH$fUMIrlfXgPI+B;t^Teh8U_Gjv}U4>c5}cffBcSD%N2o9T!?5(jhX
ztiB8fgDJ{TgtrE$+>i$CbirWJ-u}SX$k?qm3K*8b3Bu4A%k65hPV+UM{X@{|NoOc3
zNaXPLVRD7;O}_P%$H8=kI{AWP_^Kf$vW%PV6~*b@wnNu)T3~vezFOXA(v8r+;&3M&
z4|@Zu$r+xD!#k-w{1qIIr0MVjaF_NQU#tG;Orwz#qsSQH%py>D7p8!+^F`
zawCrhk5reI7Dq;gbxsPurXfKPO}-Y!I`2#CBVB-B@%5S(<5zCFV2VU3mD)}Ag#)KK
z!X=Kt>3n3S2c}f~hzB$37~z37ebUJ<8}LQYieZjl>B#QF!QV)1L*
z{yIUwzlLt*O{IL}k`zLeBC^`di&{xk>D@{08c`mSNwE
zimSe2Dz&|YeOCI`)`h*T^l^+8dpYx%cI@T(T~dy{JUn(ge)*7c?bmp{8)BD06|IcT
zTR5>}>^u1&4}p%ciS6`S>;_1n-q=)}2($2VJICYBd$L)b%(~kI!
z#;K<`r7IfVe1(iHFBAw0}IxA0AhYb$UKF9MaP(kV-hY?N8$l^Map=h5`uZY+c*^PFn^l#gbILjN!kRZ
zQqIz6aRsmfp=4`wtSFDQs
z3OX9x2fvTxTaxjT_A@&_*6#_ee^~luRtunCM~huGaGboZGOPcRXB}gFNsI$Ie0Lu`
zdiWo4qGJeOBKY4Sm-|Wd*HMK#3;jiO*=^POH}Jv(e;$49PC$Ps>2fUSS5tD%OXxo<
zeUS4ezz+>
z5Pl(*Z~8aD9@@Y8Fyu?4n;QY!H%mvh_9dD}MH$T$DF3qU7EEvaOY~R%AyrgtLw
zv7C*3V+rbQ&2*=-2-|ujS5@^Qa-K-1Pfs0*^7-G6-T=J~M*Wf5ueXoQ
zv;aM7h|oVC%?i;a#W~z9+y1EBFnHLe)c8-!8>vtE3gD?Z`dsXiCvHDk`R$eO5T>
icvDJ{n8;gIfC5dc^s}SKzbpK=B(nK3?54ivH~$OL01Kf2

diff --git a/pc-bios/s390-netboot.img b/pc-bios/s390-netboot.img
index 
682da24a05d3b13d9530fe1123b242e38109e1c3..6908e49f06801808b826d3a01f88132cf1b2f57c
 100644
GIT binary patch
delta 11001
zcmbVy30PEDvhY2Zmc|Vm1iIN9XaoVJ!3{SA1*3@CQKPLeQ)I=hV4f
zvO!g{LAB}Y=4BH9smeCrh)o{rXgqMfHPjUN-_?O{I6!Z8-k;x+H0gLCp|TJ{h$j=8F|
z$vZ?2s9j=x=Os9}+1_{$NoqxHb-oFPy1eGuJkv>aU{}craKNQMivHpfE9OD4%EvF?
z#4PGnlG~qn^Nlc6rS(iBXN-O_eXWgwbwJg@O3>~f1)#X*(k))~y
z=QP9X=&63S0*9|a2ThtG%QIA2AazzwBI1nXqAalAVfMF)(NIxgB*wQjNuoDg)%b^H
zAyFK-K}7veo}uL`$x}*_BJlI*-tq^ev?s3KCK#JWj*3R|zxiEIr$Q
zPtRT%m=sOj#l|SKgFbC^|`nI(jPYCCQ|Dynu;#g)J%j|
zJc8BB8D3rN8REOYT$XTDBH42dKD#uUGG+(#@QfCFVX9{c4r-0(ATa@|Jj>hmLS;f>
z1ryvWvJqYgC2fs9%2H*t54MT9
zT-Fe;{0ff36-Ox9^pBTFUXA1Edb4q4#MR@nXDE*;f<~t-XgE>sC3PcRv+9&(c02J*
zO;@%x>}>SatS2dq>`P3uRSeH!IGJFVG1!e-A8IE>!+Edv
zYM#oeE#aA0i27^7Px%e>-m&6UFnLGj?%zkKS9?}yB(Z;h708BlFpvX4em21+tp
z`D3x9k%({f+Zap*(q^QzAg}E&;_h1!ZOmW}v?%n5l$=pxK+kGWQ<~DHmes0>Szby{{V*KcJyh2tSzj}NFx|?j
zMSR)4s6FLnrS?UYlf%dy2O2qMMKDJaa#+ncQRx0q!bIky){_Kvx{!fmB75Vp0~R0i
z7g9d+E{let=z8buh<(2iZq!W-LX1?YoP4+M=VB>OBCnBO_Wek*v(nL@I$UF9o(wIt
z5rM~1#?#?tlvI~2Fl%6lHoW~-I7=>u<5ldV*C@61b!z^rsFtL2D~KqqwCVAsLWg)}
zEQe*fKH66=w-J~0XlmVGzQ&=Wvoy$Of=jygtp{CJi!vvsH@NzSx9*CVYA4133BHjn

[PULL 1/8] s390-ccw: Getting rid of ulong

2023-06-29 Thread Thomas Huth
From: Juan Quintela 

Is there any good reason why this still exists?
I can understand u* and __u* being Linux-kernel-like, but ulong?

Reviewed-by: Thomas Huth 
Signed-off-by: Juan Quintela 
Message-Id: <20230629104821.194859-2-th...@redhat.com>
Reviewed-by: Claudio Imbrenda 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Thomas Huth 
---
 pc-bios/s390-ccw/helper.h|  2 +-
 pc-bios/s390-ccw/s390-ccw.h  |  7 +++
 pc-bios/s390-ccw/virtio-scsi.h   |  2 +-
 pc-bios/s390-ccw/virtio.h|  4 ++--
 pc-bios/s390-ccw/virtio-blkdev.c | 12 ++--
 pc-bios/s390-ccw/virtio-scsi.c   |  4 ++--
 pc-bios/s390-ccw/virtio.c| 12 ++--
 7 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/pc-bios/s390-ccw/helper.h b/pc-bios/s390-ccw/helper.h
index 3d0731c4c6..8e3dfcb6d6 100644
--- a/pc-bios/s390-ccw/helper.h
+++ b/pc-bios/s390-ccw/helper.h
@@ -38,7 +38,7 @@ static inline void yield(void)
 
 static inline void sleep(unsigned int seconds)
 {
-ulong target = get_time_seconds() + seconds;
+unsigned long target = get_time_seconds() + seconds;
 
 while (get_time_seconds() < target) {
 yield();
diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h
index b88e0550ab..f849fba74b 100644
--- a/pc-bios/s390-ccw/s390-ccw.h
+++ b/pc-bios/s390-ccw/s390-ccw.h
@@ -17,7 +17,6 @@ typedef unsigned char  u8;
 typedef unsigned short u16;
 typedef unsigned int   u32;
 typedef unsigned long long u64;
-typedef unsigned long  ulong;
 typedef unsigned char  __u8;
 typedef unsigned short __u16;
 typedef unsigned int   __u32;
@@ -67,11 +66,11 @@ void sclp_get_loadparm_ascii(char *loadparm);
 int sclp_read(char *str, size_t count);
 
 /* virtio.c */
-unsigned long virtio_load_direct(ulong rec_list1, ulong rec_list2,
- ulong subchan_id, void *load_addr);
+unsigned long virtio_load_direct(unsigned long rec_list1, unsigned long 
rec_list2,
+ unsigned long subchan_id, void *load_addr);
 bool virtio_is_supported(SubChannelId schid);
 int virtio_blk_setup_device(SubChannelId schid);
-int virtio_read(ulong sector, void *load_addr);
+int virtio_read(unsigned long sector, void *load_addr);
 
 /* bootmap.c */
 void zipl_load(void);
diff --git a/pc-bios/s390-ccw/virtio-scsi.h b/pc-bios/s390-ccw/virtio-scsi.h
index e6b6cd4815..c5612e16a2 100644
--- a/pc-bios/s390-ccw/virtio-scsi.h
+++ b/pc-bios/s390-ccw/virtio-scsi.h
@@ -68,7 +68,7 @@ static inline bool virtio_scsi_response_ok(const 
VirtioScsiCmdResp *r)
 }
 
 int virtio_scsi_read_many(VDev *vdev,
-  ulong sector, void *load_addr, int sec_num);
+  unsigned long sector, void *load_addr, int sec_num);
 int virtio_scsi_setup_device(SubChannelId schid);
 
 #endif /* VIRTIO_SCSI_H */
diff --git a/pc-bios/s390-ccw/virtio.h b/pc-bios/s390-ccw/virtio.h
index e657d381ec..85bd9d1695 100644
--- a/pc-bios/s390-ccw/virtio.h
+++ b/pc-bios/s390-ccw/virtio.h
@@ -190,14 +190,14 @@ int virtio_get_block_size(void);
 uint8_t virtio_get_heads(void);
 uint8_t virtio_get_sectors(void);
 uint64_t virtio_get_blocks(void);
-int virtio_read_many(ulong sector, void *load_addr, int sec_num);
+int virtio_read_many(unsigned long sector, void *load_addr, int sec_num);
 
 #define VIRTIO_SECTOR_SIZE 512
 #define VIRTIO_ISO_BLOCK_SIZE 2048
 #define VIRTIO_SCSI_BLOCK_SIZE 512
 #define VIRTIO_DASD_DEFAULT_BLOCK_SIZE 4096
 
-static inline ulong virtio_sector_adjust(ulong sector)
+static inline unsigned long virtio_sector_adjust(unsigned long sector)
 {
 return sector * (virtio_get_block_size() / VIRTIO_SECTOR_SIZE);
 }
diff --git a/pc-bios/s390-ccw/virtio-blkdev.c b/pc-bios/s390-ccw/virtio-blkdev.c
index 794f99b42c..a81207b52e 100644
--- a/pc-bios/s390-ccw/virtio-blkdev.c
+++ b/pc-bios/s390-ccw/virtio-blkdev.c
@@ -16,7 +16,7 @@
 #define VIRTIO_BLK_F_GEOMETRY   (1 << 4)
 #define VIRTIO_BLK_F_BLK_SIZE   (1 << 6)
 
-static int virtio_blk_read_many(VDev *vdev, ulong sector, void *load_addr,
+static int virtio_blk_read_many(VDev *vdev, unsigned long sector, void 
*load_addr,
 int sec_num)
 {
 VirtioBlkOuthdr out_hdr;
@@ -49,7 +49,7 @@ static int virtio_blk_read_many(VDev *vdev, ulong sector, 
void *load_addr,
 return status;
 }
 
-int virtio_read_many(ulong sector, void *load_addr, int sec_num)
+int virtio_read_many(unsigned long sector, void *load_addr, int sec_num)
 {
 VDev *vdev = virtio_get_device();
 
@@ -63,14 +63,14 @@ int virtio_read_many(ulong sector, void *load_addr, int 
sec_num)
 return -1;
 }
 
-unsigned long virtio_load_direct(ulong rec_list1, ulong rec_list2,
- ulong subchan_id, void *load_addr)
+unsigned long virtio_load_direct(unsigned long rec_list1, unsigned long 
rec_list2,
+ unsigned long subchan_id, void *load_addr)
 {
 u8 status;
 int sec = rec_list1;
 int sec_num = ((rec_list2 >> 32) & 

[PULL 7/8] pc-bios/s390-ccw: Don't use __bss_start with the "larl" instruction

2023-06-29 Thread Thomas Huth
start.S currently cannot be compiled with Clang 16 and binutils 2.40:

 ld: start.o(.text+0x8): misaligned symbol `__bss_start' (0xc1e5) for
 relocation R_390_PC32DBL

According to the built-in linker script of ld, the symbol __bss_start
can actually point *before* the .bss section and does not need to have
any alignment, so in certain situations (like when using the internal
assembler of Clang), the __bss_start symbol can indeed be unaligned
and thus it is not suitable for being used with the "larl" instruction
that needs an address that is at least aligned to halfwords.
The problem went unnoticed so far since binutils <= 2.39 did not
check the alignment, but starting with binutils 2.40, such unaligned
addresses are now refused.

Fix it by loading the address indirectly instead.

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2216662
Reported-by: Miroslav Rezanina 
Suggested-by:  Andreas Krebbel 
Message-Id: <20230629104821.194859-8-th...@redhat.com>
Reviewed-by: Claudio Imbrenda 
Signed-off-by: Thomas Huth 
---
 pc-bios/s390-ccw/start.S | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
index 429a2b30a1..061b06591c 100644
--- a/pc-bios/s390-ccw/start.S
+++ b/pc-bios/s390-ccw/start.S
@@ -19,7 +19,8 @@ _start:
 larl%r15,stack + STACK_SIZE - STACK_FRAME_SIZE   /* Set up stack */
 
 /* clear bss */
-larl%r2,__bss_start
+larl%r2,bss_start_literal   /* __bss_start might be unaligned ... */
+lg  %r2,0(%r2)  /* ... so load it indirectly */
 larl%r3,_end
 slgr%r3,%r2/* get sizeof bss */
 ltgr%r3,%r3/* bss empty? */
@@ -45,7 +46,6 @@ done:
 memsetxc:
 xc  0(1,%r1),0(%r1)
 
-
 /*
  * void disabled_wait(void)
  *
@@ -113,6 +113,8 @@ io_new_code:
 br  %r14
 
 .align  8
+bss_start_literal:
+.quad   __bss_start
 disabled_wait_psw:
 .quad   0x000200018000,0x
 enabled_wait_psw:
-- 
2.39.3




[PULL 6/8] pc-bios/s390-ccw: Move the stack array into start.S

2023-06-29 Thread Thomas Huth
The stack array is only referenced from the start-up code (which is
shared between the s390-ccw.img and the s390-netboot.img), but it is
currently declared twice, once in main.c and once in netmain.c.
It makes more sense to declare this in start.S instead - which will
also be helpful in the next patch, since we need to mention the .bss
section in start.S in that patch.

While we're at it, let's also drop the huge alignment of the stack,
since there is no technical requirement for aligning it to page
boundaries.

Message-Id: <20230627074703.99608-4-th...@redhat.com>
Reviewed-by: Claudio Imbrenda 
Reviewed-by: Eric Farman 
Signed-off-by: Thomas Huth 
---
 pc-bios/s390-ccw/s390-ccw.h | 1 -
 pc-bios/s390-ccw/main.c | 1 -
 pc-bios/s390-ccw/netmain.c  | 1 -
 pc-bios/s390-ccw/start.S| 6 ++
 tests/tcg/s390x/head64.S| 7 ++-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h
index f68a832718..c977a52b50 100644
--- a/pc-bios/s390-ccw/s390-ccw.h
+++ b/pc-bios/s390-ccw/s390-ccw.h
@@ -50,7 +50,6 @@ void consume_io_int(void);
 /* main.c */
 void write_subsystem_identification(void);
 void write_iplb_location(void);
-extern char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
 unsigned int get_loadparm_index(void);
 void main(void);
 
diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
index a2def83e82..5506798098 100644
--- a/pc-bios/s390-ccw/main.c
+++ b/pc-bios/s390-ccw/main.c
@@ -17,7 +17,6 @@
 #include "virtio-scsi.h"
 #include "dasd-ipl.h"
 
-char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
 static SubChannelId blk_schid = { .one = 1 };
 static char loadparm_str[LOADPARM_LEN + 1];
 QemuIplParameters qipl;
diff --git a/pc-bios/s390-ccw/netmain.c b/pc-bios/s390-ccw/netmain.c
index 056e93a818..5cd619b2d6 100644
--- a/pc-bios/s390-ccw/netmain.c
+++ b/pc-bios/s390-ccw/netmain.c
@@ -50,7 +50,6 @@ void write_iplb_location(void) {}
 /* STSI 3.2.2 offset of first vmdb + offset of uuid inside vmdb */
 #define STSI322_VMDB_UUID_OFFSET ((8 + 12) * 4)
 
-char stack[PAGE_SIZE * 8] __attribute__((aligned(PAGE_SIZE)));
 IplParameterBlock iplb __attribute__((aligned(PAGE_SIZE)));
 static char cfgbuf[2048];
 
diff --git a/pc-bios/s390-ccw/start.S b/pc-bios/s390-ccw/start.S
index abd6fe6639..429a2b30a1 100644
--- a/pc-bios/s390-ccw/start.S
+++ b/pc-bios/s390-ccw/start.S
@@ -121,3 +121,9 @@ external_new_mask:
 .quad   0x00018000
 io_new_mask:
 .quad   0x00018000
+
+.bss
+.align  8
+stack:
+.space  STACK_SIZE
+.size   stack,STACK_SIZE
diff --git a/tests/tcg/s390x/head64.S b/tests/tcg/s390x/head64.S
index c6f36dfea4..4fe288388a 100644
--- a/tests/tcg/s390x/head64.S
+++ b/tests/tcg/s390x/head64.S
@@ -8,6 +8,8 @@
 #include "../../../pc-bios/s390-ccw/start.S"
 #undef main
 
+.text
+
 main_pre:
 aghi %r15,-160 /* reserve stack for C code */
 brasl %r14,sclp_setup
@@ -24,8 +26,3 @@ success_psw:
 .quad 0x200018000,0xfff/* see is_special_wait_psw() */
 failure_psw:
 .quad 0x200018000,0/* disabled wait */
-
-.section .bss
-.align 0x1000
-stack:
-.skip 0x8000
-- 
2.39.3




Re: [PATCH 1/2] vfio: Don't be a iterable and legacy device at the same time

2023-06-29 Thread Peter Xu
On Thu, Jun 22, 2023 at 01:22:26PM +0200, Lukas Straub wrote:
> Legacy savevm devices only implement save_state() and load_state().
> Iterable devices shouldn't implement save_state() or else they are
> handled both as an iterable and legacy device in the savevm code.
> 
> Signed-off-by: Lukas Straub 
> ---
> 
> Note: this patch is completely untested.

PS: if you're not confident that the change will always work, it is better to
mark it as RFC to show that it is a proposal of such a change.

Comparing to the "legacy" vs "modern" migration, IIUC it was about whether
to use vmsd, so it's "save_state()" vs "vmsd" in that regard.

Personally, I don't immediately see a direct conflict / issue with a device
providing both save_state() and save_setup().  It means the device declares
both (1) iterable data, and (2) non-iterable data (which can be either vmsd
or save_state()).

I do think vmsd is still preferred here for (2), e.g., I quickly looked at
vmstate_vfio_pci_config which seems fine to be implemented as a vmsd, with
a post_load() perhaps.  But that's another story.  It just all looks still
fine.

Do we get any benefit from having that restriction?

-- 
Peter Xu




Re: [PULL 17/33] ui/dbus: win32 support

2023-06-29 Thread Bernhard Beschow



Am 27. Juni 2023 13:02:14 UTC schrieb marcandre.lur...@redhat.com:
>From: Marc-André Lureau 
>
>D-Bus doesn't support fd-passing on Windows (AF_UNIX doesn't have
>SCM_RIGHTS yet, but there are other means to share objects. I have
>proposed various solutions upstream, but none seem fitting enough atm).
>
>To make the "-display dbus" work on Windows, implement an alternative
>D-Bus interface where all the 'h' (FDs) arguments are replaced with
>'ay' (WSASocketW data), and sockets are passed to the other end via
>WSADuplicateSocket().
>
>Signed-off-by: Marc-André Lureau 
>Message-Id: <20230606115658.677673-6-marcandre.lur...@redhat.com>
>---
> meson.build  |  4 +--
> ui/dbus.h|  6 +
> audio/dbusaudio.c| 44 +++--
> ui/dbus-chardev.c| 22 +
> ui/dbus-console.c| 59 ++--
> ui/dbus-display1.xml | 28 +
> ui/meson.build   |  9 ++-
> 7 files changed, 149 insertions(+), 23 deletions(-)
>
>diff --git a/meson.build b/meson.build
>index b409788832..9a1ce43471 100644
>--- a/meson.build
>+++ b/meson.build
>@@ -838,6 +838,8 @@ if gdbus_codegen.found() and get_option('cfi')
>   gdbus_codegen_error = '@0@ uses gdbus-codegen, which does not support 
> control flow integrity'
> endif
> 
>+xml_pp = find_program('scripts/xml-preprocess.py')
>+
> lttng = not_found
> if 'ust' in get_option('trace_backends')
>   lttng = dependency('lttng-ust', required: true, version: '>= 2.1',
>@@ -1985,8 +1987,6 @@ dbus_display = get_option('dbus_display') \
>error_message: '-display dbus requires glib>=2.64') \
>   .require(gdbus_codegen.found(),
>error_message: gdbus_codegen_error.format('-display dbus')) \
>-  .require(targetos != 'windows',
>-   error_message: '-display dbus is not available on Windows') \
>   .allowed()
> 
> have_virtfs = get_option('virtfs') \
>diff --git a/ui/dbus.h b/ui/dbus.h
>index 9c149e7b41..1e8c24a48e 100644
>--- a/ui/dbus.h
>+++ b/ui/dbus.h
>@@ -62,6 +62,12 @@ struct DBusDisplay {
> Notifier notifier;
> };
> 
>+#ifdef WIN32
>+bool
>+dbus_win32_import_socket(GDBusMethodInvocation *invocation,
>+ GVariant *arg_listener, int *socket);
>+#endif
>+
> #define TYPE_DBUS_DISPLAY "dbus-display"
> OBJECT_DECLARE_SIMPLE_TYPE(DBusDisplay, DBUS_DISPLAY)
> 
>diff --git a/audio/dbusaudio.c b/audio/dbusaudio.c
>index de59467d9e..7a11fbfb42 100644
>--- a/audio/dbusaudio.c
>+++ b/audio/dbusaudio.c
>@@ -33,6 +33,7 @@
> #include 
> #endif
> 
>+#include "ui/dbus.h"

This patch causes below compile error since pixman.h isn't found. It seems as 
if the pixman include path is missing. Since pixman.h is found elsewhere in the 
same build I suspect that the DBUS audio module now needs a pixman dependency 
-- which sounds a little bit weird.

FAILED: libaudio-dbus.a.p/audio_dbusaudio.c.o 
cc -m64 -mcx16 -Ilibaudio-dbus.a.p -I. -I../src -Iqapi -Itrace -Iui -Iui/shader 
-I/usr/include/glib-2.0 -I/usr/lib/glib-2.0/include -I/usr/include/sysprof-4 
-I/usr/include/libmount -I/usr/include/blkid -I/usr/include/gio-unix-2.0 
-fdiagnostics-color=auto -Wall -Winvalid-pch -Werror -std=gnu11 -O0 -g 
-fstack-protector-strong -Wundef -Wwrite-strings -Wmissing-prototypes 
-Wstrict-prototypes -Wredundant-decls -Wold-style-declaration 
-Wold-style-definition -Wtype-limits -Wformat-security -Wformat-y2k -Winit-self 
-Wignored-qualifiers -Wempty-body -Wnested-externs -Wendif-labels 
-Wexpansion-to-defined -Wimplicit-fallthrough=2 -Wmissing-format-attribute 
-Wno-missing-include-dirs -Wno-shift-negative-value -Wno-psabi -isystem 
qemu/src/linux-headers -isystem linux-headers -iquote . -iquote qemu/src 
-iquote qemu/src/include -iquote qemu/src/host/include/x86_64 -iquote 
qemu/src/host/include/generic -iquote qemu/src/tcg/i386 -Wno-unused-function 
-pthread -D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE 
-fno-strict-aliasing -fno-common -fwrapv -march=x86-64 -mtune=generic -O2 -pipe 
-fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security 
-fstack-clash-protection -fcf-protection -fPIC -DBUILD_DSO -MD -MQ 
libaudio-dbus.a.p/audio_dbusaudio.c.o -MF 
libaudio-dbus.a.p/audio_dbusaudio.c.o.d -o 
libaudio-dbus.a.p/audio_dbusaudio.c.o -c ../src/audio/dbusaudio.c
In file included from qemu/src/include/ui/console.h:4,
 from qemu/src/ui/dbus.h:31,
 from ../src/audio/dbusaudio.c:36:
qemu/src/include/ui/qemu-pixman.h:12:10: fatal error: pixman.h: No such file or 
directory
   12 | #include 
  |  ^~

Best regards,
Bernhard

> #include "ui/dbus-display1.h"
> 
> #define AUDIO_CAP "dbus"
>@@ -422,7 +423,6 @@ dbus_audio_fini(void *opaque)
> g_free(da);
> }
> 
>-#ifdef G_OS_UNIX
> static void
> listener_out_vanished_cb(GDBusConnection *connection,
>  gboolean remote_peer_vanished,
>@@ -448,7 +448,9 @@ listener_in_vanished_cb(GDBusConnection *connection,
> static 

Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-29 Thread Ani Sinha



> On 29-Jun-2023, at 9:27 PM, Ani Sinha  wrote:
> 
> 
> 
>> On 29-Jun-2023, at 9:02 PM, Michael S. Tsirkin  wrote:
>> 
>> On Thu, Jun 29, 2023 at 08:07:57PM +0530, Ani Sinha wrote:
>>> 
>>> 
 On 29-Jun-2023, at 7:54 PM, Michael S. Tsirkin  wrote:
 
 On Thu, Jun 29, 2023 at 09:37:07AM +0530, Ani Sinha wrote:
> PCI Express ports only have one slot, so PCI Express devices can only be
> plugged into slot 0 on a PCIE port. Enforce it.
> 
> The change has been tested to not break ARI by instantiating seven vfs on 
> an
> emulated igb device (the maximum number of vfs the linux igb driver 
> supports).
 
 I guess we need to test with some other device then? 7 VFs is same
 slot so hardly a good test.
>>> 
>>> No its not the same slot. Its using different slots/device numbers. I 
>>> checked that.
>>> The same patch was failing without the vf check.
>> 
>> Ah, playing with VF stride?

Indeed. You’ll see IGB_VF_STRIDE is 2. pcie_sriov_pf_init() uses this to 
initialise the PCIE config space attributes. register_vfs() uses this to 
increment the devfn values :-) 


>> Could you show the command line please?
> 
> Akhido mentioned this in the other thread. Basically For QEMU:
> 
> -device pcie-root-port,id=p -device igb,bus=p
> 
> Then from within the guest (in my case RHEL 9.2):
> 
> $ echo 7 > /sys/bus/pci/devices/\:01\:00.0/sriov_numvfs
> 
> You’ll find that if you use something more than 7 there will be ERANGE from 
> the guest kernel because the driver can create maximum 7 vfs.
> This above command line will fail if we do not check for !vfs in the patch 
> with the following error from QEMU:
> 
> (qemu) qemu-system-x86_64: PCI: slot 16 is not valid for igbvf, parent device 
> only allows plugging into slot 0.
> 
> and an IO error on the write from the guest kernel.
> 
> In the current version of the patch with the vf check, you will find the vfs 
> created with the addresses:
> 
> 01:10.{2,4,6,8} and 01.11.{2,4,6} , that is bus 1 for the root port, devices 
> 10 and 11, functions 2,4,6,8 etc.
> 
> There would be no error from QEMU.
> 
>> 
 
> The vfs are seen to have non-zero device/slot numbers in the conventional
> PCI BDF representation.
> 
> CC: jus...@redhat.com
> CC: imamm...@redhat.com
> CC: akihiko.od...@daynix.com
> 
> Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
> Signed-off-by: Ani Sinha 
> Reviewed-by: Julia Suvorova 
> ---
> hw/pci/pci.c | 15 +++
> 1 file changed, 15 insertions(+)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index e2eb4c3b4a..0320ac2bb3 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -65,6 +65,7 @@ bool pci_available = true;
> static char *pcibus_get_dev_path(DeviceState *dev);
> static char *pcibus_get_fw_dev_path(DeviceState *dev);
> static void pcibus_reset(BusState *qbus);
> +static bool pcie_has_upstream_port(PCIDevice *dev);
> 
> static Property pci_props[] = {
>   DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
> @@ -1190,6 +1191,20 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> *pci_dev,
>  name);
> 
>  return NULL;
> +} /*
> +   * With SRIOV and ARI, vfs can have non-zero slot in the 
> conventional
> +   * PCI interpretation as all five bits reserved for slot addresses 
> are
> +   * also used for function bits for the various vfs. Ignore that 
> case.
> +   * It is too early here to check for ARI capabilities in the PCI 
> config
> +   * space. Hence, we check for a vf device instead.
> +   */
> +else if (!pci_is_vf(pci_dev) &&
> + pcie_has_upstream_port(pci_dev) &&
> + PCI_SLOT(devfn)) {
> +error_setg(errp, "PCI: slot %d is not valid for %s,"
> +   " parent device only allows plugging into slot 0.",
> +   PCI_SLOT(devfn), name);
> +return NULL;
>   }
> 
>   pci_dev->devfn = devfn;
> -- 
> 2.39.1




[RESEND][PATCH v1 0/2] Add Virtio support to Xenpvh machine for arm

2023-06-29 Thread Vikram Garhwal
Hi,
We added virtio-mmio support to the xenpvh machine. It can now support up to
10 virtio-mmio devices.

I think none of the previous patches were delivered to the mailing list, so I am
resending this series.

Regards,
Vikram

Oleksandr Tyshchenko (2):
  xen_arm: Create virtio-mmio devices during initialization
  xen_arm: Initialize RAM and add hi/low memory regions

 hw/arm/xen_arm.c | 74 
 1 file changed, 74 insertions(+)

-- 
2.25.1




[RESEND][PATCH v1 2/2] xen_arm: Initialize RAM and add hi/low memory regions

2023-06-29 Thread Vikram Garhwal
From: Oleksandr Tyshchenko 

In order to use virtio backends we need to initialize RAM for the
xen-mapcache (which is responsible for mapping guest memory using foreign
mapping) to work. Calculate and add hi/low memory regions based on
machine->ram_size.

Use the constants defined in public header arch-arm.h to be aligned with the xen
toolstack.

While using this machine, the toolstack should then pass real ram_size using
"-m" arg. If "-m" is not given, create a QEMU machine without IOREQ, TPM and
VIRTIO to keep it usable for /etc/init.d/xencommons.

Signed-off-by: Oleksandr Tyshchenko 
Signed-off-by: Vikram Garhwal 
---
 hw/arm/xen_arm.c | 45 +
 1 file changed, 45 insertions(+)

diff --git a/hw/arm/xen_arm.c b/hw/arm/xen_arm.c
index c0a93f2c9d..cc4dffee70 100644
--- a/hw/arm/xen_arm.c
+++ b/hw/arm/xen_arm.c
@@ -60,6 +60,8 @@ struct XenArmState {
 } cfg;
 };
 
+static MemoryRegion ram_lo, ram_hi;
+
 #define VIRTIO_MMIO_DEV_SIZE   0x200
 
 #define NR_VIRTIO_MMIO_DEVICES   \
@@ -86,6 +88,39 @@ static void xen_create_virtio_mmio_devices(XenArmState *xam)
 }
 }
 
+static void xen_init_ram(MachineState *machine)
+{
+MemoryRegion *sysmem = get_system_memory();
+ram_addr_t block_len, ram_size[GUEST_RAM_BANKS];
+
+if (machine->ram_size <= GUEST_RAM0_SIZE) {
+ram_size[0] = machine->ram_size;
+ram_size[1] = 0;
+block_len = GUEST_RAM0_BASE + ram_size[0];
+} else {
+ram_size[0] = GUEST_RAM0_SIZE;
+ram_size[1] = machine->ram_size - GUEST_RAM0_SIZE;
+block_len = GUEST_RAM1_BASE + ram_size[1];
+}
+
+memory_region_init_ram(_memory, NULL, "xen.ram", block_len,
+   _fatal);
+
+memory_region_init_alias(_lo, NULL, "xen.ram.lo", _memory,
+ GUEST_RAM0_BASE, ram_size[0]);
+memory_region_add_subregion(sysmem, GUEST_RAM0_BASE, _lo);
+DPRINTF("Initialized region xen.ram.lo: base 0x%llx size 0x%lx\n",
+GUEST_RAM0_BASE, ram_size[0]);
+
+if (ram_size[1] > 0) {
+memory_region_init_alias(_hi, NULL, "xen.ram.hi", _memory,
+ GUEST_RAM1_BASE, ram_size[1]);
+memory_region_add_subregion(sysmem, GUEST_RAM1_BASE, _hi);
+DPRINTF("Initialized region xen.ram.hi: base 0x%llx size 0x%lx\n",
+GUEST_RAM1_BASE, ram_size[1]);
+}
+}
+
 void arch_handle_ioreq(XenIOState *state, ioreq_t *req)
 {
 hw_error("Invalid ioreq type 0x%x\n", req->type);
@@ -135,6 +170,14 @@ static void xen_arm_init(MachineState *machine)
 
 xam->state =  g_new0(XenIOState, 1);
 
+if (machine->ram_size == 0) {
+DPRINTF("ram_size not specified. QEMU machine will be started without"
+" TPM, IOREQ and Virtio-MMIO backends\n");
+return;
+}
+
+xen_init_ram(machine);
+
 xen_register_ioreq(xam->state, machine->smp.cpus, xen_memory_listener);
 
 xen_create_virtio_mmio_devices(xam);
@@ -182,6 +225,8 @@ static void xen_arm_machine_class_init(ObjectClass *oc, 
void *data)
 mc->init = xen_arm_init;
 mc->max_cpus = 1;
 mc->default_machine_opts = "accel=xen";
+/* Set explicitly here to make sure that real ram_size is passed */
+mc->default_ram_size = 0;
 
 printf("CHECK for NEW BUILD\n");
 #ifdef CONFIG_TPM
-- 
2.25.1




[RESEND][PATCH v1 1/2] xen_arm: Create virtio-mmio devices during initialization

2023-06-29 Thread Vikram Garhwal
From: Oleksandr Tyshchenko 

In order to use virtio backends we need to allocate virtio-mmio
parameters (irq and base) and register corresponding buses.

Use the constants defined in public header arch-arm.h to be
aligned with the toolstack. So the number of current supported
virtio-mmio devices is 10.

For the interrupts triggering use already existing on Arm
device-model hypercall.

The toolstack should then insert the same amount of device nodes
into guest device-tree.

Signed-off-by: Oleksandr Tyshchenko 
Signed-off-by: Vikram Garhwal 
---
 hw/arm/xen_arm.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/hw/arm/xen_arm.c b/hw/arm/xen_arm.c
index 60dcd1bcc7..c0a93f2c9d 100644
--- a/hw/arm/xen_arm.c
+++ b/hw/arm/xen_arm.c
@@ -26,6 +26,7 @@
 #include "qapi/qapi-commands-migration.h"
 #include "qapi/visitor.h"
 #include "hw/boards.h"
+#include "hw/irq.h"
 #include "hw/sysbus.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/tpm_backend.h"
@@ -59,6 +60,32 @@ struct XenArmState {
 } cfg;
 };
 
+#define VIRTIO_MMIO_DEV_SIZE   0x200
+
+#define NR_VIRTIO_MMIO_DEVICES   \
+   (GUEST_VIRTIO_MMIO_SPI_LAST - GUEST_VIRTIO_MMIO_SPI_FIRST)
+
+static void xen_set_irq(void *opaque, int irq, int level)
+{
+xendevicemodel_set_irq_level(xen_dmod, xen_domid, irq, level);
+}
+
+static void xen_create_virtio_mmio_devices(XenArmState *xam)
+{
+int i;
+
+for (i = 0; i < NR_VIRTIO_MMIO_DEVICES; i++) {
+hwaddr base = GUEST_VIRTIO_MMIO_BASE + i * VIRTIO_MMIO_DEV_SIZE;
+qemu_irq irq = qemu_allocate_irq(xen_set_irq, NULL,
+ GUEST_VIRTIO_MMIO_SPI_FIRST + i);
+
+sysbus_create_simple("virtio-mmio", base, irq);
+
+DPRINTF("Created virtio-mmio device %d: irq %d base 0x%lx\n",
+i, GUEST_VIRTIO_MMIO_SPI_FIRST + i, base);
+}
+}
+
 void arch_handle_ioreq(XenIOState *state, ioreq_t *req)
 {
 hw_error("Invalid ioreq type 0x%x\n", req->type);
@@ -110,6 +137,8 @@ static void xen_arm_init(MachineState *machine)
 
 xen_register_ioreq(xam->state, machine->smp.cpus, xen_memory_listener);
 
+xen_create_virtio_mmio_devices(xam);
+
 #ifdef CONFIG_TPM
 if (xam->cfg.tpm_base_addr) {
 xen_enable_tpm(xam);
-- 
2.25.1




Re: [PATCH v4 5/5] vfio/migration: Refactor and fix print of "Migration disabled"

2023-06-29 Thread Cédric Le Goater

Hello Zhenzhong,

On 6/29/23 10:40, Zhenzhong Duan wrote:

This patch refactors vfio_migration_realize() and its dependent code
as follows:

1. It's redundant in vfio_migration_realize() to register multiple blockers,
e.g: vIOMMU blocker can be refactored as per device blocker.
2. Change vfio_viommu_preset() to be only a per device checker.
3. Remove global vIOMMU blocker related stuff, e.g:
giommu_migration_blocker, vfio_[block|unblock]_giommu_migration()
and vfio_migration_finalize()
4. Change vfio_migration_realize(), vfio_block_multiple_devices_migration()
vfio_block_migration() and vfio_viommu_preset() to return bool type.
5. Print "Migration disabled" depending on enable_migration property
and print it as warning instead of error which is overkill.



We are close to soft freeze and these combo patches adding various fixes
all at once are difficult to evaluate.

Please split this patch into multiple ones to ease the review.  Maybe
start with the int -> bool conversion of the return values. It should
remove some noise.

Thanks,

C.


migrate_add_blocker() returns 0 when successfully adding the migration blocker.
However, the caller of vfio_migration_realize() considers that migration was
blocked when the latter returned an error. What matters for migration is that
the blocker is added in core migration, so this cleans up usability such that
user sees "Migrate disabled" when any of the vfio migration blockers are active
and it's not intentionally forced by user with enable-migration=off.

Signed-off-by: Zhenzhong Duan 
---
  hw/vfio/common.c  | 66 +++
  hw/vfio/migration.c   | 30 +---
  hw/vfio/pci.c |  4 +--
  include/hw/vfio/vfio-common.h |  7 ++--
  4 files changed, 36 insertions(+), 71 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 77e2ee0e5c6e..c80ecb1da53f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -362,7 +362,6 @@ bool vfio_mig_active(void)
  }
  
  static Error *multiple_devices_migration_blocker;

-static Error *giommu_migration_blocker;
  
  static unsigned int vfio_migratable_device_num(void)

  {
@@ -381,19 +380,19 @@ static unsigned int vfio_migratable_device_num(void)
  return device_num;
  }
  
-int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)

+bool vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
  {
  int ret;
  
  if (multiple_devices_migration_blocker ||

  vfio_migratable_device_num() <= 1) {
-return 0;
+return true;
  }
  
  if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {

  error_setg(errp, "Migration is currently not supported with multiple "
   "VFIO devices");
-return -EINVAL;
+return false;
  }
  
  error_setg(_devices_migration_blocker,

@@ -403,9 +402,15 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
  if (ret < 0) {
  error_free(multiple_devices_migration_blocker);
  multiple_devices_migration_blocker = NULL;
+} else {
+/*
+ * Only ON_OFF_AUTO_AUTO case, ON_OFF_AUTO_OFF is checked
+ * in vfio_migration_realize().
+ */
+warn_report("Migration disabled, not support multiple VFIO devices");
  }
  
-return ret;

+return !ret;
  }
  
  void vfio_unblock_multiple_devices_migration(void)

@@ -420,55 +425,10 @@ void vfio_unblock_multiple_devices_migration(void)
  multiple_devices_migration_blocker = NULL;
  }
  
-static bool vfio_viommu_preset(void)

+/* Block migration with a vIOMMU */
+bool vfio_viommu_preset(VFIODevice *vbasedev)
  {
-VFIOAddressSpace *space;
-
-QLIST_FOREACH(space, _address_spaces, list) {
-if (space->as != _space_memory) {
-return true;
-}
-}
-
-return false;
-}
-
-int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
-{
-int ret;
-
-if (giommu_migration_blocker ||
-!vfio_viommu_preset()) {
-return 0;
-}
-
-if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
-error_setg(errp,
-   "Migration is currently not supported with vIOMMU enabled");
-return -EINVAL;
-}
-
-error_setg(_migration_blocker,
-   "Migration is currently not supported with vIOMMU enabled");
-ret = migrate_add_blocker(giommu_migration_blocker, errp);
-if (ret < 0) {
-error_free(giommu_migration_blocker);
-giommu_migration_blocker = NULL;
-}
-
-return ret;
-}
-
-void vfio_migration_finalize(void)
-{
-if (!giommu_migration_blocker ||
-vfio_viommu_preset()) {
-return;
-}
-
-migrate_del_blocker(giommu_migration_blocker);
-error_free(giommu_migration_blocker);
-giommu_migration_blocker = NULL;
+return vbasedev->group->container->space->as != _space_memory;
  }
  
  static void 

Re: [PATCH v2 14/16] hw/pci-host/i440fx: Resolve i440fx_init()

2023-06-29 Thread Bernhard Beschow



Am 29. Juni 2023 07:50:10 UTC schrieb "Philippe Mathieu-Daudé" 
:
>Hi Bernhard,

Hi Phil,

>
>On 28/6/23 21:52, Bernhard Beschow wrote:
>> i440fx_init() is a legacy init function. The previous patches worked towards
>> TYPE_I440FX_PCI_HOST_BRIDGE to be instantiated the QOM way. Do this now by
>> transforming the parameters passed to i440fx_init() into property 
>> assignments.
>> 
>> Signed-off-by: Bernhard Beschow 
>> ---
>>   include/hw/pci-host/i440fx.h | 10 --
>>   hw/i386/pc_piix.c| 30 +-
>>   hw/pci-host/i440fx.c | 34 +-
>>   3 files changed, 26 insertions(+), 48 deletions(-)
>> 
>> diff --git a/include/hw/pci-host/i440fx.h b/include/hw/pci-host/i440fx.h
>> index 2d7bae5a45..c988f70890 100644
>> --- a/include/hw/pci-host/i440fx.h
>> +++ b/include/hw/pci-host/i440fx.h
>> @@ -34,14 +34,4 @@ struct PCII440FXState {
>> #define TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE "igd-passthrough-i440FX"
>>   -PCIBus *i440fx_init(const char *pci_type,
>> -DeviceState *dev,
>> -MemoryRegion *address_space_mem,
>> -MemoryRegion *address_space_io,
>> -ram_addr_t below_4g_mem_size,
>> -ram_addr_t above_4g_mem_size,
>> -MemoryRegion *pci_memory,
>> -MemoryRegion *ram_memory);
>> -
>> -
>>   #endif
>> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
>> index 87bee368fc..1df309b8e2 100644
>> --- a/hw/i386/pc_piix.c
>> +++ b/hw/i386/pc_piix.c
>> @@ -126,7 +126,7 @@ static void pc_init1(MachineState *machine,
>>   MemoryRegion *rom_memory;
>>   ram_addr_t lowmem;
>>   uint64_t hole64_size;
>> -DeviceState *i440fx_host;
>> +Object *i440fx_host;
>> /*
>>* Calculate ram split, for memory below and above 4G.  It's a bit
>> @@ -201,8 +201,8 @@ static void pc_init1(MachineState *machine,
>>   pci_memory = g_new(MemoryRegion, 1);
>>   memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
>>   rom_memory = pci_memory;
>> -i440fx_host = qdev_new(host_type);
>> -hole64_size = object_property_get_uint(OBJECT(i440fx_host),
>> +i440fx_host = OBJECT(qdev_new(host_type));
>
>[*]
>
>> +hole64_size = object_property_get_uint(i440fx_host,
>>  
>> PCI_HOST_PROP_PCI_HOLE64_SIZE,
>>  _abort);
>>   } else {
>> @@ -243,12 +243,24 @@ static void pc_init1(MachineState *machine,
>>   PIIX3State *piix3;
>>   PCIDevice *pci_dev;
>>   -pci_bus = i440fx_init(pci_type,
>> -  i440fx_host,
>> -  system_memory, system_io,
>> -  x86ms->below_4g_mem_size,
>> -  x86ms->above_4g_mem_size,
>> -  pci_memory, ram_memory);
>> +object_property_add_child(OBJECT(machine), "i440fx", i440fx_host);
>
>I'd keep the object_property_add_child() close to qdev_new() in [*].
>Matter of taste...

Okay. I'd add a dedicated patch before this one since it has value on its own
(removal of qdev_get_machine() usage *and* doing what you propose).

Best regards,
Bernhard

>
>> +object_property_set_link(i440fx_host, PCI_HOST_PROP_RAM_MEM,
>> + OBJECT(ram_memory), _fatal);
>> +object_property_set_link(i440fx_host, PCI_HOST_PROP_PCI_MEM,
>> + OBJECT(pci_memory), _fatal);
>> +object_property_set_link(i440fx_host, PCI_HOST_PROP_SYSTEM_MEM,
>> + OBJECT(system_memory), _fatal);
>> +object_property_set_link(i440fx_host, PCI_HOST_PROP_IO_MEM,
>> + OBJECT(system_io), _fatal);
>> +object_property_set_uint(i440fx_host, PCI_HOST_BELOW_4G_MEM_SIZE,
>> + x86ms->below_4g_mem_size, _fatal);
>> +object_property_set_uint(i440fx_host, PCI_HOST_ABOVE_4G_MEM_SIZE,
>> + x86ms->above_4g_mem_size, _fatal);
>> +object_property_set_str(i440fx_host, I440FX_HOST_PROP_PCI_TYPE,
>> +pci_type, _fatal);
>> +sysbus_realize_and_unref(SYS_BUS_DEVICE(i440fx_host), _fatal);
>> +
>> +pci_bus = PCI_BUS(qdev_get_child_bus(DEVICE(i440fx_host), "pci.0"));
>>   pci_bus_map_irqs(pci_bus,
>>xen_enabled() ? xen_pci_slot_get_pirq
>>  : pc_pci_slot_get_pirq);
>> diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
>> index e8e66afc11..62d6287681 100644
>> --- a/hw/pci-host/i440fx.c
>> +++ b/hw/pci-host/i440fx.c
>> @@ -249,9 +249,14 @@ static void i440fx_pcihost_initfn(Object *obj)
>> static void i440fx_pcihost_realize(DeviceState *dev, Error **errp)
>>   {
>> +

Re: [QEMU PATCH 1/1] virtgpu: do not destroy resources when guest suspend

2023-06-29 Thread Kim, Dongwon



On 6/21/2023 4:14 AM, Robert Beckett wrote:


On 21/06/2023 09:39, Gerd Hoffmann wrote:

On Tue, Jun 20, 2023 at 01:26:15PM +0100, Robert Beckett wrote:

On 20/06/2023 10:41, Gerd Hoffmann wrote:

    Hi,


The guest driver should be able to restore resources after resume.

Thank you for your suggestion!
As far as I know, resources are created on host side and guest has 
no backup, if resources are destroyed, guest can't restore them.
Or do you mean guest driver need to send commands to re-create 
resources after resume?

The latter.  The guest driver knows which resources it has created,
it can restore them after suspend.

Are you sure that this is viable?

How would you propose that a guest kernel could reproduce a resource,
including pixel data upload during a resume?

The kernel would not have any of the pixel data to transfer to host.

Depends on the resource type.  For resources which are created by
uploading pixel data (using VIRTIO_GPU_CMD_TRANSFER_TO_HOST_*) a guest
mirror exists which can be used for re-upload.


unfortunately this is not always the case.

https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/gallium/drivers/virgl/virgl_resource.c#L668 



Often mesa will decide that it won't need to access a resource again 
after initial upload (textures etc). In this case, if it is able to 
copy back from host if needed, it will not maintain the guest shadow 
copy. Instead it will create a single page proxy object. The transfer 
to host will then over fill it to the correct size.


I think this was a fairly huge optimization for them.

I have only been focused on scanout blobs, so I didn't think too much about
all virgl objects, but aren't all virtio-gpu objects
maintained until they are removed by the driver, regardless of the type
of data they contain? Does Mesa (virgl) remove those objects after they
are uploaded to the host?




For resources filled by gl rendering ops this is indeed not the case.

Could you explain how you anticipate the guest being able to 
reproduce the

resources please?

Same you do on physical hardware?  Suspend can poweroff your PCI
devices, so there must be some standard way to handle that situation
for resources stored in gpu device memory, which is very similar to
the problem we have here.


In traditional PCI gfx card setups, TTM is used as the memory manager 
in the kernel. This is used to migrate the buffers back from VRAM to 
system pages during a suspend.


This would be suitable for use to track host blob buffers that get 
mapped to guest via the PCI BAR, though would be a significant 
re-architecting of virtio gpu driver.


It would not help with the previously mentioned proxied resources. 
Though in theory the driver could read the resources back from host to 
guest pages during suspend, this would then be potentially complicated 
by suspend time alloc failures etc.



As virtio drivers are by design paravirt drivers, I think it is
reasonable to accept some knowledge of and cooperation with the host
to manage suspend/resume.


It seems to me like a lot of effort and long term maintenance to add 
support for transparent suspend/resume that would otherwise be unneeded.


Perhaps others have alternative designs for this?



take care,
   Gerd







Re: [PATCH 1/1] pcie: Add hotplug detect state register to w1cmask

2023-06-29 Thread Peter Xu
Hi, Leo,

Thanks for figuring this out.  Let me copy a few more potential reviewers
from commit 17858a1695 ("hw/acpi/ich9: Set ACPI PCI hot-plug as default on
Q35").

On Thu, Jun 29, 2023 at 06:05:00AM -0300, Leonardo Bras wrote:
> When trying to migrate a machine type pc-q35-6.0 or lower, with this
> cmdline options:
> 
> -device 
> driver=pcie-root-port,port=18,chassis=19,id=pcie-root-port18,bus=pcie.0,addr=0x12
>  \
> -device 
> driver=nec-usb-xhci,p2=4,p3=4,id=nex-usb-xhci0,bus=pcie-root-port18,addr=0x12.0x1
> 
> the following bug happens after all ram pages were sent:
> 
> qemu-kvm: get_pci_config_device: Bad config data: i=0x6e read: 0 device: 40 
> cmask: ff wmask: 0 w1cmask:19
> qemu-kvm: Failed to load PCIDevice:config
> qemu-kvm: Failed to load pcie-root-port:parent_obj.parent_obj.parent_obj
> qemu-kvm: error while loading state for instance 0x0 of device 
> '0000:00:12.0/pcie-root-port'
> qemu-kvm: load of migration failed: Invalid argument
> 
> This happens on pc-q35-6.0 or lower because of:
> { "ICH9-LPC", ACPI_PM_PROP_ACPI_PCIHP_BRIDGE, "off" }
> 
> In this scenario, hotplug_handler_plug() calls pcie_cap_slot_plug_cb(),
> which sets the bus dev->config byte 0x6e with bit PCI_EXP_SLTSTA_PDS to 
> signal PCI hotplug for the guest. After a while the guest will deal with
> this hotplug and qemu will clear the above bit.
> 
> Then, during migration, get_pci_config_device() will compare the
> configs of both the freshly created device and the one that is being
> received via migration, which will differ due to the PCI_EXP_SLTSTA_PDS bit
> and cause the bug to reproduce.
> 
> To avoid this fake incompatibility, there are two fields in PCIDevice that
> can help:
> 
> .wmask: Used to implement R/W bytes, and
> .w1cmask: Used to implement RW1C(Write 1 to Clear) bytes

Is there one more option to clear the bit in cmask?

IIUC w1cmask means the guest can now write to this bit, but afaiu from the
pcie spec it's RO.

> 
> According to pcie_cap_slot_init() the slot status register
> (PCI_EXP_SLTSTA), in which PCI_EXP_SLTSTA_PDS is a flag, seems to fall
> under w1cmask field, which makes sense due to the way signaling the hotplug
> works.
> 
> So, add PCI_EXP_SLTSTA_PDS bit to w1cmask, so the fake incompatibility on
> get_pci_config_device() does not abort the migration.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2215819
> Signed-off-by: Leonardo Bras 

Do we need a Fixes: and also the need to copy stable?

> ---
>  hw/pci/pcie.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index b8c24cf45f..2def1765a5 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -657,7 +657,7 @@ void pcie_cap_slot_init(PCIDevice *dev, PCIESlot *s)
> PCI_EXP_SLTCTL_EIC);
>  
>  pci_word_test_and_set_mask(dev->w1cmask + pos + PCI_EXP_SLTSTA,
> -   PCI_EXP_HP_EV_SUPPORTED);
> +   PCI_EXP_HP_EV_SUPPORTED | PCI_EXP_SLTSTA_PDS);
>  
>  dev->exp.hpev_notified = false;
>  
> -- 
> 2.41.0
> 

-- 
Peter Xu




Re: [QEMU PATCH 1/1] virtgpu: do not destroy resources when guest suspend

2023-06-29 Thread Kim, Dongwon
This method - letting QEMU not remove resources - would work in the S3 case,
but with S4, QEMU would lose all the resources anyway as the process
will be terminated. So restoring objects was the only option for us, as


in [RFC PATCH 2/2] drm/virtio: restore virtio_gpu_objects upon suspend 
and resume (lists.freedesktop.org) 



But I only considered and tested cases with scanout blob resources, so 
this may not cover other resource types...


On 6/7/2023 7:56 PM, Jiqian Chen wrote:

After suspending and resuming guest VM, you will get
a black screen, and the display can't come back.

This is because when guest did suspending, it called
into qemu to call virtio_gpu_gl_reset. In function
virtio_gpu_gl_reset, it destroyed resources and reset
renderer, which were used for display. As a result,
guest's screen can't come back to the time when it was
suspended and only showed black.

So, this patch adds a new ctrl message
VIRTIO_GPU_CMD_STATUS_FREEZING to get notification from
guest. If guest is during suspending, it sets freezing
status of virtgpu to true, this will prevent destroying
resources and resetting renderer when guest calls into
virtio_gpu_gl_reset. If guest is during resuming, it sets
freezing to false, and then virtio_gpu_gl_reset will keep
its original actions and has no other impact.

Signed-off-by: Jiqian Chen 
---
  hw/display/virtio-gpu-gl.c  |  9 ++-
  hw/display/virtio-gpu-virgl.c   |  3 +++
  hw/display/virtio-gpu.c | 26 +++--
  include/hw/virtio/virtio-gpu.h  |  3 +++
  include/standard-headers/linux/virtio_gpu.h |  9 +++
  5 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/hw/display/virtio-gpu-gl.c b/hw/display/virtio-gpu-gl.c
index e06be60dfb..e11ad233eb 100644
--- a/hw/display/virtio-gpu-gl.c
+++ b/hw/display/virtio-gpu-gl.c
@@ -100,7 +100,14 @@ static void virtio_gpu_gl_reset(VirtIODevice *vdev)
   */
  if (gl->renderer_inited && !gl->renderer_reset) {
  virtio_gpu_virgl_reset_scanout(g);
-gl->renderer_reset = true;
+/*
+ * If guest is suspending, we shouldn't reset renderer,
+ * otherwise, the display can't come back to the time when
+ * it was suspended after guest resumed.
+ */
+if (!g->freezing) {
+gl->renderer_reset = true;
+}
  }
  }
  
diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c

index 73cb92c8d5..183ec92d53 100644
--- a/hw/display/virtio-gpu-virgl.c
+++ b/hw/display/virtio-gpu-virgl.c
@@ -464,6 +464,9 @@ void virtio_gpu_virgl_process_cmd(VirtIOGPU *g,
  case VIRTIO_GPU_CMD_GET_EDID:
  virtio_gpu_get_edid(g, cmd);
  break;
+case VIRTIO_GPU_CMD_STATUS_FREEZING:
+virtio_gpu_cmd_status_freezing(g, cmd);
+break;
  default:
  cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC;
  break;
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 5e15c79b94..8f235d7848 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -373,6 +373,16 @@ static void virtio_gpu_resource_create_blob(VirtIOGPU *g,
  QTAILQ_INSERT_HEAD(>reslist, res, next);
  }
  
+void virtio_gpu_cmd_status_freezing(VirtIOGPU *g,

+ struct virtio_gpu_ctrl_command *cmd)
+{
+struct virtio_gpu_status_freezing sf;
+
+VIRTIO_GPU_FILL_CMD(sf);
+virtio_gpu_bswap_32(, sizeof(sf));
+g->freezing = sf.freezing;
+}
+
  static void virtio_gpu_disable_scanout(VirtIOGPU *g, int scanout_id)
  {
  struct virtio_gpu_scanout *scanout = >parent_obj.scanout[scanout_id];
@@ -986,6 +996,9 @@ void virtio_gpu_simple_process_cmd(VirtIOGPU *g,
  case VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING:
  virtio_gpu_resource_detach_backing(g, cmd);
  break;
+case VIRTIO_GPU_CMD_STATUS_FREEZING:
+virtio_gpu_cmd_status_freezing(g, cmd);
+break;
  default:
  cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC;
  break;
@@ -1344,6 +1357,8 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error 
**errp)
  QTAILQ_INIT(>reslist);
  QTAILQ_INIT(>cmdq);
  QTAILQ_INIT(>fenceq);
+
+g->freezing = false;
  }
  
  void virtio_gpu_reset(VirtIODevice *vdev)

@@ -1352,8 +1367,15 @@ void virtio_gpu_reset(VirtIODevice *vdev)
  struct virtio_gpu_simple_resource *res, *tmp;
  struct virtio_gpu_ctrl_command *cmd;
  
-QTAILQ_FOREACH_SAFE(res, >reslist, next, tmp) {

-virtio_gpu_resource_destroy(g, res);
+/*
+ * If guest is suspending, we shouldn't destroy resources,
+ * otherwise, the display can't come back to the time when
+ * it was suspended after guest resumed.
+ */
+if (!g->freezing) {
+QTAILQ_FOREACH_SAFE(res, >reslist, next, tmp) {
+virtio_gpu_resource_destroy(g, res);
+}
  }
  
  while (!QTAILQ_EMPTY(>cmdq)) {

diff 

Re: [PATCH 1/2] target/riscv: Check for CF_PARALLEL instead of qemu_tcg_mttcg_enabled

2023-06-29 Thread Alex Bennée


Philippe Mathieu-Daudé  writes:

> A CPU knows whether MTTCG is enabled or not because it is
> reflected in its TCG flags via the CF_PARALLEL bit.
>
> Suggested-by: Alex Bennée 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  target/riscv/cpu.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 4035fe0e62..4dfa64af6a 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -473,7 +473,7 @@ static void rv64_veyron_v1_cpu_init(Object *obj)
>  
>  static void rv128_base_cpu_init(Object *obj)
>  {
> -if (qemu_tcg_mttcg_enabled()) {
> +if (CPU(obj)->tcg_cflags & CF_PARALLEL) {

Hmm have you checked that tcg_cpu_init_cflags() has executed by this point?

>  /* Missing 128-bit aligned atomics */
>  error_report("128-bit RISC-V currently does not work with Multi "
>   "Threaded TCG. Please use: -accel tcg,thread=single");

Not that we can do anything about it but in linux-user we start with
CF_PARALLEL unset and only set it at the point we spawn a new thread.

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [PATCH v6 09/15] target/riscv: Add Zvkned ISA extension support

2023-06-29 Thread Richard Henderson

On 6/29/23 17:10, Max Chou wrote:

On 2023/6/28 5:07 PM, Richard Henderson wrote:

You can eliminate the vstart % EGS test, and the vstart < vl test, when 
VSTART_EQ_ZERO.
You can eliminate the vl % EGS test when VL_EQ_VLMAX.

You could move all of these tests out of line, into a helper_foo_chk() function which 
performs the checks and then calls helper_foo().

Hi Richard

Thank you for the suggestion.
I'll provide the v7 patch set with this suggestion.

But I have a question about the vstart < vl test.
I think that we can't eliminate the vstart < vl test when both the vstart and vl are equal 
to zero.

Although this situation means that the instructions will do nothing.


We know vlmax != 0 (there's some architectural minimum).
Therefore if VL_EQ_VLMAX, vl != 0 and if VSTART_EQ_ZERO, then vstart < vl.


r~



Re: [PATCH v2 04/12] hw/ssi: Add an "addr" property to SSIPeripheral

2023-06-29 Thread Cédric Le Goater

On 6/29/23 12:56, Philippe Mathieu-Daudé wrote:

On 7/6/23 16:15, Cédric Le Goater wrote:

On 6/7/23 10:28, Philippe Mathieu-Daudé wrote:

On 7/6/23 10:06, Joel Stanley wrote:

On Wed, 7 Jun 2023 at 04:40, Cédric Le Goater  wrote:


Boards will use this new property to identify the device CS line and
wire the SPI controllers accordingly.


"addr" and not "cs" or even "chip-select"?


"chip-select" is a good suggestion!


I thought of using "cs" initially as it makes more sense for SPI
controllers, I do agree. But then, I tried to be consistent with
what QEMU is proposing today : "bus" and "addr".


We should use a description that stays close with the terms used
by the hardware we model. In that case "cs" seems more appropriate.


OK. I can change the property to "cs".

Thanks,

C.




Re: [PATCH v2 05/12] hw/ssi: Introduce a ssi_get_cs() helper

2023-06-29 Thread Cédric Le Goater

On 6/29/23 13:09, Philippe Mathieu-Daudé wrote:

On 7/6/23 06:39, Cédric Le Goater wrote:

Simple routine to retrieve a DeviceState object on a SPI bus using its
address/cs. It will be useful for the board to wire the CS lines.

Cc: Alistair Francis 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Cédric Le Goater 
---
  include/hw/ssi/ssi.h |  2 ++
  hw/ssi/ssi.c | 15 +++
  2 files changed, 17 insertions(+)

diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h
index 9e0706a5248c..01662521b09a 100644
--- a/include/hw/ssi/ssi.h
+++ b/include/hw/ssi/ssi.h
@@ -112,4 +112,6 @@ SSIBus *ssi_create_bus(DeviceState *parent, const char 
*name);
  uint32_t ssi_transfer(SSIBus *bus, uint32_t val);
+DeviceState *ssi_get_cs(SSIBus *bus, uint8_t addr);


Revisiting this patch, I now think this should be:

   qemu_irq ssi_get_cs(SSIBus *bus, uint8_t chipselect);


The device is needed for some other use. See :

  [PATCH v2 10/12] aspeed: Get the BlockBackend of FMC0 from the flash device

C.





  #endif
diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c
index d4409535429c..7c71fce0db90 100644
--- a/hw/ssi/ssi.c
+++ b/hw/ssi/ssi.c
@@ -27,6 +27,21 @@ struct SSIBus {
  #define TYPE_SSI_BUS "SSI"
  OBJECT_DECLARE_SIMPLE_TYPE(SSIBus, SSI_BUS)
+DeviceState *ssi_get_cs(SSIBus *bus, uint8_t addr)
+{
+    BusState *b = BUS(bus);
+    BusChild *kid;
+
+    QTAILQ_FOREACH(kid, >children, sibling) {
+    SSIPeripheral *kid_ssi = SSI_PERIPHERAL(kid->child);
+    if (kid_ssi->addr == addr) {
+    return kid->child;


and:

    return qdev_get_gpio_in_named(kid->child,
  SSI_GPIO_CS, 0);


+    }
+    }
+
+    return NULL;
+}
+
  static const TypeInfo ssi_bus_info = {
  .name = TYPE_SSI_BUS,
  .parent = TYPE_BUS,







Re: [PATCH v3 05/37] crypto: Add aesenc_SB_SR_AK

2023-06-29 Thread Richard Henderson

On 6/29/23 17:45, Max Chou wrote:

On 2023/6/20 7:07 PM, Richard Henderson wrote:


diff --git a/include/crypto/aes-round.h b/include/crypto/aes-round.h
new file mode 100644
index 00..d675d2468f
--- /dev/null
+++ b/include/crypto/aes-round.h
@@ -0,0 +1,44 @@
+/*
+ * AES round fragments, generic version
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef CRYPTO_AES_ROUND_H
+#define CRYPTO_AES_ROUND_H
+
+/* Hosts with acceleration will usually need a 16-byte vector type. */
+typedef uint8_t AESStateVec __attribute__((vector_size(16)));
+
+typedef union {
+    uint8_t b[16];
+    uint32_t w[4];
+    uint64_t d[4];
+    AESStateVec v;
+} AESState;

Should we change the length of d from 4 to 2 ?


Yes, definitely a typo.

r~




Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-29 Thread Ani Sinha



> On 29-Jun-2023, at 9:02 PM, Michael S. Tsirkin  wrote:
> 
> On Thu, Jun 29, 2023 at 08:07:57PM +0530, Ani Sinha wrote:
>> 
>> 
>>> On 29-Jun-2023, at 7:54 PM, Michael S. Tsirkin  wrote:
>>> 
>>> On Thu, Jun 29, 2023 at 09:37:07AM +0530, Ani Sinha wrote:
 PCI Express ports only have one slot, so PCI Express devices can only be
 plugged into slot 0 on a PCIE port. Enforce it.
 
 The change has been tested to not break ARI by instantiating seven vfs on 
 an
 emulated igb device (the maximum number of vfs the linux igb driver 
 supports).
>>> 
>>> I guess we need to test with some other device then? 7 VFs is same
>>> slot so hardly a good test.
>> 
>> No, it's not the same slot. It's using different slots/device numbers. I 
>> checked that.
>> The same patch was failing without the vf check.
> 
> Ah, playing with VF stride? Could you show the command line please?

Akihiko mentioned this in the other thread. Basically, for QEMU:

-device pcie-root-port,id=p -device igb,bus=p

Then from within the guest (in my case RHEL 9.2):

$ echo 7 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs

You’ll find that if you use something more than 7 there will be ERANGE from the 
guest kernel because the driver can create a maximum of 7 vfs.
This above command line will fail if we do not check for !vfs in the patch with 
the following error from QEMU:

(qemu) qemu-system-x86_64: PCI: slot 16 is not valid for igbvf, parent device 
only allows plugging into slot 0.

and an IO error on the write from the guest kernel.

In the current version of the patch with the vf check, you will find the vfs 
created with the addresses:

01:10.{2,4,6,8} and 01.11.{2,4,6} , that is bus 1 for the root port, devices 10 
and 11, functions 2,4,6,8 etc.

There would be no error from QEMU.

> 
>>> 
 The vfs are seen to have non-zero device/slot numbers in the conventional
 PCI BDF representation.
 
 CC: jus...@redhat.com
 CC: imamm...@redhat.com
 CC: akihiko.od...@daynix.com
 
 Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
 Signed-off-by: Ani Sinha 
 Reviewed-by: Julia Suvorova 
 ---
 hw/pci/pci.c | 15 +++
 1 file changed, 15 insertions(+)
 
 diff --git a/hw/pci/pci.c b/hw/pci/pci.c
 index e2eb4c3b4a..0320ac2bb3 100644
 --- a/hw/pci/pci.c
 +++ b/hw/pci/pci.c
 @@ -65,6 +65,7 @@ bool pci_available = true;
 static char *pcibus_get_dev_path(DeviceState *dev);
 static char *pcibus_get_fw_dev_path(DeviceState *dev);
 static void pcibus_reset(BusState *qbus);
 +static bool pcie_has_upstream_port(PCIDevice *dev);
 
 static Property pci_props[] = {
DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
 @@ -1190,6 +1191,20 @@ static PCIDevice *do_pci_register_device(PCIDevice 
 *pci_dev,
   name);
 
   return NULL;
 +} /*
 +   * With SRIOV and ARI, vfs can have non-zero slot in the 
 conventional
 +   * PCI interpretation as all five bits reserved for slot addresses 
 are
 +   * also used for function bits for the various vfs. Ignore that 
 case.
 +   * It is too early here to check for ARI capabilities in the PCI 
 config
 +   * space. Hence, we check for a vf device instead.
 +   */
 +else if (!pci_is_vf(pci_dev) &&
 + pcie_has_upstream_port(pci_dev) &&
 + PCI_SLOT(devfn)) {
 +error_setg(errp, "PCI: slot %d is not valid for %s,"
 +   " parent device only allows plugging into slot 0.",
 +   PCI_SLOT(devfn), name);
 +return NULL;
}
 
pci_dev->devfn = devfn;
 -- 
 2.39.1




Re: [PATCH v3 05/37] crypto: Add aesenc_SB_SR_AK

2023-06-29 Thread Max Chou

On 2023/6/20 7:07 PM, Richard Henderson wrote:


diff --git a/include/crypto/aes-round.h b/include/crypto/aes-round.h
new file mode 100644
index 00..d675d2468f
--- /dev/null
+++ b/include/crypto/aes-round.h
@@ -0,0 +1,44 @@
+/*
+ * AES round fragments, generic version
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2023 Linaro, Ltd.
+ */
+
+#ifndef CRYPTO_AES_ROUND_H
+#define CRYPTO_AES_ROUND_H
+
+/* Hosts with acceleration will usually need a 16-byte vector type. */
+typedef uint8_t AESStateVec __attribute__((vector_size(16)));
+
+typedef union {
+uint8_t b[16];
+uint32_t w[4];
+uint64_t d[4];
+AESStateVec v;
+} AESState;

Should we change the length of d from 4 to 2 ?

With regards,
Max



Re: [PATCH v4 5/5] vfio/migration: Refactor and fix print of "Migration disabled"

2023-06-29 Thread Joao Martins
On 29/06/2023 16:20, Avihai Horon wrote:
> On 29/06/2023 15:44, Joao Martins wrote:
>> On 29/06/2023 09:40, Zhenzhong Duan wrote:
>>> This patch refactors vfio_migration_realize() and its dependent code
>>> as follows:
>>>
>>> 1. It's redundant in vfio_migration_realize() to register multiple 
>>> blockers,
>>>     e.g: vIOMMU blocker can be refactored as per device blocker.
>>> 2. Change vfio_viommu_preset() to be only a per device checker.
>>> 3. Remove global vIOMMU blocker related stuff, e.g:
>>>     giommu_migration_blocker, vfio_[block|unblock]_giommu_migration()
>>>     and vfio_migration_finalize()
>>> 4. Change vfio_migration_realize(), vfio_block_multiple_devices_migration()
>>>     vfio_block_migration() and vfio_viommu_preset() to return bool type.
>>> 5. Print "Migration disabled" depending on enable_migration property
>>>     and print it as warning instead of error which is overkill.
>>>
>> I am not entirely sure we need to keep "Migration disabled". Perhaps we 
>> should
>> just derisk from error to warning and always use the same error messages.
>>
>>> migrate_add_blocker() returns 0 when successfully adding the migration 
>>> blocker.
>>> However, the caller of vfio_migration_realize() considers that migration was
>>> blocked when the latter returned an error. What matters for migration is 
>>> that
>>> the blocker is added in core migration, so this cleans up usability such 
>>> that
>>> user sees "Migrate disabled" when any of the vfio migration blockers are 
>>> active
>>> and it's not intentionally forced by user with enable-migration=off.
>>>
>>> Signed-off-by: Zhenzhong Duan 
>>> ---
>>>   hw/vfio/common.c  | 66 +++
>>>   hw/vfio/migration.c   | 30 +---
>>>   hw/vfio/pci.c |  4 +--
>>>   include/hw/vfio/vfio-common.h |  7 ++--
>>>   4 files changed, 36 insertions(+), 71 deletions(-)
>>>
>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>> index 77e2ee0e5c6e..c80ecb1da53f 100644
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -362,7 +362,6 @@ bool vfio_mig_active(void)
>>>   }
>>>
>>>   static Error *multiple_devices_migration_blocker;
>>> -static Error *giommu_migration_blocker;
>>>
>>>   static unsigned int vfio_migratable_device_num(void)
>>>   {
>>> @@ -381,19 +380,19 @@ static unsigned int vfio_migratable_device_num(void)
>>>   return device_num;
>>>   }
>>>
>>> -int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error 
>>> **errp)
>>> +bool vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error 
>>> **errp)
>>>   {
>>>   int ret;
>>>
>>>   if (multiple_devices_migration_blocker ||
>>>   vfio_migratable_device_num() <= 1) {
>>> -    return 0;
>>> +    return true;
>>>   }
>>>
>>>   if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
>>>   error_setg(errp, "Migration is currently not supported with 
>>> multiple "
>>>    "VFIO devices");
>>> -    return -EINVAL;
>>> +    return false;
>>>   }
>>>
>>>   error_setg(_devices_migration_blocker,
>>> @@ -403,9 +402,15 @@ int vfio_block_multiple_devices_migration(VFIODevice
>>> *vbasedev, Error **errp)
>>>   if (ret < 0) {
>>>   error_free(multiple_devices_migration_blocker);
>>>   multiple_devices_migration_blocker = NULL;
>>> +    } else {
>>> +    /*
>>> + * Only ON_OFF_AUTO_AUTO case, ON_OFF_AUTO_OFF is checked
>>> + * in vfio_migration_realize().
>>> + */
>>> +    warn_report("Migration disabled, not support multiple VFIO 
>>> devices");
>>>   }
>>>
>> Perhaps you could stash the previous error message and use it in the
>> warn_report_error to consolidate the error messages e.g.
>>
>> bool vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error 
>> **errp)
>> {
>>  Error *err = NULL;
>>
>>  if (multiple_devices_migration_blocker ||
>>  vfio_migratable_device_num() <= 1) {
>>  return true;
>>  }
>>
>>  error_setg(, "%s: Migration is currently not supported with 
>> multiple "
>>   "VFIO devices", vbasedev->name);
>>
>>  if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
>>  error_propagate(errp, err);
>>  return -EINVAL;
>>  }
>>
>>  ...
>>  if (ret < 0) {
>>  } else {
>>  /* Warns only on ON_OFF_AUTO_AUTO case */
>>  warn_report_err(err);
> 
> I'm not sure this warning is needed.
> If I remember correctly, I think Alex didn't want migration error/warning
> messages to be logged in the AUTO case.
> 

Hmm, ok, I missed this from the previous discussions.

So today there are migration warnings in the current code. (even in the AUTO
case). So if we want them removed, then this patch would then just remove the
"Migration disabled" all together (in the two places we commented).

The rest of the cases already propagate the error I think. And the AUTO case
will always 

[PATCH v3 1/1] vfio-user: introduce vfio-user protocol specification

2023-06-29 Thread Jagannathan Raman
From: Thanos Makatos 

This patch introduces the vfio-user protocol specification (formerly
known as VFIO-over-socket), which is designed to allow devices to be
emulated outside QEMU, in a separate process. vfio-user reuses the
existing VFIO defines, structs and concepts.

It has been earlier discussed as an RFC in:
"RFC: use VFIO over a UNIX domain socket to implement device offloading"

Signed-off-by: John G Johnson 
Signed-off-by: Thanos Makatos 
Signed-off-by: John Levon 
---
 MAINTAINERS|4 +-
 docs/devel/index-internals.rst |1 +
 docs/devel/vfio-user.rst   | 1522 
 3 files changed, 1526 insertions(+), 1 deletion(-)
 create mode 100644 docs/devel/vfio-user.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index aba07722f64f..70499379c7ca 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3791,11 +3791,13 @@ F: include/semihosting/
 F: tests/tcg/multiarch/arm-compat-semi/
 F: tests/tcg/aarch64/system/semiheap.c
 
-Multi-process QEMU
+Multi-process QEMU / vfio-user
 M: Elena Ufimtseva 
 M: Jagannathan Raman 
+M: Thanos Makatos 
 S: Maintained
 F: docs/devel/multi-process.rst
+F: docs/devel/vfio-user.rst
 F: docs/system/multi-process.rst
 F: hw/pci-host/remote.c
 F: include/hw/pci-host/remote.h
diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst
index e1a93df26392..0ecb5c6301d8 100644
--- a/docs/devel/index-internals.rst
+++ b/docs/devel/index-internals.rst
@@ -17,5 +17,6 @@ Details about QEMU's various subsystems including how to add 
features to them.
s390-dasd-ipl
tracing
vfio-migration
+   vfio-user
writing-monitor-commands
virtio-backends
diff --git a/docs/devel/vfio-user.rst b/docs/devel/vfio-user.rst
new file mode 100644
index ..0d96477a68b4
--- /dev/null
+++ b/docs/devel/vfio-user.rst
@@ -0,0 +1,1522 @@
+.. include:: 
+
+vfio-user Protocol Specification
+
+
+--
+Version_ 0.9.1
+--
+
+.. contents:: Table of Contents
+
+Introduction
+
+vfio-user is a protocol that allows a device to be emulated in a separate
+process outside of a Virtual Machine Monitor (VMM). vfio-user devices consist
+of a generic VFIO device type, living inside the VMM, which we call the client,
+and the core device implementation, living outside the VMM, which we call the
+server.
+
+The vfio-user specification is partly based on the
+`Linux VFIO ioctl interface 
`_.
+
+VFIO is a mature and stable API, backed by an extensively used framework. The
+existing VFIO client implementation in QEMU (``qemu/hw/vfio/``) can be largely
+re-used, though there is nothing in this specification that requires that
+particular implementation. None of the VFIO kernel modules are required for
+supporting the protocol, on either the client or server side. Some source
+definitions in VFIO are re-used for vfio-user.
+
+The main idea is to allow a virtual device to function in a separate process in
+the same host over a UNIX domain socket. A UNIX domain socket (``AF_UNIX``) is
+chosen because file descriptors can be trivially sent over it, which in turn
+allows:
+
+* Sharing of client memory for DMA with the server.
+* Sharing of server memory with the client for fast MMIO.
+* Efficient sharing of eventfd's for triggering interrupts.
+
+Other socket types could be used which allow the server to run in a separate
+guest in the same host (``AF_VSOCK``) or remotely (``AF_INET``). Theoretically
+the underlying transport does not necessarily have to be a socket, however we 
do
+not examine such alternatives. In this protocol version we focus on using a 
UNIX
+domain socket and introduce basic support for the other two types of sockets
+without considering performance implications.
+
+While passing of file descriptors is desirable for performance reasons, support
+is not necessary for either the client or the server in order to implement the
+protocol. There is always an in-band, message-passing fall back mechanism.
+
+Overview
+
+
+VFIO is a framework that allows a physical device to be securely passed through
+to a user space process; the device-specific kernel driver does not drive the
+device at all.  Typically, the user space process is a VMM and the device is
+passed through to it in order to achieve high performance. VFIO provides an API
+and the required functionality in the kernel. QEMU has adopted VFIO to allow a
+guest to directly access physical devices, instead of emulating them in
+software.
+
+vfio-user reuses the core VFIO concepts defined in its API, but implements them
+as messages to be sent over a socket. It does not change the kernel-based VFIO
+in any way, in fact none of the VFIO kernel modules need to be loaded to use
+vfio-user. It is also possible for the client to concurrently use the current
+kernel-based VFIO for one device, and vfio-user for another 

[PATCH v3 0/1] introduce vfio-user protocol specification

2023-06-29 Thread Jagannathan Raman
Hi,

This patch is a continuation of the following patch that John Johnson sent out
for review already:
[PATCH v2 01/23] vfio-user: introduce vfio-user protocol specification
Message-Id: 


We have separated this patch from the original vfio-user client series. We
will send the other patches in that series in about two weeks.

v2 -> v3:
  - MAINTAINERS: Combined vfio-user and Multiprocess-QEMU sections and
named it "Multiprocess-QEMU / vfio-user" as it already refers to
the vfio-user files
  - We will remove multiprocess support after the vfio-user client gets
through and rename the section "vfio-user."

Thank you!

Thanos Makatos (1):
  vfio-user: introduce vfio-user protocol specification

 MAINTAINERS|4 +-
 docs/devel/index-internals.rst |1 +
 docs/devel/vfio-user.rst   | 1522 
 3 files changed, 1526 insertions(+), 1 deletion(-)
 create mode 100644 docs/devel/vfio-user.rst

-- 
2.20.1




Re: [PATCH v4 3/5] vfio/pci: Disable INTx in vfio_realize error path

2023-06-29 Thread Joao Martins



On 29/06/2023 16:13, Cédric Le Goater wrote:
> On 6/29/23 13:24, Joao Martins wrote:
>> On 29/06/2023 09:40, Zhenzhong Duan wrote:
>>> When vfio realize fails, INTx isn't disabled if it has been enabled.
>>> This may confuse host side with unhandled interrupt report.
>>>
>>> Add a new label to be used for vfio_intx_enable() failed case.
>>>
>>> Fixes: a9994687cb9b ("vfio/display: core & wireup")
>>> Fixes: b290659fc3dd ("hw/vfio/display: add ramfb support")
>>> Fixes: c62a0c7ce34e ("vfio/display: add xres + yres properties")
>>
>> Sounds to me the correct Fixes tag is the same as first patch i.e.:
>>
>> Fixes: c5478fea27ac ("vfio/pci: Respond to KVM irqchip change notifier")
>>
>>> Signed-off-by: Zhenzhong Duan 
>>
>> Looks good, but see some clarifications below.
>>
>>> ---
>>>   hw/vfio/pci.c | 4 +++-
>>>   1 file changed, 3 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>> index ab6645ba60af..54a8179d1c64 100644
>>> --- a/hw/vfio/pci.c
>>> +++ b/hw/vfio/pci.c
>>> @@ -3167,7 +3167,7 @@ static void vfio_realize(PCIDevice *pdev, Error 
>>> **errp)
>>>   kvm_irqchip_add_change_notifier(>irqchip_change_notifier);
>>>   ret = vfio_intx_enable(vdev, errp);
>>>   if (ret) {
>>> -    goto out_deregister;
>>> +    goto out_intx_disable;
>>>   }
>>>   }
>>>   @@ -3220,6 +3220,8 @@ static void vfio_realize(PCIDevice *pdev, Error 
>>> **errp)
>>>   return;
>>>     out_deregister:
>>> +    vfio_disable_interrupts(vdev);
>>
>> You are calling vfio_disable_interrupts() when what you want is
>> vfio_intx_disable() ? But I guess your thinking was to call
>> vfio_disable_interrupt() which eventually calls vfio_intx_disable() in case 
>> INTx
>> was really setup, thus saving the duplicated check. The MSIx/MSI in 
>> realize() I
>> don't think they will be enabled at this point. Let me know if I 
>> misunderstood.
>>
>>> +out_intx_disable:
>>
>> Maybe 'out_intx_teardown' or 'out_intx_deregister' because you are not really
>> disabling INTx.
> 
> or simply extract from vfio_disable_interrupts() :
>  
>     if (vdev->interrupt == VFIO_INT_INTx) {
>     vfio_intx_disable(vdev);
>     }
> 
> and add the above code before cleaning up the intx routing
> notifier without any new goto labels.
> 
An even better option indeed.



Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-29 Thread Michael S. Tsirkin
On Thu, Jun 29, 2023 at 08:07:57PM +0530, Ani Sinha wrote:
> 
> 
> > On 29-Jun-2023, at 7:54 PM, Michael S. Tsirkin  wrote:
> > 
> > On Thu, Jun 29, 2023 at 09:37:07AM +0530, Ani Sinha wrote:
> >> PCI Express ports only have one slot, so PCI Express devices can only be
> >> plugged into slot 0 on a PCIE port. Enforce it.
> >> 
> >> The change has been tested to not break ARI by instantiating seven vfs on 
> >> an
> >> emulated igb device (the maximum number of vfs the linux igb driver 
> >> supports).
> > 
> > I guess we need to test with some other device then? 7 VFs is same
> > slot so hardly a good test.
> 
> No, it's not the same slot. It's using different slots/device numbers. I checked 
> that.
> The same patch was failing without the vf check.

Ah, playing with VF stride? Could you show the command line please?

> > 
> >> The vfs are seen to have non-zero device/slot numbers in the conventional
> >> PCI BDF representation.
> >> 
> >> CC: jus...@redhat.com
> >> CC: imamm...@redhat.com
> >> CC: akihiko.od...@daynix.com
> >> 
> >> Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
> >> Signed-off-by: Ani Sinha 
> >> Reviewed-by: Julia Suvorova 
> >> ---
> >> hw/pci/pci.c | 15 +++
> >> 1 file changed, 15 insertions(+)
> >> 
> >> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> >> index e2eb4c3b4a..0320ac2bb3 100644
> >> --- a/hw/pci/pci.c
> >> +++ b/hw/pci/pci.c
> >> @@ -65,6 +65,7 @@ bool pci_available = true;
> >> static char *pcibus_get_dev_path(DeviceState *dev);
> >> static char *pcibus_get_fw_dev_path(DeviceState *dev);
> >> static void pcibus_reset(BusState *qbus);
> >> +static bool pcie_has_upstream_port(PCIDevice *dev);
> >> 
> >> static Property pci_props[] = {
> >> DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
> >> @@ -1190,6 +1191,20 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> >> *pci_dev,
> >>name);
> >> 
> >>return NULL;
> >> +} /*
> >> +   * With SRIOV and ARI, vfs can have non-zero slot in the 
> >> conventional
> >> +   * PCI interpretation as all five bits reserved for slot addresses 
> >> are
> >> +   * also used for function bits for the various vfs. Ignore that 
> >> case.
> >> +   * It is too early here to check for ARI capabilities in the PCI 
> >> config
> >> +   * space. Hence, we check for a vf device instead.
> >> +   */
> >> +else if (!pci_is_vf(pci_dev) &&
> >> + pcie_has_upstream_port(pci_dev) &&
> >> + PCI_SLOT(devfn)) {
> >> +error_setg(errp, "PCI: slot %d is not valid for %s,"
> >> +   " parent device only allows plugging into slot 0.",
> >> +   PCI_SLOT(devfn), name);
> >> +return NULL;
> >> }
> >> 
> >> pci_dev->devfn = devfn;
> >> -- 
> >> 2.39.1
> > 




[PATCH RFC v2 4/4] vdpa: Allow VIRTIO_NET_F_CTRL_RX in SVQ

2023-06-29 Thread Hawkins Jiawei
Enable SVQ with VIRTIO_NET_F_CTRL_RX feature.

Signed-off-by: Hawkins Jiawei 
Acked-by: Eugenio Pérez 
---
 net/vhost-vdpa.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 9d5d88756c..0410a52043 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -104,6 +104,7 @@ static const uint64_t vdpa_svq_device_features =
 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
 BIT_ULL(VIRTIO_NET_F_STATUS) |
 BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
+BIT_ULL(VIRTIO_NET_F_CTRL_RX) |
 BIT_ULL(VIRTIO_NET_F_MQ) |
 BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
 BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
-- 
2.25.1




[PATCH RFC v2 3/4] vdpa: Restore packet receive filtering state relative with _F_CTRL_RX feature

2023-06-29 Thread Hawkins Jiawei
This patch introduces vhost_vdpa_net_load_rx_mode()
and vhost_vdpa_net_load_rx() to restore the packet
receive filtering state in relation to
VIRTIO_NET_F_CTRL_RX feature at device's startup.

Signed-off-by: Hawkins Jiawei 
---
v2:
  - avoid sending CVQ command in default state suggested by Eugenio

v1: 
https://lore.kernel.org/all/86eeddcd6f6b04e5c1e44e901ddea3b1b8b6c183.1687402580.git.yin31...@gmail.com/

 net/vhost-vdpa.c | 104 +++
 1 file changed, 104 insertions(+)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index cb45c84c88..9d5d88756c 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -792,6 +792,106 @@ static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
 return 0;
 }
 
+static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
+   uint8_t cmd,
+   uint8_t on)
+{
+ssize_t dev_written;
+const struct iovec data = {
+.iov_base = ,
+.iov_len = sizeof(on),
+};
+dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_RX,
+  cmd, , 1);
+if (unlikely(dev_written < 0)) {
+return dev_written;
+}
+if (*s->status != VIRTIO_NET_OK) {
+return -EINVAL;
+}
+
+return 0;
+}
+
+static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
+  const VirtIONet *n)
+{
+uint8_t on;
+int r;
+
+if (virtio_vdev_has_feature(>parent_obj, VIRTIO_NET_F_CTRL_RX)) {
+/* Load the promiscous mode */
+if (n->mac_table.uni_overflow) {
+/*
+ * According to VirtIO standard, "Since there are no guarantees,
+ * it can use a hash filter or silently switch to
+ * allmulti or promiscuous mode if it is given too many addresses."
+ *
+ * QEMU ignores non-multicast(unicast) MAC addresses and
+ * marks `uni_overflow` for the device internal state
+ * if guest sets too many non-multicast(unicast) MAC addresses.
+ * Therefore, we should turn promiscous mode on in this case.
+ */
+on = 1;
+} else {
+on = n->promisc;
+}
+if (on != 1) {
+/*
+ * According to virtio_net_reset(), device turns promiscuous mode 
on
+ * by default.
+ *
+ * Therefore, there is no need to send this CVQ command if the
+ * driver also sets promiscuous mode on, which aligns with
+ * the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+r = vhost_vdpa_net_load_rx_mode(s, VIRTIO_NET_CTRL_RX_PROMISC, on);
+if (r < 0) {
+return r;
+}
+}
+
+/* Load the all-multicast mode */
+if (n->mac_table.multi_overflow) {
+/*
+ * According to VirtIO standard, "Since there are no guarantees,
+ * it can use a hash filter or silently switch to
+ * allmulti or promiscuous mode if it is given too many addresses."
+ *
+ * QEMU ignores multicast MAC addresses and
+ * marks `multi_overflow` for the device internal state
+ * if guest sets too many multicast MAC addresses.
+ * Therefore, we should turn all-multicast mode on in this case.
+ */
+on = 1;
+} else {
+on = n->allmulti;
+}
+if (on != 0) {
+/*
+ * According to virtio_net_reset(), device turns all-multicast mode
+ * off by default.
+ *
+ * Therefore, there is no need to send this CVQ command if the
+ * driver also sets all-multicast mode off, which aligns with
+ * the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+r = vhost_vdpa_net_load_rx_mode(s, VIRTIO_NET_CTRL_RX_ALLMULTI, 
on);
+if (r < 0) {
+return r;
+}
+}
+}
+
+return 0;
+}
+
 static int vhost_vdpa_net_load(NetClientState *nc)
 {
 VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
@@ -818,6 +918,10 @@ static int vhost_vdpa_net_load(NetClientState *nc)
 if (unlikely(r)) {
 return r;
 }
+r = vhost_vdpa_net_load_rx(s, n);
+if (unlikely(r)) {
+return r;
+}
 
 return 0;
 }
-- 
2.25.1




[PATCH RFC v2 2/4] vdpa: Restore MAC address filtering state

2023-06-29 Thread Hawkins Jiawei
This patch refactors vhost_vdpa_net_load_mac() to
restore the MAC address filtering state at device's startup.

Signed-off-by: Hawkins Jiawei 
---
v2:
  - use iovec suggested by Eugenio
  - avoid sending CVQ command in default state

v1: 
https://lore.kernel.org/all/00f72fe154a882fd6dc15bc39e3a1ac63f9dadce.1687402580.git.yin31...@gmail.com/

 net/vhost-vdpa.c | 51 
 1 file changed, 51 insertions(+)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 0bd1c7817c..cb45c84c88 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -665,6 +665,57 @@ static int vhost_vdpa_net_load_mac(VhostVDPAState *s, 
const VirtIONet *n)
 }
 }
 
+if (virtio_vdev_has_feature(>parent_obj, VIRTIO_NET_F_CTRL_RX)) {
+if (n->mac_table.in_use != 0) {
+/*
+ * According to virtio_net_reset(), device uses an empty MAC filter
+ * table as its default state.
+ *
+ * Therefore, there is no need to send this CVQ command if the
+ * driver also sets an empty MAC filter table, which aligns with
+ * the device's defaults.
+ *
+ * Note that the device's defaults can mismatch the driver's
+ * configuration only at live migration.
+ */
+uint32_t uni_entries = n->mac_table.first_multi,
+ uni_macs_size = uni_entries * ETH_ALEN,
+ mul_entries = n->mac_table.in_use - uni_entries,
+ mul_macs_size = mul_entries * ETH_ALEN;
+struct virtio_net_ctrl_mac uni = {
+.entries = cpu_to_le32(uni_entries),
+};
+struct virtio_net_ctrl_mac mul = {
+.entries = cpu_to_le32(mul_entries),
+};
+const struct iovec data[] = {
+{
+.iov_base = ,
+.iov_len = sizeof(uni),
+}, {
+.iov_base = n->mac_table.macs,
+.iov_len = uni_macs_size,
+}, {
+.iov_base = ,
+.iov_len = sizeof(mul),
+}, {
+.iov_base = >mac_table.macs[uni_macs_size],
+.iov_len = mul_macs_size,
+},
+};
+ssize_t dev_written = vhost_vdpa_net_load_cmd(s,
+VIRTIO_NET_CTRL_MAC,
+VIRTIO_NET_CTRL_MAC_TABLE_SET,
+data, ARRAY_SIZE(data));
+if (unlikely(dev_written < 0)) {
+return dev_written;
+}
+if (*s->status != VIRTIO_NET_OK) {
+return -EINVAL;
+}
+}
+}
+
 return 0;
 }
 
-- 
2.25.1




[PATCH RFC v2 1/4] vdpa: Use iovec for vhost_vdpa_net_load_cmd()

2023-06-29 Thread Hawkins Jiawei
According to VirtIO standard, "The driver MUST follow
the VIRTIO_NET_CTRL_MAC_TABLE_SET command by a le32 number,
followed by that number of non-multicast MAC addresses,
followed by another le32 number, followed by that number
of multicast addresses."

Considering that this data is not stored in contiguous memory,
this patch refactors vhost_vdpa_net_load_cmd() to accept
scattered data, eliminating the need for an additional data copy or
packing the data into s->cvq_cmd_out_buffer outside of
vhost_vdpa_net_load_cmd().

Signed-off-by: Hawkins Jiawei 
---
v2:
  - refactor vhost_vdpa_load_cmd() to accept iovec suggested by
Eugenio

 net/vhost-vdpa.c | 42 --
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 6f6a5c6df6..0bd1c7817c 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -620,29 +620,43 @@ static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState *s, 
size_t out_len,
 }
 
 static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
-   uint8_t cmd, const void *data,
-   size_t data_size)
+   uint8_t cmd, const struct iovec *data,
+   size_t data_len)
 {
 const struct virtio_net_ctrl_hdr ctrl = {
 .class = class,
 .cmd = cmd,
 };
+void *cursor = s->cvq_cmd_out_buffer;
 
-assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
+/* pack the CVQ command header */
+assert(sizeof(ctrl) < vhost_vdpa_net_cvq_cmd_page_len() -
+  (cursor - s->cvq_cmd_out_buffer));
+memcpy(cursor, , sizeof(ctrl));
+cursor += sizeof(ctrl);
 
-memcpy(s->cvq_cmd_out_buffer, , sizeof(ctrl));
-memcpy(s->cvq_cmd_out_buffer + sizeof(ctrl), data, data_size);
+/* pack the CVQ command command-specific-data */
+for (int i = 0; i < data_len; ++i) {
+assert(data[i].iov_len < vhost_vdpa_net_cvq_cmd_page_len() -
+ (cursor - s->cvq_cmd_out_buffer));
+memcpy(cursor, data[i].iov_base, data[i].iov_len);
+cursor += data[i].iov_len;
+}
 
-return vhost_vdpa_net_cvq_add(s, sizeof(ctrl) + data_size,
+return vhost_vdpa_net_cvq_add(s, cursor - s->cvq_cmd_out_buffer,
   sizeof(virtio_net_ctrl_ack));
 }
 
 static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
 {
 if (virtio_vdev_has_feature(>parent_obj, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
+const struct iovec data = {
+.iov_base = (void *)n->mac,
+.iov_len = sizeof(n->mac),
+};
 ssize_t dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MAC,
   VIRTIO_NET_CTRL_MAC_ADDR_SET,
-  n->mac, sizeof(n->mac));
+  , 1);
 if (unlikely(dev_written < 0)) {
 return dev_written;
 }
@@ -665,9 +679,13 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
 }
 
 mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
+const struct iovec data = {
+.iov_base = ,
+.iov_len = sizeof(mq),
+};
 dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MQ,
-  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, ,
-  sizeof(mq));
+  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
+  , 1);
 if (unlikely(dev_written < 0)) {
 return dev_written;
 }
@@ -706,9 +724,13 @@ static int vhost_vdpa_net_load_offloads(VhostVDPAState *s,
 }
 
 offloads = cpu_to_le64(n->curr_guest_offloads);
+const struct iovec data = {
+.iov_base = ,
+.iov_len = sizeof(offloads),
+};
 dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
   VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
-  , sizeof(offloads));
+  , 1);
 if (unlikely(dev_written < 0)) {
 return dev_written;
 }
-- 
2.25.1




[PATCH RFC v2 0/4] Vhost-vdpa Shadow Virtqueue _F_CTRL_RX commands support

2023-06-29 Thread Hawkins Jiawei
This series enables intercepting rx commands related to the
VIRTIO_NET_F_CTRL_RX feature through shadowed CVQ, updating the virtio
NIC device model so QEMU sends that state in a migration, and restoring
that rx state in the destination.

Note that this patch should be based on [1], which has not
been merged. I will submit a new version of the patch after it is merged.

[1]. https://lists.nongnu.org/archive/html/qemu-devel/2023-06/msg05909.html

Changelog
=
v2:
  - refactor vhost_vdpa_net_load_cmd() to accept iovec suggested by
Eugenio
  - avoid sending CVQ command in default state suggested by Eugenio

v1: https://lists.nongnu.org/archive/html/qemu-devel/2023-06/msg04423.html

Hawkins Jiawei (4):
  vdpa: Use iovec for vhost_vdpa_net_load_cmd()
  vdpa: Restore MAC address filtering state
  vdpa: Restore packet receive filtering state relative with _F_CTRL_RX
feature
  vdpa: Allow VIRTIO_NET_F_CTRL_RX in SVQ

 net/vhost-vdpa.c | 198 ---
 1 file changed, 188 insertions(+), 10 deletions(-)

-- 
2.25.1




Re: [PATCH v4 5/5] vfio/migration: Refactor and fix print of "Migration disabled"

2023-06-29 Thread Avihai Horon



On 29/06/2023 15:44, Joao Martins wrote:

External email: Use caution opening links or attachments


On 29/06/2023 09:40, Zhenzhong Duan wrote:

This patch refactors vfio_migration_realize() and its dependent code
as follows:

1. It's redundant in vfio_migration_realize() to register multiple blockers,
e.g: vIOMMU blocker can be refactored as per device blocker.
2. Change vfio_viommu_preset() to be only a per device checker.
3. Remove global vIOMMU blocker related stuff, e.g:
giommu_migration_blocker, vfio_[block|unblock]_giommu_migration()
and vfio_migration_finalize()
4. Change vfio_migration_realize(), vfio_block_multiple_devices_migration()
vfio_block_migration() and vfio_viommu_preset() to return bool type.
5. Print "Migration disabled" depending on enable_migration property
and print it as warning instead of error which is overkill.


I am not entirely sure we need to keep "Migration disabled". Perhaps we should
just derisk from error to warning and always use the same error messages.


migrate_add_blocker() returns 0 when successfully adding the migration blocker.
However, the caller of vfio_migration_realize() considers that migration was
blocked when the latter returned an error. What matters for migration is that
the blocker is added in core migration, so this cleans up usability such that
user sees "Migrate disabled" when any of the vfio migration blockers are active
and it's not intentionally forced by user with enable-migration=off.

Signed-off-by: Zhenzhong Duan 
---
  hw/vfio/common.c  | 66 +++
  hw/vfio/migration.c   | 30 +---
  hw/vfio/pci.c |  4 +--
  include/hw/vfio/vfio-common.h |  7 ++--
  4 files changed, 36 insertions(+), 71 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 77e2ee0e5c6e..c80ecb1da53f 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -362,7 +362,6 @@ bool vfio_mig_active(void)
  }

  static Error *multiple_devices_migration_blocker;
-static Error *giommu_migration_blocker;

  static unsigned int vfio_migratable_device_num(void)
  {
@@ -381,19 +380,19 @@ static unsigned int vfio_migratable_device_num(void)
  return device_num;
  }

-int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
+bool vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
  {
  int ret;

  if (multiple_devices_migration_blocker ||
  vfio_migratable_device_num() <= 1) {
-return 0;
+return true;
  }

  if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
  error_setg(errp, "Migration is currently not supported with multiple "
   "VFIO devices");
-return -EINVAL;
+return false;
  }

  error_setg(_devices_migration_blocker,
@@ -403,9 +402,15 @@ int vfio_block_multiple_devices_migration(VFIODevice 
*vbasedev, Error **errp)
  if (ret < 0) {
  error_free(multiple_devices_migration_blocker);
  multiple_devices_migration_blocker = NULL;
+} else {
+/*
+ * Only ON_OFF_AUTO_AUTO case, ON_OFF_AUTO_OFF is checked
+ * in vfio_migration_realize().
+ */
+warn_report("Migration disabled, not support multiple VFIO devices");
  }


Perhaps you could stash the previous error message and use it in the
warn_report_error to consolidate the error messages e.g.

bool vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
{
 Error *err = NULL;

 if (multiple_devices_migration_blocker ||
 vfio_migratable_device_num() <= 1) {
 return true;
 }

 error_setg(, "%s: Migration is currently not supported with multiple "
  "VFIO devices", vbasedev->name);

 if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
 error_propagate(errp, err);
 return -EINVAL;
 }

 ...
 if (ret < 0) {
 } else {
 /* Warns only on ON_OFF_AUTO_AUTO case */
 warn_report_err(err);


I'm not sure this warning is needed.
If I remember correctly, I think Alex didn't want migration 
error/warning messages to be logged in the AUTO case.



 }
}


-return ret;
+return !ret;
  }

  void vfio_unblock_multiple_devices_migration(void)
@@ -420,55 +425,10 @@ void vfio_unblock_multiple_devices_migration(void)
  multiple_devices_migration_blocker = NULL;
  }

-static bool vfio_viommu_preset(void)
+/* Block migration with a vIOMMU */

I meant in the previous version to put the comment on top of the caller, not on
the definition. But with the new code structure from Avihai the error message
further below... it will look a bit redundant.


+bool vfio_viommu_preset(VFIODevice *vbasedev)
  {
-VFIOAddressSpace *space;
-
-QLIST_FOREACH(space, _address_spaces, list) {
-if (space->as != _space_memory) {
-return true;
-}
-}
-
-return false;
-}
-
-int 

Re: [PATCH v4 4/5] vfio/pci: Free resources when vfio_migration_realize fails

2023-06-29 Thread Cédric Le Goater

On 6/29/23 13:45, Joao Martins wrote:

On 29/06/2023 09:40, Zhenzhong Duan wrote:

When vfio_realize() succeeds, hot unplug will call vfio_exitfn()
to free resources allocated in vfio_realize(); when vfio_realize()
fails, vfio_exitfn() is never called and we need to free resources
in vfio_realize().

In the case that vfio_migration_realize() fails,
e.g: with -only-migratable & enable-migration=off, we see below:

(qemu) device_add vfio-pci,host=81:11.1,id=vfio1,bus=root1,enable-migration=off
:81:11.1: Migration disabled
Error: disallowing migration blocker (--only-migratable) for: :81:11.1: 
Migration is disabled for VFIO device

If we hotplug again we should see same log as above, but we see:
(qemu) device_add vfio-pci,host=81:11.1,id=vfio1,bus=root1,enable-migration=off
Error: vfio :81:11.1: device is already attached

That's because some references to the VFIO device aren't released.
We should check the return value of vfio_migration_realize() and
release the references, so that the VFIO device will be truly
released when hotplug fails.

Fixes: a22651053b59 ("vfio: Make vfio-pci device migration capable")
Signed-off-by: Zhenzhong Duan 
---
  hw/vfio/pci.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 54a8179d1c64..dc69d3031b24 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3210,6 +3210,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  ret = vfio_migration_realize(vbasedev, errp);
  if (ret) {
  error_report("%s: Migration disabled", vbasedev->name);
+goto out_vfio_migration;
  }
  }
  
@@ -3219,6 +3220,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  
  return;
  
+out_vfio_migration:

+vfio_migration_exit(vbasedev);
  out_deregister:
  vfio_disable_interrupts(vdev);
  out_intx_disable:


I agree with the general sentiment behind the change.
Clearly vfio::migration and vfio::migration_blocker are leaking from inside the
migration_realize() function.

But it is a rather awkward semantic that vfio_migration_realize() (or any realize)
failures need to be accompanied by a vfio_migration_exit() that tears down
state *leaked* by its realize() failure.

It sounds to me that this should be inside the vfio_migration_realize() not on
the caller? Unless QEMU ::realize() is expected to do this.



I agree. vfio_migration_realize() should handle the cleanup of the resources
it allocated if there is a failure.

Thanks

C.




Re: [PATCH v4 3/5] vfio/pci: Disable INTx in vfio_realize error path

2023-06-29 Thread Cédric Le Goater

On 6/29/23 13:24, Joao Martins wrote:

On 29/06/2023 09:40, Zhenzhong Duan wrote:

When vfio realize fails, INTx isn't disabled if it has been enabled.
This may confuse host side with unhandled interrupt report.

Add a new label to be used for vfio_intx_enable() failed case.

Fixes: a9994687cb9b ("vfio/display: core & wireup")
Fixes: b290659fc3dd ("hw/vfio/display: add ramfb support")
Fixes: c62a0c7ce34e ("vfio/display: add xres + yres properties")


Sounds to me the correct Fixes tag is the same as first patch i.e.:

Fixes: c5478fea27ac ("vfio/pci: Respond to KVM irqchip change notifier")


Signed-off-by: Zhenzhong Duan 


Looks good, but see some clarifications below.


---
  hw/vfio/pci.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ab6645ba60af..54a8179d1c64 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3167,7 +3167,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  kvm_irqchip_add_change_notifier(>irqchip_change_notifier);
  ret = vfio_intx_enable(vdev, errp);
  if (ret) {
-goto out_deregister;
+goto out_intx_disable;
  }
  }
  
@@ -3220,6 +3220,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)

  return;
  
  out_deregister:

+vfio_disable_interrupts(vdev);


You are calling vfio_disable_interrupts() when what you want is
vfio_intx_disable()? But I guess your thinking was to call
vfio_disable_interrupts(), which eventually calls vfio_intx_disable() in case INTx
was really set up, thus saving the duplicated check. I don't think MSI-X/MSI
will be enabled at this point in realize(). Let me know if I misunderstood.


+out_intx_disable:


Maybe 'out_intx_teardown' or 'out_intx_deregister' because you are not really
disabling INTx.


or simply extract from vfio_disable_interrupts() :
 
if (vdev->interrupt == VFIO_INT_INTx) {

vfio_intx_disable(vdev);
}

and add the above code before cleaning up the intx routing
notifier without any new goto labels.

Thanks,

C.





  pci_device_set_intx_routing_notifier(>pdev, NULL);
  if (vdev->irqchip_change_notifier.notify) {
  kvm_irqchip_remove_change_notifier(>irqchip_change_notifier);







Re: [PATCH v6 09/15] target/riscv: Add Zvkned ISA extension support

2023-06-29 Thread Max Chou

On 2023/6/28 5:07 PM, Richard Henderson wrote:


On 6/27/23 19:45, Max Chou wrote:
+#define GEN_V_UNMASKED_TRANS(NAME, CHECK, 
EGS)    \
+    static bool trans_##NAME(DisasContext *s, arg_##NAME 
*a)  \

+ { \
+    if (CHECK(s, a)) 
{    \
+    TCGv_ptr rd_v, 
rs2_v; \
+    TCGv_i32 
desc;    \
+    uint32_t data = 
0;    \
+    TCGLabel *over = 
gen_new_label(); \
+    TCGLabel *vl_ok = 
gen_new_label();    \
+    TCGLabel *vstart_ok = 
gen_new_label();    \
+    TCGv_i32 tmp = 
tcg_temp_new_i32();    \

+ \
+    /* save opcode for unwinding in case we throw an 
exception */ \

+ decode_save_opc(s); \
+ \
+    /* check (vl % EGS == 0) assuming it's power of 2 
*/  \
+    tcg_gen_trunc_tl_i32(tmp, 
cpu_vl);    \
+    tcg_gen_andi_i32(tmp, tmp, EGS - 
1);  \
+    tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, 
vl_ok);  \

+ gen_helper_restore_cpu_and_raise_exception( \
+    cpu_env, 
tcg_constant_i32(RISCV_EXCP_ILLEGAL_INST));  \

+ gen_set_label(vl_ok); \
+ \
+    /* check (vstart % EGS == 0) assuming it's power of 2 
*/  \
+    tcg_gen_trunc_tl_i32(tmp, 
cpu_vstart);    \
+    tcg_gen_andi_i32(tmp, tmp, EGS - 
1);  \
+    tcg_gen_brcondi_i32(TCG_COND_EQ, tmp, 0, 
vstart_ok);  \

+ gen_helper_restore_cpu_and_raise_exception( \
+    cpu_env, 
tcg_constant_i32(RISCV_EXCP_ILLEGAL_INST));  \

+ gen_set_label(vstart_ok); \
+ \
+    tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, 
over);    \


This kind of massive macro is bad style.
Much better to have a helper function and pass in gen_helper_foo as a 
parameter.


You can eliminate the vstart % EGS test, and the vstart < vl test, 
when VSTART_EQ_ZERO.

You can eliminate the vl % EGS test when VL_EQ_VLMAX.

You could move all of these tests out of line, into a helper_foo_chk() 
function which performs the checks and then calls helper_foo().

Hi Richard

Thank you for the suggestion.
I'll provide the v7 patch set with this suggestion.

But I have a question about the vstart < vl test.
I think that we can't eliminate the vstart < vl test when both the 
vstart and vl are equal to zero.

Although this situation means that the instructions will do nothing.


+#define GEN_ZVKNED_HELPER_VV(NAME, 
...)   \
+    void HELPER(NAME)(void *vd_vptr, void *vs2_vptr, CPURISCVState 
*env,  \
+  uint32_t 
desc)  \

+ { \
+    uint64_t *vd = 
vd_vptr;   \
+    uint64_t *vs2 = 
vs2_vptr; \
+    uint32_t vl = 
env->vl;    \
+    uint32_t total_elems = vext_get_total_elems(env, desc, 
4);    \
+    uint32_t vta = 
vext_vta(desc);    \

+ \
+    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) 
{    \
+    AESState 
round_key;   \
+    round_key.d[HOST_BIG_ENDIAN] = cpu_to_le64(vs2[i * 2 + 
0]);   \
+    round_key.d[!HOST_BIG_ENDIAN] = cpu_to_le64(vs2[i * 2 + 
1]);  \
+    AESState 
round_state; \
+    cpu_to_le64s(vd + i * 2 + 
0); \
+    cpu_to_le64s(vd + i * 2 + 
1); \
+    for (int j = 0; j < 16; j++) 
{    \
+    round_state.b[j] = ((uint8_t *)(vd + i * 
2))[j];  \

+ } \


I think all of this byte swapping is wrong.
With this last loop being particularly silly.

You want to present the 16 bytes in *host* endian order.
Because the words are always in little-endian order (see H1 et al),
we only need to swap the words on big-endian hosts.

See 
https://lore.kernel.org/qemu-devel/20230620110758.787479-21-richard.hender...@linaro.org/

where I do exactly the same thing for ARM:

+    AESState *ad = (AESState *)(vd + i);
+    AESState *st = (AESState *)(vm + i);
+    AESState t;
+
+    /* Our uint64_t are in the wrong order for big-endian. */
+    if (HOST_BIG_ENDIAN) {
+    t.d[0] = st->d[1];
+    t.d[1] = st->d[0];
+    aesdec_IMC(, , false);
+    ad->d[0] = t.d[1];
+  

Re: [PATCH] gitlab: Disable plugins for cross-i386-tci

2023-06-29 Thread Thomas Huth

On 29/06/2023 15.08, Richard Henderson wrote:

There are timeouts in the cross-i386-tci job that are related to plugins.
Restrict this job to basic TCI testing.

Signed-off-by: Richard Henderson 
---

E.g. most recent failure(s),

https://gitlab.com/qemu-project/qemu/-/jobs/4565517825
4488: make[1]: *** [Makefile:189: run-plugin-memory-with-libbb.so] Error 124
4497: make[1]: *** [Makefile:189: run-plugin-memory-with-libempty.so] Error 124
4506: make[1]: *** [Makefile:189: run-plugin-memory-with-libinsn.so] Error 124
4550: make[1]: *** [Makefile:189: run-plugin-memory-with-libmem.so] Error 124
4558: make[1]: *** [Makefile:189: run-plugin-memory-with-libsyscall.so] Error 
124

We do tci+plugin testing with an x86_64 job, so I don't think it's
important that we cover plugins here.  Mostly we want to make sure
that TCI *builds* on a 32-bit host.

Anyway, here's a pass with shared infrastructure,
https://gitlab.com/rth7680/qemu/-/jobs/4565547513

There might well be different timings on the project k8s hosts.


r~

---
  .gitlab-ci.d/crossbuilds.yml | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.d/crossbuilds.yml b/.gitlab-ci.d/crossbuilds.yml
index 1e0e6c7f2c..b6ec99ecd1 100644
--- a/.gitlab-ci.d/crossbuilds.yml
+++ b/.gitlab-ci.d/crossbuilds.yml
@@ -57,7 +57,7 @@ cross-i386-tci:
variables:
  IMAGE: fedora-i386-cross
  ACCEL: tcg-interpreter
-EXTRA_CONFIGURE_OPTS: 
--target-list=i386-softmmu,i386-linux-user,aarch64-softmmu,aarch64-linux-user,ppc-softmmu,ppc-linux-user
+EXTRA_CONFIGURE_OPTS: 
--target-list=i386-softmmu,i386-linux-user,aarch64-softmmu,aarch64-linux-user,ppc-softmmu,ppc-linux-user
 --disable-plugins
  MAKE_CHECK_ARGS: check check-tcg
  
  cross-mipsel-system:


FWIW,
Acked-by: Thomas Huth 

I assume you'll apply this directly as a CI fix?




Re: [PATCH] hw: Simplify calls to pci_nic_init_nofail()

2023-06-29 Thread Thomas Huth

On 29/06/2023 15.47, Philippe Mathieu-Daudé wrote:

Hi Thomas,

On 29/6/23 14:54, Thomas Huth wrote:

pci_nic_init_nofail() calls qemu_find_nic_model(), and this function
sets nd->model = g_strdup(default_model) if it has not been initialized
yet. So we don't have to set nd->model to the default_nic in the
calling sites.

Signed-off-by: Thomas Huth 
---
  hw/arm/sbsa-ref.c    | 8 +---
  hw/arm/virt.c    | 8 +---
  hw/loongarch/virt.c  | 8 +---
  hw/mips/loongson3_virt.c | 8 +---
  hw/xtensa/virt.c | 8 +---
  5 files changed, 5 insertions(+), 35 deletions(-)

...

This reminds me of a branch from end of April with this
unfinished patch, did we already discuss this together?


No, I haven't seen your patch yet, nor have we talked about it. I came up 
with the idea for my patch on my own after looking at certain spots in the 
code. But I guess you could easily rebase your patch on top of mine in case 
you want to finish it ;-)


 Thomas




Re: [PATCH v2 06/12] aspeed/smc: Wire CS lines at reset

2023-06-29 Thread Cédric Le Goater

On 6/7/23 06:39, Cédric Le Goater wrote:

Currently, a set of default flash devices is created at machine init
and drives defined on the QEMU command line are associated to the FMC
and SPI controllers in sequence :

-drive file,format=raw,if=mtd
-drive file,format=raw,if=mtd

The CS lines are wired in the same creation loop. This makes a strong
assumption on the ordering and is not very flexible since only a
limited set of flash devices can be defined : 1 FMC + 1 or 2 SPI,
which is less than what the SoC really supports.

A better alternative would be to define the flash devices on the
command line using a blockdev attached to a CS line of a SSI bus :

 -blockdev node-name=fmc0,driver=file,filename=./flash.img
 -device mx66u51235f,addr=0x0,bus=ssi.0,drive=fmc0

However, user created flash devices are not correctly wired to their
SPI controller and consequently can not be used by the machine. Fix
that and wire the CS lines of all available devices when the SSI bus
is reset.

Signed-off-by: Cédric Le Goater 
---
  hw/arm/aspeed.c | 5 +
  hw/ssi/aspeed_smc.c | 8 
  2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index 76a1e7303de1..e5a49bb0b1a7 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -299,17 +299,14 @@ void aspeed_board_init_flashes(AspeedSMCState *s, const 
char *flashtype,
  
  for (i = 0; i < count; ++i) {

  DriveInfo *dinfo = drive_get(IF_MTD, 0, unit0 + i);
-qemu_irq cs_line;
  DeviceState *dev;
  
  dev = qdev_new(flashtype);

  if (dinfo) {
  qdev_prop_set_drive(dev, "drive", blk_by_legacy_dinfo(dinfo));
  }
+qdev_prop_set_uint8(dev, "addr", i);
  qdev_realize_and_unref(dev, BUS(s->spi), _fatal);
-
-cs_line = qdev_get_gpio_in_named(dev, SSI_GPIO_CS, 0);
-qdev_connect_gpio_out_named(DEVICE(s), "cs", i, cs_line);
  }
  }
  
diff --git a/hw/ssi/aspeed_smc.c b/hw/ssi/aspeed_smc.c

index 72811693224d..2a4001b774a2 100644
--- a/hw/ssi/aspeed_smc.c
+++ b/hw/ssi/aspeed_smc.c
@@ -692,6 +692,14 @@ static void aspeed_smc_reset(DeviceState *d)
  memset(s->regs, 0, sizeof s->regs);
  }
  
+for (i = 0; i < asc->cs_num_max; i++) {

+DeviceState *dev = ssi_get_cs(s->spi, i);
+if (dev) {
+qemu_irq cs_line = qdev_get_gpio_in_named(dev, SSI_GPIO_CS, 0);
+qdev_connect_gpio_out_named(DEVICE(s), "cs", i, cs_line);
+}
+}
+
  /* Unselect all peripherals */
  for (i = 0; i < asc->cs_num_max; ++i) {
  s->regs[s->r_ctrl0 + i] |= CTRL_CE_STOP_ACTIVE;



An alternative for the wiring would be to connect the GPIO lines in the
m25p80 realize routine. See below for a draft. Assumption is made on the
availability of the CS lines at the bus parent level, which should be
the controller.
 
Thanks,


C.


diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c
index aa0bfa57bb26..ae39a1a24b1b 100644
--- a/hw/ssi/ssi.c
+++ b/hw/ssi/ssi.c
@@ -154,6 +154,18 @@ SSIBus *ssi_create_bus(DeviceState *parent, const char 
*name)
 return SSI_BUS(bus);
 }
 
+void ssi_attach(SSIPeripheral *s)

+{
+BusState *bus = BUS(qdev_get_parent_bus(DEVICE(s)));
+qemu_irq cs_line = qdev_get_gpio_in_named(DEVICE(s), SSI_GPIO_CS, 0);
+
+/*
+ * TODO: Will abort if "cs" GPIOs are not defined at the
+ * controller level
+ */
+qdev_connect_gpio_out_named(DEVICE(bus->parent), "cs", s->addr, cs_line);
+}
+
 uint32_t ssi_transfer(SSIBus *bus, uint32_t val)
 {
 BusState *b = BUS(bus);


diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c
index afc3fdf4d60b..89add100aefd 100644
--- a/hw/block/m25p80.c
+++ b/hw/block/m25p80.c
@@ -1628,6 +1628,8 @@ static void m25p80_realize(SSIPeripheral *ss, Error 
**errp)
 
 qdev_init_gpio_in_named(DEVICE(s),

 m25p80_write_protect_pin_irq_handler, "WP#", 1);
+
+ssi_attach(ss);
 }
 
 static void m25p80_reset(DeviceState *d)





Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-29 Thread Ani Sinha



> On 29-Jun-2023, at 7:54 PM, Michael S. Tsirkin  wrote:
> 
> On Thu, Jun 29, 2023 at 09:37:07AM +0530, Ani Sinha wrote:
>> PCI Express ports only have one slot, so PCI Express devices can only be
>> plugged into slot 0 on a PCIE port. Enforce it.
>> 
>> The change has been tested to not break ARI by instantiating seven vfs on an
>> emulated igb device (the maximum number of vfs the linux igb driver 
>> supports).
> 
> I guess we need to test with some other device then? 7 VFs is same
> slot so hardly a good test.

No, it's not the same slot. It's using different slots/device numbers. I checked 
that.
The same patch was failing without the vf check.

> 
>> The vfs are seen to have non-zero device/slot numbers in the conventional
>> PCI BDF representation.
>> 
>> CC: jus...@redhat.com
>> CC: imamm...@redhat.com
>> CC: akihiko.od...@daynix.com
>> 
>> Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
>> Signed-off-by: Ani Sinha 
>> Reviewed-by: Julia Suvorova 
>> ---
>> hw/pci/pci.c | 15 +++
>> 1 file changed, 15 insertions(+)
>> 
>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> index e2eb4c3b4a..0320ac2bb3 100644
>> --- a/hw/pci/pci.c
>> +++ b/hw/pci/pci.c
>> @@ -65,6 +65,7 @@ bool pci_available = true;
>> static char *pcibus_get_dev_path(DeviceState *dev);
>> static char *pcibus_get_fw_dev_path(DeviceState *dev);
>> static void pcibus_reset(BusState *qbus);
>> +static bool pcie_has_upstream_port(PCIDevice *dev);
>> 
>> static Property pci_props[] = {
>> DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
>> @@ -1190,6 +1191,20 @@ static PCIDevice *do_pci_register_device(PCIDevice 
>> *pci_dev,
>>name);
>> 
>>return NULL;
>> +} /*
>> +   * With SRIOV and ARI, vfs can have non-zero slot in the conventional
>> +   * PCI interpretation as all five bits reserved for slot addresses are
>> +   * also used for function bits for the various vfs. Ignore that case.
>> +   * It is too early here to check for ARI capabilities in the PCI 
>> config
>> +   * space. Hence, we check for a vf device instead.
>> +   */
>> +else if (!pci_is_vf(pci_dev) &&
>> + pcie_has_upstream_port(pci_dev) &&
>> + PCI_SLOT(devfn)) {
>> +error_setg(errp, "PCI: slot %d is not valid for %s,"
>> +   " parent device only allows plugging into slot 0.",
>> +   PCI_SLOT(devfn), name);
>> +return NULL;
>> }
>> 
>> pci_dev->devfn = devfn;
>> -- 
>> 2.39.1
> 




Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-29 Thread Michael S. Tsirkin
On Thu, Jun 29, 2023 at 09:37:07AM +0530, Ani Sinha wrote:
> PCI Express ports only have one slot, so PCI Express devices can only be
> plugged into slot 0 on a PCIE port. Enforce it.
> 
> The change has been tested to not break ARI by instantiating seven vfs on an
> emulated igb device (the maximum number of vfs the linux igb driver supports).

I guess we need to test with some other device then? 7 VFs is same
slot so hardly a good test.

> The vfs are seen to have non-zero device/slot numbers in the conventional
> PCI BDF representation.
> 
> CC: jus...@redhat.com
> CC: imamm...@redhat.com
> CC: akihiko.od...@daynix.com
> 
> Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
> Signed-off-by: Ani Sinha 
> Reviewed-by: Julia Suvorova 
> ---
>  hw/pci/pci.c | 15 +++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index e2eb4c3b4a..0320ac2bb3 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -65,6 +65,7 @@ bool pci_available = true;
>  static char *pcibus_get_dev_path(DeviceState *dev);
>  static char *pcibus_get_fw_dev_path(DeviceState *dev);
>  static void pcibus_reset(BusState *qbus);
> +static bool pcie_has_upstream_port(PCIDevice *dev);
>  
>  static Property pci_props[] = {
>  DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
> @@ -1190,6 +1191,20 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> *pci_dev,
> name);
>  
> return NULL;
> +} /*
> +   * With SRIOV and ARI, vfs can have non-zero slot in the conventional
> +   * PCI interpretation as all five bits reserved for slot addresses are
> +   * also used for function bits for the various vfs. Ignore that case.
> +   * It is too early here to check for ARI capabilities in the PCI config
> +   * space. Hence, we check for a vf device instead.
> +   */
> +else if (!pci_is_vf(pci_dev) &&
> + pcie_has_upstream_port(pci_dev) &&
> + PCI_SLOT(devfn)) {
> +error_setg(errp, "PCI: slot %d is not valid for %s,"
> +   " parent device only allows plugging into slot 0.",
> +   PCI_SLOT(devfn), name);
> +return NULL;
>  }
>  
>  pci_dev->devfn = devfn;
> -- 
> 2.39.1




Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-29 Thread Ani Sinha



> On 29-Jun-2023, at 2:19 PM, Akihiko Odaki  wrote:
> 
> On 2023/06/29 17:05, Ani Sinha wrote:
>> On Thu, 29 Jun, 2023, 12:17 pm Akihiko Odaki, > > wrote:
>>On 2023/06/29 13:07, Ani Sinha wrote:
>> > PCI Express ports only have one slot, so PCI Express devices can
>>only be
>> > plugged into slot 0 on a PCIE port. Enforce it.
>> >
>> > The change has been tested to not break ARI by instantiating
>>seven vfs on an
>> > emulated igb device (the maximum number of vfs the linux igb
>>driver supports).
>> > The vfs are seen to have non-zero device/slot numbers in the
>>conventional
>> > PCI BDF representation.
>> >
>> > CC: jus...@redhat.com 
>> > CC: imamm...@redhat.com 
>> > CC: akihiko.od...@daynix.com 
>> >
>> > Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
>>
>> > Signed-off-by: Ani Sinha >>
>> > Reviewed-by: Julia Suvorova >>
>> > ---
>> >   hw/pci/pci.c | 15 +++
>> >   1 file changed, 15 insertions(+)
>> >
>> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>> > index e2eb4c3b4a..0320ac2bb3 100644
>> > --- a/hw/pci/pci.c
>> > +++ b/hw/pci/pci.c
>> > @@ -65,6 +65,7 @@ bool pci_available = true;
>> >   static char *pcibus_get_dev_path(DeviceState *dev);
>> >   static char *pcibus_get_fw_dev_path(DeviceState *dev);
>> >   static void pcibus_reset(BusState *qbus);
>> > +static bool pcie_has_upstream_port(PCIDevice *dev);
>> >
>> >   static Property pci_props[] = {
>> >   DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
>> > @@ -1190,6 +1191,20 @@ static PCIDevice
>>*do_pci_register_device(PCIDevice *pci_dev,
>> >  name);
>> >
>> >  return NULL;
>> > +} /*
>> > +   * With SRIOV and ARI, vfs can have non-zero slot in the
>>conventional
>> > +   * PCI interpretation as all five bits reserved for slot
>>addresses are
>> > +   * also used for function bits for the various vfs. Ignore
>>that case.
>> > +   * It is too early here to check for ARI capabilities in
>>the PCI config
>> > +   * space. Hence, we check for a vf device instead.
>> > +   */
>>Why don't just perform this check after the capabilities are set?
>> We don't want to allocate resources for wrong device parameters. We want to 
>> error out early. Other checks also are performed at the same place .
> 
> It is indeed better to raise an error as early as possible so that we can 
> avoid allocation and other operations that will be reverted and may go wrong 
> due to the invalid condition. That should be the reason why other checks for 
> the address are performed here.
> 
> However, in this particular case, we cannot confidently perform the check 
> here because it is unknown if the ARI capability will be advertised until the 
> device realization code runs. This can justify delaying the check after the 
> device realization, unlike the other checks.

Ok so are you proposing that the check we have right before (the check for 
unoccupied function 0) be also moved? It also uses the same vf approximation, 
seemingly to support ARI.
Also where do you propose we move the check?

> 
>> Show quoted text
>>Regards,
>>Akihiko Odaki
>> > +else if (!pci_is_vf(pci_dev) &&
>> > + pcie_has_upstream_port(pci_dev) &&
>> > + PCI_SLOT(devfn)) {
>> > +error_setg(errp, "PCI: slot %d is not valid for %s,"
>> > +   " parent device only allows plugging into
>>slot 0.",
>> > +   PCI_SLOT(devfn), name);
>> > +return NULL;
>> >   }
>> >
>> >   pci_dev->devfn = devfn;




  1   2   3   >