Re: [PATCH v4 2/3] virtio-iommu: Add a granule property

2024-02-22 Thread Philippe Mathieu-Daudé

On 23/2/24 08:27, Eric Auger wrote:

This allows choosing which granule will be used by
default by the virtio-iommu. Current page size mask
default is qemu_target_page_mask so this translates
into a 4K granule.

Signed-off-by: Eric Auger 

---

v3 -> v4:
- granule_mode introduction moved to that patch
---
  include/hw/virtio/virtio-iommu.h |  1 +
  hw/virtio/virtio-iommu.c | 27 ---
  qemu-options.hx  |  3 +++
  3 files changed, 28 insertions(+), 3 deletions(-)




@@ -1324,7 +1324,26 @@ static void virtio_iommu_device_realize(DeviceState 
*dev, Error **errp)
   * in vfio realize
   */
  s->config.bypass = s->boot_bypass;
-s->config.page_size_mask = qemu_target_page_mask();
+
+switch (s->granule_mode) {
+case GRANULE_MODE_4K:
+s->config.page_size_mask = ~0xFFF;


Alternatively:

  s->config.page_size_mask = -(4 * KiB);


+break;
+case GRANULE_MODE_8K:
+s->config.page_size_mask = ~0x1FFF;


  s->config.page_size_mask = -(8 * KiB);


+break;
+case GRANULE_MODE_16K:
+s->config.page_size_mask = ~0x3FFF;


...


+break;
+case GRANULE_MODE_64K:
+s->config.page_size_mask = ~0xFFFF;
+break;
+case GRANULE_MODE_HOST:
+s->config.page_size_mask = qemu_real_host_page_mask();
+break;
+default:
+error_setg(errp, "Unsupported granule mode");
+}





Re: [PATCH 1/2] hw/arm/smmuv3: Check StreamIDs against SMMU_IDR1.SIDSIZE value

2024-02-22 Thread Eric Auger
Hi,

On 2/21/24 18:17, Nabih Estefan wrote:
> From: Roque Arcudia Hernandez 
>
> Current implementation checks the StreamIDs against STRTAB_BASE_CFG.LOG2SIZE
> register field value and a constant SMMU_IDR1_SIDSIZE which is also used as
> initial value for field SMMU_IDR1.SIDSIZE.
>
> This limits the possibility of extending the SMMUv3 by inheritance and
> redefining the value of SMMU_IDR1.SIDSIZE because the check is hardcoded to 
> the
> constant SMMU_IDR1_SIDSIZE rather than the register value.
>
> Signed-off-by: Roque Arcudia Hernandez 
> Signed-off-by: Nabih Estefan 
Reviewed-by: Eric Auger 

Thanks

Eric
> ---
>  hw/arm/smmuv3.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
> index 9eb56a70f3..a01031821a 100644
> --- a/hw/arm/smmuv3.c
> +++ b/hw/arm/smmuv3.c
> @@ -580,15 +580,17 @@ static int smmu_find_ste(SMMUv3State *s, uint32_t sid, 
> STE *ste,
>  {
>  dma_addr_t addr, strtab_base;
>  uint32_t log2size;
> +uint32_t idr1_sidsize;
>  int strtab_size_shift;
>  int ret;
>  
>  trace_smmuv3_find_ste(sid, s->features, s->sid_split);
>  log2size = FIELD_EX32(s->strtab_base_cfg, STRTAB_BASE_CFG, LOG2SIZE);
> +idr1_sidsize = FIELD_EX32(s->idr[1], IDR1, SIDSIZE);
>  /*
>   * Check SID range against both guest-configured and implementation 
> limits
>   */
> -if (sid >= (1 << MIN(log2size, SMMU_IDR1_SIDSIZE))) {
> +if (sid >= (1 << MIN(log2size, idr1_sidsize))) {
>  event->type = SMMU_EVT_C_BAD_STREAMID;
>  return -EINVAL;
>  }




Re: [PATCH v4 1/3] qdev: Add a granule_mode property

2024-02-22 Thread Philippe Mathieu-Daudé

Hi Eric,

On 23/2/24 08:27, Eric Auger wrote:

Introduce a new enum type property that allows setting an
IOMMU granule. Values are 4K, 8K, 16K, 64K and host.
The latter indicates that the vIOMMU granule will match
the host page size.

A subsequent patch will add such a property to the
virtio-iommu device.

Signed-off-by: Eric Auger 

---

v3 -> v4:
- Add 8K
---
  include/hw/qdev-properties-system.h |  3 +++
  include/hw/virtio/virtio-iommu.h| 11 +++
  hw/core/qdev-properties-system.c| 15 +++
  hw/virtio/virtio-iommu.c| 11 +++
  4 files changed, 40 insertions(+)




diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
index 5fbe4677c2..e22327548f 100644
--- a/include/hw/virtio/virtio-iommu.h
+++ b/include/hw/virtio/virtio-iommu.h
@@ -31,6 +31,17 @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIOIOMMU, VIRTIO_IOMMU)
  
  #define TYPE_VIRTIO_IOMMU_MEMORY_REGION "virtio-iommu-memory-region"
  
+typedef enum GranuleMode {

+GRANULE_MODE_4K,
+GRANULE_MODE_8K,
+GRANULE_MODE_16K,
+GRANULE_MODE_64K,
+GRANULE_MODE_HOST,
+GRANULE_MODE__MAX,
+} GranuleMode;
+
+extern const QEnumLookup GranuleMode_lookup;


Aren't this, ...


diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 2ec5ef3cd1..a9bdc03d12 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -44,6 +44,17 @@
  #define VIOMMU_DEFAULT_QUEUE_SIZE 256
  #define VIOMMU_PROBE_SIZE 512
  
+const QEnumLookup GranuleMode_lookup = {

+.array = (const char *const[]) {
+[GRANULE_MODE_4K]   = "4K",
+[GRANULE_MODE_8K]   = "8K",
+[GRANULE_MODE_16K]  = "16K",
+[GRANULE_MODE_64K]  = "64K",
+[GRANULE_MODE_HOST] = "host",
+},
+.size = GRANULE_MODE__MAX
+};

... and this supposed to be QAPI generated?



[PATCH v4 2/3] virtio-iommu: Add a granule property

2024-02-22 Thread Eric Auger
This allows choosing which granule will be used by
default by the virtio-iommu. Current page size mask
default is qemu_target_page_mask so this translates
into a 4K granule.

Signed-off-by: Eric Auger 

---

v3 -> v4:
- granule_mode introduction moved to that patch
---
 include/hw/virtio/virtio-iommu.h |  1 +
 hw/virtio/virtio-iommu.c | 27 ---
 qemu-options.hx  |  3 +++
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
index e22327548f..a3e5b35b1e 100644
--- a/include/hw/virtio/virtio-iommu.h
+++ b/include/hw/virtio/virtio-iommu.h
@@ -78,6 +78,7 @@ struct VirtIOIOMMU {
 Notifier machine_done;
 bool granule_frozen;
 uint8_t aw_bits;
+GranuleMode granule_mode;
 };
 
 #endif
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index a9bdc03d12..0461b87ef2 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -1126,8 +1126,8 @@ static int 
virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
 }
 
 /*
- * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule,
- * for example 0xfffffffffffff000. When an assigned device has page size
+ * The default mask depends on the "granule" property. For example, with
+ * 4K granule, it is ~0xFFF. When an assigned device has page size
  * restrictions due to the hardware IOMMU configuration, apply this restriction
  * to the mask.
  */
@@ -1324,7 +1324,26 @@ static void virtio_iommu_device_realize(DeviceState 
*dev, Error **errp)
  * in vfio realize
  */
 s->config.bypass = s->boot_bypass;
-s->config.page_size_mask = qemu_target_page_mask();
+
+switch (s->granule_mode) {
+case GRANULE_MODE_4K:
+s->config.page_size_mask = ~0xFFF;
+break;
+case GRANULE_MODE_8K:
+s->config.page_size_mask = ~0x1FFF;
+break;
+case GRANULE_MODE_16K:
+s->config.page_size_mask = ~0x3FFF;
+break;
+case GRANULE_MODE_64K:
+s->config.page_size_mask = ~0xFFFF;
+break;
+case GRANULE_MODE_HOST:
+s->config.page_size_mask = qemu_real_host_page_mask();
+break;
+default:
+error_setg(errp, "Unsupported granule mode");
+}
 if (s->aw_bits < 32 || s->aw_bits > 64) {
 error_setg(errp, "aw-bits must be within [32,64]");
 }
@@ -1538,6 +1557,8 @@ static Property virtio_iommu_properties[] = {
  TYPE_PCI_BUS, PCIBus *),
 DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
 DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 0),
+DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode,
+ GRANULE_MODE_4K),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/qemu-options.hx b/qemu-options.hx
index a98bc7bd60..8bc1e9e4aa 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1179,6 +1179,9 @@ SRST
 ``aw-bits=val`` (val between 32 and 64, default depends on machine)
 This decides the address width of IOVA address space. It defaults
 to 39 bits on q35 machines and 48 bits on ARM virt machines.
+``granule=val`` (possible values are 4K, 8K, 16K, 64K and host)
+This decides the default granule to be exposed by the
+virtio-iommu. If host, the granule matches the host page size.
 
 ERST
 
-- 
2.41.0




[PATCH v4 1/3] qdev: Add a granule_mode property

2024-02-22 Thread Eric Auger
Introduce a new enum type property that allows setting an
IOMMU granule. Values are 4K, 8K, 16K, 64K and host.
The latter indicates that the vIOMMU granule will match
the host page size.

A subsequent patch will add such a property to the
virtio-iommu device.

Signed-off-by: Eric Auger 

---

v3 -> v4:
- Add 8K
---
 include/hw/qdev-properties-system.h |  3 +++
 include/hw/virtio/virtio-iommu.h| 11 +++
 hw/core/qdev-properties-system.c| 15 +++
 hw/virtio/virtio-iommu.c| 11 +++
 4 files changed, 40 insertions(+)

diff --git a/include/hw/qdev-properties-system.h 
b/include/hw/qdev-properties-system.h
index 06c359c190..626be87dd3 100644
--- a/include/hw/qdev-properties-system.h
+++ b/include/hw/qdev-properties-system.h
@@ -8,6 +8,7 @@ extern const PropertyInfo qdev_prop_macaddr;
 extern const PropertyInfo qdev_prop_reserved_region;
 extern const PropertyInfo qdev_prop_multifd_compression;
 extern const PropertyInfo qdev_prop_mig_mode;
+extern const PropertyInfo qdev_prop_granule_mode;
 extern const PropertyInfo qdev_prop_losttickpolicy;
 extern const PropertyInfo qdev_prop_blockdev_on_error;
 extern const PropertyInfo qdev_prop_bios_chs_trans;
@@ -47,6 +48,8 @@ extern const PropertyInfo qdev_prop_iothread_vq_mapping_list;
 #define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \
 DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \
MigMode)
+#define DEFINE_PROP_GRANULE_MODE(_n, _s, _f, _d) \
+DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_granule_mode, GranuleMode)
 #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \
 DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \
 LostTickPolicy)
diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
index 5fbe4677c2..e22327548f 100644
--- a/include/hw/virtio/virtio-iommu.h
+++ b/include/hw/virtio/virtio-iommu.h
@@ -31,6 +31,17 @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIOIOMMU, VIRTIO_IOMMU)
 
 #define TYPE_VIRTIO_IOMMU_MEMORY_REGION "virtio-iommu-memory-region"
 
+typedef enum GranuleMode {
+GRANULE_MODE_4K,
+GRANULE_MODE_8K,
+GRANULE_MODE_16K,
+GRANULE_MODE_64K,
+GRANULE_MODE_HOST,
+GRANULE_MODE__MAX,
+} GranuleMode;
+
+extern const QEnumLookup GranuleMode_lookup;
+
 typedef struct IOMMUDevice {
 void *viommu;
 PCIBus   *bus;
diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c
index 1a396521d5..578eac6b14 100644
--- a/hw/core/qdev-properties-system.c
+++ b/hw/core/qdev-properties-system.c
@@ -34,6 +34,7 @@
 #include "net/net.h"
 #include "hw/pci/pci.h"
 #include "hw/pci/pcie.h"
+#include "hw/virtio/virtio-iommu.h"
 #include "hw/i386/x86.h"
 #include "util/block-helpers.h"
 
@@ -679,6 +680,20 @@ const PropertyInfo qdev_prop_mig_mode = {
 .set_default_value = qdev_propinfo_set_default_value_enum,
 };
 
+/* --- GranuleMode --- */
+
+QEMU_BUILD_BUG_ON(sizeof(GranuleMode) != sizeof(int));
+
+const PropertyInfo qdev_prop_granule_mode = {
+.name = "GranuleMode",
+.description = "granule_mode values, "
+   "4K, 8K, 16K, 64K, host",
+.enum_table = &GranuleMode_lookup,
+.get = qdev_propinfo_get_enum,
+.set = qdev_propinfo_set_enum,
+.set_default_value = qdev_propinfo_set_default_value_enum,
+};
+
 /* --- Reserved Region --- */
 
 /*
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 2ec5ef3cd1..a9bdc03d12 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -44,6 +44,17 @@
 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
 #define VIOMMU_PROBE_SIZE 512
 
+const QEnumLookup GranuleMode_lookup = {
+.array = (const char *const[]) {
+[GRANULE_MODE_4K]   = "4K",
+[GRANULE_MODE_8K]   = "8K",
+[GRANULE_MODE_16K]  = "16K",
+[GRANULE_MODE_64K]  = "64K",
+[GRANULE_MODE_HOST] = "host",
+},
+.size = GRANULE_MODE__MAX
+};
+
 typedef struct VirtIOIOMMUDomain {
 uint32_t id;
 bool bypass;
-- 
2.41.0




[PATCH v4 0/3] VIRTIO-IOMMU: Set default granule to host page size

2024-02-22 Thread Eric Auger
We used to set the default granule to 4KB but with VFIO assignment
it makes more sense to use the actual host page size.

Indeed when hotplugging a VFIO device protected by a virtio-iommu
on a 64kB/64kB host/guest config, we currently get a qemu crash:

"vfio: DMA mapping failed, unable to continue"

This is due to the hot-attached VFIO device calling
memory_region_iommu_set_page_size_mask() with 64kB granule
whereas the virtio-iommu granule was already frozen to 4KB on
machine init done.

Introduce a new granule property and set this latter to "host"
and introduce a new compat.

Note that the new default will prevent 4kB guest on 64kB host
because the granule will be set to 64kB which would be larger
than the guest page size. In that situation, the virtio-iommu
driver fails on viommu_domain_finalise() with
"granule 0x10000 larger than system page size 0x1000".

In that case 4K granule should be used.

To summarize, before the series, the support matrix (credit
to Jean-Philippe Brucker) was:

 Host | Guest | virtio-net | IGB passthrough
  4k  | 4k    | Y          | Y
  64k | 64k   | Y          | N
  64k | 4k    | Y          | N
  4k  | 64k   | Y          | Y

After the series:

 Host | Guest | virtio-net | IGB passthrough
  4k  | 4k    | Y          | Y
  64k | 64k   | Y          | Y
  64k | 4k    | 4K         | N
  4k  | 64k   | Y          | Y

The current limitation of global granule in the virtio-iommu
should be removed and turned into per domain granule. But
until we get this upgraded, this new default is probably
better because I don't think anyone is currently interested in
running a 4KB page size guest with virtio-iommu on a 64KB host.
However supporting 64kB guest on 64kB host with virtio-iommu and
VFIO looks a more important feature.

This series can be found at:
https://github.com/eauger/qemu/tree/granule-v2

Applied on top of
[PATCH v5 0/4] VIRTIO-IOMMU: Introduce an aw-bits option
https://lore.kernel.org/all/20240215084315.863897-1-eric.au...@redhat.com/

History:
v3 -> v4:
- Add 8K granule (Richard)

v2 -> v3
- introduce a dedicated granule option to handle the compat

Eric Auger (3):
  qdev: Add a granule_mode property
  virtio-iommu: Add a granule property
  virtio-iommu: Change the default granule to the host page size

 include/hw/qdev-properties-system.h |  3 +++
 include/hw/virtio/virtio-iommu.h| 12 +
 hw/core/machine.c   |  1 +
 hw/core/qdev-properties-system.c| 15 
 hw/virtio/virtio-iommu.c| 38 ++---
 qemu-options.hx |  3 +++
 6 files changed, 69 insertions(+), 3 deletions(-)

-- 
2.41.0




[PATCH v4 3/3] virtio-iommu: Change the default granule to the host page size

2024-02-22 Thread Eric Auger
We used to set the default granule to 4KB but with VFIO assignment
it makes more sense to use the actual host page size.

Indeed when hotplugging a VFIO device protected by a virtio-iommu
on a 64kB/64kB host/guest config, we currently get a qemu crash:

"vfio: DMA mapping failed, unable to continue"

This is due to the hot-attached VFIO device calling
memory_region_iommu_set_page_size_mask() with 64kB granule
whereas the virtio-iommu granule was already frozen to 4KB on
machine init done.

Set the granule property to "host" and introduce a new compat.

Note that the new default will prevent 4kB guest on 64kB host
because the granule will be set to 64kB which would be larger
than the guest page size. In that situation, the virtio-iommu
driver fails on viommu_domain_finalise() with
"granule 0x10000 larger than system page size 0x1000".

In that case the workaround is to request 4K granule.

The current limitation of global granule in the virtio-iommu
should be removed and turned into per domain granule. But
until we get this upgraded, this new default is probably
better because I don't think anyone is currently interested in
running a 4KB page size guest with virtio-iommu on a 64KB host.
However supporting 64kB guest on 64kB host with virtio-iommu and
VFIO looks a more important feature.

Signed-off-by: Eric Auger 
---
 hw/core/machine.c| 1 +
 hw/virtio/virtio-iommu.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 70ac96954c..38851df4b8 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -35,6 +35,7 @@
 
 GlobalProperty hw_compat_8_2[] = {
 { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "64" },
+{ TYPE_VIRTIO_IOMMU_PCI, "granule", "4K" },
 };
 const size_t hw_compat_8_2_len = G_N_ELEMENTS(hw_compat_8_2);
 
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 0461b87ef2..e9e44a8ad8 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -1558,7 +1558,7 @@ static Property virtio_iommu_properties[] = {
 DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
 DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 0),
 DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode,
- GRANULE_MODE_4K),
+ GRANULE_MODE_HOST),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.41.0




Re: [PATCH v6 07/11] virtio-gpu: Handle resource blob commands

2024-02-22 Thread Huang Rui via
On Wed, Jan 10, 2024 at 04:51:31PM +0800, Pierre-Eric Pelloux-Prayer wrote:
> 
> 
> Le 09/01/2024 à 17:50, Pierre-Eric Pelloux-Prayer a écrit :
> > 
> > 
> > Le 19/12/2023 à 08:53, Huang Rui a écrit :
> >> From: Antonio Caggiano 
> >>
> >> Support BLOB resources creation, mapping and unmapping by calling the
> >> new stable virglrenderer 0.10 interface. Only enabled when available and
> >> via the blob config. E.g. -device virtio-vga-gl,blob=true
> >>
> >> Signed-off-by: Antonio Caggiano 
> >> Signed-off-by: Dmitry Osipenko 
> >> Signed-off-by: Xenia Ragiadakou 
> >> Signed-off-by: Huang Rui 
> >> ---
> >>
> >> Changes in v6:
> >> - Use new struct virgl_gpu_resource.
> >> - Unmap, unref and destroy the resource only after the memory region
> >>    has been completely removed.
> >> - In unref check whether the resource is still mapped.
> >> - In unmap_blob check whether the resource has been already unmapped.
> >> - Fix coding style
> >>
> >>   hw/display/virtio-gpu-virgl.c | 274 +-
> >>   hw/display/virtio-gpu.c   |   4 +-
> >>   meson.build   |   4 +
> >>   3 files changed, 276 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c
> >> index faab374336..5a3a292f79 100644
> >> --- a/hw/display/virtio-gpu-virgl.c
> >> +++ b/hw/display/virtio-gpu-virgl.c
> >> @@ -17,6 +17,7 @@
> >>   #include "trace.h"
> >>   #include "hw/virtio/virtio.h"
> >>   #include "hw/virtio/virtio-gpu.h"
> >> +#include "hw/virtio/virtio-gpu-bswap.h"
> >>   #include "ui/egl-helpers.h"
> >> @@ -24,8 +25,62 @@
> >>   struct virgl_gpu_resource {
> >>   struct virtio_gpu_simple_resource res;
> >> +    uint32_t ref;
> >> +    VirtIOGPU *g;
> >> +
> >> +#ifdef HAVE_VIRGL_RESOURCE_BLOB
> >> +    /* only blob resource needs this region to be mapped as guest mmio */
> >> +    MemoryRegion *region;
> >> +#endif
> >>   };
> >> +static void vres_get_ref(struct virgl_gpu_resource *vres)
> >> +{
> >> +    uint32_t ref;
> >> +
> >> +    ref = qatomic_fetch_inc(&vres->ref);
> >> +    g_assert(ref < INT_MAX);
> >> +}
> >> +
> >> +static void virgl_resource_destroy(struct virgl_gpu_resource *vres)
> >> +{
> >> +    struct virtio_gpu_simple_resource *res;
> >> +    VirtIOGPU *g;
> >> +
> >> +    if (!vres) {
> >> +    return;
> >> +    }
> >> +
> >> +    g = vres->g;
> >> +    res = &vres->res;
> >> +    QTAILQ_REMOVE(&g->reslist, res, next);
> >> +    virtio_gpu_cleanup_mapping(g, res);
> >> +    g_free(vres);
> >> +}
> >> +
> >> +static void virgl_resource_unref(struct virgl_gpu_resource *vres)
> >> +{
> >> +    struct virtio_gpu_simple_resource *res;
> >> +
> >> +    if (!vres) {
> >> +    return;
> >> +    }
> >> +
> >> +    res = &vres->res;
> >> +    virgl_renderer_resource_detach_iov(res->resource_id, NULL, NULL);
> >> +    virgl_renderer_resource_unref(res->resource_id);
> >> +}
> >> +
> >> +static void vres_put_ref(struct virgl_gpu_resource *vres)
> >> +{
> >> +    g_assert(vres->ref > 0);
> >> +
> >> +    if (qatomic_fetch_dec(&vres->ref) == 1) {
> >> +    virgl_resource_unref(vres);
> >> +    virgl_resource_destroy(vres);
> >> +    }
> >> +}
> >> +
> >>   static struct virgl_gpu_resource *
> >>   virgl_gpu_find_resource(VirtIOGPU *g, uint32_t resource_id)
> >>   {
> >> @@ -59,6 +114,8 @@ static void virgl_cmd_create_resource_2d(VirtIOGPU *g,
> >>  c2d.width, c2d.height);
> >>   vres = g_new0(struct virgl_gpu_resource, 1);
> >> +    vres_get_ref(vres);
> >> +    vres->g = g;
> >>   vres->res.width = c2d.width;
> >>   vres->res.height = c2d.height;
> >>   vres->res.format = c2d.format;
> >> @@ -91,6 +148,8 @@ static void virgl_cmd_create_resource_3d(VirtIOGPU *g,
> >>  c3d.width, c3d.height, c3d.depth);
> >>   vres = g_new0(struct virgl_gpu_resource, 1);
> >> +    vres_get_ref(vres);
> >> +    vres->g = g;
> >>   vres->res.width = c3d.width;
> >>   vres->res.height = c3d.height;
> >>   vres->res.format = c3d.format;
> >> @@ -126,12 +185,21 @@ static void virgl_cmd_resource_unref(VirtIOGPU *g,
> >>   return;
> >>   }
> >> -    virgl_renderer_resource_detach_iov(unref.resource_id, NULL, NULL);
> >> -    virgl_renderer_resource_unref(unref.resource_id);
> >> +#ifdef HAVE_VIRGL_RESOURCE_BLOB
> >> +    if (vres->region) {
> >> +    VirtIOGPUBase *b = VIRTIO_GPU_BASE(g);
> >> +    MemoryRegion *mr = vres->region;
> >> +
> >> +    warn_report("%s: blob resource %d not unmapped",
> >> +    __func__, unref.resource_id);
> >> +    vres->region = NULL;
> > 
> > Shouldn't there be a call to memory_region_unref(mr)?
> > 
> >> +    memory_region_set_enabled(mr, false);
> >> +    memory_region_del_subregion(&b->hostmem, mr);
> >> +    object_unparent(OBJECT(mr));
> >> +    }
> >> +#endif /* HAVE_VIRGL_RESOURCE_BLOB */
> >> -    QTAILQ_REMOVE(&g->reslist, &vres->res, next);
> >> -    

Re: [PATCH v2 1/2] hw/arm: Use TYPE_OR_IRQ when connecting STM32L4x5 EXTI fan-in IRQs

2024-02-22 Thread Philippe Mathieu-Daudé

On 20/2/24 19:34, Inès Varhol wrote:

Fixes: 52671f69f7a4 ("[PATCH v8 0/3] Add device STM32L4x5 EXTI")
Signed-off-by: Inès Varhol 
---
  include/hw/arm/stm32l4x5_soc.h |  4 ++
  hw/arm/stm32l4x5_soc.c | 80 +-
  2 files changed, 74 insertions(+), 10 deletions(-)


Thanks for cleaning that!
Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH v3 0/3] Add support for I2C in BCM2835 boards

2024-02-22 Thread Philippe Mathieu-Daudé

Hi Peter,

On 22/2/24 18:54, Peter Maydell wrote:

On Tue, 20 Feb 2024 at 13:42, Rayhan Faizel  wrote:


This patch series implements support for the Broadcom Serial Controller used
by BCM2835 based boards for I2C.




Rayhan Faizel (3):
   hw/i2c: Implement Broadcom Serial Controller (BSC)
   hw/arm: Connect BSC to BCM2835 board as I2C0, I2C1 and I2C2
   tests/qtest: Add testcase for BCM2835 BSC




Applied to target-arm.next, thanks.


Sorry I didn't notice earlier, the I2C[3] IRQ lines have
to be OR-ed using a TYPE_OR_IRQ object before reaching the
INTC. I'd rather a v4, but if you already posted your PR
this can get fixed on top.

Regards,

Phil.



Re: [PATCH v3 2/3] hw/arm: Connect BSC to BCM2835 board as I2C0, I2C1 and I2C2

2024-02-22 Thread Philippe Mathieu-Daudé

On 20/2/24 14:41, Rayhan Faizel wrote:

BCM2835 has three I2C controllers. All of them share the same interrupt line.

Signed-off-by: Rayhan Faizel 
---
  hw/arm/Kconfig   |  1 +
  hw/arm/bcm2835_peripherals.c | 32 +---
  include/hw/arm/bcm2835_peripherals.h |  3 ++-
  3 files changed, 32 insertions(+), 4 deletions(-)




diff --git a/hw/arm/bcm2835_peripherals.c b/hw/arm/bcm2835_peripherals.c
index d5573fd954..ca692ed9a5 100644
--- a/hw/arm/bcm2835_peripherals.c
+++ b/hw/arm/bcm2835_peripherals.c
@@ -148,6 +148,14 @@ static void bcm2835_peripherals_init(Object *obj)
  /* SPI */
  object_initialize_child(obj, "bcm2835-spi0", &s->spi[0],
  TYPE_BCM2835_SPI);
+
+/* I2C */
+object_initialize_child(obj, "bcm2835-i2c0", &s->i2c[0],
+TYPE_BCM2835_I2C);
+object_initialize_child(obj, "bcm2835-i2c1", &s->i2c[1],
+TYPE_BCM2835_I2C);
+object_initialize_child(obj, "bcm2835-i2c2", &s->i2c[2],
+TYPE_BCM2835_I2C);
  }
  
  static void bcm2835_peripherals_realize(DeviceState *dev, Error **errp)

@@ -418,14 +426,32 @@ static void bcm2835_peripherals_realize(DeviceState *dev, 
Error **errp)
BCM2835_IC_GPU_IRQ,
INTERRUPT_SPI));
  
+/* I2C */

+for (n = 0; n < 3; n++) {
+if (!sysbus_realize(SYS_BUS_DEVICE(&s->i2c[n]), errp)) {
+return;
+}
+}
+
+memory_region_add_subregion(&s->peri_mr, BSC0_OFFSET,
+sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->i2c[0]), 0));
+memory_region_add_subregion(&s->peri_mr, BSC1_OFFSET,
+sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->i2c[1]), 0));
+memory_region_add_subregion(&s->peri_mr, BSC2_OFFSET,
+sysbus_mmio_get_region(SYS_BUS_DEVICE(&s->i2c[2]), 0));
+
+for (n = 0; n < 3; n++) {
+sysbus_connect_irq(SYS_BUS_DEVICE(&s->i2c[n]), 0,
+   qdev_get_gpio_in_named(DEVICE(&s->ic),
+  BCM2835_IC_GPU_IRQ,


Due to how QEMU IRQs are implemented, we can not wire multiple IRQs
to the same output without using an intermediate "OR gate". We model
it as TYPE_OR_IRQ. See the comment in "hw/qdev-core.h" added in
commit cd07d7f9f5 ("qdev: Document GPIO related functions"):

  * It is not valid to try to connect one outbound GPIO to multiple
  * qemu_irqs at once, or to connect multiple outbound GPIOs to the
  * same qemu_irq. (Warning: there is no assertion or other guard to
  * catch this error: the model will just not do the right thing.)
  * Instead, for fan-out you can use the TYPE_SPLIT_IRQ device: connect
  * a device's outbound GPIO to the splitter's input, and connect each
  * of the splitter's outputs to a different device.  For fan-in you
  * can use the TYPE_OR_IRQ device, which is a model of a logical OR
  * gate with multiple inputs and one output.


+  INTERRUPT_I2C));
+}
+
  create_unimp(s, &s->txp, "bcm2835-txp", TXP_OFFSET, 0x1000);
  create_unimp(s, &s->armtmr, "bcm2835-sp804", ARMCTRL_TIMER0_1_OFFSET, 
0x40);
  create_unimp(s, &s->i2s, "bcm2835-i2s", I2S_OFFSET, 0x100);
  create_unimp(s, &s->smi, "bcm2835-smi", SMI_OFFSET, 0x100);
  create_unimp(s, &s->bscsl, "bcm2835-spis", BSC_SL_OFFSET, 0x100);
-create_unimp(s, &s->i2c[0], "bcm2835-i2c0", BSC0_OFFSET, 0x20);
-create_unimp(s, &s->i2c[1], "bcm2835-i2c1", BSC1_OFFSET, 0x20);
-create_unimp(s, &s->i2c[2], "bcm2835-i2c2", BSC2_OFFSET, 0x20);
  create_unimp(s, &s->otp, "bcm2835-otp", OTP_OFFSET, 0x80);
  create_unimp(s, &s->dbus, "bcm2835-dbus", DBUS_OFFSET, 0x8000);
  create_unimp(s, &s->ave0, "bcm2835-ave0", AVE0_OFFSET, 0x8000);




Re: [PATCH v3 1/3] hw/i2c: Implement Broadcom Serial Controller (BSC)

2024-02-22 Thread Philippe Mathieu-Daudé

On 20/2/24 14:41, Rayhan Faizel wrote:

A few deficiencies in the current device model need to be noted.

1. FIFOs are not used. All sends and receives are done directly.
2. Repeated starts are not emulated. Repeated starts can be triggered in real
hardware by sending a new read transfer request in the window time between
transfer active set of write transfer request and done bit set of the same.

Signed-off-by: Rayhan Faizel 
---
  docs/system/arm/raspi.rst|   1 +
  hw/i2c/Kconfig   |   4 +
  hw/i2c/bcm2835_i2c.c | 278 +++
  hw/i2c/meson.build   |   1 +
  include/hw/i2c/bcm2835_i2c.h |  80 ++
  5 files changed, 364 insertions(+)
  create mode 100644 hw/i2c/bcm2835_i2c.c
  create mode 100644 include/hw/i2c/bcm2835_i2c.h




+static const MemoryRegionOps bcm2835_i2c_ops = {
+.read = bcm2835_i2c_read,
+.write = bcm2835_i2c_write,
+.endianness = DEVICE_NATIVE_ENDIAN,


Watch out, your implementation is 32-bit, so this misses:

  .impl = {
  .min_access_size = 4,
  .max_access_size = 4,
  },


+};




diff --git a/include/hw/i2c/bcm2835_i2c.h b/include/hw/i2c/bcm2835_i2c.h
new file mode 100644
index 0000000000..0a56df4720
--- /dev/null
+++ b/include/hw/i2c/bcm2835_i2c.h




+#define BCM2835_I2C_C   0x0   /* Control */
+#define BCM2835_I2C_S   0x4   /* Status */
+#define BCM2835_I2C_DLEN0x8   /* Data Length */
+#define BCM2835_I2C_A   0xc   /* Slave Address */
+#define BCM2835_I2C_FIFO0x10  /* FIFO */
+#define BCM2835_I2C_DIV 0x14  /* Clock Divider */
+#define BCM2835_I2C_DEL 0x18  /* Data Delay */
+#define BCM2835_I2C_CLKT0x20  /* Clock Stretch Timeout */




Re: [PATCH v3 1/3] hw/i2c: Implement Broadcom Serial Controller (BSC)

2024-02-22 Thread Philippe Mathieu-Daudé

Hi Rayhan,

On 20/2/24 14:41, Rayhan Faizel wrote:

A few deficiencies in the current device model need to be noted.

1. FIFOs are not used. All sends and receives are done directly.
2. Repeated starts are not emulated. Repeated starts can be triggered in real
hardware by sending a new read transfer request in the window time between
transfer active set of write transfer request and done bit set of the same.

Signed-off-by: Rayhan Faizel 
---
  docs/system/arm/raspi.rst|   1 +
  hw/i2c/Kconfig   |   4 +
  hw/i2c/bcm2835_i2c.c | 278 +++
  hw/i2c/meson.build   |   1 +
  include/hw/i2c/bcm2835_i2c.h |  80 ++
  5 files changed, 364 insertions(+)
  create mode 100644 hw/i2c/bcm2835_i2c.c
  create mode 100644 include/hw/i2c/bcm2835_i2c.h




new file mode 100644
index 0000000000..d6b9bf887a
--- /dev/null
+++ b/hw/i2c/bcm2835_i2c.c
@@ -0,0 +1,278 @@
+/*
+ * Broadcom Serial Controller (BSC)
+ *
+ * Copyright (c) 2024 Rayhan Faizel 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "hw/i2c/bcm2835_i2c.h"
+#include "hw/irq.h"
+#include "migration/vmstate.h"
+
+static void bcm2835_i2c_update_interrupt(BCM2835I2CState *s)
+{
+int do_interrupt = 0;
+/* Interrupt on RXR (Needs reading) */
+if (s->c & BCM2835_I2C_C_INTR && s->s & BCM2835_I2C_S_RXR) {
+do_interrupt = 1;
+}
+
+/* Interrupt on TXW (Needs writing) */
+if (s->c & BCM2835_I2C_C_INTT && s->s & BCM2835_I2C_S_TXW) {
+do_interrupt = 1;
+}
+
+/* Interrupt on DONE (Transfer complete) */
+if (s->c & BCM2835_I2C_C_INTD && s->s & BCM2835_I2C_S_DONE) {
+do_interrupt = 1;
+}
+qemu_set_irq(s->irq, do_interrupt);
+}
+
+static void bcm2835_i2c_begin_transfer(BCM2835I2CState *s)
+{
+int direction = s->c & BCM2835_I2C_C_READ;
+if (i2c_start_transfer(s->bus, s->a, direction)) {
+s->s |= BCM2835_I2C_S_ERR;
+}
+s->s |= BCM2835_I2C_S_TA;
+
+if (direction) {
+s->s |= BCM2835_I2C_S_RXR | BCM2835_I2C_S_RXD;
+} else {
+s->s |= BCM2835_I2C_S_TXW;
+}
+}
+
+static void bcm2835_i2c_finish_transfer(BCM2835I2CState *s)
+{
+/*
+ * STOP is sent when DLEN counts down to zero.
+ *
+ * 
https://github.com/torvalds/linux/blob/master/drivers/i2c/busses/i2c-bcm2835.c#L223-L261


Sorry for not reviewing your patches earlier.

Since this documentation will stay for long and the Linux master branch
will change, better use a tag:
https://github.com/torvalds/linux/blob/v6.7/drivers/i2c/busses/i2c-bcm2835.c#L223-L261

Do you mind posting a patch to correct this?

Thanks,

Phil.



Re: [PATCH v3] virtio-pci: correctly set virtio pci queue mem multiplier

2024-02-22 Thread Michael S. Tsirkin
On Fri, Feb 23, 2024 at 10:56:17AM +0530, Srujana Challa wrote:
> Currently, virtio_pci_queue_mem_mult function always returns 4K
> when VIRTIO_PCI_FLAG_PAGE_PER_VQ is set. But this won't
> work for vhost vdpa when host has page size other than 4K.
> This patch introduces a new property(host-page-per-vq) for vdpa
> use case to fix the same.
> 
> Signed-off-by: Srujana Challa 

Looks good. I'd like to fail realize if both
   (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)
and
   (proxy->flags & VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ)
are set, so users do not start depending on this combination.



> ---
> v2->v3:
> - Modified property name, page-per-vdpa-vq to host-page-per-vq.
> 
> v1->v2:
> - Introduced a new property to get virtqueue mem multiplier for
>   vdpa use case.
> 
>  hw/virtio/virtio-pci.c | 10 --
>  include/hw/virtio/virtio-pci.h |  5 +
>  2 files changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index 1a7039fb0c..f29e60830b 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -320,8 +320,12 @@ static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
>  
>  static inline int virtio_pci_queue_mem_mult(struct VirtIOPCIProxy *proxy)
>  {
> -return (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) ?
> -QEMU_VIRTIO_PCI_QUEUE_MEM_MULT : 4;
> +if (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)
> +return QEMU_VIRTIO_PCI_QUEUE_MEM_MULT;
> +else if (proxy->flags & VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ)
> +return qemu_real_host_page_size();
> +else
> +return 4;
>  }
>  
>  static int virtio_pci_ioeventfd_assign(DeviceState *d, EventNotifier 
> *notifier,
> @@ -2301,6 +2305,8 @@ static Property virtio_pci_properties[] = {
>  VIRTIO_PCI_FLAG_INIT_FLR_BIT, true),
>  DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags,
>  VIRTIO_PCI_FLAG_AER_BIT, false),
> +DEFINE_PROP_BIT("host-page-per-vq", VirtIOPCIProxy, flags,
> +VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT, false),
>  DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
> index 59d88018c1..81b6de4291 100644
> --- a/include/hw/virtio/virtio-pci.h
> +++ b/include/hw/virtio/virtio-pci.h
> @@ -43,6 +43,7 @@ enum {
>  VIRTIO_PCI_FLAG_INIT_FLR_BIT,
>  VIRTIO_PCI_FLAG_AER_BIT,
>  VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT,
> +VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT,
>  };
>  
>  /* Need to activate work-arounds for buggy guests at vmstate load. */
> @@ -89,6 +90,10 @@ enum {
>  #define VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \
>(1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT)
>  
> +/* page per vdpa vq flag to be used for vhost vdpa backends */
> +#define VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ \
> +(1 << VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT)
> +
>  typedef struct {
>  MSIMessage msg;
>  int virq;
> -- 
> 2.25.1




Re: [PATCH 2/3] linux-user: Add strace for shmat

2024-02-22 Thread Richard Henderson

On 2/22/24 17:03, Richard Henderson wrote:

Signed-off-by: Richard Henderson 
---
  linux-user/strace.c| 23 +++
  linux-user/strace.list |  2 +-
  2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/linux-user/strace.c b/linux-user/strace.c
index cf26e55264..47d6ec3263 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -670,6 +670,25 @@ print_semctl(CPUArchState *cpu_env, const struct 
syscallname *name,
  }
  #endif
  
+static void

+print_shmat(CPUArchState *cpu_env, const struct syscallname *name,
+abi_long arg0, abi_long arg1, abi_long arg2,
+abi_long arg3, abi_long arg4, abi_long arg5)
+{
+static const struct flags shmat_flags[] = {
+FLAG_GENERIC(SHM_RND),
+FLAG_GENERIC(SHM_REMAP),
+FLAG_GENERIC(SHM_RDONLY),
+FLAG_GENERIC(SHM_EXEC),
+};


Missing FLAG_END, of course.


r~



Re: [External] Re: [PATCH v2 4/7] migration/multifd: Enable zero page checking from multifd threads.

2024-02-22 Thread Hao Xiang
On Thu, Feb 22, 2024 at 6:33 PM Peter Xu  wrote:
>
> On Wed, Feb 21, 2024 at 06:06:19PM -0300, Fabiano Rosas wrote:
> > Hao Xiang  writes:
> >
> > > This change adds a dedicated handler for 
> > > MigrationOps::ram_save_target_page in
> >
> > nit: Add a dedicated handler...
> >
> > Usually "this patch/change" is used only when necessary to avoid
> > ambiguity.
> >
> > > multifd live migration. Now zero page checking can be done in the multifd 
> > > threads
> > > and this becomes the default configuration. We still provide backward 
> > > compatibility
> > > where zero page checking is done from the migration main thread.
> > >
> > > Signed-off-by: Hao Xiang 
> > > ---
> > >  migration/multifd.c |  1 +
> > >  migration/options.c |  2 +-
> > >  migration/ram.c | 53 ++---
> > >  3 files changed, 42 insertions(+), 14 deletions(-)
> > >
> > > diff --git a/migration/multifd.c b/migration/multifd.c
> > > index fbb40ea10b..ef5dad1019 100644
> > > --- a/migration/multifd.c
> > > +++ b/migration/multifd.c
> > > @@ -13,6 +13,7 @@
> > >  #include "qemu/osdep.h"
> > >  #include "qemu/cutils.h"
> >
> > This include...
> >
> > >  #include "qemu/rcu.h"
> > > +#include "qemu/cutils.h"
> >
> > is there already.
> >
> > >  #include "exec/target_page.h"
> > >  #include "sysemu/sysemu.h"
> > >  #include "exec/ramblock.h"
> > > diff --git a/migration/options.c b/migration/options.c
> > > index 3c603391b0..3c79b6ccd4 100644
> > > --- a/migration/options.c
> > > +++ b/migration/options.c
> > > @@ -181,7 +181,7 @@ Property migration_properties[] = {
> > >MIG_MODE_NORMAL),
> > >  DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", 
> > > MigrationState,
> > > parameters.zero_page_detection,
> > > -   ZERO_PAGE_DETECTION_LEGACY),
> > > +   ZERO_PAGE_DETECTION_MULTIFD),
> >
> > I think we'll need something to avoid a 9.0 -> 8.2 migration with this
> > enabled. Otherwise it will go along happily until we get data corruption
> > because the new QEMU didn't send any zero pages on the migration thread
> > and the old QEMU did not look for them in the multifd packet.
>
> It could be even worse: as the new QEMU will only attach "normal" pages
> after the multifd packet, the old QEMU could try to read more data than
> was actually sent, expecting all pages.
>
> >
> > Perhaps bumping the MULTIFD_VERSION when ZERO_PAGE_DETECTION_MULTIFD is
> > in use. We'd just need to fix the test in the new QEMU to check
> > (msg.version > MULTIFD_VERSION) instead of (msg.version != MULTIFD_VERSION).
>
> IMHO we don't need yet to change MULTIFD_VERSION, what we need is perhaps a
> compat entry in hw_compat_8_2 setting "zero-page-detection" to "legacy".
> We should make sure when "legacy" is set, multifd ran the old protocol
> (zero_num will always be 0, and will be ignored by old QEMUs, IIUC).
>
> One more comment is, when repost please consider split this patch into two;
> The new ram_save_target_page_multifd() hook can be done in another patch,
> AFAIU.

Sorry, I kept missing this. I will keep reminding myself that
compatibility is king. I will add the hw_compat_8_2 entry and make sure
to test that a 9.0 -> 8.2 migration fails with the "multifd" option set.
I will also split the patches.

>
> >
> > >
> > >  /* Migration capabilities */
> > >  DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
> > > diff --git a/migration/ram.c b/migration/ram.c
> > > index 5ece9f042e..b088c5a98c 100644
> > > --- a/migration/ram.c
> > > +++ b/migration/ram.c
> > > @@ -1123,10 +1123,6 @@ static int save_zero_page(RAMState *rs, 
> > > PageSearchStatus *pss,
> > >  QEMUFile *file = pss->pss_channel;
> > >  int len = 0;
> > >
> > > -if (migrate_zero_page_detection() != ZERO_PAGE_DETECTION_LEGACY) {
> > > -return 0;
> > > -}
> >
> > How does 'none' work now?
> >
> > > -
> > >  if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
> > >  return 0;
> > >  }
> > > @@ -1256,6 +1252,10 @@ static int ram_save_page(RAMState *rs, 
> > > PageSearchStatus *pss)
> > >
> > >  static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
> > >  {
> > > +assert(migrate_multifd());
> > > +assert(!migrate_compress());
> > > +assert(!migration_in_postcopy());
> >
> > Drop these, please. Keep only the asserts that are likely to trigger
> > during development, such as the existing ones at multifd_send_pages.
> >
> > > +
> > >  if (!multifd_queue_page(block, offset)) {
> > >  return -1;
> > >  }
> > > @@ -2046,7 +2046,6 @@ static bool save_compress_page(RAMState *rs, 
> > > PageSearchStatus *pss,
> > >   */
> > >  static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus 
> > > *pss)
> > >  {
> > > -RAMBlock *block = pss->block;
> > >  ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
> > >  int res;
> > >
> > > @@ -2062,17 +2061,40 @@ static int ram_save_target_page_legacy(RAMState 

Re: [PATCH V5 1/5] util: str_split

2024-02-22 Thread Philippe Mathieu-Daudé

On 22/2/24 22:47, Steve Sistare wrote:

Generalize hmp_split_at_comma() to take any delimiter string, rename
as str_split(), and move it to util/strList.c.

No functional change.

Signed-off-by: Steve Sistare 
---
  include/monitor/hmp.h  |  1 -
  include/qemu/strList.h | 24 
  monitor/hmp-cmds.c | 19 ---
  net/net-hmp-cmds.c |  3 ++-
  stats/stats-hmp-cmds.c |  3 ++-
  util/meson.build   |  1 +
  util/strList.c | 24 
  7 files changed, 53 insertions(+), 22 deletions(-)
  create mode 100644 include/qemu/strList.h
  create mode 100644 util/strList.c




+#include "qapi/qapi-builtin-types.h"
+
+/*
+ * Split @str into a strList using the delimiter string @delim.
+ * The delimiter is not included in the result.
+ * Return NULL if @str is NULL or an empty string.
+ * A leading, trailing, or consecutive delimiter produces an
+ * empty string at that position in the output.
+ * All strings are g_strdup'd, and the result can be freed
+ * using qapi_free_strList.


Note "qapi/qapi-builtin-types.h" defines:

  G_DEFINE_AUTOPTR_CLEANUP_FUNC(strList, qapi_free_strList)

Maybe mention we can also use:

  g_autoptr(strList)

?


+ */
+strList *str_split(const char *str, const char *delim);
+
+#endif





Re: [PATCH V5 1/5] util: str_split

2024-02-22 Thread Philippe Mathieu-Daudé

On 22/2/24 22:47, Steve Sistare wrote:

Generalize hmp_split_at_comma() to take any delimiter string, rename
as str_split(), and move it to util/strList.c.

No functional change.

Signed-off-by: Steve Sistare 
---
  include/monitor/hmp.h  |  1 -
  include/qemu/strList.h | 24 
  monitor/hmp-cmds.c | 19 ---
  net/net-hmp-cmds.c |  3 ++-
  stats/stats-hmp-cmds.c |  3 ++-
  util/meson.build   |  1 +
  util/strList.c | 24 
  7 files changed, 53 insertions(+), 22 deletions(-)
  create mode 100644 include/qemu/strList.h
  create mode 100644 util/strList.c


Reviewed-by: Philippe Mathieu-Daudé 




Re: [External] Re: [PATCH v2 4/7] migration/multifd: Enable zero page checking from multifd threads.

2024-02-22 Thread Hao Xiang
On Wed, Feb 21, 2024 at 1:06 PM Fabiano Rosas  wrote:
>
> Hao Xiang  writes:
>
> > This change adds a dedicated handler for MigrationOps::ram_save_target_page 
> > in
>
> nit: Add a dedicated handler...
>
> Usually "this patch/change" is used only when necessary to avoid
> ambiguity.

Will do.

>
> > multifd live migration. Now zero page checking can be done in the multifd 
> > threads
> > and this becomes the default configuration. We still provide backward 
> > compatibility
> > where zero page checking is done from the migration main thread.
> >
> > Signed-off-by: Hao Xiang 
> > ---
> >  migration/multifd.c |  1 +
> >  migration/options.c |  2 +-
> >  migration/ram.c | 53 ++---
> >  3 files changed, 42 insertions(+), 14 deletions(-)
> >
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index fbb40ea10b..ef5dad1019 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -13,6 +13,7 @@
> >  #include "qemu/osdep.h"
> >  #include "qemu/cutils.h"
>
> This include...
>
> >  #include "qemu/rcu.h"
> > +#include "qemu/cutils.h"
>
> is there already.
>
> >  #include "exec/target_page.h"
> >  #include "sysemu/sysemu.h"
> >  #include "exec/ramblock.h"
> > diff --git a/migration/options.c b/migration/options.c
> > index 3c603391b0..3c79b6ccd4 100644
> > --- a/migration/options.c
> > +++ b/migration/options.c
> > @@ -181,7 +181,7 @@ Property migration_properties[] = {
> >MIG_MODE_NORMAL),
> >  DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
> > parameters.zero_page_detection,
> > -   ZERO_PAGE_DETECTION_LEGACY),
> > +   ZERO_PAGE_DETECTION_MULTIFD),
>
> I think we'll need something to avoid a 9.0 -> 8.2 migration with this
> enabled. Otherwise it will go along happily until we get data corruption
> because the new QEMU didn't send any zero pages on the migration thread
> and the old QEMU did not look for them in the multifd packet.
>
> Perhaps bumping the MULTIFD_VERSION when ZERO_PAGE_DETECTION_MULTIFD is
> in use. We'd just need to fix the test in the new QEMU to check
> (msg.version > MULTIFD_VERSION) instead of (msg.version != MULTIFD_VERSION).
>
> >
> >  /* Migration capabilities */
> >  DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
> > diff --git a/migration/ram.c b/migration/ram.c
> > index 5ece9f042e..b088c5a98c 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -1123,10 +1123,6 @@ static int save_zero_page(RAMState *rs, 
> > PageSearchStatus *pss,
> >  QEMUFile *file = pss->pss_channel;
> >  int len = 0;
> >
> > -if (migrate_zero_page_detection() != ZERO_PAGE_DETECTION_LEGACY) {
> > -return 0;
> > -}
>
> How does 'none' work now?

I tested it and all pages are transferred with payload (including the
zero pages).

>
> > -
> >  if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
> >  return 0;
> >  }
> > @@ -1256,6 +1252,10 @@ static int ram_save_page(RAMState *rs, 
> > PageSearchStatus *pss)
> >
> >  static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
> >  {
> > +assert(migrate_multifd());
> > +assert(!migrate_compress());
> > +assert(!migration_in_postcopy());
>
> Drop these, please. Keep only the asserts that are likely to trigger
> during development, such as the existing ones at multifd_send_pages.

I think I have got enough feedback regarding too many asserts. I will
drop these. assert is not compiled into the free build, correct?

>
> > +
> >  if (!multifd_queue_page(block, offset)) {
> >  return -1;
> >  }
> > @@ -2046,7 +2046,6 @@ static bool save_compress_page(RAMState *rs, 
> > PageSearchStatus *pss,
> >   */
> >  static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
> >  {
> > -RAMBlock *block = pss->block;
> >  ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
> >  int res;
> >
> > @@ -2062,17 +2061,40 @@ static int ram_save_target_page_legacy(RAMState 
> > *rs, PageSearchStatus *pss)
> >  return 1;
> >  }
> >
> > +return ram_save_page(rs, pss);
>
> Look at where git put this! Are you using the default diff algorithm? If
> not try using --patience to see if it improves the diff.

I used the default diff algorithm.

>
> > +}
> > +
> > +/**
> > + * ram_save_target_page_multifd: save one target page
> > + *
> > + * Returns the number of pages written
>
> We could be more precise here:
>
>  ram_save_target_page_multifd: send one target page to multifd workers
>
>  Returns 1 if the page was queued, -1 otherwise.

Will do.

>
> > + *
> > + * @rs: current RAM state
> > + * @pss: data about the page we want to send
> > + */
> > +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus 
> > *pss)
> > +{
> > +RAMBlock *block = pss->block;
> > +ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
> > +
> > +   

Re: [PATCH v3] virtio-pci: correctly set virtio pci queue mem multiplier

2024-02-22 Thread Philippe Mathieu-Daudé

Hi Srujana,

On 23/2/24 06:26, Srujana Challa wrote:

Currently, virtio_pci_queue_mem_mult function always returns 4K
when VIRTIO_PCI_FLAG_PAGE_PER_VQ is set. But this won't
work for vhost vdpa when host has page size other than 4K.
This patch introduces a new property(host-page-per-vq) for vdpa
use case to fix the same.

Signed-off-by: Srujana Challa 
---
v2->v3:
- Modified property name, page-per-vdpa-vq to host-page-per-vq.

v1->v2:
- Introduced a new property to get virtqueue mem multiplier for
   vdpa use case.

  hw/virtio/virtio-pci.c | 10 --
  include/hw/virtio/virtio-pci.h |  5 +
  2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 1a7039fb0c..f29e60830b 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -320,8 +320,12 @@ static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
  
  static inline int virtio_pci_queue_mem_mult(struct VirtIOPCIProxy *proxy)

  {
-return (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) ?
-QEMU_VIRTIO_PCI_QUEUE_MEM_MULT : 4;
+if (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)
+return QEMU_VIRTIO_PCI_QUEUE_MEM_MULT;
+else if (proxy->flags & VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ)
+return qemu_real_host_page_size();
+else
+return 4;

>   }

Per our coding style, this code should use braces:
https://www.qemu.org/docs/master/devel/style.html#block-structure

Regards,

Phil.



Re: [PATCH v2 0/6] libqos, riscv: libqos fixes, add riscv machine

2024-02-22 Thread Alistair Francis
On Sun, Feb 18, 2024 at 5:27 AM Daniel Henrique Barboza
 wrote:
>
> Hi,
>
> This second version was rebased with current
> alistair/riscv.to.apply-next and has typo fixes in patch 1. No other
> changes were made.
>
> All patches reviewed/acked.
>
> Changes from v1:
> - patch 1: typos in the commit message fixed
> - v1 link: 
> https://lore.kernel.org/qemu-riscv/20240213191736.74-1-dbarb...@ventanamicro.com/
>
> Daniel Henrique Barboza (6):
>   libqos/virtio.c: init all elems in qvring_indirect_desc_setup()
>   libqos/virtio.c: fix 'avail_event' offset in qvring_init()
>   hw/riscv/virt.c: create '/soc/pci@...' fdt node earlier
>   hw/riscv/virt.c: add virtio-iommu-pci hotplug support
>   hw/riscv/virt.c: make aclint compatible with 'qtest' accel
>   tests/libqos: add riscv/virt machine nodes

Thanks!

Applied to riscv-to-apply.next

Alistair

>
>  hw/riscv/virt.c |  97 -
>  tests/qtest/libqos/meson.build  |   1 +
>  tests/qtest/libqos/riscv-virt-machine.c | 137 
>  tests/qtest/libqos/virtio.c |  27 +++--
>  4 files changed, 228 insertions(+), 34 deletions(-)
>  create mode 100644 tests/qtest/libqos/riscv-virt-machine.c
>
> --
> 2.43.2
>
>



[PATCH v3] virtio-pci: correctly set virtio pci queue mem multiplier

2024-02-22 Thread Srujana Challa
Currently, the virtio_pci_queue_mem_mult() function always returns 4K
when VIRTIO_PCI_FLAG_PAGE_PER_VQ is set. But this won't
work for vhost vdpa when the host has a page size other than 4K.
This patch introduces a new property (host-page-per-vq) for the vdpa
use case to fix this.

Signed-off-by: Srujana Challa 
---
v2->v3:
- Modified property name, page-per-vdpa-vq to host-page-per-vq.

v1->v2:
- Introduced a new property to get virtqueue mem multiplier for
  vdpa use case.

 hw/virtio/virtio-pci.c | 10 --
 include/hw/virtio/virtio-pci.h |  5 +
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 1a7039fb0c..f29e60830b 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -320,8 +320,12 @@ static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
 
 static inline int virtio_pci_queue_mem_mult(struct VirtIOPCIProxy *proxy)
 {
-return (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) ?
-QEMU_VIRTIO_PCI_QUEUE_MEM_MULT : 4;
+if (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)
+return QEMU_VIRTIO_PCI_QUEUE_MEM_MULT;
+else if (proxy->flags & VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ)
+return qemu_real_host_page_size();
+else
+return 4;
 }
 
 static int virtio_pci_ioeventfd_assign(DeviceState *d, EventNotifier *notifier,
@@ -2301,6 +2305,8 @@ static Property virtio_pci_properties[] = {
 VIRTIO_PCI_FLAG_INIT_FLR_BIT, true),
 DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags,
 VIRTIO_PCI_FLAG_AER_BIT, false),
+DEFINE_PROP_BIT("host-page-per-vq", VirtIOPCIProxy, flags,
+VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 59d88018c1..81b6de4291 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -43,6 +43,7 @@ enum {
 VIRTIO_PCI_FLAG_INIT_FLR_BIT,
 VIRTIO_PCI_FLAG_AER_BIT,
 VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT,
+VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT,
 };
 
 /* Need to activate work-arounds for buggy guests at vmstate load. */
@@ -89,6 +90,10 @@ enum {
 #define VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED \
   (1 << VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT)
 
+/* page per vdpa vq flag to be used for vhost vdpa backends */
+#define VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ \
+(1 << VIRTIO_PCI_FLAG_HOST_PAGE_PER_VQ_BIT)
+
 typedef struct {
 MSIMessage msg;
 int virq;
-- 
2.25.1




Re: [External] Re: [PATCH v2 4/7] migration/multifd: Enable zero page checking from multifd threads.

2024-02-22 Thread Hao Xiang
On Wed, Feb 21, 2024 at 8:11 AM Elena Ufimtseva  wrote:
>
>
>
> On Fri, Feb 16, 2024 at 2:42 PM Hao Xiang  wrote:
>>
>> This change adds a dedicated handler for MigrationOps::ram_save_target_page 
>> in
>> multifd live migration. Now zero page checking can be done in the multifd 
>> threads
>> and this becomes the default configuration. We still provide backward 
>> compatibility
>> where zero page checking is done from the migration main thread.
>>
>> Signed-off-by: Hao Xiang 
>> ---
>>  migration/multifd.c |  1 +
>>  migration/options.c |  2 +-
>>  migration/ram.c | 53 ++---
>>  3 files changed, 42 insertions(+), 14 deletions(-)
>>
>> diff --git a/migration/multifd.c b/migration/multifd.c
>> index fbb40ea10b..ef5dad1019 100644
>> --- a/migration/multifd.c
>> +++ b/migration/multifd.c
>> @@ -13,6 +13,7 @@
>>  #include "qemu/osdep.h"
>>  #include "qemu/cutils.h"
>>  #include "qemu/rcu.h"
>> +#include "qemu/cutils.h"
>>  #include "exec/target_page.h"
>>  #include "sysemu/sysemu.h"
>>  #include "exec/ramblock.h"
>> diff --git a/migration/options.c b/migration/options.c
>> index 3c603391b0..3c79b6ccd4 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c
>> @@ -181,7 +181,7 @@ Property migration_properties[] = {
>>MIG_MODE_NORMAL),
>>  DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
>> parameters.zero_page_detection,
>> -   ZERO_PAGE_DETECTION_LEGACY),
>> +   ZERO_PAGE_DETECTION_MULTIFD),
>>
>>  /* Migration capabilities */
>>  DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 5ece9f042e..b088c5a98c 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -1123,10 +1123,6 @@ static int save_zero_page(RAMState *rs, 
>> PageSearchStatus *pss,
>>  QEMUFile *file = pss->pss_channel;
>>  int len = 0;
>>
>> -if (migrate_zero_page_detection() != ZERO_PAGE_DETECTION_LEGACY) {
>> -return 0;
>> -}
>> -
>>  if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
>>  return 0;
>>  }
>> @@ -1256,6 +1252,10 @@ static int ram_save_page(RAMState *rs, 
>> PageSearchStatus *pss)
>>
>>  static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
>>  {
>> +assert(migrate_multifd());
>
> We only call ram_save_multifd_page() if:
>  if (migrate_multifd()) {
> migration_ops->ram_save_target_page = ram_save_target_page_multifd;
> So this assert is not needed.

The point of an assert is to ensure the current function is called
with the correct assumptions. In the future, if someone moves this
function to a different place, we can catch the potential issues.

>
>> +assert(!migrate_compress());
>>
>> +assert(!migration_in_postcopy());
>
> These two are redundant and done before we call in here.
>
>> +
>>  if (!multifd_queue_page(block, offset)) {
>>  return -1;
>>  }
>> @@ -2046,7 +2046,6 @@ static bool save_compress_page(RAMState *rs, 
>> PageSearchStatus *pss,
>>   */
>>  static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
>>  {
>> -RAMBlock *block = pss->block;
>>  ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
>>  int res;
>>
>> @@ -2062,17 +2061,40 @@ static int ram_save_target_page_legacy(RAMState *rs, 
>> PageSearchStatus *pss)
>>  return 1;
>>  }
>>
>> +return ram_save_page(rs, pss);
>> +}
>> +
>> +/**
>> + * ram_save_target_page_multifd: save one target page
>> + *
>> + * Returns the number of pages written
>> + *
>> + * @rs: current RAM state
>> + * @pss: data about the page we want to send
>> + */
>> +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss)
>> +{
>> +RAMBlock *block = pss->block;
>> +ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
>> +
>> +/* Multifd is not compatible with old compression. */
>> +assert(!migrate_compress());
>
> Do we need to check this for every page?
>
>>
>> +/* Multifd is not compatible with postcopy. */
>> +assert(!migration_in_postcopy());
>> +
>>  /*
>> - * Do not use multifd in postcopy as one whole host page should be
>> - * placed.  Meanwhile postcopy requires atomic update of pages, so even
>> - * if host page size == guest page size the dest guest during run may
>> - * still see partially copied pages which is data corruption.
>> + * Backward compatibility support. While using multifd live
>> + * migration, we still need to handle zero page checking on the
>> + * migration main thread.
>>   */
>> -if (migrate_multifd() && !migration_in_postcopy()) {
>> -return ram_save_multifd_page(block, offset);
>> +if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
>> +if (save_zero_page(rs, pss, offset)) {
>> +return 1;
>> +}
>>  }
>>
>> -

Re: [External] Re: [PATCH v2 3/7] migration/multifd: Zero page transmission on the multifd thread.

2024-02-22 Thread Hao Xiang
On Wed, Feb 21, 2024 at 1:04 PM Fabiano Rosas  wrote:
>
> Hao Xiang  writes:
>
> > 1. Implement zero page detection and handling on the multifd
> > threads for non-compression, zlib and zstd compression backends.
> > 2. Add a new value 'multifd' to the ZeroPageDetection enumeration.
> > 3. Add proper asserts to ensure pages->normal is used for normal pages
> > in all scenarios.
> >
> > Signed-off-by: Hao Xiang 
> > ---
> >  migration/meson.build |  1 +
> >  migration/multifd-zero-page.c | 59 +++
> >  migration/multifd-zlib.c  | 26 ---
> >  migration/multifd-zstd.c  | 25 ---
> >  migration/multifd.c   | 50 +++--
> >  migration/multifd.h   |  7 +
> >  qapi/migration.json   |  4 ++-
> >  7 files changed, 151 insertions(+), 21 deletions(-)
> >  create mode 100644 migration/multifd-zero-page.c
> >
> > diff --git a/migration/meson.build b/migration/meson.build
> > index 92b1cc4297..1eeb915ff6 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -22,6 +22,7 @@ system_ss.add(files(
> >'migration.c',
> >'multifd.c',
> >'multifd-zlib.c',
> > +  'multifd-zero-page.c',
> >'ram-compress.c',
> >'options.c',
> >'postcopy-ram.c',
> > diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> > new file mode 100644
> > index 00..f0cd8e2c53
> > --- /dev/null
> > +++ b/migration/multifd-zero-page.c
> > @@ -0,0 +1,59 @@
> > +/*
> > + * Multifd zero page detection implementation.
> > + *
> > + * Copyright (c) 2024 Bytedance Inc
> > + *
> > + * Authors:
> > + *  Hao Xiang 
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or 
> > later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qemu/cutils.h"
> > +#include "exec/ramblock.h"
> > +#include "migration.h"
> > +#include "multifd.h"
> > +#include "options.h"
> > +#include "ram.h"
> > +
> > +void multifd_zero_page_check_send(MultiFDSendParams *p)
> > +{
> > +/*
> > + * QEMU versions older than 9.0 don't understand zero pages
> > + * on the multifd channel. This switch is required to
> > + * maintain backward compatibility.
> > + */
> > +bool use_multifd_zero_page =
> > +(migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> > +MultiFDPages_t *pages = p->pages;
> > +RAMBlock *rb = pages->block;
> > +
> > +assert(pages->num != 0);
> > +assert(pages->normal_num == 0);
> > +assert(pages->zero_num == 0);
>
> We can drop these before the final version.

Elena has the same concern. I will drop these.

>
> > +
> > +for (int i = 0; i < pages->num; i++) {
> > +uint64_t offset = pages->offset[i];
> > +if (use_multifd_zero_page &&
> > +buffer_is_zero(rb->host + offset, p->page_size)) {
> > +pages->zero[pages->zero_num] = offset;
> > +pages->zero_num++;
> > +ram_release_page(rb->idstr, offset);
> > +} else {
> > +pages->normal[pages->normal_num] = offset;
> > +pages->normal_num++;
> > +}
> > +}
>
> I don't think it's super clean to have three arrays offset, zero and
> normal, all sized for the full packet size. It might be possible to just
> carry a bitmap of non-zero pages along with pages->offset and operate on
> that instead.
>
> What do you think?
>
> Peter, any ideas? Should we just leave this for another time?
>
> > +}
> > +
> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > +{
> > +for (int i = 0; i < p->zero_num; i++) {
> > +void *page = p->host + p->zero[i];
> > +if (!buffer_is_zero(page, p->page_size)) {
> > +memset(page, 0, p->page_size);
> > +}
> > +}
> > +}
> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > index 012e3bdea1..cdfe0fa70e 100644
> > --- a/migration/multifd-zlib.c
> > +++ b/migration/multifd-zlib.c
> > @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, 
> > Error **errp)
> >  int ret;
> >  uint32_t i;
> >
> > +multifd_zero_page_check_send(p);
> > +
> > +if (!pages->normal_num) {
> > +p->next_packet_size = 0;
> > +goto out;
> > +}
> > +
> >  multifd_send_prepare_header(p);
> >
> > -for (i = 0; i < pages->num; i++) {
> > +for (i = 0; i < pages->normal_num; i++) {
> >  uint32_t available = z->zbuff_len - out_size;
> >  int flush = Z_NO_FLUSH;
> >
> > -if (i == pages->num - 1) {
> > +if (i == pages->normal_num - 1) {
> >  flush = Z_SYNC_FLUSH;
> >  }
> >
> > @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, 
> > Error **errp)
> >   * with compression. zlib does not guarantee that this is safe,
> >   * therefore copy the page before calling deflate().
> >   */
> > -

Re: [External] Re: [PATCH v2 3/7] migration/multifd: Zero page transmission on the multifd thread.

2024-02-22 Thread Hao Xiang
On Thu, Feb 22, 2024 at 6:21 PM Peter Xu  wrote:
>
> On Wed, Feb 21, 2024 at 06:04:10PM -0300, Fabiano Rosas wrote:
> > Hao Xiang  writes:
> >
> > > 1. Implement zero page detection and handling on the multifd
> > > threads for non-compression, zlib and zstd compression backends.
> > > 2. Add a new value 'multifd' to the ZeroPageDetection enumeration.
> > > 3. Add proper asserts to ensure pages->normal is used for normal pages
> > > in all scenarios.
> > >
> > > Signed-off-by: Hao Xiang 
> > > ---
> > >  migration/meson.build |  1 +
> > >  migration/multifd-zero-page.c | 59 +++
> > >  migration/multifd-zlib.c  | 26 ---
> > >  migration/multifd-zstd.c  | 25 ---
> > >  migration/multifd.c   | 50 +++--
> > >  migration/multifd.h   |  7 +
> > >  qapi/migration.json   |  4 ++-
> > >  7 files changed, 151 insertions(+), 21 deletions(-)
> > >  create mode 100644 migration/multifd-zero-page.c
> > >
> > > diff --git a/migration/meson.build b/migration/meson.build
> > > index 92b1cc4297..1eeb915ff6 100644
> > > --- a/migration/meson.build
> > > +++ b/migration/meson.build
> > > @@ -22,6 +22,7 @@ system_ss.add(files(
> > >'migration.c',
> > >'multifd.c',
> > >'multifd-zlib.c',
> > > +  'multifd-zero-page.c',
> > >'ram-compress.c',
> > >'options.c',
> > >'postcopy-ram.c',
> > > diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> > > new file mode 100644
> > > index 00..f0cd8e2c53
> > > --- /dev/null
> > > +++ b/migration/multifd-zero-page.c
> > > @@ -0,0 +1,59 @@
> > > +/*
> > > + * Multifd zero page detection implementation.
> > > + *
> > > + * Copyright (c) 2024 Bytedance Inc
> > > + *
> > > + * Authors:
> > > + *  Hao Xiang 
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2 or 
> > > later.
> > > + * See the COPYING file in the top-level directory.
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +#include "qemu/cutils.h"
> > > +#include "exec/ramblock.h"
> > > +#include "migration.h"
> > > +#include "multifd.h"
> > > +#include "options.h"
> > > +#include "ram.h"
> > > +
> > > +void multifd_zero_page_check_send(MultiFDSendParams *p)
> > > +{
> > > +/*
> > > + * QEMU versions older than 9.0 don't understand zero pages
> > > + * on the multifd channel. This switch is required to
> > > + * maintain backward compatibility.
> > > + */
> > > +bool use_multifd_zero_page =
> > > +(migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> > > +MultiFDPages_t *pages = p->pages;
> > > +RAMBlock *rb = pages->block;
> > > +
> > > +assert(pages->num != 0);
> > > +assert(pages->normal_num == 0);
> > > +assert(pages->zero_num == 0);
> >
> > We can drop these before the final version.
> >
> > > +
> > > +for (int i = 0; i < pages->num; i++) {
> > > +uint64_t offset = pages->offset[i];
> > > +if (use_multifd_zero_page &&
> > > +buffer_is_zero(rb->host + offset, p->page_size)) {
> > > +pages->zero[pages->zero_num] = offset;
> > > +pages->zero_num++;
> > > +ram_release_page(rb->idstr, offset);
> > > +} else {
> > > +pages->normal[pages->normal_num] = offset;
> > > +pages->normal_num++;
> > > +}
> > > +}
> >
> > I don't think it's super clean to have three arrays offset, zero and
> > normal, all sized for the full packet size. It might be possible to just
> > carry a bitmap of non-zero pages along with pages->offset and operate on
> > that instead.
> >
> > What do you think?
> >
> > Peter, any ideas? Should we just leave this for another time?
>
> Yeah I think a bitmap should save quite a few fields indeed, it'll however
> make the latter iteration slightly harder by walking both (offset[],
> bitmap), process the page only if bitmap is set for the offset.
>
> IIUC we perhaps don't even need a bitmap?  AFAIU all we need in
> MultiFDPages_t is one extra field to record how many normal pages there
> are, i.e. normal_num here (zero_num can be calculated as num - normal_num).  Then
> the zero page detection logic should do two things:
>
>   - Sort offset[] array so that it starts with normal pages, followed up by
> zero pages
>
>   - Setup normal_num to be the number of normal pages
>
> Then we reduce 2 new arrays (normal[], zero[]) + 2 new fields (normal_num,
> zero_num) -> 1 new field (normal_num).  It'll also be trivial to fill the
> packet header later because offset[] is exactly that.
>
> Side note - I still think it's confusing to read this patch and previous
> patch separately.  Obviously previous patch introduced these new fields
> without justifying their values yet.  IMHO it'll be easier to review if you
> merge the two patches.

Fabiano, thanks for catching this. I totally missed the backward
compatibility thing.
Peter, I will code the sorting and merge 

Re: [PATCH] hw/riscv/virt-acpi-build.c: Add SRAT and SLIT ACPI tables

2024-02-22 Thread Alistair Francis
On Tue, Jan 30, 2024 at 12:05 AM Haibo Xu  wrote:
>
> Enable ACPI NUMA support by adding the following 2 ACPI tables:
> SRAT: provides the association for memory/Harts and Proximity Domains
> SLIT: provides the relative distance between Proximity Domains
>
> The SRAT RINTC Affinity Structure definition[1] was based on the recently
> approved ACPI CodeFirst ECR[2].
>
> [1] https://github.com/riscv-non-isa/riscv-acpi/issues/25
> [2] https://mantis.uefi.org/mantis/view.php?id=2433
>
> Signed-off-by: Haibo Xu 

Thanks!

Applied to riscv-to-apply.next

Alistair

> ---
>  hw/riscv/virt-acpi-build.c | 60 ++
>  1 file changed, 60 insertions(+)
>
> diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
> index 26c7e4482d..f0a6b61747 100644
> --- a/hw/riscv/virt-acpi-build.c
> +++ b/hw/riscv/virt-acpi-build.c
> @@ -528,11 +528,61 @@ static void build_madt(GArray *table_data,
>  acpi_table_end(linker, );
>  }
>
> +/*
> + * ACPI spec, Revision 6.5+
> + * 5.2.16 System Resource Affinity Table (SRAT)
> + * REF: https://github.com/riscv-non-isa/riscv-acpi/issues/25
> + *  
> https://drive.google.com/file/d/1YTdDx2IPm5IeZjAW932EYU-tUtgS08tX/view
> + */
> +static void
> +build_srat(GArray *table_data, BIOSLinker *linker, RISCVVirtState *vms)
> +{
> +int i;
> +uint64_t mem_base;
> +MachineClass *mc = MACHINE_GET_CLASS(vms);
> +MachineState *ms = MACHINE(vms);
> +const CPUArchIdList *cpu_list = mc->possible_cpu_arch_ids(ms);
> +AcpiTable table = { .sig = "SRAT", .rev = 3, .oem_id = vms->oem_id,
> +.oem_table_id = vms->oem_table_id };
> +
> +acpi_table_begin(&table, table_data);
> +build_append_int_noprefix(table_data, 1, 4); /* Reserved */
> +build_append_int_noprefix(table_data, 0, 8); /* Reserved */
> +
> +for (i = 0; i < cpu_list->len; ++i) {
> +uint32_t nodeid = cpu_list->cpus[i].props.node_id;
> +/*
> + * 5.2.16.8 RINTC Affinity Structure
> + */
> +build_append_int_noprefix(table_data, 7, 1);  /* Type */
> +build_append_int_noprefix(table_data, 20, 1); /* Length */
> +build_append_int_noprefix(table_data, 0, 2);/* Reserved */
> +build_append_int_noprefix(table_data, nodeid, 4); /* Proximity 
> Domain */
> +build_append_int_noprefix(table_data, i, 4); /* ACPI Processor UID */
> +/* Flags, Table 5-70 */
> +build_append_int_noprefix(table_data, 1 /* Flags: Enabled */, 4);
> +build_append_int_noprefix(table_data, 0, 4); /* Clock Domain */
> +}
> +
> +mem_base = vms->memmap[VIRT_DRAM].base;
> +for (i = 0; i < ms->numa_state->num_nodes; ++i) {
> +if (ms->numa_state->nodes[i].node_mem > 0) {
> +build_srat_memory(table_data, mem_base,
> +  ms->numa_state->nodes[i].node_mem, i,
> +  MEM_AFFINITY_ENABLED);
> +mem_base += ms->numa_state->nodes[i].node_mem;
> +}
> +}
> +
> +acpi_table_end(linker, &table);
> +}
> +
>  static void virt_acpi_build(RISCVVirtState *s, AcpiBuildTables *tables)
>  {
>  GArray *table_offsets;
>  unsigned dsdt, xsdt;
>  GArray *tables_blob = tables->table_data;
> +MachineState *ms = MACHINE(s);
>
>  table_offsets = g_array_new(false, true,
>  sizeof(uint32_t));
> @@ -565,6 +615,16 @@ static void virt_acpi_build(RISCVVirtState *s, 
> AcpiBuildTables *tables)
> s->oem_table_id);
>  }
>
> +if (ms->numa_state->num_nodes > 0) {
> +acpi_add_table(table_offsets, tables_blob);
> +build_srat(tables_blob, tables->linker, s);
> +if (ms->numa_state->have_numa_distance) {
> +acpi_add_table(table_offsets, tables_blob);
> +build_slit(tables_blob, tables->linker, ms, s->oem_id,
> +   s->oem_table_id);
> +}
> +}
> +
>  /* XSDT is pointed to by RSDP */
>  xsdt = tables_blob->len;
>  build_xsdt(tables_blob, tables->linker, table_offsets, s->oem_id,
> --
> 2.34.1
>
>



Re: [External] Re: [PATCH v2 3/7] migration/multifd: Zero page transmission on the multifd thread.

2024-02-22 Thread Hao Xiang
On Wed, Feb 21, 2024 at 8:00 AM Elena Ufimtseva  wrote:
>
>
>
> On Fri, Feb 16, 2024 at 2:42 PM Hao Xiang  wrote:
>>
>> 1. Implements the zero page detection and handling on the multifd
>> threads for non-compression, zlib and zstd compression backends.
>> 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
>> 3. Add proper asserts to ensure pages->normal are used for normal pages
>> in all scenarios.
>>
>> Signed-off-by: Hao Xiang 
>> ---
>>  migration/meson.build |  1 +
>>  migration/multifd-zero-page.c | 59 +++
>>  migration/multifd-zlib.c  | 26 ---
>>  migration/multifd-zstd.c  | 25 ---
>>  migration/multifd.c   | 50 +++--
>>  migration/multifd.h   |  7 +
>>  qapi/migration.json   |  4 ++-
>>  7 files changed, 151 insertions(+), 21 deletions(-)
>>  create mode 100644 migration/multifd-zero-page.c
>>
>> diff --git a/migration/meson.build b/migration/meson.build
>> index 92b1cc4297..1eeb915ff6 100644
>> --- a/migration/meson.build
>> +++ b/migration/meson.build
>> @@ -22,6 +22,7 @@ system_ss.add(files(
>>'migration.c',
>>'multifd.c',
>>'multifd-zlib.c',
>> +  'multifd-zero-page.c',
>>'ram-compress.c',
>>'options.c',
>>'postcopy-ram.c',
>> diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
>> new file mode 100644
>> index 00..f0cd8e2c53
>> --- /dev/null
>> +++ b/migration/multifd-zero-page.c
>> @@ -0,0 +1,59 @@
>> +/*
>> + * Multifd zero page detection implementation.
>> + *
>> + * Copyright (c) 2024 Bytedance Inc
>> + *
>> + * Authors:
>> + *  Hao Xiang 
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/cutils.h"
>> +#include "exec/ramblock.h"
>> +#include "migration.h"
>> +#include "multifd.h"
>> +#include "options.h"
>> +#include "ram.h"
>> +
>> +void multifd_zero_page_check_send(MultiFDSendParams *p)
>> +{
>> +/*
>> + * QEMU older than 9.0 don't understand zero page
>> + * on multifd channel. This switch is required to
>> + * maintain backward compatibility.
>> + */
>> +bool use_multifd_zero_page =
>> +(migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
>> +MultiFDPages_t *pages = p->pages;
>> +RAMBlock *rb = pages->block;
>> +
>> +assert(pages->num != 0);
>
>
> Not needed, the check is done right before calling send_prepare.
>
>>
>> +assert(pages->normal_num == 0);
>> +assert(pages->zero_num == 0);
>
>
> Why these asserts are needed?

The idea is that when multifd_zero_page_check_send is called, I want
to make sure zero page checking was not processed on this packet
before. It is perhaps redundant. The asserts are compiled out in release builds.

>>
>> +
>>
>> +for (int i = 0; i < pages->num; i++) {
>> +uint64_t offset = pages->offset[i];
>> +if (use_multifd_zero_page &&
>> +buffer_is_zero(rb->host + offset, p->page_size)) {
>> +pages->zero[pages->zero_num] = offset;
>> +pages->zero_num++;
>> +ram_release_page(rb->idstr, offset);
>> +} else {
>> +pages->normal[pages->normal_num] = offset;
>> +pages->normal_num++;
>> +}
>> +}
>> +}
>> +
>> +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
>> +{
>> +for (int i = 0; i < p->zero_num; i++) {
>> +void *page = p->host + p->zero[i];
>> +if (!buffer_is_zero(page, p->page_size)) {
>> +memset(page, 0, p->page_size);
>> +}
>> +}
>> +}
>> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
>> index 012e3bdea1..cdfe0fa70e 100644
>> --- a/migration/multifd-zlib.c
>> +++ b/migration/multifd-zlib.c
>> @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, 
>> Error **errp)
>>  int ret;
>>  uint32_t i;
>>
>> +multifd_zero_page_check_send(p);
>> +
>> +if (!pages->normal_num) {
>> +p->next_packet_size = 0;
>> +goto out;
>> +}
>> +
>>  multifd_send_prepare_header(p);
>>
>> -for (i = 0; i < pages->num; i++) {
>> +for (i = 0; i < pages->normal_num; i++) {
>>  uint32_t available = z->zbuff_len - out_size;
>>  int flush = Z_NO_FLUSH;
>>
>> -if (i == pages->num - 1) {
>> +if (i == pages->normal_num - 1) {
>>  flush = Z_SYNC_FLUSH;
>>  }
>>
>> @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error 
>> **errp)
>>   * with compression. zlib does not guarantee that this is safe,
>>   * therefore copy the page before calling deflate().
>>   */
>> -memcpy(z->buf, p->pages->block->host + pages->offset[i], 
>> p->page_size);
>> +memcpy(z->buf, p->pages->block->host + pages->normal[i], 
>> p->page_size);
>>  zs->avail_in = p->page_size;
>> 

Re: [PATCH] target/riscv: Add missing include guard in pmu.h

2024-02-22 Thread Alistair Francis
On Tue, Feb 20, 2024 at 9:09 PM  wrote:
>
> From: Frank Chang 
>
> Add missing include guard in pmu.h to avoid the problem of double
> inclusion.
>
> Signed-off-by: Frank Chang 

Thanks!

Applied to riscv-to-apply.next

Alistair

> ---
>  target/riscv/pmu.h | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/target/riscv/pmu.h b/target/riscv/pmu.h
> index 505fc850d3..7c0ad661e0 100644
> --- a/target/riscv/pmu.h
> +++ b/target/riscv/pmu.h
> @@ -16,6 +16,9 @@
>   * this program.  If not, see <http://www.gnu.org/licenses/>.
>   */
>
> +#ifndef RISCV_PMU_H
> +#define RISCV_PMU_H
> +
>  #include "cpu.h"
>  #include "qapi/error.h"
>
> @@ -31,3 +34,5 @@ int riscv_pmu_incr_ctr(RISCVCPU *cpu, enum 
> riscv_pmu_event_idx event_idx);
>  void riscv_pmu_generate_fdt_node(void *fdt, uint32_t cmask, char *pmu_name);
>  int riscv_pmu_setup_timer(CPURISCVState *env, uint64_t value,
>uint32_t ctr_idx);
> +
> +#endif /* RISCV_PMU_H */
> --
> 2.43.0
>
>



Re: [PATCH] target/riscv: Add missing include guard in pmu.h

2024-02-22 Thread Alistair Francis
On Tue, Feb 20, 2024 at 9:09 PM  wrote:
>
> From: Frank Chang 
>
> Add missing include guard in pmu.h to avoid the problem of double
> inclusion.
>
> Signed-off-by: Frank Chang 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/pmu.h | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/target/riscv/pmu.h b/target/riscv/pmu.h
> index 505fc850d3..7c0ad661e0 100644
> --- a/target/riscv/pmu.h
> +++ b/target/riscv/pmu.h
> @@ -16,6 +16,9 @@
>   * this program.  If not, see <http://www.gnu.org/licenses/>.
>   */
>
> +#ifndef RISCV_PMU_H
> +#define RISCV_PMU_H
> +
>  #include "cpu.h"
>  #include "qapi/error.h"
>
> @@ -31,3 +34,5 @@ int riscv_pmu_incr_ctr(RISCVCPU *cpu, enum 
> riscv_pmu_event_idx event_idx);
>  void riscv_pmu_generate_fdt_node(void *fdt, uint32_t cmask, char *pmu_name);
>  int riscv_pmu_setup_timer(CPURISCVState *env, uint64_t value,
>uint32_t ctr_idx);
> +
> +#endif /* RISCV_PMU_H */
> --
> 2.43.0
>
>



Re: [External] Re: [PATCH v2 3/7] migration/multifd: Zero page transmission on the multifd thread.

2024-02-22 Thread Hao Xiang
On Fri, Feb 16, 2024 at 9:08 PM Richard Henderson
 wrote:
>
> On 2/16/24 12:39, Hao Xiang wrote:
> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > +{
> > +for (int i = 0; i < p->zero_num; i++) {
> > +void *page = p->host + p->zero[i];
> > +if (!buffer_is_zero(page, p->page_size)) {
> > +memset(page, 0, p->page_size);
> > +}
> > +}
> > +}
>
> You should not check the buffer is zero here, you should just zero it.

I will fix it in the next version.

>
>
> r~



Re: [External] Re: [PATCH v2 1/7] migration/multifd: Add new migration option zero-page-detection.

2024-02-22 Thread Hao Xiang
On Wed, Feb 21, 2024 at 5:58 AM Elena Ufimtseva  wrote:
>
>
>
> On Fri, Feb 16, 2024 at 2:41 PM Hao Xiang  wrote:
>>
>> This new parameter controls where the zero page checking is running.
>> 1. If this parameter is set to 'legacy', zero page checking is
>> done in the migration main thread.
>> 2. If this parameter is set to 'none', zero page checking is disabled.
>>
>
> Hello Hao
>
> Few questions and comments.
>
> First, the commit message states that the parameter controls where the
> checking is done, but it also controls whether sending of zero pages is
> done by multifd threads or not.
>
>
>>
>> Signed-off-by: Hao Xiang 
>> ---
>>  hw/core/qdev-properties-system.c| 10 ++
>>  include/hw/qdev-properties-system.h |  4 
>>  migration/migration-hmp-cmds.c  |  9 +
>>  migration/options.c | 21 
>>  migration/options.h |  1 +
>>  migration/ram.c |  4 
>>  qapi/migration.json | 30 ++---
>>  7 files changed, 76 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/core/qdev-properties-system.c 
>> b/hw/core/qdev-properties-system.c
>> index 1a396521d5..63843f18b5 100644
>> --- a/hw/core/qdev-properties-system.c
>> +++ b/hw/core/qdev-properties-system.c
>> @@ -679,6 +679,16 @@ const PropertyInfo qdev_prop_mig_mode = {
>>  .set_default_value = qdev_propinfo_set_default_value_enum,
>>  };
>>
>> +const PropertyInfo qdev_prop_zero_page_detection = {
>> +.name = "ZeroPageDetection",
>> +.description = "zero_page_detection values, "
>> +   "multifd,legacy,none",
>> +.enum_table = &ZeroPageDetection_lookup,
>> +.get = qdev_propinfo_get_enum,
>> +.set = qdev_propinfo_set_enum,
>> +.set_default_value = qdev_propinfo_set_default_value_enum,
>> +};
>> +
>>  /* --- Reserved Region --- */
>>
>>  /*
>> diff --git a/include/hw/qdev-properties-system.h 
>> b/include/hw/qdev-properties-system.h
>> index 06c359c190..839b170235 100644
>> --- a/include/hw/qdev-properties-system.h
>> +++ b/include/hw/qdev-properties-system.h
>> @@ -8,6 +8,7 @@ extern const PropertyInfo qdev_prop_macaddr;
>>  extern const PropertyInfo qdev_prop_reserved_region;
>>  extern const PropertyInfo qdev_prop_multifd_compression;
>>  extern const PropertyInfo qdev_prop_mig_mode;
>> +extern const PropertyInfo qdev_prop_zero_page_detection;
>>  extern const PropertyInfo qdev_prop_losttickpolicy;
>>  extern const PropertyInfo qdev_prop_blockdev_on_error;
>>  extern const PropertyInfo qdev_prop_bios_chs_trans;
>> @@ -47,6 +48,9 @@ extern const PropertyInfo 
>> qdev_prop_iothread_vq_mapping_list;
>>  #define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \
>>  DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \
>> MigMode)
>> +#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \
>> +DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \
>> +   ZeroPageDetection)
>>  #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \
>>  DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \
>>  LostTickPolicy)
>> diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
>> index 99b49df5dd..7e96ae6ffd 100644
>> --- a/migration/migration-hmp-cmds.c
>> +++ b/migration/migration-hmp-cmds.c
>> @@ -344,6 +344,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const 
>> QDict *qdict)
>>  monitor_printf(mon, "%s: %s\n",
>>  MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION),
>>  MultiFDCompression_str(params->multifd_compression));
>> +assert(params->has_zero_page_detection);
>
>
> What is the reason to have assert here?

It's just to verify that the option is initialized properly before we
reach here. Same things are done for other options.

>
>>
>> +monitor_printf(mon, "%s: %s\n",
>> +MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION),
>> +qapi_enum_lookup(&ZeroPageDetection_lookup,
>> +params->zero_page_detection));
>>  monitor_printf(mon, "%s: %" PRIu64 " bytes\n",
>>  MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE),
>>  params->xbzrle_cache_size);
>> @@ -634,6 +639,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const 
>> QDict *qdict)
>>  p->has_multifd_zstd_level = true;
>>  visit_type_uint8(v, param, >multifd_zstd_level, );
>>  break;
>> +case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION:
>> +p->has_zero_page_detection = true;
>> +visit_type_ZeroPageDetection(v, param, >zero_page_detection, 
>> );
>> +break;
>>  case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE:
>>  p->has_xbzrle_cache_size = true;
>>  if (!visit_type_size(v, param, _size, )) {
>> diff --git a/migration/options.c b/migration/options.c
>> index 3e3e0b93b4..3c603391b0 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c

Re: [PATCH] target/riscv: fix ACPI MCFG table

2024-02-22 Thread Alistair Francis
On Tue, Feb 20, 2024 at 2:10 AM X512 via  wrote:
>
> MCFG segments should point to PCI configuration range, not BAR MMIO.
>
> Signed-off-by: Ilya Chugin 

Thanks!

Applied to riscv-to-apply.next

Alistair

> ---
>   hw/riscv/virt-acpi-build.c | 4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
> index fb8baf64f6..fe01b626ea 100644
> --- a/hw/riscv/virt-acpi-build.c
> +++ b/hw/riscv/virt-acpi-build.c
> @@ -558,8 +558,8 @@ static void virt_acpi_build(RISCVVirtState *s,
> AcpiBuildTables *tables)
>   acpi_add_table(table_offsets, tables_blob);
>   {
>   AcpiMcfgInfo mcfg = {
> -   .base = s->memmap[VIRT_PCIE_MMIO].base,
> -   .size = s->memmap[VIRT_PCIE_MMIO].size,
> +   .base = s->memmap[VIRT_PCIE_ECAM].base,
> +   .size = s->memmap[VIRT_PCIE_ECAM].size,
>   };
>   build_mcfg(tables_blob, tables->linker, &mcfg, s->oem_id,
>  s->oem_table_id);
> --
> 2.42.1
>
>



Re: [RFC PATCH 14/14] migration: Fix return-path thread exit

2024-02-22 Thread Peter Xu
On Fri, Feb 16, 2024 at 02:35:26PM -0300, Fabiano Rosas wrote:
> Cédric Le Goater  writes:
> 
> > Hello Fabiano
> >
> > On 2/14/24 21:35, Fabiano Rosas wrote:
> >> Cédric Le Goater  writes:
> >> 
> >>> Hello Fabiano
> >>>
> >>> On 2/8/24 14:29, Fabiano Rosas wrote:
>  Cédric Le Goater  writes:
> 
> > In case of error, close_return_path_on_source() can perform a shutdown
> > to exit the return-path thread.  However, in migrate_fd_cleanup(),
> > 'to_dst_file' is closed before calling close_return_path_on_source()
> > and the shutdown fails, leaving the source and destination waiting for
> > an event to occur.
> 
>  Hi, Cédric
> 
>  Are you sure this is not caused by patch 13?
> >>>
> >>> It happens with upstream QEMU without any patch.
> >> 
> >> I might have taken that "shutdown fails" in the commit message too
> >> literally. Anyway, I have a proposed solution:
> >> 
> >> -->8--
> >>  From 729aa7b5b7f130f756d41649fdd0862bd2e90430 Mon Sep 17 00:00:00 2001
> >> From: Fabiano Rosas 
> >> Date: Wed, 14 Feb 2024 16:45:43 -0300
> >> Subject: [PATCH] migration: Join the return path thread before releasing
> >>   to_dst_file
> >> MIME-Version: 1.0
> >> Content-Type: text/plain; charset=UTF-8
> >> Content-Transfer-Encoding: 8bit
> >> 
> >> The return path thread might hang at a blocking system call. Before
> >> joining the thread we might need to issue a shutdown() on the socket
> >> file descriptor to release it. To determine whether the shutdown() is
> >> necessary we look at the QEMUFile error.
> >> 
> >> Make sure we only clean up the QEMUFile after the return path has been
> >> waited for.
> >
> > Yes. That's the important part.
> >
> >> This fixes a hang when qemu_savevm_state_setup() produced an error
> >> that was detected by migration_detect_error(). That skips
> >> migration_completion() so close_return_path_on_source() would get
> >> stuck waiting for the RP thread to terminate.
> >> 
> >> At migrate_fd_cleanup() I'm keeping the relative order of joining the
> >> migration thread and the return path just in case.
> >
> > That doesn't look necessary.
> 
> Indeed. But I don't trust the migration code, it's full of undocumented
> dependencies like that.
> 
> > What was the reason to join the migration thread only when
> > s->to_dst_file is valid ?
> 
> I didn't find any explicit reason looking through the history. It seems
> we used to rely on to_dst_file before migration_thread_running was
> introduced.
> 
> I wouldn't mind keeping that 'if' there.
> 
> Let's see what Peter thinks about it.

Frankly I don't have a strong opinion on current patch 14 or the new
proposal, but it seems we reached a consensus.

Fabiano, would you repost with a formal patch, with the proper tags?

One thing I am still not sure about is whether we should still have patch 13
at all. Please see my other reply on whether it's possible to have
migrate_get_error() == true but qemu_file_get_error() == false.  In
postcopy_pause(), currently we constantly shutdown() so the join() should
always work:

qemu_file_shutdown(file);
qemu_fclose(file);

/*
 * We're already pausing, so ignore any errors on the return
 * path and just wait for the thread to finish. It will be
 * re-created when we resume.
 */
close_return_path_on_source(s);

If we move close_return_path_on_source() earlier, qemu_file_shutdown() may not
be needed? And I think we need to make sure close_return_path_on_source()
will always properly kick the other thread.

Thanks,

-- 
Peter Xu




Re: [RFC PATCH 14/14] migration: Fix return-path thread exit

2024-02-22 Thread Peter Xu
On Mon, Feb 12, 2024 at 05:04:28PM +0100, Cédric Le Goater wrote:
> and then, in background we have open questions regarding :
> 
> * the QEMUfile implementation and its QIOChannel usage for migration
>   streams
> * qemu_file_set_error* vs. migrate_set_error. It is confusing, at least
>   for me. Do we have some documentation on best practices ?

Right, it is confusing.  It can all boil down to the ancient qemufile API
that Fabiano also mentioned in the other reply.  IMHO ideally iochannel
errors should be reported through the stack (rather than kept within the
object) from the channel's API and stored with migrate_set_error() if
necessary, and the channel itself may not need to maintain its own errors.
Right now it's needed because many qemufile APIs do not return errors.

Thanks,

-- 
Peter Xu




Re: [External] Re: [PATCH v2 1/7] migration/multifd: Add new migration option zero-page-detection.

2024-02-22 Thread Hao Xiang
On Wed, Feb 21, 2024 at 4:03 AM Markus Armbruster  wrote:
>
> Hao Xiang  writes:
>
> > This new parameter controls where the zero page checking is running.
> > 1. If this parameter is set to 'legacy', zero page checking is
> > done in the migration main thread.
> > 2. If this parameter is set to 'none', zero page checking is disabled.
> >
> > Signed-off-by: Hao Xiang 
>
> [...]
>
> > diff --git a/qapi/migration.json b/qapi/migration.json
> > index 5a565d9b8d..99843a8e95 100644
> > --- a/qapi/migration.json
> > +++ b/qapi/migration.json
> > @@ -653,6 +653,17 @@
> >  { 'enum': 'MigMode',
> >'data': [ 'normal', 'cpr-reboot' ] }
> >
> > +##
> > +# @ZeroPageDetection:
> > +#
> > +# @legacy: Perform zero page checking from main migration thread. (since 
> > 9.0)
> > +#
> > +# @none: Do not perform zero page checking.
> > +#
> > +##
>
> The entire type is since 9.0.  Thus:
>
>##
># @ZeroPageDetection:
>#
># @legacy: Perform zero page checking from main migration thread.
>#
># @none: Do not perform zero page checking.
>#
># Since: 9.0
>##
>
> > +{ 'enum': 'ZeroPageDetection',
> > +  'data': [ 'legacy', 'none' ] }
> > +
> >  ##
> >  # @BitmapMigrationBitmapAliasTransform:
> >  #
> > @@ -874,6 +885,9 @@
> >  # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
> >  #(Since 8.2)
> >  #
> > +# @zero-page-detection: See description in @ZeroPageDetection.
> > +# Default is 'legacy'. (Since 9.0)
>
> The description feels a bit lazy :)
>
> Suggest
>
># @zero-page-detection: Whether and how to detect zero pages.  Default
># is 'legacy'.  (since 9.0)
>
> Same for the other two copies.

I will fix these in the next version.

>
> > +#
> >  # Features:
> >  #
> >  # @deprecated: Member @block-incremental is deprecated.  Use
> > @@ -907,7 +921,8 @@
> > 'block-bitmap-mapping',
> > { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] 
> > },
> > 'vcpu-dirty-limit',
> > -   'mode'] }
> > +   'mode',
> > +   'zero-page-detection'] }
> >
> >  ##
> >  # @MigrateSetParameters:
> > @@ -1066,6 +1081,10 @@
> >  # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
> >  #(Since 8.2)
> >  #
> > +# @zero-page-detection: See description in @ZeroPageDetection.
> > +# Default is 'legacy'. (Since 9.0)
> > +#
> > +#
> >  # Features:
> >  #
> >  # @deprecated: Member @block-incremental is deprecated.  Use
> > @@ -1119,7 +1138,8 @@
> >  '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
> >  'features': [ 'unstable' ] },
> >  '*vcpu-dirty-limit': 'uint64',
> > -'*mode': 'MigMode'} }
> > +'*mode': 'MigMode',
> > +'*zero-page-detection': 'ZeroPageDetection'} }
> >
> >  ##
> >  # @migrate-set-parameters:
> > @@ -1294,6 +1314,9 @@
> >  # @mode: Migration mode. See description in @MigMode. Default is 'normal'.
> >  #(Since 8.2)
> >  #
> > +# @zero-page-detection: See description in @ZeroPageDetection.
> > +# Default is 'legacy'. (Since 9.0)
> > +#
> >  # Features:
> >  #
> >  # @deprecated: Member @block-incremental is deprecated.  Use
> > @@ -1344,7 +1367,8 @@
> >  '*x-vcpu-dirty-limit-period': { 'type': 'uint64',
> >  'features': [ 'unstable' ] },
> >  '*vcpu-dirty-limit': 'uint64',
> > -'*mode': 'MigMode'} }
> > +'*mode': 'MigMode',
> > +'*zero-page-detection': 'ZeroPageDetection'} }
> >
> >  ##
> >  # @query-migrate-parameters:
>



Re: [External] Re: [PATCH v2 5/7] migration/multifd: Add new migration test cases for legacy zero page checking.

2024-02-22 Thread Hao Xiang
On Wed, Feb 21, 2024 at 12:59 PM Fabiano Rosas  wrote:
>
> Hao Xiang  writes:
>
> > Now that zero page checking is done on the multifd sender threads by
> > default, we still provide an option for backward compatibility. This
> > change adds a qtest migration test case to set the zero-page-detection
> > option to "legacy" and run multifd migration with zero page checking on the
> > migration main thread.
> >
> > Signed-off-by: Hao Xiang 
> > ---
> >  tests/qtest/migration-test.c | 52 
> >  1 file changed, 52 insertions(+)
> >
> > diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> > index 8a5bb1752e..c27083110a 100644
> > --- a/tests/qtest/migration-test.c
> > +++ b/tests/qtest/migration-test.c
> > @@ -2621,6 +2621,24 @@ test_migrate_precopy_tcp_multifd_start(QTestState 
> > *from,
> >  return test_migrate_precopy_tcp_multifd_start_common(from, to, "none");
> >  }
> >
> > +static void *
> > +test_migrate_precopy_tcp_multifd_start_zero_page_legacy(QTestState *from,
> > +QTestState *to)
> > +{
> > +test_migrate_precopy_tcp_multifd_start_common(from, to, "none");
> > +migrate_set_parameter_str(from, "zero-page-detection", "legacy");
> > +return NULL;
> > +}
> > +
> > +static void *
> > +test_migration_precopy_tcp_multifd_start_no_zero_page(QTestState *from,
> > +  QTestState *to)
> > +{
> > +test_migrate_precopy_tcp_multifd_start_common(from, to, "none");
> > +migrate_set_parameter_str(from, "zero-page-detection", "none");
> > +return NULL;
> > +}
> > +
> >  static void *
> >  test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from,
> >  QTestState *to)
> > @@ -2652,6 +2670,36 @@ static void test_multifd_tcp_none(void)
> >  test_precopy_common();
> >  }
> >
> > +static void test_multifd_tcp_zero_page_legacy(void)
> > +{
> > +MigrateCommon args = {
> > +.listen_uri = "defer",
> > +.start_hook = 
> > test_migrate_precopy_tcp_multifd_start_zero_page_legacy,
> > +/*
> > + * Multifd is more complicated than most of the features, it
> > + * directly takes guest page buffers when sending, make sure
> > + * everything will work alright even if guest page is changing.
> > + */
> > +.live = true,
> > +};
> > +test_precopy_common();
> > +}
> > +
> > +static void test_multifd_tcp_no_zero_page(void)
> > +{
> > +MigrateCommon args = {
> > +.listen_uri = "defer",
> > +.start_hook = 
> > test_migration_precopy_tcp_multifd_start_no_zero_page,
> > +/*
> > + * Multifd is more complicated than most of the features, it
> > + * directly takes guest page buffers when sending, make sure
> > + * everything will work alright even if guest page is changing.
> > + */
> > +.live = true,
> > +};
> > +test_precopy_common();
> > +}
> > +
> >  static void test_multifd_tcp_zlib(void)
> >  {
> >  MigrateCommon args = {
> > @@ -3550,6 +3598,10 @@ int main(int argc, char **argv)
> >  }
> >  migration_test_add("/migration/multifd/tcp/plain/none",
> > test_multifd_tcp_none);
> > +migration_test_add("/migration/multifd/tcp/plain/zero_page_legacy",
> > +   test_multifd_tcp_zero_page_legacy);
> > +migration_test_add("/migration/multifd/tcp/plain/no_zero_page",
> > +   test_multifd_tcp_no_zero_page);
>
> Here it's better to separate the main feature from the states. That way
> we can run only the zero-page tests with:
>
>  migration-test -r /x86_64/migration/multifd/tcp/plain/zero-page
>
> Like so: (also dashes instead of underscores)
> /zero-page/legacy
> /zero-page/none
>

Sounds good.

> >  migration_test_add("/migration/multifd/tcp/plain/cancel",
> > test_multifd_tcp_cancel);
> >  migration_test_add("/migration/multifd/tcp/plain/zlib",



Re: [PATCH 13/14] migration: Use migrate_has_error() in close_return_path_on_source()

2024-02-22 Thread Peter Xu
On Thu, Feb 08, 2024 at 10:07:44AM -0300, Fabiano Rosas wrote:
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 
> > d5f705ceef4c925589aa49335969672c0d761fa2..5f55af3d7624750ca416c4177781241b3e291e5d
> >  100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -2372,8 +2372,7 @@ static bool 
> > close_return_path_on_source(MigrationState *ms)
> >   * cause it to unblock if it's stuck waiting for the destination.
> >   */
> >  WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
> > -if (ms->to_dst_file && ms->rp_state.from_dst_file &&
> > -qemu_file_get_error(ms->to_dst_file)) {
> > +if (migrate_has_error(ms) && ms->rp_state.from_dst_file) {
> >  qemu_file_shutdown(ms->rp_state.from_dst_file);
> >  }
> >  }
> 
> Hm, maybe Peter can help defend this, but this assumes that every
> function that takes an 'f' and sets the file error also sets
> migrate_set_error(). I'm not sure we have determined that, have we?

[apologies on getting back to this thread late.. I saw there's yet another
 proposal in the other email, will look at that soon]

I think that should be set, or otherwise we lose an error?  After all
s->error is the only thing we report, if there is a qemufile error that is
not reported into s->error it can be lost then.

On src QEMU we have both migration thread and return path thread.  For
migration thread the file error should always be collected by
migration_detect_error() by the qemu_file_get_error_obj_any() (it also
looks after postcopy_qemufile_src).  For return path thread it's always
collected when the loop quits.

Would migrate_has_error() be safer than qemu_file_get_error() in some
cases?  I'm considering when there is an error outside of qemufile itself,
that's the case where qemu_file_get_error(ms->to_dst_file) can return
false, however we may still need a kick to the from_dst_file?

-- 
Peter Xu




Re: [External] Re: [PATCH v2 2/7] migration/multifd: Support for zero pages transmission in multifd format.

2024-02-22 Thread Hao Xiang
On Wed, Feb 21, 2024 at 7:37 AM Elena Ufimtseva  wrote:
>
>
>
> On Fri, Feb 16, 2024 at 2:41 PM Hao Xiang  wrote:
>>
>> This change adds zero page counters and updates multifd send/receive
>> tracing format to track the newly added counters.
>>
>> Signed-off-by: Hao Xiang 
>> ---
>>  migration/multifd.c| 43 ++
>>  migration/multifd.h| 21 -
>>  migration/ram.c|  1 -
>>  migration/trace-events |  8 
>>  4 files changed, 59 insertions(+), 14 deletions(-)
>>
>> diff --git a/migration/multifd.c b/migration/multifd.c
>> index adfe8c9a0a..a33dba40d9 100644
>> --- a/migration/multifd.c
>> +++ b/migration/multifd.c
>> @@ -236,6 +236,8 @@ static void multifd_pages_reset(MultiFDPages_t *pages)
>>   * overwritten later when reused.
>>   */
>>  pages->num = 0;
>> +pages->normal_num = 0;
>> +pages->zero_num = 0;
>>  pages->block = NULL;
>>  }
>>
>>
>> @@ -309,6 +311,8 @@ static MultiFDPages_t *multifd_pages_init(uint32_t n)
>>
>>  pages->allocated = n;
>>  pages->offset = g_new0(ram_addr_t, n);
>> +pages->normal = g_new0(ram_addr_t, n);
>> +pages->zero = g_new0(ram_addr_t, n);
>>
>>
>>  return pages;
>>  }
>> @@ -319,6 +323,10 @@ static void multifd_pages_clear(MultiFDPages_t *pages)
>>  pages->allocated = 0;
>>  g_free(pages->offset);
>>  pages->offset = NULL;
>> +g_free(pages->normal);
>> +pages->normal = NULL;
>> +g_free(pages->zero);
>> +pages->zero = NULL;
>>  g_free(pages);
>>  }
>>
>> @@ -332,6 +340,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>>  packet->flags = cpu_to_be32(p->flags);
>>  packet->pages_alloc = cpu_to_be32(p->pages->allocated);
>>  packet->normal_pages = cpu_to_be32(pages->num);
>> +packet->zero_pages = cpu_to_be32(pages->zero_num);
>>  packet->next_packet_size = cpu_to_be32(p->next_packet_size);
>>
>>  packet_num = qatomic_fetch_inc(_send_state->packet_num);
>> @@ -350,9 +359,10 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>>
>>  p->packets_sent++;
>>  p->total_normal_pages += pages->num;
>> +p->total_zero_pages += pages->zero_num;
>>
>> -trace_multifd_send(p->id, packet_num, pages->num, p->flags,
>> -   p->next_packet_size);
>> +trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
>> +   p->flags, p->next_packet_size);
>>  }
>>
>>  static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>> @@ -393,20 +403,29 @@ static int 
>> multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>>  p->normal_num = be32_to_cpu(packet->normal_pages);
>>  if (p->normal_num > packet->pages_alloc) {
>>  error_setg(errp, "multifd: received packet "
>> -   "with %u pages and expected maximum pages are %u",
>> +   "with %u normal pages and expected maximum pages are %u",
>> p->normal_num, packet->pages_alloc) ;
>>  return -1;
>>  }
>>
>> +p->zero_num = be32_to_cpu(packet->zero_pages);
>> +if (p->zero_num > packet->pages_alloc - p->normal_num) {
>> +error_setg(errp, "multifd: received packet "
>> +   "with %u zero pages and expected maximum zero pages are 
>> %u",
>> +   p->zero_num, packet->pages_alloc - p->normal_num) ;
>> +return -1;
>> +}
>
>
> You could probably combine this check with normal_num against pages_alloc.
>>
>> +
>>  p->next_packet_size = be32_to_cpu(packet->next_packet_size);
>>  p->packet_num = be64_to_cpu(packet->packet_num);
>>  p->packets_recved++;
>>  p->total_normal_pages += p->normal_num;
>> +p->total_zero_pages += p->zero_num;
>>
>> -trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags,
>> -   p->next_packet_size);
>> +trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num,
>> +   p->flags, p->next_packet_size);
>>
>> -if (p->normal_num == 0) {
>> +if (p->normal_num == 0 && p->zero_num == 0) {
>>  return 0;
>>  }
>>
>> @@ -823,6 +842,8 @@ static void *multifd_send_thread(void *opaque)
>>
>>  stat64_add(_stats.multifd_bytes,
>> p->next_packet_size + p->packet_len);
>> +stat64_add(_stats.normal_pages, pages->num);
>
>
> That seems wrong. pages->num is the number of pages total in the packet.
> But the next patch changes it, so I suggest changing it here and not in 3/7.

In this patch, multifd zero pages are not enabled yet, so pages->num
is the number of normal pages, not the total number of pages in the
packet. The zero pages were sent in a different format in save_zero_page.
Later on, when multifd zero page is enabled, pages->normal_num counts the
number of normal pages and hence the accounting is changed.

>
>> +stat64_add(_stats.zero_pages, pages->zero_num);
>>
>>  

Re: [PATCH 1/3] linux-user/loongarch64: Remove TARGET_FORCE_SHMLBA

2024-02-22 Thread gaosong

在 2024/2/23 上午11:03, Richard Henderson 写道:

The upstream linux kernel does not define __ARCH_FORCE_SHMLBA.

Cc: Song Gao 
Signed-off-by: Richard Henderson 

---

Did this definition come from the port before it was merged upstream?

Yes,

The patch [1]  dropped it .
    [1] 
https://patchew.org/linux/20240106145501.3370364-1-chenhua...@loongson.cn/



Reviewed-by: Song Gao 

Thanks.
Song Gao

Or was it incorrectly copied from MIPS?
---
  linux-user/loongarch64/target_syscall.h | 7 ---
  1 file changed, 7 deletions(-)

diff --git a/linux-user/loongarch64/target_syscall.h 
b/linux-user/loongarch64/target_syscall.h
index 8b5de52124..39f229bb9c 100644
--- a/linux-user/loongarch64/target_syscall.h
+++ b/linux-user/loongarch64/target_syscall.h
@@ -38,11 +38,4 @@ struct target_pt_regs {
  #define TARGET_MCL_FUTURE  2
  #define TARGET_MCL_ONFAULT 4
  
-#define TARGET_FORCE_SHMLBA

-
-static inline abi_ulong target_shmlba(CPULoongArchState *env)
-{
-return 64 * KiB;
-}
-
  #endif





Re: [PATCH v4 2/2] target/i386: add control bits support for LAM

2024-02-22 Thread Zhao Liu
On Fri, Jan 12, 2024 at 02:00:42PM +0800, Binbin Wu wrote:
> Date: Fri, 12 Jan 2024 14:00:42 +0800
> From: Binbin Wu 
> Subject: [PATCH v4 2/2] target/i386: add control bits support for LAM
> X-Mailer: git-send-email 2.25.1
> 
> LAM uses CR3[61] and CR3[62] to configure/enable LAM on user pointers.
> LAM uses CR4[28] to configure/enable LAM on supervisor pointers.
> 
> For CR3 LAM bits, no additional handling needed:
> - TCG
>   LAM is not supported for TCG of target-i386.  helper_write_crN() and
>   helper_vmrun() check max physical address bits before calling
>   cpu_x86_update_cr3(), no change needed, i.e. CR3 LAM bits are not allowed
>   to be set in TCG.
> - gdbstub
>   x86_cpu_gdb_write_register() will call cpu_x86_update_cr3() to update cr3.
>   Allow gdb to set the LAM bit(s) to CR3, if vcpu doesn't support LAM,
>   KVM_SET_SREGS will fail as other reserved bits.
> 
> For CR4 LAM bit, its reservation depends on vcpu supporting LAM feature or
> not.
> - TCG
>   LAM is not supported for TCG of target-i386.  helper_write_crN() and
>   helper_vmrun() check CR4 reserved bit before calling cpu_x86_update_cr4(),
>   i.e. CR4 LAM bit is not allowed to be set in TCG.
> - gdbstub
>   x86_cpu_gdb_write_register() will call cpu_x86_update_cr4() to update cr4.
>   Mask out LAM bit on CR4 if vcpu doesn't support LAM.
> - x86_cpu_reset_hold() doesn't need special handling.
> 
> Signed-off-by: Binbin Wu 
> Tested-by: Xuelian Guo 
> ---
>  target/i386/cpu.h| 7 ++-
>  target/i386/helper.c | 4 
>  2 files changed, 10 insertions(+), 1 deletion(-)

Reviewed-by: Zhao Liu 

> 
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 18ea755644..598a3fa140 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -261,6 +261,7 @@ typedef enum X86Seg {
>  #define CR4_SMAP_MASK   (1U << 21)
>  #define CR4_PKE_MASK   (1U << 22)
>  #define CR4_PKS_MASK   (1U << 24)
> +#define CR4_LAM_SUP_MASK (1U << 28)
>  
>  #define CR4_RESERVED_MASK \
>  (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \
> @@ -269,7 +270,8 @@ typedef enum X86Seg {
>  | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \
>  | CR4_LA57_MASK \
>  | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \
> -| CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | 
> CR4_PKS_MASK))
> +| CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | 
> CR4_PKS_MASK \
> +| CR4_LAM_SUP_MASK))
>  
>  #define DR6_BD  (1 << 13)
>  #define DR6_BS  (1 << 14)
> @@ -2522,6 +2524,9 @@ static inline uint64_t cr4_reserved_bits(CPUX86State 
> *env)
>  if (!(env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS)) {
>  reserved_bits |= CR4_PKS_MASK;
>  }
> +if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) {
> +reserved_bits |= CR4_LAM_SUP_MASK;
> +}
>  return reserved_bits;
>  }
>  
> diff --git a/target/i386/helper.c b/target/i386/helper.c
> index 2070dd0dda..1da7a7d315 100644
> --- a/target/i386/helper.c
> +++ b/target/i386/helper.c
> @@ -219,6 +219,10 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t 
> new_cr4)
>  new_cr4 &= ~CR4_PKS_MASK;
>  }
>  
> +if (!(env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM)) {
> +new_cr4 &= ~CR4_LAM_SUP_MASK;
> +}
> +
>  env->cr[4] = new_cr4;
>  env->hflags = hflags;
>  
> -- 
> 2.25.1
> 
> 



Re: [PATCH v4 05/34] migration/multifd: Release recv sem_sync earlier

2024-02-22 Thread Peter Xu
On Tue, Feb 20, 2024 at 07:41:09PM -0300, Fabiano Rosas wrote:
> Now that multifd_recv_terminate_threads() is called only once, release
> the recv side sem_sync earlier like we do for the send side.
> 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH v4 04/34] migration/multifd: Remove p->quit from recv side

2024-02-22 Thread Peter Xu
On Tue, Feb 20, 2024 at 07:41:08PM -0300, Fabiano Rosas wrote:
> Like we did on the sending side, replace the p->quit per-channel flag
> with a global atomic 'exiting' flag.
> 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH v2] target/i386/kvm: Refine VMX controls setting for backward compatibility

2024-02-22 Thread Ewan Hai




On 2/20/24 06:07, Ewan Hai wrote:

On 2/20/24 03:32, Xiaoyao Li wrote:

diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 11b8177eff..c8f6c0b531 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -2296,6 +2296,7 @@ void kvm_arch_do_init_vcpu(X86CPU *cpu)
  static int kvm_get_supported_feature_msrs(KVMState *s)
  {
  int ret = 0;
+    int i;

  if (kvm_feature_msrs != NULL) {
  return 0;
@@ -2330,6 +2331,19 @@ static int 
kvm_get_supported_feature_msrs(KVMState *s)

  return ret;
  }

+    /*
+ * Compatibility fix:
+ * Older Linux kernels(<5.3) include the 
MSR_IA32_VMX_PROCBASED_CTLS2


we can be more accurate, that kernel version 4.17 to 5.2, reports
MSR_IA32_VMX_PROCBASED_CTLS2 in KVM_GET_MSR_FEATURE_INDEX_LIST but not
KVM_GET_MSR_INDEX_LIST.


Yeah, I'll add this more precise comment to the next patch.
+ * only in feature msr list, but not in regular msr list. This 
lead to
+ * an issue in older kernel versions where QEMU, through the 
regular

+ * MSR list check, assumes the kernel doesn't maintain this msr,
+ * resulting in incorrect settings by QEMU for this msr.
+ */
+    for (i = 0; i < kvm_feature_msrs->nmsrs; i++) {
+    if (kvm_feature_msrs->indices[i] == 
MSR_IA32_VMX_PROCBASED_CTLS2) {

+    has_msr_vmx_procbased_ctls2 = true;
+    }
+    }


I'm wondering should we move all the initialization of has_msr_*, that
associated with feature MSRs, to here. e.g., has_msr_arch_capabs,
has_msr_vmx_vmfunc,...

I believe this is a more elegant way to fix the issue, which will be 
reflected in my next patch.

When attempting to move the detection logic for feature MSRs (currently
including VMX_VMFUNC, UCODE_REV, ARCH_CAPABILITIES,
PROCBASED_CTLS2) from kvm_get_supported_msrs to
kvm_get_supported_feature_msrs in the current QEMU,
I encountered an "error: failed to set MSR 0x491 to 0x***" on kernel 
4.19.67.

This issue is due to commit 27c42a1bb ("KVM: nVMX: Enable VMFUNC for
the L1 hypervisor", 2017-08-03) exposing VMFUNC to the QEMU guest
without corresponding VMFUNC MSR modification code, leading to an error
when QEMU proactively tries to set the VMFUNC MSR. This bug affects kernels
from 4.14 to 5.2, with a fix introduced in 5.3 by Paolo (e8a70bd4e ("KVM:
nVMX: allow setting the VMFUNC controls MSR", 2019-07-02)).

Therefore, even if we were to move all feature MSRs to
kvm_get_supported_feature_msrs, VMX_VMFUNC could not be moved due to
the need to maintain compatibility with different kernel versions. This
exception makes our move less elegant. Hence, I am wondering whether we
need to move all feature MSRs to kvm_get_supported_feature_msrs. Perhaps
we just need to simply move PROCBASED_CTLS2 to fix the "failed set 0x48b 
..."

type of bugs, and add a comment about it?





Re: [PATCH v4 03/34] tests/qtest/migration: Add a fd + file test

2024-02-22 Thread Peter Xu
On Tue, Feb 20, 2024 at 07:41:07PM -0300, Fabiano Rosas wrote:
> The fd URI supports an fd that is backed by a file. The code should
> select between QIOChannelFile and QIOChannelSocket, depending on the
> type of the fd. Add a test for that.
> 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: Peter Xu 

-- 
Peter Xu




[PATCH 0/3] linux-user: Rewrite target_shmat

2024-02-22 Thread Richard Henderson
There are multiple issues with the implementation of shmat().

(1) With reserved_va, which is the default for 32-on-64-bit, we mmap the
entire guest address space.  Unlike mmap, shmat refuses to replace an
existing mapping without setting SHM_REMAP.  This is the original
subject of issue #115, though it quickly gets distracted by
something else.

(2) With target page size > host page size, and a shm area
that is not a multiple of the target page size, we leave
an unmapped hole that the target expects to be mapped.
This is the subject of 


https://lore.kernel.org/qemu-devel/2no4imvz2zrar5kchz2l3oddqbgpj77jgwcuf7aritkn2ok763@i2mvpcihztho/

wherein qemu itself expects a mapping to exist, and
dies in open_self_maps_2.

So: reimplement the thing.

There are a number of target page size != host page size and
target SHMLBA != host SHMLBA corner cases that are not implementable
without softmmu and a non-linear host to target address space.
I simply bail out in these situations and return EINVAL.

Based-on: <2024004323.268539-1-richard.hender...@linaro.org>
("[PULL 00/39] tcg and linux-user patch queue")


r~


Richard Henderson (3):
  linux-user/loongarch64: Remove TARGET_FORCE_SHMLBA
  linux-user: Add strace for shmat
  linux-user: Rewrite target_shmat

 linux-user/loongarch64/target_syscall.h |   7 --
 linux-user/mmap.c   | 146 ++--
 linux-user/strace.c |  23 
 linux-user/strace.list  |   2 +-
 4 files changed, 134 insertions(+), 44 deletions(-)

-- 
2.34.1




Re: [PATCH v4 02/34] tests/qtest/migration: Rename fd_proto test

2024-02-22 Thread Peter Xu
On Tue, Feb 20, 2024 at 07:41:06PM -0300, Fabiano Rosas wrote:
> Next patch adds another fd test. Rename the existing one closer to
> what's used on other tests, with the 'precopy' prefix.
> 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: Peter Xu 

-- 
Peter Xu




[PATCH 3/3] linux-user: Rewrite target_shmat

2024-02-22 Thread Richard Henderson
Handle combined host and guest alignment requirements.
Handle host and guest page size differences.
Handle SHM_EXEC.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/115
Signed-off-by: Richard Henderson 
---
 linux-user/mmap.c | 146 ++
 1 file changed, 110 insertions(+), 36 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 82f4026283..29421cfab0 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -1236,69 +1236,143 @@ static inline abi_ulong target_shmlba(CPUArchState 
*cpu_env)
 }
 #endif
 
+#if defined(__arm__) || defined(__mips__) || defined(__sparc__)
+#define HOST_FORCE_SHMLBA 1
+#else
+#define HOST_FORCE_SHMLBA 0
+#endif
+
 abi_ulong target_shmat(CPUArchState *cpu_env, int shmid,
abi_ulong shmaddr, int shmflg)
 {
 CPUState *cpu = env_cpu(cpu_env);
-abi_ulong raddr;
 struct shmid_ds shm_info;
 int ret;
-abi_ulong shmlba;
+int h_pagesize;
+int t_shmlba, h_shmlba, m_shmlba;
+size_t t_len, h_len, m_len;
 
 /* shmat pointers are always untagged */
 
-/* find out the length of the shared memory segment */
+/*
+ * Because we can't use host shmat() unless the address is sufficiently
+ * aligned for the host, we'll need to check both.
+ * TODO: Could be fixed with softmmu.
+ */
+t_shmlba = target_shmlba(cpu_env);
+h_pagesize = qemu_real_host_page_size();
+h_shmlba = (HOST_FORCE_SHMLBA ? SHMLBA : h_pagesize);
+m_shmlba = MAX(t_shmlba, h_shmlba);
+
+if (shmaddr) {
+if (shmaddr & (m_shmlba - 1)) {
+if (shmflg & SHM_RND) {
+/*
+ * The guest is allowing the kernel to round the address.
+ * Assume that the guest is ok with us rounding to the
+ * host required alignment too.  Anyway if we don't, we'll
+ * get an error from the kernel.
+ */
+shmaddr &= ~(m_shmlba - 1);
+if (shmaddr == 0 && (shmflg & SHM_REMAP)) {
+return -TARGET_EINVAL;
+}
+} else {
+int require = TARGET_PAGE_SIZE;
+#ifdef TARGET_FORCE_SHMLBA
+require = t_shmlba;
+#endif
+/*
+ * Include host required alignment, as otherwise we cannot
+ * use host shmat at all.
+ */
+require = MAX(require, h_shmlba);
+if (shmaddr & (require - 1)) {
+return -TARGET_EINVAL;
+}
+}
+}
+} else {
+if (shmflg & SHM_REMAP) {
+return -TARGET_EINVAL;
+}
+}
+/* All rounding now manually concluded. */
+shmflg &= ~SHM_RND;
+
+/* Find out the length of the shared memory segment. */
 ret = get_errno(shmctl(shmid, IPC_STAT, &shm_info));
 if (is_error(ret)) {
 /* can't get length, bail out */
 return ret;
 }
+t_len = TARGET_PAGE_ALIGN(shm_info.shm_segsz);
+h_len = ROUND_UP(shm_info.shm_segsz, h_pagesize);
+m_len = MAX(t_len, h_len);
 
-shmlba = target_shmlba(cpu_env);
-
-if (shmaddr & (shmlba - 1)) {
-if (shmflg & SHM_RND) {
-shmaddr &= ~(shmlba - 1);
-} else {
-return -TARGET_EINVAL;
-}
-}
-if (!guest_range_valid_untagged(shmaddr, shm_info.shm_segsz)) {
+if (!guest_range_valid_untagged(shmaddr, m_len)) {
 return -TARGET_EINVAL;
 }
 
 WITH_MMAP_LOCK_GUARD() {
-void *host_raddr;
+void *host_raddr, *test;
 abi_ulong last;
 
-if (shmaddr) {
-host_raddr = shmat(shmid, (void *)g2h_untagged(shmaddr), shmflg);
-} else {
-abi_ulong mmap_start;
-
-/* In order to use the host shmat, we need to honor host SHMLBA.  
*/
-mmap_start = mmap_find_vma(0, shm_info.shm_segsz,
-   MAX(SHMLBA, shmlba));
-
-if (mmap_start == -1) {
+if (!shmaddr) {
+shmaddr = mmap_find_vma(0, m_len, m_shmlba);
+if (shmaddr == -1) {
 return -TARGET_ENOMEM;
 }
-host_raddr = shmat(shmid, g2h_untagged(mmap_start),
-   shmflg | SHM_REMAP);
+} else if (!(shmflg & SHM_REMAP)) {
+if (!page_check_range_empty(shmaddr, shmaddr + m_len - 1)) {
+return -TARGET_EINVAL;
+}
+} else if (t_len < h_len) {
+/*
+ * ??? If the host page size is larger than the target page size,
+ * then we might be mapping more pages than the guest expects.
+ * TODO: Could be fixed with softmmu.
+ */
+if (!page_check_range_empty(shmaddr + t_len, shmaddr + h_len - 1)) 
{
+return -TARGET_EINVAL;
+}
 }
 
-if (host_raddr == (void *)-1) {
-

[PATCH 1/3] linux-user/loongarch64: Remove TARGET_FORCE_SHMLBA

2024-02-22 Thread Richard Henderson
The upstream linux kernel does not define __ARCH_FORCE_SHMLBA.

Cc: Song Gao 
Signed-off-by: Richard Henderson 

---

Did this definition come from the port before it was merged upstream?
Or was it incorrectly copied from MIPS?
---
 linux-user/loongarch64/target_syscall.h | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/linux-user/loongarch64/target_syscall.h 
b/linux-user/loongarch64/target_syscall.h
index 8b5de52124..39f229bb9c 100644
--- a/linux-user/loongarch64/target_syscall.h
+++ b/linux-user/loongarch64/target_syscall.h
@@ -38,11 +38,4 @@ struct target_pt_regs {
 #define TARGET_MCL_FUTURE  2
 #define TARGET_MCL_ONFAULT 4
 
-#define TARGET_FORCE_SHMLBA
-
-static inline abi_ulong target_shmlba(CPULoongArchState *env)
-{
-return 64 * KiB;
-}
-
 #endif
-- 
2.34.1




[PATCH 2/3] linux-user: Add strace for shmat

2024-02-22 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 linux-user/strace.c| 23 +++
 linux-user/strace.list |  2 +-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/linux-user/strace.c b/linux-user/strace.c
index cf26e55264..47d6ec3263 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -670,6 +670,25 @@ print_semctl(CPUArchState *cpu_env, const struct 
syscallname *name,
 }
 #endif
 
+static void
+print_shmat(CPUArchState *cpu_env, const struct syscallname *name,
+abi_long arg0, abi_long arg1, abi_long arg2,
+abi_long arg3, abi_long arg4, abi_long arg5)
+{
+static const struct flags shmat_flags[] = {
+FLAG_GENERIC(SHM_RND),
+FLAG_GENERIC(SHM_REMAP),
+FLAG_GENERIC(SHM_RDONLY),
+FLAG_GENERIC(SHM_EXEC),
+};
+
+print_syscall_prologue(name);
+print_raw_param(TARGET_ABI_FMT_ld, arg0, 0);
+print_pointer(arg1, 0);
+print_flags(shmat_flags, arg2, 1);
+print_syscall_epilogue(name);
+}
+
 #ifdef TARGET_NR_ipc
 static void
 print_ipc(CPUArchState *cpu_env, const struct syscallname *name,
@@ -683,6 +702,10 @@ print_ipc(CPUArchState *cpu_env, const struct syscallname 
*name,
 print_ipc_cmd(arg3);
 qemu_log(",0x" TARGET_ABI_FMT_lx ")", arg4);
 break;
+case IPCOP_shmat:
+print_shmat(cpu_env, &(const struct syscallname){ .name = "shmat" },
+arg1, arg4, arg2, 0, 0, 0);
+break;
 default:
 qemu_log(("%s("
   TARGET_ABI_FMT_ld ","
diff --git a/linux-user/strace.list b/linux-user/strace.list
index 6655d4f26d..dfd4237d14 100644
--- a/linux-user/strace.list
+++ b/linux-user/strace.list
@@ -1398,7 +1398,7 @@
 { TARGET_NR_sgetmask, "sgetmask" , NULL, NULL, NULL },
 #endif
 #ifdef TARGET_NR_shmat
-{ TARGET_NR_shmat, "shmat" , NULL, NULL, print_syscall_ret_addr },
+{ TARGET_NR_shmat, "shmat" , NULL, print_shmat, print_syscall_ret_addr },
 #endif
 #ifdef TARGET_NR_shmctl
 { TARGET_NR_shmctl, "shmctl" , NULL, NULL, NULL },
-- 
2.34.1




Re: [PATCH v4 01/34] docs/devel/migration.rst: Document the file transport

2024-02-22 Thread Peter Xu
On Tue, Feb 20, 2024 at 07:41:05PM -0300, Fabiano Rosas wrote:
> When adding the support for file migration with the file: transport,
> we missed adding documentation for it.
> 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH v4 00/34] migration: File based migration with multifd and fixed-ram

2024-02-22 Thread Peter Xu
On Tue, Feb 20, 2024 at 07:41:04PM -0300, Fabiano Rosas wrote:
> Latest numbers
> ==
> 
> => guest: 128 GB RAM - 120 GB dirty - 1 vcpu in tight loop dirtying memory
> => host: 128 CPU AMD EPYC 7543 - 2 NVMe disks in RAID0 (8586 MiB/s) - xfs
> => pinned vcpus w/ NUMA shortest distances - average of 3 runs - results
>from query-migrate
> 
> non-live   | time (ms)   pages/s   mb/s   MB/s
> ---+---
> file   |110512256258   9549   1193
>   + bg-snapshot|245660119581   4303537

Is this the one using userfault?  I'm surprised it's much slower when
enabled; logically for a non-live snapshot it should take similar loops
like a normal migration as it should have zero faults, then it should be
similar performance.

> ---+---
> fixed-ram  |157975216877   6672834
>   + multifd 8 ch.  | 95922292178  10982   1372
>  + direct-io   | 23268   1936897  45330   5666
> ---
> 
> live   | time (ms)   pages/s   mb/s   MB/s
> ---+---
> file   | - -  -  - (file grew 4x the VM 
> size)
>   + bg-snapshot|357635141747   2974371
> ---+---
> fixed-ram  | - -  -  - (no convergence in 5 
> min)
>   + multifd 8 ch.  |230812497551  14900   1862
>  + direct-io   | 27475   1788025  46736   5842
> ---

Also surprised on direct-io too.. that is definitely something tremendous.

-- 
Peter Xu




Re: [PATCH v4 1/2] target/i386: add support for LAM in CPUID enumeration

2024-02-22 Thread Zhao Liu
On Fri, Jan 12, 2024 at 02:00:41PM +0800, Binbin Wu wrote:
> Date: Fri, 12 Jan 2024 14:00:41 +0800
> From: Binbin Wu 
> Subject: [PATCH v4 1/2] target/i386: add support for LAM in CPUID
>  enumeration
> X-Mailer: git-send-email 2.25.1
> 
> From: Robert Hoo 
> 
> Linear Address Masking (LAM) is a new Intel CPU feature, which allows
> software to make use of the untranslated address bits for metadata.
> 
> The bit definition:
> CPUID.(EAX=7,ECX=1):EAX[26]
> 
> Add CPUID definition for LAM.
> 
> Note LAM feature is not supported for TCG of target-i386, LAM CPUID bit
> will not be added to TCG_7_1_EAX_FEATURES.
> 
> More info can be found in Intel ISE Chapter "LINEAR ADDRESS MASKING(LAM)"
> https://cdrdv2.intel.com/v1/dl/getContent/671368
> 
> Signed-off-by: Robert Hoo 
> Co-developed-by: Binbin Wu 
> Signed-off-by: Binbin Wu 
> Tested-by: Xuelian Guo 
> Reviewed-by: Xiaoyao Li 
> ---
>  target/i386/cpu.c | 2 +-
>  target/i386/cpu.h | 2 ++
>  2 files changed, 3 insertions(+), 1 deletion(-)

Reviewed-by: Zhao Liu 

> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 2524881ce2..fc862dfeb1 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -967,7 +967,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
>  "fsrc", NULL, NULL, NULL,
>  NULL, NULL, NULL, NULL,
>  NULL, "amx-fp16", NULL, "avx-ifma",
> -NULL, NULL, NULL, NULL,
> +NULL, NULL, "lam", NULL,
>  NULL, NULL, NULL, NULL,
>  },
>  .cpuid = {
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 7f0786e8b9..18ea755644 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -925,6 +925,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
>  #define CPUID_7_1_EAX_AMX_FP16  (1U << 21)
>  /* Support for VPMADD52[H,L]UQ */
>  #define CPUID_7_1_EAX_AVX_IFMA  (1U << 23)
> +/* Linear Address Masking */
> +#define CPUID_7_1_EAX_LAM   (1U << 26)
>  
>  /* Support for VPDPB[SU,UU,SS]D[,S] */
>  #define CPUID_7_1_EDX_AVX_VNNI_INT8 (1U << 4)
> -- 
> 2.25.1
> 
> 



Re: [PATCH v2 4/7] migration/multifd: Enable zero page checking from multifd threads.

2024-02-22 Thread Peter Xu
On Wed, Feb 21, 2024 at 06:06:19PM -0300, Fabiano Rosas wrote:
> Hao Xiang  writes:
> 
> > This change adds a dedicated handler for MigrationOps::ram_save_target_page 
> > in
> 
> nit: Add a dedicated handler...
> 
> Usually "this patch/change" is used only when necessary to avoid
> ambiguity.
> 
> > multifd live migration. Now zero page checking can be done in the multifd 
> > threads
> > and this becomes the default configuration. We still provide backward 
> > compatibility
> > where zero page checking is done from the migration main thread.
> >
> > Signed-off-by: Hao Xiang 
> > ---
> >  migration/multifd.c |  1 +
> >  migration/options.c |  2 +-
> >  migration/ram.c | 53 ++---
> >  3 files changed, 42 insertions(+), 14 deletions(-)
> >
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index fbb40ea10b..ef5dad1019 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -13,6 +13,7 @@
> >  #include "qemu/osdep.h"
> >  #include "qemu/cutils.h"
> 
> This include...
> 
> >  #include "qemu/rcu.h"
> > +#include "qemu/cutils.h"
> 
> is there already.
> 
> >  #include "exec/target_page.h"
> >  #include "sysemu/sysemu.h"
> >  #include "exec/ramblock.h"
> > diff --git a/migration/options.c b/migration/options.c
> > index 3c603391b0..3c79b6ccd4 100644
> > --- a/migration/options.c
> > +++ b/migration/options.c
> > @@ -181,7 +181,7 @@ Property migration_properties[] = {
> >MIG_MODE_NORMAL),
> >  DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
> > parameters.zero_page_detection,
> > -   ZERO_PAGE_DETECTION_LEGACY),
> > +   ZERO_PAGE_DETECTION_MULTIFD),
> 
> I think we'll need something to avoid a 9.0 -> 8.2 migration with this
> enabled. Otherwise it will go along happily until we get data corruption
> because the new QEMU didn't send any zero pages on the migration thread
> and the old QEMU did not look for them in the multifd packet.

It could be even worse, as the new QEMU will only attach "normal" pages
after the multifd packet, the old QEMU could read more than it could,
expecting all pages..

> 
> Perhaps bumping the MULTIFD_VERSION when ZERO_PAGE_DETECTION_MULTIFD is
> in use. We'd just need to fix the test in the new QEMU to check
> (msg.version > MULTIFD_VERSION) instead of (msg.version != MULTIFD_VERSION).

IMHO we don't need yet to change MULTIFD_VERSION, what we need is perhaps a
compat entry in hw_compat_8_2 setting "zero-page-detection" to "legacy".
We should make sure that when "legacy" is set, multifd runs the old protocol
(zero_num will always be 0, and will be ignored by old QEMUs, IIUC).

One more comment is, when repost please consider split this patch into two;
The new ram_save_target_page_multifd() hook can be done in another patch,
AFAIU.

> 
> >  
> >  /* Migration capabilities */
> >  DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
> > diff --git a/migration/ram.c b/migration/ram.c
> > index 5ece9f042e..b088c5a98c 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -1123,10 +1123,6 @@ static int save_zero_page(RAMState *rs, 
> > PageSearchStatus *pss,
> >  QEMUFile *file = pss->pss_channel;
> >  int len = 0;
> >  
> > -if (migrate_zero_page_detection() != ZERO_PAGE_DETECTION_LEGACY) {
> > -return 0;
> > -}
> 
> How does 'none' work now?
> 
> > -
> >  if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
> >  return 0;
> >  }
> > @@ -1256,6 +1252,10 @@ static int ram_save_page(RAMState *rs, 
> > PageSearchStatus *pss)
> >  
> >  static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
> >  {
> > +assert(migrate_multifd());
> > +assert(!migrate_compress());
> > +assert(!migration_in_postcopy());
> 
> Drop these, please. Keep only the asserts that are likely to trigger
> during development, such as the existing ones at multifd_send_pages.
> 
> > +
> >  if (!multifd_queue_page(block, offset)) {
> >  return -1;
> >  }
> > @@ -2046,7 +2046,6 @@ static bool save_compress_page(RAMState *rs, 
> > PageSearchStatus *pss,
> >   */
> >  static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
> >  {
> > -RAMBlock *block = pss->block;
> >  ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
> >  int res;
> >  
> > @@ -2062,17 +2061,40 @@ static int ram_save_target_page_legacy(RAMState 
> > *rs, PageSearchStatus *pss)
> >  return 1;
> >  }
> >  
> > +return ram_save_page(rs, pss);
> 
> Look at where git put this! Are you using the default diff algorithm? If
> not try using --patience to see if it improves the diff.
> 
> > +}
> > +
> > +/**
> > + * ram_save_target_page_multifd: save one target page
> > + *
> > + * Returns the number of pages written
> 
> We could be more precise here:
> 
>  ram_save_target_page_multifd: send one target page to multifd workers
>  

[PATCH v2 1/4] target/riscv: Add functions for common matching conditions of trigger

2024-02-22 Thread Alvin Chang via
According to RISC-V Debug specification, there are several common
matching conditions before firing a trigger, including the enabled
privilege levels of the trigger.

This commit adds trigger_common_match() to prepare the common matching
conditions for the type 2/3/6 triggers. For now, we just implement
trigger_priv_match() to check if the enabled privilege levels of the
trigger match CPU's current privilege level.

Signed-off-by: Alvin Chang 
---
 target/riscv/debug.c | 70 
 1 file changed, 70 insertions(+)

diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index e30d99cc2f..3891236b82 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -241,6 +241,76 @@ static void do_trigger_action(CPURISCVState *env, 
target_ulong trigger_index)
 }
 }
 
+/*
+ * Check the privilege level of specific trigger matches CPU's current 
privilege
+ * level.
+ */
+static bool trigger_priv_match(CPURISCVState *env, trigger_type_t type,
+   int trigger_index)
+{
+target_ulong ctrl = env->tdata1[trigger_index];
+
+switch (type) {
+case TRIGGER_TYPE_AD_MATCH:
+/* type 2 trigger cannot be fired in VU/VS mode */
+if (env->virt_enabled) {
+return false;
+}
+/* check U/S/M bit against current privilege level */
+if ((ctrl >> 3) & BIT(env->priv)) {
+return true;
+}
+break;
+case TRIGGER_TYPE_AD_MATCH6:
+if (env->virt_enabled) {
+/* check VU/VS bit against current privilege level */
+if ((ctrl >> 23) & BIT(env->priv)) {
+return true;
+}
+} else {
+/* check U/S/M bit against current privilege level */
+if ((ctrl >> 3) & BIT(env->priv)) {
+return true;
+}
+}
+break;
+case TRIGGER_TYPE_INST_CNT:
+if (env->virt_enabled) {
+/* check VU/VS bit against current privilege level */
+if ((ctrl >> 25) & BIT(env->priv)) {
+return true;
+}
+} else {
+/* check U/S/M bit against current privilege level */
+if ((ctrl >> 6) & BIT(env->priv)) {
+return true;
+}
+}
+break;
+case TRIGGER_TYPE_INT:
+case TRIGGER_TYPE_EXCP:
+case TRIGGER_TYPE_EXT_SRC:
+qemu_log_mask(LOG_UNIMP, "trigger type: %d is not supported\n", type);
+break;
+case TRIGGER_TYPE_NO_EXIST:
+case TRIGGER_TYPE_UNAVAIL:
+qemu_log_mask(LOG_GUEST_ERROR, "trigger type: %d does not exist\n",
+  type);
+break;
+default:
+g_assert_not_reached();
+}
+
+return false;
+}
+
+/* Common matching conditions for all types of the triggers. */
+static bool trigger_common_match(CPURISCVState *env, trigger_type_t type,
+ int trigger_index)
+{
+return trigger_priv_match(env, type, trigger_index);
+}
+
 /* type 2 trigger */
 
 static uint32_t type2_breakpoint_size(CPURISCVState *env, target_ulong ctrl)
-- 
2.34.1




[PATCH v2 0/4] RISC-V: Modularize common match conditions for trigger

2024-02-22 Thread Alvin Chang via
According to the RISC-V Debug specification, the enabled privilege levels
of a trigger are a common match condition for all trigger types.
This series modularizes the code for checking the privilege levels of
type 2/3/6 triggers by implementing functions trigger_common_match()
and trigger_priv_match().

Additional match conditions, such as CSR tcontrol and textra, can be
further implemented into trigger_common_match() in the future.

Changes from v1:
- Fix typo
- Add commit description for changing behavior of looping the triggers
  when we check type 2 triggers.

Alvin Chang (4):
  target/riscv: Add functions for common matching conditions of trigger
  target/riscv: Apply modularized matching conditions for breakpoint
  target/riscv: Apply modularized matching conditions for watchpoint
  target/riscv: Apply modularized matching conditions for icount trigger

 target/riscv/debug.c | 124 +--
 1 file changed, 83 insertions(+), 41 deletions(-)

-- 
2.34.1




[PATCH v2 4/4] target/riscv: Apply modularized matching conditions for icount trigger

2024-02-22 Thread Alvin Chang via
We have implemented trigger_common_match(), which checks if the enabled
privilege levels of the trigger match CPU's current privilege level. We
can invoke trigger_common_match() to check the privilege levels of the
type 3 triggers.

Signed-off-by: Alvin Chang 
---
 target/riscv/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index 9f9f332019..eb45e2c147 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -624,7 +624,7 @@ void helper_itrigger_match(CPURISCVState *env)
 if (get_trigger_type(env, i) != TRIGGER_TYPE_INST_CNT) {
 continue;
 }
-if (check_itrigger_priv(env, i)) {
+if (!trigger_common_match(env, TRIGGER_TYPE_INST_CNT, i)) {
 continue;
 }
 count = itrigger_get_count(env, i);
-- 
2.34.1




[PATCH v2 3/4] target/riscv: Apply modularized matching conditions for watchpoint

2024-02-22 Thread Alvin Chang via
We have implemented trigger_common_match(), which checks if the enabled
privilege levels of the trigger match CPU's current privilege level.
Remove the related code in riscv_cpu_debug_check_watchpoint() and invoke
trigger_common_match() to check the privilege levels of the type 2 and
type 6 triggers for the watchpoints.

This commit also changes the behavior of looping the triggers. In
previous implementation, if we have a type 2 trigger and
env->virt_enabled is true, we directly return false to stop the loop.
Now we keep looping all the triggers until we find a matched trigger.

Only load/store bits and loaded/stored address should be further checked
in riscv_cpu_debug_check_watchpoint().

Signed-off-by: Alvin Chang 
---
 target/riscv/debug.c | 26 ++
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index b7b0fa8945..9f9f332019 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -899,13 +899,12 @@ bool riscv_cpu_debug_check_watchpoint(CPUState *cs, 
CPUWatchpoint *wp)
 for (i = 0; i < RV_MAX_TRIGGERS; i++) {
 trigger_type = get_trigger_type(env, i);
 
+if (!trigger_common_match(env, trigger_type, i)) {
+continue;
+}
+
 switch (trigger_type) {
 case TRIGGER_TYPE_AD_MATCH:
-/* type 2 trigger cannot be fired in VU/VS mode */
-if (env->virt_enabled) {
-return false;
-}
-
 ctrl = env->tdata1[i];
 addr = env->tdata2[i];
 flags = 0;
@@ -918,10 +917,7 @@ bool riscv_cpu_debug_check_watchpoint(CPUState *cs, 
CPUWatchpoint *wp)
 }
 
 if ((wp->flags & flags) && (wp->vaddr == addr)) {
-/* check U/S/M bit against current privilege level */
-if ((ctrl >> 3) & BIT(env->priv)) {
-return true;
-}
+return true;
 }
 break;
 case TRIGGER_TYPE_AD_MATCH6:
@@ -937,17 +933,7 @@ bool riscv_cpu_debug_check_watchpoint(CPUState *cs, 
CPUWatchpoint *wp)
 }
 
 if ((wp->flags & flags) && (wp->vaddr == addr)) {
-if (env->virt_enabled) {
-/* check VU/VS bit against current privilege level */
-if ((ctrl >> 23) & BIT(env->priv)) {
-return true;
-}
-} else {
-/* check U/S/M bit against current privilege level */
-if ((ctrl >> 3) & BIT(env->priv)) {
-return true;
-}
-}
+return true;
 }
 break;
 default:
-- 
2.34.1




[PATCH v2 2/4] target/riscv: Apply modularized matching conditions for breakpoint

2024-02-22 Thread Alvin Chang via
We have implemented trigger_common_match(), which checks if the enabled
privilege levels of the trigger match CPU's current privilege level.
Remove the related code in riscv_cpu_debug_check_breakpoint() and invoke
trigger_common_match() to check the privilege levels of the type 2 and
type 6 triggers for the breakpoints.

This commit also changes the behavior of looping the triggers. In
previous implementation, if we have a type 2 trigger and
env->virt_enabled is true, we directly return false to stop the loop.
Now we keep looping all the triggers until we find a matched trigger.

Only the execution bit and the executed PC should be further checked in
riscv_cpu_debug_check_breakpoint().

Signed-off-by: Alvin Chang 
---
 target/riscv/debug.c | 26 ++
 1 file changed, 6 insertions(+), 20 deletions(-)

diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index 3891236b82..b7b0fa8945 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -855,21 +855,17 @@ bool riscv_cpu_debug_check_breakpoint(CPUState *cs)
 for (i = 0; i < RV_MAX_TRIGGERS; i++) {
 trigger_type = get_trigger_type(env, i);
 
+if (!trigger_common_match(env, trigger_type, i)) {
+continue;
+}
+
 switch (trigger_type) {
 case TRIGGER_TYPE_AD_MATCH:
-/* type 2 trigger cannot be fired in VU/VS mode */
-if (env->virt_enabled) {
-return false;
-}
-
 ctrl = env->tdata1[i];
 pc = env->tdata2[i];
 
 if ((ctrl & TYPE2_EXEC) && (bp->pc == pc)) {
-/* check U/S/M bit against current privilege level */
-if ((ctrl >> 3) & BIT(env->priv)) {
-return true;
-}
+return true;
 }
 break;
 case TRIGGER_TYPE_AD_MATCH6:
@@ -877,17 +873,7 @@ bool riscv_cpu_debug_check_breakpoint(CPUState *cs)
 pc = env->tdata2[i];
 
 if ((ctrl & TYPE6_EXEC) && (bp->pc == pc)) {
-if (env->virt_enabled) {
-/* check VU/VS bit against current privilege level */
-if ((ctrl >> 23) & BIT(env->priv)) {
-return true;
-}
-} else {
-/* check U/S/M bit against current privilege level */
-if ((ctrl >> 3) & BIT(env->priv)) {
-return true;
-}
-}
+return true;
 }
 break;
 default:
-- 
2.34.1




Re: [PATCH v2 3/7] migration/multifd: Zero page transmission on the multifd thread.

2024-02-22 Thread Peter Xu
On Wed, Feb 21, 2024 at 06:04:10PM -0300, Fabiano Rosas wrote:
> Hao Xiang  writes:
> 
> > 1. Implements the zero page detection and handling on the multifd
> > threads for non-compression, zlib and zstd compression backends.
> > 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> > 3. Add proper asserts to ensure pages->normal are used for normal pages
> > in all scenarios.
> >
> > Signed-off-by: Hao Xiang 
> > ---
> >  migration/meson.build |  1 +
> >  migration/multifd-zero-page.c | 59 +++
> >  migration/multifd-zlib.c  | 26 ---
> >  migration/multifd-zstd.c  | 25 ---
> >  migration/multifd.c   | 50 +++--
> >  migration/multifd.h   |  7 +
> >  qapi/migration.json   |  4 ++-
> >  7 files changed, 151 insertions(+), 21 deletions(-)
> >  create mode 100644 migration/multifd-zero-page.c
> >
> > diff --git a/migration/meson.build b/migration/meson.build
> > index 92b1cc4297..1eeb915ff6 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -22,6 +22,7 @@ system_ss.add(files(
> >'migration.c',
> >'multifd.c',
> >'multifd-zlib.c',
> > +  'multifd-zero-page.c',
> >'ram-compress.c',
> >'options.c',
> >'postcopy-ram.c',
> > diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> > new file mode 100644
> > index 00..f0cd8e2c53
> > --- /dev/null
> > +++ b/migration/multifd-zero-page.c
> > @@ -0,0 +1,59 @@
> > +/*
> > + * Multifd zero page detection implementation.
> > + *
> > + * Copyright (c) 2024 Bytedance Inc
> > + *
> > + * Authors:
> > + *  Hao Xiang 
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or 
> > later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qemu/cutils.h"
> > +#include "exec/ramblock.h"
> > +#include "migration.h"
> > +#include "multifd.h"
> > +#include "options.h"
> > +#include "ram.h"
> > +
> > +void multifd_zero_page_check_send(MultiFDSendParams *p)
> > +{
> > +/*
> > + * QEMU older than 9.0 don't understand zero page
> > + * on multifd channel. This switch is required to
> > + * maintain backward compatibility.
> > + */
> > +bool use_multifd_zero_page =
> > +(migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> > +MultiFDPages_t *pages = p->pages;
> > +RAMBlock *rb = pages->block;
> > +
> > +assert(pages->num != 0);
> > +assert(pages->normal_num == 0);
> > +assert(pages->zero_num == 0);
> 
> We can drop these before the final version.
> 
> > +
> > +for (int i = 0; i < pages->num; i++) {
> > +uint64_t offset = pages->offset[i];
> > +if (use_multifd_zero_page &&
> > +buffer_is_zero(rb->host + offset, p->page_size)) {
> > +pages->zero[pages->zero_num] = offset;
> > +pages->zero_num++;
> > +ram_release_page(rb->idstr, offset);
> > +} else {
> > +pages->normal[pages->normal_num] = offset;
> > +pages->normal_num++;
> > +}
> > +}
> 
> I don't think it's super clean to have three arrays offset, zero and
> normal, all sized for the full packet size. It might be possible to just
> carry a bitmap of non-zero pages along with pages->offset and operate on
> that instead.
> 
> What do you think?
> 
> Peter, any ideas? Should we just leave this for another time?

Yeah I think a bitmap should save quite a few fields indeed, it'll however
make the latter iteration slightly harder by walking both (offset[],
bitmap), process the page only if bitmap is set for the offset.

IIUC we perhaps don't even need a bitmap?  AFAIU what we only need in
MultiFDPages_t is one extra field to mark "how many normal pages", aka,
normal_num here (zero_num can be calculated from num-normal_num).  Then
the zero page detection logic should do two things:

  - Sort offset[] array so that it starts with normal pages, followed up by
zero pages

  - Setup normal_num to be the number of normal pages

Then we reduce 2 new arrays (normal[], zero[]) + 2 new fields (normal_num,
zero_num) -> 1 new field (normal_num).  It'll also be trivial to fill the
packet header later because offset[] is exactly that.

Side note - I still think it's confusing to read this patch and previous
patch separately.  Obviously previous patch introduced these new fields
without justifying their values yet.  IMHO it'll be easier to review if you
merge the two patches.

> 
> > +}
> > +
> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > +{
> > +for (int i = 0; i < p->zero_num; i++) {
> > +void *page = p->host + p->zero[i];
> > +if (!buffer_is_zero(page, p->page_size)) {
> > +memset(page, 0, p->page_size);
> > +}
> > +}
> > +}
> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > index 

Re: [PATCH v4 50/66] i386/tdx: handle TDG.VP.VMCALL

2024-02-22 Thread Qiu, Feng
Actually the 4 byte length header is provided by client 
library(https://github.com/intel/SGXDataCenterAttestationPrimitives/blob/master/QuoteGeneration/quote_wrapper/tdx_attest/tdx_attest.c#L295), 
not QEMU. QEMU just treats the whole payload, including the header, as a single blob.
BTW, in the latest stable kernel, the TDX guest driver changed to TSM 
based 
solution(https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/drivers/virt/coco/tdx-guest/tdx-guest.c?h=v6.7.5) 
and it will only send the raw report without the 4 byte length header and other 
stuff. The existing official QGS isn't compatible with this change and we 
will deliver a compatible QGS at the end of Q1.


On 2/23/2024 9:06 AM, Xiaoyao Li wrote:

+ Feng Qiu,

On 2/23/2024 12:30 AM, Daniel P. Berrangé wrote:

On Wed, Jan 24, 2024 at 10:23:12PM -0500, Xiaoyao Li wrote:

From: Isaku Yamahata 

Add property "quote-generation-socket" to tdx-guest, which is a property
of type SocketAddress to specify Quote Generation Service(QGS).

On request of GetQuote, it connects to the QGS socket, read request
data from shared guest memory, send the request data to the QGS,
and store the response into shared guest memory, at last notify
TD guest by interrupt.

command line example:
   qemu-system-x86_64 \
 -object 
'{"qom-type":"tdx-guest","id":"tdx0","quote-generation-socket":{"type": "vsock", "cid":"1","port":"1234"}}' \

 -machine confidential-guest-support=tdx0

Note, above example uses vsock type socket because the QGS we used
implements the vsock socket. It can be other types, like UNIX socket,
which depends on the implementation of QGS.


Can you confirm again exactly what QGS impl you are testing against ?
> I've tried the impl at


https://github.com/intel/SGXDataCenterAttestationPrimitives/tree/master/QuoteGeneration/quote_wrapper/qgs


which supports UNIX sockets and VSOCK. In both cases, however, it
appears to be speaking a different protocol than your QEMU impl
below uses.

Specifically here:

   
https://github.com/intel/SGXDataCenterAttestationPrimitives/blob/master/QuoteGeneration/quote_wrapper/qgs/qgs_server.cpp#L143


it is reading 4 bytes of header, which are interpreted as the length
of the payload which will then be read off the wire. IIUC the payload
it expects is the TDREPORT struct.

Your QEMU patches here meanwhile are just sending the payload from
the GetQuote hypercall which is the TDREPORT struct.

IOW, QEMU is not sending the 4 byte length header the QGS expects.
and whole thing fails.


I'm using the one provided by internal folks, which supports 
interpreting the payload without the header.


I don't know when will the updated implementation show up in public 
github. @Feng Liu can help on it.




To avoid no response from QGS server, setup a timer for the transaction.
If timeout, make it an error and interrupt guest. Define the 
threshold of

time to 30s at present, maybe change to other value if not appropriate.

Signed-off-by: Isaku Yamahata 
Codeveloped-by: Chenyi Qiang 
Signed-off-by: Chenyi Qiang 
Codeveloped-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
---


With regards,
Daniel






Re: [PULL 0/1] loongarch-to-apply queue

2024-02-22 Thread gaosong

在 2024/2/22 下午8:42, Peter Maydell 写道:

On Wed, 21 Feb 2024 at 09:11, Song Gao  wrote:

The following changes since commit 760b4dcdddba4a40b9fa0eb78fdfc7eda7cb83d0:

   Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging 
(2024-02-20 10:11:08 +)

are available in the Git repository at:

   https://gitlab.com/gaosong/qemu.git tags/pull-loongarch-20240221

for you to fetch changes up to be57fd1e526e70fd55f1e87b0b70fab3c6baf089:

   loongarch: Change the UEFI loading mode to loongarch (2024-02-21 17:06:23 
+0800)


pull-loongarch-20240221


Xianglai Li (1):
   loongarch: Change the UEFI loading mode to loongarch

Hi; this fails to build for mipsel:
https://gitlab.com/qemu-project/qemu/-/jobs/6232698129

../hw/loongarch/acpi-build.c: In function ‘build_flash_aml’:
../hw/loongarch/acpi-build.c:327:19: error: incompatible types when
assigning to type ‘hwaddr’ {aka ‘long long unsigned int’} from type
‘Int128’
327 | flash0_size = flash_mem->size;
| ^
../hw/loongarch/acpi-build.c:331:19: error: incompatible types when
assigning to type ‘hwaddr’ {aka ‘long long unsigned int’} from type
‘Int128’
331 | flash1_size = flash_mem->size;
| ^


../hw/loongarch/virt.c: In function ‘fdt_add_flash_node’:
../hw/loongarch/virt.c:131:19: error: incompatible types when
assigning to type ‘hwaddr’ {aka ‘long long unsigned int’} from type
‘Int128’
131 | flash0_size = flash_mem->size;
| ^
../hw/loongarch/virt.c:135:19: error: incompatible types when
assigning to type ‘hwaddr’ {aka ‘long long unsigned int’} from type
‘Int128’
135 | flash1_size = flash_mem->size;
| ^


The value you get back from pflash_cfi01_get_memory() is a
MemoryRegion -- this should be an opaque struct to you, not
something you can reach in and get the 'size' field from.
(The 'size' field is an Int128, which is not necessarily an
integer type known to the compiler -- on some platforms it is
a struct -- which is why this doesn't compile here.)

Your board code created these memory regions so typically it
should already know how big they are.  If you really
do need to get the size of a MemoryRegion, the function
to use is memory_region_size(

Got it ,  I will correct it.

Thanks.
Song Gao

thanks
-- PMM





Re: [PATCH] system/physmem: Fix migration dirty bitmap coherency with TCG memory access

2024-02-22 Thread Nicholas Piggin
On Fri Feb 23, 2024 at 6:59 AM AEST, Thomas Huth wrote:
> On 20/02/2024 02.13, Nicholas Piggin wrote:
> > On Tue Feb 20, 2024 at 12:10 AM AEST, Thomas Huth wrote:
> >> On 19/02/2024 07.17, Nicholas Piggin wrote:
> >>> The fastpath in cpu_physical_memory_sync_dirty_bitmap() to test large
> >>> aligned ranges forgot to bring the TCG TLB up to date after clearing
> >>> some of the dirty memory bitmap bits. This can result in stores through
> >>> the TCG TLB not setting the dirty memory bitmap and ultimately causes
> >>> memory corruption / lost updates during migration from a TCG host.
> >>>
> >>> Fix this by exporting an abstracted function to call when dirty bits
> >>> have been cleared.
> >>>
> >>> Fixes: aa8dc044772 ("migration: synchronize memory bitmap 64bits at a 
> >>> time")
> >>> Signed-off-by: Nicholas Piggin 
> >>> ---
> >>
> >> Sounds promising! ... but it doesn't seem to fix the migration-test qtest
> >> with s390x when it gets enabled again:
> > 
> > Did it fix kvm-unit-tests for you?
>
> It does, indeed! With your QEMU patch here, your new selftest-migration test 
> of the k-u-t is working reliably with TCG now, indeed. Thus feel free to add:
>
> Tested-by: Thomas Huth 

Great, thanks.

>
> >> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> >> --- a/tests/qtest/migration-test.c
> >> +++ b/tests/qtest/migration-test.c
> >> @@ -3385,15 +3385,6 @@ int main(int argc, char **argv)
> >>return g_test_run();
> >>}
> >>
> >> -/*
> >> - * Similar to ppc64, s390x seems to be touchy with TCG, so disable it
> >> - * there until the problems are resolved
> >> - */
> >> -if (g_str_equal(arch, "s390x") && !has_kvm) {
> >> -g_test_message("Skipping test: s390x host with KVM is required");
> >> -return g_test_run();
> >> -}
> >> -
> >>tmpfs = g_dir_make_tmp("migration-test-XX", );
> >>if (!tmpfs) {
> >>g_test_message("Can't create temporary directory in %s: %s",
> >>
> >> I wonder whether there is more stuff like this necessary somewhere?
> > 
> > Possibly. That's what the commit logs for the TCG disable indicate. I
> > have found another dirty bitmap TCG race too. I'll send it out after
> > some more testing.
> > 
> >> Did you try to re-enable tests/qtest/migration-test.c for ppc64 with TCG to
> >> see whether that works fine now?
> > 
> > Hmm, I did try and so far ppc64 is not failing even with upstream QEMU.
>
> Oh, indeed! Actually, now that you mentioned it, I remembered that I checked 
> it a couple of weeks ago already:
>
> https://lore.kernel.org/qemu-devel/7d4f5624-83d2-4330-9315-b23869529...@redhat.com/

Okay I'll look at re-enabling it then.

> > I'll try with s390x. Any additional build or runtime options to make it
> > break? How long does it take for breakage to be evident?
>
> For me, it normally breaks after running the migration test a couple of 
> times already, let's say one time out of ten runs?

Seems like a tricky one to debug.

It looks like the migration qtest is just migrating while incrementing each
char in 99MB of memory? Interesting if that breaks but k-u-t multi
migration on s390x does not. Could be worth looking at the differences
between them.

It is also odd the qtest didn't trigger this TCG bug. I have another
multi-migration test for kvm-unit-tests (not yet submitted) which does
similar dirtying of memory and that *does* break TCG.

Thanks,
Nick



Re: [PATCH v4 50/66] i386/tdx: handle TDG.VP.VMCALL

2024-02-22 Thread Xiaoyao Li

+ Feng Qiu,

On 2/23/2024 12:30 AM, Daniel P. Berrangé wrote:

On Wed, Jan 24, 2024 at 10:23:12PM -0500, Xiaoyao Li wrote:

From: Isaku Yamahata 

Add property "quote-generation-socket" to tdx-guest, which is a property
of type SocketAddress to specify Quote Generation Service(QGS).

On request of GetQuote, it connects to the QGS socket, read request
data from shared guest memory, send the request data to the QGS,
and store the response into shared guest memory, at last notify
TD guest by interrupt.

command line example:
   qemu-system-x86_64 \
 -object '{"qom-type":"tdx-guest","id":"tdx0","quote-generation-socket":{"type": "vsock", 
"cid":"1","port":"1234"}}' \
 -machine confidential-guest-support=tdx0

Note, above example uses vsock type socket because the QGS we used
implements the vsock socket. It can be other types, like UNIX socket,
which depends on the implementation of QGS.


Can you confirm again exactly what QGS impl you are testing against ?
> I've tried the impl at


https://github.com/intel/SGXDataCenterAttestationPrimitives/tree/master/QuoteGeneration/quote_wrapper/qgs

which supports UNIX sockets and VSOCK. In both cases, however, it
appears to be speaking a different protocol than your QEMU impl
below uses.

Specifically here:

   
https://github.com/intel/SGXDataCenterAttestationPrimitives/blob/master/QuoteGeneration/quote_wrapper/qgs/qgs_server.cpp#L143

it is reading 4 bytes of header, which are interpreted as the length
of the payload which will then be read off the wire. IIUC the payload
it expects is the TDREPORT struct.

Your QEMU patches here meanwhile are just sending the payload from
the GetQuote hypercall which is the TDREPORT struct.

IOW, QEMU is not sending the 4 byte length header the QGS expects.
and whole thing fails.


I'm using the one provided by internal folks, which supports 
interpreting the payload without the header.


I don't know when will the updated implementation show up in public 
github. @Feng Liu can help on it.




To avoid no response from QGS server, setup a timer for the transaction.
If timeout, make it an error and interrupt guest. Define the threshold of
time to 30s at present, maybe change to other value if not appropriate.

Signed-off-by: Isaku Yamahata 
Codeveloped-by: Chenyi Qiang 
Signed-off-by: Chenyi Qiang 
Codeveloped-by: Xiaoyao Li 
Signed-off-by: Xiaoyao Li 
---


With regards,
Daniel





Re: [RFC QEMU PATCH v4 1/1] xen: Use gsi instead of irq for mapping pirq

2024-02-22 Thread Stefano Stabellini
On Fri, 5 Jan 2024, Jiqian Chen wrote:
> In PVH dom0, it uses the linux local interrupt mechanism,
> when it allocs irq for a gsi, it is dynamic, and follow
> the principle of applying first, distributing first. And
> the irq number is alloced from small to large, but the
> applying gsi number is not, may gsi 38 comes before gsi
> 28, that causes the irq number is not equal with the gsi
> number. And when passthrough a device, qemu wants to use
> gsi to map pirq, xen_pt_realize->xc_physdev_map_pirq, but
> the gsi number is got from file
> /sys/bus/pci/devices//irq in current code, so it
> will fail when mapping.
> 
> Add gsi into XenHostPCIDevice and use gsi number that
> read from gsi sysfs if it exists.
> 
> Co-developed-by: Huang Rui 
> Signed-off-by: Jiqian Chen 

Reviewed-by: Stefano Stabellini 


> ---
>  hw/xen/xen-host-pci-device.c | 7 +++
>  hw/xen/xen-host-pci-device.h | 1 +
>  hw/xen/xen_pt.c  | 6 +-
>  3 files changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/xen/xen-host-pci-device.c b/hw/xen/xen-host-pci-device.c
> index 8c6e9a1716a2..5be3279aa25b 100644
> --- a/hw/xen/xen-host-pci-device.c
> +++ b/hw/xen/xen-host-pci-device.c
> @@ -370,6 +370,13 @@ void xen_host_pci_device_get(XenHostPCIDevice *d, 
> uint16_t domain,
>  }
>  d->irq = v;
>  
> +xen_host_pci_get_dec_value(d, "gsi", , errp);
> +if (*errp) {
> +d->gsi = -1;
> +} else {
> +d->gsi = v;
> +}
> +
>  xen_host_pci_get_hex_value(d, "class", , errp);
>  if (*errp) {
>  goto error;
> diff --git a/hw/xen/xen-host-pci-device.h b/hw/xen/xen-host-pci-device.h
> index 4d8d34ecb024..74c552bb5548 100644
> --- a/hw/xen/xen-host-pci-device.h
> +++ b/hw/xen/xen-host-pci-device.h
> @@ -27,6 +27,7 @@ typedef struct XenHostPCIDevice {
>  uint16_t device_id;
>  uint32_t class_code;
>  int irq;
> +int gsi;
>  
>  XenHostPCIIORegion io_regions[PCI_NUM_REGIONS - 1];
>  XenHostPCIIORegion rom;
> diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c
> index 36e6f93c372f..d448f3a17306 100644
> --- a/hw/xen/xen_pt.c
> +++ b/hw/xen/xen_pt.c
> @@ -839,7 +839,11 @@ static void xen_pt_realize(PCIDevice *d, Error **errp)
>  goto out;
>  }
>  
> -machine_irq = s->real_device.irq;
> +if (s->real_device.gsi < 0) {
> +machine_irq = s->real_device.irq;
> +} else {
> +machine_irq = s->real_device.gsi;
> +}
>  if (machine_irq == 0) {
>  XEN_PT_LOG(d, "machine irq is 0\n");
>  cmd |= PCI_COMMAND_INTX_DISABLE;
> -- 
> 2.34.1
> 



Re: [PATCH] migration: Fix qmp_query_migrate mbps value

2024-02-22 Thread Peter Xu
On Thu, Feb 22, 2024 at 10:49:12AM -0300, Fabiano Rosas wrote:
> Peter Xu  writes:
> 
> > On Thu, Feb 22, 2024 at 05:40:41PM +0800, Peter Xu wrote:
> >> On Wed, Feb 21, 2024 at 09:56:36AM -0300, Fabiano Rosas wrote:
> >> > Peter Xu  writes:
> >> > 
> >> > > On Mon, Feb 19, 2024 at 04:44:57PM -0300, Fabiano Rosas wrote:
> >> > >> The QMP command query_migrate might see incorrect throughput numbers
> >> > >> if it runs after we've set the migration completion status but before
> >> > >> migration_calculate_complete() has updated s->total_time and s->mbps.
> >> > >> 
> >> > >> The migration status would show COMPLETED, but the throughput value
> >> > >> would be the one from the last iteration and not the one from the
> >> > >> whole migration. This will usually be a larger value due to the time
> >> > >> period being smaller (one iteration).
> >> > >> 
> >> > >> Move migration_calculate_complete() earlier so that the status
> >> > >> MIGRATION_STATUS_COMPLETED is only emitted after the final counters
> >> > >> update.
> >> > >> 
> >> > >> Signed-off-by: Fabiano Rosas 
> >> > >> ---
> >> > >> CI run: https://gitlab.com/farosas/qemu/-/pipelines/1182405776
> >> > >> ---
> >> > >>  migration/migration.c | 10 ++
> >> > >>  1 file changed, 6 insertions(+), 4 deletions(-)
> >> > >> 
> >> > >> diff --git a/migration/migration.c b/migration/migration.c
> >> > >> index ab21de2cad..7486d59da0 100644
> >> > >> --- a/migration/migration.c
> >> > >> +++ b/migration/migration.c
> >> > >> @@ -102,6 +102,7 @@ static int migration_maybe_pause(MigrationState 
> >> > >> *s,
> >> > >>   int new_state);
> >> > >>  static void migrate_fd_cancel(MigrationState *s);
> >> > >>  static bool close_return_path_on_source(MigrationState *s);
> >> > >> +static void migration_calculate_complete(MigrationState *s);
> >> > >>  
> >> > >>  static void migration_downtime_start(MigrationState *s)
> >> > >>  {
> >> > >> @@ -2746,6 +2747,7 @@ static void migration_completion(MigrationState 
> >> > >> *s)
> >> > >>  migrate_set_state(>state, MIGRATION_STATUS_ACTIVE,
> >> > >>MIGRATION_STATUS_COLO);
> >> > >>  } else {
> >> > >> +migration_calculate_complete(s);
> >> > >>  migrate_set_state(>state, current_active_state,
> >> > >>MIGRATION_STATUS_COMPLETED);
> >> > >>  }
> >> > >> @@ -2784,6 +2786,7 @@ static void 
> >> > >> bg_migration_completion(MigrationState *s)
> >> > >>  goto fail;
> >> > >>  }
> >> > >>  
> >> > >> +migration_calculate_complete(s);
> >> > >>  migrate_set_state(>state, current_active_state,
> >> > >>MIGRATION_STATUS_COMPLETED);
> >> > >>  return;
> >> > >> @@ -2993,12 +2996,15 @@ static void 
> >> > >> migration_calculate_complete(MigrationState *s)
> >> > >>  int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> >> > >>  int64_t transfer_time;
> >> > >>  
> >> > >> +/* QMP could read from these concurrently */
> >> > >> +bql_lock();
> >> > >>  migration_downtime_end(s);
> >> > >>  s->total_time = end_time - s->start_time;
> >> > >>  transfer_time = s->total_time - s->setup_time;
> >> > >>  if (transfer_time) {
> >> > >>  s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
> >> > >>  }
> >> > >> +bql_unlock();
> >> > >
> >> > > The lock is not needed?
> >> > >
> >> > > AFAIU that was needed because of things like runstate_set() rather than
> >> > > setting of these fields.
> >> > >
> >> > 
> >> > Don't we need to keep the total_time and mbps update atomic? Otherwise
> >> > query-migrate might see (say) total_time=0 and mbps= or
> >> > total_time= and mbps=.
> >> 
> >> I thought it wasn't a major concern, but what you said makes sense; taking
> >> it one more time doesn't really hurt after all to provide such benefit.
> >> 
> >> > 
> >> > Also, what orders s->mbps update before the s->state update? I'd say we
> >> > should probably hold the lock around the whole total_time,mbps,state
> >> > update.
> >> 
> >> IMHO that's fine; mutex unlock implies a RELEASE.  See atomic.rst:
> >> 
> >> - ``pthread_mutex_lock`` has acquire semantics, ``pthread_mutex_unlock`` 
> >> has
> >>   release semantics and synchronizes with a ``pthread_mutex_lock`` for the
> >>   same mutex.
> >
> > Hmm perhaps I wrote too soon.. it should only guarantee the ordering of the
> > update on the lock variable itself v.s. any previous R, nothing else.
> > Only if the other side uses bql_lock() will it guarantee proper ordering.
> >
> > Put them in bql should work, but I hesitate such use to start using bql
> > to protect state updates.
> 
> Well, on the other hand that's a major use-case of the BQL: protecting
> state that's used by QMP.
> 
> >
> > How about we drop the lock, but use an explicit smp_mb_release()?  We may
> > > also want to use smp_load_acquire() in fill_source_migration_info() to use
> > on reading >state (all 

[PATCH 1/6] hw/misc/ivshmem: Add ivshmem-flat device

2024-02-22 Thread Gustavo Romero
Add a new device, ivshmem-flat, which is similar to the ivshmem PCI but
does not require a PCI bus. It's meant to be used on machines like those
with Cortex-M MCUs, which usually lack a PCI/PCIe bus, e.g. lm3s6965evb
and mps2-an385.

The device currently only supports the sysbus bus.

The new device, just like the ivshmem PCI device, supports both peer
notification via hardware interrupts and shared memory.

The device shared memory size can be set using the 'shmem-size' option
and it defaults to 4 MiB, which is the default size of shmem allocated
by the ivshmem server.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1134
Message-ID: <20231127052024.435743-2-gustavo.rom...@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé 
Signed-off-by: Gustavo Romero 
---
 docs/system/devices/ivshmem-flat.rst |  33 ++
 hw/misc/Kconfig  |   5 +
 hw/misc/ivshmem-flat.c   | 463 +++
 hw/misc/meson.build  |   2 +
 hw/misc/trace-events |  17 +
 include/hw/misc/ivshmem-flat.h   |  85 +
 6 files changed, 605 insertions(+)
 create mode 100644 docs/system/devices/ivshmem-flat.rst
 create mode 100644 hw/misc/ivshmem-flat.c
 create mode 100644 include/hw/misc/ivshmem-flat.h

diff --git a/docs/system/devices/ivshmem-flat.rst 
b/docs/system/devices/ivshmem-flat.rst
new file mode 100644
index 00..1f97052804
--- /dev/null
+++ b/docs/system/devices/ivshmem-flat.rst
@@ -0,0 +1,33 @@
+Inter-VM Shared Memory Flat Device
+--
+
+The ivshmem-flat device is meant to be used on machines that lack a PCI bus,
+making them unsuitable for the use of the traditional ivshmem device modeled as
+a PCI device. Machines like those with a Cortex-M MCU are good candidates to 
use
+the ivshmem-flat device. Also, since the flat version maps the control and
+status registers directly to the memory, it requires a quite tiny "device
+driver" to interact with other VMs, which is useful in some RTOSes, like
+Zephyr, which usually run on constrained resource targets.
+
+Similar to the ivshmem device, the ivshmem-flat device supports both peer
+notification via HW interrupts and Inter-VM shared memory. This allows the
+device to be used together with the traditional ivshmem, enabling communication
+between, for instance, an aarch64 VM  (using the traditional ivshmem device and
+running Linux), and an arm VM (using the ivshmem-flat device and running Zephyr
+instead).
+
+The ivshmem-flat device does not support the use of a ``memdev`` option (see
+ivshmem.rst for more details). It relies on the ivshmem server to create and
+distribute the proper shared memory file descriptor and the eventfd(s) to 
notify
+(interrupt) the peers. Therefore, to use this device, it is always necessary to
+have an ivshmem server up and running for proper device creation.
+
+Although the ivshmem-flat supports both peer notification (interrupts) and
+shared memory, the interrupt mechanism is optional. If no input IRQ is
+specified for the device it is disabled, preventing the VM from notifying or
+being notified by other VMs (a warning will be displayed to the user to inform
+the IRQ mechanism is disabled). The shared memory region is always present.
+
+The MMRs (INTRMASK, INTRSTATUS, IVPOSITION, and DOORBELL registers) offsets at
+the MMR region, and their functions, follow the ivshmem spec, so they work
+exactly as in the ivshmem PCI device (see ./specs/ivshmem-spec.txt).
diff --git a/hw/misc/Kconfig b/hw/misc/Kconfig
index 4fc6b29b43..a643cfac3a 100644
--- a/hw/misc/Kconfig
+++ b/hw/misc/Kconfig
@@ -68,6 +68,11 @@ config IVSHMEM_DEVICE
 default y if PCI_DEVICES
 depends on PCI && LINUX && IVSHMEM && MSI_NONBROKEN
 
+config IVSHMEM_FLAT_DEVICE
+bool
+default y
+depends on LINUX && IVSHMEM
+
 config ECCMEMCTL
 bool
 select ECC
diff --git a/hw/misc/ivshmem-flat.c b/hw/misc/ivshmem-flat.c
new file mode 100644
index 00..833ee2fefb
--- /dev/null
+++ b/hw/misc/ivshmem-flat.c
@@ -0,0 +1,463 @@
+/*
+ * Inter-VM Shared Memory Flat Device
+ *
+ * SPDX-FileCopyrightText: 2023 Linaro Ltd.
+ * SPDX-FileContributor: Gustavo Romero 
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "hw/irq.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/sysbus.h"
+#include "chardev/char-fe.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
+
+#include "hw/misc/ivshmem-flat.h"
+
+static int64_t ivshmem_flat_recv_msg(IvshmemFTState *s, int *pfd)
+{
+int64_t msg;
+int n, ret;
+
+n = 0;
+do {
+ret = qemu_chr_fe_read_all(>server_chr, (uint8_t *) + n,
+   sizeof(msg) - n);
+if (ret < 0) {
+if (ret == -EINTR) {
+continue;
+}
+exit(1);
+}
+n += ret;
+} 

[PATCH 0/6] Add ivshmem-flat device

2024-02-22 Thread Gustavo Romero
Since v1:
- Correct code style
- Correct trace event format strings
- Include minimum headers in ivshmem-flat.h
- Allow ivshmem_flat_recv_msg() take NULL
- Factored ivshmem_flat_connect_server() out
- Split sysbus-auto-wire controversial code in different patch
- Document QDev interface

Since v2:
- Addressed all comments from Thomas Huth about qtest:
  1) Use of g_usleep + number of attempts for timeout
  2) Use of g_get_tmp_dir instead of hard-coded /tmp
  3) Test if machine lm3s6965evb is available, if not skip test
- Use of qemu_irq_pulse instead of 2x qemu_set_irq
- Fixed all tests for new device options and IRQ name change
- Updated doc and commit messages regarding new/deleted device options
- Turned device options 'x-bus-address-iomem' and 'x-bus-address-shmem' 
mandatory

--

This patchset introduces a new device, ivshmem-flat, which is similar to the
current ivshmem device but does not require a PCI bus. It implements the ivshmem
status and control registers as MMRs and the shared memory as a directly
accessible memory region in the VM memory layout. It's meant to be used on
machines like those with Cortex-M MCUs, which usually lack a PCI bus, e.g.,
lm3s6965evb and mps2-an385. Additionally, it has the benefit of requiring a tiny
'device driver,' which is helpful on some RTOSes, like Zephyr, that run on
memory-constrained resource targets.

The patchset includes a QTest for the ivshmem-flat device, however, it's also
possible to experiment with it in two ways:

(a) using two Cortex-M VMs running Zephyr; or
(b) using one aarch64 VM running Linux with the ivshmem PCI device and another
arm (Cortex-M) VM running Zephyr with the new ivshmem-flat device.

Please note that for running the ivshmem-flat QTests the following patch, which
is not committed to the tree yet, must be applied:

https://lists.nongnu.org/archive/html/qemu-devel/2023-11/msg03176.html

--

To experiment with (a), clone this Zephyr repo [0], set the Zephyr build
environment [1], and follow the instructions in the 'ivshmem' sample main.c [2].

[0] https://github.com/gromero/zephyr/tree/ivshmem
[1] https://docs.zephyrproject.org/latest/develop/getting_started/index.html
[2] 
https://github.com/gromero/zephyr/commit/73fbd481e352b25ae5483ba5048a2182b90b7f00#diff-16fa1f481a49b995d0d1a62da37b9f33033f5ee477035e73465e7208521ddbe0R9-R70
[3] 
https://lore.kernel.org/qemu-devel/20231127052024.435743-1-gustavo.rom...@linaro.org/

To experiment with (b):

$ git clone -b uio_ivshmem --single-branch https://github.com/gromero/linux.git
$ cd linux
$ wget 
https://people.linaro.org/~gustavo.romero/ivshmem/arm64_uio_ivshmem.config -O 
.config

If in an x86_64 machine, cross compile the kernel, for instance:

$ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j 36

Install image in some directory, let's say, in ~/linux:

$ mkdir ~/linux
$ export INSTALL_PATH=~/linux
$ make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j 36 install

or, if you prefer, download the compiled image from:

$ wget 
https://people.linaro.org/~gustavo.romero/ivshmem/vmlinuz-6.6.0-rc1-g28f3f88ee261

... and then the rootfs:

$ wget https://people.linaro.org/~gustavo.romero/ivshmem/rootfs.qcow2

Now, build QEMU with this patchset applied:

$ mkdir build && cd build
$ ../configure --target-list=arm-softmmu,aarch64-softmmu
$ make -j 36

Start the ivshmem server:

$ contrib/ivshmem-server/ivshmem-server -F

Start the aarch64 VM + Linux + ivshmem PCI device:

$ ./qemu-system-aarch64 -kernel ~/linux/vmlinuz-6.6.0-rc1-g28f3f88ee261 -append 
"root=/dev/vda initrd=/bin/bash console=ttyAMA0,115200" -drive 
file=~/linux/rootfs.qcow2,media=disk,if=virtio -machine virt-6.2 -nographic 
-accel tcg -cpu cortex-a57 -m 8192 -netdev 
bridge,id=hostnet0,br=virbr0,helper=/usr/lib/qemu/qemu-bridge-helper -device 
pcie-root-port,port=8,chassis=1,id=pci.1,bus=pcie.0,multifunction=on,addr=0x1 
-device 
virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:d9:d1:12,bus=pci.1,addr=0x0 
-device ivshmem-doorbell,vectors=2,chardev=ivshmem -chardev 
socket,path=/tmp/ivshmem_socket,id=ivshmem

Log into the VM with user/pass: root/abc123

should show:

[2.656367] uio_ivshmem :00:02.0: ivshmem-mmr at 0x10203000, 
size 0x1000
[2.656931] uio_ivshmem :00:02.0: ivshmem-shmem at 0x0080, 
size 0x0040
[2.662554] uio_ivshmem :00:02.0: module successfully loaded

In another console, clone and build Zephyr image from 'uio_ivshmem' branch:

$ git clone -b uio_ivshmem --single-branch https://github.com/gromero/zephyr
$ west -v --verbose build -p always -b qemu_cortex_m3 ./samples/uio_ivshmem/

... and then start the arm VM + Zephyr image + ivshmem-flat device:

$ ./qemu-system-arm -machine lm3s6965evb -nographic -net none -chardev 
socket,path=/tmp/ivshmem_socket,id=ivshmem_flat -device 
ivshmem-flat,chardev=ivshmem_flat,x-irq-qompath='/machine/unattached/device[1]/nvic/unnamed-gpio-in[0]',x-bus-qompath='/sysbus'
 -kernel 

[PATCH 4/6] hw/misc/ivshmem: Rename ivshmem to ivshmem-pci

2024-02-22 Thread Gustavo Romero
Because now there is also an MMIO ivshmem device (ivshmem-flat.c), and
ivshmem.c is a PCI specific implementation, rename it to ivshmem-pci.c.

Reviewed-by: Philippe Mathieu-Daudé 
Message-ID: <20231127052024.435743-5-gustavo.rom...@linaro.org>
Signed-off-by: Gustavo Romero 
---
 hw/misc/{ivshmem.c => ivshmem-pci.c} | 0
 hw/misc/meson.build  | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename hw/misc/{ivshmem.c => ivshmem-pci.c} (100%)

diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem-pci.c
similarity index 100%
rename from hw/misc/ivshmem.c
rename to hw/misc/ivshmem-pci.c
diff --git a/hw/misc/meson.build b/hw/misc/meson.build
index 84dff09f5d..4a9369082b 100644
--- a/hw/misc/meson.build
+++ b/hw/misc/meson.build
@@ -39,7 +39,7 @@ system_ss.add(when: 'CONFIG_SIFIVE_U_PRCI', if_true: 
files('sifive_u_prci.c'))
 subdir('macio')
 
 # ivshmem devices
-system_ss.add(when: 'CONFIG_IVSHMEM_DEVICE', if_true: files('ivshmem.c'))
+system_ss.add(when: 'CONFIG_IVSHMEM_DEVICE', if_true: files('ivshmem-pci.c'))
 system_ss.add(when: 'CONFIG_IVSHMEM_FLAT_DEVICE', if_true: 
files('ivshmem-flat.c'))
 
 system_ss.add(when: 'CONFIG_ALLWINNER_SRAMC', if_true: 
files('allwinner-sramc.c'))
-- 
2.34.1




[PATCH 5/6] tests/qtest: Reorganize common code in ivshmem-test

2024-02-22 Thread Gustavo Romero
This commit reorganizes the ivshmem-test qtest by moving common structs,
functions, and code that can be utilized by other ivshmem qtests into
two new files: ivshmem-utils.h and ivshmem-utils.c.

Enum Reg, struct ServerThread, and mktempshm() have been relocated to
these new files. Two new functions have been introduced to handle the
ivshmem server start/stop: test_ivshmem_server_{start,stop}.

To accommodate the new way for starting/stopping the ivshmem server,
struct ServerThread now includes two new members: 'server', previously
present but not a member of any struct; and 'status', a new member of a
new type, ServerStartStatus, used to track and handle service
termination properly.

Additionally, a new function, mktempsocket(), has been added to help
create a unix socket filename, similar to what mktempshm() does for the
creation of a shm file.

Finally, the ivshmem-test qtest has been adapted to use the new ivshmem
utils. Adjustments in that sense have also been made to meson.build;
also 'rt' has been removed as a lib dependency for ivshmem-test.c.

Two lines unrelated to these changes have had their line indentation
also fixed in meson.build.

Message-ID: <20231127052024.435743-3-gustavo.rom...@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé 
Signed-off-by: Gustavo Romero 
---
 tests/qtest/ivshmem-test.c  | 113 ++
 tests/qtest/ivshmem-utils.c | 156 
 tests/qtest/ivshmem-utils.h |  56 +
 tests/qtest/meson.build |   6 +-
 4 files changed, 222 insertions(+), 109 deletions(-)
 create mode 100644 tests/qtest/ivshmem-utils.c
 create mode 100644 tests/qtest/ivshmem-utils.h

diff --git a/tests/qtest/ivshmem-test.c b/tests/qtest/ivshmem-test.c
index 9bf8e78df6..5ce43e2f76 100644
--- a/tests/qtest/ivshmem-test.c
+++ b/tests/qtest/ivshmem-test.c
@@ -3,17 +3,17 @@
  *
  * Copyright (c) 2014 SUSE LINUX Products GmbH
  * Copyright (c) 2015 Red Hat, Inc.
+ * Copyright (c) 2023 Linaro Ltd.
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  */
 
-#include "qemu/osdep.h"
-#include 
-#include "contrib/ivshmem-server/ivshmem-server.h"
+#include "ivshmem-utils.h"
 #include "libqos/libqos-pc.h"
 #include "libqos/libqos-spapr.h"
-#include "libqtest.h"
+
+static ServerThread thread;
 
 #define TMPSHMSIZE (1 << 20)
 static char *tmpshm;
@@ -45,13 +45,6 @@ typedef struct _IVState {
 QPCIDevice *dev;
 } IVState;
 
-enum Reg {
-INTRMASK = 0,
-INTRSTATUS = 4,
-IVPOSITION = 8,
-DOORBELL = 12,
-};
-
 static const char* reg2str(enum Reg reg) {
 switch (reg) {
 case INTRMASK:
@@ -241,54 +234,6 @@ static void test_ivshmem_pair(void)
 g_free(data);
 }
 
-typedef struct ServerThread {
-GThread *thread;
-IvshmemServer *server;
-int pipe[2]; /* to handle quit */
-} ServerThread;
-
-static void *server_thread(void *data)
-{
-ServerThread *t = data;
-IvshmemServer *server = t->server;
-
-while (true) {
-fd_set fds;
-int maxfd, ret;
-
-FD_ZERO();
-FD_SET(t->pipe[0], );
-maxfd = t->pipe[0] + 1;
-
-ivshmem_server_get_fds(server, , );
-
-ret = select(maxfd, , NULL, NULL, NULL);
-
-if (ret < 0) {
-if (errno == EINTR) {
-continue;
-}
-
-g_critical("select error: %s\n", strerror(errno));
-break;
-}
-if (ret == 0) {
-continue;
-}
-
-if (FD_ISSET(t->pipe[0], )) {
-break;
-}
-
-if (ivshmem_server_handle_fds(server, , maxfd) < 0) {
-g_critical("ivshmem_server_handle_fds() failed\n");
-break;
-}
-}
-
-return NULL;
-}
-
 static void setup_vm_with_server(IVState *s, int nvectors)
 {
 char *cmd;
@@ -304,27 +249,12 @@ static void setup_vm_with_server(IVState *s, int nvectors)
 
 static void test_ivshmem_server(void)
 {
-g_autoptr(GError) err = NULL;
 IVState state1, state2, *s1, *s2;
-ServerThread thread;
-IvshmemServer server;
 int ret, vm1, vm2;
 int nvectors = 2;
 guint64 end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND;
 
-ret = ivshmem_server_init(, tmpserver, tmpshm, true,
-  TMPSHMSIZE, nvectors,
-  g_test_verbose());
-g_assert_cmpint(ret, ==, 0);
-
-ret = ivshmem_server_start();
-g_assert_cmpint(ret, ==, 0);
-
-thread.server = 
-g_unix_open_pipe(thread.pipe, FD_CLOEXEC, );
-g_assert_no_error(err);
-thread.thread = g_thread_new("ivshmem-server", server_thread, );
-g_assert(thread.thread != NULL);
+test_ivshmem_server_start(, tmpserver, tmpshm, nvectors);
 
 setup_vm_with_server(, nvectors);
 s1 = 
@@ -367,15 +297,7 @@ static void test_ivshmem_server(void)
 cleanup_vm(s2);
 cleanup_vm(s1);
 
-if (qemu_write_full(thread.pipe[1], "q", 1) != 1) 

[PATCH 2/6] hw/misc/ivshmem-flat: Allow device to wire itself on sysbus

2024-02-22 Thread Gustavo Romero
This commit enables the ivshmem-flat device to wire itself on sysbus. It
maps the device's Memory-Mapped Registers (MMRs) and shared memory
(shmem) into the VM's memory layout and also allows connection to an
input IRQ so that the device can trigger an interrupt for notification.

Three device options are introduced to control how this is done:
x-bus-address-iomem, x-bus-address-shmem, and x-irq-qompath.

The following is an example on how to create the ivshmem-flat device on
a Stellaris machine:

$ qemu-system-arm -cpu cortex-m3 -machine lm3s6965evb -nographic
  -net none -chardev stdio,id=con,mux=on
  -serial chardev:con -mon chardev=con,mode=readline
  -chardev socket,path=/tmp/ivshmem_socket,id=ivshmem_flat
  -device 
ivshmem-flat,chardev=ivshmem_flat,x-irq-qompath='/machine/soc/v7m/nvic/unnamed-gpio-in[0]',x-bus-address-iomem=0x400FF000,x-bus-address-shmem=0x4010
  -kernel zephyr_kernel.elf

The IRQ QOM path option for the target machine can be determined by
creating the VM without the ivshmem-flat device, going to the QEMU
console and listing the QOM nodes with 'info qom-tree'. In the Stellaris
example above the input IRQ is in the machine's NVIC Interrupt
Controller.

If 'x-irq-qompath' is not provided the device won't be able to be
interrupted by other VMs (peers) and only the shared memory (shmem)
feature will be supported.

The MMRs for status and control (notification) are mapped to the MMIO
region at 'x-bus-address-iomem', whilst the shared memory region start
is mapped at address specified by 'x-bus-address-shmem'.

Message-ID: <20231127052024.435743-2-gustavo.rom...@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé 
Signed-off-by: Gustavo Romero 
---
 docs/system/devices/ivshmem-flat.rst | 57 +
 hw/core/sysbus-fdt.c |  2 +
 hw/misc/ivshmem-flat.c   | 74 ++--
 include/hw/misc/ivshmem-flat.h   |  9 
 4 files changed, 139 insertions(+), 3 deletions(-)

diff --git a/docs/system/devices/ivshmem-flat.rst 
b/docs/system/devices/ivshmem-flat.rst
index 1f97052804..ddc3477f52 100644
--- a/docs/system/devices/ivshmem-flat.rst
+++ b/docs/system/devices/ivshmem-flat.rst
@@ -31,3 +31,60 @@ the IRQ mechanism is disabled). The shared memory region is 
always present.
 The MMRs (INTRMASK, INTRSTATUS, IVPOSITION, and DOORBELL registers) offsets at
 the MMR region, and their functions, follow the ivshmem spec, so they work
 exactly as in the ivshmem PCI device (see ./specs/ivshmem-spec.txt).
+
+
+Device Options
+--
+
+The required options to create an ivshmem-flat device are: (a) the UNIX
+socket where the ivshmem server is listening, usually ``/tmp/ivshmem_socket``;
+(b) the address where to map the MMRs (``x-bus-address-iomem=``) in the VM
+memory layout; and (c) the address where to map the shared memory in the VM
+memory layout (``x-bus-address-shmem=``). Both (b) and (c) depend on the VM
+being used, as the MMRs and shmem must be mapped to a region not previously
+occupied in the VM.
+
+Example:
+
+.. parsed-literal::
+
+|qemu-system-arm| -chardev socket,path=/tmp/ivshmem_socket,id=ivshmem_flat 
-device 
ivshmem-flat,chardev=ivshmem_flat,x-irq-qompath='/machine/soc/v7m/nvic/unnamed-gpio-in[0]',x-bus-address-iomem=0x400FF000,x-bus-address-shmem=0x4010
+
+The other option, ``x-irq-qompath=``, is not required if the user doesn't want
+the device supporting notifications.
+
+``x-irq-qompath``. Used to inform the device which IRQ input line it can attach
+to enable the notification mechanism (IRQ). The ivshmem-flat device currently
+only supports notification via vector 0. Notifications via other vectors are
+ignored. (optional)
+
+Two examples for different machines follow.
+
+Stellaris machine (``-machine lm3s6965evb``):
+
+::
+
+x-irq-qompath=/machine/soc/v7m/nvic/unnamed-gpio-in[0]
+
+Arm mps2-an385 machine (``-machine mps2-an385``):
+
+::
+
+x-irq-qompath=/machine/armv7m/nvic/unnamed-gpio-in[0]
+
+The available IRQ input lines on a given VM that the ivshmem-flat device can be
+attached to can be found from the QEMU monitor (Ctrl-a + c) with:
+
+(qemu) info qom-tree
+
+``x-bus-address-iomem``. Allows changing the address where the MMRs are mapped
+into the VM memory layout. (required)
+
+ ``x-bus-address-shmem``. Allows changing the address where the shared memory
+region is mapped into the VM memory layout. (required)
+
+``shmem-size``. Allows changing the size (in bytes) of shared memory region.
+Default is 4 MiB, which is the same default used by the ivshmem server, so
+usually it's not necessary to change it. The size must match the size of the
+shared memory reserved and informed by the ivshmem server, otherwise device
+creation fails. (optional)
diff --git a/hw/core/sysbus-fdt.c b/hw/core/sysbus-fdt.c
index eebcd28f9a..40d7356cae 100644
--- a/hw/core/sysbus-fdt.c
+++ b/hw/core/sysbus-fdt.c
@@ -31,6 +31,7 @@
 

[PATCH 6/6] tests/qtest: Add ivshmem-flat test

2024-02-22 Thread Gustavo Romero
This commit adds a qtest for the ivshmem-flat device to test memory
sharing, IRQ triggering, and the memory mapped registers in the device.

Based-on: https://lists.gnu.org/archive/html/qemu-devel/2023-11/msg03176.html
Message-ID: <20231127052024.435743-4-gustavo.rom...@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé 
Signed-off-by: Gustavo Romero 
---
 tests/qtest/ivshmem-flat-test.c | 338 
 tests/qtest/meson.build |   2 +
 2 files changed, 340 insertions(+)
 create mode 100644 tests/qtest/ivshmem-flat-test.c

diff --git a/tests/qtest/ivshmem-flat-test.c b/tests/qtest/ivshmem-flat-test.c
new file mode 100644
index 00..b6f59bba54
--- /dev/null
+++ b/tests/qtest/ivshmem-flat-test.c
@@ -0,0 +1,338 @@
+/*
+ * Inter-VM Shared Memory Flat Device qtests
+ *
+ * SPDX-FileCopyrightText: 2023 Linaro Ltd.
+ * SPDX-FileContributor: Gustavo Romero 
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ */
+
+#include "ivshmem-utils.h"
+
+#define IVSHMEM_FLAT_MMR_ADDR 0x400FF000
+#define IVSHMEM_FLAT_SHM_ADDR 0x4010
+#define SHM_SIZE 131072 /* 128k */
+
+static ServerThread thread;
+
+uint32_t *shm_ptr;
+char *shm_rel_path;
+char *server_socket_path;
+
+static void cleanup(void)
+{
+if (shm_ptr) {
+munmap(shm_ptr, SHM_SIZE);
+shm_ptr = NULL;
+}
+
+if (shm_rel_path) {
+shm_unlink(shm_rel_path);
+shm_rel_path = NULL;
+}
+
+if (server_socket_path) {
+unlink(server_socket_path);
+server_socket_path = NULL;
+}
+}
+
+static void abort_handler(void *data)
+{
+test_ivshmem_server_stop();
+cleanup();
+}
+
+/*
+ * Check if exactly 1 positive pulse (low->high->low) on 'irq' qtest IRQ line
+ * happens. N.B.: 'irq' must be intercepted using qtest_irq_intercept_* before
+ * this function can be used on it. It returns 0 when pulse is detected,
+ * otherwise 1.
+ */
+static int test_ivshmem_flat_irq_positive_pulse(QTestState *qts, int irq)
+{
+uint64_t num_raises = 0;
+uint64_t num_lows = 0;
+int attempts = 0;
+
+while (attempts < 5) {
+num_raises = qtest_get_irq_raised_counter(qts, 0);
+if (num_raises) {
+num_lows = qtest_get_irq_lowered_counter(qts, 0);
+/* Check for exactly 1 raise and 1 low IRQ event */
+if (num_raises == num_lows && num_lows == 1) {
+return 0; /* Pulse detected */
+}
+}
+
+   g_usleep(1);
+   attempts++;
+}
+
+g_message("%s: Timeout expired", __func__);
+return 1;
+}
+
+static inline uint32_t read_reg(QTestState *qts, enum Reg reg)
+{
+uint32_t v;
+
+qtest_memread(qts, IVSHMEM_FLAT_MMR_ADDR + reg, , sizeof(v));
+
+return v;
+}
+
+static inline void write_reg(QTestState *qts, enum Reg reg, uint32_t v)
+{
+qtest_memwrite(qts, IVSHMEM_FLAT_MMR_ADDR + reg, , sizeof(v));
+}
+
+/*
+ * Setup a test VM with ivshmem-flat device attached, IRQ properly set, and
+ * connected to the ivshmem-server.
+ */
+static QTestState *setup_vm(void)
+{
+QTestState *qts;
+const char *cmd_line;
+
+/*
+ * x-bus-address-{iomem,shmem} are just random addresses that don't 
conflict
+ * with any other address in the lm3s6965evb machine. shmem-size used is
+ * much smaller than the ivshmem server default (4 MiB) to save memory
+ * resources when testing.
+ */
+cmd_line = g_strdup_printf("-machine lm3s6965evb "
+   "-chardev socket,path=%s,id=ivshm "
+   "-device ivshmem-flat,chardev=ivshm,"
+   
"x-irq-qompath='/machine/soc/v7m/nvic/unnamed-gpio-in[0]',"
+   "x-bus-address-iomem=%#x,"
+   "x-bus-address-shmem=%#x,"
+   "shmem-size=%d",
+   server_socket_path,
+   IVSHMEM_FLAT_MMR_ADDR,
+   IVSHMEM_FLAT_SHM_ADDR,
+   SHM_SIZE);
+
+qts = qtest_init(cmd_line);
+
+return qts;
+}
+
+static void test_ivshmem_flat_irq(void)
+{
+QTestState *vm_state;
+uint16_t own_id;
+
+vm_state = setup_vm();
+
+qtest_irq_intercept_out_named(vm_state,
+  "/machine/peripheral-anon/device[0]",
+  "sysbus-irq");
+
+/* IVPOSITION has the device's own ID distributed by the ivshmem-server. */
+own_id = read_reg(vm_state, IVPOSITION);
+
+/* Make device notify itself. */
+write_reg(vm_state, DOORBELL, (own_id << 16) | 0 /* vector 0 */);
+
+/*
+ * Check intercepted device's IRQ output line. 'sysbus-irq' was associated
+ * to qtest IRQ 0 when intercepted and after self notification qtest IRQ 0
+ * must be toggled by the device. The test fails if no toggling is 
detected.
+ */
+g_assert(test_ivshmem_flat_irq_positive_pulse(vm_state,
+ 

[PATCH 3/6] hw/arm: Allow some machines to use the ivshmem-flat device

2024-02-22 Thread Gustavo Romero
Allow Arm machine lm3s6965evb and the mps2 ones, like the mps2-an385, to
use the ivshmem-flat device.

Message-ID: <20231127052024.435743-2-gustavo.rom...@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé 
Signed-off-by: Gustavo Romero 
---
 hw/arm/mps2.c  | 3 +++
 hw/arm/stellaris.c | 3 +++
 hw/arm/virt.c  | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/hw/arm/mps2.c b/hw/arm/mps2.c
index 50919ee46d..fe158dfbc0 100644
--- a/hw/arm/mps2.c
+++ b/hw/arm/mps2.c
@@ -42,6 +42,7 @@
 #include "hw/timer/cmsdk-apb-dualtimer.h"
 #include "hw/misc/mps2-scc.h"
 #include "hw/misc/mps2-fpgaio.h"
+#include "hw/misc/ivshmem-flat.h"
 #include "hw/ssi/pl022.h"
 #include "hw/i2c/arm_sbcon_i2c.h"
 #include "hw/net/lan9118.h"
@@ -472,6 +473,8 @@ static void mps2_class_init(ObjectClass *oc, void *data)
 mc->max_cpus = 1;
 mc->default_ram_size = 16 * MiB;
 mc->default_ram_id = "mps.ram";
+
+machine_class_allow_dynamic_sysbus_dev(mc, TYPE_IVSHMEM_FLAT);
 }
 
 static void mps2_an385_class_init(ObjectClass *oc, void *data)
diff --git a/hw/arm/stellaris.c b/hw/arm/stellaris.c
index a2f998bf9e..e25858f232 100644
--- a/hw/arm/stellaris.c
+++ b/hw/arm/stellaris.c
@@ -28,6 +28,7 @@
 #include "hw/watchdog/cmsdk-apb-watchdog.h"
 #include "migration/vmstate.h"
 #include "hw/misc/unimp.h"
+#include "hw/misc/ivshmem-flat.h"
 #include "hw/timer/stellaris-gptm.h"
 #include "hw/qdev-clock.h"
 #include "qom/object.h"
@@ -1404,6 +1405,8 @@ static void lm3s6965evb_class_init(ObjectClass *oc, void 
*data)
 mc->init = lm3s6965evb_init;
 mc->ignore_memory_transaction_failures = true;
 mc->default_cpu_type = ARM_CPU_TYPE_NAME("cortex-m3");
+
+machine_class_allow_dynamic_sysbus_dev(mc, TYPE_IVSHMEM_FLAT);
 }
 
 static const TypeInfo lm3s6965evb_type = {
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 0af1943697..6c0917f3b2 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -84,6 +84,7 @@
 #include "hw/virtio/virtio-iommu.h"
 #include "hw/char/pl011.h"
 #include "qemu/guest-random.h"
+#include "hw/misc/ivshmem-flat.h"
 
 #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
 static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
@@ -2973,6 +2974,7 @@ static void virt_machine_class_init(ObjectClass *oc, void 
*data)
 machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE);
 machine_class_allow_dynamic_sysbus_dev(mc, TYPE_RAMFB_DEVICE);
 machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_PLATFORM);
+machine_class_allow_dynamic_sysbus_dev(mc, TYPE_IVSHMEM_FLAT);
 #ifdef CONFIG_TPM
 machine_class_allow_dynamic_sysbus_dev(mc, TYPE_TPM_TIS_SYSBUS);
 #endif
-- 
2.34.1




[PATCH V5 5/5] migration: simplify exec migration functions

2024-02-22 Thread Steve Sistare
Simplify the exec migration code by using list utility functions.

As a side effect, this also fixes a minor memory leak.  On function return,
"g_auto(GStrv) argv" frees argv and each element, which is wrong, because
the function does not own the individual elements.  To compensate, the code
uses g_steal_pointer which NULLs argv and prevents the destructor from
running, but argv is leaked.

Fixes: cbab4face57b ("migration: convert exec backend ...")
Signed-off-by: Steve Sistare 
Reviewed-by: Fabiano Rosas 
---
 migration/exec.c | 57 
 1 file changed, 8 insertions(+), 49 deletions(-)

diff --git a/migration/exec.c b/migration/exec.c
index 47d2f3b..1518409 100644
--- a/migration/exec.c
+++ b/migration/exec.c
@@ -19,6 +19,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/error-report.h"
+#include "qemu/strList.h"
 #include "channel.h"
 #include "exec.h"
 #include "migration.h"
@@ -39,51 +40,16 @@ const char *exec_get_cmd_path(void)
 }
 #endif
 
-/* provides the length of strList */
-static int
-str_list_length(strList *list)
-{
-int len = 0;
-strList *elem;
-
-for (elem = list; elem != NULL; elem = elem->next) {
-len++;
-}
-
-return len;
-}
-
-static void
-init_exec_array(strList *command, char **argv, Error **errp)
-{
-int i = 0;
-strList *lst;
-
-for (lst = command; lst; lst = lst->next) {
-argv[i++] = lst->value;
-}
-
-argv[i] = NULL;
-return;
-}
-
 void exec_start_outgoing_migration(MigrationState *s, strList *command,
Error **errp)
 {
-QIOChannel *ioc;
-
-int length = str_list_length(command);
-g_auto(GStrv) argv = (char **) g_new0(const char *, length + 1);
-
-init_exec_array(command, argv, errp);
+QIOChannel *ioc = NULL;
+g_auto(GStrv) argv = strv_from_strList(command);
+const char * const *args = (const char * const *) argv;
 g_autofree char *new_command = g_strjoinv(" ", (char **)argv);
 
 trace_migration_exec_outgoing(new_command);
-ioc = QIO_CHANNEL(
-qio_channel_command_new_spawn(
-(const char * const *) g_steal_pointer(),
-O_RDWR,
-errp));
+ioc = QIO_CHANNEL(qio_channel_command_new_spawn(args, O_RDWR, errp));
 if (!ioc) {
 return;
 }
@@ -105,19 +71,12 @@ static gboolean exec_accept_incoming_migration(QIOChannel 
*ioc,
 void exec_start_incoming_migration(strList *command, Error **errp)
 {
 QIOChannel *ioc;
-
-int length = str_list_length(command);
-g_auto(GStrv) argv = (char **) g_new0(const char *, length + 1);
-
-init_exec_array(command, argv, errp);
+g_auto(GStrv) argv = strv_from_strList(command);
+const char * const *args = (const char * const *) argv;
 g_autofree char *new_command = g_strjoinv(" ", (char **)argv);
 
 trace_migration_exec_incoming(new_command);
-ioc = QIO_CHANNEL(
-qio_channel_command_new_spawn(
-(const char * const *) g_steal_pointer(),
-O_RDWR,
-errp));
+ioc = QIO_CHANNEL(qio_channel_command_new_spawn(args, O_RDWR, errp));
 if (!ioc) {
 return;
 }
-- 
1.8.3.1




[PATCH V5 2/5] qapi: QAPI_LIST_LENGTH

2024-02-22 Thread Steve Sistare
Signed-off-by: Steve Sistare 
Reviewed-by: Marc-André Lureau 
---
 include/qapi/util.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/include/qapi/util.h b/include/qapi/util.h
index 81a2b13..20dfea8 100644
--- a/include/qapi/util.h
+++ b/include/qapi/util.h
@@ -56,4 +56,17 @@ int parse_qapi_name(const char *name, bool complete);
 (tail) = &(*(tail))->next; \
 } while (0)
 
+/*
+ * For any GenericList @list, return its length.
+ */
+#define QAPI_LIST_LENGTH(list)  \
+({  \
+size_t _len = 0;\
+typeof(list) _tail; \
+for (_tail = list; _tail != NULL; _tail = _tail->next) {\
+_len++; \
+}   \
+_len;   \
+})
+
 #endif
-- 
1.8.3.1




[PATCH V5 0/5] string list functions

2024-02-22 Thread Steve Sistare
Add some handy string list functions for general use, and use them in
live migration functions.  These will also be needed for cpr exec mode.

Changes in V5:
  * renamed some variables and one function, replaced GStrv with char **
  * aligned backslashes in QAPI_LIST_LENGTH
  * restored cutils.h to exec.c

Changes in V4:
  * added exec migration patch

Steve Sistare (5):
  util: str_split
  qapi: QAPI_LIST_LENGTH
  util: strv_from_strList
  util: strList unit tests
  migration: simplify exec migration functions

 include/monitor/hmp.h |  1 -
 include/qapi/util.h   | 13 
 include/qemu/strList.h| 30 ++
 migration/exec.c  | 57 +
 monitor/hmp-cmds.c| 19 ---
 net/net-hmp-cmds.c|  3 +-
 stats/stats-hmp-cmds.c|  3 +-
 tests/unit/meson.build|  1 +
 tests/unit/test-strList.c | 80 +++
 util/meson.build  |  1 +
 util/strList.c| 38 ++
 11 files changed, 175 insertions(+), 71 deletions(-)
 create mode 100644 include/qemu/strList.h
 create mode 100644 tests/unit/test-strList.c
 create mode 100644 util/strList.c

-- 
1.8.3.1




[PATCH V5 1/5] util: str_split

2024-02-22 Thread Steve Sistare
Generalize hmp_split_at_comma() to take any delimiter string, rename
as str_split(), and move it to util/strList.c.

No functional change.

Signed-off-by: Steve Sistare 
---
 include/monitor/hmp.h  |  1 -
 include/qemu/strList.h | 24 
 monitor/hmp-cmds.c | 19 ---
 net/net-hmp-cmds.c |  3 ++-
 stats/stats-hmp-cmds.c |  3 ++-
 util/meson.build   |  1 +
 util/strList.c | 24 
 7 files changed, 53 insertions(+), 22 deletions(-)
 create mode 100644 include/qemu/strList.h
 create mode 100644 util/strList.c

diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 13f9a2d..2df661e 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -19,7 +19,6 @@
 
 bool hmp_handle_error(Monitor *mon, Error *err);
 void hmp_help_cmd(Monitor *mon, const char *name);
-strList *hmp_split_at_comma(const char *str);
 
 void hmp_info_name(Monitor *mon, const QDict *qdict);
 void hmp_info_version(Monitor *mon, const QDict *qdict);
diff --git a/include/qemu/strList.h b/include/qemu/strList.h
new file mode 100644
index 000..0f26116
--- /dev/null
+++ b/include/qemu/strList.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2022 - 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_STR_LIST_H
+#define QEMU_STR_LIST_H
+
+#include "qapi/qapi-builtin-types.h"
+
+/*
+ * Split @str into a strList using the delimiter string @delim.
+ * The delimiter is not included in the result.
+ * Return NULL if @str is NULL or an empty string.
+ * A leading, trailing, or consecutive delimiter produces an
+ * empty string at that position in the output.
+ * All strings are g_strdup'd, and the result can be freed
+ * using qapi_free_strList.
+ */
+strList *str_split(const char *str, const char *delim);
+
+#endif
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 871898a..66b68a0 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -38,25 +38,6 @@ bool hmp_handle_error(Monitor *mon, Error *err)
 return false;
 }
 
-/*
- * Split @str at comma.
- * A null @str defaults to "".
- */
-strList *hmp_split_at_comma(const char *str)
-{
-char **split = g_strsplit(str ?: "", ",", -1);
-strList *res = NULL;
-strList **tail = 
-int i;
-
-for (i = 0; split[i]; i++) {
-QAPI_LIST_APPEND(tail, split[i]);
-}
-
-g_free(split);
-return res;
-}
-
 void hmp_info_name(Monitor *mon, const QDict *qdict)
 {
 NameInfo *info;
diff --git a/net/net-hmp-cmds.c b/net/net-hmp-cmds.c
index 41d326b..969cdd1 100644
--- a/net/net-hmp-cmds.c
+++ b/net/net-hmp-cmds.c
@@ -26,6 +26,7 @@
 #include "qemu/config-file.h"
 #include "qemu/help_option.h"
 #include "qemu/option.h"
+#include "qemu/strList.h"
 
 void hmp_info_network(Monitor *mon, const QDict *qdict)
 {
@@ -72,7 +73,7 @@ void hmp_announce_self(Monitor *mon, const QDict *qdict)
 migrate_announce_params());
 
 qapi_free_strList(params->interfaces);
-params->interfaces = hmp_split_at_comma(interfaces_str);
+params->interfaces = str_split(interfaces_str, ",");
 params->has_interfaces = params->interfaces != NULL;
 params->id = g_strdup(id);
 qmp_announce_self(params, NULL);
diff --git a/stats/stats-hmp-cmds.c b/stats/stats-hmp-cmds.c
index 1f91bf8..62db8c6 100644
--- a/stats/stats-hmp-cmds.c
+++ b/stats/stats-hmp-cmds.c
@@ -10,6 +10,7 @@
 #include "monitor/hmp.h"
 #include "monitor/monitor.h"
 #include "qemu/cutils.h"
+#include "qemu/strList.h"
 #include "hw/core/cpu.h"
 #include "qapi/qmp/qdict.h"
 #include "qapi/error.h"
@@ -176,7 +177,7 @@ static StatsFilter *stats_filter(StatsTarget target, const 
char *names,
 request->provider = provider_idx;
 if (names && !g_str_equal(names, "*")) {
 request->has_names = true;
-request->names = hmp_split_at_comma(names);
+request->names = str_split(names, ",");
 }
 QAPI_LIST_PREPEND(request_list, request);
 }
diff --git a/util/meson.build b/util/meson.build
index 0ef9886..bd125a4 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -1,4 +1,5 @@
 util_ss.add(files('osdep.c', 'cutils.c', 'unicode.c', 'qemu-timer-common.c'))
+util_ss.add(files('strList.c'))
 util_ss.add(files('thread-context.c'), numa)
 if not config_host_data.get('CONFIG_ATOMIC64')
   util_ss.add(files('atomic64.c'))
diff --git a/util/strList.c b/util/strList.c
new file mode 100644
index 000..7588c7c
--- /dev/null
+++ b/util/strList.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2023 Red Hat, Inc.
+ * Copyright (c) 2022 - 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/strList.h"
+
+strList *str_split(const char 

[PATCH V5 3/5] util: strv_from_strList

2024-02-22 Thread Steve Sistare
Signed-off-by: Steve Sistare 
Reviewed-by: Marc-André Lureau 
---
 include/qemu/strList.h |  6 ++
 util/strList.c | 14 ++
 2 files changed, 20 insertions(+)

diff --git a/include/qemu/strList.h b/include/qemu/strList.h
index 0f26116..c1eb1dd 100644
--- a/include/qemu/strList.h
+++ b/include/qemu/strList.h
@@ -21,4 +21,10 @@
  */
 strList *str_split(const char *str, const char *delim);
 
+/*
+ * Produce and return a NULL-terminated array of strings from @list.
+ * The result is g_malloc'd and all strings are g_strdup'd.
+ */
+char **strv_from_strList(const strList *list);
+
 #endif
diff --git a/util/strList.c b/util/strList.c
index 7588c7c..6da6762 100644
--- a/util/strList.c
+++ b/util/strList.c
@@ -22,3 +22,17 @@ strList *str_split(const char *str, const char *delim)
 
 return res;
 }
+
+char **strv_from_strList(const strList *list)
+{
+const strList *tail;
+int i = 0;
+char **argv = g_new(char *, QAPI_LIST_LENGTH(list) + 1);
+
+for (tail = list; tail != NULL; tail = tail->next) {
+argv[i++] = g_strdup(tail->value);
+}
+argv[i] = NULL;
+
+return argv;
+}
-- 
1.8.3.1




[PATCH V5 4/5] util: strList unit tests

2024-02-22 Thread Steve Sistare
Signed-off-by: Steve Sistare 
Reviewed-by: Marc-André Lureau 
---
 tests/unit/meson.build|  1 +
 tests/unit/test-strList.c | 80 +++
 2 files changed, 81 insertions(+)
 create mode 100644 tests/unit/test-strList.c

diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index cae925c..9984860 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -35,6 +35,7 @@ tests = {
   'test-rcu-simpleq': [],
   'test-rcu-tailq': [],
   'test-rcu-slist': [],
+  'test-strList': [],
   'test-qdist': [],
   'test-qht': [],
   'test-qtree': [],
diff --git a/tests/unit/test-strList.c b/tests/unit/test-strList.c
new file mode 100644
index 000..40af6b2
--- /dev/null
+++ b/tests/unit/test-strList.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022 - 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/strList.h"
+
+static strList *make_list(int length)
+{
+strList *head = 0, *list, **prev = 
+
+while (length--) {
+list = *prev = g_new0(strList, 1);
+list->value = g_strdup("aaa");
+prev = >next;
+}
+return head;
+}
+
+static void test_length(void)
+{
+strList *list;
+int i;
+
+for (i = 0; i < 5; i++) {
+list = make_list(i);
+g_assert_cmpint(i, ==, QAPI_LIST_LENGTH(list));
+qapi_free_strList(list);
+}
+}
+
+struct {
+const char *string;
+const char *delim;
+const char *argv[5];
+} list_data[] = {
+{ NULL, ",", { NULL } },
+{ "", ",", { NULL } },
+{ "a", ",", { "a", NULL } },
+{ "a,b", ",", { "a", "b", NULL } },
+{ "a,b,c", ",", { "a", "b", "c", NULL } },
+{ "first last", " ", { "first", "last", NULL } },
+{ "a:", ":", { "a", "", NULL } },
+{ "a::b", ":", { "a", "", "b", NULL } },
+{ ":", ":", { "", "", NULL } },
+{ ":a", ":", { "", "a", NULL } },
+{ "::a", ":", { "", "", "a", NULL } },
+};
+
+static void test_strv(void)
+{
+int i, j;
+const char **expect;
+strList *list;
+char **argv;
+
+for (i = 0; i < ARRAY_SIZE(list_data); i++) {
+expect = list_data[i].argv;
+list = str_split(list_data[i].string, list_data[i].delim);
+argv = strv_from_strList(list);
+qapi_free_strList(list);
+for (j = 0; expect[j] && argv[j]; j++) {
+g_assert_cmpstr(expect[j], ==, argv[j]);
+}
+g_assert_null(expect[j]);
+g_assert_null(argv[j]);
+g_strfreev(argv);
+}
+}
+
+int main(int argc, char **argv)
+{
+g_test_init(, , NULL);
+g_test_add_func("/test-string/length", test_length);
+g_test_add_func("/test-string/strv", test_strv);
+return g_test_run();
+}
-- 
1.8.3.1




Re: [RFC PATCH v2] arm/ptw: Handle atomic updates of page tables entries in MMIO during PTW.

2024-02-22 Thread Richard Henderson

On 2/19/24 06:12, Jonathan Cameron wrote:

I'm far from confident this handling here is correct. Hence
RFC.  In particular not sure on what locks I should hold for this
to be even moderately safe.

The function already appears to be inconsistent in what it returns
as the CONFIG_ATOMIC64 block returns the endian converted 'eventual'
value of the cmpxchg whereas the TCG_OVERSIZED_GUEST case returns
the previous value.

Signed-off-by: Jonathan Cameron 
---
v2: Thanks Peter for reviewing.
  - Handle the address space as in arm_ldq_ptw() - I should have looked
at the code immediately above :(
The result ends up a little more convoluted than I'd like. Could factor
this block of code out perhaps. I'm also not sure on the fault type
that is appropriate here.
  - Switch to 'need_lock' as per Philippe's feedback on the x86 fixes.
likely() doesn't seem appropriate here though.
  
target/arm/ptw.c | 64 ++--

  1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 5eb3577bcd..ba1a27ca2b 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -711,8 +711,68 @@ static uint64_t arm_casq_ptw(CPUARMState *env, uint64_t 
old_val,
  void *host = ptw->out_host;
  
  if (unlikely(!host)) {

-fi->type = ARMFault_UnsuppAtomicUpdate;
-return 0;
+/* Page table in MMIO Memory Region */
+CPUState *cs = env_cpu(env);
+MemTxAttrs attrs = {
+.space = ptw->out_space,
+.secure = arm_space_is_secure(ptw->out_space),
+};
+AddressSpace *as = arm_addressspace(cs, attrs);
+MemTxResult result = MEMTX_OK;
+bool need_lock = !bql_locked();
+
+if (need_lock) {
+bql_lock();
+}
+if (ptw->out_be) {
+cur_val = address_space_ldq_be(as, ptw->out_phys, attrs, );
+if (unlikely(result != MEMTX_OK)) {
+fi->type = ARMFault_SyncExternalOnWalk;
+fi->ea = arm_extabort_type(result);
+if (need_lock) {
+bql_unlock();
+}
+return old_val;
+}


Use BQL_LOCK_GUARD() and avoid all of the repeated unlocks at each return point.

You can merge all of the error paths, e.g.

    cur_val = (ptw->out_be
               ? address_space_ldq_be(as, ptw->out_phys, attrs, &result)
               : address_space_ldq_le(as, ptw->out_phys, attrs, &result));
    if (result == MEMTX_OK && cur_val == old_val) {
        if (ptw->out_be) {
            address_space_stq_be(as, ptw->out_phys, new_val, attrs, &result);
        } else {
            address_space_stq_le(as, ptw->out_phys, new_val, attrs, &result);
        }
    }
    if (unlikely(result != MEMTX_OK)) {
        fi->type = ...
        return old_val;
    }
    return cur_val;



r~



Re: [PATCH] system/physmem: Fix migration dirty bitmap coherency with TCG memory access

2024-02-22 Thread Thomas Huth

On 20/02/2024 02.13, Nicholas Piggin wrote:

On Tue Feb 20, 2024 at 12:10 AM AEST, Thomas Huth wrote:

On 19/02/2024 07.17, Nicholas Piggin wrote:

The fastpath in cpu_physical_memory_sync_dirty_bitmap() to test large
aligned ranges forgot to bring the TCG TLB up to date after clearing
some of the dirty memory bitmap bits. This can result in stores through
the TCG TLB not setting the dirty memory bitmap and ultimately causes
memory corruption / lost updates during migration from a TCG host.

Fix this by exporting an abstracted function to call when dirty bits
have been cleared.

Fixes: aa8dc044772 ("migration: synchronize memory bitmap 64bits at a time")
Signed-off-by: Nicholas Piggin 
---


Sounds promising! ... but it doesn't seem to fix the migration-test qtest
with s390x when it gets enabled again:


Did it fix kvm-unit-tests for you?


It does, indeed! With your QEMU patch here, your new selftest-migration test
in k-u-t is now working reliably with TCG. Thus feel free to add:


Tested-by: Thomas Huth 


diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -3385,15 +3385,6 @@ int main(int argc, char **argv)
   return g_test_run();
   }

-/*
- * Similar to ppc64, s390x seems to be touchy with TCG, so disable it
- * there until the problems are resolved
- */
-if (g_str_equal(arch, "s390x") && !has_kvm) {
-g_test_message("Skipping test: s390x host with KVM is required");
-return g_test_run();
-}
-
   tmpfs = g_dir_make_tmp("migration-test-XX", );
   if (!tmpfs) {
   g_test_message("Can't create temporary directory in %s: %s",

I wonder whether there is more stuff like this necessary somewhere?


Possibly. That's what the commit logs for the TCG disable indicate. I
have found another dirty bitmap TCG race too. I'll send it out after
some more testing.


Did you try to re-enable tests/qtest/migration-test.c for ppc64 with TCG to
see whether that works fine now?


Hmm, I did try and so far ppc64 is not failing even with upstream QEMU.


Oh, indeed! Actually, now that you mentioned it, I remembered that I checked 
it a couple of weeks ago already:


https://lore.kernel.org/qemu-devel/7d4f5624-83d2-4330-9315-b23869529...@redhat.com/


I'll try with s390x. Any additional build or runtime options to make it
break? How long does it take for breakage to be evident?


For me, it normally breaks after running the migration test just a few
times, let's say one time out of ten runs?


 Thomas




[PULL 33/39] accel/tcg: Disconnect TargetPageDataNode from page size

2024-02-22 Thread Richard Henderson
Dynamically size the node for the runtime target page size.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-29-richard.hender...@linaro.org>
---
 accel/tcg/user-exec.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/accel/tcg/user-exec.c b/accel/tcg/user-exec.c
index 69b7429e31..3cac3a78c4 100644
--- a/accel/tcg/user-exec.c
+++ b/accel/tcg/user-exec.c
@@ -864,7 +864,7 @@ tb_page_addr_t get_page_addr_code_hostp(CPUArchState *env, 
vaddr addr,
 typedef struct TargetPageDataNode {
 struct rcu_head rcu;
 IntervalTreeNode itree;
-char data[TPD_PAGES][TARGET_PAGE_DATA_SIZE] __attribute__((aligned));
+char data[] __attribute__((aligned));
 } TargetPageDataNode;
 
 static IntervalTreeRoot targetdata_root;
@@ -902,7 +902,8 @@ void page_reset_target_data(target_ulong start, 
target_ulong last)
 n_last = MIN(last, n->last);
 p_len = (n_last + 1 - n_start) >> TARGET_PAGE_BITS;
 
-memset(t->data[p_ofs], 0, p_len * TARGET_PAGE_DATA_SIZE);
+memset(t->data + p_ofs * TARGET_PAGE_DATA_SIZE, 0,
+   p_len * TARGET_PAGE_DATA_SIZE);
 }
 }
 
@@ -910,7 +911,7 @@ void *page_get_target_data(target_ulong address)
 {
 IntervalTreeNode *n;
 TargetPageDataNode *t;
-target_ulong page, region;
+target_ulong page, region, p_ofs;
 
 page = address & TARGET_PAGE_MASK;
 region = address & TBD_MASK;
@@ -926,7 +927,8 @@ void *page_get_target_data(target_ulong address)
 mmap_lock();
 n = interval_tree_iter_first(_root, page, page);
 if (!n) {
-t = g_new0(TargetPageDataNode, 1);
+t = g_malloc0(sizeof(TargetPageDataNode)
+  + TPD_PAGES * TARGET_PAGE_DATA_SIZE);
 n = >itree;
 n->start = region;
 n->last = region | ~TBD_MASK;
@@ -936,7 +938,8 @@ void *page_get_target_data(target_ulong address)
 }
 
 t = container_of(n, TargetPageDataNode, itree);
-return t->data[(page - region) >> TARGET_PAGE_BITS];
+p_ofs = (page - region) >> TARGET_PAGE_BITS;
+return t->data + p_ofs * TARGET_PAGE_DATA_SIZE;
 }
 #else
 void page_reset_target_data(target_ulong start, target_ulong last) { }
-- 
2.34.1




[PULL 34/39] linux-user: Allow TARGET_PAGE_BITS_VARY

2024-02-22 Thread Richard Henderson
If set, match the host and guest page sizes.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-30-richard.hender...@linaro.org>
---
 linux-user/main.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index bad03f06d3..12bb839982 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -55,6 +55,7 @@
 #include "loader.h"
 #include "user-mmap.h"
 #include "tcg/perf.h"
+#include "exec/page-vary.h"
 
 #ifdef CONFIG_SEMIHOSTING
 #include "semihosting/semihost.h"
@@ -680,6 +681,7 @@ int main(int argc, char **argv, char **envp)
 int i;
 int ret;
 int execfd;
+int host_page_size;
 unsigned long max_reserved_va;
 bool preserve_argv0;
 
@@ -791,6 +793,16 @@ int main(int argc, char **argv, char **envp)
  opt_one_insn_per_tb, _abort);
 ac->init_machine(NULL);
 }
+
+/*
+ * Finalize page size before creating CPUs.
+ * This will do nothing if !TARGET_PAGE_BITS_VARY.
+ * The most efficient setting is to match the host.
+ */
+host_page_size = qemu_real_host_page_size();
+set_preferred_target_page_bits(ctz32(host_page_size));
+finalize_target_page_bits();
+
 cpu = cpu_create(cpu_type);
 env = cpu_env(cpu);
 cpu_reset(cpu);
@@ -804,8 +816,6 @@ int main(int argc, char **argv, char **envp)
  */
 max_reserved_va = MAX_RESERVED_VA(cpu);
 if (reserved_va != 0) {
-int host_page_size = qemu_real_host_page_size();
-
 if ((reserved_va + 1) % host_page_size) {
 char *s = size_to_str(host_page_size);
 fprintf(stderr, "Reserved virtual address not aligned mod %s\n", 
s);
@@ -904,7 +914,7 @@ int main(int argc, char **argv, char **envp)
  * If we're in a chroot with no /proc, fall back to 1 page.
  */
 if (mmap_min_addr == 0) {
-mmap_min_addr = qemu_real_host_page_size();
+mmap_min_addr = host_page_size;
 qemu_log_mask(CPU_LOG_PAGE,
   "host mmap_min_addr=0x%lx (fallback)\n",
   mmap_min_addr);
-- 
2.34.1




[PULL 30/39] tests/tcg: Extend file in linux-madvise.c

2024-02-22 Thread Richard Henderson
When guest page size > host page size, this test can fail
due to the SIGBUS protection hack.  Avoid this by making
sure that the file size is at least one guest page.

Visible with alpha guest on x86_64 host.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-26-richard.hender...@linaro.org>
---
 tests/tcg/multiarch/linux/linux-madvise.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/tcg/multiarch/linux/linux-madvise.c 
b/tests/tcg/multiarch/linux/linux-madvise.c
index 29d0997e68..539fb3b772 100644
--- a/tests/tcg/multiarch/linux/linux-madvise.c
+++ b/tests/tcg/multiarch/linux/linux-madvise.c
@@ -42,6 +42,8 @@ static void test_file(void)
 assert(ret == 0);
 written = write(fd, , sizeof(c));
 assert(written == sizeof(c));
+ret = ftruncate(fd, pagesize);
+assert(ret == 0);
 page = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE, fd, 0);
 assert(page != MAP_FAILED);
 
-- 
2.34.1




[PULL 31/39] *-user: Deprecate and disable -p pagesize

2024-02-22 Thread Richard Henderson
This option controls the host page size.  From the mis-usage in
our own testsuite, this is easily confused with guest page size.

The only thing that occurs when changing the host page size is
that stuff breaks, because one cannot actually change the host
page size.  Therefore reject all but the no-op setting as part
of the deprecation process.

Reviewed-by: Warner Losh 
Signed-off-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-27-richard.hender...@linaro.org>
---
 docs/about/deprecated.rst | 10 ++
 docs/user/main.rst|  3 ---
 bsd-user/main.c   | 10 +-
 linux-user/main.c | 12 ++--
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 5a2305ccd6..3074303b9c 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -63,6 +63,16 @@ as short-form boolean values, and passed to plugins as 
``arg_name=on``.
 However, short-form booleans are deprecated and full explicit ``arg_name=on``
 form is preferred.
 
+User-mode emulator command line arguments
+-
+
+``-p`` (since 9.0)
+''
+
+The ``-p`` option pretends to control the host page size.  However,
+it is not possible to change the host page size, and using the
+option only causes failures.
+
 QEMU Machine Protocol (QMP) commands
 
 
diff --git a/docs/user/main.rst b/docs/user/main.rst
index 7e7ad07409..d5fbb78d3c 100644
--- a/docs/user/main.rst
+++ b/docs/user/main.rst
@@ -87,9 +87,6 @@ Debug options:
Activate logging of the specified items (use '-d help' for a list of
log items)
 
-``-p pagesize``
-   Act as if the host page size was 'pagesize' bytes
-
 ``-g port``
Wait gdb connection to port
 
diff --git a/bsd-user/main.c b/bsd-user/main.c
index e5efb7b845..521b58b880 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -364,11 +364,11 @@ int main(int argc, char **argv)
 } else if (!strcmp(r, "L")) {
 interp_prefix = argv[optind++];
 } else if (!strcmp(r, "p")) {
-qemu_host_page_size = atoi(argv[optind++]);
-if (qemu_host_page_size == 0 ||
-(qemu_host_page_size & (qemu_host_page_size - 1)) != 0) {
-fprintf(stderr, "page size must be a power of two\n");
-exit(1);
+unsigned size, want = qemu_real_host_page_size();
+
+if (qemu_strtoui(arg, NULL, 10, ) || size != want) {
+warn_report("Deprecated page size option cannot "
+"change host page size (%u)", want);
 }
 } else if (!strcmp(r, "g")) {
 gdbstub = g_strdup(argv[optind++]);
diff --git a/linux-user/main.c b/linux-user/main.c
index e540acb84a..bad03f06d3 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -332,11 +332,11 @@ static void handle_arg_ld_prefix(const char *arg)
 
 static void handle_arg_pagesize(const char *arg)
 {
-qemu_host_page_size = atoi(arg);
-if (qemu_host_page_size == 0 ||
-(qemu_host_page_size & (qemu_host_page_size - 1)) != 0) {
-fprintf(stderr, "page size must be a power of two\n");
-exit(EXIT_FAILURE);
+unsigned size, want = qemu_real_host_page_size();
+
+if (qemu_strtoui(arg, NULL, 10, ) || size != want) {
+warn_report("Deprecated page size option cannot "
+"change host page size (%u)", want);
 }
 }
 
@@ -496,7 +496,7 @@ static const struct qemu_argument arg_table[] = {
 {"D",  "QEMU_LOG_FILENAME", true, handle_arg_log_filename,
  "logfile", "write logs to 'logfile' (default stderr)"},
 {"p",  "QEMU_PAGESIZE",true,  handle_arg_pagesize,
- "pagesize",   "set the host page size to 'pagesize'"},
+ "pagesize",   "deprecated change to host page size"},
 {"one-insn-per-tb",
"QEMU_ONE_INSN_PER_TB",  false, handle_arg_one_insn_per_tb,
  "",   "run with one guest instruction per emulated TB"},
-- 
2.34.1




[PULL 38/39] target/alpha: Enable TARGET_PAGE_BITS_VARY for user-only

2024-02-22 Thread Richard Henderson
Since alpha binaries are generally built for multiple
page sizes, it is trivial to allow the page size to vary.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-34-richard.hender...@linaro.org>
---
 target/alpha/cpu-param.h | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/target/alpha/cpu-param.h b/target/alpha/cpu-param.h
index 68c46f7998..c969cb016b 100644
--- a/target/alpha/cpu-param.h
+++ b/target/alpha/cpu-param.h
@@ -9,10 +9,22 @@
 #define ALPHA_CPU_PARAM_H
 
 #define TARGET_LONG_BITS 64
-#define TARGET_PAGE_BITS 13
 
 /* ??? EV4 has 34 phys addr bits, EV5 has 40, EV6 has 44.  */
 #define TARGET_PHYS_ADDR_SPACE_BITS  44
-#define TARGET_VIRT_ADDR_SPACE_BITS  (30 + TARGET_PAGE_BITS)
+
+#ifdef CONFIG_USER_ONLY
+/*
+ * Allow user-only to vary page size.  Real hardware allows only 8k and 64k,
+ * but since any variance means guests cannot assume a fixed value, allow
+ * a 4k minimum to match x86 host, which can minimize emulation issues.
+ */
+# define TARGET_PAGE_BITS_VARY
+# define TARGET_PAGE_BITS_MIN 12
+# define TARGET_VIRT_ADDR_SPACE_BITS  63
+#else
+# define TARGET_PAGE_BITS 13
+# define TARGET_VIRT_ADDR_SPACE_BITS  (30 + TARGET_PAGE_BITS)
+#endif
 
 #endif
-- 
2.34.1




[PULL 15/39] hw/tpm: Remove HOST_PAGE_ALIGN from tpm_ppi_init

2024-02-22 Thread Richard Henderson
This removes a hidden use of qemu_host_page_size, hoisting
two uses of qemu_real_host_page_size to a local variable.

Signed-off-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
---
 hw/tpm/tpm_ppi.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hw/tpm/tpm_ppi.c b/hw/tpm/tpm_ppi.c
index 7f74e26ec6..f27ed6c35e 100644
--- a/hw/tpm/tpm_ppi.c
+++ b/hw/tpm/tpm_ppi.c
@@ -47,8 +47,10 @@ void tpm_ppi_reset(TPMPPI *tpmppi)
 void tpm_ppi_init(TPMPPI *tpmppi, MemoryRegion *m,
   hwaddr addr, Object *obj)
 {
-tpmppi->buf = qemu_memalign(qemu_real_host_page_size(),
-HOST_PAGE_ALIGN(TPM_PPI_ADDR_SIZE));
+size_t host_page_size = qemu_real_host_page_size();
+
+tpmppi->buf = qemu_memalign(host_page_size,
+ROUND_UP(TPM_PPI_ADDR_SIZE, host_page_size));
 memory_region_init_ram_device_ptr(>ram, obj, "tpm-ppi",
   TPM_PPI_ADDR_SIZE, tpmppi->buf);
 vmstate_register_ram(>ram, DEVICE(obj));
-- 
2.34.1




[PULL 29/39] tests/tcg: Remove run-test-mmap-*

2024-02-22 Thread Richard Henderson
These tests are confused, because -p does not change
the guest page size, but the host page size.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-25-richard.hender...@linaro.org>
---
 tests/tcg/alpha/Makefile.target |  3 ---
 tests/tcg/arm/Makefile.target   |  3 ---
 tests/tcg/hppa/Makefile.target  |  3 ---
 tests/tcg/i386/Makefile.target  |  3 ---
 tests/tcg/m68k/Makefile.target  |  3 ---
 tests/tcg/multiarch/Makefile.target |  9 -
 tests/tcg/ppc/Makefile.target   | 12 
 tests/tcg/sh4/Makefile.target   |  3 ---
 tests/tcg/sparc64/Makefile.target   |  6 --
 9 files changed, 45 deletions(-)
 delete mode 100644 tests/tcg/ppc/Makefile.target
 delete mode 100644 tests/tcg/sparc64/Makefile.target

diff --git a/tests/tcg/alpha/Makefile.target b/tests/tcg/alpha/Makefile.target
index b94500a7d9..fdd7ddf64e 100644
--- a/tests/tcg/alpha/Makefile.target
+++ b/tests/tcg/alpha/Makefile.target
@@ -13,6 +13,3 @@ test-cmov: test-cond.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
 
 run-test-cmov: test-cmov
-
-# On Alpha Linux only supports 8k pages
-EXTRA_RUNS+=run-test-mmap-8192
diff --git a/tests/tcg/arm/Makefile.target b/tests/tcg/arm/Makefile.target
index 3473f4619e..0a1965fce7 100644
--- a/tests/tcg/arm/Makefile.target
+++ b/tests/tcg/arm/Makefile.target
@@ -79,6 +79,3 @@ sha512-vector: sha512.c
 ARM_TESTS += sha512-vector
 
 TESTS += $(ARM_TESTS)
-
-# On ARM Linux only supports 4k pages
-EXTRA_RUNS+=run-test-mmap-4096
diff --git a/tests/tcg/hppa/Makefile.target b/tests/tcg/hppa/Makefile.target
index cdd0d572a7..ea5ae2186d 100644
--- a/tests/tcg/hppa/Makefile.target
+++ b/tests/tcg/hppa/Makefile.target
@@ -2,9 +2,6 @@
 #
 # HPPA specific tweaks - specifically masking out broken tests
 
-# On parisc Linux supports 4K/16K/64K (but currently only 4k works)
-EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-16384 run-test-mmap-65536
-
 # This triggers failures for hppa-linux about 1% of the time
 # HPPA is the odd target that can't use the sigtramp page;
 # it requires the full vdso with dwarf2 unwind info.
diff --git a/tests/tcg/i386/Makefile.target b/tests/tcg/i386/Makefile.target
index 9906f9e116..bbe2c44b2a 100644
--- a/tests/tcg/i386/Makefile.target
+++ b/tests/tcg/i386/Makefile.target
@@ -71,9 +71,6 @@ endif
 I386_TESTS:=$(filter-out $(SKIP_I386_TESTS), $(ALL_X86_TESTS))
 TESTS=$(MULTIARCH_TESTS) $(I386_TESTS)
 
-# On i386 and x86_64 Linux only supports 4k pages (large pages are a different 
hack)
-EXTRA_RUNS+=run-test-mmap-4096
-
 sha512-sse: CFLAGS=-msse4.1 -O3
 sha512-sse: sha512.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
diff --git a/tests/tcg/m68k/Makefile.target b/tests/tcg/m68k/Makefile.target
index 6ff214e60a..33f7b1b127 100644
--- a/tests/tcg/m68k/Makefile.target
+++ b/tests/tcg/m68k/Makefile.target
@@ -5,6 +5,3 @@
 
 VPATH += $(SRC_PATH)/tests/tcg/m68k
 TESTS += trap denormal
-
-# On m68k Linux supports 4k and 8k pages (but 8k is currently broken)
-EXTRA_RUNS+=run-test-mmap-4096 # run-test-mmap-8192
diff --git a/tests/tcg/multiarch/Makefile.target 
b/tests/tcg/multiarch/Makefile.target
index e10951a801..f11f3b084d 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -51,18 +51,9 @@ run-plugin-vma-pthread-with-%: vma-pthread
$(call skip-test, $<, "flaky on CI?")
 endif
 
-# We define the runner for test-mmap after the individual
-# architectures have defined their supported pages sizes. If no
-# additional page sizes are defined we only run the default test.
-
-# default case (host page size)
 run-test-mmap: test-mmap
$(call run-test, test-mmap, $(QEMU) $<, $< (default))
 
-# additional page sizes (defined by each architecture adding to EXTRA_RUNS)
-run-test-mmap-%: test-mmap
-   $(call run-test, test-mmap-$*, $(QEMU) -p $* $<, $< ($* byte pages))
-
 ifneq ($(GDB),)
 GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py
 
diff --git a/tests/tcg/ppc/Makefile.target b/tests/tcg/ppc/Makefile.target
deleted file mode 100644
index f5e08c7376..00
--- a/tests/tcg/ppc/Makefile.target
+++ /dev/null
@@ -1,12 +0,0 @@
-# -*- Mode: makefile -*-
-#
-# PPC - included from tests/tcg/Makefile
-#
-
-ifneq (,$(findstring 64,$(TARGET_NAME)))
-# On PPC64 Linux can be configured with 4k (default) or 64k pages (currently 
broken)
-EXTRA_RUNS+=run-test-mmap-4096 #run-test-mmap-65536
-else
-# On PPC32 Linux supports 4K/16K/64K/256K (but currently only 4k works)
-EXTRA_RUNS+=run-test-mmap-4096 #run-test-mmap-16384 run-test-mmap-65536 
run-test-mmap-262144
-endif
diff --git a/tests/tcg/sh4/Makefile.target b/tests/tcg/sh4/Makefile.target
index 47c39a44b6..16eaa850a8 100644
--- a/tests/tcg/sh4/Makefile.target
+++ b/tests/tcg/sh4/Makefile.target
@@ -3,9 +3,6 @@
 # SuperH specific tweaks
 #
 
-# On sh Linux supports 4k, 8k, 16k and 64k pages (but only 4k currently works)

[PULL 25/39] linux-user: Use do_munmap for target_mmap failure

2024-02-22 Thread Richard Henderson
For the cases for which the host mmap succeeds, but does
not yield the desired address, use do_munmap to restore
the reserved_va memory reservation.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/mmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 8ebcca..cbcd31e941 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -326,7 +326,7 @@ static bool mmap_frag(abi_ulong real_start, abi_ulong 
start, abi_ulong last,
flags | MAP_ANONYMOUS, -1, 0);
 if (p != host_start) {
 if (p != MAP_FAILED) {
-munmap(p, host_page_size);
+do_munmap(p, host_page_size);
 errno = EEXIST;
 }
 return false;
@@ -622,7 +622,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 p = mmap(g2h_untagged(start), len, host_prot,
  flags | MAP_FIXED, fd, host_offset);
 if (p == MAP_FAILED) {
-munmap(g2h_untagged(start), host_len);
+do_munmap(g2h_untagged(start), host_len);
 return -1;
 }
 host_start += offset - host_offset;
@@ -735,7 +735,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
  flags, fd, offset1);
 if (p != want_p) {
 if (p != MAP_FAILED) {
-munmap(p, len1);
+do_munmap(p, len1);
 errno = EEXIST;
 }
 return -1;
-- 
2.34.1




[PULL 09/39] linux-user/nios2: Remove qemu_host_page_size from init_guest_commpage

2024-02-22 Thread Richard Henderson
Use qemu_real_host_page_size.
If !reserved_va, use MAP_FIXED_NOREPLACE.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-7-richard.hender...@linaro.org>
---
 linux-user/elfload.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 1893b3c192..a9f1077861 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1532,10 +1532,14 @@ static bool init_guest_commpage(void)
  0x3a, 0x68, 0x3b, 0x00,  /* trap 0 */
 };
 
-void *want = g2h_untagged(LO_COMMPAGE & -qemu_host_page_size);
-void *addr = mmap(want, qemu_host_page_size, PROT_READ | PROT_WRITE,
-  MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0);
+int host_page_size = qemu_real_host_page_size();
+void *want, *addr;
 
+want = g2h_untagged(LO_COMMPAGE & -host_page_size);
+addr = mmap(want, host_page_size, PROT_READ | PROT_WRITE,
+MAP_ANONYMOUS | MAP_PRIVATE |
+(reserved_va ? MAP_FIXED : MAP_FIXED_NOREPLACE),
+-1, 0);
 if (addr == MAP_FAILED) {
 perror("Allocating guest commpage");
 exit(EXIT_FAILURE);
@@ -1544,9 +1548,9 @@ static bool init_guest_commpage(void)
 return false;
 }
 
-memcpy(addr, kuser_page, sizeof(kuser_page));
+memcpy(g2h_untagged(LO_COMMPAGE), kuser_page, sizeof(kuser_page));
 
-if (mprotect(addr, qemu_host_page_size, PROT_READ)) {
+if (mprotect(addr, host_page_size, PROT_READ)) {
 perror("Protecting guest commpage");
 exit(EXIT_FAILURE);
 }
-- 
2.34.1




[PULL 03/39] tcg: Avoid double lock if page tables happen to be in mmio memory.

2024-02-22 Thread Richard Henderson
From: Jonathan Cameron 

On i386, after fixing the page walking code to work with pages in
MMIO memory (specifically CXL emulated interleaved memory),
a crash was seen in an interrupt handling path.

Useful part of backtrace

7  0x55ab1929 in bql_lock_impl (file=0x56049122 
"../../accel/tcg/cputlb.c", line=2033) at ../../system/cpus.c:524
8  bql_lock_impl (file=file@entry=0x56049122 "../../accel/tcg/cputlb.c", 
line=line@entry=2033) at ../../system/cpus.c:520
9  0x55c9f7d6 in do_ld_mmio_beN (cpu=0x578e0cb0, 
full=0x7ffe88012950, ret_be=ret_be@entry=0, addr=19595792376, 
size=size@entry=8, mmu_idx=4, type=MMU_DATA_LOAD, ra=0) at 
../../accel/tcg/cputlb.c:2033
10 0x55ca0fbd in do_ld_8 (cpu=cpu@entry=0x578e0cb0, 
p=p@entry=0x74efd1d0, mmu_idx=, 
type=type@entry=MMU_DATA_LOAD, memop=, ra=ra@entry=0) at 
../../accel/tcg/cputlb.c:2356
11 0x55ca341f in do_ld8_mmu (cpu=cpu@entry=0x578e0cb0, 
addr=addr@entry=19595792376, oi=oi@entry=52, ra=0, ra@entry=52, 
access_type=access_type@entry=MMU_DATA_LOAD) at ../../accel/tcg/cputlb.c:2439
12 0x55ca5f59 in cpu_ldq_mmu (ra=52, oi=52, addr=19595792376, 
env=0x578e3470) at ../../accel/tcg/ldst_common.c.inc:169
13 cpu_ldq_le_mmuidx_ra (env=0x578e3470, addr=19595792376, 
mmu_idx=, ra=ra@entry=0) at ../../accel/tcg/ldst_common.c.inc:301
14 0x55b4b5fc in ptw_ldq (ra=0, in=0x74efd320) at 
../../target/i386/tcg/sysemu/excp_helper.c:98
15 ptw_ldq (ra=0, in=0x74efd320) at 
../../target/i386/tcg/sysemu/excp_helper.c:93
16 mmu_translate (env=env@entry=0x578e3470, in=0x74efd3e0, 
out=0x74efd3b0, err=err@entry=0x74efd3c0, ra=ra@entry=0) at 
../../target/i386/tcg/sysemu/excp_helper.c:174
17 0x55b4c4b3 in get_physical_address (ra=0, err=0x74efd3c0, 
out=0x74efd3b0, mmu_idx=0, access_type=MMU_DATA_LOAD, 
addr=18446741874686299840, env=0x578e3470) at 
../../target/i386/tcg/sysemu/excp_helper.c:580
18 x86_cpu_tlb_fill (cs=0x578e0cb0, addr=18446741874686299840, 
size=, access_type=MMU_DATA_LOAD, mmu_idx=0, probe=, retaddr=0) at ../../target/i386/tcg/sysemu/excp_helper.c:606
19 0x55ca0ee9 in tlb_fill (retaddr=0, mmu_idx=0, 
access_type=MMU_DATA_LOAD, size=, addr=18446741874686299840, 
cpu=0x74efd540) at ../../accel/tcg/cputlb.c:1315
20 mmu_lookup1 (cpu=cpu@entry=0x578e0cb0, data=data@entry=0x74efd540, 
mmu_idx=0, access_type=access_type@entry=MMU_DATA_LOAD, ra=ra@entry=0) at 
../../accel/tcg/cputlb.c:1713
21 0x55ca2c61 in mmu_lookup (cpu=cpu@entry=0x578e0cb0, 
addr=addr@entry=18446741874686299840, oi=oi@entry=32, ra=ra@entry=0, 
type=type@entry=MMU_DATA_LOAD, l=l@entry=0x74efd540) at 
../../accel/tcg/cputlb.c:1803
22 0x55ca3165 in do_ld4_mmu (cpu=cpu@entry=0x578e0cb0, 
addr=addr@entry=18446741874686299840, oi=oi@entry=32, ra=ra@entry=0, 
access_type=access_type@entry=MMU_DATA_LOAD) at ../../accel/tcg/cputlb.c:2416
23 0x55ca5ef9 in cpu_ldl_mmu (ra=0, oi=32, addr=18446741874686299840, 
env=0x578e3470) at ../../accel/tcg/ldst_common.c.inc:158
24 cpu_ldl_le_mmuidx_ra (env=env@entry=0x578e3470, 
addr=addr@entry=18446741874686299840, mmu_idx=, ra=ra@entry=0) 
at ../../accel/tcg/ldst_common.c.inc:294
25 0x55bb6cdd in do_interrupt64 (is_hw=1, 
next_eip=18446744072399775809, error_code=0, is_int=0, intno=236, 
env=0x578e3470) at ../../target/i386/tcg/seg_helper.c:889
26 do_interrupt_all (cpu=cpu@entry=0x578e0cb0, intno=236, 
is_int=is_int@entry=0, error_code=error_code@entry=0, 
next_eip=next_eip@entry=0, is_hw=is_hw@entry=1) at 
../../target/i386/tcg/seg_helper.c:1130
27 0x55bb87da in do_interrupt_x86_hardirq 
(env=env@entry=0x578e3470, intno=, is_hw=is_hw@entry=1) at 
../../target/i386/tcg/seg_helper.c:1162
28 0x55b5039c in x86_cpu_exec_interrupt (cs=0x578e0cb0, 
interrupt_request=) at 
../../target/i386/tcg/sysemu/seg_helper.c:197
29 0x55c94480 in cpu_handle_interrupt (last_tb=, 
cpu=0x578e0cb0) at ../../accel/tcg/cpu-exec.c:844

Peter identified this as being due to the BQL already being
held when the page table walker encounters MMIO memory and attempts
to take the lock again.  There are other examples of similar paths in
TCG, so this follows the approach taken in those: simply check whether
the lock is already held and, if it is, don't take it again.

Reviewed-by: Peter Maydell 
Suggested-by: Peter Maydell 
Signed-off-by: Jonathan Cameron 
Message-Id: <20240219173153.12114-4-jonathan.came...@huawei.com>
[rth: Use BQL_LOCK_GUARD]
Signed-off-by: Richard Henderson 
---
 accel/tcg/cputlb.c | 34 ++
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 047cd2cc0a..6243bcb179 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -2022,7 +2022,6 @@ static uint64_t do_ld_mmio_beN(CPUState *cpu, 
CPUTLBEntryFull *full,
 MemoryRegion *mr;
 hwaddr mr_offset;
 

[PULL 13/39] linux-user: Remove HOST_PAGE_ALIGN from mmap.c

2024-02-22 Thread Richard Henderson
This removes a hidden use of qemu_host_page_size, using instead
the existing host_page_size local within each function.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-11-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 53e5486cc8..d11f758d07 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -191,7 +191,7 @@ int target_mprotect(abi_ulong start, abi_ulong len, int 
target_prot)
 
 last = start + len - 1;
 host_start = start & -host_page_size;
-host_last = HOST_PAGE_ALIGN(last) - 1;
+host_last = ROUND_UP(last, host_page_size) - 1;
 nranges = 0;
 
 mmap_lock();
@@ -389,8 +389,7 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, 
abi_ulong align)
 start &= -host_page_size;
 }
 start = ROUND_UP(start, align);
-
-size = HOST_PAGE_ALIGN(size);
+size = ROUND_UP(size, host_page_size);
 
 if (reserved_va) {
 return mmap_find_vma_reserved(start, size, align);
@@ -550,7 +549,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
  */
 if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
 host_len = len + offset - host_offset;
-host_len = HOST_PAGE_ALIGN(host_len);
+host_len = ROUND_UP(host_len, host_page_size);
 start = mmap_find_vma(real_start, host_len, TARGET_PAGE_SIZE);
 if (start == (abi_ulong)-1) {
 errno = ENOMEM;
@@ -595,7 +594,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 void *p;
 
 host_len = len + offset - host_offset;
-host_len = HOST_PAGE_ALIGN(host_len);
+host_len = ROUND_UP(host_len, host_page_size);
 host_prot = target_to_host_prot(target_prot);
 
 /* Note: we prefer to control the mapping address. */
@@ -625,7 +624,7 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
 goto fail;
 }
 last = start + len - 1;
-real_last = HOST_PAGE_ALIGN(last) - 1;
+real_last = ROUND_UP(last, host_page_size) - 1;
 
 /*
  * Test if requested memory area fits target address space
@@ -794,7 +793,7 @@ static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong 
len)
 
 last = start + len - 1;
 real_start = start & -host_page_size;
-real_last = HOST_PAGE_ALIGN(last) - 1;
+real_last = ROUND_UP(last, host_page_size) - 1;
 
 /*
  * If guest pages remain on the first or last host pages,
-- 
2.34.1




[PULL 22/39] linux-user: Split out mmap_end

2024-02-22 Thread Richard Henderson
Use a subroutine instead of a goto within target_mmap__locked.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-20-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 71 +++
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 48fcdd4a32..cc983bedbd 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -490,6 +490,43 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, 
abi_ulong align)
 }
 }
 
+/*
+ * Record a successful mmap within the user-exec interval tree.
+ */
+static abi_long mmap_end(abi_ulong start, abi_ulong last,
+ abi_ulong passthrough_start,
+ abi_ulong passthrough_last,
+ int flags, int page_flags)
+{
+if (flags & MAP_ANONYMOUS) {
+page_flags |= PAGE_ANON;
+}
+page_flags |= PAGE_RESET;
+if (passthrough_start > passthrough_last) {
+page_set_flags(start, last, page_flags);
+} else {
+if (start < passthrough_start) {
+page_set_flags(start, passthrough_start - 1, page_flags);
+}
+page_set_flags(passthrough_start, passthrough_last,
+   page_flags | PAGE_PASSTHROUGH);
+if (passthrough_last < last) {
+page_set_flags(passthrough_last + 1, last, page_flags);
+}
+}
+shm_region_rm_complete(start, last);
+trace_target_mmap_complete(start);
+if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
+FILE *f = qemu_log_trylock();
+if (f) {
+fprintf(f, "page layout changed following mmap\n");
+page_dump(f);
+qemu_log_unlock(f);
+}
+}
+return start;
+}
+
 static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
 int target_prot, int flags, int page_flags,
 int fd, off_t offset)
@@ -632,7 +669,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 ret = target_mprotect(start, len, target_prot);
 assert(ret == 0);
 }
-goto the_end;
+return mmap_end(start, last, -1, 0, flags, page_flags);
 }
 
 /* handle the start of the mapping */
@@ -643,7 +680,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
target_prot, flags, fd, offset)) {
 return -1;
 }
-goto the_end;
+return mmap_end(start, last, -1, 0, flags, page_flags);
 }
 if (!mmap_frag(real_start, start,
real_start + host_page_size - 1,
@@ -690,34 +727,8 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 passthrough_last = real_last;
 }
 }
- the_end:
-if (flags & MAP_ANONYMOUS) {
-page_flags |= PAGE_ANON;
-}
-page_flags |= PAGE_RESET;
-if (passthrough_start > passthrough_last) {
-page_set_flags(start, last, page_flags);
-} else {
-if (start < passthrough_start) {
-page_set_flags(start, passthrough_start - 1, page_flags);
-}
-page_set_flags(passthrough_start, passthrough_last,
-   page_flags | PAGE_PASSTHROUGH);
-if (passthrough_last < last) {
-page_set_flags(passthrough_last + 1, last, page_flags);
-}
-}
-shm_region_rm_complete(start, last);
-trace_target_mmap_complete(start);
-if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
-FILE *f = qemu_log_trylock();
-if (f) {
-fprintf(f, "page layout changed following mmap\n");
-page_dump(f);
-qemu_log_unlock(f);
-}
-}
-return start;
+return mmap_end(start, last, passthrough_start, passthrough_last,
+flags, page_flags);
 }
 
 /* NOTE: all the constants are the HOST ones */
-- 
2.34.1




[PULL 00/39] tcg and linux-user patch queue

2024-02-22 Thread Richard Henderson
The following changes since commit 6630bc04bccadcf868165ad6bca5a964bb69b067:

  Merge tag 'pull-trivial-patches' of https://gitlab.com/mjt0k/qemu into 
staging (2024-02-22 12:42:52 +0000)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20240222

for you to fetch changes up to a06efc2615a1283e139e35ae8a8875925766268f:

  linux-user: Remove pgb_dynamic alignment assertion (2024-02-22 09:04:05 -1000)


tcg/aarch64: Apple does not align __int128_t in even registers
accel/tcg: Fixes for page tables in mmio memory
linux-user: Remove qemu_host_page_{size,mask}, HOST_PAGE_ALIGN
migration: Remove qemu_host_page_size
hw/tpm: Remove qemu_host_page_size
softmmu: Remove qemu_host_page_{size,mask}, HOST_PAGE_ALIGN
linux-user: Split and reorganize target_mmap.
*-user: Deprecate and disable -p pagesize
linux-user: Allow TARGET_PAGE_BITS_VARY
target/alpha: Enable TARGET_PAGE_BITS_VARY for user-only
target/arm: Enable TARGET_PAGE_BITS_VARY for AArch64 user-only
target/ppc: Enable TARGET_PAGE_BITS_VARY for user-only
linux-user: Remove pgb_dynamic alignment assertion


Jonathan Cameron (1):
  tcg: Avoid double lock if page tables happen to be in mmio memory.

Peter Maydell (1):
  accel/tcg: Set can_do_io at at start of lookup_tb_ptr helper

Richard Henderson (37):
  tcg/aarch64: Apple does not align __int128_t in even registers
  accel/tcg: Remove qemu_host_page_size from page_protect/page_unprotect
  linux-user: Adjust SVr4 NULL page mapping
  linux-user: Remove qemu_host_page_{size, mask} in probe_guest_base
  linux-user: Remove qemu_host_page_size from create_elf_tables
  linux-user/hppa: Simplify init_guest_commpage
  linux-user/nios2: Remove qemu_host_page_size from init_guest_commpage
  linux-user/arm: Remove qemu_host_page_size from init_guest_commpage
  linux-user: Remove qemu_host_page_{size, mask} from mmap.c
  linux-user: Remove REAL_HOST_PAGE_ALIGN from mmap.c
  linux-user: Remove HOST_PAGE_ALIGN from mmap.c
  migration: Remove qemu_host_page_size
  hw/tpm: Remove HOST_PAGE_ALIGN from tpm_ppi_init
  softmmu/physmem: Remove qemu_host_page_size
  softmmu/physmem: Remove HOST_PAGE_ALIGN
  linux-user: Remove qemu_host_page_size from main
  linux-user: Split out target_mmap__locked
  linux-user: Move some mmap checks outside the lock
  linux-user: Fix sub-host-page mmap
  linux-user: Split out mmap_end
  linux-user: Do early mmap placement only for reserved_va
  linux-user: Split out do_munmap
  linux-user: Use do_munmap for target_mmap failure
  linux-user: Split out mmap_h_eq_g
  linux-user: Split out mmap_h_lt_g
  linux-user: Split out mmap_h_gt_g
  tests/tcg: Remove run-test-mmap-*
  tests/tcg: Extend file in linux-madvise.c
  *-user: Deprecate and disable -p pagesize
  cpu: Remove page_size_init
  accel/tcg: Disconnect TargetPageDataNode from page size
  linux-user: Allow TARGET_PAGE_BITS_VARY
  target/arm: Enable TARGET_PAGE_BITS_VARY for AArch64 user-only
  linux-user: Bound mmap_min_addr by host page size
  target/ppc: Enable TARGET_PAGE_BITS_VARY for user-only
  target/alpha: Enable TARGET_PAGE_BITS_VARY for user-only
  linux-user: Remove pgb_dynamic alignment assertion

 docs/about/deprecated.rst |  10 +
 docs/user/main.rst|   3 -
 bsd-user/qemu.h   |   7 +
 include/exec/cpu-common.h |   7 -
 include/hw/core/cpu.h |   2 -
 target/alpha/cpu-param.h  |  16 +-
 target/arm/cpu-param.h|   6 +-
 target/ppc/cpu-param.h|   9 +-
 tcg/aarch64/tcg-target.h  |   6 +-
 accel/tcg/cpu-exec.c  |   8 +
 accel/tcg/cputlb.c|  34 +-
 accel/tcg/translate-all.c |   1 -
 accel/tcg/user-exec.c |  31 +-
 bsd-user/main.c   |  22 +-
 cpu-target.c  |  13 -
 hw/tpm/tpm_ppi.c  |   6 +-
 linux-user/elfload.c  |  68 +--
 linux-user/main.c |  34 +-
 linux-user/mmap.c | 767 ++
 migration/ram.c   |  22 +-
 system/physmem.c  |  17 +-
 system/vl.c   |   1 -
 target/arm/cpu.c  |  51 +-
 tests/tcg/multiarch/linux/linux-madvise.c |   2 +
 tests/tcg/alpha/Makefile.target   |   3 -
 tests/tcg/arm/Makefile.target |   3 -
 tests/tcg/hppa/Makefile.target|   3 -
 tests/tcg/i386/Makefile.target|   3 -
 tests/tcg/m68k/Makefile.target|   3 -
 tests/tcg/multiarch

[PULL 21/39] linux-user: Fix sub-host-page mmap

2024-02-22 Thread Richard Henderson
We cannot skip over the_end1 to the_end, because we fail to
record the validity of the guest page with the interval tree.
Remove "the_end" and rename "the_end1" to "the_end".

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-19-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index fbaea832c5..48fcdd4a32 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -643,7 +643,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
target_prot, flags, fd, offset)) {
 return -1;
 }
-goto the_end1;
+goto the_end;
 }
 if (!mmap_frag(real_start, start,
real_start + host_page_size - 1,
@@ -690,7 +690,7 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 passthrough_last = real_last;
 }
 }
- the_end1:
+ the_end:
 if (flags & MAP_ANONYMOUS) {
 page_flags |= PAGE_ANON;
 }
@@ -708,7 +708,6 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 }
 }
 shm_region_rm_complete(start, last);
- the_end:
 trace_target_mmap_complete(start);
 if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
 FILE *f = qemu_log_trylock();
-- 
2.34.1




[PULL 20/39] linux-user: Move some mmap checks outside the lock

2024-02-22 Thread Richard Henderson
Basic validation of operands does not require the lock.
Hoist them from target_mmap__locked back into target_mmap.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-18-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 107 +++---
 1 file changed, 53 insertions(+), 54 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index b4c3cc65aa..fbaea832c5 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -491,52 +491,14 @@ abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, 
abi_ulong align)
 }
 
 static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
-int target_prot, int flags,
+int target_prot, int flags, int page_flags,
 int fd, off_t offset)
 {
 int host_page_size = qemu_real_host_page_size();
 abi_ulong ret, last, real_start, real_last, retaddr, host_len;
 abi_ulong passthrough_start = -1, passthrough_last = 0;
-int page_flags;
 off_t host_offset;
 
-if (!len) {
-errno = EINVAL;
-return -1;
-}
-
-page_flags = validate_prot_to_pageflags(target_prot);
-if (!page_flags) {
-errno = EINVAL;
-return -1;
-}
-
-/* Also check for overflows... */
-len = TARGET_PAGE_ALIGN(len);
-if (!len) {
-errno = ENOMEM;
-return -1;
-}
-
-if (offset & ~TARGET_PAGE_MASK) {
-errno = EINVAL;
-return -1;
-}
-
-/*
- * If we're mapping shared memory, ensure we generate code for parallel
- * execution and flush old translations.  This will work up to the level
- * supported by the host -- anything that requires EXCP_ATOMIC will not
- * be atomic with respect to an external process.
- */
-if (flags & MAP_SHARED) {
-CPUState *cpu = thread_cpu;
-if (!(cpu->tcg_cflags & CF_PARALLEL)) {
-cpu->tcg_cflags |= CF_PARALLEL;
-tb_flush(cpu);
-}
-}
-
 real_start = start & -host_page_size;
 host_offset = offset & -host_page_size;
 
@@ -616,23 +578,9 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 passthrough_start = start;
 passthrough_last = last;
 } else {
-if (start & ~TARGET_PAGE_MASK) {
-errno = EINVAL;
-return -1;
-}
 last = start + len - 1;
 real_last = ROUND_UP(last, host_page_size) - 1;
 
-/*
- * Test if requested memory area fits target address space
- * It can fail only on 64-bit host with 32-bit target.
- * On any other target/host host mmap() handles this error correctly.
- */
-if (last < start || !guest_range_valid_untagged(start, len)) {
-errno = ENOMEM;
-return -1;
-}
-
 if (flags & MAP_FIXED_NOREPLACE) {
 /* Validate that the chosen range is empty. */
 if (!page_check_range_empty(start, last)) {
@@ -778,13 +726,64 @@ abi_long target_mmap(abi_ulong start, abi_ulong len, int 
target_prot,
  int flags, int fd, off_t offset)
 {
 abi_long ret;
+int page_flags;
 
 trace_target_mmap(start, len, target_prot, flags, fd, offset);
+
+if (!len) {
+errno = EINVAL;
+return -1;
+}
+
+page_flags = validate_prot_to_pageflags(target_prot);
+if (!page_flags) {
+errno = EINVAL;
+return -1;
+}
+
+/* Also check for overflows... */
+len = TARGET_PAGE_ALIGN(len);
+if (!len || len != (size_t)len) {
+errno = ENOMEM;
+return -1;
+}
+
+if (offset & ~TARGET_PAGE_MASK) {
+errno = EINVAL;
+return -1;
+}
+if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) {
+if (start & ~TARGET_PAGE_MASK) {
+errno = EINVAL;
+return -1;
+}
+if (!guest_range_valid_untagged(start, len)) {
+errno = ENOMEM;
+return -1;
+}
+}
+
 mmap_lock();
 
-ret = target_mmap__locked(start, len, target_prot, flags, fd, offset);
+ret = target_mmap__locked(start, len, target_prot, flags,
+  page_flags, fd, offset);
 
 mmap_unlock();
+
+/*
+ * If we're mapping shared memory, ensure we generate code for parallel
+ * execution and flush old translations.  This will work up to the level
+ * supported by the host -- anything that requires EXCP_ATOMIC will not
+ * be atomic with respect to an external process.
+ */
+if (ret != -1 && (flags & MAP_TYPE) != MAP_PRIVATE) {
+CPUState *cpu = thread_cpu;
+if (!(cpu->tcg_cflags & CF_PARALLEL)) {
+cpu->tcg_cflags |= CF_PARALLEL;
+tb_flush(cpu);
+}
+}
+
 return ret;
 }
 
-- 
2.34.1




[PULL 01/39] tcg/aarch64: Apple does not align __int128_t in even registers

2024-02-22 Thread Richard Henderson
From https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms

  When passing an argument with 16-byte alignment in integer registers,
  Apple platforms allow the argument to start in an odd-numbered xN
  register. The standard ABI requires it to begin in an even-numbered
  xN register.

Cc: qemu-sta...@nongnu.org
Fixes: 5427a9a7604 ("tcg: Add TCG_TARGET_CALL_{RET,ARG}_I128")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2169
Signed-off-by: Richard Henderson 
Message-Id: <9fc0c2c7-dd57-459e-aecb-528edb74b...@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé 
---
 tcg/aarch64/tcg-target.h | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index ef5ebe91bd..85d5746e47 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -55,7 +55,11 @@ typedef enum {
 #define TCG_TARGET_CALL_STACK_OFFSET    0
 #define TCG_TARGET_CALL_ARG_I32         TCG_CALL_ARG_NORMAL
 #define TCG_TARGET_CALL_ARG_I64         TCG_CALL_ARG_NORMAL
-#define TCG_TARGET_CALL_ARG_I128        TCG_CALL_ARG_EVEN
+#ifdef CONFIG_DARWIN
+# define TCG_TARGET_CALL_ARG_I128       TCG_CALL_ARG_NORMAL
+#else
+# define TCG_TARGET_CALL_ARG_I128       TCG_CALL_ARG_EVEN
+#endif
 #define TCG_TARGET_CALL_RET_I128        TCG_CALL_RET_NORMAL
 
 #define have_lse            (cpuinfo & CPUINFO_LSE)
-- 
2.34.1




[PULL 27/39] linux-user: Split out mmap_h_lt_g

2024-02-22 Thread Richard Henderson
Work much harder to get alignment and mapping beyond the end
of the file correct.  Both of which are exercised by our
test-mmap for alpha (8k pages) on any 4k page host.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-23-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 184 ++
 1 file changed, 153 insertions(+), 31 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index d3556bcc14..ff8f9f7ed0 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -569,6 +569,156 @@ static abi_long mmap_h_eq_g(abi_ulong start, abi_ulong 
len,
 return mmap_end(start, last, start, last, flags, page_flags);
 }
 
+/*
+ * Special case host page size < target page size.
+ *
+ * The two special cases are increased guest alignment, and mapping
+ * past the end of a file.
+ *
+ * When mapping files into a memory area larger than the file,
+ * accesses to pages beyond the file size will cause a SIGBUS.
+ *
+ * For example, if mmaping a file of 100 bytes on a host with 4K
+ * pages emulating a target with 8K pages, the target expects to
+ * be able to access the first 8K. But the host will trap us on
+ * any access beyond 4K.
+ *
+ * When emulating a target with a larger page-size than the hosts,
+ * we may need to truncate file maps at EOF and add extra anonymous
+ * pages up to the targets page boundary.
+ *
+ * This workaround only works for files that do not change.
+ * If the file is later extended (e.g. ftruncate), the SIGBUS
+ * vanishes and the proper behaviour is that changes within the
+ * anon page should be reflected in the file.
+ *
+ * However, this case is rather common with executable images,
+ * so the workaround is important for even trivial tests, whereas
+ * the mmap of a file being extended is less common.
+ */
+static abi_long mmap_h_lt_g(abi_ulong start, abi_ulong len, int host_prot,
+int mmap_flags, int page_flags, int fd,
+off_t offset, int host_page_size)
+{
+void *p, *want_p = g2h_untagged(start);
+off_t fileend_adj = 0;
+int flags = mmap_flags;
+abi_ulong last, pass_last;
+
+if (!(flags & MAP_ANONYMOUS)) {
+struct stat sb;
+
+if (fstat(fd, &sb) == -1) {
+return -1;
+}
+if (offset >= sb.st_size) {
+/*
+ * The entire map is beyond the end of the file.
+ * Transform it to an anonymous mapping.
+ */
+flags |= MAP_ANONYMOUS;
+fd = -1;
+offset = 0;
+} else if (offset + len > sb.st_size) {
+/*
+ * A portion of the map is beyond the end of the file.
+ * Truncate the file portion of the allocation.
+ */
+fileend_adj = offset + len - sb.st_size;
+}
+}
+
+if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) {
+if (fileend_adj) {
+p = mmap(want_p, len, host_prot, flags | MAP_ANONYMOUS, -1, 0);
+} else {
+p = mmap(want_p, len, host_prot, flags, fd, offset);
+}
+if (p != want_p) {
+if (p != MAP_FAILED) {
+/* Host does not support MAP_FIXED_NOREPLACE: emulate. */
+do_munmap(p, len);
+errno = EEXIST;
+}
+return -1;
+}
+
+if (fileend_adj) {
+void *t = mmap(p, len - fileend_adj, host_prot,
+   (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED,
+   fd, offset);
+
+if (t == MAP_FAILED) {
+int save_errno = errno;
+
+/*
+ * We failed a map over the top of the successful anonymous
+ * mapping above. The only failure mode is running out of VMAs,
+ * and there's nothing that we can do to detect that earlier.
+ * If we have replaced an existing mapping with MAP_FIXED,
+ * then we cannot properly recover.  It's a coin toss whether
+ * it would be better to exit or continue here.
+ */
+if (!(flags & MAP_FIXED_NOREPLACE) &&
+!page_check_range_empty(start, start + len - 1)) {
+qemu_log("QEMU target_mmap late failure: %s",
+ strerror(save_errno));
+}
+
+do_munmap(want_p, len);
+errno = save_errno;
+return -1;
+}
+}
+} else {
+size_t host_len, part_len;
+
+/*
+ * Take care to align the host memory.  Perform a larger anonymous
+ * allocation and extract the aligned portion.  Remap the file on
+ * top of that.
+ */
+host_len = len + TARGET_PAGE_SIZE - host_page_size;
+p = mmap(want_p, host_len, host_prot, flags | MAP_ANONYMOUS, 

[PULL 28/39] linux-user: Split out mmap_h_gt_g

2024-02-22 Thread Richard Henderson
Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-24-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 288 ++
 1 file changed, 139 insertions(+), 149 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index ff8f9f7ed0..82f4026283 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -282,7 +282,16 @@ static int do_munmap(void *addr, size_t len)
 return munmap(addr, len);
 }
 
-/* map an incomplete host page */
+/*
+ * Map an incomplete host page.
+ *
+ * Here be dragons.  This case will not work if there is an existing
+ * overlapping host page, which is file mapped, and for which the mapping
+ * is beyond the end of the file.  In that case, we will see SIGBUS when
+ * trying to write a portion of this page.
+ *
+ * FIXME: Work around this with a temporary signal handler and longjmp.
+ */
 static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last,
   int prot, int flags, int fd, off_t offset)
 {
@@ -719,19 +728,138 @@ static abi_long mmap_h_lt_g(abi_ulong start, abi_ulong 
len, int host_prot,
 return mmap_end(start, last, start, pass_last, mmap_flags, page_flags);
 }
 
+/*
+ * Special case host page size > target page size.
+ *
+ * The two special cases are address and file offsets that are valid
+ * for the guest that cannot be directly represented by the host.
+ */
+static abi_long mmap_h_gt_g(abi_ulong start, abi_ulong len,
+int target_prot, int host_prot,
+int flags, int page_flags, int fd,
+off_t offset, int host_page_size)
+{
+void *p, *want_p = g2h_untagged(start);
+off_t host_offset = offset & -host_page_size;
+abi_ulong last, real_start, real_last;
+bool misaligned_offset = false;
+size_t host_len;
+
+if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
+/*
+ * Adjust the offset to something representable on the host.
+ */
+host_len = len + offset - host_offset;
+p = mmap(want_p, host_len, host_prot, flags, fd, host_offset);
+if (p == MAP_FAILED) {
+return -1;
+}
+
+/* Update start to the file position at offset. */
+p += offset - host_offset;
+
+start = h2g(p);
+last = start + len - 1;
+return mmap_end(start, last, start, last, flags, page_flags);
+}
+
+if (!(flags & MAP_ANONYMOUS)) {
+misaligned_offset = (start ^ offset) & (host_page_size - 1);
+
+/*
+ * The fallback for misalignment is a private mapping + read.
+ * This carries none of the semantics required of MAP_SHARED.
+ */
+if (misaligned_offset && (flags & MAP_TYPE) != MAP_PRIVATE) {
+errno = EINVAL;
+return -1;
+}
+}
+
+last = start + len - 1;
+real_start = start & -host_page_size;
+real_last = ROUND_UP(last, host_page_size) - 1;
+
+/*
+ * Handle the start and end of the mapping.
+ */
+if (real_start < start) {
+abi_ulong real_page_last = real_start + host_page_size - 1;
+if (last <= real_page_last) {
+/* Entire allocation a subset of one host page. */
+if (!mmap_frag(real_start, start, last, target_prot,
+   flags, fd, offset)) {
+return -1;
+}
+return mmap_end(start, last, -1, 0, flags, page_flags);
+}
+
+if (!mmap_frag(real_start, start, real_page_last, target_prot,
+   flags, fd, offset)) {
+return -1;
+}
+real_start = real_page_last + 1;
+}
+
+if (last < real_last) {
+abi_ulong real_page_start = real_last - host_page_size + 1;
+if (!mmap_frag(real_page_start, real_page_start, last,
+   target_prot, flags, fd,
+   offset + real_page_start - start)) {
+return -1;
+}
+real_last = real_page_start - 1;
+}
+
+if (real_start > real_last) {
+return mmap_end(start, last, -1, 0, flags, page_flags);
+}
+
+/*
+ * Handle the middle of the mapping.
+ */
+
+host_len = real_last - real_start + 1;
+want_p += real_start - start;
+
+if (flags & MAP_ANONYMOUS) {
+p = mmap(want_p, host_len, host_prot, flags, -1, 0);
+} else if (!misaligned_offset) {
+p = mmap(want_p, host_len, host_prot, flags, fd,
+ offset + real_start - start);
+} else {
+p = mmap(want_p, host_len, host_prot | PROT_WRITE,
+ flags | MAP_ANONYMOUS, -1, 0);
+}
+if (p != want_p) {
+if (p != MAP_FAILED) {
+do_munmap(p, host_len);
+errno = EEXIST;
+}
+return -1;
+}
+
+if (misaligned_offset) {
+/* TODO: The read could be short. */
+if 

[PULL 23/39] linux-user: Do early mmap placement only for reserved_va

2024-02-22 Thread Richard Henderson
For reserved_va, place all non-fixed maps then proceed
as for MAP_FIXED.

Signed-off-by: Richard Henderson 
Reviewed-by: Pierrick Bouvier 
Reviewed-by: Ilya Leoshkevich 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-21-richard.hender...@linaro.org>
---
 linux-user/mmap.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index cc983bedbd..1bbfeb25b1 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -540,17 +540,19 @@ static abi_long target_mmap__locked(abi_ulong start, 
abi_ulong len,
 host_offset = offset & -host_page_size;
 
 /*
- * If the user is asking for the kernel to find a location, do that
- * before we truncate the length for mapping files below.
+ * For reserved_va, we are in full control of the allocation.
+ * Find a suitable hole and convert to MAP_FIXED.
  */
-if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
+if (reserved_va && !(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
 host_len = len + offset - host_offset;
-host_len = ROUND_UP(host_len, host_page_size);
-start = mmap_find_vma(real_start, host_len, TARGET_PAGE_SIZE);
+start = mmap_find_vma(real_start, host_len,
+  MAX(host_page_size, TARGET_PAGE_SIZE));
 if (start == (abi_ulong)-1) {
 errno = ENOMEM;
 return -1;
 }
+start += offset - host_offset;
+flags |= MAP_FIXED;
 }
 
 /*
-- 
2.34.1




[PULL 07/39] linux-user: Remove qemu_host_page_size from create_elf_tables

2024-02-22 Thread Richard Henderson
AT_PAGESZ is supposed to advertise the guest page size.
The random adjustment made here using qemu_host_page_size
does not match anything else within linux-user.

The idea here is good, but should be done more systemically
via adjustment to TARGET_PAGE_SIZE.

Signed-off-by: Richard Henderson 
Reviewed-by: Ilya Leoshkevich 
Reviewed-by: Pierrick Bouvier 
Acked-by: Helge Deller 
Message-Id: <20240102015808.132373-5-richard.hender...@linaro.org>
---
 linux-user/elfload.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index e84a201448..dfb152bfcb 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2679,13 +2679,7 @@ static abi_ulong create_elf_tables(abi_ulong p, int 
argc, int envc,
 NEW_AUX_ENT(AT_PHDR, (abi_ulong)(info->load_addr + exec->e_phoff));
 NEW_AUX_ENT(AT_PHENT, (abi_ulong)(sizeof (struct elf_phdr)));
 NEW_AUX_ENT(AT_PHNUM, (abi_ulong)(exec->e_phnum));
-if ((info->alignment & ~qemu_host_page_mask) != 0) {
-/* Target doesn't support host page size alignment */
-NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE));
-} else {
-NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(MAX(TARGET_PAGE_SIZE,
-   qemu_host_page_size)));
-}
+NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE));
 NEW_AUX_ENT(AT_BASE, (abi_ulong)(interp_info ? interp_info->load_addr : 
0));
 NEW_AUX_ENT(AT_FLAGS, (abi_ulong)0);
 NEW_AUX_ENT(AT_ENTRY, info->entry);
-- 
2.34.1




  1   2   3   4   >