[PATCH v2 11/11] backends/iommufd: Make iommufd_backend_*() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

The changed functions include:

iommufd_backend_connect
iommufd_backend_alloc_ioas

By this chance, simplify the functions a bit by avoiding duplicate
recordings, e.g., log through either error interface or trace, not
both.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/sysemu/iommufd.h |  6 +++---
 backends/iommufd.c   | 29 +
 hw/vfio/iommufd.c|  5 ++---
 backends/trace-events|  4 ++--
 4 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9af27ebd6c..293bfbe967 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -23,11 +23,11 @@ struct IOMMUFDBackend {
 /*< public >*/
 };
 
-int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
+bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp);
 void iommufd_backend_disconnect(IOMMUFDBackend *be);
 
-int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
-   Error **errp);
+bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+Error **errp);
 void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id);
 int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id, hwaddr iova,
 ram_addr_t size, void *vaddr, bool readonly);
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 76a0204852..c506afbdac 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -72,24 +72,22 @@ static void iommufd_backend_class_init(ObjectClass *oc, 
void *data)
 object_class_property_add_str(oc, "fd", NULL, iommufd_backend_set_fd);
 }
 
-int iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
+bool iommufd_backend_connect(IOMMUFDBackend *be, Error **errp)
 {
-int fd, ret = 0;
+int fd;
 
 if (be->owned && !be->users) {
 fd = qemu_open_old("/dev/iommu", O_RDWR);
 if (fd < 0) {
 error_setg_errno(errp, errno, "/dev/iommu opening failed");
-ret = fd;
-goto out;
+return false;
 }
 be->fd = fd;
 }
 be->users++;
-out:
-trace_iommufd_backend_connect(be->fd, be->owned,
-  be->users, ret);
-return ret;
+
+trace_iommufd_backend_connect(be->fd, be->owned, be->users);
+return true;
 }
 
 void iommufd_backend_disconnect(IOMMUFDBackend *be)
@@ -106,25 +104,24 @@ out:
 trace_iommufd_backend_disconnect(be->fd, be->users);
 }
 
-int iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
-   Error **errp)
+bool iommufd_backend_alloc_ioas(IOMMUFDBackend *be, uint32_t *ioas_id,
+Error **errp)
 {
-int ret, fd = be->fd;
+int fd = be->fd;
 struct iommu_ioas_alloc alloc_data  = {
 .size = sizeof(alloc_data),
 .flags = 0,
 };
 
-ret = ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data);
-if (ret) {
+if (ioctl(fd, IOMMU_IOAS_ALLOC, &alloc_data)) {
 error_setg_errno(errp, errno, "Failed to allocate ioas");
-return ret;
+return false;
 }
 
 *ioas_id = alloc_data.out_ioas_id;
-trace_iommufd_backend_alloc_ioas(fd, *ioas_id, ret);
+trace_iommufd_backend_alloc_ioas(fd, *ioas_id);
 
-return ret;
+return true;
 }
 
 void iommufd_backend_free_id(IOMMUFDBackend *be, uint32_t id)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 6a446b16dc..554f9a6292 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -71,7 +71,7 @@ static bool iommufd_cdev_connect_and_bind(VFIODevice 
*vbasedev, Error **errp)
 .flags = 0,
 };
 
-if (iommufd_backend_connect(iommufd, errp)) {
+if (!iommufd_backend_connect(iommufd, errp)) {
 return false;
 }
 
@@ -346,8 +346,7 @@ static bool iommufd_cdev_attach(const char *name, 
VFIODevice *vbasedev,
 }
 
 /* Need to allocate a new dedicated container */
-ret = iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp);
-if (ret < 0) {
+if (!iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp)) {
 goto err_alloc_ioas;
 }
 
diff --git a/backends/trace-events b/backends/trace-events
index d45c6e31a6..211e6f374a 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -7,11 +7,11 @@ dbus_vmstate_loading(const char *id) "id: %s"
 dbus_vmstate_saving(const char *id) "id: %s"
 
 # iommufd.c
-iommufd_backend_connect(int fd, bool owned, uint32_t users, int ret) "fd=%d 
owned=%d users=%d (%d)"
+iommufd_backend_connect(int fd, bool owned, uint32_t users) "fd=%d owned=%d 
users=%d"
 iommufd_backend_disconnect(int fd, uint32_t users) "fd=%d users=%d"
 iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
 iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t 
size, void *vaddr, bool reado

[PATCH v2 08/11] vfio/container: Make vfio_get_device() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 5fb4bee082..b02583ea16 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -800,8 +800,8 @@ static void vfio_put_group(VFIOGroup *group)
 g_free(group);
 }
 
-static int vfio_get_device(VFIOGroup *group, const char *name,
-   VFIODevice *vbasedev, Error **errp)
+static bool vfio_get_device(VFIOGroup *group, const char *name,
+VFIODevice *vbasedev, Error **errp)
 {
 g_autofree struct vfio_device_info *info = NULL;
 int fd;
@@ -813,14 +813,14 @@ static int vfio_get_device(VFIOGroup *group, const char 
*name,
 error_append_hint(errp,
   "Verify all devices in group %d are bound to vfio- "
   "or pci-stub and not already in use\n", group->groupid);
-return fd;
+return false;
 }
 
 info = vfio_get_device_info(fd);
 if (!info) {
 error_setg_errno(errp, errno, "error getting device info");
 close(fd);
-return -1;
+return false;
 }
 
 /*
@@ -835,7 +835,7 @@ static int vfio_get_device(VFIOGroup *group, const char 
*name,
 error_setg(errp, "Inconsistent setting of support for discarding "
"RAM (e.g., balloon) within group");
 close(fd);
-return -1;
+return false;
 }
 
 if (!group->ram_block_discard_allowed) {
@@ -856,7 +856,7 @@ static int vfio_get_device(VFIOGroup *group, const char 
*name,
 
 vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
 
-return 0;
+return true;
 }
 
 static void vfio_put_base_device(VFIODevice *vbasedev)
@@ -909,7 +909,6 @@ static bool vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
 VFIODevice *vbasedev_iter;
 VFIOGroup *group;
 VFIOContainerBase *bcontainer;
-int ret;
 
 if (groupid < 0) {
 return false;
@@ -929,8 +928,7 @@ static bool vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
 return false;
 }
 }
-ret = vfio_get_device(group, name, vbasedev, errp);
-if (ret) {
+if (!vfio_get_device(group, name, vbasedev, errp)) {
 vfio_put_group(group);
 return false;
 }
-- 
2.34.1




[PATCH v2 05/11] vfio: Make VFIOIOMMUClass::add_window() and its wrapper return bool

2024-05-06 Thread Zhenzhong Duan
Make VFIOIOMMUClass::add_window() and its wrapper function
vfio_container_add_section_window() return bool.

This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-container-base.h | 12 ++--
 hw/vfio/common.c  |  2 +-
 hw/vfio/container-base.c  |  8 
 hw/vfio/spapr.c   | 16 
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 68539e3bed..e96cda78c8 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -76,9 +76,9 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer,
 int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
  hwaddr iova, ram_addr_t size,
  IOMMUTLBEntry *iotlb);
-int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp);
+bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp);
 void vfio_container_del_section_window(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
 int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
@@ -131,9 +131,9 @@ struct VFIOIOMMUClass {
 int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
 
 /* SPAPR specific */
-int (*add_window)(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp);
+bool (*add_window)(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp);
 void (*del_window)(VFIOContainerBase *bcontainer,
MemoryRegionSection *section);
 void (*release)(VFIOContainerBase *bcontainer);
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 890d30910e..9f1f2e19f7 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -585,7 +585,7 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 return;
 }
 
-if (vfio_container_add_section_window(bcontainer, section, &err)) {
+if (!vfio_container_add_section_window(bcontainer, section, &err)) {
 goto fail;
 }
 
diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
index 913ae49077..98d71b3144 100644
--- a/hw/vfio/container-base.c
+++ b/hw/vfio/container-base.c
@@ -31,12 +31,12 @@ int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
 return bcontainer->ops->dma_unmap(bcontainer, iova, size, iotlb);
 }
 
-int vfio_container_add_section_window(VFIOContainerBase *bcontainer,
-  MemoryRegionSection *section,
-  Error **errp)
+bool vfio_container_add_section_window(VFIOContainerBase *bcontainer,
+   MemoryRegionSection *section,
+   Error **errp)
 {
 if (!bcontainer->ops->add_window) {
-return 0;
+return true;
 }
 
 return bcontainer->ops->add_window(bcontainer, section, errp);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 148b257c9c..47b040f1bc 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -323,7 +323,7 @@ static int vfio_spapr_create_window(VFIOContainer 
*container,
 return 0;
 }
 
-static int
+static bool
 vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
 MemoryRegionSection *section,
 Error **errp)
@@ -351,13 +351,13 @@ vfio_spapr_container_add_section_window(VFIOContainerBase 
*bcontainer,
 error_setg(errp, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
iova, end);
-return -EINVAL;
+return false;
 }
-return 0;
+return true;
 }
 
 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
-return 0;
+return true;
 }
 
 /* For now intersections are not allowed, we may relax this later */
@@ -373,14 +373,14 @@ vfio_spapr_container_add_section_window(VFIOContainerBase 
*bcontainer,
 section->offset_within_address_space +
 int128_get64(section->size) - 1,
 hostwin->min_iova, hostwin->max_iova);
-return -EINVAL;
+return false;
 }
 }
 
 ret = vfio_spapr_create_window(container, section, &pgsize);
 if (ret) {
 error_setg_errno(errp, -re

[PATCH v2 09/11] vfio/iommufd: Make iommufd_cdev_*() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

The changed functions include:

iommufd_cdev_kvm_device_add
iommufd_cdev_connect_and_bind
iommufd_cdev_attach_ioas_hwpt
iommufd_cdev_detach_ioas_hwpt
iommufd_cdev_attach_container
iommufd_cdev_get_info_iova_range

After the change, all functions in hw/vfio/iommufd.c follows the
standand.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 88 +--
 1 file changed, 39 insertions(+), 49 deletions(-)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 4c6992fca1..84c86b970e 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -49,9 +49,9 @@ static int iommufd_cdev_unmap(const VFIOContainerBase 
*bcontainer,
  container->ioas_id, iova, size);
 }
 
-static int iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp)
+static bool iommufd_cdev_kvm_device_add(VFIODevice *vbasedev, Error **errp)
 {
-return vfio_kvm_device_add_fd(vbasedev->fd, errp);
+return !vfio_kvm_device_add_fd(vbasedev->fd, errp);
 }
 
 static void iommufd_cdev_kvm_device_del(VFIODevice *vbasedev)
@@ -63,18 +63,16 @@ static void iommufd_cdev_kvm_device_del(VFIODevice 
*vbasedev)
 }
 }
 
-static int iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
+static bool iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
 {
 IOMMUFDBackend *iommufd = vbasedev->iommufd;
 struct vfio_device_bind_iommufd bind = {
 .argsz = sizeof(bind),
 .flags = 0,
 };
-int ret;
 
-ret = iommufd_backend_connect(iommufd, errp);
-if (ret) {
-return ret;
+if (iommufd_backend_connect(iommufd, errp)) {
+return false;
 }
 
 /*
@@ -82,15 +80,13 @@ static int iommufd_cdev_connect_and_bind(VFIODevice 
*vbasedev, Error **errp)
  * in KVM. Especially for some emulated devices, it requires
  * to have kvm information in the device open.
  */
-ret = iommufd_cdev_kvm_device_add(vbasedev, errp);
-if (ret) {
+if (!iommufd_cdev_kvm_device_add(vbasedev, errp)) {
 goto err_kvm_device_add;
 }
 
 /* Bind device to iommufd */
 bind.iommufd = iommufd->fd;
-ret = ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
-if (ret) {
+if (ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) {
 error_setg_errno(errp, errno, "error bind device fd=%d to iommufd=%d",
  vbasedev->fd, bind.iommufd);
 goto err_bind;
@@ -99,12 +95,12 @@ static int iommufd_cdev_connect_and_bind(VFIODevice 
*vbasedev, Error **errp)
 vbasedev->devid = bind.out_devid;
 trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name,
 vbasedev->fd, vbasedev->devid);
-return ret;
+return true;
 err_bind:
 iommufd_cdev_kvm_device_del(vbasedev);
 err_kvm_device_add:
 iommufd_backend_disconnect(iommufd);
-return ret;
+return false;
 }
 
 static void iommufd_cdev_unbind_and_disconnect(VFIODevice *vbasedev)
@@ -176,10 +172,10 @@ out:
 return ret;
 }
 
-static int iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id,
+static bool iommufd_cdev_attach_ioas_hwpt(VFIODevice *vbasedev, uint32_t id,
  Error **errp)
 {
-int ret, iommufd = vbasedev->iommufd->fd;
+int iommufd = vbasedev->iommufd->fd;
 struct vfio_device_attach_iommufd_pt attach_data = {
 .argsz = sizeof(attach_data),
 .flags = 0,
@@ -187,38 +183,38 @@ static int iommufd_cdev_attach_ioas_hwpt(VFIODevice 
*vbasedev, uint32_t id,
 };
 
 /* Attach device to an IOAS or hwpt within iommufd */
-ret = ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data);
-if (ret) {
+if (ioctl(vbasedev->fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data)) {
 error_setg_errno(errp, errno,
  "[iommufd=%d] error attach %s (%d) to id=%d",
  iommufd, vbasedev->name, vbasedev->fd, id);
-} else {
-trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name,
-vbasedev->fd, id);
+return false;
 }
-return ret;
+
+trace_iommufd_cdev_attach_ioas_hwpt(iommufd, vbasedev->name,
+vbasedev->fd, id);
+return true;
 }
 
-static int iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
+static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice *vbasedev, Error **errp)
 {
-int ret, iommufd = vbasedev->iommufd->fd;
+int iommufd = vbasedev->iommufd->fd;
 struct vfio_device_detach_iommufd_pt detach_data = {
 .argsz = sizeof(detach_data),
 .flags = 0,
 };
 
-ret = ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_data);
-if (ret) {
+if (ioctl(vbasedev->fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach

[PATCH v2 07/11] vfio/container: Make vfio_set_iommu() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 0a7edfcc43..5fb4bee082 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -391,21 +391,20 @@ static const VFIOIOMMUClass *vfio_get_iommu_class(int 
iommu_type, Error **errp)
 return VFIO_IOMMU_CLASS(klass);
 }
 
-static int vfio_set_iommu(VFIOContainer *container, int group_fd,
-  VFIOAddressSpace *space, Error **errp)
+static bool vfio_set_iommu(VFIOContainer *container, int group_fd,
+   VFIOAddressSpace *space, Error **errp)
 {
-int iommu_type, ret;
+int iommu_type;
 const VFIOIOMMUClass *vioc;
 
 iommu_type = vfio_get_iommu_type(container, errp);
 if (iommu_type < 0) {
-return iommu_type;
+return false;
 }
 
-ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
-if (ret) {
+if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
 error_setg_errno(errp, errno, "Failed to set group container");
-return -errno;
+return false;
 }
 
 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
@@ -420,7 +419,7 @@ static int vfio_set_iommu(VFIOContainer *container, int 
group_fd,
 continue;
 }
 error_setg_errno(errp, errno, "Failed to set iommu for container");
-return -errno;
+return false;
 }
 
 container->iommu_type = iommu_type;
@@ -428,11 +427,11 @@ static int vfio_set_iommu(VFIOContainer *container, int 
group_fd,
 vioc = vfio_get_iommu_class(iommu_type, errp);
 if (!vioc) {
 error_setg(errp, "No available IOMMU models");
-return -EINVAL;
+return false;
 }
 
 vfio_container_init(&container->bcontainer, space, vioc);
-return 0;
+return true;
 }
 
 static int vfio_get_iommu_info(VFIOContainer *container,
@@ -613,8 +612,7 @@ static bool vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 container->fd = fd;
 bcontainer = &container->bcontainer;
 
-ret = vfio_set_iommu(container, group->fd, space, errp);
-if (ret) {
+if (!vfio_set_iommu(container, group->fd, space, errp)) {
 goto free_container_exit;
 }
 
-- 
2.34.1




[PATCH v2 06/11] vfio/container: Make vfio_connect_container() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/container.c | 18 +++---
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 85a8a369dc..0a7edfcc43 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -534,8 +534,8 @@ static bool vfio_legacy_setup(VFIOContainerBase 
*bcontainer, Error **errp)
 return true;
 }
 
-static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
-  Error **errp)
+static bool vfio_connect_container(VFIOGroup *group, AddressSpace *as,
+   Error **errp)
 {
 VFIOContainer *container;
 VFIOContainerBase *bcontainer;
@@ -587,19 +587,18 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 error_report("vfio: error disconnecting group %d from"
  " container", group->groupid);
 }
-return ret;
+return false;
 }
 group->container = container;
 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
 vfio_kvm_device_add_group(group);
-return 0;
+return true;
 }
 }
 
 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
 if (fd < 0) {
 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
-ret = -errno;
 goto put_space_exit;
 }
 
@@ -607,7 +606,6 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 if (ret != VFIO_API_VERSION) {
 error_setg(errp, "supported vfio version: %d, "
"reported version: %d", VFIO_API_VERSION, ret);
-ret = -EINVAL;
 goto close_fd_exit;
 }
 
@@ -634,7 +632,6 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 assert(bcontainer->ops->setup);
 
 if (!bcontainer->ops->setup(bcontainer, errp)) {
-ret = -EINVAL;
 goto enable_discards_exit;
 }
 
@@ -650,7 +647,6 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 memory_listener_register(&bcontainer->listener, bcontainer->space->as);
 
 if (bcontainer->error) {
-ret = -1;
 error_propagate_prepend(errp, bcontainer->error,
 "memory listener initialization failed: ");
 goto listener_release_exit;
@@ -658,7 +654,7 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 
 bcontainer->initialized = true;
 
-return 0;
+return true;
 listener_release_exit:
 QLIST_REMOVE(group, container_next);
 QLIST_REMOVE(bcontainer, next);
@@ -683,7 +679,7 @@ close_fd_exit:
 put_space_exit:
 vfio_put_address_space(space);
 
-return ret;
+return false;
 }
 
 static void vfio_disconnect_container(VFIOGroup *group)
@@ -770,7 +766,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace 
*as, Error **errp)
 group->groupid = groupid;
 QLIST_INIT(&group->device_list);
 
-if (vfio_connect_container(group, as, errp)) {
+if (!vfio_connect_container(group, as, errp)) {
 error_prepend(errp, "failed to setup container for group %d: ",
   groupid);
 goto close_fd_exit;
-- 
2.34.1




[PATCH v2 10/11] vfio/cpr: Make vfio_cpr_register_container() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h | 2 +-
 hw/vfio/container.c   | 3 +--
 hw/vfio/cpr.c | 4 ++--
 hw/vfio/iommufd.c | 3 +--
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index a7b6fc8f46..e4c60374fa 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -205,7 +205,7 @@ void vfio_detach_device(VFIODevice *vbasedev);
 int vfio_kvm_device_add_fd(int fd, Error **errp);
 int vfio_kvm_device_del_fd(int fd, Error **errp);
 
-int vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp);
+bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp);
 void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer);
 
 extern const MemoryRegionOps vfio_region_ops;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index b02583ea16..86266f3b83 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -616,8 +616,7 @@ static bool vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 goto free_container_exit;
 }
 
-ret = vfio_cpr_register_container(bcontainer, errp);
-if (ret) {
+if (!vfio_cpr_register_container(bcontainer, errp)) {
 goto free_container_exit;
 }
 
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index 392c2dd95d..87e51fcee1 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -25,12 +25,12 @@ static int vfio_cpr_reboot_notifier(NotifierWithReturn 
*notifier,
 return 0;
 }
 
-int vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp)
+bool vfio_cpr_register_container(VFIOContainerBase *bcontainer, Error **errp)
 {
 migration_add_notifier_mode(&bcontainer->cpr_reboot_notifier,
 vfio_cpr_reboot_notifier,
 MIG_MODE_CPR_REBOOT);
-return 0;
+return true;
 }
 
 void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer)
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 84c86b970e..6a446b16dc 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -396,8 +396,7 @@ found_container:
 goto err_listener_register;
 }
 
-ret = vfio_cpr_register_container(bcontainer, errp);
-if (ret) {
+if (!vfio_cpr_register_container(bcontainer, errp)) {
 goto err_listener_register;
 }
 
-- 
2.34.1




[PATCH v2 01/11] vfio/pci: Use g_autofree in vfio_realize

2024-05-06 Thread Zhenzhong Duan
Local pointer name is allocated before vfio_attach_device() call
and freed after the call.

Same for tmp when calling realpath().

Use 'g_autofree' to avoid the g_free() calls.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/pci.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 64780d1b79..576b21e2bb 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2946,12 +2946,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 ERRP_GUARD();
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
 VFIODevice *vbasedev = &vdev->vbasedev;
-char *tmp, *subsys;
+char *subsys;
 Error *err = NULL;
 int i, ret;
 bool is_mdev;
 char uuid[UUID_STR_LEN];
-char *name;
+g_autofree char *name = NULL;
+g_autofree char *tmp = NULL;
 
 if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
 if (!(~vdev->host.domain || ~vdev->host.bus ||
@@ -2982,7 +2983,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  */
 tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
 subsys = realpath(tmp, NULL);
-g_free(tmp);
 is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
 free(subsys);
 
@@ -3003,7 +3003,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 ret = vfio_attach_device(name, vbasedev,
  pci_device_iommu_address_space(pdev), errp);
-g_free(name);
 if (ret) {
 goto error;
 }
-- 
2.34.1




[PATCH v2 02/11] vfio/pci: Use g_autofree in iommufd_cdev_get_info_iova_range()

2024-05-06 Thread Zhenzhong Duan
Local pointer info is freed before return from
iommufd_cdev_get_info_iova_range().

Use 'g_autofree' to avoid the g_free() calls.

Signed-off-by: Zhenzhong Duan 
---
 hw/vfio/iommufd.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 8827ffe636..c644127972 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -258,7 +258,7 @@ static int 
iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container,
 uint32_t ioas_id, Error **errp)
 {
 VFIOContainerBase *bcontainer = &container->bcontainer;
-struct iommu_ioas_iova_ranges *info;
+g_autofree struct iommu_ioas_iova_ranges *info = NULL;
 struct iommu_iova_range *iova_ranges;
 int ret, sz, fd = container->be->fd;
 
@@ -291,12 +291,10 @@ static int 
iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer *container,
 }
 bcontainer->pgsizes = info->out_iova_alignment;
 
-g_free(info);
 return 0;
 
 error:
 ret = -errno;
-g_free(info);
 error_setg_errno(errp, errno, "Cannot get IOVA ranges");
 return ret;
 }
-- 
2.34.1




[PATCH v2 04/11] vfio: Make VFIOIOMMUClass::setup() return bool

2024-05-06 Thread Zhenzhong Duan
This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
Reviewed-by: Cédric Le Goater 
---
 include/hw/vfio/vfio-container-base.h |  2 +-
 hw/vfio/container.c   | 10 +-
 hw/vfio/spapr.c   | 12 +---
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index c839cfd9cb..68539e3bed 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -111,7 +111,7 @@ struct VFIOIOMMUClass {
 InterfaceClass parent_class;
 
 /* basic feature */
-int (*setup)(VFIOContainerBase *bcontainer, Error **errp);
+bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
 int (*dma_map)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
void *vaddr, bool readonly);
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index ea3b145913..85a8a369dc 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -505,7 +505,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer 
*container,
 }
 }
 
-static int vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
+static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp)
 {
 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 bcontainer);
@@ -515,7 +515,7 @@ static int vfio_legacy_setup(VFIOContainerBase *bcontainer, 
Error **errp)
 ret = vfio_get_iommu_info(container, &info);
 if (ret) {
 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
-return ret;
+return false;
 }
 
 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
@@ -531,7 +531,7 @@ static int vfio_legacy_setup(VFIOContainerBase *bcontainer, 
Error **errp)
 vfio_get_info_iova_range(info, bcontainer);
 
 vfio_get_iommu_info_migration(container, info);
-return 0;
+return true;
 }
 
 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
@@ -633,8 +633,8 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 
 assert(bcontainer->ops->setup);
 
-ret = bcontainer->ops->setup(bcontainer, errp);
-if (ret) {
+if (!bcontainer->ops->setup(bcontainer, errp)) {
+ret = -EINVAL;
 goto enable_discards_exit;
 }
 
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 0d949bb728..148b257c9c 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -458,8 +458,8 @@ static void vfio_spapr_container_release(VFIOContainerBase 
*bcontainer)
 }
 }
 
-static int vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
-  Error **errp)
+static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
+   Error **errp)
 {
 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
 bcontainer);
@@ -480,7 +480,7 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
 if (ret) {
 error_setg_errno(errp, errno, "failed to enable container");
-return -errno;
+return false;
 }
 } else {
 scontainer->prereg_listener = vfio_prereg_listener;
@@ -488,7 +488,6 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 memory_listener_register(&scontainer->prereg_listener,
  &address_space_memory);
 if (bcontainer->error) {
-ret = -1;
 error_propagate_prepend(errp, bcontainer->error,
 "RAM memory listener initialization failed: ");
 goto listener_unregister_exit;
@@ -500,7 +499,6 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
 if (ret) {
 error_setg_errno(errp, errno,
  "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
-ret = -errno;
 goto listener_unregister_exit;
 }
 
@@ -527,13 +525,13 @@ static int vfio_spapr_container_setup(VFIOContainerBase 
*bcontainer,
   0x1000);
 }
 
-return 0;
+return true;
 
 listener_unregister_exit:
 if (v2) {
 memory_listener_unregister(&scontainer->prereg_listener);
 }
-return ret;
+return false;
 }
 
 static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data)
-- 
2.34.1




[PATCH v2 03/11] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool

2024-05-06 Thread Zhenzhong Duan
Make VFIOIOMMUClass::attach_device() and its wrapper function
vfio_attach_device() return bool.

This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
 include/hw/vfio/vfio-common.h |  4 ++--
 include/hw/vfio/vfio-container-base.h |  4 ++--
 hw/vfio/ap.c  |  6 ++
 hw/vfio/ccw.c |  6 ++
 hw/vfio/common.c  |  4 ++--
 hw/vfio/container.c   | 14 +++---
 hw/vfio/iommufd.c | 11 +--
 hw/vfio/pci.c |  5 ++---
 hw/vfio/platform.c|  7 +++
 9 files changed, 27 insertions(+), 34 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..a7b6fc8f46 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 struct vfio_device_info *vfio_get_device_info(int fd);
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp);
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp);
 void vfio_detach_device(VFIODevice *vbasedev);
 
 int vfio_kvm_device_add_fd(int fd, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 3582d5f97a..c839cfd9cb 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -118,8 +118,8 @@ struct VFIOIOMMUClass {
 int (*dma_unmap)(const VFIOContainerBase *bcontainer,
  hwaddr iova, ram_addr_t size,
  IOMMUTLBEntry *iotlb);
-int (*attach_device)(const char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp);
+bool (*attach_device)(const char *name, VFIODevice *vbasedev,
+  AddressSpace *as, Error **errp);
 void (*detach_device)(VFIODevice *vbasedev);
 /* migration feature */
 int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 7c4caa5938..d50600b702 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -156,7 +156,6 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice 
*vapdev,
 static void vfio_ap_realize(DeviceState *dev, Error **errp)
 {
 ERRP_GUARD();
-int ret;
 Error *err = NULL;
 VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
 VFIODevice *vbasedev = &vapdev->vdev;
@@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-ret = vfio_attach_device(vbasedev->name, vbasedev,
- &address_space_memory, errp);
-if (ret) {
+if (!vfio_attach_device(vbasedev->name, vbasedev,
+&address_space_memory, errp)) {
 goto error;
 }
 
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 90e4a53437..782bd4bed7 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
 VFIODevice *vbasedev = &vcdev->vdev;
 Error *err = NULL;
-int ret;
 
 /* Call the class init function for subchannel. */
 if (cdc->realize) {
@@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-ret = vfio_attach_device(cdev->mdevid, vbasedev,
- &address_space_memory, errp);
-if (ret) {
+if (!vfio_attach_device(cdev->mdevid, vbasedev,
+&address_space_memory, errp)) {
 goto out_attach_dev_err;
 }
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8f9cbdc026..890d30910e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1492,8 +1492,8 @@ retry:
 return info;
 }
 
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp)
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp)
 {
 const VFIOIOMMUClass *ops =
 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 77bdec276e..ea3b145913 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -908,8 +908,8 @@ static int vfio_device_groupid(VFIODevice *vbasedev, Error 
**errp)
  * @name and @vbasedev->name are likely to be different depending
  * on the type of the device, hence the need for passing @name
  */
-static int vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp)
+static bool vfio_legacy_

[PATCH v2 00/11] VFIO: misc cleanups

2024-05-06 Thread Zhenzhong Duan
Hi

This is a cleanup series to change functions in hw/vfio/ to return bool 
when the error is passed through errp parameter, also some cleanup
with g_autofree.

See discussion at 
https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg04782.html

This series processed below files:
hw/vfio/container.c
hw/vfio/iommufd.c
hw/vfio/cpr.c
backends/iommufd.c

So above files are clean now, there are still other files need processing
in hw/vfio.

Test done on x86 platform:
vfio device hotplug/unplug with different backend
reboot

Thanks
Zhenzhong

Changelog:
v2:
- split out g_autofree code as a patch (Cédric)
- add processing for more files

Zhenzhong Duan (11):
  vfio/pci: Use g_autofree in vfio_realize
  vfio/pci: Use g_autofree in iommufd_cdev_get_info_iova_range()
  vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool
  vfio: Make VFIOIOMMUClass::setup() return bool
  vfio: Make VFIOIOMMUClass::add_window() and its wrapper return bool
  vfio/container: Make vfio_connect_container() return bool
  vfio/container: Make vfio_set_iommu() return bool
  vfio/container: Make vfio_get_device() return bool
  vfio/iommufd: Make iommufd_cdev_*() return bool
  vfio/cpr: Make vfio_cpr_register_container() return bool
  backends/iommufd: Make iommufd_backend_*() return bool

 include/hw/vfio/vfio-common.h |   6 +-
 include/hw/vfio/vfio-container-base.h |  18 ++---
 include/sysemu/iommufd.h  |   6 +-
 backends/iommufd.c|  29 +++
 hw/vfio/ap.c  |   6 +-
 hw/vfio/ccw.c |   6 +-
 hw/vfio/common.c  |   6 +-
 hw/vfio/container-base.c  |   8 +-
 hw/vfio/container.c   |  81 +--
 hw/vfio/cpr.c |   4 +-
 hw/vfio/iommufd.c | 109 +++---
 hw/vfio/pci.c |  12 ++-
 hw/vfio/platform.c|   7 +-
 hw/vfio/spapr.c   |  28 +++
 backends/trace-events |   4 +-
 15 files changed, 147 insertions(+), 183 deletions(-)

-- 
2.34.1




Re: [PATCH 1/3] target/ppc: Move sync instructions to decodetree

2024-05-06 Thread Chinmay Rath




On 5/1/24 18:34, Nicholas Piggin wrote:

This tries to faithfully reproduce the odd BookE logic.

It does change the handling of non-zero reserved bits outside the
defined fields from being illegal to being ignored, which the
architecture specifies ot help with backward compatibility of new
fields. The existing behaviour causes illegal instruction exceptions
when using new POWER10 sync variants that add new fields, after this
the instructions are accepted and are implemented as supersets of
the new behaviour, as intended.

Signed-off-by: Nicholas Piggin 

Reviewed-by: Chinmay Rath 

---
  target/ppc/insn32.decode |   7 ++
  target/ppc/translate.c   | 102 +---
  target/ppc/translate/misc-impl.c.inc | 135 +++
  3 files changed, 144 insertions(+), 100 deletions(-)
  create mode 100644 target/ppc/translate/misc-impl.c.inc

diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index eada59f59f..6b89804b15 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -998,3 +998,10 @@ MSGSND  01 - - . 0011001110 -   
@X_rb
  MSGCLRP 01 - - . 0010101110 -   @X_rb
  MSGSNDP 01 - - . 0010001110 -   @X_rb
  MSGSYNC 01 - - - 1101110110 -
+
+# Memory Barrier Instructions
+
+&X_sync l
+@X_sync .. ... l:2 . . .. .   &X_sync
+SYNC01 --- ..  - - 1001010110 -   @X_sync
+EIEIO   01 - - - 1101010110 -
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 93ffec787c..bb2cabae10 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -3423,59 +3423,6 @@ static void gen_stswx(DisasContext *ctx)
  gen_helper_stsw(tcg_env, t0, t1, t2);
  }
  
-/***Memory synchronisation ***/

-/* eieio */
-static void gen_eieio(DisasContext *ctx)
-{
-TCGBar bar = TCG_MO_ALL;
-
-/*
- * eieio has complex semanitcs. It provides memory ordering between
- * operations in the set:
- * - loads from CI memory.
- * - stores to CI memory.
- * - stores to WT memory.
- *
- * It separately also orders memory for operations in the set:
- * - stores to cacheble memory.
- *
- * It also serializes instructions:
- * - dcbt and dcbst.
- *
- * It separately serializes:
- * - tlbie and tlbsync.
- *
- * And separately serializes:
- * - slbieg, slbiag, and slbsync.
- *
- * The end result is that CI memory ordering requires TCG_MO_ALL
- * and it is not possible to special-case more relaxed ordering for
- * cacheable accesses. TCG_BAR_SC is required to provide this
- * serialization.
- */
-
-/*
- * POWER9 has a eieio instruction variant using bit 6 as a hint to
- * tell the CPU it is a store-forwarding barrier.
- */
-if (ctx->opcode & 0x200) {
-/*
- * ISA says that "Reserved fields in instructions are ignored
- * by the processor". So ignore the bit 6 on non-POWER9 CPU but
- * as this is not an instruction software should be using,
- * complain to the user.
- */
-if (!(ctx->insns_flags2 & PPC2_ISA300)) {
-qemu_log_mask(LOG_GUEST_ERROR, "invalid eieio using bit 6 at @"
-  TARGET_FMT_lx "\n", ctx->cia);
-} else {
-bar = TCG_MO_ST_LD;
-}
-}
-
-tcg_gen_mb(bar | TCG_BAR_SC);
-}
-
  #if !defined(CONFIG_USER_ONLY)
  static inline void gen_check_tlb_flush(DisasContext *ctx, bool global)
  {
@@ -3877,31 +3824,6 @@ static void gen_stqcx_(DisasContext *ctx)
  }
  #endif /* defined(TARGET_PPC64) */
  
-/* sync */

-static void gen_sync(DisasContext *ctx)
-{
-TCGBar bar = TCG_MO_ALL;
-uint32_t l = (ctx->opcode >> 21) & 3;
-
-if ((l == 1) && (ctx->insns_flags2 & PPC2_MEM_LWSYNC)) {
-bar = TCG_MO_LD_LD | TCG_MO_LD_ST | TCG_MO_ST_ST;
-}
-
-/*
- * We may need to check for a pending TLB flush.
- *
- * We do this on ptesync (l == 2) on ppc64 and any sync pn ppc32.
- *
- * Additionally, this can only happen in kernel mode however so
- * check MSR_PR as well.
- */
-if (((l == 2) || !(ctx->insns_flags & PPC_64B)) && !ctx->pr) {
-gen_check_tlb_flush(ctx, true);
-}
-
-tcg_gen_mb(bar | TCG_BAR_SC);
-}
-
  /* wait */
  static void gen_wait(DisasContext *ctx)
  {
@@ -6010,23 +5932,6 @@ static void gen_dlmzb(DisasContext *ctx)
   cpu_gpr[rS(ctx->opcode)], cpu_gpr[rB(ctx->opcode)], t0);
  }
  
-/* mbar replaces eieio on 440 */

-static void gen_mbar(DisasContext *ctx)
-{
-/* interpreted as no-op */
-}
-
-/* msync replaces sync on 440 */
-static void gen_msync_4xx(DisasContext *ctx)
-{
-/* Only e500 seems to treat reserved bits as invalid */
-if ((ctx->insns_flags2 & PPC2_BOOKE206) &&
-  

RE: [PATCH v3 05/19] backends/host_iommu_device: Introduce HostIOMMUDeviceCaps

2024-05-06 Thread Duan, Zhenzhong


>-Original Message-
>From: Cédric Le Goater 
>Subject: Re: [PATCH v3 05/19] backends/host_iommu_device: Introduce
>HostIOMMUDeviceCaps
>
>Hello Zhenzhong,
>
>On 4/29/24 08:50, Zhenzhong Duan wrote:
>> HostIOMMUDeviceCaps's elements map to the host IOMMU's capabilities.
>> Different platform IOMMU can support different elements.
>>
>> Currently only two elements, type and aw_bits, type hints the host
>> platform IOMMU type, i.e., INTEL vtd, ARM smmu, etc; aw_bits hints
>> host IOMMU address width.
>>
>> Introduce .check_cap() handler to check if
>HOST_IOMMU_DEVICE_CAP_XXX
>> is supported.
>>
>> Introduce a HostIOMMUDevice API host_iommu_device_check_cap()
>which
>> is a wrapper of .check_cap().
>>
>> Introduce a HostIOMMUDevice API
>host_iommu_device_check_cap_common()
>> to check common capabalities of different host platform IOMMUs.
>>
>> Suggested-by: Cédric Le Goater 
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>   include/sysemu/host_iommu_device.h | 44
>++
>>   backends/host_iommu_device.c   | 29 
>>   2 files changed, 73 insertions(+)
>>
>> diff --git a/include/sysemu/host_iommu_device.h
>b/include/sysemu/host_iommu_device.h
>> index 2b58a94d62..12b6afb463 100644
>> --- a/include/sysemu/host_iommu_device.h
>> +++ b/include/sysemu/host_iommu_device.h
>> @@ -14,12 +14,27 @@
>>
>>   #include "qom/object.h"
>>   #include "qapi/error.h"
>> +#include "linux/iommufd.h"
>
>
>Please use instead :
>
>#include 

Got it.

Thanks
Zhenzhong


Re: [PATCH v3 05/19] backends/host_iommu_device: Introduce HostIOMMUDeviceCaps

2024-05-06 Thread Cédric Le Goater

Hello Zhenzhong,

On 4/29/24 08:50, Zhenzhong Duan wrote:

HostIOMMUDeviceCaps's elements map to the host IOMMU's capabilities.
Different platform IOMMU can support different elements.

Currently only two elements, type and aw_bits, type hints the host
platform IOMMU type, i.e., INTEL vtd, ARM smmu, etc; aw_bits hints
host IOMMU address width.

Introduce .check_cap() handler to check if HOST_IOMMU_DEVICE_CAP_XXX
is supported.

Introduce a HostIOMMUDevice API host_iommu_device_check_cap() which
is a wrapper of .check_cap().

Introduce a HostIOMMUDevice API host_iommu_device_check_cap_common()
to check common capabalities of different host platform IOMMUs.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
  include/sysemu/host_iommu_device.h | 44 ++
  backends/host_iommu_device.c   | 29 
  2 files changed, 73 insertions(+)

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
index 2b58a94d62..12b6afb463 100644
--- a/include/sysemu/host_iommu_device.h
+++ b/include/sysemu/host_iommu_device.h
@@ -14,12 +14,27 @@
  
  #include "qom/object.h"

  #include "qapi/error.h"
+#include "linux/iommufd.h"



Please use instead :

   #include 


Thanks,

C.



+
+/**
+ * struct HostIOMMUDeviceCaps - Define host IOMMU device capabilities.
+ *
+ * @type: host platform IOMMU type.
+ *
+ * @aw_bits: host IOMMU address width. 0xff if no limitation.
+ */
+typedef struct HostIOMMUDeviceCaps {
+enum iommu_hw_info_type type;
+uint8_t aw_bits;
+} HostIOMMUDeviceCaps;
  
  #define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"

  OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass, HOST_IOMMU_DEVICE)
  
  struct HostIOMMUDevice {

  Object parent_obj;
+
+HostIOMMUDeviceCaps caps;
  };
  
  /**

@@ -47,5 +62,34 @@ struct HostIOMMUDeviceClass {
   * Returns: true on success, false on failure.
   */
  bool (*realize)(HostIOMMUDevice *hiod, void *opaque, Error **errp);
+/**
+ * @check_cap: check if a host IOMMU device capability is supported.
+ *
+ * Optional callback, if not implemented, hint not supporting query
+ * of @cap.
+ *
+ * @hiod: pointer to a host IOMMU device instance.
+ *
+ * @cap: capability to check.
+ *
+ * @errp: pass an Error out when fails to query capability.
+ *
+ * Returns: <0 on failure, 0 if a @cap is unsupported, or else
+ * 1 or some positive value for some special @cap,
+ * i.e., HOST_IOMMU_DEVICE_CAP_AW_BITS.
+ */
+int (*check_cap)(HostIOMMUDevice *hiod, int cap, Error **errp);
  };
+
+/*
+ * Host IOMMU device capability list.
+ */
+#define HOST_IOMMU_DEVICE_CAP_IOMMUFD   0
+#define HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE1
+#define HOST_IOMMU_DEVICE_CAP_AW_BITS   2
+
+
+int host_iommu_device_check_cap(HostIOMMUDevice *hiod, int cap, Error **errp);
+int host_iommu_device_check_cap_common(HostIOMMUDevice *hiod, int cap,
+   Error **errp);
  #endif
diff --git a/backends/host_iommu_device.c b/backends/host_iommu_device.c
index 41f2fdce20..b97d008cc7 100644
--- a/backends/host_iommu_device.c
+++ b/backends/host_iommu_device.c
@@ -28,3 +28,32 @@ static void host_iommu_device_init(Object *obj)
  static void host_iommu_device_finalize(Object *obj)
  {
  }
+
+/* Wrapper of HostIOMMUDeviceClass:check_cap */
+int host_iommu_device_check_cap(HostIOMMUDevice *hiod, int cap, Error **errp)
+{
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
+if (!hiodc->check_cap) {
+error_setg(errp, ".check_cap() not implemented");
+return -EINVAL;
+}
+
+return hiodc->check_cap(hiod, cap, errp);
+}
+
+/* Implement check on common IOMMU capabilities */
+int host_iommu_device_check_cap_common(HostIOMMUDevice *hiod, int cap,
+   Error **errp)
+{
+HostIOMMUDeviceCaps *caps = &hiod->caps;
+
+switch (cap) {
+case HOST_IOMMU_DEVICE_CAP_IOMMU_TYPE:
+return caps->type;
+case HOST_IOMMU_DEVICE_CAP_AW_BITS:
+return caps->aw_bits;
+default:
+error_setg(errp, "Not support query cap %x", cap);
+return -EINVAL;
+}
+}





Re: [PATCH v3 1/5] hw/loongarch: Rename LOONGARCH_MACHINE with VIRT_MACHINE

2024-05-06 Thread Thomas Huth

On 07/05/2024 03.18, maobibo wrote:



On 2024/5/6 下午2:09, maobibo wrote:



On 2024/5/6 下午12:24, Thomas Huth wrote:

On 06/05/2024 05.02, Bibo Mao wrote:

On LoongArch system, there is only virt machine type now, name
LOONGARCH_MACHINE is confused, rename it with VIRT_MACHINE. Machine name
about Other real hw boards can be added in future.

Signed-off-by: Bibo Mao 
---

...
@@ -1245,7 +1244,7 @@ static void loongarch_class_init(ObjectClass *oc, 
void *data)

  static const TypeInfo loongarch_machine_types[] = {
  {
-    .name   = TYPE_LOONGARCH_MACHINE,
+    .name   = TYPE_VIRT_MACHINE,
  .parent = TYPE_MACHINE,
  .instance_size  = sizeof(LoongArchMachineState),
  .class_init = loongarch_class_init,
diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h
index 4e14bf6060..5ea2f0370d 100644
--- a/include/hw/loongarch/virt.h
+++ b/include/hw/loongarch/virt.h
@@ -73,8 +73,8 @@ struct LoongArchMachineState {
  struct loongarch_boot_info bootinfo;
  };
-#define TYPE_LOONGARCH_MACHINE  MACHINE_TYPE_NAME("virt")
-OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_MACHINE)
+#define TYPE_VIRT_MACHINE  MACHINE_TYPE_NAME("virt")
+OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, VIRT_MACHINE)
  bool loongarch_is_acpi_enabled(LoongArchMachineState *lams);
  void loongarch_acpi_setup(LoongArchMachineState *lams);
  #endif


  Hi,

there are currently some efforts going on to create the possibility to 
link a QEMU binary that contains all targets in one binary. Since we 
already have a TYPE_VIRT_MACHINE for other targets, I wonder whether it 
might be better to use LOONGARCH_VIRT_MACHINE than just VIRT_MACHINE 
here? Philippe, could you comment on this?


It is great if there is one QEMU binary which supports different targets. 
And LOONGARCH_VIRT_MACHINE is ok for me.

Hi Thomas, Philippe,

Does machine name "virt" need be changed if LOONGARCH_VIRT_MACHINE is used? 
There will be compatible issues if "virt" machine type is not suggested to use.


However CPU type "max" is not widely used now, can we get different 
architectures from CPU type rather than machine type for one QEMU binary 
which supports different targets?


I assume it should be fine to keep the "virt" machine name and "max" CPU 
type for each target, we've got a bunch of those already. I assume we'll 
keep the binary names as symlinks to the generic binary around and then 
decide via argv[0] about the main target...? Philippe, do you have already 
concrete plans for this?


 Thomas





RE: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool

2024-05-06 Thread Duan, Zhenzhong


>-Original Message-
>From: Cédric Le Goater 
>Subject: Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and
>its wrapper return bool
>
>On 5/7/24 04:09, Duan, Zhenzhong wrote:
>>
>>
>>> -Original Message-
>>> From: Cédric Le Goater 
>>> Subject: Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device()
>and
>>> its wrapper return bool
>>>
>>> On 5/6/24 10:33, Zhenzhong Duan wrote:
 Make VFIOIOMMUClass::attach_device() and its wrapper function
 vfio_attach_device() return bool.

 This is to follow the coding standand to return bool if 'Error **'
 is used to pass error.

 Suggested-by: Cédric Le Goater 
 Signed-off-by: Zhenzhong Duan 
 ---
include/hw/vfio/vfio-common.h |  4 ++--
include/hw/vfio/vfio-container-base.h |  4 ++--
hw/vfio/ap.c  |  6 ++
hw/vfio/ccw.c |  6 ++
hw/vfio/common.c  |  4 ++--
hw/vfio/container.c   | 14 +++---
hw/vfio/iommufd.c | 11 +--
hw/vfio/pci.c |  8 +++-
hw/vfio/platform.c|  7 +++
9 files changed, 28 insertions(+), 36 deletions(-)

 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>>> common.h
 index b9da6c08ef..a7b6fc8f46 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
 @@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region);
void vfio_region_finalize(VFIORegion *region);
void vfio_reset_handler(void *opaque);
struct vfio_device_info *vfio_get_device_info(int fd);
 -int vfio_attach_device(char *name, VFIODevice *vbasedev,
 -   AddressSpace *as, Error **errp);
 +bool vfio_attach_device(char *name, VFIODevice *vbasedev,
 +AddressSpace *as, Error **errp);
void vfio_detach_device(VFIODevice *vbasedev);

int vfio_kvm_device_add_fd(int fd, Error **errp);
 diff --git a/include/hw/vfio/vfio-container-base.h
>b/include/hw/vfio/vfio-
>>> container-base.h
 index 3582d5f97a..c839cfd9cb 100644
 --- a/include/hw/vfio/vfio-container-base.h
 +++ b/include/hw/vfio/vfio-container-base.h
 @@ -118,8 +118,8 @@ struct VFIOIOMMUClass {
int (*dma_unmap)(const VFIOContainerBase *bcontainer,
 hwaddr iova, ram_addr_t size,
 IOMMUTLBEntry *iotlb);
 -int (*attach_device)(const char *name, VFIODevice *vbasedev,
 - AddressSpace *as, Error **errp);
 +bool (*attach_device)(const char *name, VFIODevice *vbasedev,
 +  AddressSpace *as, Error **errp);
void (*detach_device)(VFIODevice *vbasedev);
/* migration feature */
int (*set_dirty_page_tracking)(const VFIOContainerBase
>*bcontainer,
 diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
 index 7c4caa5938..d50600b702 100644
 --- a/hw/vfio/ap.c
 +++ b/hw/vfio/ap.c
 @@ -156,7 +156,6 @@ static void
>>> vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,
static void vfio_ap_realize(DeviceState *dev, Error **errp)
{
ERRP_GUARD();
 -int ret;
Error *err = NULL;
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
VFIODevice *vbasedev = &vapdev->vdev;
 @@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev,
>Error
>>> **errp)
return;
}

 -ret = vfio_attach_device(vbasedev->name, vbasedev,
 - &address_space_memory, errp);
 -if (ret) {
 +if (!vfio_attach_device(vbasedev->name, vbasedev,
 +&address_space_memory, errp)) {
goto error;
}

 diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
 index 90e4a53437..782bd4bed7 100644
 --- a/hw/vfio/ccw.c
 +++ b/hw/vfio/ccw.c
 @@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev,
>>> Error **errp)
S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
VFIODevice *vbasedev = &vcdev->vdev;
Error *err = NULL;
 -int ret;

/* Call the class init function for subchannel. */
if (cdc->realize) {
 @@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev,
>>> Error **errp)
return;
}

 -ret = vfio_attach_device(cdev->mdevid, vbasedev,
 - &address_space_memory, errp);
 -if (ret) {
 +if (!vfio_attach_device(cdev->mdevid, vbasedev,
 +&address_space_memory, errp)) {
goto out_attach_dev_err;
}

 diff --git a/hw/vfio/common.c b/hw/vfio/common.c
 index

Re: [PATCH] gitlab: Rename ubuntu-22.04-s390x-all to *-system

2024-05-06 Thread Thomas Huth

On 06/05/2024 22.23, Richard Henderson wrote:

We already build the linux-user binaries with
ubuntu-22.04-s390x-all-linux, so there's no need to do it again.

Signed-off-by: Richard Henderson 
---
  .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
index 3a2b1e1d24..c3f16d77bb 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
@@ -21,7 +21,7 @@ ubuntu-22.04-s390x-all-linux:
   - make --output-sync check-tcg
   - make --output-sync -j`nproc` check
  
-ubuntu-22.04-s390x-all:

+ubuntu-22.04-s390x-all-system:
   extends: .custom_runner_template
   needs: []
   stage: build
@@ -35,7 +35,7 @@ ubuntu-22.04-s390x-all:
   script:
   - mkdir build
   - cd build
- - ../configure --disable-libssh
+ - ../configure --disable-libssh --disable-user


While you're at it, you could also drop the --disable-libssh now (it was a 
work-around for a bug in Ubuntu 18.04 IIRC).


Reviewed-by: Thomas Huth 




Re: [PATCH] gitlab: Drop --static from s390x linux-user build

2024-05-06 Thread Thomas Huth

On 06/05/2024 22.20, Richard Henderson wrote:

The host does not have the correct libraries installed for static pie,
which causes host/guest address space interference for some tests.
There's no real gain from linking statically, so drop it.

Signed-off-by: Richard Henderson 
---
Per my suggestion in

https://lore.kernel.org/qemu-devel/50c27a9f-fd75-4f8e-9a2d-488d8df4f...@linaro.org


r~
---
  .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
index 105981879f..3a2b1e1d24 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
@@ -2,7 +2,7 @@
  # setup by the scripts/ci/setup/build-environment.yml task
  # "Install basic packages to build QEMU on Ubuntu 22.04"
  
-ubuntu-22.04-s390x-all-linux-static:

+ubuntu-22.04-s390x-all-linux:
   extends: .custom_runner_template
   needs: []
   stage: build
@@ -15,7 +15,7 @@ ubuntu-22.04-s390x-all-linux-static:
   script:
   - mkdir build
   - cd build
- - ../configure --enable-debug --static --disable-system
+ - ../configure --enable-debug-tcg --disable-system --disable-tools 
--disable-docs


Maybe mention the --disable-tools and --disable-docs in the commit message, too?

Anyway:
Reviewed-by: Thomas Huth 





Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool

2024-05-06 Thread Cédric Le Goater

On 5/7/24 04:09, Duan, Zhenzhong wrote:




-Original Message-
From: Cédric Le Goater 
Subject: Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and
its wrapper return bool

On 5/6/24 10:33, Zhenzhong Duan wrote:

Make VFIOIOMMUClass::attach_device() and its wrapper function
vfio_attach_device() return bool.

This is to follow the coding standand to return bool if 'Error **'
is used to pass error.

Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
   include/hw/vfio/vfio-common.h |  4 ++--
   include/hw/vfio/vfio-container-base.h |  4 ++--
   hw/vfio/ap.c  |  6 ++
   hw/vfio/ccw.c |  6 ++
   hw/vfio/common.c  |  4 ++--
   hw/vfio/container.c   | 14 +++---
   hw/vfio/iommufd.c | 11 +--
   hw/vfio/pci.c |  8 +++-
   hw/vfio/platform.c|  7 +++
   9 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-

common.h

index b9da6c08ef..a7b6fc8f46 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region);
   void vfio_region_finalize(VFIORegion *region);
   void vfio_reset_handler(void *opaque);
   struct vfio_device_info *vfio_get_device_info(int fd);
-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp);
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp);
   void vfio_detach_device(VFIODevice *vbasedev);

   int vfio_kvm_device_add_fd(int fd, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-

container-base.h

index 3582d5f97a..c839cfd9cb 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -118,8 +118,8 @@ struct VFIOIOMMUClass {
   int (*dma_unmap)(const VFIOContainerBase *bcontainer,
hwaddr iova, ram_addr_t size,
IOMMUTLBEntry *iotlb);
-int (*attach_device)(const char *name, VFIODevice *vbasedev,
- AddressSpace *as, Error **errp);
+bool (*attach_device)(const char *name, VFIODevice *vbasedev,
+  AddressSpace *as, Error **errp);
   void (*detach_device)(VFIODevice *vbasedev);
   /* migration feature */
   int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 7c4caa5938..d50600b702 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -156,7 +156,6 @@ static void

vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,

   static void vfio_ap_realize(DeviceState *dev, Error **errp)
   {
   ERRP_GUARD();
-int ret;
   Error *err = NULL;
   VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
   VFIODevice *vbasedev = &vapdev->vdev;
@@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error

**errp)

   return;
   }

-ret = vfio_attach_device(vbasedev->name, vbasedev,
- &address_space_memory, errp);
-if (ret) {
+if (!vfio_attach_device(vbasedev->name, vbasedev,
+&address_space_memory, errp)) {
   goto error;
   }

diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 90e4a53437..782bd4bed7 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev,

Error **errp)

   S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
   VFIODevice *vbasedev = &vcdev->vdev;
   Error *err = NULL;
-int ret;

   /* Call the class init function for subchannel. */
   if (cdc->realize) {
@@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev,

Error **errp)

   return;
   }

-ret = vfio_attach_device(cdev->mdevid, vbasedev,
- &address_space_memory, errp);
-if (ret) {
+if (!vfio_attach_device(cdev->mdevid, vbasedev,
+&address_space_memory, errp)) {
   goto out_attach_dev_err;
   }

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8f9cbdc026..890d30910e 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1492,8 +1492,8 @@ retry:
   return info;
   }

-int vfio_attach_device(char *name, VFIODevice *vbasedev,
-   AddressSpace *as, Error **errp)
+bool vfio_attach_device(char *name, VFIODevice *vbasedev,
+AddressSpace *as, Error **errp)
   {
   const VFIOIOMMUClass *ops =


VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY));


I think vfio_attach_device() can be cleaned up a little further :

ret = ops->attach_device(name, vbasedev, as, errp);
 if (ret < 0) {
 return ret;
 }


Not understand this.
I have both ops->attac

[PATCH v2 0/3] Upgrade ACPI SPCR table to support SPCR table version 4 format

2024-05-06 Thread Sia Jee Heng
Update the SPCR table to accommodate the SPCR Table version 4 [1].
The SPCR table has been modified to adhere to the version 4 format [2].

Meanwhile, the virt SPCR golden reference files have been updated to
accommodate the SPCR Table version 4.

This patch series depends on Sunil's patch series [3], where Bios-Table-Test
is now supported by both ARM and RISC-V.

[1]: 
https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table
[2]: https://github.com/acpica/acpica/pull/931
[3]: 
https://lore.kernel.org/all/20240315130519.2378765-1-suni...@ventanamicro.com/

Changes in v2:
- Utilizes a three-patch approach to modify the ACPI pre-built binary
  files required by the Bios-Table-Test.
- Rebases and incorporates changes to support both ARM and RISC-V ACPI
  pre-built binary files.

Sia Jee Heng (3):
  qtest: allow SPCR acpi table changes
  hw/acpi: Upgrade ACPI SPCR table to support SPCR table version 4
format
  tests/qtest/bios-tables-test: Update virt SPCR golden references

 hw/acpi/aml-build.c   |  14 +++---
 hw/arm/virt-acpi-build.c  |  10 --
 hw/riscv/virt-acpi-build.c|  12 +---
 include/hw/acpi/acpi-defs.h   |   7 +--
 include/hw/acpi/aml-build.h   |   2 +-
 tests/data/acpi/virt/aarch64/SPCR | Bin 80 -> 90 bytes
 tests/data/acpi/virt/riscv64/SPCR | Bin 80 -> 90 bytes
 7 files changed, 34 insertions(+), 11 deletions(-)

-- 
2.34.1




[PATCH v2 3/3] tests/qtest/bios-tables-test: Update virt SPCR golden references

2024-05-06 Thread Sia Jee Heng
Update the virt SPCR golden reference files to accommodate the
SPCR Table version 4 [1], utilizing the iasl binary compiled from the
latest ACPICA repository. The SPCR table has been modified to
adhere to the version 4 format [2].

[1]: 
https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table
[2]: https://github.com/acpica/acpica/pull/931

Diffs from iasl:

 /*
  * Intel ACPI Component Architecture
  * AML/ASL+ Disassembler version 20240322 (64-bit version)
  * Copyright (c) 2000 - 2023 Intel Corporation
  *
- * Disassembly of tests/data/acpi/virt/aarch64/SPCR
+ * Disassembly of /tmp/aml-DVTAN2
  *
  * ACPI Data Table [SPCR]
  *
  * Format: [HexOffset DecimalOffset ByteLength]  FieldName : FieldValue (in 
hex)
  */

 [000h  004h]   Signature : "SPCR"[Serial Port Console 
Redirection Table]
-[004h 0004 004h]Table Length : 0050
-[008h 0008 001h]Revision : 02
-[009h 0009 001h]Checksum : B1
+[004h 0004 004h]Table Length : 005A
+[008h 0008 001h]Revision : 04
+[009h 0009 001h]Checksum : 1D
 [00Ah 0010 006h]  Oem ID : "BOCHS "
 [010h 0016 008h]Oem Table ID : "BXPC"
 [018h 0024 004h]Oem Revision : 0001
 [01Ch 0028 004h] Asl Compiler ID : "BXPC"
 [020h 0032 004h]   Asl Compiler Revision : 0001

 [024h 0036 001h]  Interface Type : 03
 [025h 0037 003h]Reserved : 00

 [028h 0040 00Ch]Serial Port Register : [Generic Address Structure]
 [028h 0040 001h]Space ID : 00 [SystemMemory]
 [029h 0041 001h]   Bit Width : 20
 [02Ah 0042 001h]  Bit Offset : 00
 [02Bh 0043 001h]Encoded Access Width : 03 [DWord Access:32]
 [02Ch 0044 008h] Address : 0900

 [035h 0053 001h] PCAT-compatible IRQ : 00
 [036h 0054 004h]   Interrupt : 0021
 [03Ah 0058 001h]   Baud Rate : 03
 [03Bh 0059 001h]  Parity : 00
 [03Ch 0060 001h]   Stop Bits : 01
 [03Dh 0061 001h]Flow Control : 02
 [03Eh 0062 001h]   Terminal Type : 00
 [03Fh 0063 001h]Language : 00
 [040h 0064 002h]   PCI Device ID : 
 [042h 0066 002h]   PCI Vendor ID : 
 [044h 0068 001h] PCI Bus : 00
 [045h 0069 001h]  PCI Device : 00
 [046h 0070 001h]PCI Function : 00
 [047h 0071 004h]   PCI Flags : 
 [04Bh 0075 001h] PCI Segment : 00
 [04Ch 0076 004h] Uart Clock Freq : 
-/ ACPI table terminates in the middle of a data structure! (dump table)
-CurrentOffset: 50, TableLength: 50 ***/
\ No newline at end of file
+[050h 0080 004h]   Precise Baud rate : 
+[054h 0084 002h]   NameSpaceStringLength : 0002
+[056h 0086 002h]   NameSpaceStringOffset : 0058
+[058h 0088 002h] NamespaceString : "."
+
+Raw Table Data: Length 90 (0x5A)
+
+: 53 50 43 52 5A 00 00 00 04 1D 42 4F 43 48 53 20  // SPCRZ.BOCHS
+0010: 42 58 50 43 20 20 20 20 01 00 00 00 42 58 50 43  // BXPCBXPC
+0020: 01 00 00 00 03 00 00 00 00 20 00 03 00 00 00 09  // . ..
+0030: 00 00 00 00 08 00 21 00 00 00 03 00 01 02 00 00  // ..!.
+0040: FF FF FF FF 00 00 00 00 00 00 00 00 00 00 00 00  // 
+0050: 00 00 00 00 02 00 58 00 2E 00// ..X...

 /*
  * Intel ACPI Component Architecture
  * AML/ASL+ Disassembler version 20200925 (64-bit version)
  * Copyright (c) 2000 - 2020 Intel Corporation
  *
- * Disassembly of tests/data/acpi/virt/riscv64/SPCR, Mon May  6 05:34:22 2024
+ * Disassembly of /tmp/aml-27E8M2, Mon May  6 05:34:22 2024
  *
  * ACPI Data Table [SPCR]
  *
  * Format: [HexOffset DecimalOffset ByteLength]  FieldName : FieldValue
  */

 [000h    4]Signature : "SPCR"[Serial Port Console 
Redirection table]
-[004h 0004   4] Table Length : 0050
-[008h 0008   1] Revision : 02
-[009h 0009   1] Checksum : B9
+[004h 0004   4] Table Length : 005A
+[008h 0008   1] Revision : 04
+[009h 0009   1] Checksum : 25
 [00Ah 0010   6]   Oem ID : "BOCHS "
 [010h 0016   8] Oem Table ID : "BXPC"
 [018h 0024   4] Oem Revision : 0001
 [01Ch 0028   4]  Asl Compiler ID : "BXPC"
 [020h 0032   4]Asl Compiler Revision : 0001

 [024h 0036   1]   Interface Type : 00
 [025h 0037   3] Reserved : 00

 [028h 0040  12] Serial Port Register : [Generic Address Structure]
 [028h 0040   1] 

[PATCH v2 1/3] qtest: allow SPCR acpi table changes

2024-05-06 Thread Sia Jee Heng
Signed-off-by: Sia Jee Heng 
---
 tests/qtest/bios-tables-test-allowed-diff.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h 
b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523c8b..3f12ca546b 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,3 @@
 /* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/virt/riscv64/SPCR",
+"tests/data/acpi/virt/aarch64/SPCR",
-- 
2.34.1




[PATCH v2 2/3] hw/acpi: Upgrade ACPI SPCR table to support SPCR table version 4 format

2024-05-06 Thread Sia Jee Heng
Update the SPCR table to accommodate the SPCR Table version 4 [1].
The SPCR table has been modified to adhere to the version 4 format [2].

[1]: 
https://learn.microsoft.com/en-us/windows-hardware/drivers/serports/serial-port-console-redirection-table
[2]: https://github.com/acpica/acpica/pull/931

Signed-off-by: Sia Jee Heng 
---
 hw/acpi/aml-build.c | 14 +++---
 hw/arm/virt-acpi-build.c| 10 --
 hw/riscv/virt-acpi-build.c  | 12 +---
 include/hw/acpi/acpi-defs.h |  7 +--
 include/hw/acpi/aml-build.h |  2 +-
 5 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 6d4517cfbe..7c43573eef 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1996,7 +1996,7 @@ static void build_processor_hierarchy_node(GArray *tbl, 
uint32_t flags,
 
 void build_spcr(GArray *table_data, BIOSLinker *linker,
 const AcpiSpcrData *f, const uint8_t rev,
-const char *oem_id, const char *oem_table_id)
+const char *oem_id, const char *oem_table_id, const char *name)
 {
 AcpiTable table = { .sig = "SPCR", .rev = rev, .oem_id = oem_id,
 .oem_table_id = oem_table_id };
@@ -2042,8 +2042,16 @@ void build_spcr(GArray *table_data, BIOSLinker *linker,
 build_append_int_noprefix(table_data, f->pci_flags, 4);
 /* PCI Segment */
 build_append_int_noprefix(table_data, f->pci_segment, 1);
-/* Reserved */
-build_append_int_noprefix(table_data, 0, 4);
+/* UartClkFreq */
+build_append_int_noprefix(table_data, f->uart_clk_freq, 4);
+/* PreciseBaudrate */
+build_append_int_noprefix(table_data, f->precise_baudrate, 4);
+/* NameSpaceStringLength */
+build_append_int_noprefix(table_data, f->namespace_string_length, 2);
+/* NameSpaceStringOffset */
+build_append_int_noprefix(table_data, f->namespace_string_offset, 2);
+/* NamespaceString[] */
+g_array_append_vals(table_data, name, f->namespace_string_length);
 
 acpi_table_end(linker, &table);
 }
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 6a1bde61ce..cb345e8659 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -428,11 +428,12 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 
 /*
  * Serial Port Console Redirection Table (SPCR)
- * Rev: 1.07
+ * Rev: 1.10
  */
 static void
 spcr_setup(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
 {
+const char name[] = ".";
 AcpiSpcrData serial = {
 .interface_type = 3,   /* ARM PL011 UART */
 .base_addr.id = AML_AS_SYSTEM_MEMORY,
@@ -456,9 +457,14 @@ spcr_setup(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 .pci_function = 0,
 .pci_flags = 0,
 .pci_segment = 0,
+.uart_clk_freq = 0,
+.precise_baudrate = 0,
+.namespace_string_length = sizeof(name),
+.namespace_string_offset = 88,
 };
 
-build_spcr(table_data, linker, &serial, 2, vms->oem_id, vms->oem_table_id);
+build_spcr(table_data, linker, &serial, 4, vms->oem_id, vms->oem_table_id,
+   name);
 }
 
 /*
diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c
index 0925528160..5fa3942491 100644
--- a/hw/riscv/virt-acpi-build.c
+++ b/hw/riscv/virt-acpi-build.c
@@ -176,14 +176,15 @@ acpi_dsdt_add_uart(Aml *scope, const MemMapEntry 
*uart_memmap,
 
 /*
  * Serial Port Console Redirection Table (SPCR)
- * Rev: 1.07
+ * Rev: 1.10
  */
 
 static void
 spcr_setup(GArray *table_data, BIOSLinker *linker, RISCVVirtState *s)
 {
+const char name[] = ".";
 AcpiSpcrData serial = {
-.interface_type = 0,   /* 16550 compatible */
+.interface_type = 0x12,   /* 16550 compatible */
 .base_addr.id = AML_AS_SYSTEM_MEMORY,
 .base_addr.width = 32,
 .base_addr.offset = 0,
@@ -205,9 +206,14 @@ spcr_setup(GArray *table_data, BIOSLinker *linker, 
RISCVVirtState *s)
 .pci_function = 0,
 .pci_flags = 0,
 .pci_segment = 0,
+.uart_clk_freq = 0,
+.precise_baudrate = 0,
+.namespace_string_length = sizeof(name),
+.namespace_string_offset = 88,
 };
 
-build_spcr(table_data, linker, &serial, 2, s->oem_id, s->oem_table_id);
+build_spcr(table_data, linker, &serial, 4, s->oem_id, s->oem_table_id,
+   name);
 }
 
 /* RHCT Node[N] starts at offset 56 */
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index 0e6e82b339..2e6e341998 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -112,7 +112,6 @@ typedef struct AcpiSpcrData {
 uint8_t flow_control;
 uint8_t terminal_type;
 uint8_t language;
-uint8_t reserved1;
 uint16_t pci_device_id;/* Must be 0x if not PCI device */
 uint16_t pci_vendor_id;/* Must be 0x if not PCI device */
 uint8_t pci_bus;
@@ -120,7 +119,11 @@ ty

Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-05-06 Thread Jinpu Wang
Hi Peter, hi Daniel,
On Mon, May 6, 2024 at 5:29 PM Peter Xu  wrote:
>
> On Mon, May 06, 2024 at 12:08:43PM +0200, Jinpu Wang wrote:
> > Hi Peter, hi Daniel,
>
> Hi, Jinpu,
>
> Thanks for sharing this test results.  Sounds like a great news.
>
> What's your plan next?  Would it then be worthwhile / possible moving QEMU
> into that direction?  Would that greatly simplify rdma code as Dan
> mentioned?
I'm rather not familiar with QEMU migration yet,  from the test
result, I think it's a possible direction,
just we need to at least based on a rather recent release like
rdma-core v33 with proper 'fork' support.

Maybe Dan or you could give more detail about what you have in mind
for using rsocket as a replacement for the future.
We will also look into the implementation details in the meantime.

Thx!
J

>
> Thanks,
>
> >
> > On Fri, May 3, 2024 at 4:33 PM Peter Xu  wrote:
> > >
> > > On Fri, May 03, 2024 at 08:40:03AM +0200, Jinpu Wang wrote:
> > > > I had a brief check in the rsocket changelog, there seems some
> > > > improvement over time,
> > > >  might be worth revisiting this. due to socket abstraction, we can't
> > > > use some feature like
> > > >  ODP, it won't be a small and easy task.
> > >
> > > It'll be good to know whether Dan's suggestion would work first, without
> > > rewritting everything yet so far.  Not sure whether some perf test could
> > > help with the rsocket APIs even without QEMU's involvements (or looking 
> > > for
> > > test data supporting / invalidate such conversions).
> > >
> > I did a quick test with iperf on 100 G environment and 40 G
> > environment, in summary rsocket works pretty well.
> >
> > iperf tests between 2 hosts with 40 G (IB),
> > first  a few test with different num. of threads on top of ipoib
> > interface, later with preload rsocket on top of same ipoib interface.
> >
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145
> > 
> > Client connecting to 10.43.3.145, TCP port 52000
> > TCP window size:  165 KByte (default)
> > 
> > [  3] local 10.43.3.146 port 55602 connected with 10.43.3.145 port 52000
> > [ ID] Interval   Transfer Bandwidth
> > [  3] 0.-10.0001 sec  2.85 GBytes  2.44 Gbits/sec
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 2
> > 
> > Client connecting to 10.43.3.145, TCP port 52000
> > TCP window size:  165 KByte (default)
> > 
> > [  4] local 10.43.3.146 port 39640 connected with 10.43.3.145 port 52000
> > [  3] local 10.43.3.146 port 39626 connected with 10.43.3.145 port 52000
> > [ ID] Interval   Transfer Bandwidth
> > [  3] 0.-10.0012 sec  2.85 GBytes  2.45 Gbits/sec
> > [  4] 0.-10.0026 sec  2.86 GBytes  2.45 Gbits/sec
> > [SUM] 0.-10.0026 sec  5.71 GBytes  4.90 Gbits/sec
> > [ CT] final connect times (min/avg/max/stdev) =
> > 0.281/0.300/0.318/0.318 ms (tot/err) = 2/0
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 4
> > 
> > Client connecting to 10.43.3.145, TCP port 52000
> > TCP window size:  165 KByte (default)
> > 
> > [  4] local 10.43.3.146 port 46956 connected with 10.43.3.145 port 52000
> > [  6] local 10.43.3.146 port 46978 connected with 10.43.3.145 port 52000
> > [  3] local 10.43.3.146 port 46944 connected with 10.43.3.145 port 52000
> > [  5] local 10.43.3.146 port 46962 connected with 10.43.3.145 port 52000
> > [ ID] Interval   Transfer Bandwidth
> > [  3] 0.-10.0017 sec  2.85 GBytes  2.45 Gbits/sec
> > [  4] 0.-10.0015 sec  2.85 GBytes  2.45 Gbits/sec
> > [  5] 0.-10.0026 sec  2.85 GBytes  2.45 Gbits/sec
> > [  6] 0.-10.0005 sec  2.85 GBytes  2.45 Gbits/sec
> > [SUM] 0.-10.0005 sec  11.4 GBytes  9.80 Gbits/sec
> > [ CT] final connect times (min/avg/max/stdev) =
> > 0.274/0.312/0.360/0.212 ms (tot/err) = 4/0
> > jw...@ps401a-914.nst:~$ iperf -p 52000 -c 10.43.3.145 -P 8
> > 
> > Client connecting to 10.43.3.145, TCP port 52000
> > TCP window size:  165 KByte (default)
> > 
> > [  7] local 10.43.3.146 port 35062 connected with 10.43.3.145 port 52000
> > [  6] local 10.43.3.146 port 35058 connected with 10.43.3.145 port 52000
> > [  8] local 10.43.3.146 port 35066 connected with 10.43.3.145 port 52000
> > [  9] local 10.43.3.146 port 35074 connected with 10.43.3.145 port 52000
> > [  3] local 10.43.3.146 port 35038 connected with 10.43.3.145 port 52000
> > [ 12] local 10.43.3.146 port 35088 connected with 10.43.3.145 port 52000
> > [  5] local 10.43.3.146 port 35048 connected with 10.43.3.145 port 52000
> > [  4] local 10.43.3.146 port 35050 connected with 10.43.3.1

Re: [PATCH 1/2] ppc/pnv: Begin a more complete ADU LPC model for POWER9/10

2024-05-06 Thread Nicholas Piggin
On Fri May 3, 2024 at 3:44 PM AEST, Cédric Le Goater wrote:
> On 5/3/24 06:51, Nicholas Piggin wrote:
> > On Thu May 2, 2024 at 6:47 PM AEST, Cédric Le Goater wrote:
> >> On 5/1/24 14:39, Nicholas Piggin wrote:
> >>> On Wed Apr 17, 2024 at 9:25 PM AEST, Cédric Le Goater wrote:
>  Hello Nick,
> 
>  On 4/17/24 13:02, Nicholas Piggin wrote:
> > This implements a framework for an ADU unit model.
> >
> > The ADU unit actually implements XSCOM, which is the bridge between MMIO
> > and PIB. However it also includes control and status registers and other
> > functions that are exposed as PIB (xscom) registers.
> >
> > To keep things simple, pnv_xscom.c remains the XSCOM bridge
> > implementation, and pnv_adu.c implements the ADU registers and other
> > functions.
> >
> > So far, just the ADU no-op registers in the pnv_xscom.c default handler
> > are moved over to the adu model.
> >
> > Signed-off-by: Nicholas Piggin 
> > ---
> > include/hw/ppc/pnv_adu.h   |  34 
> > include/hw/ppc/pnv_chip.h  |   3 +
> > include/hw/ppc/pnv_xscom.h |   6 ++
> > hw/ppc/pnv.c   |  16 ++
> > hw/ppc/pnv_adu.c   | 111 
> > +
> > hw/ppc/pnv_xscom.c |   9 ---
> > hw/ppc/meson.build |   1 +
> > hw/ppc/trace-events|   4 ++
> > 8 files changed, 175 insertions(+), 9 deletions(-)
> > create mode 100644 include/hw/ppc/pnv_adu.h
> > create mode 100644 hw/ppc/pnv_adu.c
> >
> > diff --git a/include/hw/ppc/pnv_adu.h b/include/hw/ppc/pnv_adu.h
> > new file mode 100644
> > index 00..9dc91857a9
> > --- /dev/null
> > +++ b/include/hw/ppc/pnv_adu.h
> > @@ -0,0 +1,34 @@
> > +/*
> > + * QEMU PowerPC PowerNV Emulation of some ADU behaviour
> > + *
> > + * Copyright (c) 2024, IBM Corporation.
> > + *
> > + * SPDX-License-Identifier: LGPL-2.1-or-later
> 
> 
>  Did you mean GPL-2.0-or-later ?
> >>>
> >>> Hey Cedric,
> >>>
> >>> Thanks for reviewing, I've been away so sorry for the late reply.
> >>>
> >>> It just came from one of the headers I copied which was LGPL. But
> >>> there's really nothing much in it and could find a GPL header to
> >>> copy. Is GPL-2.0-or-later preferred?
> >>
> >> I would since all pnv models are GPL.
> > 
> > Some of pnv is actually LGPL. 
>
> I was grepping for 'LGPL' and not 'Lesser' ... Indeed you are right.
> Most files miss an SPDX-License-Identifier tag also.
>
> > That's okay I'll change to GPL.
>
> LGPL is more relaxed if the code needs to be used in libraries, but
> I am not sure it applies to the PNV models. What would you prefer ?

GPL seems to be more common and I don't see a need for LGPL here,
so maybe GPL?

We could probably switch all LGPL pnv over to GPL if we wanted to.
I think LGPL permits such relicensing. Will leave this discussion
for another time though.

Thanks,
Nick



Re: [PATCH] spapr: Migrate ail-mode-3 spapr cap

2024-05-06 Thread Harsh Prateek Bora




On 5/6/24 17:26, Nicholas Piggin wrote:

This cap did not add the migration code when it was introduced. This
results in migration failure when changing the default using the
command line.

Cc: qemu-sta...@nongnu.org
Fixes: ccc5a4c5e10 ("spapr: Add SPAPR_CAP_AIL_MODE_3 for AIL mode 3 support for 
H_SET_MODE hcall")
Signed-off-by: Nicholas Piggin 


Reviewed-by: Harsh Prateek Bora 


---
  include/hw/ppc/spapr.h | 1 +
  hw/ppc/spapr.c | 1 +
  hw/ppc/spapr_caps.c| 1 +
  3 files changed, 3 insertions(+)

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 4aaf23d28f..f6de3e9972 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -1004,6 +1004,7 @@ extern const VMStateDescription 
vmstate_spapr_cap_large_decr;
  extern const VMStateDescription vmstate_spapr_cap_ccf_assist;
  extern const VMStateDescription vmstate_spapr_cap_fwnmi;
  extern const VMStateDescription vmstate_spapr_cap_rpt_invalidate;
+extern const VMStateDescription vmstate_spapr_cap_ail_mode_3;
  extern const VMStateDescription vmstate_spapr_wdt;
  
  static inline uint8_t spapr_get_cap(SpaprMachineState *spapr, int cap)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d2d1e310a3..065f58ec93 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2169,6 +2169,7 @@ static const VMStateDescription vmstate_spapr = {
  &vmstate_spapr_cap_fwnmi,
  &vmstate_spapr_fwnmi,
  &vmstate_spapr_cap_rpt_invalidate,
+&vmstate_spapr_cap_ail_mode_3,
  &vmstate_spapr_cap_nested_papr,
  NULL
  }
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index 0a15415a1d..2f74923560 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -974,6 +974,7 @@ SPAPR_CAP_MIG_STATE(large_decr, 
SPAPR_CAP_LARGE_DECREMENTER);
  SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST);
  SPAPR_CAP_MIG_STATE(fwnmi, SPAPR_CAP_FWNMI);
  SPAPR_CAP_MIG_STATE(rpt_invalidate, SPAPR_CAP_RPT_INVALIDATE);
+SPAPR_CAP_MIG_STATE(ail_mode_3, SPAPR_CAP_AIL_MODE_3);
  
  void spapr_caps_init(SpaprMachineState *spapr)

  {




[PATCH v2 1/6] hw/loongarch: Refine acpi srat table for numa memory

2024-05-06 Thread Bibo Mao
One LoongArch virt machine platform, there is limitation for memory
map information. The minimum memory size is 256M and minimum memory
size for numa node0 is 256M also. With qemu numa qtest, it is possible
that memory size of numa node0 is 128M.

Limitations for minimum memory size for both total memory and numa
node0 is removed for acpi srat table creation.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/acpi-build.c | 58 +++
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/hw/loongarch/acpi-build.c b/hw/loongarch/acpi-build.c
index e5ab1080af..d0247d93ee 100644
--- a/hw/loongarch/acpi-build.c
+++ b/hw/loongarch/acpi-build.c
@@ -165,8 +165,9 @@ static void
 build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
 {
 int i, arch_id, node_id;
-uint64_t mem_len, mem_base;
-int nb_numa_nodes = machine->numa_state->num_nodes;
+hwaddr len, base, gap;
+NodeInfo *numa_info;
+int nodes, nb_numa_nodes = machine->numa_state->num_nodes;
 LoongArchMachineState *lams = LOONGARCH_MACHINE(machine);
 MachineClass *mc = MACHINE_GET_CLASS(lams);
 const CPUArchIdList *arch_ids = mc->possible_cpu_arch_ids(machine);
@@ -195,35 +196,44 @@ build_srat(GArray *table_data, BIOSLinker *linker, 
MachineState *machine)
 build_append_int_noprefix(table_data, 0, 4); /* Reserved */
 }
 
-/* Node0 */
-build_srat_memory(table_data, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE,
-  0, MEM_AFFINITY_ENABLED);
-mem_base = VIRT_HIGHMEM_BASE;
-if (!nb_numa_nodes) {
-mem_len = machine->ram_size - VIRT_LOWMEM_SIZE;
-} else {
-mem_len = machine->numa_state->nodes[0].node_mem - VIRT_LOWMEM_SIZE;
+base = VIRT_LOWMEM_BASE;
+gap = VIRT_LOWMEM_SIZE;
+numa_info = machine->numa_state->nodes;
+nodes = nb_numa_nodes;
+if (!nodes) {
+nodes = 1;
 }
-if (mem_len)
-build_srat_memory(table_data, mem_base, mem_len, 0, 
MEM_AFFINITY_ENABLED);
-
-/* Node1 - Nodemax */
-if (nb_numa_nodes) {
-mem_base += mem_len;
-for (i = 1; i < nb_numa_nodes; ++i) {
-if (machine->numa_state->nodes[i].node_mem > 0) {
-build_srat_memory(table_data, mem_base,
-  machine->numa_state->nodes[i].node_mem, i,
-  MEM_AFFINITY_ENABLED);
-mem_base += machine->numa_state->nodes[i].node_mem;
-}
+
+for (i = 0; i < nodes; i++) {
+if (nb_numa_nodes) {
+len = numa_info[i].node_mem;
+} else {
+len = machine->ram_size;
+}
+
+/*
+ * memory for the node splited into two part
+ *   lowram:  [base, +gap)
+ *   highram: [VIRT_HIGHMEM_BASE, +(len - gap))
+ */
+if (len >= gap) {
+build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED);
+len -= gap;
+base = VIRT_HIGHMEM_BASE;
+gap = machine->ram_size - VIRT_LOWMEM_SIZE;
+}
+
+if (len) {
+build_srat_memory(table_data, base, len, i, MEM_AFFINITY_ENABLED);
+base += len;
+gap  -= len;
 }
 }
 
 if (machine->device_memory) {
 build_srat_memory(table_data, machine->device_memory->base,
   memory_region_size(&machine->device_memory->mr),
-  nb_numa_nodes - 1,
+  nodes - 1,
   MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
 }
 
-- 
2.39.3




[PATCH v2 6/6] tests/qtest: Add numa test for loongarch system

2024-05-06 Thread Bibo Mao
Add numa test case for loongarch system, it passes to run
with command "make check-qtest", after the following patch
is applied.
https://lore.kernel.org/all/20240319022606.2994565-1-maob...@loongson.cn/

Signed-off-by: Bibo Mao 
---
 tests/qtest/meson.build | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 6f2f594ace..a72436df65 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -256,6 +256,8 @@ qtests_s390x = \
 qtests_riscv32 = \
   (config_all_devices.has_key('CONFIG_SIFIVE_E_AON') ? 
['sifive-e-aon-watchdog-test'] : [])
 
+qtests_loongarch64 = ['numa-test']
+
 qos_test_ss = ss.source_set()
 qos_test_ss.add(
   'ac97-test.c',
-- 
2.39.3




[PATCH v2 4/6] hw/loongarch: Refine system dram memory region

2024-05-06 Thread Bibo Mao
For system dram memory region, it is not necessary to use numa node
information. There is only low memory region and high memory region.

Remove numa node information for ddr memory region here, it can reduce
memory region number about LoongArch virt machine.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/virt.c | 55 ++---
 1 file changed, 17 insertions(+), 38 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index aca8290795..5abfe0a9d7 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -974,18 +974,13 @@ static void loongarch_init(MachineState *machine)
 {
 LoongArchCPU *lacpu;
 const char *cpu_model = machine->cpu_type;
-ram_addr_t offset = 0;
-ram_addr_t ram_size = machine->ram_size;
-uint64_t highram_size = 0, phyAddr = 0;
 MemoryRegion *address_space_mem = get_system_memory();
 LoongArchMachineState *lams = LOONGARCH_MACHINE(machine);
-int nb_numa_nodes = machine->numa_state->num_nodes;
-NodeInfo *numa_info = machine->numa_state->nodes;
 int i;
+hwaddr base, size, ram_size = machine->ram_size;
 const CPUArchIdList *possible_cpus;
 MachineClass *mc = MACHINE_GET_CLASS(machine);
 CPUState *cpu;
-char *ramName = NULL;
 
 if (!cpu_model) {
 cpu_model = LOONGARCH_CPU_TYPE_NAME("la464");
@@ -1020,41 +1015,27 @@ static void loongarch_init(MachineState *machine)
 fdt_add_memory_nodes(machine);
 fw_cfg_add_memory(machine);
 
-/* Node0 memory */
-memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram",
- machine->ram, offset, VIRT_LOWMEM_SIZE);
-memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem);
-
-offset += VIRT_LOWMEM_SIZE;
-if (nb_numa_nodes > 0) {
-assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE);
-highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE;
-} else {
-highram_size = ram_size - VIRT_LOWMEM_SIZE;
+size = ram_size;
+base = VIRT_LOWMEM_BASE;
+if (size > VIRT_LOWMEM_SIZE) {
+size = VIRT_LOWMEM_SIZE;
 }
-phyAddr = VIRT_HIGHMEM_BASE;
-memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram",
-  machine->ram, offset, highram_size);
-memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem);
-
-/* Node1 - Nodemax memory */
-offset += highram_size;
-phyAddr += highram_size;
-
-for (i = 1; i < nb_numa_nodes; i++) {
-MemoryRegion *nodemem = g_new(MemoryRegion, 1);
-ramName = g_strdup_printf("loongarch.node%d.ram", i);
-memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
- offset,  numa_info[i].node_mem);
-memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
-offset += numa_info[i].node_mem;
-phyAddr += numa_info[i].node_mem;
+
+memory_region_init_alias(&lams->lowmem, NULL, "loongarch.lowram",
+  machine->ram, base, size);
+memory_region_add_subregion(address_space_mem, base, &lams->lowmem);
+base += size;
+if (ram_size - size) {
+base = VIRT_HIGHMEM_BASE;
+memory_region_init_alias(&lams->highmem, NULL, "loongarch.highram",
+machine->ram, VIRT_LOWMEM_BASE + size, ram_size - size);
+memory_region_add_subregion(address_space_mem, base, &lams->highmem);
+base += ram_size - size;
 }
 
 /* initialize device memory address space */
 if (machine->ram_size < machine->maxram_size) {
 ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size;
-hwaddr device_mem_base;
 
 if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) {
 error_report("unsupported amount of memory slots: %"PRIu64,
@@ -1068,9 +1049,7 @@ static void loongarch_init(MachineState *machine)
  "%d bytes", TARGET_PAGE_SIZE);
 exit(EXIT_FAILURE);
 }
-/* device memory base is the top of high memory address. */
-device_mem_base = ROUND_UP(VIRT_HIGHMEM_BASE + highram_size, 1 * GiB);
-machine_memory_devices_init(machine, device_mem_base, device_mem_size);
+machine_memory_devices_init(machine, base, device_mem_size);
 }
 
 /* load the BIOS image. */
-- 
2.39.3




[PATCH v2 3/6] hw/loongarch: Refine fwcfg memory map

2024-05-06 Thread Bibo Mao
Memory map table for fwcfg is used for UEFI BIOS, UEFI BIOS uses the first
entry from fwcfg memory map as the first memory HOB, the second memory HOB
will be used if the first memory HOB is used up.

Memory map table for fwcfg does not care about numa node, however in
generic the first memory HOB is part of numa node0, so that runtime
memory of UEFI which is allocated from the first memory HOB is located
at numa node0.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/virt.c | 60 ++---
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index db76bc94f8..aca8290795 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -914,6 +914,62 @@ static const MemoryRegionOps loongarch_qemu_ops = {
 },
 };
 
+static void fw_cfg_add_memory(MachineState *ms)
+{
+hwaddr base, size, ram_size, gap;
+int nb_numa_nodes, nodes;
+NodeInfo *numa_info;
+
+ram_size = ms->ram_size;
+base = VIRT_LOWMEM_BASE;
+gap = VIRT_LOWMEM_SIZE;
+nodes = nb_numa_nodes = ms->numa_state->num_nodes;
+numa_info = ms->numa_state->nodes;
+if (!nodes) {
+nodes = 1;
+}
+
+/* add fw_cfg memory map of node0 */
+if (nb_numa_nodes) {
+size = numa_info[0].node_mem;
+} else {
+size = ram_size;
+}
+
+if (size >= gap) {
+memmap_add_entry(base, gap, 1);
+size -= gap;
+base = VIRT_HIGHMEM_BASE;
+gap = ram_size - VIRT_LOWMEM_SIZE;
+}
+
+if (size) {
+memmap_add_entry(base, size, 1);
+base += size;
+}
+
+if (nodes < 2) {
+return;
+}
+
+/* add fw_cfg memory map of other nodes */
+size = ram_size - numa_info[0].node_mem;
+gap  = VIRT_LOWMEM_BASE + VIRT_LOWMEM_SIZE;
+if (base < gap && (base + size) > gap) {
+/*
+ * memory map for the maining nodes splited into two part
+ *   lowram:  [base, +(gap - base))
+ *   highram: [VIRT_HIGHMEM_BASE, +(size - (gap - base)))
+ */
+memmap_add_entry(base, gap - base, 1);
+size -= gap - base;
+base = VIRT_HIGHMEM_BASE;
+}
+
+   if (size)
+memmap_add_entry(base, size, 1);
+}
+
 static void loongarch_init(MachineState *machine)
 {
 LoongArchCPU *lacpu;
@@ -962,9 +1018,9 @@ static void loongarch_init(MachineState *machine)
 fdt_add_cpu_nodes(lams);
 
 fdt_add_memory_nodes(machine);
+fw_cfg_add_memory(machine);
 
 /* Node0 memory */
-memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1);
 memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram",
  machine->ram, offset, VIRT_LOWMEM_SIZE);
 memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem);
@@ -977,7 +1033,6 @@ static void loongarch_init(MachineState *machine)
 highram_size = ram_size - VIRT_LOWMEM_SIZE;
 }
 phyAddr = VIRT_HIGHMEM_BASE;
-memmap_add_entry(phyAddr, highram_size, 1);
 memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram",
   machine->ram, offset, highram_size);
 memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem);
@@ -992,7 +1047,6 @@ static void loongarch_init(MachineState *machine)
 memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
  offset,  numa_info[i].node_mem);
 memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
-memmap_add_entry(phyAddr, numa_info[i].node_mem, 1);
 offset += numa_info[i].node_mem;
 phyAddr += numa_info[i].node_mem;
 }
-- 
2.39.3




[PATCH v2 5/6] hw/loongarch: Remove minimum and default memory size

2024-05-06 Thread Bibo Mao
Some qtest test cases such as numa use default memory size of generic
machine class, which is 128M by fault.

Here generic default memory size is used, and also remove minimum memory
size which is 1G originally.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/virt.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 5abfe0a9d7..36477172ea 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -986,10 +986,6 @@ static void loongarch_init(MachineState *machine)
 cpu_model = LOONGARCH_CPU_TYPE_NAME("la464");
 }
 
-if (ram_size < 1 * GiB) {
-error_report("ram_size must be greater than 1G.");
-exit(1);
-}
 create_fdt(lams);
 
 /* Create IOCSR space */
@@ -1284,7 +1280,6 @@ static void loongarch_class_init(ObjectClass *oc, void 
*data)
 
 mc->desc = "Loongson-3A5000 LS7A1000 machine";
 mc->init = loongarch_init;
-mc->default_ram_size = 1 * GiB;
 mc->default_cpu_type = LOONGARCH_CPU_TYPE_NAME("la464");
 mc->default_ram_id = "loongarch.ram";
 mc->max_cpus = LOONGARCH_MAX_CPUS;
-- 
2.39.3




[PATCH v2 0/6] hw/loongarch: Refine numa memory map

2024-05-06 Thread Bibo Mao
One LoongArch virt machine platform, there is limitation for memory
map information. The minimum memory size is 256M and minimum memory
size for numa node0 is 256M also. With qemu numa qtest, it is possible
that memory size of numa node0 is 128M.

Limitations for minimum memory size for both total memory and numa
node0 is removed here, including acpi srat table, fadt memory map table
and fw_cfg memory map table.

Also remove numa node about memory region, there is only low memory
region and how memory region.

---
v1 ... v2:
  1. Refresh the patch based on the latest qemu version, solve the
confliction issue.
  2. Add numa test case for loongarch system.
---
Bibo Mao (6):
  hw/loongarch: Refine acpi srat table for numa memory
  hw/loongarch: Refine fadt memory table for numa memory
  hw/loongarch: Refine fwcfg memory map
  hw/loongarch: Refine system dram memory region
  hw/loongarch: Remove minimum and default memory size
  tests/qtest: Add numa test for loongarch system

 hw/loongarch/acpi-build.c |  58 +++--
 hw/loongarch/virt.c   | 167 +++---
 tests/qtest/meson.build   |   2 +
 3 files changed, 154 insertions(+), 73 deletions(-)


base-commit: 248f6f62df073a3b4158fd0093863ab885feabb5
-- 
2.39.3




[PATCH v2 2/6] hw/loongarch: Refine fadt memory table for numa memory

2024-05-06 Thread Bibo Mao
One LoongArch virt machine platform, there is limitation for memory
map information. The minimum memory size is 256M and minimum memory
size for numa node0 is 256M also. With qemu numa qtest, it is possible
that memory size of numa node0 is 128M.

Limitations for minimum memory size for both total memory and numa
node0 is removed for fadt numa memory table creation.

Signed-off-by: Bibo Mao 
---
 hw/loongarch/virt.c | 47 ++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index c0999878df..db76bc94f8 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -473,6 +473,48 @@ static void fdt_add_memory_node(MachineState *ms,
 g_free(nodename);
 }
 
+static void fdt_add_memory_nodes(MachineState *ms)
+{
+hwaddr base, size, ram_size, gap;
+int i, nb_numa_nodes, nodes;
+NodeInfo *numa_info;
+
+ram_size = ms->ram_size;
+base = VIRT_LOWMEM_BASE;
+gap = VIRT_LOWMEM_SIZE;
+nodes = nb_numa_nodes = ms->numa_state->num_nodes;
+numa_info = ms->numa_state->nodes;
+if (!nodes) {
+nodes = 1;
+}
+
+for (i = 0; i < nodes; i++) {
+if (nb_numa_nodes) {
+size = numa_info[i].node_mem;
+} else {
+size = ram_size;
+}
+
+/*
+ * memory for the node splited into two part
+ *   lowram:  [base, +gap)
+ *   highram: [VIRT_HIGHMEM_BASE, +(len - gap))
+ */
+if (size >= gap) {
+fdt_add_memory_node(ms, base, gap, i);
+size -= gap;
+base = VIRT_HIGHMEM_BASE;
+gap = ram_size - VIRT_LOWMEM_SIZE;
+}
+
+if (size) {
+fdt_add_memory_node(ms, base, size, i);
+base += size;
+gap -= size;
+}
+}
+}
+
 static void virt_build_smbios(LoongArchMachineState *lams)
 {
 MachineState *ms = MACHINE(lams);
@@ -919,9 +961,10 @@ static void loongarch_init(MachineState *machine)
 }
 fdt_add_cpu_nodes(lams);
 
+fdt_add_memory_nodes(machine);
+
 /* Node0 memory */
 memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1);
-fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0);
 memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram",
  machine->ram, offset, VIRT_LOWMEM_SIZE);
 memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem);
@@ -935,7 +978,6 @@ static void loongarch_init(MachineState *machine)
 }
 phyAddr = VIRT_HIGHMEM_BASE;
 memmap_add_entry(phyAddr, highram_size, 1);
-fdt_add_memory_node(machine, phyAddr, highram_size, 0);
 memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram",
   machine->ram, offset, highram_size);
 memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem);
@@ -951,7 +993,6 @@ static void loongarch_init(MachineState *machine)
  offset,  numa_info[i].node_mem);
 memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
 memmap_add_entry(phyAddr, numa_info[i].node_mem, 1);
-fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i);
 offset += numa_info[i].node_mem;
 phyAddr += numa_info[i].node_mem;
 }
-- 
2.39.3




Re: [PATCH v3 1/1] accel/kvm: Fix segmentation fault

2024-05-06 Thread Zhijian Li (Fujitsu)


on 5/7/2024 10:50 AM, Masato Imai wrote:
> When the KVM acceleration parameter is not set, executing calc_dirty_rate
> with the -r or -b option results in a segmentation fault due to accessing
> a null kvm_state pointer in the kvm_dirty_ring_enabled function. This
> commit adds a null check for kvm_status to prevent segmentation faults.
>
> Signed-off-by: Masato Imai 

LGTM,
Tested-by: Li Zhijian 


> ---
>   accel/kvm/kvm-all.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index c0be9f5eed..544293be8a 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -2329,7 +2329,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id)
>   
>   bool kvm_dirty_ring_enabled(void)
>   {
> -return kvm_state->kvm_dirty_ring_size ? true : false;
> +return kvm_state && kvm_state->kvm_dirty_ring_size;
>   }
>   
>   static void query_stats_cb(StatsResultList **result, StatsTarget target,


[PATCH v3 1/1] accel/kvm: Fix segmentation fault

2024-05-06 Thread Masato Imai
When the KVM acceleration parameter is not set, executing calc_dirty_rate
with the -r or -b option results in a segmentation fault due to accessing
a null kvm_state pointer in the kvm_dirty_ring_enabled function. This
commit adds a null check for kvm_status to prevent segmentation faults.

Signed-off-by: Masato Imai 
---
 accel/kvm/kvm-all.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index c0be9f5eed..544293be8a 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2329,7 +2329,7 @@ bool kvm_vcpu_id_is_valid(int vcpu_id)
 
 bool kvm_dirty_ring_enabled(void)
 {
-return kvm_state->kvm_dirty_ring_size ? true : false;
+return kvm_state && kvm_state->kvm_dirty_ring_size;
 }
 
 static void query_stats_cb(StatsResultList **result, StatsTarget target,
-- 
2.34.1




[PATCH v3 0/1] accel/kvm: Fix segmentation fault

2024-05-06 Thread Masato Imai
Changes from v2:
- avoid segfault in kvm/accel instead of migration/dirtyrate

v2: 
https://lore.kernel.org/qemu-devel/20240423091306.754432-1-...@sfc.wide.ad.jp

Masato Imai (1):
  accel/kvm: Fix segmentation fault

 accel/kvm/kvm-all.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.34.1




RE: [PATCH v3 00/19] Add a host IOMMU device abstraction to check with vIOMMU

2024-05-06 Thread Duan, Zhenzhong



>-Original Message-
>From: Jason Gunthorpe 
>Subject: Re: [PATCH v3 00/19] Add a host IOMMU device abstraction to
>check with vIOMMU
>
>On Mon, May 06, 2024 at 02:30:47AM +, Duan, Zhenzhong wrote:
>
>> I'm not clear how useful multiple iommufd instances support are.
>> One possible benefit is for security? It may bring a slightly fine-grained
>> isolation in kernel.
>
>No. I don't think there is any usecase, it is only harmful.

OK, so we need to limit QEMU to only one iommufd instance.

In cdev series, we support mix of legacy and iommufd backend and multiple 
iommufd backend instances for flexibility.
We need to make a choice to have this limitation only for nesting series or 
globally(including cdev).
May I ask what harmfulness we may have?

Thanks
Zhenzhong



[PATCH] hw/loongarch/virt: Fix memory leak

2024-05-06 Thread Song Gao
The char pointer 'ramName' point to a block of memory,
but never free it. Use 'g_autofree' to automatically free it.

Resolves: Coverity CID 1544773

Fixes: 0cf1478d6 ("hw/loongarch: Add numa support")
Signed-off-by: Song Gao 
---
 hw/loongarch/virt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index c0999878df..ea5100be6d 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -887,7 +887,6 @@ static void loongarch_init(MachineState *machine)
 const CPUArchIdList *possible_cpus;
 MachineClass *mc = MACHINE_GET_CLASS(machine);
 CPUState *cpu;
-char *ramName = NULL;
 
 if (!cpu_model) {
 cpu_model = LOONGARCH_CPU_TYPE_NAME("la464");
@@ -946,7 +945,7 @@ static void loongarch_init(MachineState *machine)
 
 for (i = 1; i < nb_numa_nodes; i++) {
 MemoryRegion *nodemem = g_new(MemoryRegion, 1);
-ramName = g_strdup_printf("loongarch.node%d.ram", i);
+g_autofree char *ramName = g_strdup_printf("loongarch.node%d.ram", i);
 memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
  offset,  numa_info[i].node_mem);
 memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
-- 
2.25.1




RE: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool

2024-05-06 Thread Duan, Zhenzhong


>-Original Message-
>From: Cédric Le Goater 
>Subject: Re: [PATCH 1/3] vfio: Make VFIOIOMMUClass::attach_device() and
>its wrapper return bool
>
>On 5/6/24 10:33, Zhenzhong Duan wrote:
>> Make VFIOIOMMUClass::attach_device() and its wrapper function
>> vfio_attach_device() return bool.
>>
>> This is to follow the coding standand to return bool if 'Error **'
>> is used to pass error.
>>
>> Suggested-by: Cédric Le Goater 
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>   include/hw/vfio/vfio-common.h |  4 ++--
>>   include/hw/vfio/vfio-container-base.h |  4 ++--
>>   hw/vfio/ap.c  |  6 ++
>>   hw/vfio/ccw.c |  6 ++
>>   hw/vfio/common.c  |  4 ++--
>>   hw/vfio/container.c   | 14 +++---
>>   hw/vfio/iommufd.c | 11 +--
>>   hw/vfio/pci.c |  8 +++-
>>   hw/vfio/platform.c|  7 +++
>>   9 files changed, 28 insertions(+), 36 deletions(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>common.h
>> index b9da6c08ef..a7b6fc8f46 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -198,8 +198,8 @@ void vfio_region_exit(VFIORegion *region);
>>   void vfio_region_finalize(VFIORegion *region);
>>   void vfio_reset_handler(void *opaque);
>>   struct vfio_device_info *vfio_get_device_info(int fd);
>> -int vfio_attach_device(char *name, VFIODevice *vbasedev,
>> -   AddressSpace *as, Error **errp);
>> +bool vfio_attach_device(char *name, VFIODevice *vbasedev,
>> +AddressSpace *as, Error **errp);
>>   void vfio_detach_device(VFIODevice *vbasedev);
>>
>>   int vfio_kvm_device_add_fd(int fd, Error **errp);
>> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-
>container-base.h
>> index 3582d5f97a..c839cfd9cb 100644
>> --- a/include/hw/vfio/vfio-container-base.h
>> +++ b/include/hw/vfio/vfio-container-base.h
>> @@ -118,8 +118,8 @@ struct VFIOIOMMUClass {
>>   int (*dma_unmap)(const VFIOContainerBase *bcontainer,
>>hwaddr iova, ram_addr_t size,
>>IOMMUTLBEntry *iotlb);
>> -int (*attach_device)(const char *name, VFIODevice *vbasedev,
>> - AddressSpace *as, Error **errp);
>> +bool (*attach_device)(const char *name, VFIODevice *vbasedev,
>> +  AddressSpace *as, Error **errp);
>>   void (*detach_device)(VFIODevice *vbasedev);
>>   /* migration feature */
>>   int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
>> diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
>> index 7c4caa5938..d50600b702 100644
>> --- a/hw/vfio/ap.c
>> +++ b/hw/vfio/ap.c
>> @@ -156,7 +156,6 @@ static void
>vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,
>>   static void vfio_ap_realize(DeviceState *dev, Error **errp)
>>   {
>>   ERRP_GUARD();
>> -int ret;
>>   Error *err = NULL;
>>   VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
>>   VFIODevice *vbasedev = &vapdev->vdev;
>> @@ -165,9 +164,8 @@ static void vfio_ap_realize(DeviceState *dev, Error
>**errp)
>>   return;
>>   }
>>
>> -ret = vfio_attach_device(vbasedev->name, vbasedev,
>> - &address_space_memory, errp);
>> -if (ret) {
>> +if (!vfio_attach_device(vbasedev->name, vbasedev,
>> +&address_space_memory, errp)) {
>>   goto error;
>>   }
>>
>> diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
>> index 90e4a53437..782bd4bed7 100644
>> --- a/hw/vfio/ccw.c
>> +++ b/hw/vfio/ccw.c
>> @@ -580,7 +580,6 @@ static void vfio_ccw_realize(DeviceState *dev,
>Error **errp)
>>   S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
>>   VFIODevice *vbasedev = &vcdev->vdev;
>>   Error *err = NULL;
>> -int ret;
>>
>>   /* Call the class init function for subchannel. */
>>   if (cdc->realize) {
>> @@ -594,9 +593,8 @@ static void vfio_ccw_realize(DeviceState *dev,
>Error **errp)
>>   return;
>>   }
>>
>> -ret = vfio_attach_device(cdev->mdevid, vbasedev,
>> - &address_space_memory, errp);
>> -if (ret) {
>> +if (!vfio_attach_device(cdev->mdevid, vbasedev,
>> +&address_space_memory, errp)) {
>>   goto out_attach_dev_err;
>>   }
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 8f9cbdc026..890d30910e 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -1492,8 +1492,8 @@ retry:
>>   return info;
>>   }
>>
>> -int vfio_attach_device(char *name, VFIODevice *vbasedev,
>> -   AddressSpace *as, Error **errp)
>> +bool vfio_attach_device(char *name, VFIODevice *vbasedev,
>> +AddressSpace *as, Error **errp)
>>   {
>>   const VFIOIOMMUClass *ops =
>>
>VFIO_IOMMU_CLASS(object_class_by_name(TYPE

RE: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-05-06 Thread Gonglei (Arei)
Hello,

> -Original Message-
> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Monday, May 6, 2024 11:18 PM
> To: Gonglei (Arei) 
> Cc: Daniel P. Berrangé ; Markus Armbruster
> ; Michael Galaxy ; Yu Zhang
> ; Zhijian Li (Fujitsu) ; Jinpu Wang
> ; Elmar Gerdes ;
> qemu-devel@nongnu.org; Yuval Shaia ; Kevin Wolf
> ; Prasanna Kumar Kalever
> ; Cornelia Huck ;
> Michael Roth ; Prasanna Kumar Kalever
> ; integrat...@gluster.org; Paolo Bonzini
> ; qemu-bl...@nongnu.org; de...@lists.libvirt.org;
> Hanna Reitz ; Michael S. Tsirkin ;
> Thomas Huth ; Eric Blake ; Song
> Gao ; Marc-André Lureau
> ; Alex Bennée ;
> Wainer dos Santos Moschetta ; Beraldo Leal
> ; Pannengyuan ;
> Xiexiangyou 
> Subject: Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling
> 
> On Mon, May 06, 2024 at 02:06:28AM +, Gonglei (Arei) wrote:
> > Hi, Peter
> 
> Hey, Lei,
> 
> Happy to see you around again after years.
> 
Haha, me too.

> > RDMA features high bandwidth, low latency (in non-blocking lossless
> > network), and direct remote memory access by bypassing the CPU (As you
> > know, CPU resources are expensive for cloud vendors, which is one of
> > the reasons why we introduced offload cards.), which TCP does not have.
> 
> It's another cost to use offload cards, v.s. preparing more cpu resources?
> 
Software and hardware offload converged architecture is the way to go for all 
cloud vendors 
(Including comprehensive benefits in terms of performance, cost, security, and 
innovation speed), 
it's not just a matter of adding the resource of a DPU card.

> > In some scenarios where fast live migration is needed (extremely short
> > interruption duration and migration duration) is very useful. To this
> > end, we have also developed RDMA support for multifd.
> 
> Will any of you upstream that work?  I'm curious how intrusive would it be
> when adding it to multifd, if it can keep only 5 exported functions like what
> rdma.h does right now it'll be pretty nice.  We also want to make sure it 
> works
> with arbitrary sized loads and buffers, e.g. vfio is considering to add IO 
> loads to
> multifd channels too.
> 

In fact, we sent the patchset to the community in 2021. Pls see:
https://lore.kernel.org/all/20210203185906.GT2950@work-vm/T/


> One thing to note that the question here is not about a pure performance
> comparison between rdma and nics only.  It's about help us make a decision
> on whether to drop rdma, iow, even if rdma performs well, the community still
> has the right to drop it if nobody can actively work and maintain it.
> It's just that if nics can perform as good it's more a reason to drop, unless
> companies can help to provide good support and work together.
> 

We are happy to provide the necessary review and maintenance work for RDMA
if the community needs it.

CC'ing Chuan Zheng.


Regards,
-Gonglei



Re: [RFC PATCH v3 3/5] KVM: x86: Add notifications for Heki policy configuration and violation

2024-05-06 Thread Sean Christopherson
On Mon, May 06, 2024, Mickaël Salaün wrote:
> On Fri, May 03, 2024 at 07:03:21AM GMT, Sean Christopherson wrote:
> > > ---
> > > 
> > > Changes since v1:
> > > * New patch. Making user space aware of Heki properties was requested by
> > >   Sean Christopherson.
> > 
> > No, I suggested having userspace _control_ the pinning[*], not merely be 
> > notified
> > of pinning.
> > 
> >  : IMO, manipulation of protections, both for memory (this patch) and CPU 
> > state
> >  : (control registers in the next patch) should come from userspace.  I 
> > have no
> >  : objection to KVM providing plumbing if necessary, but I think userspace 
> > needs to
> >  : to have full control over the actual state.
> >  : 
> >  : One of the things that caused Intel's control register pinning series to 
> > stall
> >  : out was how to handle edge cases like kexec() and reboot.  Deferring to 
> > userspace
> >  : means the kernel doesn't need to define policy, e.g. when to unprotect 
> > memory,
> >  : and avoids questions like "should userspace be able to overwrite pinned 
> > control
> >  : registers".
> >  : 
> >  : And like the confidential VM use case, keeping userspace in the loop is 
> > a big
> >  : beneifit, e.g. the guest can't circumvent protections by coercing 
> > userspace into
> >  : writing to protected memory.
> > 
> > I stand by that suggestion, because I don't see a sane way to handle things 
> > like
> > kexec() and reboot without having a _much_ more sophisticated policy than 
> > would
> > ever be acceptable in KVM.
> > 
> > I think that can be done without KVM having any awareness of CR pinning 
> > whatsoever.
> > E.g. userspace just needs to ability to intercept CR writes and inject 
> > #GPs.  Off
> > the cuff, I suspect the uAPI could look very similar to MSR filtering.  
> > E.g. I bet
> > userspace could enforce MSR pinning without any new KVM uAPI at all.
> > 
> > [*] https://lore.kernel.org/all/zfuyhpuhtmbyd...@google.com
> 
> OK, I had concern about the control not directly coming from the guest,
> especially in the case of pKVM and confidential computing, but I get you

Hardware-based CoCo is completely out of scope, because KVM has zero visibility
into the guest (well, SNP technically allows trapping CR0/CR4, but KVM really
shouldn't intercept CR0/CR4 for SNP guests).

And more importantly, _KVM_ doesn't define any policies for CoCo VMs.  KVM might
help enforce policies that are defined by hardware/firmware, but KVM doesn't
define any of its own.

If pKVM on x86 comes along, then KVM will likely get in the business of defining
policy, but until that happens, KVM needs to stay firmly out of the picture.

> point.  It should indeed be quite similar to the MSR filtering on the
> userspace side, except that we need another interface for the guest to
> request such change (i.e. self-protection).
> 
> Would it be OK to keep this new KVM_HC_LOCK_CR_UPDATE hypercall but
> forward the request to userspace with a VM exit instead?  That would
> also enable userspace to get the request and directly configure the CR
> pinning with the same VM exit.

No?  Maybe?  I strongly suspect that full support will need a richer set of APIs
than a single hypercall.  E.g. to handle kexec(), suspend+resume, emulated SMM,
and so on and so forth.  And that's just for CR pinning.

And hypercalls are hampered by the fact that VMCALL/VMMCALL don't allow for
delegation or restriction, i.e. there's no way for the guest to communicate to
the hypervisor that a less privileged component is allowed to perform some 
action,
nor is there a way for the guest to say some chunk of CPL0 code *isn't* allowed
to request transition.  Delegation and restriction all has to be done 
out-of-band.

It'd probably be more annoying to setup initially, but I think a synthetic 
device
with an MMIO-based interface would be more powerful and flexible in the long 
run.
Then userspace can evolve without needing to wait for KVM to catch up.

Actually, potential bad/crazy idea.  Why does the _host_ need to define policy?
Linux already knows what assets it wants to (un)protect and when.  What's 
missing
is a way for the guest kernel to effectively deprivilege and re-authenticate
itself as needed.  We've been tossing around the idea of paired VMs+vCPUs to
support VTLs and SEV's VMPLs, what if we usurped/piggybacked those ideas, with a
bit of pKVM mixed in?

Borrowing VTL terminology, where VTL0 is the least privileged, userspace 
launches
the VM at VTL0.  At some point, the guest triggers the deprivileging sequence 
and
userspace creates VTL1.  Userpace also provides a way for VTL0 restrict access 
to
its memory, e.g. to effectively make the page tables for the kernel's direct map
writable only from VTL1, to make kernel text RO (or XO), etc.  And VTL0 could 
then
also completely remove its access to code that changes CR0/CR4.

It would obviously require a _lot_ more upfront work, e.g. to isolate the kernel
text that modifies CR0/CR4 so that it can be removed fro

Re: [PULL 3/5] hw/loongarch: Add numa support

2024-05-06 Thread gaosong

在 2024/5/3 下午8:50, Peter Maydell 写道:

On Fri, 16 Jun 2023 at 11:03, Song Gao  wrote:

From: Tianrui Zhao 

1. Implement some functions for LoongArch numa support;
2. Implement fdt_add_memory_node() for fdt;
3. build_srat() fills node_id and adds build numa memory.

Reviewed-by: Song Gao 
Signed-off-by: Tianrui Zhao 
Signed-off-by: Song Gao 
Message-Id: <20230613122613.2471743-1-zhaotian...@loongson.cn>

Hi; Coverity has pointed out a memory leak in this commit
(CID 1544773):


@@ -799,17 +823,43 @@ static void loongarch_init(MachineState *machine)
  machine->possible_cpus->cpus[i].cpu = OBJECT(cpu);
  }
  fdt_add_cpu_nodes(lams);
-/* Add memory region */
-memory_region_init_alias(&lams->lowmem, NULL, "loongarch.lowram",
- machine->ram, 0, 256 * MiB);
-memory_region_add_subregion(address_space_mem, offset, &lams->lowmem);
-offset += 256 * MiB;
-memmap_add_entry(0, 256 * MiB, 1);
-highram_size = ram_size - 256 * MiB;
-memory_region_init_alias(&lams->highmem, NULL, "loongarch.highmem",
- machine->ram, offset, highram_size);
-memory_region_add_subregion(address_space_mem, 0x9000, &lams->highmem);
-memmap_add_entry(0x9000, highram_size, 1);
+
+/* Node0 memory */
+memmap_add_entry(VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 1);
+fdt_add_memory_node(machine, VIRT_LOWMEM_BASE, VIRT_LOWMEM_SIZE, 0);
+memory_region_init_alias(&lams->lowmem, NULL, "loongarch.node0.lowram",
+ machine->ram, offset, VIRT_LOWMEM_SIZE);
+memory_region_add_subregion(address_space_mem, phyAddr, &lams->lowmem);
+
+offset += VIRT_LOWMEM_SIZE;
+if (nb_numa_nodes > 0) {
+assert(numa_info[0].node_mem > VIRT_LOWMEM_SIZE);
+highram_size = numa_info[0].node_mem - VIRT_LOWMEM_SIZE;
+} else {
+highram_size = ram_size - VIRT_LOWMEM_SIZE;
+}
+phyAddr = VIRT_HIGHMEM_BASE;
+memmap_add_entry(phyAddr, highram_size, 1);
+fdt_add_memory_node(machine, phyAddr, highram_size, 0);
+memory_region_init_alias(&lams->highmem, NULL, "loongarch.node0.highram",
+  machine->ram, offset, highram_size);
+memory_region_add_subregion(address_space_mem, phyAddr, &lams->highmem);
+
+/* Node1 - Nodemax memory */
+offset += highram_size;
+phyAddr += highram_size;
+
+for (i = 1; i < nb_numa_nodes; i++) {
+MemoryRegion *nodemem = g_new(MemoryRegion, 1);
+ramName = g_strdup_printf("loongarch.node%d.ram", i);
+memory_region_init_alias(nodemem, NULL, ramName, machine->ram,
+ offset,  numa_info[i].node_mem);
+memory_region_add_subregion(address_space_mem, phyAddr, nodemem);
+memmap_add_entry(phyAddr, numa_info[i].node_mem, 1);
+fdt_add_memory_node(machine, phyAddr, numa_info[i].node_mem, i);
+offset += numa_info[i].node_mem;
+phyAddr += numa_info[i].node_mem;

In this loop, we allocate memory via g_strdup_printf(),
but never free it. The nicest fix for this is to use the
g_autofree mechanism so that the memory is automatically
freed when execution reaches the end of the block:
g_autofree ramName = g_strdup_printf("", ...);

Thank you.   I will fix it.

Thanks
Song Gao

thanks
-- PMM





Re: [PATCH v3 1/5] hw/loongarch: Rename LOONGARCH_MACHINE with VIRT_MACHINE

2024-05-06 Thread maobibo




On 2024/5/6 下午2:09, maobibo wrote:



On 2024/5/6 下午12:24, Thomas Huth wrote:

On 06/05/2024 05.02, Bibo Mao wrote:

On LoongArch system, there is only virt machine type now, name
LOONGARCH_MACHINE is confused, rename it with VIRT_MACHINE. Machine name
about Other real hw boards can be added in future.

Signed-off-by: Bibo Mao 
---

...
@@ -1245,7 +1244,7 @@ static void loongarch_class_init(ObjectClass 
*oc, void *data)

  static const TypeInfo loongarch_machine_types[] = {
  {
-    .name   = TYPE_LOONGARCH_MACHINE,
+    .name   = TYPE_VIRT_MACHINE,
  .parent = TYPE_MACHINE,
  .instance_size  = sizeof(LoongArchMachineState),
  .class_init = loongarch_class_init,
diff --git a/include/hw/loongarch/virt.h b/include/hw/loongarch/virt.h
index 4e14bf6060..5ea2f0370d 100644
--- a/include/hw/loongarch/virt.h
+++ b/include/hw/loongarch/virt.h
@@ -73,8 +73,8 @@ struct LoongArchMachineState {
  struct loongarch_boot_info bootinfo;
  };
-#define TYPE_LOONGARCH_MACHINE  MACHINE_TYPE_NAME("virt")
-OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, LOONGARCH_MACHINE)
+#define TYPE_VIRT_MACHINE  MACHINE_TYPE_NAME("virt")
+OBJECT_DECLARE_SIMPLE_TYPE(LoongArchMachineState, VIRT_MACHINE)
  bool loongarch_is_acpi_enabled(LoongArchMachineState *lams);
  void loongarch_acpi_setup(LoongArchMachineState *lams);
  #endif


  Hi,

there are currently some efforts going on to create the possibility to 
link a QEMU binary that contains all targets in one binary. Since we 
already have a TYPE_VIRT_MACHINE for other targets, I wonder whether 
it might be better to use LOONGARCH_VIRT_MACHINE than just 
VIRT_MACHINE here? Philippe, could you comment on this?


It is great if there is one QEMU binary which supports different 
targets. And LOONGARCH_VIRT_MACHINE is ok for me.

Hi Thomas, Philippe,

Does machine name "virt" need be changed if LOONGARCH_VIRT_MACHINE is 
used? There will be compatible issues if "virt" machine type is not 
suggested to use.


However CPU type "max" is not widely used now, can we get different 
architectures from CPU type rather than machine type for one QEMU binary 
which supports different targets?


Regards
Bibo Mao



Regards
Bibo Mao


  Thomas







RE: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways

2024-05-06 Thread Xingtao Yao (Fujitsu)



> -Original Message-
> From: Jonathan Cameron 
> Sent: Tuesday, April 30, 2024 10:43 PM
> To: Yao, Xingtao/姚 幸涛 
> Cc: fan...@samsung.com; qemu-devel@nongnu.org
> Subject: Re: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways
> 
> On Wed, 24 Apr 2024 01:36:56 +
> "Xingtao Yao (Fujitsu)"  wrote:
> 
> > ping.
> >
> > > -Original Message-
> > > From: Yao Xingtao 
> > > Sent: Sunday, April 7, 2024 11:07 AM
> > > To: jonathan.came...@huawei.com; fan...@samsung.com
> > > Cc: qemu-devel@nongnu.org; Yao, Xingtao/姚 幸涛
> 
> > > Subject: [PATCH v2] mem/cxl_type3: support 3, 6, 12 and 16 interleave ways
> > >
> > > Since the kernel does not check the interleave capability, a
> > > 3-way, 6-way, 12-way or 16-way region can be create normally.
> > >
> > > Applications can access the memory of 16-way region normally because
> > > qemu can convert hpa to dpa correctly for the power of 2 interleave
> > > ways, after kernel implementing the check, this kind of region will
> > > not be created any more.
> > >
> > > For non power of 2 interleave ways, applications could not access the
> > > memory normally and may occur some unexpected behaviors, such as
> > > segmentation fault.
> > >
> > > So implements this feature is needed.
> > >
> > > Link:
> > >
> https://lore.kernel.org/linux-cxl/3e84b919-7631-d1db-3e1d-33000f3f3868@fujits
> > > u.com/
> > > Signed-off-by: Yao Xingtao 
> > > ---
> > >  hw/mem/cxl_type3.c | 18 ++
> > >  1 file changed, 14 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
> > > index b0a7e9f11b..d6ef784e96 100644
> > > --- a/hw/mem/cxl_type3.c
> > > +++ b/hw/mem/cxl_type3.c
> > > @@ -805,10 +805,17 @@ static bool cxl_type3_dpa(CXLType3Dev *ct3d,
> hwaddr
> > > host_addr, uint64_t *dpa)
> > >  continue;
> > >  }
> > >
> > > -*dpa = dpa_base +
> > > -((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
> > > - ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) & 
> > > hpa_offset)
> > > -  >> iw));
> > > +if (iw < 8) {
> > > +*dpa = dpa_base +
> > > +((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
> > > + ((MAKE_64BIT_MASK(8 + ig + iw, 64 - 8 - ig - iw) &
> hpa_offset)
> > > +  >> iw));
> > > +} else {
> > > +*dpa = dpa_base +
> > > +((MAKE_64BIT_MASK(0, 8 + ig) & hpa_offset) |
> > > + MAKE_64BIT_MASK(ig + iw, 64 - ig - iw) & hpa_offset)
> > > +   >> (ig + iw)) / 3) << (ig + 8)));
> > > +}
> > >
> > >  return true;
> > >  }
> > > @@ -906,6 +913,9 @@ static void ct3d_reset(DeviceState *dev)
> > >  uint32_t *write_msk =
> ct3d->cxl_cstate.crb.cache_mem_regs_write_mask;
> > >
> > >  cxl_component_register_init_common(reg_state, write_msk,
> > > CXL2_TYPE3_DEVICE);
> > > +ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY,
> > > 3_6_12_WAY, 1);
> > > +ARRAY_FIELD_DP32(reg_state, CXL_HDM_DECODER_CAPABILITY,
> > > 16_WAY, 1);
> > > +
> 
> Why here rather than in hdm_reg_init_common()?
> It's constant data and is currently being set to 0 in there.

according to the CXL specifications (8.2.4.20.1 CXL HDM Decoder Capability 
Register (Offset 00h)),
this feature is only applicable to cxl.mem, upstream switch port and CXL host 
bridges shall hardwrite
these bits to 0.

so I think it would be more appropriate to set these bits here.

> 
> > >  cxl_device_register_init_t3(ct3d);
> > >
> > >  /*
> > > --
> > > 2.37.3
> >




RE: [PATCH V8 6/8] physmem: Add helper function to destroy CPU AddressSpace

2024-05-06 Thread Salil Mehta via
>  From: Peter Maydell 
>  Sent: Monday, May 6, 2024 10:29 AM
>  To: Salil Mehta 
>  
>  On Mon, 6 May 2024 at 10:06, Salil Mehta 
>  wrote:
>  >
>  > Hi Peter,
>  >
>  > Thanks for the review.
>  >
>  > >  From: Peter Maydell   When do we need to
>  > > destroy a single address space in this way that means  we need to
>  > > keep a count of how many ASes the CPU currently has? The  commit
>  > > message talks about the case when we unrealize the whole CPU
>  > > object, but in that situation you can just throw away all the ASes
>  > > at once (eg  by calling some
>  > >  cpu_destroy_address_spaces() function from
>  cpu_common_unrealizefn()).
>  >
>  >
>  > Yes, maybe, we can destroy all at once from common leg as well. I'd
>  > prefer this to be done from the arch specific function for ARM to
>  > maintain the clarity & symmetry of initialization and
>  > un-initialization legs.  For now, all of these address space destruction is
>  happening in context to the arm_cpu_unrealizefn().
>  >
>  > It’s a kind of trade-off between little more code and clarity but I'm
>  > open to further suggestions.
>  >
>  >
>  > >
>  > >  Also, if we're leaking stuff here by failing to destroy it, is that
>  > > a problem for  existing CPU types like x86 that we can already hotplug?
>  >
>  > No we are not. We are taking care of these in the ARM arch specific
>  > legs within functions arm_cpu_(un)realizefn().
>  
>  How can you be taking care of *x86* CPU types in the Arm unrealize?


Sorry, yes, I missed to reply that clearly. There was indeed a leak with x86 
reported
by Phillipe/David last year. In fact, Phillipe floated a patch last year for 
this.
I thought it was fixed already as part of cpu_common_unrealize() but I just
checked and realized that the below proposed changed still isn’t part of the
mainline

https://lore.kernel.org/qemu-devel/20230918160257.30127-9-phi...@linaro.org/

I can definitely add a common CPU AddressSpace destruction leg as part of this
patch if in case arch specific CPU unrealize does not cleans up its CPU
AddressSpace?


Thanks
Salil.

>  
>  thanks
>  -- PMM


Re: [PATCH V1 06/26] migration: precreate vmstate for exec

2024-05-06 Thread Fabiano Rosas
Steve Sistare  writes:

> Provide migration_precreate_save for saving precreate vmstate across exec.
> Create a memfd, save its value in the environment, and serialize state
> to it.  Reverse the process in migration_precreate_load.
>
> Signed-off-by: Steve Sistare 
> ---
>  include/migration/misc.h |   5 ++
>  migration/meson.build|   1 +
>  migration/precreate.c| 139 
> +++
>  3 files changed, 145 insertions(+)
>  create mode 100644 migration/precreate.c
>
> diff --git a/include/migration/misc.h b/include/migration/misc.h
> index c9e200f..cf30351 100644
> --- a/include/migration/misc.h
> +++ b/include/migration/misc.h
> @@ -56,6 +56,11 @@ AnnounceParameters *migrate_announce_params(void);
>  
>  void dump_vmstate_json_to_file(FILE *out_fp);
>  
> +/* migration/precreate.c */
> +int migration_precreate_save(Error **errp);
> +void migration_precreate_unsave(void);
> +int migration_precreate_load(Error **errp);
> +
>  /* migration/migration.c */
>  void migration_object_init(void);
>  void migration_shutdown(void);
> diff --git a/migration/meson.build b/migration/meson.build
> index f76b1ba..50e7cb2 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -26,6 +26,7 @@ system_ss.add(files(
>'ram-compress.c',
>'options.c',
>'postcopy-ram.c',
> +  'precreate.c',
>'savevm.c',
>'socket.c',
>'tls.c',
> diff --git a/migration/precreate.c b/migration/precreate.c
> new file mode 100644
> index 000..0bf5e1f
> --- /dev/null
> +++ b/migration/precreate.c
> @@ -0,0 +1,139 @@
> +/*
> + * Copyright (c) 2022, 2024 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/cutils.h"
> +#include "qemu/memfd.h"
> +#include "qapi/error.h"
> +#include "io/channel-file.h"
> +#include "migration/misc.h"
> +#include "migration/qemu-file.h"
> +#include "migration/savevm.h"
> +
> +#define PRECREATE_STATE_NAME "QEMU_PRECREATE_STATE"
> +
> +static QEMUFile *qemu_file_new_fd_input(int fd, const char *name)
> +{
> +g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd);
> +QIOChannel *ioc = QIO_CHANNEL(fioc);
> +qio_channel_set_name(ioc, name);
> +return qemu_file_new_input(ioc);
> +}
> +
> +static QEMUFile *qemu_file_new_fd_output(int fd, const char *name)
> +{
> +g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd);
> +QIOChannel *ioc = QIO_CHANNEL(fioc);
> +qio_channel_set_name(ioc, name);
> +return qemu_file_new_output(ioc);
> +}
> +
> +static int memfd_create_named(const char *name, Error **errp)
> +{
> +int mfd;
> +char val[16];
> +
> +mfd = memfd_create(name, 0);
> +if (mfd < 0) {
> +error_setg_errno(errp, errno, "memfd_create failed");
> +return -1;
> +}
> +
> +/* Remember mfd in environment for post-exec load */
> +qemu_clear_cloexec(mfd);
> +snprintf(val, sizeof(val), "%d", mfd);
> +g_setenv(name, val, 1);
> +
> +return mfd;
> +}
> +
> +static int memfd_find_named(const char *name, int *mfd_p, Error **errp)
> +{
> +const char *val = g_getenv(name);
> +
> +if (!val) {
> +*mfd_p = -1;
> +return 0;   /* No memfd was created, not an error */
> +}
> +g_unsetenv(name);
> +if (qemu_strtoi(val, NULL, 10, mfd_p)) {
> +error_setg(errp, "Bad %s env value %s", PRECREATE_STATE_NAME, val);
> +return -1;
> +}
> +lseek(*mfd_p, 0, SEEK_SET);
> +return 0;
> +}
> +
> +static void memfd_delete_named(const char *name)
> +{
> +int mfd;
> +const char *val = g_getenv(name);
> +
> +if (val) {
> +g_unsetenv(name);
> +if (!qemu_strtoi(val, NULL, 10, &mfd)) {
> +close(mfd);
> +}
> +}
> +}
> +
> +static QEMUFile *qemu_file_new_memfd_output(const char *name, Error **errp)
> +{
> +int mfd = memfd_create_named(name, errp);
> +
> +if (mfd < 0) {
> +return NULL;
> +}
> +
> +return qemu_file_new_fd_output(mfd, name);
> +}
> +
> +static QEMUFile *qemu_file_new_memfd_input(const char *name, Error **errp)
> +{
> +int ret, mfd;
> +
> +ret = memfd_find_named(name, &mfd, errp);
> +if (ret || mfd < 0) {
> +return NULL;
> +}
> +
> +return qemu_file_new_fd_input(mfd, name);
> +}
> +
> +int migration_precreate_save(Error **errp)
> +{
> +QEMUFile *f = qemu_file_new_memfd_output(PRECREATE_STATE_NAME, errp);
> +
> +if (!f) {
> +return -1;
> +} else if (qemu_savevm_precreate_save(f, errp)) {
> +memfd_delete_named(PRECREATE_STATE_NAME);
> +return -1;
> +} else {
> +/* Do not close f, as mfd must remain open. */
> +return 0;
> +}
> +}
> +
> +void migration_precreate_unsave(void)
> +{
> +memfd_delete_named(PRECREATE_STATE_NAME);
> +}
> +
> +int migration_precreate_load(Error **errp)
> +

Re: [PATCH V1 01/26] oslib: qemu_clear_cloexec

2024-05-06 Thread Fabiano Rosas
Steve Sistare  writes:

+cc dgilbert, marcandre

> Define qemu_clear_cloexec, analogous to qemu_set_cloexec.
>
> Signed-off-by: Steve Sistare 
> Reviewed-by: Dr. David Alan Gilbert 
> Reviewed-by: Marc-André Lureau 

A v1 patch with two reviews already, from people from another company
and they're not in CC. Looks suspicious. =)

Here's a fresh one, hopefully it won't spend another 4 years in the
drawer:

Reviewed-by: Fabiano Rosas 

> ---
>  include/qemu/osdep.h | 9 +
>  util/oslib-posix.c   | 9 +
>  util/oslib-win32.c   | 4 
>  3 files changed, 22 insertions(+)
>
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index c7053cd..b58f312 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -660,6 +660,15 @@ ssize_t qemu_write_full(int fd, const void *buf, size_t 
> count)
>  
>  void qemu_set_cloexec(int fd);
>  
> +/*
> + * Clear FD_CLOEXEC for a descriptor.
> + *
> + * The caller must guarantee that no other fork+exec's occur before the
> + * exec that is intended to inherit this descriptor, eg by suspending CPUs
> + * and blocking monitor commands.
> + */
> +void qemu_clear_cloexec(int fd);
> +
>  /* Return a dynamically allocated directory path that is appropriate for 
> storing
>   * local state.
>   *
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index e764416..614c3e5 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -272,6 +272,15 @@ int qemu_socketpair(int domain, int type, int protocol, 
> int sv[2])
>  return ret;
>  }
>  
> +void qemu_clear_cloexec(int fd)
> +{
> +int f;
> +f = fcntl(fd, F_GETFD);
> +assert(f != -1);
> +f = fcntl(fd, F_SETFD, f & ~FD_CLOEXEC);
> +assert(f != -1);
> +}
> +
>  char *
>  qemu_get_local_state_dir(void)
>  {
> diff --git a/util/oslib-win32.c b/util/oslib-win32.c
> index b623830..c3e969a 100644
> --- a/util/oslib-win32.c
> +++ b/util/oslib-win32.c
> @@ -222,6 +222,10 @@ void qemu_set_cloexec(int fd)
>  {
>  }
>  
> +void qemu_clear_cloexec(int fd)
> +{
> +}
> +
>  int qemu_get_thread_id(void)
>  {
>  return GetCurrentThreadId();



Re: [PATCH V1 03/26] migration: SAVEVM_FOREACH

2024-05-06 Thread Fabiano Rosas
Steve Sistare  writes:

> Define an abstraction SAVEVM_FOREACH to loop over all savevm state
> handlers, and replace QTAILQ_FOREACH.  Define variants for ALL so
> we can loop over all handlers vs a subset of handlers in a subsequent
> patch, but at this time there is no distinction between the two.
> No functional change.
>
> Signed-off-by: Steve Sistare 
> ---
>  migration/savevm.c | 55 
> +++---
>  1 file changed, 32 insertions(+), 23 deletions(-)
>
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 4509482..6829ba3 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -237,6 +237,15 @@ static SaveState savevm_state = {
>  .global_section_id = 0,
>  };
>  
> +#define SAVEVM_FOREACH(se, entry)\
> +QTAILQ_FOREACH(se, &savevm_state.handlers, entry)\
> +
> +#define SAVEVM_FOREACH_ALL(se, entry)\
> +QTAILQ_FOREACH(se, &savevm_state.handlers, entry)

This feels worse than SAVEVM_FOREACH_NOT_PRECREATED. We'll have to keep
coming back to the definition to figure out which FOREACH is the real
deal.

> +
> +#define SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se)   \
> +QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se)
> +
>  static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id);
>  
>  static bool should_validate_capability(int capability)
> @@ -674,7 +683,7 @@ static uint32_t calculate_new_instance_id(const char 
> *idstr)
>  SaveStateEntry *se;
>  uint32_t instance_id = 0;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH_ALL(se, entry) {

In this patch we can't have both instances...

>  if (strcmp(idstr, se->idstr) == 0
>  && instance_id <= se->instance_id) {
>  instance_id = se->instance_id + 1;
> @@ -690,7 +699,7 @@ static int calculate_compat_instance_id(const char *idstr)
>  SaveStateEntry *se;
>  int instance_id = 0;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {

...otherwise one of the two changes will go undocumented because the
actual reason for it will only be described in the next patch.

>  if (!se->compat) {
>  continue;
>  }
> @@ -816,7 +825,7 @@ void unregister_savevm(VMStateIf *obj, const char *idstr, 
> void *opaque)
>  }
>  pstrcat(id, sizeof(id), idstr);
>  
> -QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
> +SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se) {
>  if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
>  savevm_state_handler_remove(se);
>  g_free(se->compat);
> @@ -939,7 +948,7 @@ void vmstate_unregister(VMStateIf *obj, const 
> VMStateDescription *vmsd,
>  {
>  SaveStateEntry *se, *new_se;
>  
> -QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
> +SAVEVM_FOREACH_SAFE_ALL(se, entry, new_se) {
>  if (se->vmsd == vmsd && se->opaque == opaque) {
>  savevm_state_handler_remove(se);
>  g_free(se->compat);
> @@ -1223,7 +1232,7 @@ bool qemu_savevm_state_blocked(Error **errp)
>  {
>  SaveStateEntry *se;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (se->vmsd && se->vmsd->unmigratable) {
>  error_setg(errp, "State blocked by non-migratable device '%s'",
> se->idstr);
> @@ -1237,7 +1246,7 @@ void qemu_savevm_non_migratable_list(strList **reasons)
>  {
>  SaveStateEntry *se;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (se->vmsd && se->vmsd->unmigratable) {
>  QAPI_LIST_PREPEND(*reasons,
>g_strdup_printf("non-migratable device: %s",
> @@ -1276,7 +1285,7 @@ bool qemu_savevm_state_guest_unplug_pending(void)
>  {
>  SaveStateEntry *se;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (se->vmsd && se->vmsd->dev_unplug_pending &&
>  se->vmsd->dev_unplug_pending(se->opaque)) {
>  return true;
> @@ -1291,7 +1300,7 @@ int qemu_savevm_state_prepare(Error **errp)
>  SaveStateEntry *se;
>  int ret;
>  
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (!se->ops || !se->ops->save_prepare) {
>  continue;
>  }
> @@ -1321,7 +1330,7 @@ int qemu_savevm_state_setup(QEMUFile *f, Error **errp)
>  json_writer_start_array(ms->vmdesc, "devices");
>  
>  trace_savevm_state_setup();
> -QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> +SAVEVM_FOREACH(se, entry) {
>  if (se->vmsd && se->vmsd->early_setup) {
>  ret = vmstate_save(f, se, ms->vmdesc, errp);
>  if (ret) {
> @@ -1365,7 

Re: [PATCH v2 04/33] accel/tcg: Reorg translator_ld*

2024-05-06 Thread Philippe Mathieu-Daudé

On 25/4/24 01:31, Richard Henderson wrote:

Reorg translator_access into translator_ld, with a more
memcpy-ish interface.  If both pages are in ram, do not
go through the caller's slow path.

Assert that the access is within the two pages that we are
prepared to protect, per TranslationBlock.  Allow access
prior to pc_first, so long as it is within the first page.

Signed-off-by: Richard Henderson 
---
  accel/tcg/translator.c | 189 ++---
  1 file changed, 101 insertions(+), 88 deletions(-)




  uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, vaddr pc)
  {
-uint64_t ret, plug;
-void *p = translator_access(env, db, pc, sizeof(ret));
+uint64_t raw, tgt;
  
-if (p) {

-plugin_insn_append(pc, p, sizeof(ret));
-return ldq_p(p);
+if (translator_ld(env, db, &raw, pc, sizeof(raw))) {
+tgt = tswap64(raw);
+} else {
+tgt = cpu_ldl_code(env, pc);


cpu_ldq_code() ?


+raw = tswap64(tgt);
  }
-ret = cpu_ldq_code(env, pc);
-plug = tswap64(ret);
-plugin_insn_append(pc, &plug, sizeof(ret));
-return ret;
+plugin_insn_append(pc, &raw, sizeof(raw));
+return tgt;
  }






Re: [PATCH v2 32/33] target/s390x: Use translator_lduw in get_next_pc

2024-05-06 Thread Philippe Mathieu-Daudé

On 25/4/24 01:31, Richard Henderson wrote:

Signed-off-by: Richard Henderson 
---
  target/s390x/tcg/translate.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 





Re: [PATCH v2 15/33] plugins: Merge alloc_tcg_plugin_context into plugin_gen_tb_start

2024-05-06 Thread Philippe Mathieu-Daudé

On 25/4/24 01:31, Richard Henderson wrote:

We don't need to allocate plugin context at startup,
we can wait until we actually use it.

Signed-off-by: Richard Henderson 
---
  accel/tcg/plugin-gen.c | 36 
  tcg/tcg.c  | 11 ---
  2 files changed, 20 insertions(+), 27 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 02/57] target/arm: Split out gengvec64.c

2024-05-06 Thread Philippe Mathieu-Daudé

On 6/5/24 03:03, Richard Henderson wrote:

Split some routines out of translate-a64.c and translate-sve.c
that are used by both.

Signed-off-by: Richard Henderson 
---
  target/arm/tcg/translate-a64.h |   4 +
  target/arm/tcg/gengvec64.c | 190 +
  target/arm/tcg/translate-a64.c |  26 -
  target/arm/tcg/translate-sve.c | 145 +
  target/arm/tcg/meson.build |   1 +
  5 files changed, 197 insertions(+), 169 deletions(-)
  create mode 100644 target/arm/tcg/gengvec64.c


Reviewed using git-diff --color-moved=dimmed-zebra

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 01/57] target/arm: Split out gengvec.c

2024-05-06 Thread Philippe Mathieu-Daudé

On 6/5/24 03:03, Richard Henderson wrote:

Signed-off-by: Richard Henderson 
---
  target/arm/tcg/translate.h |5 +
  target/arm/tcg/gengvec.c   | 1612 
  target/arm/tcg/translate.c | 1588 ---
  target/arm/tcg/meson.build |1 +
  4 files changed, 1618 insertions(+), 1588 deletions(-)
  create mode 100644 target/arm/tcg/gengvec.c


Reviewed using git-diff --color-moved=dimmed-zebra

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH] gitlab: Rename ubuntu-22.04-s390x-all to *-system

2024-05-06 Thread Philippe Mathieu-Daudé

On 6/5/24 22:23, Richard Henderson wrote:

We already build the linux-user binaries with
ubuntu-22.04-s390x-all-linux, so there's no need to do it again.

Signed-off-by: Richard Henderson 
---
  .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH V1 04/26] migration: delete unused parameter mis

2024-05-06 Thread Fabiano Rosas
Steve Sistare  writes:

> Signed-off-by: Steve Sistare 

Reviewed-by: Fabiano Rosas 



Re: [PATCH v8 0/5] Support message-based DMA in vfio-user server

2024-05-06 Thread Mattias Nissler
On Mon, May 6, 2024 at 4:44 PM Peter Xu  wrote:

> On Thu, Mar 28, 2024 at 08:53:36AM +0100, Mattias Nissler wrote:
> > Stefan, to the best of my knowledge this is fully reviewed and ready
> > to go in - can you kindly pick it up or advise in case there's
> > something I missed? Thanks!
>
> Fails cross-compile on mipsel:
>
> https://gitlab.com/peterx/qemu/-/jobs/6787790601


Ah, bummer, thanks for reporting. 4GB of bounce buffer should be plenty, so
switching to 32 bit atomics seems a good idea at first glance. I'll take a
closer look tomorrow and send a respin with a fix.


Re: [PATCH v8 0/5] Support message-based DMA in vfio-user server

2024-05-06 Thread Mattias Nissler
On Mon, May 6, 2024 at 5:01 PM Stefan Hajnoczi  wrote:

> On Thu, 28 Mar 2024 at 03:54, Mattias Nissler 
> wrote:
> >
> > Stefan, to the best of my knowledge this is fully reviewed and ready
> > to go in - can you kindly pick it up or advise in case there's
> > something I missed? Thanks!
>
> This code is outside the areas that I maintain. I think it would make
> sense for Jag to merge it and send a pull request as vfio-user
> maintainer.


OK, thanks for following up, I'll check with Jag.


[PATCH] gitlab: Rename ubuntu-22.04-s390x-all to *-system

2024-05-06 Thread Richard Henderson
We already build the linux-user binaries with
ubuntu-22.04-s390x-all-linux, so there's no need to do it again.

Signed-off-by: Richard Henderson 
---
 .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
index 3a2b1e1d24..c3f16d77bb 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
@@ -21,7 +21,7 @@ ubuntu-22.04-s390x-all-linux:
  - make --output-sync check-tcg
  - make --output-sync -j`nproc` check
 
-ubuntu-22.04-s390x-all:
+ubuntu-22.04-s390x-all-system:
  extends: .custom_runner_template
  needs: []
  stage: build
@@ -35,7 +35,7 @@ ubuntu-22.04-s390x-all:
  script:
  - mkdir build
  - cd build
- - ../configure --disable-libssh
+ - ../configure --disable-libssh --disable-user
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc`
  - make --output-sync -j`nproc` check
-- 
2.34.1




[PATCH] gitlab: Drop --static from s390x linux-user build

2024-05-06 Thread Richard Henderson
The host does not have the correct libraries installed for static pie,
which causes host/guest address space interference for some tests.
There's no real gain from linking statically, so drop it.

Signed-off-by: Richard Henderson 
---
Per my suggestion in

https://lore.kernel.org/qemu-devel/50c27a9f-fd75-4f8e-9a2d-488d8df4f...@linaro.org


r~
---
 .gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
index 105981879f..3a2b1e1d24 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-22.04-s390x.yml
@@ -2,7 +2,7 @@
 # setup by the scripts/ci/setup/build-environment.yml task
 # "Install basic packages to build QEMU on Ubuntu 22.04"
 
-ubuntu-22.04-s390x-all-linux-static:
+ubuntu-22.04-s390x-all-linux:
  extends: .custom_runner_template
  needs: []
  stage: build
@@ -15,7 +15,7 @@ ubuntu-22.04-s390x-all-linux-static:
  script:
  - mkdir build
  - cd build
- - ../configure --enable-debug --static --disable-system
+ - ../configure --enable-debug-tcg --disable-system --disable-tools 
--disable-docs
|| { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc`
  - make --output-sync check-tcg
-- 
2.34.1




Re: qemu-img cache modes with Linux cgroup v1

2024-05-06 Thread Alex Kalenyuk
Hey, just FYI about tmpfs, during some development on Fedora 39 I noticed
O_DIRECT is now supported on tmpfs (as opposed to our CI which runs Centos
9 Stream).
`qemu-img convert -t none -O raw tests/images/cirros-qcow2.img
/tmp/cirros.raw`
where /tmp is indeed a tmpfs.

I might be missing something so feel free to call that out

On Tue, Aug 1, 2023 at 6:38 PM Stefan Hajnoczi  wrote:

> Hi Daniel,
> I agree with your points.
>
> Stefan
>


Re: [PULL 00/12] qemu-sparc queue 20240506

2024-05-06 Thread Richard Henderson

On 5/6/24 04:44, Mark Cave-Ayland wrote:

The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5:

   Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu  into 
staging (2024-05-04 08:39:46 -0700)

are available in the Git repository at:

   https://github.com/mcayland/qemu.git  tags/qemu-sparc-20240506

for you to fetch changes up to d6f898cf85c92389182d22f0bcc3a11d7194fc94:

   target/sparc: Split out do_ms16b (2024-05-05 21:02:48 +0100)


qemu-sparc queue
- Default to modern virtio with iommu_platform enabled for sun4u
- Fixes for various VIS instructions from Richard
- CPU name updates from Thomas


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PULL 0/7] QAPI patches patches for 2024-05-06

2024-05-06 Thread Richard Henderson

On 5/6/24 04:02, Markus Armbruster wrote:

The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5:

   Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu  into 
staging (2024-05-04 08:39:46 -0700)

are available in the Git repository at:

   https://repo.or.cz/qemu/armbru.git  tags/pull-qapi-2024-05-06

for you to fetch changes up to 285a8f209af7b4992aa91e8bea03a303fb6406ab:

   qapi: Simplify QAPISchemaVariants @tag_member (2024-05-06 12:38:27 +0200)


QAPI patches patches for 2024-05-06


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PULL 00/28] Accelerator patches for 2024-05-06

2024-05-06 Thread Richard Henderson

On 5/6/24 05:37, Philippe Mathieu-Daudé wrote:

The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5:

   Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu  into 
staging (2024-05-04 08:39:46 -0700)

are available in the Git repository at:

   https://github.com/philmd/qemu.git  tags/accel-next-20240506

for you to fetch changes up to c984d1d8916df8abac71325a5a135cd851b2106a:

   MAINTAINERS: Update my email address (2024-05-06 14:33:49 +0200)


Accelerator patches

- Extract page-protection definitions to page-protection.h
- Rework in accel/tcg in preparation of extracting TCG fields from CPUState
- More uses of get_task_state() in user emulation
- Xen refactors in preparation for adding multiple map caches (Juergen & Edgar)
- MAINTAINERS updates (Aleksandar and Bin)


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




[PATCH] target/sh4: Update DisasContextBase.insn_start

2024-05-06 Thread Richard Henderson
Match the extra inserts of INDEX_op_insn_start, fixing
the db->num_insns != 1 assert in translator_loop.

Fixes: dcd092a0636 ("accel/tcg: Improve can_do_io management")
Signed-off-by: Richard Henderson 
---
 target/sh4/translate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index e599ab9d1a..b3282f3ac7 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -2189,6 +2189,7 @@ static void decode_gusa(DisasContext *ctx, CPUSH4State 
*env)
  */
 for (i = 1; i < max_insns; ++i) {
 tcg_gen_insn_start(pc + i * 2, ctx->envflags);
+ctx->base.insn_start = tcg_last_op();
 }
 }
 #endif
-- 
2.34.1




[PATCH 2/2] aio: warn about iohandler_ctx special casing

2024-05-06 Thread Stefan Hajnoczi
The main loop has two AioContexts: qemu_aio_context and iohandler_ctx.
The main loop runs them both, but nested aio_poll() calls on
qemu_aio_context exclude iohandler_ctx.

Which one should qemu_get_current_aio_context() return when called from
the main loop? Document that it's always qemu_aio_context.

This has subtle effects on functions that use
qemu_get_current_aio_context(). For example, aio_co_reschedule_self()
does not work when moving from iohandler_ctx to qemu_aio_context because
qemu_get_current_aio_context() does not differentiate these two
AioContexts.

Document this in order to reduce the chance of future bugs.

Signed-off-by: Stefan Hajnoczi 
---
 include/block/aio.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/block/aio.h b/include/block/aio.h
index 8378553eb9..4ee81936ed 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -629,6 +629,9 @@ void aio_co_schedule(AioContext *ctx, Coroutine *co);
  *
  * Move the currently running coroutine to new_ctx. If the coroutine is already
  * running in new_ctx, do nothing.
+ *
+ * Note that this function cannot reschedule from iohandler_ctx to
+ * qemu_aio_context.
  */
 void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
 
@@ -661,6 +664,9 @@ void aio_co_enter(AioContext *ctx, Coroutine *co);
  * If called from an IOThread this will be the IOThread's AioContext.  If
  * called from the main thread or with the "big QEMU lock" taken it
  * will be the main loop AioContext.
+ *
+ * Note that the return value is never the main loop's iohandler_ctx and the
+ * return value is the main loop AioContext instead.
  */
 AioContext *qemu_get_current_aio_context(void);
 
-- 
2.45.0




[PATCH 1/2] Revert "monitor: use aio_co_reschedule_self()"

2024-05-06 Thread Stefan Hajnoczi
Commit 1f25c172f837 ("monitor: use aio_co_reschedule_self()") was a code
cleanup that uses aio_co_reschedule_self() instead of open coding
coroutine rescheduling.

Bug RHEL-34618 was reported and Kevin Wolf  identified
the root cause. I missed that aio_co_reschedule_self() ->
qemu_get_current_aio_context() only knows about
qemu_aio_context/IOThread AioContexts and not about iohandler_ctx. It
does not function correctly when going back from the iohandler_ctx to
qemu_aio_context.

Go back to open coding the AioContext transitions to avoid this bug.

This reverts commit 1f25c172f83704e350c0829438d832384084a74d.

Buglink: https://issues.redhat.com/browse/RHEL-34618
Signed-off-by: Stefan Hajnoczi 
---
 qapi/qmp-dispatch.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
index f3488afeef..176b549473 100644
--- a/qapi/qmp-dispatch.c
+++ b/qapi/qmp-dispatch.c
@@ -212,7 +212,8 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList 
*cmds, QObject *requ
  * executing the command handler so that it can make progress if it
  * involves an AIO_WAIT_WHILE().
  */
-aio_co_reschedule_self(qemu_get_aio_context());
+aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self());
+qemu_coroutine_yield();
 }
 
 monitor_set_cur(qemu_coroutine_self(), cur_mon);
@@ -226,7 +227,9 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList 
*cmds, QObject *requ
  * Move back to iohandler_ctx so that nested event loops for
  * qemu_aio_context don't start new monitor commands.
  */
-aio_co_reschedule_self(iohandler_get_aio_context());
+aio_co_schedule(iohandler_get_aio_context(),
+qemu_coroutine_self());
+qemu_coroutine_yield();
 }
 } else {
/*
-- 
2.45.0




[PATCH 0/2] Revert "monitor: use aio_co_reschedule_self()"

2024-05-06 Thread Stefan Hajnoczi
This series fixes RHEL-34618 "qemu crash on Assertion `luringcb->co->ctx ==
s->aio_context' failed when do block_resize on hotplug disk with aio=io_uring":
https://issues.redhat.com/browse/RHEL-34618

Kevin identified commit 1f25c172f837 ("monitor: use aio_co_reschedule_self()")
as the root cause. There is a subtlety regarding how
qemu_get_current_aio_context() returns qemu_aio_context even though we may be
running in iohandler_ctx.

Revert commit 1f25c172f837, it was just intended as a code cleanup.

Stefan Hajnoczi (2):
  Revert "monitor: use aio_co_reschedule_self()"
  aio: warn about iohandler_ctx special casing

 include/block/aio.h | 6 ++
 qapi/qmp-dispatch.c | 7 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

-- 
2.45.0




Re: Intention to work on GSoC project

2024-05-06 Thread Sahil
Hi,

It's been a while since I last gave an update. Sorry about that. I am ready
to get my hands dirty and start with the implementation.

I have gone through the source of linux's drivers/virtio/virtio_ring.c [1], and
QEMU's hw/virtio/virtio.c [2] and hw/virtio/vhost-shadow-virtqueue.c [3].

Before actually starting I would like to make sure I am on the right track. In
vhost-shadow-virtqueue.c, there's a function "vhost_svq_add" which in turn
calls "vhost_svq_add_split".

Shall I start by implementing a mechanism to check if the feature bit
"VIRTIO_F_RING_PACKED" is set (using "virtio_vdev_has_feature")? And
if it's supported, "vhost_svq_add" should call "vhost_svq_add_packed".
Following this, I can then start implementing "vhost_svq_add_packed"
and progress from there.

What are your thoughts on this?

Thanks,
Sahil

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/virtio/virtio.c
[2] https://gitlab.com/qemu-project/qemu/-/blob/master/hw/virtio/virtio.c
[3] 
https://gitlab.com/qemu-project/qemu/-/blob/master/hw/virtio/vhost-shadow-virtqueue.c





RE: [PATCH 2/4] target/hexagon: idef-parser remove undefined functions

2024-05-06 Thread ltaylorsimpson



> -Original Message-
> From: Anton Johansson 
> Sent: Monday, May 6, 2024 1:31 PM
> To: qemu-devel@nongnu.org
> Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com
> Subject: [PATCH 2/4] target/hexagon: idef-parser remove undefined
> functions
> 
> Signed-off-by: Anton Johansson 
> ---
>  target/hexagon/idef-parser/parser-helpers.h | 13 -
>  1 file changed, 13 deletions(-)

Reviewed-by: Taylor Simpson 




RE: [PATCH 3/4] target/hexagon: idef-parser fix leak of init_list

2024-05-06 Thread ltaylorsimpson



> -Original Message-
> From: Anton Johansson 
> Sent: Monday, May 6, 2024 1:31 PM
> To: qemu-devel@nongnu.org
> Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com
> Subject: [PATCH 3/4] target/hexagon: idef-parser fix leak of init_list
> 
> gen_inst_init_args() is called for instructions using a predicate as an
rvalue.
> Upon first call, the list of arguments which might need initialization
init_list is
> freed to indicate that they have been processed. For instructions without
an
> rvalue predicate,
> gen_inst_init_args() isn't called and init_list will never be freed.
> 
> Free init_list from free_instruction() if it hasn't already been freed.
> 
> Signed-off-by: Anton Johansson 
> ---
>  target/hexagon/idef-parser/parser-helpers.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/target/hexagon/idef-parser/parser-helpers.c
> b/target/hexagon/idef-parser/parser-helpers.c
> index 95f2b43076..bae01c2bb8 100644
> --- a/target/hexagon/idef-parser/parser-helpers.c
> +++ b/target/hexagon/idef-parser/parser-helpers.c
> @@ -2121,6 +2121,13 @@ void free_instruction(Context *c)
>  g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE);
>  }
>  g_array_free(c->inst.strings, TRUE);
> +/*
> + * Free list of arguments that might need initialization, if they
haven't
> + * already been free'd.
> + */
> +if (c->inst.init_list) {
> +g_array_free(c->inst.init_list, TRUE);
> +}
>  /* Free INAME token value */
>  g_string_free(c->inst.name, TRUE);
>  /* Free variables and registers */

Why not do this in gen_inst_init_args just before this?
   /* Free argument init list once we have initialized everything */
g_array_free(c->inst.init_list, TRUE);
c->inst.init_list = NULL;


Taylor






RE: [PATCH 4/4] target/hexagon: idef-parser simplify predicate init

2024-05-06 Thread ltaylorsimpson



> -Original Message-
> From: Anton Johansson 
> Sent: Monday, May 6, 2024 1:31 PM
> To: qemu-devel@nongnu.org
> Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com
> Subject: [PATCH 4/4] target/hexagon: idef-parser simplify predicate init
> 
> Only predicate instruction arguments need to be initialized by
idef-parser.
> This commit removes registers from the init_list and simplifies
> gen_inst_init_args() slightly.
> 
> Signed-off-by: Anton Johansson 
> ---
>  target/hexagon/idef-parser/idef-parser.y|  2 --
>  target/hexagon/idef-parser/parser-helpers.c | 20 +---
>  2 files changed, 9 insertions(+), 13 deletions(-)

> diff --git a/target/hexagon/idef-parser/parser-helpers.c
> b/target/hexagon/idef-parser/parser-helpers.c
> index bae01c2bb8..33e8f82007 100644
> --- a/target/hexagon/idef-parser/parser-helpers.c
> +++ b/target/hexagon/idef-parser/parser-helpers.c
> @@ -1652,26 +1652,24 @@ void gen_inst(Context *c, GString *iname)
> 
>  void gen_inst_init_args(Context *c, YYLTYPE *locp)  {
> +/* If init_list is NULL arguments have already been initialized */
>  if (!c->inst.init_list) {
>  return;
>  }
> 
>  for (unsigned i = 0; i < c->inst.init_list->len; i++) {
>  HexValue *val = &g_array_index(c->inst.init_list, HexValue, i);
> -if (val->type == REGISTER_ARG) {
> -/* Nothing to do here */
> -} else if (val->type == PREDICATE) {
> -char suffix = val->is_dotnew ? 'N' : 'V';
> -EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width,
> -  val->pred.id, suffix);
> -} else {
> -yyassert(c, locp, false, "Invalid arg type!");
> -}
> +yyassert(c, locp, val->type == PREDICATE,
> + "Only predicates need to be initialized!");
> +char suffix = val->is_dotnew ? 'N' : 'V';

Declarations should be at the beginning of the function per QEMU coding
standards.

> +EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width,

Since you know this is a predicate, the bit_width will always be 32.  You
can hard-code that instead of using %u.

> +  val->pred.id, suffix);
>  }
> 
>  /* Free argument init list once we have initialized everything */

Taylor





RE: [PATCH 1/4] target/hexagon: idef-parser remove unused defines

2024-05-06 Thread ltaylorsimpson



> -Original Message-
> From: Anton Johansson 
> Sent: Monday, May 6, 2024 1:31 PM
> To: qemu-devel@nongnu.org
> Cc: a...@rev.ng; ltaylorsimp...@gmail.com; bc...@quicinc.com
> Subject: [PATCH 1/4] target/hexagon: idef-parser remove unused defines
> 
> Before switching to GArray/g_string_printf we used fixed size arrays for
> output buffers and instructions arguments among other things.
> 
> Macros defining the sizes of these buffers were left behind, remove them.
> 
> Signed-off-by: Anton Johansson 
> ---
>  target/hexagon/idef-parser/idef-parser.h | 10 --
>  1 file changed, 10 deletions(-)

Reviewed-by: Taylor Simpson 





[PATCH 2/4] target/hexagon: idef-parser remove undefined functions

2024-05-06 Thread Anton Johansson via
Signed-off-by: Anton Johansson 
---
 target/hexagon/idef-parser/parser-helpers.h | 13 -
 1 file changed, 13 deletions(-)

diff --git a/target/hexagon/idef-parser/parser-helpers.h 
b/target/hexagon/idef-parser/parser-helpers.h
index 7c58087169..2087d534a9 100644
--- a/target/hexagon/idef-parser/parser-helpers.h
+++ b/target/hexagon/idef-parser/parser-helpers.h
@@ -143,8 +143,6 @@ void commit(Context *c);
 
 #define OUT(c, locp, ...) FOR_EACH((c), (locp), OUT_IMPL, __VA_ARGS__)
 
-const char *cmp_swap(Context *c, YYLTYPE *locp, const char *type);
-
 /**
  * Temporary values creation
  */
@@ -236,8 +234,6 @@ HexValue gen_extract_op(Context *c,
 HexValue *index,
 HexExtract *extract);
 
-HexValue gen_read_reg(Context *c, YYLTYPE *locp, HexValue *reg);
-
 void gen_write_reg(Context *c, YYLTYPE *locp, HexValue *reg, HexValue *value);
 
 void gen_assign(Context *c,
@@ -274,13 +270,6 @@ HexValue gen_ctpop_op(Context *c, YYLTYPE *locp, HexValue 
*src);
 
 HexValue gen_rotl(Context *c, YYLTYPE *locp, HexValue *src, HexValue *n);
 
-HexValue gen_deinterleave(Context *c, YYLTYPE *locp, HexValue *mixed);
-
-HexValue gen_interleave(Context *c,
-YYLTYPE *locp,
-HexValue *odd,
-HexValue *even);
-
 HexValue gen_carry_from_add(Context *c,
 YYLTYPE *locp,
 HexValue *op1,
@@ -349,8 +338,6 @@ HexValue gen_rvalue_ternary(Context *c, YYLTYPE *locp, 
HexValue *cond,
 
 const char *cond_to_str(TCGCond cond);
 
-void emit_header(Context *c);
-
 void emit_arg(Context *c, YYLTYPE *locp, HexValue *arg);
 
 void emit_footer(Context *c);
-- 
2.44.0




[PATCH 4/4] target/hexagon: idef-parser simplify predicate init

2024-05-06 Thread Anton Johansson via
Only predicate instruction arguments need to be initialized by
idef-parser. This commit removes registers from the init_list and
simplifies gen_inst_init_args() slightly.

Signed-off-by: Anton Johansson 
---
 target/hexagon/idef-parser/idef-parser.y|  2 --
 target/hexagon/idef-parser/parser-helpers.c | 20 +---
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/target/hexagon/idef-parser/idef-parser.y 
b/target/hexagon/idef-parser/idef-parser.y
index cd2612eb8c..9ffb9f9699 100644
--- a/target/hexagon/idef-parser/idef-parser.y
+++ b/target/hexagon/idef-parser/idef-parser.y
@@ -233,8 +233,6 @@ code : '{' statements '}'
 argument_decl : REG
 {
 emit_arg(c, &@1, &$1);
-/* Enqueue register into initialization list */
-g_array_append_val(c->inst.init_list, $1);
 }
   | PRED
 {
diff --git a/target/hexagon/idef-parser/parser-helpers.c 
b/target/hexagon/idef-parser/parser-helpers.c
index bae01c2bb8..33e8f82007 100644
--- a/target/hexagon/idef-parser/parser-helpers.c
+++ b/target/hexagon/idef-parser/parser-helpers.c
@@ -1652,26 +1652,24 @@ void gen_inst(Context *c, GString *iname)
 
 
 /*
- * Initialize declared but uninitialized registers, but only for
- * non-conditional instructions
+ * Initialize declared but uninitialized instruction arguments. Only needed for
+ * predicate arguments, initialization of registers is handled by the Hexagon
+ * frontend.
  */
 void gen_inst_init_args(Context *c, YYLTYPE *locp)
 {
+/* If init_list is NULL arguments have already been initialized */
 if (!c->inst.init_list) {
 return;
 }
 
 for (unsigned i = 0; i < c->inst.init_list->len; i++) {
 HexValue *val = &g_array_index(c->inst.init_list, HexValue, i);
-if (val->type == REGISTER_ARG) {
-/* Nothing to do here */
-} else if (val->type == PREDICATE) {
-char suffix = val->is_dotnew ? 'N' : 'V';
-EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width,
-  val->pred.id, suffix);
-} else {
-yyassert(c, locp, false, "Invalid arg type!");
-}
+yyassert(c, locp, val->type == PREDICATE,
+ "Only predicates need to be initialized!");
+char suffix = val->is_dotnew ? 'N' : 'V';
+EMIT_HEAD(c, "tcg_gen_movi_i%u(P%c%c, 0);\n", val->bit_width,
+  val->pred.id, suffix);
 }
 
 /* Free argument init list once we have initialized everything */
-- 
2.44.0




[PATCH 3/4] target/hexagon: idef-parser fix leak of init_list

2024-05-06 Thread Anton Johansson via
gen_inst_init_args() is called for instructions using a predicate as an
rvalue. Upon first call, the list of arguments which might need
initialization init_list is freed to indicate that they have been
processed. For instructions without an rvalue predicate,
gen_inst_init_args() isn't called and init_list will never be freed.

Free init_list from free_instruction() if it hasn't already been freed.

Signed-off-by: Anton Johansson 
---
 target/hexagon/idef-parser/parser-helpers.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/target/hexagon/idef-parser/parser-helpers.c 
b/target/hexagon/idef-parser/parser-helpers.c
index 95f2b43076..bae01c2bb8 100644
--- a/target/hexagon/idef-parser/parser-helpers.c
+++ b/target/hexagon/idef-parser/parser-helpers.c
@@ -2121,6 +2121,13 @@ void free_instruction(Context *c)
 g_string_free(g_array_index(c->inst.strings, GString*, i), TRUE);
 }
 g_array_free(c->inst.strings, TRUE);
+/*
+ * Free list of arguments that might need initialization, if they haven't
+ * already been free'd.
+ */
+if (c->inst.init_list) {
+g_array_free(c->inst.init_list, TRUE);
+}
 /* Free INAME token value */
 g_string_free(c->inst.name, TRUE);
 /* Free variables and registers */
-- 
2.44.0




[PATCH 0/4] target/hexagon: Minor idef-parser cleanup

2024-05-06 Thread Anton Johansson via
Was running idef-parser with valgrind and noticed we were leaking the
init_list GArray, which is used to hold instruction arguments that may
need initialization.  This patchset fixes the leak, removes unused
macros and undefined functions, and simplifies gen_inst_init_args() to
only handle predicate values.

Anton Johansson (4):
  target/hexagon: idef-parser remove unused defines
  target/hexagon: idef-parser remove undefined functions
  target/hexagon: idef-parser fix leak of init_list
  target/hexagon: idef-parser simplify predicate init

 target/hexagon/idef-parser/idef-parser.h| 10 
 target/hexagon/idef-parser/idef-parser.y|  2 --
 target/hexagon/idef-parser/parser-helpers.c | 27 -
 target/hexagon/idef-parser/parser-helpers.h | 13 --
 4 files changed, 16 insertions(+), 36 deletions(-)

-- 
2.44.0




[PATCH 1/4] target/hexagon: idef-parser remove unused defines

2024-05-06 Thread Anton Johansson via
Before switching to GArray/g_string_printf we used fixed size arrays for
output buffers and instructions arguments among other things.

Macros defining the sizes of these buffers were left behind, remove
them.

Signed-off-by: Anton Johansson 
---
 target/hexagon/idef-parser/idef-parser.h | 10 --
 1 file changed, 10 deletions(-)

diff --git a/target/hexagon/idef-parser/idef-parser.h 
b/target/hexagon/idef-parser/idef-parser.h
index 3faa1deecd..8594cbe3a2 100644
--- a/target/hexagon/idef-parser/idef-parser.h
+++ b/target/hexagon/idef-parser/idef-parser.h
@@ -23,16 +23,6 @@
 #include 
 #include 
 
-#define TCGV_NAME_SIZE 7
-#define MAX_WRITTEN_REGS 32
-#define OFFSET_STR_LEN 32
-#define ALLOC_LIST_LEN 32
-#define ALLOC_NAME_SIZE 32
-#define INIT_LIST_LEN 32
-#define OUT_BUF_LEN (1024 * 1024)
-#define SIGNATURE_BUF_LEN (128 * 1024)
-#define HEADER_BUF_LEN (128 * 1024)
-
 /* Variadic macros to wrap the buffer printing functions */
 #define EMIT(c, ...)   
\
 do {   
\
-- 
2.44.0




Re: qemu-img cache modes with Linux cgroup v1

2024-05-06 Thread Stefan Hajnoczi
On Mon, May 06, 2024 at 08:10:25PM +0300, Alex Kalenyuk wrote:
> Hey, just FYI about tmpfs, during some development on Fedora 39 I noticed
> O_DIRECT is now supported on tmpfs (as opposed to our CI which runs Centos
> 9 Stream).
> `qemu-img convert -t none -O raw tests/images/cirros-qcow2.img
> /tmp/cirros.raw`
> where /tmp is indeed a tmpfs.
> 
> I might be missing something so feel free to call that out

Yes, it was added by:

commit e88e0d366f9cfbb810b0c8509dc5d130d5a53e02
Author: Hugh Dickins 
Date:   Thu Aug 10 23:27:07 2023 -0700

tmpfs: trivial support for direct IO

It's fairly new but great to have.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH v2 5/5] monitor: use aio_co_reschedule_self()

2024-05-06 Thread Stefan Hajnoczi
On Fri, May 03, 2024 at 07:33:17PM +0200, Kevin Wolf wrote:
> Am 06.02.2024 um 20:06 hat Stefan Hajnoczi geschrieben:
> > The aio_co_reschedule_self() API is designed to avoid the race
> > condition between scheduling the coroutine in another AioContext and
> > yielding.
> > 
> > The QMP dispatch code uses the open-coded version that appears
> > susceptible to the race condition at first glance:
> > 
> >   aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self());
> >   qemu_coroutine_yield();
> > 
> > The code is actually safe because the iohandler and qemu_aio_context
> > AioContext run under the Big QEMU Lock. Nevertheless, set a good example
> > and use aio_co_reschedule_self() so it's obvious that there is no race.
> > 
> > Suggested-by: Hanna Reitz 
> > Reviewed-by: Manos Pitsidianakis 
> > Reviewed-by: Hanna Czenczek 
> > Signed-off-by: Stefan Hajnoczi 
> > ---
> >  qapi/qmp-dispatch.c | 7 ++-
> >  1 file changed, 2 insertions(+), 5 deletions(-)
> > 
> > diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
> > index 176b549473..f3488afeef 100644
> > --- a/qapi/qmp-dispatch.c
> > +++ b/qapi/qmp-dispatch.c
> > @@ -212,8 +212,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const 
> > QmpCommandList *cmds, QObject *requ
> >   * executing the command handler so that it can make progress 
> > if it
> >   * involves an AIO_WAIT_WHILE().
> >   */
> > -aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self());
> > -qemu_coroutine_yield();
> > +aio_co_reschedule_self(qemu_get_aio_context());
> 
> Turns out that this one actually causes a regression. [1] This code is
> ŕun in iohandler_ctx, aio_co_reschedule_self() looks at the new context
> and compares it with qemu_get_current_aio_context() - and because both
> are qemu_aio_context, it decides that it has nothing to do. So the
> command handler coroutine actually still runs in iohandler_ctx now,
> which is not what we want.
> 
> We could just revert this patch because it was only meant as a cleanup
> without a semantic difference.
> 
> Or aio_co_reschedule_self() could look at qemu_coroutine_self()->ctx
> instead of using qemu_get_current_aio_context(). That would be a little
> more indirect, though, and I'm not sure if co->ctx is always up to date.
> 
> Any opinions on what is the best way to fix this?

If the commit is reverted then similar bugs may be introduced again in
the future. The qemu_get_current_aio_context() API is unaware of
iohandler_ctx and this can lead to unexpected results.

I will send patches to revert the commit and add doc comments explaining
iohandler_ctx's special behavior. This will reduce, but not eliminate,
the risk of future bugs.

Modifying aio_co_reschedule_self() might be better long-term fix, but
I'm afraid it will create more bugs because it will expose the subtle
distinction between the current coroutine AioContext and non-coroutine
AioContext in new places. I think the root cause is that iohandler_ctx
isn't a full-fledged AioContext with its own event loop. iohandler_ctx
is a special superset of qemu_aio_context that the main loop monitors.

Stefan

> 
> Kevin
> 
> [1] https://issues.redhat.com/browse/RHEL-34618
> 


signature.asc
Description: PGP signature


Re: [PATCH 4/4] tests/qtest: Check STM32L4x5 clock connections

2024-05-06 Thread Inès Varhol



- Le 6 Mai 24, à 6:16, Thomas Huth th...@redhat.com a écrit :

> On 05/05/2024 16.05, Inès Varhol wrote:
>> For USART, GPIO and SYSCFG devices, check that clock frequency before
>> and after enabling the peripheral clock in RCC is correct.
>> 
>> Signed-off-by: Inès Varhol 
>> ---
>> Hello,
>> 
>> Should these tests be regrouped in stm32l4x5_rcc-test.c ?
> 
>  Hi,
> 
> sounds mostly like a matter of taste at a first glance. Or what would be the
> benefit of putting everything into the *rcc-test.c file? Could you maybe
> consolidate the get_clock_freq_hz() function that way? (maybe that
> get_clock_freq_hz() function could also be consolidated as a inline function
> in a shared header instead?)
> 
>  Thomas

Hello,

I was indeed looking to consolidate the functions get_clock_freq_hz() and
check_clock() from *usart-test.c (along with the definitions for RCC
registers).
Thank you for your suggestion, I'll use a header file.

Inès Varhol




Re: [PATCH RFC 00/26] Multifd 🔀 device state transfer support with VFIO consumer

2024-05-06 Thread Peter Xu
On Mon, May 06, 2024 at 06:26:46PM +0200, Maciej S. Szmigiero wrote:
> On 29.04.2024 17:09, Peter Xu wrote:
> > On Fri, Apr 26, 2024 at 07:34:09PM +0200, Maciej S. Szmigiero wrote:
> > > On 24.04.2024 00:35, Peter Xu wrote:
> > > > On Wed, Apr 24, 2024 at 12:25:08AM +0200, Maciej S. Szmigiero wrote:
> > > > > On 24.04.2024 00:20, Peter Xu wrote:
> > > > > > On Tue, Apr 23, 2024 at 06:15:35PM +0200, Maciej S. Szmigiero wrote:
> > > > > > > On 19.04.2024 17:31, Peter Xu wrote:
> > > > > > > > On Fri, Apr 19, 2024 at 11:07:21AM +0100, Daniel P. Berrangé 
> > > > > > > > wrote:
> > > > > > > > > On Thu, Apr 18, 2024 at 04:02:49PM -0400, Peter Xu wrote:
> > > > > > > > > > On Thu, Apr 18, 2024 at 08:14:15PM +0200, Maciej S. 
> > > > > > > > > > Szmigiero wrote:
> > > > > > > > > > > I think one of the reasons for these results is that 
> > > > > > > > > > > mixed (RAM + device
> > > > > > > > > > > state) multifd channels participate in the RAM sync 
> > > > > > > > > > > process
> > > > > > > > > > > (MULTIFD_FLAG_SYNC) whereas device state dedicated 
> > > > > > > > > > > channels don't.
> > > > > > > > > > 
> > > > > > > > > > Firstly, I'm wondering whether we can have better names for 
> > > > > > > > > > these new
> > > > > > > > > > hooks.  Currently (only comment on the async* stuff):
> > > > > > > > > > 
> > > > > > > > > >   - complete_precopy_async
> > > > > > > > > >   - complete_precopy
> > > > > > > > > >   - complete_precopy_async_wait
> > > > > > > > > > 
> > > > > > > > > > But perhaps better:
> > > > > > > > > > 
> > > > > > > > > >   - complete_precopy_begin
> > > > > > > > > >   - complete_precopy
> > > > > > > > > >   - complete_precopy_end
> > > > > > > > > > 
> > > > > > > > > > ?
> > > > > > > > > > 
> > > > > > > > > > As I don't see why the device must do something with async 
> > > > > > > > > > in such hook.
> > > > > > > > > > To me it's more like you're splitting one process into 
> > > > > > > > > > multiple, then
> > > > > > > > > > begin/end sounds more generic.
> > > > > > > > > > 
> > > > > > > > > > Then, if with that in mind, IIUC we can already split 
> > > > > > > > > > ram_save_complete()
> > > > > > > > > > into >1 phases too. For example, I would be curious whether 
> > > > > > > > > > the performance
> > > > > > > > > > will go back to normal if we offloading 
> > > > > > > > > > multifd_send_sync_main() into the
> > > > > > > > > > complete_precopy_end(), because we really only need one 
> > > > > > > > > > shot of that, and I
> > > > > > > > > > am quite surprised it already greatly affects VFIO dumping 
> > > > > > > > > > its own things.
> > > > > > > > > > 
> > > > > > > > > > I would even ask one step further as what Dan was asking: 
> > > > > > > > > > have you thought
> > > > > > > > > > about dumping VFIO states via multifd even during 
> > > > > > > > > > iterations?  Would that
> > > > > > > > > > help even more than this series (which IIUC only helps 
> > > > > > > > > > during the blackout
> > > > > > > > > > phase)?
> > > > > > > > > 
> > > > > > > > > To dump during RAM iteration, the VFIO device will need to 
> > > > > > > > > have
> > > > > > > > > dirty tracking and iterate on its state, because the guest 
> > > > > > > > > CPUs
> > > > > > > > > will still be running potentially changing VFIO state. That 
> > > > > > > > > seems
> > > > > > > > > impractical in the general case.
> > > > > > > > 
> > > > > > > > We already do such interations in vfio_save_iterate()?
> > > > > > > > 
> > > > > > > > My understanding is the recent VFIO work is based on the fact 
> > > > > > > > that the VFIO
> > > > > > > > device can track device state changes more or less (besides 
> > > > > > > > being able to
> > > > > > > > save/load full states).  E.g. I still remember in our QE tests 
> > > > > > > > some old
> > > > > > > > devices report much more dirty pages than expected during the 
> > > > > > > > iterations
> > > > > > > > when we were looking into such issue that a huge amount of 
> > > > > > > > dirty pages
> > > > > > > > reported.  But newer models seem to have fixed that and report 
> > > > > > > > much less.
> > > > > > > > 
> > > > > > > > That issue was about GPU not NICs, though, and IIUC a major 
> > > > > > > > portion of such
> > > > > > > > tracking used to be for GPU vRAMs.  So maybe I was mixing up 
> > > > > > > > these, and
> > > > > > > > maybe they work differently.
> > > > > > > 
> > > > > > > The device which this series was developed against (Mellanox 
> > > > > > > ConnectX-7)
> > > > > > > is already transferring its live state before the VM gets stopped 
> > > > > > > (via
> > > > > > > save_live_iterate SaveVMHandler).
> > > > > > > 
> > > > > > > It's just that in addition to the live state it has more than 400 
> > > > > > > MiB
> > > > > > > of state that cannot be transferred while the VM is still running.
> > > > > > > And that fact hurts a lot with respect to the migration downtime.
> > > > > > > 
> > >

Re: [RFC PATCH v3 3/5] KVM: x86: Add notifications for Heki policy configuration and violation

2024-05-06 Thread Mickaël Salaün
On Fri, May 03, 2024 at 07:03:21AM GMT, Sean Christopherson wrote:
> On Fri, May 03, 2024, Mickaël Salaün wrote:
> > Add an interface for user space to be notified about guests' Heki policy
> > and related violations.
> > 
> > Extend the KVM_ENABLE_CAP IOCTL with KVM_CAP_HEKI_CONFIGURE and
> > KVM_CAP_HEKI_DENIAL. Each one takes a bitmask as first argument that can
> > contains KVM_HEKI_EXIT_REASON_CR0 and KVM_HEKI_EXIT_REASON_CR4. The
> > returned value is the bitmask of known Heki exit reasons, for now:
> > KVM_HEKI_EXIT_REASON_CR0 and KVM_HEKI_EXIT_REASON_CR4.
> > 
> > If KVM_CAP_HEKI_CONFIGURE is set, a VM exit will be triggered for each
> > KVM_HC_LOCK_CR_UPDATE hypercalls according to the requested control
> > register. This enables to enlighten the VMM with the guest
> > auto-restrictions.
> > 
> > If KVM_CAP_HEKI_DENIAL is set, a VM exit will be triggered for each
> > pinned CR violation. This enables the VMM to react to a policy
> > violation.
> > 
> > Cc: Borislav Petkov 
> > Cc: Dave Hansen 
> > Cc: H. Peter Anvin 
> > Cc: Ingo Molnar 
> > Cc: Kees Cook 
> > Cc: Madhavan T. Venkataraman 
> > Cc: Paolo Bonzini 
> > Cc: Sean Christopherson 
> > Cc: Thomas Gleixner 
> > Cc: Vitaly Kuznetsov 
> > Cc: Wanpeng Li 
> > Signed-off-by: Mickaël Salaün 
> > Link: https://lore.kernel.org/r/20240503131910.307630-4-...@digikod.net
> > ---
> > 
> > Changes since v1:
> > * New patch. Making user space aware of Heki properties was requested by
> >   Sean Christopherson.
> 
> No, I suggested having userspace _control_ the pinning[*], not merely be 
> notified
> of pinning.
> 
>  : IMO, manipulation of protections, both for memory (this patch) and CPU 
> state
>  : (control registers in the next patch) should come from userspace.  I have 
> no
>  : objection to KVM providing plumbing if necessary, but I think userspace 
> needs to
>  : to have full control over the actual state.
>  : 
>  : One of the things that caused Intel's control register pinning series to 
> stall
>  : out was how to handle edge cases like kexec() and reboot.  Deferring to 
> userspace
>  : means the kernel doesn't need to define policy, e.g. when to unprotect 
> memory,
>  : and avoids questions like "should userspace be able to overwrite pinned 
> control
>  : registers".
>  : 
>  : And like the confidential VM use case, keeping userspace in the loop is a 
> big
>  : beneifit, e.g. the guest can't circumvent protections by coercing 
> userspace into
>  : writing to protected memory.
> 
> I stand by that suggestion, because I don't see a sane way to handle things 
> like
> kexec() and reboot without having a _much_ more sophisticated policy than 
> would
> ever be acceptable in KVM.
> 
> I think that can be done without KVM having any awareness of CR pinning 
> whatsoever.
> E.g. userspace just needs to ability to intercept CR writes and inject #GPs.  
> Off
> the cuff, I suspect the uAPI could look very similar to MSR filtering.  E.g. 
> I bet
> userspace could enforce MSR pinning without any new KVM uAPI at all.
> 
> [*] https://lore.kernel.org/all/zfuyhpuhtmbyd...@google.com

OK, I had concern about the control not directly coming from the guest,
especially in the case of pKVM and confidential computing, but I get you
point.  It should indeed be quite similar to the MSR filtering on the
userspace side, except that we need another interface for the guest to
request such change (i.e. self-protection).

Would it be OK to keep this new KVM_HC_LOCK_CR_UPDATE hypercall but
forward the request to userspace with a VM exit instead?  That would
also enable userspace to get the request and directly configure the CR
pinning with the same VM exit.



Re: [PULL 00/46] Mostly build system and other cleanups patches for 2024-05-06

2024-05-06 Thread Richard Henderson

On 5/6/24 00:50, Paolo Bonzini wrote:

The following changes since commit 4977ce198d2390bff8c71ad5cb1a5f6aa24b56fb:

   Merge tag 'pull-tcg-20240501' ofhttps://gitlab.com/rth7680/qemu  into 
staging (2024-05-01 15:15:33 -0700)

are available in the Git repository at:

   https://gitlab.com/bonzini/qemu.git  tags/for-upstream

for you to fetch changes up to deb686ef0e609ceaec0daa5dc88eb5b3dd9701b0:

   qga/commands-posix: fix typo in qmp_guest_set_user_password (2024-05-03 
19:36:51 +0200)


* target/i386: Introduce SapphireRapids-v3 to add missing features
* switch boards to "default y"
* allow building emulators without any board
* configs: list "implied" device groups in the default configs
* remove unnecessary declarations from typedefs.h
* target/i386: Give IRQs a chance when resetting HF_INHIBIT_IRQ_MASK


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PULL 00/15] Hexagon: simplify gen for packets w/o read-after-write

2024-05-06 Thread Richard Henderson

On 5/5/24 19:42, Brian Cain wrote:

The following changes since commit 248f6f62df073a3b4158fd0093863ab885feabb5:

   Merge tag 'pull-axp-20240504' ofhttps://gitlab.com/rth7680/qemu  into 
staging (2024-05-04 08:39:46 -0700)

are available in the Git repository at:

   https://github.com/quic/qemu  tags/pull-hex-20240505

for you to fetch changes up to a4696661491cac8c1c08e7d482d751f808ce3143:

   Hexagon (target/hexagon) Remove hex_common.read_attribs_file (2024-05-05 
16:22:07 -0700)


Short-circuit for packets w/o read-after-write
Cleanup unused code in gen_*.py scripts


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PATCH v2 17/25] target/i386: move C0-FF opcodes to new decoder (except for x87)

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

The shift instructions are rewritten instead of reusing code from the old
decoder.  Rotates use CC_OP_ADCOX more extensively and generally rely
more on the optimizer, so that the code generators are shared between
the immediate-count and variable-count cases.

In particular, this makes gen_RCL and gen_RCR pretty efficient for the
count == 1 case, which becomes (apart from a few extra movs) something like:

   (compute_cc_all if needed)
   // save old value for OF calculation
   mov cc_src2, T0
   // the bulk of RCL is just this!
   deposit T0, cc_src, T0, 1, TARGET_LONG_BITS - 1
   // compute carry
   shr cc_dst, cc_src2, length - 1
   and cc_dst, cc_dst, 1
   // compute overflow
   xor cc_src2, cc_src2, T0
   extract cc_src2, cc_src2, length - 1, 1

32-bit MUL and IMUL are also slightly more efficient on 64-bit hosts.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/decode-new.h |1 +
  target/i386/tcg/translate.c  |   23 +-
  target/i386/tcg/decode-new.c.inc |  142 +
  target/i386/tcg/emit.c.inc   | 1014 +-
  4 files changed, 1169 insertions(+), 11 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 15/25] target/i386: move 60-BF opcodes to new decoder

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Compared to the old decoder, the main differences in translation
are for the little-used ARPL instruction.  IMUL is adjusted a bit
to share more code to produce flags, but is otherwise very similar.

Signed-off-by: Paolo Bonzini 


Reviewed-by: Richard Henderson 


+static void gen_POPA(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+   gen_popa(s);
+}

...

+static void gen_PUSHA(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
+{
+   gen_pusha(s);
+}


3-space indent?


r~



Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ

2024-05-06 Thread Richard Henderson

On 5/6/24 09:31, Paolo Bonzini wrote:

The comment deals with the former, the removal with the latter.

The idea of the comment is that after SCAS/CMPS you have CC_OP_SUB*,
so in principle you may expect that you need to set CC_OP_DYNAMIC
explicitly at the end of a REPZ/REPNZ, which is where the CX != 0 and
CX == 0 paths join. But this is not necessary, because there is
nothing after that instruction - the translation block ends.

So I guess the comment could instead be placed at the end of the function?

 /*
  * Only one iteration is done at a time, so the translation
  * block has ended unconditionally at this point and there
  * is no control flow junction - no need to set CC_OP_DYNAMIC.
  */

What do you think?


Just before gen_jmp_rel_csize?  Yes, that seems good.


r~



Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ

2024-05-06 Thread Paolo Bonzini
On Mon, May 6, 2024 at 6:08 PM Richard Henderson
 wrote:
> > -gen_update_cc_op(s);
> >   l2 = gen_jz_ecx_string(s);
> > +/*
> > + * Only one iteration is done at a time, so there is
> > + * no control flow junction here and cc_op is never dynamic.
> > + */
> >   fn(s, ot);
> >   gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
> > -gen_update_cc_op(s);
> >   gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2);
> >   if (s->repz_opt) {
> >   gen_op_jz_ecx(s, l2);
>
> Ok, but only because gen_jcc1 does the gen_update_cc_op.  The comment is 
> neither correct
> nor necessary.

Yeah, it's true that gen_jcc1 does the update. On the other hand,
there are two different kinds of cc_op updates:

1) at branches, if you know that only one of the sides might write
cc_op - so you ensure it's up-to-date before the branch - and set
CC_OP_DYNAMIC at the junction. Same if you have movcond instead of a
branch.

2) at end of translation block, to spill the value lazily (because in
the middle of the TB we are able to restore it from insn data). With
these patches there is never a need to do this, because gen_jmp_rel()
and gen_jmp_rel_csize() take care of it.

The comment deals with the former, the removal with the latter.

The idea of the comment is that after SCAS/CMPS you have CC_OP_SUB*,
so in principle you may expect that you need to set CC_OP_DYNAMIC
explicitly at the end of a REPZ/REPNZ, which is where the CX != 0 and
CX == 0 paths join. But this is not necessary, because there is
nothing after that instruction - the translation block ends.

So I guess the comment could instead be placed at the end of the function?

/*
 * Only one iteration is done at a time, so the translation
 * block has ended unconditionally at this point and there
 * is no control flow junction - no need to set CC_OP_DYNAMIC.
 */

What do you think?

Paolo




Re: [PATCH RFC 00/26] Multifd 🔀 device state transfer support with VFIO consumer

2024-05-06 Thread Maciej S. Szmigiero

On 29.04.2024 17:09, Peter Xu wrote:

On Fri, Apr 26, 2024 at 07:34:09PM +0200, Maciej S. Szmigiero wrote:

On 24.04.2024 00:35, Peter Xu wrote:

On Wed, Apr 24, 2024 at 12:25:08AM +0200, Maciej S. Szmigiero wrote:

On 24.04.2024 00:20, Peter Xu wrote:

On Tue, Apr 23, 2024 at 06:15:35PM +0200, Maciej S. Szmigiero wrote:

On 19.04.2024 17:31, Peter Xu wrote:

On Fri, Apr 19, 2024 at 11:07:21AM +0100, Daniel P. Berrangé wrote:

On Thu, Apr 18, 2024 at 04:02:49PM -0400, Peter Xu wrote:

On Thu, Apr 18, 2024 at 08:14:15PM +0200, Maciej S. Szmigiero wrote:

I think one of the reasons for these results is that mixed (RAM + device
state) multifd channels participate in the RAM sync process
(MULTIFD_FLAG_SYNC) whereas device state dedicated channels don't.


Firstly, I'm wondering whether we can have better names for these new
hooks.  Currently (only comment on the async* stuff):

  - complete_precopy_async
  - complete_precopy
  - complete_precopy_async_wait

But perhaps better:

  - complete_precopy_begin
  - complete_precopy
  - complete_precopy_end

?

As I don't see why the device must do something with async in such hook.
To me it's more like you're splitting one process into multiple, then
begin/end sounds more generic.

Then, if with that in mind, IIUC we can already split ram_save_complete()
into >1 phases too. For example, I would be curious whether the performance
will go back to normal if we offloading multifd_send_sync_main() into the
complete_precopy_end(), because we really only need one shot of that, and I
am quite surprised it already greatly affects VFIO dumping its own things.

I would even ask one step further as what Dan was asking: have you thought
about dumping VFIO states via multifd even during iterations?  Would that
help even more than this series (which IIUC only helps during the blackout
phase)?


To dump during RAM iteration, the VFIO device will need to have
dirty tracking and iterate on its state, because the guest CPUs
will still be running potentially changing VFIO state. That seems
impractical in the general case.


We already do such interations in vfio_save_iterate()?

My understanding is the recent VFIO work is based on the fact that the VFIO
device can track device state changes more or less (besides being able to
save/load full states).  E.g. I still remember in our QE tests some old
devices report much more dirty pages than expected during the iterations
when we were looking into such issue that a huge amount of dirty pages
reported.  But newer models seem to have fixed that and report much less.

That issue was about GPU not NICs, though, and IIUC a major portion of such
tracking used to be for GPU vRAMs.  So maybe I was mixing up these, and
maybe they work differently.


The device which this series was developed against (Mellanox ConnectX-7)
is already transferring its live state before the VM gets stopped (via
save_live_iterate SaveVMHandler).

It's just that in addition to the live state it has more than 400 MiB
of state that cannot be transferred while the VM is still running.
And that fact hurts a lot with respect to the migration downtime.

AFAIK it's a very similar story for (some) GPUs.


So during iteration phase VFIO cannot yet leverage the multifd channels
when with this series, am I right?


That's right.


Is it possible to extend that use case too?


I guess so, but since this phase (iteration while the VM is still
running) doesn't impact downtime it is much less critical.


But it affects the bandwidth, e.g. even with multifd enabled, the device
iteration data will still bottleneck at ~15Gbps on a common system setup
the best case, even if the hosts are 100Gbps direct connected.  Would that
be a concern in the future too, or it's known problem and it won't be fixed
anyway?


I think any improvements to the migration performance are good, even if
they don't impact downtime.

It's just that this patch set focuses on the downtime phase as the more
critical thing.

After this gets improved there's no reason why not to look at improving
performance of the VM live phase too if it brings sensible improvements.


I remember Avihai used to have plan to look into similar issues, I hope
this is exactly what he is looking for.  Otherwise changing migration
protocol from time to time is cumbersome; we always need to provide a flag
to make sure old systems migrates in the old ways, new systems run the new
ways, and for such a relatively major change I'd want to double check on
how far away we can support offload VFIO iterations data to multifd.


The device state transfer is indicated by a new flag in the multifd
header (MULTIFD_FLAG_DEVICE_STATE).

If we are to use multifd channels for VM live phase transfers these
could simply re-use the same flag type.


Right, and that's also my major purpose of such request to consider both
issues.

If supporting iterators can be easy on top of this, I am thinking whether
we should do this in one shot.  The

Re: [PATCH RFC 23/26] migration/multifd: Device state transfer support - send side

2024-05-06 Thread Maciej S. Szmigiero

On 29.04.2024 22:04, Peter Xu wrote:

On Tue, Apr 16, 2024 at 04:43:02PM +0200, Maciej S. Szmigiero wrote:

+bool multifd_queue_page(RAMBlock *block, ram_addr_t offset)
+{
+g_autoptr(GMutexLocker) locker = NULL;
+
+/*
+ * Device state submissions for shared channels can come
+ * from multiple threads and conflict with page submissions
+ * with respect to multifd_send_state access.
+ */
+if (!multifd_send_state->device_state_dedicated_channels) {
+locker = g_mutex_locker_new(&multifd_send_state->queue_job_mutex);


Haven't read the rest, but suggest to stick with QemuMutex for the whole
patchset, as that's what we use in the rest migration code, along with
QEMU_LOCK_GUARD().



Ack, from a quick scan of QEMU thread sync primitives it seems that
QemuMutex with QemuLockable and QemuCond should fulfill the
requirements to replace GMutex, GMutexLocker and GCond.

Thanks,
Maciej




Re: [PATCH v2 11/25] target/i386: reintroduce debugging mechanism

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c  | 27 +++
  target/i386/tcg/decode-new.c.inc |  3 +++
  2 files changed, 30 insertions(+)


Acked-by: Richard Henderson 

r~



Re: [PATCH v2 10/25] target/i386: cleanup *gen_eob*

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Create a new wrapper for syscall/sysret, and do not go through multiple
layers of wrappers.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 25 -
  1 file changed, 12 insertions(+), 13 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 09/25] target/i386: clarify the "reg" argument of functions returning CCPrepare

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 15 ---
  1 file changed, 8 insertions(+), 7 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 08/25] target/i386: do not use s->T0 and s->T1 as scratch registers for CCPrepare

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Instead of using s->T0 or s->T1, create a scratch register
when computing the C, NC, L or LE conditions.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 17 ++---
  1 file changed, 10 insertions(+), 7 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 07/25] target/i386: extend cc_* when using them to compute flags

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Instead of using s->tmp0 or s->tmp4 as the result, just extend the cc_*
registers in place.  It is harmless and, if multiple setcc instructions
are used, the optimizer will be able to remove the redundant ones.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 44 +++--
  1 file changed, 18 insertions(+), 26 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 06/25] target/i386: pull cc_op update to callers of gen_jmp_rel{,_csize}

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

gen_update_cc_op must be called before control flow splits.  Doing it
in gen_jmp_rel{,_csize} may hide bugs, instead assert that cc_op is
clean---even if that means a few more calls to gen_update_cc_op().

With this new invariant, setting cc_op to CC_OP_DYNAMIC is unnecessary
since the caller should have done it.

Signed-off-by: Paolo Bonzini
---
  target/i386/tcg/translate.c | 8 +---
  1 file changed, 5 insertions(+), 3 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH v2 05/25] target/i386: cleanup cc_op changes for REP/REPZ/REPNZ

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

gen_update_cc_op must be called before control flow splits.  Do it
where the jump on ECX!=0 is translated.

On the other hand, remove the call before gen_jcc1, which takes care of
it already, and explain why REPZ/REPNZ need not use CC_OP_DYNAMIC---the
translation block ends before any control-flow-dependent cc_op could
be observed.

Signed-off-by: Paolo Bonzini 
---
  target/i386/tcg/translate.c | 13 +
  1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 3f1d2858fc9..6b766f5dd3f 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -1242,11 +1242,15 @@ static inline void gen_jcc1(DisasContext *s, int b, 
TCGLabel *l1)
  }
  
  /* XXX: does not work with gdbstub "ice" single step - not a

-   serious problem */
+   serious problem.  The caller can jump to the returned label
+   to stop the REP but, if the flags have changed, it has to call
+   gen_update_cc_op before doing so.  */
  static TCGLabel *gen_jz_ecx_string(DisasContext *s)
  {
  TCGLabel *l1 = gen_new_label();
  TCGLabel *l2 = gen_new_label();
+
+gen_update_cc_op(s);
  gen_op_jnz_ecx(s, l1);
  gen_set_label(l2);
  gen_jmp_rel_csize(s, 0, 1);
@@ -1342,7 +1346,6 @@ static void gen_repz(DisasContext *s, MemOp ot,
   void (*fn)(DisasContext *s, MemOp ot))
  {
  TCGLabel *l2;
-gen_update_cc_op(s);
  l2 = gen_jz_ecx_string(s);
  fn(s, ot);
  gen_op_add_reg_im(s, s->aflag, R_ECX, -1);


Ok.



@@ -1364,11 +1367,13 @@ static void gen_repz2(DisasContext *s, MemOp ot, int nz,
void (*fn)(DisasContext *s, MemOp ot))
  {
  TCGLabel *l2;
-gen_update_cc_op(s);
  l2 = gen_jz_ecx_string(s);
+/*
+ * Only one iteration is done at a time, so there is
+ * no control flow junction here and cc_op is never dynamic.
+ */
  fn(s, ot);
  gen_op_add_reg_im(s, s->aflag, R_ECX, -1);
-gen_update_cc_op(s);
  gen_jcc1(s, (JCC_Z << 1) | (nz ^ 1), l2);
  if (s->repz_opt) {
  gen_op_jz_ecx(s, l2);


Ok, but only because gen_jcc1 does the gen_update_cc_op.  The comment is neither correct 
nor necessary.


The reason to write cc_op before branches instead of junctions is to avoid having *two* 
writes of cc_op on either side of the branch.



r~



Re: [PATCH 0/3] Make it possible to compile the x86 binaries without FDC

2024-05-06 Thread Paolo Bonzini
On Thu, Apr 25, 2024 at 8:43 PM Thomas Huth  wrote:
> OTOH, it seems
> to work fine, and the FDC is only disabled when it is not available
> in the binary, so I hope this patch is fine, too.

We do the same for parallel so i think it should be fine---definitely
for -nodefaults, and I'd say in general too.  The CMOS byte already
has a way to communicate no-floppy (0, see cmos_get_fd_drive_type).

Paolo




Re: [PATCH v2 04/25] target/i386: cc_op is not dynamic in gen_jcc1

2024-05-06 Thread Richard Henderson

On 5/6/24 01:09, Paolo Bonzini wrote:

Resetting cc_op to CC_OP_DYNAMIC should be done at control flow junctions,
which is not the case here.  This translation block is ending and the
only effect of calling set_cc_op() would be a discard of s->cc_srcT.
This discard is useless (it's a temporary, not a global) and in fact
prevents gen_prepare_cc from returning s->cc_srcT.

Signed-off-by: Paolo Bonzini 
---
  target/i386/tcg/translate.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)


Reviewed-by: Richard Henderson 


r~



  1   2   3   4   >