[PATCH 2/2] virtio-blk: add zoned storage emulation for zoned devices

2022-09-09 Thread Sam Li
This patch extends virtio-blk emulation to handle zoned device commands
by calling the new block layer APIs to perform zoned device I/O on
behalf of the guest. It supports Report Zone, the four zone operations
(open, close, finish, reset), and Zone Append.

The VIRTIO_BLK_F_ZONED feature bit is only set if the host device is a
zoned block device; it is not set for regular block devices
(conventional zones).

A guest OS with zoned device support can use blkzone(8) to exercise
these commands. Zone append writes can additionally be tested through
zonefs.
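
As a rough illustration (not part of the diff below, and hedged: it
assumes the BlockLimits fields introduced earlier in this series and the
usual virtio_blk_update_config() locals s, vdev and blkcfg), the zoned
characteristics in the device config could be populated from the host
limits along these lines:

    BlockDriverState *bs = blk_bs(s->blk);

    if (bs->bl.zoned != BLK_Z_NONE) {
        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors, bs->bl.zone_sectors);
        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones, bs->bl.max_open_zones);
        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones, bs->bl.max_active_zones);
        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors, bs->bl.max_append_sectors);
        blkcfg.zoned.model = VIRTIO_BLK_Z_HM;   /* host-managed only for now */
    } else {
        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
    }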

Signed-off-by: Sam Li 
---
 hw/block/virtio-blk.c | 326 ++
 1 file changed, 326 insertions(+)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index e9ba752f6b..3ef74c01db 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -46,6 +46,8 @@ static const VirtIOFeature feature_sizes[] = {
  .end = endof(struct virtio_blk_config, discard_sector_alignment)},
 {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
  .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+{.flags = 1ULL << VIRTIO_BLK_F_ZONED,
+ .end = endof(struct virtio_blk_config, zoned)},
 {}
 };
 
@@ -614,6 +616,273 @@ err:
 return err_status;
 }
 
+typedef struct ZoneCmdData {
+VirtIOBlockReq *req;
+union {
+struct {
+unsigned int nr_zones;
+BlockZoneDescriptor *zones;
+} ZoneReportData;
+struct {
+int64_t append_sector;
+} ZoneAppendData;
+};
+} ZoneCmdData;
+
+/*
+ * check_zone_model: validate a request against the device's zone state
+ * before issuing it. Returns true if all checks pass.
+ * append: true if the request being checked is a zone append.
+ */
+static bool check_zone_model(VirtIOBlock *s, int64_t sector, int64_t nr_sector,
+ bool append, uint8_t *status) {
+BlockDriverState *bs = blk_bs(s->blk);
+BlockZoneDescriptor *zone = &bs->bl.zones[sector / bs->bl.zone_sectors];
+int64_t max_append_sector = bs->bl.max_append_sectors;
+
+if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
+*status = VIRTIO_BLK_S_UNSUPP;
+return false;
+}
+
+if (zone->cond == BLK_ZS_OFFLINE) {
+*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+return false;
+}
+
+if (append) {
+if ((zone->type != BLK_ZT_SWR) || (zone->cond == BLK_ZS_RDONLY) ||
+(sector + nr_sector > (*(zone + 1)).start)) {
+/* the end sector of the request crosses into the next zone */
+*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+return false;
+}
+
+if (nr_sector > max_append_sector) {
+if (max_append_sector == 0) {
+*status = VIRTIO_BLK_S_UNSUPP;
+} else {
+*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+}
+return false;
+}
+}
+return true;
+}
+
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
+{
+ZoneCmdData *data = opaque;
+VirtIOBlockReq *req = data->req;
+VirtIOBlock *s = req->dev;
+VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+struct iovec *in_iov = req->elem.in_sg;
+unsigned in_num = req->elem.in_num;
+int64_t zrp_size, nz, n, j = 0;
+int8_t err_status = VIRTIO_BLK_S_OK;
+
+nz = data->ZoneReportData.nr_zones;
+struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
+.nr_zones = cpu_to_le64(nz),
+};
+
+zrp_size = sizeof(struct virtio_blk_zone_report)
+   + sizeof(struct virtio_blk_zone_descriptor) * nz;
+n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
+if (n != sizeof(zrp_hdr)) {
+virtio_error(vdev, "Driver provided intput buffer that is too small!");
+err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+goto out;
+}
+
+for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
+struct virtio_blk_zone_descriptor desc =
+(struct virtio_blk_zone_descriptor) {
+.z_start = cpu_to_le64(data->ZoneReportData.zones[j].start),
+.z_cap = cpu_to_le64(data->ZoneReportData.zones[j].cap),
+.z_wp = cpu_to_le64(data->ZoneReportData.zones[j].wp),
+.z_type = data->ZoneReportData.zones[j].type,
+.z_state = data->ZoneReportData.zones[j].cond,
+};
+n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
+if (n != sizeof(desc)) {
+virtio_error(vdev, "Driver provided input buffer "
+   "for descriptors that is too small!");
+err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+goto out;
+}
+}
+goto out;
+
+out:
+aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+virtio_blk_req_complete(req, err_status);
+

[PATCH 1/2] include: import virtio_blk headers from linux with zoned device support

2022-09-09 Thread Sam Li
Add the virtio_blk header from Dmitry's "virtio-blk: add support for
zoned block devices" Linux patch, imported with
scripts/update-linux-headers.sh. For more information see:
https://github.com/dmitry-fomichev/virtblk-zbd

Signed-off-by: Sam Li 
---
 include/standard-headers/linux/virtio_blk.h | 109 
 1 file changed, 109 insertions(+)

diff --git a/include/standard-headers/linux/virtio_blk.h b/include/standard-headers/linux/virtio_blk.h
index 2dcc90826a..490bd21c76 100644
--- a/include/standard-headers/linux/virtio_blk.h
+++ b/include/standard-headers/linux/virtio_blk.h
@@ -40,6 +40,7 @@
 #define VIRTIO_BLK_F_MQ 12  /* support more than one vq */
 #define VIRTIO_BLK_F_DISCARD   13  /* DISCARD is supported */
 #define VIRTIO_BLK_F_WRITE_ZEROES  14  /* WRITE ZEROES is supported */
+#define VIRTIO_BLK_F_ZONED 17  /* Zoned block device */
 
 /* Legacy feature bits */
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -119,6 +120,20 @@ struct virtio_blk_config {
uint8_t write_zeroes_may_unmap;
 
uint8_t unused1[3];
+
+   /* Secure erase fields that are defined in the virtio spec */
+   uint8_t sec_erase[12];
+
+   /* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
+   struct virtio_blk_zoned_characteristics {
+   __virtio32 zone_sectors;
+   __virtio32 max_open_zones;
+   __virtio32 max_active_zones;
+   __virtio32 max_append_sectors;
+   __virtio32 write_granularity;
+   uint8_t model;
+   uint8_t unused2[3];
+   } zoned;
 } QEMU_PACKED;
 
 /*
@@ -153,6 +168,27 @@ struct virtio_blk_config {
 /* Write zeroes command */
 #define VIRTIO_BLK_T_WRITE_ZEROES  13
 
+/* Zone append command */
+#define VIRTIO_BLK_T_ZONE_APPEND  15
+
+/* Report zones command */
+#define VIRTIO_BLK_T_ZONE_REPORT  16
+
+/* Open zone command */
+#define VIRTIO_BLK_T_ZONE_OPEN  18
+
+/* Close zone command */
+#define VIRTIO_BLK_T_ZONE_CLOSE 20
+
+/* Finish zone command */
+#define VIRTIO_BLK_T_ZONE_FINISH  22
+
+/* Reset zone command */
+#define VIRTIO_BLK_T_ZONE_RESET 24
+
+/* Reset All zones command */
+#define VIRTIO_BLK_T_ZONE_RESET_ALL 26
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER   0x8000
@@ -172,6 +208,72 @@ struct virtio_blk_outhdr {
__virtio64 sector;
 };
 
+/*
+ * Supported zoned device models.
+ */
+
+/* Regular block device */
+#define VIRTIO_BLK_Z_NONE  0
+/* Host-managed zoned device */
+#define VIRTIO_BLK_Z_HM    1
+/* Host-aware zoned device */
+#define VIRTIO_BLK_Z_HA    2
+
+/*
+ * Zone descriptor. A part of VIRTIO_BLK_T_ZONE_REPORT command reply.
+ */
+struct virtio_blk_zone_descriptor {
+   /* Zone capacity */
+   __virtio64 z_cap;
+   /* The starting sector of the zone */
+   __virtio64 z_start;
+   /* Zone write pointer position in sectors */
+   __virtio64 z_wp;
+   /* Zone type */
+   uint8_t z_type;
+   /* Zone state */
+   uint8_t z_state;
+   uint8_t reserved[38];
+};
+
+struct virtio_blk_zone_report {
+   __virtio64 nr_zones;
+   uint8_t reserved[56];
+   struct virtio_blk_zone_descriptor zones[];
+};
+
+/*
+ * Supported zone types.
+ */
+
+/* Conventional zone */
+#define VIRTIO_BLK_ZT_CONV 1
+/* Sequential Write Required zone */
+#define VIRTIO_BLK_ZT_SWR  2
+/* Sequential Write Preferred zone */
+#define VIRTIO_BLK_ZT_SWP  3
+
+/*
+ * Zone states that are available for zones of all types.
+ */
+
+/* Not a write pointer (conventional zones only) */
+#define VIRTIO_BLK_ZS_NOT_WP   0
+/* Empty */
+#define VIRTIO_BLK_ZS_EMPTY    1
+/* Implicitly Open */
+#define VIRTIO_BLK_ZS_IOPEN    2
+/* Explicitly Open */
+#define VIRTIO_BLK_ZS_EOPEN    3
+/* Closed */
+#define VIRTIO_BLK_ZS_CLOSED   4
+/* Read-Only */
+#define VIRTIO_BLK_ZS_RDONLY   13
+/* Full */
+#define VIRTIO_BLK_ZS_FULL 14
+/* Offline */
+#define VIRTIO_BLK_ZS_OFFLINE  15
+
 /* Unmap this range (only valid for write zeroes command) */
 #define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP 0x0001
 
@@ -198,4 +300,11 @@ struct virtio_scsi_inhdr {
 #define VIRTIO_BLK_S_OK     0
 #define VIRTIO_BLK_S_IOERR  1
 #define VIRTIO_BLK_S_UNSUPP 2
+
+/* Error codes that are specific to zoned block devices */
+#define VIRTIO_BLK_S_ZONE_INVALID_CMD 3
+#define VIRTIO_BLK_S_ZONE_UNALIGNED_WP4
+#define VIRTIO_BLK_S_ZONE_OPEN_RESOURCE   5
+#define VIRTIO_BLK_S_ZONE_ACTIVE_RESOURCE 6
+
 #endif /* _LINUX_VIRTIO_BLK_H */
-- 
2.37.3




[PATCH] block: introduce zone append write for zoned devices

2022-09-09 Thread Sam Li
A zone append command is a write operation that specifies the first
logical block of a zone as the write position. The device writes the
data at the zone's current write pointer and, upon completion, responds
with the position at which the data has actually been placed in the
zone.
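
A hedged usage sketch (hypothetical coroutine caller, not part of this
patch) of the in/out offset semantics of blk_co_zone_append():

    /* Must run in coroutine context. */
    static int coroutine_fn append_one_buffer(BlockBackend *blk,
                                              int64_t zone_start,
                                              void *buf, size_t len)
    {
        QEMUIOVector qiov;
        int64_t offset = zone_start;   /* byte offset of the zone's first block */
        int ret;

        qemu_iovec_init_buf(&qiov, buf, len);
        ret = blk_co_zone_append(blk, &offset, &qiov, 0);
        if (ret == 0) {
            /* 'offset' now holds where the device actually placed the data. */
        }
        return ret;
    }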

Signed-off-by: Sam Li 
---
 block/block-backend.c  |  65 +++
 block/file-posix.c | 169 -
 block/io.c |  21 
 block/raw-format.c |   7 ++
 include/block/block-common.h   |   2 +
 include/block/block-io.h   |   3 +
 include/block/block_int-common.h   |   9 ++
 include/block/raw-aio.h|   4 +-
 include/sysemu/block-backend-io.h  |   9 ++
 qemu-io-cmds.c |  62 +++
 tests/qemu-iotests/tests/zoned.out |   7 ++
 tests/qemu-iotests/tests/zoned.sh  |   9 ++
 12 files changed, 360 insertions(+), 7 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index ebe8d7bdf3..b77a1cb24b 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
 struct {
 BlockZoneOp op;
 } zone_mgmt;
+struct {
+int64_t *append_sector;
+} zone_append;
 };
 } BlkRwCo;
 
@@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
 return &acb->common;
 }
 
+static void blk_aio_zone_append_entry(void *opaque) {
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = &acb->rwco;
+
+rwco->ret = blk_co_zone_append(rwco->blk, rwco->zone_append.append_sector,
+   rwco->iobuf, rwco->flags);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags,
+BlockCompletionFunc *cb, void *opaque) {
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.ret= NOT_DONE,
+.flags  = flags,
+.iobuf  = qiov,
+.zone_append = {
+.append_sector = offset,
+},
+};
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
+bdrv_coroutine_enter(blk_bs(blk), co);
+
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return &acb->common;
+}
+
 /*
  * Send a zone_report command.
  * offset is a byte offset from the start of the device. No alignment
@@ -1920,6 +1964,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
 return ret;
 }
 
+/*
+ * Send a zone_append command.
+ */
+int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
+QEMUIOVector *qiov, BdrvRequestFlags flags)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+blk_wait_while_drained(blk);
+if (!blk_is_available(blk)) {
+blk_dec_in_flight(blk);
+return -ENOMEDIUM;
+}
+
+ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
+blk_dec_in_flight(blk);
+return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 354de22860..65500e43f4 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -173,6 +173,7 @@ typedef struct BDRVRawState {
 } stats;
 
 PRManager *pr_mgr;
+CoRwlock zones_lock;
 } BDRVRawState;
 
 typedef struct BDRVRawReopenState {
@@ -206,6 +207,8 @@ typedef struct RawPosixAIOData {
 struct {
 struct iovec *iov;
 int niov;
+int64_t *append_sector;
+BlockZoneDescriptor *zone;
 } io;
 struct {
 uint64_t cmd;
@@ -1333,6 +1336,9 @@ static int hdev_get_max_segments(int fd, struct stat *st) {
 #endif
 }
 
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+  const struct blk_zone *blkz);
+static int do_zone_report(int64_t offset, int fd, struct BlockZoneDescriptor *zones, unsigned int nrz);
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BDRVRawState *s = bs->opaque;
@@ -1402,6 +1408,19 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 if (ret >= 0) {
 bs->bl.max_active_zones = ret;
 }
+
+ret = get_sysfs_long_val(&st, "logical_block_size");
+if (ret >= 0) {
+bs->bl.logical_block_size = ret;
+}
+
+ret = get_sysfs_long_val(&st, "nr_zones");
+if (ret 

[PATCH] bugfix:migrate with block-dirty-bitmap (disk size is big enough) can't be finished

2022-09-09 Thread liuhaiwei
From: liuhaiwei 

Bug description: https://gitlab.com/qemu-project/qemu/-/issues/1203
Usually we use the precopy or postcopy mode to migrate the block dirty
bitmap, but if the block-dirty-bitmap size exceeds the threshold size we
can never enter migration_completion() in migration_iteration_run().
To solve this problem, we can report a fake pending size (threshold - 1,
or 0) whenever the real pending size exceeds the threshold, so that
migration_iteration_run() proceeds to migration_completion().

Signed-off-by: liuhaiwei 
Signed-off-by: liuhaiwei 
---
 migration/block-dirty-bitmap.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 9aba7d9c22..5cbf365f46 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -782,6 +782,10 @@ static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
 }
 
 qemu_mutex_unlock_iothread();
+/* Report a fake pending size when the dirty bitmap size exceeds max_size */
+if (pending >= max_size && max_size != 0) {
+pending = max_size - 1;
+}
 
 trace_dirty_bitmap_save_pending(pending, max_size);
 
-- 
2.27.0




[PATCH v9 5/7] config: add check to block layer

2022-09-09 Thread Sam Li
Putting zoned/non-zoned BlockDrivers on top of each other is not
allowed.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 block.c  | 14 ++
 block/file-posix.c   | 14 ++
 block/raw-format.c   |  1 +
 include/block/block_int-common.h |  5 +
 4 files changed, 34 insertions(+)

diff --git a/block.c b/block.c
index bc85f46eed..dad2ed3959 100644
--- a/block.c
+++ b/block.c
@@ -7947,6 +7947,20 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
 return;
 }
 
+/*
+ * Non-zoned block drivers do not follow zoned storage constraints
+ * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
+ * drivers in a graph.
+ */
+if (!parent_bs->drv->supports_zoned_children &&
+child_bs->bl.zoned == BLK_Z_HM) {
+error_setg(errp, "Cannot add a %s child to a %s parent",
+   child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
+   parent_bs->drv->supports_zoned_children ?
+   "support zoned children" : "not support zoned children");
+return;
+}
+
 if (!QLIST_EMPTY(&child_bs->parents)) {
 error_setg(errp, "The node %s already has a parent",
child_bs->node_name);
diff --git a/block/file-posix.c b/block/file-posix.c
index 4edfa25d04..354de22860 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -779,6 +779,20 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
 goto fail;
 }
 }
+#ifdef CONFIG_BLKZONED
+/*
+ * The kernel page cache does not reliably work for writes to SWR zones
+ * of a zoned block device because it cannot guarantee the order of writes.
+ */
+if (strcmp(bs->drv->format_name, "zoned_host_device") == 0) {
+if (!(s->open_flags & O_DIRECT)) {
+error_setg(errp, "driver=zoned_host_device was specified, but it "
+ "requires cache.direct=on, which was not 
specified.");
+ret = -EINVAL;
+return ret; /* No host kernel page cache */
+}
+}
+#endif
 
 if (S_ISBLK(st.st_mode)) {
 #ifdef BLKDISCARDZEROES
diff --git a/block/raw-format.c b/block/raw-format.c
index 6b20bd22ef..9441536819 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -614,6 +614,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
 BlockDriver bdrv_raw = {
 .format_name  = "raw",
 .instance_size= sizeof(BDRVRawState),
+.supports_zoned_children = true,
 .bdrv_probe   = &raw_probe,
 .bdrv_reopen_prepare  = &raw_reopen_prepare,
 .bdrv_reopen_commit   = &raw_reopen_commit,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 078ddd7e67..043aa161a0 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -127,6 +127,11 @@ struct BlockDriver {
  */
 bool is_format;
 
+/*
+ * Set to true if the BlockDriver supports zoned children.
+ */
+bool supports_zoned_children;
+
 /*
  * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
  * this field set to true, except ones that are defined only by their
-- 
2.37.3




[PATCH v9 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-09-09 Thread Sam Li
Add a new zoned_host_device BlockDriver. The zoned_host_device option
accepts only zoned host block devices. By adding zone management
operations in this new BlockDriver, users can use the new block
layer APIs including Report Zone and four zone management operations
(open, close, finish, reset).

qemu-io uses the new APIs to issue zoned storage commands to the device:
zone_report (zrp), zone_open (zo), zone_close (zc), zone_reset (zrs),
zone_finish (zf).

For example, to test zone_report, use the following command:
$ ./build/qemu-io --image-opts -n driver=zoned_host_device,filename=/dev/nullb0
-c "zrp offset nr_zones"

Signed-off-by: Sam Li 
Reviewed-by: Hannes Reinecke 
---
 block/block-backend.c | 145 ++
 block/file-posix.c| 323 +-
 block/io.c|  41 
 include/block/block-io.h  |   7 +
 include/block/block_int-common.h  |  21 ++
 include/block/raw-aio.h   |   6 +-
 include/sysemu/block-backend-io.h |  17 ++
 meson.build   |   1 +
 qapi/block-core.json  |   8 +-
 qemu-io-cmds.c| 143 +
 10 files changed, 708 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index d4a5df2ac2..ebe8d7bdf3 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
 void *iobuf;
 int ret;
 BdrvRequestFlags flags;
+union {
+struct {
+unsigned int *nr_zones;
+BlockZoneDescriptor *zones;
+} zone_report;
+struct {
+BlockZoneOp op;
+} zone_mgmt;
+};
 } BlkRwCo;
 
 int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
@@ -1775,6 +1784,142 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
 return ret;
 }
 
+static void blk_aio_zone_report_entry(void *opaque) {
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = &acb->rwco;
+
+rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
+   rwco->zone_report.nr_zones,
+   rwco->zone_report.zones);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
+unsigned int *nr_zones,
+BlockZoneDescriptor  *zones,
+BlockCompletionFunc *cb, void *opaque)
+{
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.offset = offset,
+.ret= NOT_DONE,
+.zone_report = {
+.zones = zones,
+.nr_zones = nr_zones,
+},
+};
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
+bdrv_coroutine_enter(blk_bs(blk), co);
+
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return &acb->common;
+}
+
+static void blk_aio_zone_mgmt_entry(void *opaque) {
+BlkAioEmAIOCB *acb = opaque;
+BlkRwCo *rwco = &acb->rwco;
+
+rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
+ rwco->offset, acb->bytes);
+blk_aio_complete(acb);
+}
+
+BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+  int64_t offset, int64_t len,
+  BlockCompletionFunc *cb, void *opaque) {
+BlkAioEmAIOCB *acb;
+Coroutine *co;
+IO_CODE();
+
+blk_inc_in_flight(blk);
+acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
+acb->rwco = (BlkRwCo) {
+.blk= blk,
+.offset = offset,
+.ret= NOT_DONE,
+.zone_mgmt = {
+.op = op,
+},
+};
+acb->bytes = len;
+acb->has_returned = false;
+
+co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
+bdrv_coroutine_enter(blk_bs(blk), co);
+
+acb->has_returned = true;
+if (acb->rwco.ret != NOT_DONE) {
+replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
+ blk_aio_complete_bh, acb);
+}
+
+return &acb->common;
+}
+
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+unsigned int *nr_zones,
+BlockZoneDescriptor *zones)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk); /* increase before wait

[PATCH v9 2/7] file-posix: introduce helper functions for sysfs attributes

2022-09-09 Thread Sam Li
Use get_sysfs_str_val() to get the string value of a device's zoned
model, and get_sysfs_zoned_model() to convert it to QEMU's
BlockZoneModel type.

Use get_sysfs_long_val() to read long integer values of zoned device
attributes.
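
A hedged usage sketch (assuming the helper signatures introduced below;
'st' is a struct stat already filled in by the caller):

    BlockZoneModel model;
    long max_active;

    if (get_sysfs_zoned_model(&st, &model) < 0) {
        model = BLK_Z_NONE;   /* treat unknown models as regular devices */
    }
    max_active = get_sysfs_long_val(&st, "max_active_zones");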

Signed-off-by: Sam Li 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Stefan Hajnoczi 
---
 block/file-posix.c   | 121 ++-
 include/block/block_int-common.h |   3 +
 2 files changed, 88 insertions(+), 36 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 48cd096624..0a8b4b426e 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1210,66 +1210,109 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
 #endif
 }
 
-static int hdev_get_max_segments(int fd, struct stat *st)
-{
+/*
+ * Get a sysfs attribute value as character string.
+ */
+static int get_sysfs_str_val(struct stat *st, const char *attribute,
+ char **val) {
 #ifdef CONFIG_LINUX
-char buf[32];
-const char *end;
-char *sysfspath = NULL;
+g_autofree char *sysfspath = NULL;
 int ret;
-int sysfd = -1;
-long max_segments;
+size_t len;
 
-if (S_ISCHR(st->st_mode)) {
-if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
-return ret;
-}
+if (!S_ISBLK(st->st_mode)) {
 return -ENOTSUP;
 }
 
-if (!S_ISBLK(st->st_mode)) {
-return -ENOTSUP;
+sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
+major(st->st_rdev), minor(st->st_rdev),
+attribute);
+ret = g_file_get_contents(sysfspath, val, &len, NULL);
+if (ret == -1) {
+return -ENOENT;
 }
 
-sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-major(st->st_rdev), minor(st->st_rdev));
-sysfd = open(sysfspath, O_RDONLY);
-if (sysfd == -1) {
-ret = -errno;
-goto out;
+/* The file is ended with '\n' */
+char *p;
+p = *val;
+if (*(p + len - 1) == '\n') {
+*(p + len - 1) = '\0';
 }
-do {
-ret = read(sysfd, buf, sizeof(buf) - 1);
-} while (ret == -1 && errno == EINTR);
+return ret;
+#else
+return -ENOTSUP;
+#endif
+}
+
+static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned) {
+g_autofree char *val = NULL;
+int ret;
+
+ret = get_sysfs_str_val(st, "zoned", &val);
 if (ret < 0) {
-ret = -errno;
-goto out;
-} else if (ret == 0) {
-ret = -EIO;
-goto out;
+return ret;
 }
-buf[ret] = 0;
-/* The file is ended with '\n', pass 'end' to accept that. */
-ret = qemu_strtol(buf, &end, 10, &max_segments);
-if (ret == 0 && end && *end == '\n') {
-ret = max_segments;
+
+if (strcmp(val, "host-managed") == 0) {
+*zoned = BLK_Z_HM;
+} else if (strcmp(val, "host-aware") == 0) {
+*zoned = BLK_Z_HA;
+} else if (strcmp(val, "none") == 0) {
+*zoned = BLK_Z_NONE;
+} else {
+return -ENOTSUP;
 }
+return 0;
+}
 
-out:
-if (sysfd != -1) {
-close(sysfd);
+/*
+ * Get a sysfs attribute value as a long integer.
+ */
+static long get_sysfs_long_val(struct stat *st, const char *attribute) {
+#ifdef CONFIG_LINUX
+g_autofree char *str = NULL;
+const char *end;
+long val;
+int ret;
+
+ret = get_sysfs_str_val(st, attribute, &str);
+if (ret < 0) {
+return ret;
+}
+
+/* The trailing '\n' was already stripped by get_sysfs_str_val(), so expect '\0'. */
+ret = qemu_strtol(str, &end, 10, &val);
+if (ret == 0 && end && *end == '\0') {
+ret = val;
 }
-g_free(sysfspath);
 return ret;
 #else
 return -ENOTSUP;
 #endif
 }
 
+static int hdev_get_max_segments(int fd, struct stat *st) {
+#ifdef CONFIG_LINUX
+int ret;
+
+if (S_ISCHR(st->st_mode)) {
+if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
+return ret;
+}
+return -ENOTSUP;
+}
+return get_sysfs_long_val(st, "max_segments");
+#else
+return -ENOTSUP;
+#endif
+}
+
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BDRVRawState *s = bs->opaque;
 struct stat st;
+int ret;
+BlockZoneModel zoned;
 
 s->needs_alignment = raw_needs_alignment(bs);
 raw_probe_alignment(bs, s->fd, errp);
@@ -1307,6 +1350,12 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 bs->bl.max_hw_iov = ret;
 }
 }
+
+ret = get_sysfs_zoned_model(&st, &zoned);
+if (ret < 0) {
+zoned = BLK_Z_NONE;
+}
+bs->bl.zoned = zoned;
 }
 
 static int check_for_dasd(int fd)
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 8947abab76..7f7863cc9e 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -825,6 +825,9 @@ typedef struct BlockLimits {
 
 /* maximum number of iovec el

[PATCH v9 6/7] qemu-iotests: test new zone operations

2022-09-09 Thread Sam Li
We have added new block layer APIs for zoned block devices. Test them by
creating a null_blk device, running each zone operation on it, and
checking whether the right zone information is reported.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 tests/qemu-iotests/tests/zoned.out | 53 +++
 tests/qemu-iotests/tests/zoned.sh  | 85 ++
 2 files changed, 138 insertions(+)
 create mode 100644 tests/qemu-iotests/tests/zoned.out
 create mode 100755 tests/qemu-iotests/tests/zoned.sh

diff --git a/tests/qemu-iotests/tests/zoned.out b/tests/qemu-iotests/tests/zoned.out
new file mode 100644
index 00..0c8f96deb9
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned.out
@@ -0,0 +1,53 @@
+QA output created by zoned.sh
+Testing a null_blk device:
+Simple cases: if the operations work
+(1) report the first zone:
+start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
+
+report the first 10 zones
+start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
+start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
+start: 0x10, len 0x8, cap 0x8, wptr 0x10, zcond:1, [type: 2]
+start: 0x18, len 0x8, cap 0x8, wptr 0x18, zcond:1, [type: 2]
+start: 0x20, len 0x8, cap 0x8, wptr 0x20, zcond:1, [type: 2]
+start: 0x28, len 0x8, cap 0x8, wptr 0x28, zcond:1, [type: 2]
+start: 0x30, len 0x8, cap 0x8, wptr 0x30, zcond:1, [type: 2]
+start: 0x38, len 0x8, cap 0x8, wptr 0x38, zcond:1, [type: 2]
+start: 0x40, len 0x8, cap 0x8, wptr 0x40, zcond:1, [type: 2]
+start: 0x48, len 0x8, cap 0x8, wptr 0x48, zcond:1, [type: 2]
+
+report the last zone:
+start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1, [type: 2]
+
+
+(2) opening the first zone
+report after:
+start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:3, [type: 2]
+
+opening the second zone
+report after:
+start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:3, [type: 2]
+
+opening the last zone
+report after:
+start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:3, [type: 2]
+
+
+(3) closing the first zone
+report after:
+start: 0x0, len 0x8, cap 0x8, wptr 0x0, zcond:1, [type: 2]
+
+closing the last zone
+report after:
+start: 0x1f38, len 0x8, cap 0x8, wptr 0x1f38, zcond:1, [type: 2]
+
+
+(4) finishing the second zone
+After finishing a zone:
+start: 0x8, len 0x8, cap 0x8, wptr 0x10, zcond:14, [type: 2]
+
+
+(5) resetting the second zone
+After resetting a zone:
+start: 0x8, len 0x8, cap 0x8, wptr 0x8, zcond:1, [type: 2]
+*** done
diff --git a/tests/qemu-iotests/tests/zoned.sh b/tests/qemu-iotests/tests/zoned.sh
new file mode 100755
index 00..871f47efed
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+#
+# Test zone management operations.
+#
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+status=1 # failure is the default!
+
+_cleanup()
+{
+  _cleanup_test_img
+  sudo rmmod null_blk
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.qemu
+
+# This test only runs on Linux hosts with raw image files.
+_supported_fmt raw
+_supported_proto file
+_supported_os Linux
+
+QEMU_IO="build/qemu-io"
+IMG="--image-opts -n driver=zoned_host_device,filename=/dev/nullb0"
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+
+echo "Testing a null_blk device:"
+echo "Simple cases: if the operations work"
+sudo modprobe null_blk nr_devices=1 zoned=1
+
+echo "(1) report the first zone:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "report the first 10 zones"
+sudo $QEMU_IO $IMG -c "zrp 0 10"
+echo
+echo "report the last zone:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2" # 0x3e7000 / 512 = 0x1f38
+echo
+echo
+echo "(2) opening the first zone"
+sudo $QEMU_IO $IMG -c "zo 0 268435456"  # 268435456 / 512 = 524288
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "opening the second zone"
+sudo $QEMU_IO $IMG -c "zo 268435456 268435456" #
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo "opening the last zone"
+sudo $QEMU_IO $IMG -c "zo 0x3e7000 268435456"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
+echo
+echo
+echo "(3) closing the first zone"
+sudo $QEMU_IO $IMG -c "zc 0 268435456"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "closing the last zone"
+sudo $QEMU_IO $IMG -c "zc 0x3e7000 268435456"
+echo "report after:"
+sudo $QEMU_IO $IMG -c "zrp 0x3e7000 2"
+echo
+echo
+echo "(4) finishing the second zone"
+sudo $QEMU_IO $IMG -c "zf 268435456 268435456"
+echo "After finishing a zone:"
+sudo $QEMU_IO $IMG -c "zrp 268435456 1"
+echo
+echo
+echo "(5) resetting the second zone"
+sudo $QEMU_IO $IMG -c "zrs 268435456 268435456"
+echo "After resetting a zone:"
+sudo $QEM

[PATCH v9 7/7] docs/zoned-storage: add zoned device documentation

2022-09-09 Thread Sam Li
Add the documentation about the zoned device support to virtio-blk
emulation.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 docs/devel/zoned-storage.rst   | 41 ++
 docs/system/qemu-block-drivers.rst.inc |  6 
 2 files changed, 47 insertions(+)
 create mode 100644 docs/devel/zoned-storage.rst

diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
new file mode 100644
index 00..ead2d149cc
--- /dev/null
+++ b/docs/devel/zoned-storage.rst
@@ -0,0 +1,41 @@
+=============
+zoned-storage
+=============
+
+Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
+that are larger than the LBA size. Zones can only be written sequentially, which
+reduces write amplification in SSDs, leading to higher throughput and increased
+capacity. More details about ZBDs can be found at:
+
+https://zonedstorage.io/docs/introduction/zoned-storage
+
+1. Block layer APIs for zoned storage
+-
+The QEMU block layer has three zoned storage models:
+- BLK_Z_HM: This model only allows sequential write access. It supports a set
+of ZBD-specific I/O requests used by the host to manage device zones.
+- BLK_Z_HA: This model deals with both sequential and random write access.
+- BLK_Z_NONE: Regular block devices and drive-managed ZBDs are treated as
+non-zoned devices.
+
+The block device information resides inside BlockDriverState. QEMU uses the
+BlockLimits struct (BlockDriverState::bl), which is continuously accessed by
+the block layer while processing I/O requests. A BlockBackend has a root
+pointer to a BlockDriverState graph (for example, raw format on top of
+file-posix). The zoned storage information can be propagated from the leaf
+BlockDriverState all the way up to the BlockBackend. If the zoned storage model
+in file-posix is set to BLK_Z_HM, block drivers declare support for zoned host devices.
+
+The block layer APIs support commands needed for zoned storage devices,
+including report zones, four zone operations, and zone append.
+
+2. Emulating zoned storage controllers
+--
+When the BlockBackend's BlockLimits model reports a zoned storage device, users
+like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer
+APIs for zoned storage emulation or testing.
+
+For example, the qemu-io command line for testing zone report on a null_blk
+device is:
+$ path/to/qemu-io --image-opts driver=zoned_host_device,filename=/dev/nullb0 -c
+"zrp offset nr_zones"
diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-drivers.rst.inc
index dfe5d2293d..0b97227fd9 100644
--- a/docs/system/qemu-block-drivers.rst.inc
+++ b/docs/system/qemu-block-drivers.rst.inc
@@ -430,6 +430,12 @@ Hard disks
   you may corrupt your host data (use the ``-snapshot`` command
   line option or modify the device permissions accordingly).
 
+Zoned block devices
+  Zoned block devices can be passed through to the guest if the emulated storage
+  controller supports zoned storage. Use ``--blockdev zoned_host_device,
+  node-name=drive0,filename=/dev/nullb0`` to pass through ``/dev/nullb0``
+  as ``drive0``.
+
 Windows
 ^^^
 
-- 
2.37.3




[PATCH v9 4/7] raw-format: add zone operations to pass through requests

2022-09-09 Thread Sam Li
The raw-format driver usually sits on top of the file-posix driver. It needs
to pass zone command requests through to the underlying driver.

Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
---
 block/raw-format.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/block/raw-format.c b/block/raw-format.c
index 69fd650eaf..6b20bd22ef 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -314,6 +314,17 @@ static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
 return bdrv_co_pdiscard(bs->file, offset, bytes);
 }
 
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+   unsigned int *nr_zones,
+   BlockZoneDescriptor *zones) {
+return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
+}
+
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
+ int64_t offset, int64_t len) {
+return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
+}
+
 static int64_t raw_getlength(BlockDriverState *bs)
 {
 int64_t len;
@@ -614,6 +625,8 @@ BlockDriver bdrv_raw = {
 .bdrv_co_pwritev  = &raw_co_pwritev,
 .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
 .bdrv_co_pdiscard = &raw_co_pdiscard,
+.bdrv_co_zone_report  = &raw_co_zone_report,
+.bdrv_co_zone_mgmt  = &raw_co_zone_mgmt,
 .bdrv_co_block_status = &raw_co_block_status,
 .bdrv_co_copy_range_from = &raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = &raw_co_copy_range_to,
-- 
2.37.3




[PATCH v9 1/7] include: add zoned device structs

2022-09-09 Thread Sam Li
Signed-off-by: Sam Li 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Damien Le Moal 
---
 include/block/block-common.h | 43 
 1 file changed, 43 insertions(+)

diff --git a/include/block/block-common.h b/include/block/block-common.h
index fdb7306e78..36bd0e480e 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -49,6 +49,49 @@ typedef struct BlockDriver BlockDriver;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildClass BdrvChildClass;
 
+typedef enum BlockZoneOp {
+BLK_ZO_OPEN,
+BLK_ZO_CLOSE,
+BLK_ZO_FINISH,
+BLK_ZO_RESET,
+} BlockZoneOp;
+
+typedef enum BlockZoneModel {
+BLK_Z_NONE = 0x0, /* Regular block device */
+BLK_Z_HM = 0x1, /* Host-managed zoned block device */
+BLK_Z_HA = 0x2, /* Host-aware zoned block device */
+} BlockZoneModel;
+
+typedef enum BlockZoneCondition {
+BLK_ZS_NOT_WP = 0x0,
+BLK_ZS_EMPTY = 0x1,
+BLK_ZS_IOPEN = 0x2,
+BLK_ZS_EOPEN = 0x3,
+BLK_ZS_CLOSED = 0x4,
+BLK_ZS_RDONLY = 0xD,
+BLK_ZS_FULL = 0xE,
+BLK_ZS_OFFLINE = 0xF,
+} BlockZoneCondition;
+
+typedef enum BlockZoneType {
+BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
+BLK_ZT_SWR = 0x2, /* Sequential writes required */
+BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
+} BlockZoneType;
+
+/*
+ * Zone descriptor data structure.
+ * Provides information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+uint64_t start;
+uint64_t length;
+uint64_t cap;
+uint64_t wp;
+BlockZoneType type;
+BlockZoneCondition cond;
+} BlockZoneDescriptor;
+
 typedef struct BlockDriverInfo {
 /* in bytes, 0 if irrelevant */
 int cluster_size;
-- 
2.37.3




[PATCH v9 0/7] Add support for zoned device

2022-09-09 Thread Sam Li
Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
that are larger than the LBA size. Zones can only be written sequentially, which
reduces write amplification in SSDs, leading to higher throughput and increased
capacity. More details about ZBDs can be found at:

https://zonedstorage.io/docs/introduction/zoned-storage

The zoned device support aims to let guests (virtual machines) access zoned
storage devices on the host (hypervisor) through a virtio-blk device. This
involves extending QEMU's block layer and virtio-blk emulation code. In its
current state, the virtio-blk device is not aware of ZBDs, but the guest sees
host-managed drives as regular drives that run correctly under the most
common write workloads.

This patch series extends the block layer APIs with the minimum set of zoned
commands that are necessary to support zoned devices. The commands are Report
Zones, the four zone operations, and Zone Append (in development).

It can be tested on a null_blk device using qemu-io or qemu-iotests. For
example, to test zone report using qemu-io:
$ path/to/qemu-io --image-opts -n driver=zoned_host_device,filename=/dev/nullb0
-c "zrp offset nr_zones"

v9:
- address review comments
  * specify units of zone command requests [Stefan]
  * fix some error handling in file-posix [Stefan]
  * introduce zoned_host_device in the commit message [Markus]

v8:
- address review comments
  * solve patch conflicts and merge sysfs helper functions into one patch
  * add cache.direct=on check in config

v7:
- address review comments
  * modify sysfs attribute helper functions
  * move the input validation and error checking into raw_co_zone_* function
  * fix checks in config

v6:
- drop virtio-blk emulation changes
- address Stefan's review comments
  * fix CONFIG_BLKZONED configs in related functions
  * replace reading fd by g_file_get_contents() in get_sysfs_str_val()
  * rewrite documentation for zoned storage

v5:
- add zoned storage emulation to virtio-blk device
- add documentation for zoned storage
- address review comments
  * fix qemu-iotests
  * fix check to block layer
  * modify interfaces of sysfs helper functions
  * rename zoned device structs according to QEMU styles
  * reorder patches

v4:
- add virtio-blk headers for zoned device
- add configurations for zoned host device
- add zone operations for raw-format
- address review comments
  * fix memory leak bug in zone_report
  * add checks to block layers
  * fix qemu-iotests format
  * fix sysfs helper functions

v3:
- add helper functions to get sysfs attributes
- address review comments
  * fix zone report bugs
  * fix the qemu-io code path
  * use thread pool to avoid blocking ioctl() calls

v2:
- add qemu-io sub-commands
- address review comments
  * modify interfaces of APIs

v1:
- add block layer APIs resembling Linux ZoneBlockDevice ioctls

Sam Li (7):
  include: add zoned device structs
  file-posix: introduce helper functions for sysfs attributes
  block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  raw-format: add zone operations to pass through requests
  config: add check to block layer
  qemu-iotests: test new zone operations
  docs/zoned-storage: add zoned device documentation

 block.c|  14 +
 block/block-backend.c  | 145 
 block/file-posix.c | 458 +++--
 block/io.c |  41 +++
 block/raw-format.c |  14 +
 docs/devel/zoned-storage.rst   |  41 +++
 docs/system/qemu-block-drivers.rst.inc |   6 +
 include/block/block-common.h   |  43 +++
 include/block/block-io.h   |   7 +
 include/block/block_int-common.h   |  29 ++
 include/block/raw-aio.h|   6 +-
 include/sysemu/block-backend-io.h  |  17 +
 meson.build|   1 +
 qapi/block-core.json   |   8 +-
 qemu-io-cmds.c | 143 
 tests/qemu-iotests/tests/zoned.out |  53 +++
 tests/qemu-iotests/tests/zoned.sh  |  85 +
 17 files changed, 1071 insertions(+), 40 deletions(-)
 create mode 100644 docs/devel/zoned-storage.rst
 create mode 100644 tests/qemu-iotests/tests/zoned.out
 create mode 100755 tests/qemu-iotests/tests/zoned.sh

-- 
2.37.3




Re: [PATCH 6/7] .gitlab-ci.d/windows.yml: Unify the prerequisite packages

2022-09-09 Thread Yonggang Luo
How about using GitHub Actions? I tried it before and it runs fast; there
is no resource restriction.
I just don't know how to trigger it through GitLab. If that's possible,
it would be good.

On Sat, Sep 10, 2022 at 8:33 AM Bin Meng  wrote:
>
> On Sat, Sep 10, 2022 at 12:32 AM Thomas Huth  wrote:
> >
> > On 08/09/2022 15.28, Bin Meng wrote:
> > > From: Bin Meng 
> > >
> > > At present the prerequisite packages for 64-bit and 32-bit builds
> > > are slightly different. Let's use the same packages for both.
> >
> > Not sure whether that's a good idea ... I did that on purpose to save
some
> > few time during compilation (since the Windows jobs are running very
long
> > already) ... did you check whether it makes a difference in the run
time now?
> >
>
> Not much difference on the build time. Actually I found after we
> switched to single thread build the time did not increase too.
>
> One side note regarding the gitlab shared runner:
>
> It seems the shared runner Windows VM is quite slow. Is it possible to
> get a faster VM externally?
>
> Regards,
> Bin
>


--
 此致
礼
罗勇刚
Yours
sincerely,
Yonggang Luo


[PATCH] vfio/common: Do not g_free in vfio_get_iommu_info

2022-09-09 Thread Nicolin Chen
Its caller, vfio_connect_container(), assigns a default value to
info->iova_pgsizes even if vfio_get_iommu_info() fails. This results
in a segmentation fault when the VFIO_IOMMU_GET_INFO ioctl errors out.

Since the caller already calls g_free(), drop the g_free() from the
rollback path and add a comment to highlight the ownership.
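
A hedged, simplified illustration of the caller pattern (not a literal
copy of vfio_connect_container()): if the callee frees and NULLs *info
on ioctl failure, the fallback assignment dereferences a NULL pointer.

    struct vfio_iommu_type1_info *info = NULL;
    int ret;

    ret = vfio_get_iommu_info(container, &info);
    if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) {
        /* Assume 4k IOVA page size */
        info->iova_pgsizes = 4096;   /* NULL deref if the callee freed info */
    }
    /* ... use info ... */
    g_free(info);                    /* single owner of the allocation */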

Fixes: 87ea529c50 ("vfio: Get migration capability flags for container")
Cc: Kirti Wankhede 
Signed-off-by: Nicolin Chen 
---
 hw/vfio/common.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index ace9562a9b..51b2e05c76 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1940,6 +1940,7 @@ static int vfio_init_container(VFIOContainer *container, int group_fd,
 return 0;
 }
 
+/* The caller is responsible for g_free(*info) */
 static int vfio_get_iommu_info(VFIOContainer *container,
struct vfio_iommu_type1_info **info)
 {
@@ -1951,8 +1952,6 @@ again:
 (*info)->argsz = argsz;
 
 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
-g_free(*info);
-*info = NULL;
 return -errno;
 }
 
-- 
2.17.1




Re: [PATCH 3/7] scripts/nsis.py: Automatically package required DLLs of QEMU executables

2022-09-09 Thread Bin Meng
On Sat, Sep 10, 2022 at 12:49 AM Mark Cave-Ayland
 wrote:
>
> On 08/09/2022 14:28, Bin Meng wrote:
>
> > From: Bin Meng 
> >
> > At present packaging the required DLLs of QEMU executables is a
> > manual process, and error prone.
> >
> > Actually build/config-host.mak contains a GLIB_BINDIR variable
> > which is the directory where glib and other DLLs reside. This
> > works for both Windows native build and cross-build on Linux.
> > We can use it as the search directory for DLLs and automate
> > the whole DLL packaging process.
> >
> > Signed-off-by: Bin Meng 
> > ---
> >
> >   meson.build |  1 +
> >   scripts/nsis.py | 46 ++
> >   2 files changed, 43 insertions(+), 4 deletions(-)
> >
> > diff --git a/meson.build b/meson.build
> > index c2adb7caf4..4c03850f9f 100644
> > --- a/meson.build
> > +++ b/meson.build
> > @@ -3657,6 +3657,7 @@ if host_machine.system() == 'windows'
> >   '@OUTPUT@',
> >   get_option('prefix'),
> >   meson.current_source_dir(),
> > +config_host['GLIB_BINDIR'],
> >   host_machine.cpu(),
> >   '--',
> >   '-DDISPLAYVERSION=' + meson.project_version(),
> > diff --git a/scripts/nsis.py b/scripts/nsis.py
> > index baa6ef9594..03ed7608a2 100644
> > --- a/scripts/nsis.py
> > +++ b/scripts/nsis.py
> > @@ -18,12 +18,36 @@ def signcode(path):
> >   return
> >   subprocess.run([cmd, path])
> >
> > +def find_deps(exe_or_dll, search_path, analyzed_deps):
> > +deps = [exe_or_dll]
> > +output = subprocess.check_output(["objdump", "-p", exe_or_dll], 
> > text=True)
> > +output = output.split("\n")
> > +for line in output:
> > +if not line.startswith("\tDLL Name: "):
> > +continue
> > +
> > +dep = line.split("DLL Name: ")[1].strip()
> > +if dep in analyzed_deps:
> > +continue
> > +
> > +dll = os.path.join(search_path, dep)
> > +if not os.path.exists(dll):
> > +# assume it's a Windows provided dll, skip it
> > +continue
> > +
> > +analyzed_deps.add(dep)
> > +# locate the dll dependencies recursively
> > +rdeps = find_deps(dll, search_path, analyzed_deps)
> > +deps.extend(rdeps)
> > +
> > +return deps
> >
> >   def main():
> >   parser = argparse.ArgumentParser(description="QEMU NSIS build 
> > helper.")
> >   parser.add_argument("outfile")
> >   parser.add_argument("prefix")
> >   parser.add_argument("srcdir")
> > +parser.add_argument("dlldir")
> >   parser.add_argument("cpu")
> >   parser.add_argument("nsisargs", nargs="*")
> >   args = parser.parse_args()
> > @@ -63,9 +87,26 @@ def main():
> >   !insertmacro MUI_DESCRIPTION_TEXT ${{Section_{0}}} "{1}"
> >   """.format(arch, desc))
> >
> > +search_path = args.dlldir
> > +print("Searching '%s' for the dependent dlls ..." % search_path)
> > +dlldir = os.path.join(destdir + prefix, "dll")
> > +os.mkdir(dlldir)
> > +
> >   for exe in glob.glob(os.path.join(destdir + prefix, "*.exe")):
> >   signcode(exe)
> >
> > +# find all dll dependencies
> > +deps = set(find_deps(exe, search_path, set()))
> > +deps.remove(exe)
> > +
> > +# copy all dlls to the DLLDIR
> > +for dep in deps:
> > +dllfile = os.path.join(dlldir, os.path.basename(dep))
> > +if (os.path.exists(dllfile)):
> > +continue
> > +print("Copying '%s' to '%s'" % (dep, dllfile))
> > +shutil.copy(dep, dllfile)
> > +
> >   makensis = [
> >   "makensis",
> >   "-V2",
> > @@ -73,12 +114,9 @@ def main():
> >   "-DSRCDIR=" + args.srcdir,
> >   "-DBINDIR=" + destdir + prefix,
> >   ]
> > -dlldir = "w32"
> >   if args.cpu == "x86_64":
> > -dlldir = "w64"
> >   makensis += ["-DW64"]
> > -if os.path.exists(os.path.join(args.srcdir, "dll")):
> > -makensis += ["-DDLLDIR={0}/dll/{1}".format(args.srcdir, 
> > dlldir)]
> > +makensis += ["-DDLLDIR=" + dlldir]
> >
> >   makensis += ["-DOUTFILE=" + args.outfile] + args.nsisargs
> >   subprocess.run(makensis)
>
> FWIW I wrote a similar script a while back to help package a custom Windows 
> build for
> a client, however I used ldd instead of objdump since it provided the full 
> paths for
> DLLs installed in the msys2/mingw-w64 environment via pacman which were 
> outside the
> QEMU build tree.
>

Yep, ldd also works, but only for Windows native builds. objdump works
on both Windows native and Linux cross builds.

> Once the complete list of DLLs was obtained, it was simple matter of 
> filtering out
> those DLLs that started with the %WINDIR% prefix before copying them to the 
> final
> distribution directory.
>

Regards,
Bin



Re: [PATCH 6/7] .gitlab-ci.d/windows.yml: Unify the prerequisite packages

2022-09-09 Thread Bin Meng
On Sat, Sep 10, 2022 at 12:32 AM Thomas Huth  wrote:
>
> On 08/09/2022 15.28, Bin Meng wrote:
> > From: Bin Meng 
> >
> > At present the prerequisite packages for 64-bit and 32-bit builds
> > are slightly different. Let's use the same packages for both.
>
> Not sure whether that's a good idea ... I did that on purpose to save some
> few time during compilation (since the Windows jobs are running very long
> already) ... did you check whether it makes a difference in the run time now?
>

Not much difference in the build time. Actually, I found that after we
switched to a single-threaded build the time did not increase much either.

One side note regarding the gitlab shared runner:

It seems the shared runner Windows VM is quite slow. Is it possible to
get a faster VM externally?

Regards,
Bin



Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

2022-09-09 Thread Kirill A . Shutemov
On Fri, Sep 09, 2022 at 12:11:05PM -0700, Andy Lutomirski wrote:
> 
> 
> On Fri, Sep 9, 2022, at 7:32 AM, Kirill A . Shutemov wrote:
> > On Thu, Sep 08, 2022 at 09:48:35PM -0700, Andy Lutomirski wrote:
> >> On 8/19/22 17:27, Kirill A. Shutemov wrote:
> >> > On Thu, Aug 18, 2022 at 08:00:41PM -0700, Hugh Dickins wrote:
> >> > > On Thu, 18 Aug 2022, Kirill A . Shutemov wrote:
> >> > > > On Wed, Aug 17, 2022 at 10:40:12PM -0700, Hugh Dickins wrote:
> >> > > > > 
> >> > > > > If your memory could be swapped, that would be enough of a good 
> >> > > > > reason
> >> > > > > to make use of shmem.c: but it cannot be swapped; and although 
> >> > > > > there
> >> > > > > are some references in the mailthreads to it perhaps being 
> >> > > > > swappable
> >> > > > > in future, I get the impression that will not happen soon if ever.
> >> > > > > 
> >> > > > > If your memory could be migrated, that would be some reason to use
> >> > > > > filesystem page cache (because page migration happens to understand
> >> > > > > that type of memory): but it cannot be migrated.
> >> > > > 
> >> > > > Migration support is in pipeline. It is part of TDX 1.5 [1]. And 
> >> > > > swapping
> >> > > > theoretically possible, but I'm not aware of any plans as of now.
> >> > > > 
> >> > > > [1] 
> >> > > > https://www.intel.com/content/www/us/en/developer/articles/technical/intel-trust-domain-extensions.html
> >> > > 
> >> > > I always forget, migration means different things to different 
> >> > > audiences.
> >> > > As an mm person, I was meaning page migration, whereas a virtualization
> >> > > person thinks VM live migration (which that reference appears to be 
> >> > > about),
> >> > > a scheduler person task migration, an ornithologist bird migration, 
> >> > > etc.
> >> > > 
> >> > > But you're an mm person too: you may have cited that reference in the
> >> > > knowledge that TDX 1.5 Live Migration will entail page migration of the
> >> > > kind I'm thinking of.  (Anyway, it's not important to clarify that 
> >> > > here.)
> >> > 
> >> > TDX 1.5 brings both.
> >> > 
> >> > In TDX speak, mm migration called relocation. See TDH.MEM.PAGE.RELOCATE.
> >> > 
> >> 
> >> This seems to be a pretty bad fit for the way that the core mm migrates
> >> pages.  The core mm unmaps the page, then moves (in software) the contents
> >> to a new address, then faults it in.  TDH.MEM.PAGE.RELOCATE doesn't fit 
> >> into
> >> that workflow very well.  I'm not saying it can't be done, but it won't 
> >> just
> >> work.
> >
> > Hm. From what I see we have all necessary infrastructure in place.
> >
> > Unmaping is NOP for inaccessible pages as it is never mapped and we have
> > mapping->a_ops->migrate_folio() callback that allows to replace software
> > copying with whatever is needed, like TDH.MEM.PAGE.RELOCATE.
> >
> > What do I miss?
> 
> Hmm, maybe this isn't as bad as I thought.
> 
> Right now, unless I've missed something, the migration workflow is to
> unmap (via try_to_migrate) all mappings, then migrate the backing store
> (with ->migrate_folio(), although it seems like most callers expect the
> actual copy to happen outside of ->migrate_folio(),

Most? I guess you are talking about MIGRATE_SYNC_NO_COPY, right? AFAICS,
it is an HMM thing, not a common one.

> and then make new
> mappings.  With the *current* (vma-based, not fd-based) model for KVM
> memory, this won't work -- we can't unmap before calling
> TDH.MEM.PAGE.RELOCATE.

We don't need to unmap. The page is not mapped from core-mm PoV.

> But maybe it's actually okay with some care or maybe mild modifications
> with the fd-based model.  We don't have any mmaps, per se, to unmap for
> secret / INACCESSIBLE memory.  So maybe we can get all the way to
> ->migrate_folio() without zapping anything in the secure EPT and just
> call TDH-MEM.PAGE.RELOCATE from inside migrate_folio().  And there will
> be nothing to fault back in.  From the core code's perspective, it's
> like migrating a memfd that doesn't happen to have my mappings at the
> time.

Modifications are needed if we want to initiate migration from userspace. IIRC,
we don't have any API that can initiate page migration for file ranges
without mapping the file.

But the kernel can do it fine for its own housekeeping; compaction, for
instance, doesn't need any VMA. And we need compaction working for the
long-term stability of the system.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov



virtiofsd: Any reason why there's not an "openat2" sandbox mode?

2022-09-09 Thread Colin Walters
We previously had a chat here 
https://lore.kernel.org/all/348d4774-bd5f-4832-bd7e-a21491fda...@www.fastmail.com/T/
around virtiofsd and privileges and the case of trying to run virtiofsd inside 
an unprivileged (Kubernetes) container.

Right now we're still using 9p, and it has bugs (basically it seems like the 9p 
inode flushing callback tries to allocate memory to send an RPC, and this 
causes OOM problems)
https://github.com/coreos/coreos-assembler/issues/1812

Coming back to this: as of recent Linux kernels, there's support for strongly
isolated filesystem access via openat2():
https://lwn.net/Articles/796868/

Is there any reason we couldn't do an -o sandbox=openat2? This operates
without any privileges at all, and should be usable (and secure enough) in our
use case.
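
For what it's worth, a minimal sketch of the primitive I have in mind
(hedged: sandboxed_open() is a hypothetical helper, the flag choice is
illustrative, and glibc has no openat2() wrapper, so it goes through
syscall(2)):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/openat2.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int sandboxed_open(int rootfd, const char *path, int flags)
    {
        struct open_how how = {
            .flags = flags,
            /* RESOLVE_IN_ROOT would be an alternative that scopes ".." and "/" */
            .resolve = RESOLVE_BENEATH | RESOLVE_NO_MAGICLINKS,
        };

        /* Path resolution cannot escape rootfd, and no privileges are needed. */
        return syscall(SYS_openat2, rootfd, path, &how, sizeof(how));
    }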

I may try a patch if this sounds OK...



Re: [RFC PATCH 2/3] hw/peci: add PECI support for NPCM7xx BMCs

2022-09-09 Thread Peter Delevoryas
On Tue, Sep 06, 2022 at 10:05:51PM +, Titus Rwantare wrote:
> This allows BMC firmware for npcm7xx BMCs to talk to a PECI client
> in qemu.
> 
> Signed-off-by: Titus Rwantare 
> Reviewed-by: Patrick Venture 

Looks good to me!

Reviewed-by: Peter Delevoryas 

> ---
>  MAINTAINERS|   3 +-
>  hw/arm/Kconfig |   1 +
>  hw/arm/npcm7xx.c   |   9 ++
>  hw/peci/meson.build|   1 +
>  hw/peci/npcm7xx_peci.c | 204 +
>  hw/peci/trace-events   |   5 +
>  include/hw/arm/npcm7xx.h   |   2 +
>  include/hw/peci/npcm7xx_peci.h |  37 ++
>  8 files changed, 261 insertions(+), 1 deletion(-)
>  create mode 100644 hw/peci/npcm7xx_peci.c
>  create mode 100644 include/hw/peci/npcm7xx_peci.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 14ab29679d..f87dfe5bfa 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2959,7 +2959,7 @@ R: Paolo Bonzini 
>  R: Bandan Das 
>  R: Stefan Hajnoczi 
>  R: Thomas Huth 
> -R: Darren Kenny  
> +R: Darren Kenny 
>  R: Qiuhao Li 
>  S: Maintained
>  F: tests/qtest/fuzz/
> @@ -3218,6 +3218,7 @@ S: Maintained
>  F: hw/peci/peci-core.c
>  F: hw/peci/peci-client.c
>  F: include/hw/peci/peci.h
> +F: hw/peci/npcm7xx_peci.c
>  
>  Firmware schema specifications
>  M: Philippe Mathieu-Daudé 
> diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
> index 15fa79afd3..cb38c6c88f 100644
> --- a/hw/arm/Kconfig
> +++ b/hw/arm/Kconfig
> @@ -408,6 +408,7 @@ config NPCM7XX
>  select SSI
>  select UNIMP
>  select PCA954X
> +select PECI
>  
>  config FSL_IMX25
>  bool
> diff --git a/hw/arm/npcm7xx.c b/hw/arm/npcm7xx.c
> index d85cc02765..d408dd7eb4 100644
> --- a/hw/arm/npcm7xx.c
> +++ b/hw/arm/npcm7xx.c
> @@ -45,6 +45,7 @@
>  #define NPCM7XX_CLK_BA  (0xf0801000)
>  #define NPCM7XX_MC_BA   (0xf0824000)
>  #define NPCM7XX_RNG_BA  (0xf000b000)
> +#define NPCM7XX_PECI_BA (0xf010)
>  
>  /* USB Host modules */
>  #define NPCM7XX_EHCI_BA (0xf0806000)
> @@ -83,6 +84,7 @@ enum NPCM7xxInterrupt {
>  NPCM7XX_UART1_IRQ,
>  NPCM7XX_UART2_IRQ,
>  NPCM7XX_UART3_IRQ,
> +NPCM7XX_PECI_IRQ= 6,
>  NPCM7XX_EMC1RX_IRQ  = 15,
>  NPCM7XX_EMC1TX_IRQ,
>  NPCM7XX_MMC_IRQ = 26,
> @@ -445,6 +447,7 @@ static void npcm7xx_init(Object *obj)
>  }
>  
>  object_initialize_child(obj, "mmc", &s->mmc, TYPE_NPCM7XX_SDHCI);
> +object_initialize_child(obj, "peci", &s->peci, TYPE_NPCM7XX_PECI);
>  }
>  
>  static void npcm7xx_realize(DeviceState *dev, Error **errp)
> @@ -715,6 +718,12 @@ static void npcm7xx_realize(DeviceState *dev, Error 
> **errp)
>  sysbus_connect_irq(SYS_BUS_DEVICE(&s->mmc), 0,
>  npcm7xx_irq(s, NPCM7XX_MMC_IRQ));
>  
> + /* PECI */
> +sysbus_realize(SYS_BUS_DEVICE(&s->peci), &error_abort);
> +sysbus_mmio_map(SYS_BUS_DEVICE(&s->peci), 0, NPCM7XX_PECI_BA);
> +sysbus_connect_irq(SYS_BUS_DEVICE(&s->peci), 0,
> +   npcm7xx_irq(s, NPCM7XX_PECI_IRQ));
> +
>  create_unimplemented_device("npcm7xx.shm",  0xc0001000,   4 * 
> KiB);
>  create_unimplemented_device("npcm7xx.vdmx", 0xe080,   4 * 
> KiB);
>  create_unimplemented_device("npcm7xx.pcierc",   0xe100,  64 * 
> KiB);
> diff --git a/hw/peci/meson.build b/hw/peci/meson.build
> index 01cfa95abe..ee033eb915 100644
> --- a/hw/peci/meson.build
> +++ b/hw/peci/meson.build
> @@ -1 +1,2 @@
>  softmmu_ss.add(when: 'CONFIG_PECI', if_true: files('peci-core.c', 
> 'peci-client.c'))
> +softmmu_ss.add(when: 'CONFIG_NPCM7XX', if_true: files('npcm7xx_peci.c'))
> diff --git a/hw/peci/npcm7xx_peci.c b/hw/peci/npcm7xx_peci.c
> new file mode 100644
> index 00..17a2642898
> --- /dev/null
> +++ b/hw/peci/npcm7xx_peci.c
> @@ -0,0 +1,204 @@
> +/*
> + * Nuvoton NPCM7xx PECI Module
> + *
> + * Copyright 2021 Google LLC
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/peci/npcm7xx_peci.h"
> +#include "qemu/bitops.h"
> +#include "qemu/log.h"
> +#include "qemu/units.h"
> +#include "trace.h"
> +
> +#define PECI_CTL_STS0
> +#define PECI_CTL_STS_DONE_EN  BIT(6)
> +#define PECI_CTL_STS_ABRT_ERR BIT(4)
> +#define PECI_CTL_STS_CRC_ERR  BIT(3)
> +#define PECI_CTL_STS_DONE BIT(1)
> +#define PECI_CTL_STS_START_BUSY   BIT(0)
> +#define PECI_RD_LENGTH  0x4
> +#define PECI_ADDR   0x8
> +#define PECI_CMD0xC
> +#define PECI_CTL2   0x10
> +#define PECI_WR_LENGTH  0x1C
> +#define PECI_PDDR   0x2C
> +#define PECI_DAT_INOUT(reg)(0x100 + (reg) * 4)
> +
> +static uint64_t npcm7xx_peci_read(void *opaque, hwaddr offset, unsigned size)
> +{
> +NPCM7xxPECIState *ps = NPCM7XX_PECI(opaque);
> +uint8_t ret = 0;
> +
> +if (!ps->bus->num_clients) {
> +qemu_log_mask(LOG_GUEST_ERROR, "%s: no peci clien

Re: [RFC PATCH 1/3] hw/peci: add initial support for PECI

2022-09-09 Thread Peter Delevoryas
On Tue, Sep 06, 2022 at 10:05:50PM +, Titus Rwantare wrote:
> PECI - Platform Environment Control Interface
> 
> This commit adds support for reading basic sensor values from a client
> on the PECI bus.
> BMCs can use the PECI wire to get thermal information out of an Intel
> cpu. Additionally, on hardware, various MSRs are exposed over the
> PECI bus. Part of PCI config space is exposed due to Intel posting
> various platform configuration in PCI config space.
> 
> Commands implemented:
> - Ping
> - GetDIB
> - GetTemp
> - GetPkgConfig (partial)
> 
> Commands not implemented:
> - RdIAMSR
> - RdPCIConfig
> - RdPCIConfigLocal
> 
> Signed-off-by: Titus Rwantare 
> Reviewed-by: Hao Wu 
> Reviewed-by: Patrick Venture 
> ---
>  MAINTAINERS|   7 ++
>  hw/Kconfig |   1 +
>  hw/meson.build |   1 +
>  hw/peci/Kconfig|   2 +
>  hw/peci/meson.build|   1 +
>  hw/peci/peci-client.c  | 230 +
>  hw/peci/peci-core.c| 182 
>  hw/peci/trace-events   |   5 +
>  hw/peci/trace.h|   1 +
>  include/hw/peci/peci.h | 194 ++
>  meson.build|   1 +
>  11 files changed, 625 insertions(+)
>  create mode 100644 hw/peci/Kconfig
>  create mode 100644 hw/peci/meson.build
>  create mode 100644 hw/peci/peci-client.c
>  create mode 100644 hw/peci/peci-core.c
>  create mode 100644 hw/peci/trace-events
>  create mode 100644 hw/peci/trace.h
>  create mode 100644 include/hw/peci/peci.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 5ce4227ff6..14ab29679d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3212,6 +3212,13 @@ F: tests/qtest/adm1272-test.c
>  F: tests/qtest/max34451-test.c
>  F: tests/qtest/isl_pmbus_vr-test.c
>  
> +PECI
> +M: Titus Rwantare 
> +S: Maintained
> +F: hw/peci/peci-core.c
> +F: hw/peci/peci-client.c
> +F: include/hw/peci/peci.h
> +
>  Firmware schema specifications
>  M: Philippe Mathieu-Daudé 
>  R: Daniel P. Berrange 
> diff --git a/hw/Kconfig b/hw/Kconfig
> index 38233bbb0f..300ab48127 100644
> --- a/hw/Kconfig
> +++ b/hw/Kconfig
> @@ -24,6 +24,7 @@ source net/Kconfig
>  source nubus/Kconfig
>  source nvme/Kconfig
>  source nvram/Kconfig
> +source peci/Kconfig
>  source pci-bridge/Kconfig
>  source pci-host/Kconfig
>  source pcmcia/Kconfig
> diff --git a/hw/meson.build b/hw/meson.build
> index c7ac7d3d75..340cc88a52 100644
> --- a/hw/meson.build
> +++ b/hw/meson.build
> @@ -28,6 +28,7 @@ subdir('pci')
>  subdir('pci-bridge')
>  subdir('pci-host')
>  subdir('pcmcia')
> +subdir('peci')
>  subdir('rdma')
>  subdir('rtc')
>  subdir('scsi')
> diff --git a/hw/peci/Kconfig b/hw/peci/Kconfig
> new file mode 100644
> index 00..fe4f665d21
> --- /dev/null
> +++ b/hw/peci/Kconfig
> @@ -0,0 +1,2 @@
> +config PECI
> +bool
> diff --git a/hw/peci/meson.build b/hw/peci/meson.build
> new file mode 100644
> index 00..01cfa95abe
> --- /dev/null
> +++ b/hw/peci/meson.build
> @@ -0,0 +1 @@
> +softmmu_ss.add(when: 'CONFIG_PECI', if_true: files('peci-core.c', 
> 'peci-client.c'))
> diff --git a/hw/peci/peci-client.c b/hw/peci/peci-client.c
> new file mode 100644
> index 00..2aa797b5f6
> --- /dev/null
> +++ b/hw/peci/peci-client.c
> @@ -0,0 +1,230 @@
> +/*
> + * PECI Client device
> + * Copyright 2021 Google LLC
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later

Not sure, but I think the SPDX license identifier is supposed to be in
the first line? Maybe not though. I would have expected:

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * PECI Client device
 *
 * Copyright 2021 Google LLC
 */

> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/peci/peci.h"
> +#include "hw/qdev-core.h"
> +#include "hw/qdev-properties.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qapi/visitor.h"
> +#include "qemu/module.h"
> +#include "qemu/log.h"
> +#include "qom/object.h"
> +
> +/*
> + * A PECI client represents an Intel socket and the peripherals attached to 
> it
> + * that are accessible over the PECI bus.
> + */
> +
> +#define PECI_CLIENT_DEFAULT_TEMP 30
> +
> +static void peci_client_update_temps(PECIClientDevice *client)
> +{
> +uint8_t temp_cpu = 0;
> +for (size_t i = 0; i < client->cpu_cores; i++) {
> +if (temp_cpu < client->core_temp[i]) {
> +temp_cpu = client->core_temp[i];
> +}
> +}
> +client->core_temp_max = -1 * (client->tjmax - temp_cpu);
> +
> +uint8_t temp_dimm = 0;
> +for (size_t i = 0; i < client->dimms; i++) {
> +if (temp_dimm < client->dimm_temp[i]) {
> +temp_dimm = client->dimm_temp[i];
> +}
> +}
> +client->dimm_temp_max = temp_dimm;
> +}
> +
> +PECIClientDevice *peci_get_client(PECIBus *bus, uint8_t addr)
> +{
> +PECIClientDevice *client;
> +BusChild *var;
> +
> +QTAILQ_FOREACH(var, &bus->qbus.children, sibling) {
> +DeviceState *dev = var->child;
> +client = PECI_CLIENT(dev);

Re: [RFC PATCH 0/3] Initial PECI bus support

2022-09-09 Thread Peter Delevoryas
On Tue, Sep 06, 2022 at 10:05:49PM +, Titus Rwantare wrote:
> The Platform Environment Control Interface (PECI), is a way for Intel
> processors to communicate with management controllers.
> 
> This series of patches simulate some PECI subsystem functionality. This
> work is currently used against Nuvoton 7xx BMC, but it can easily be
> extended to support Aspeed BMCs. Most of the functionality is derived
> from PECI support in openbmc. See https://github.com/openbmc/libpeci
> 
> The main consumer of this work is openbmc, so functionality not
> exercised by the openbmc/libpeci is unlikely to be present here.
> 
> peci-core.c is an attempt to split out functionality defined by the
> spec. Anything that is not expected to change between BMC vendors.
> 
> The following commands have some support:
> Ping()
> GetDIB()
> GetTemp()
> ~RdPkgConfig()
> ~RdEndPtConfig()
> 
> To be implemented:
> RdIAMSR()
> RdPCIConfig()
> RdPCIConfigLocal()
> 
> Currently, in the board file during bmc_init() one may specify defaults
> as follows:
> 
> static void my_machine_peci_init(NPCM7xxState *soc)
> {
> PECIBus *peci_bus = npcm7xx_peci_get_bus(soc);
> DeviceState *dev;
> 
> /* per socket properties - both sockets are identical in this case */
> PECIClientProperties peci_props = {
> .cpu_family = FAM6_SAPPHIRE_RAPIDS_X,
> .cpus = 56,
> .dimms = 16
> };
> 
> /* socket 0 - with example setting a few of the cpu and dimm temperatures 
> in millidegrees */
> dev = DEVICE(peci_add_client(peci_bus, 0x30, &peci_props));
> object_property_set_uint(OBJECT(dev), "cpu_temp[0]", 3, &error_abort);
> object_property_set_uint(OBJECT(dev), "cpu_temp[2]", 35000, &error_abort);
> object_property_set_uint(OBJECT(dev), "dimm_temp[1]", 4, 
> &error_abort);
> object_property_set_uint(OBJECT(dev), "dimm_temp[8]", 36000, 
> &error_abort);
> 
> /* socket 1 */
> dev = DEVICE(peci_add_client(peci_bus, 0x31, &peci_props));
> object_property_set_uint(OBJECT(dev), "cpu_temp[9]", 5, &error_abort);
> object_property_set_uint(OBJECT(dev), "dimm_temp[0]", 31000, 
> &error_abort);
> object_property_set_uint(OBJECT(dev), "dimm_temp[14]", 36000, 
> &error_abort);
> ...
> }
> 
> This is something that can also be extended as other parameters arise that 
> need
> to differ between platforms. So far you can have different CPUs, DIMM 
> counts,
> DIMM temperatures here. These fields can also be adjusted at runtime through 
> qmp.

That looks good to me, seems like the standard way to do it in QEMU.

> 
> A lot of the registers are hard coded, see hw/peci/peci-client.c. I'd like to
> gauge interest in what potential users would like to be adjustable at runtime.
> I've not written QEMU models that read config files at runtime, something I'd
> appreciate guidance on.

This part I don't totally understand. I also barely know anything about
PECI.

Is the register location for things different between CPU generations?

If so (and I expect it probably is), why is there only a configuration
for Sapphire Rapids, and not for the other ones?

Is that because of PECI protocol changes between generations?

In which case, maybe there needs to be a notion of PECI version
somewhere?

Also, I don't understand why it would be adjustable at runtime, do we
change register locations during execution?

I would expect it to be part of the board definition.

You could provide a bunch of sample configs for the CPUs that you're
testing for, and the board configuration could just select the sample
config it is using (corresponding to the CPU model).
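
Something roughly like this is what I'm picturing (purely illustrative
sketch; the generation-table struct and the client->gen field are made up,
only names that already appear in the series such as
FAM6_SAPPHIRE_RAPIDS_X, spr_config and PECIEndPtConfig are real):

/*
 * Purely illustrative sketch -- PECIGenerationConfig and client->gen are
 * invented for this example.  The idea is one static table entry per CPU
 * generation, selected by the board through the cpu_family property.
 */
typedef struct PECIGenerationConfig {
    uint32_t family;
    uint8_t revision;
    uint32_t ucode;
    const PECIEndPtConfig *endpt_conf;
    size_t num_entries;
} PECIGenerationConfig;

static const PECIGenerationConfig peci_gen_configs[] = {
    { .family = FAM6_ICELAKE_X, .revision = 0x40 },
    { .family = FAM6_SAPPHIRE_RAPIDS_X, .revision = 0x40,
      .ucode = 0x8c0004a0,
      .endpt_conf = spr_config,
      .num_entries = ARRAY_SIZE(spr_config) },
};

static void peci_client_select_generation(PECIClientDevice *client,
                                          uint32_t cpu_family)
{
    for (size_t i = 0; i < ARRAY_SIZE(peci_gen_configs); i++) {
        if (peci_gen_configs[i].family == cpu_family) {
            client->gen = &peci_gen_configs[i]; /* invented field */
            return;
        }
    }
}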

That's the model I would imagine, but I might be missing some important
context here.

Thanks,
Peter

> 
> Thanks all
> 
> Titus Rwantare (3):
>   hw/peci: add initial support for PECI
>   hw/peci: add PECI support for NPCM7xx BMCs
>   hw/peci: add support for EndPointConfig reads
> 
>  MAINTAINERS|  10 +-
>  hw/Kconfig |   1 +
>  hw/arm/Kconfig |   1 +
>  hw/arm/npcm7xx.c   |   9 +
>  hw/meson.build |   1 +
>  hw/peci/Kconfig|   2 +
>  hw/peci/meson.build|   2 +
>  hw/peci/npcm7xx_peci.c | 204 +++
>  hw/peci/peci-client.c  | 293 +
>  hw/peci/peci-core.c| 222 +
>  hw/peci/trace-events   |  10 ++
>  hw/peci/trace.h|   1 +
>  include/hw/arm/npcm7xx.h   |   2 +
>  include/hw/peci/npcm7xx_peci.h |  37 +
>  include/hw/peci/peci.h | 217 
>  meson.build|   1 +
>  16 files changed, 1012 insertions(+), 1 deletion(-)
>  create mode 100644 hw/peci/Kconfig
>  create mode 100644 hw/peci/meson.build
>  create mode 100644 hw/peci/npcm7xx_peci.c
>  create mode 100644 hw/peci/peci-client.c
>  create mode 100644 hw/peci/peci-core

Re: [RFC PATCH 3/3] hw/peci: add support for EndPointConfig reads

2022-09-09 Thread Peter Delevoryas
On Tue, Sep 06, 2022 at 10:05:52PM +, Titus Rwantare wrote:
> Signed-off-by: Titus Rwantare 
> Reviewed-by: Hao Wu 
> ---
>  hw/peci/peci-client.c  | 63 ++
>  hw/peci/peci-core.c| 44 +++--
>  include/hw/peci/peci.h | 23 +++
>  3 files changed, 128 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/peci/peci-client.c b/hw/peci/peci-client.c
> index 2aa797b5f6..8d9248 100644
> --- a/hw/peci/peci-client.c
> +++ b/hw/peci/peci-client.c
> @@ -23,6 +23,64 @@
>  
>  #define PECI_CLIENT_DEFAULT_TEMP 30
>  
> +/* TODO: move this out into a config */
> +static const PECIEndPtConfig spr_config[] = {
> +{
> +.hdr.msg_type = LOCAL_PCI_CFG,
> +.hdr.addr_type = 0x4,
> +.hdr.bus = 31,
> +.hdr.dev = 0,
> +.hdr.func = 2,
> +.hdr.reg = 0xD4,
> +.data = BIT(31)
> +},
> +{
> +.hdr.msg_type = LOCAL_PCI_CFG,
> +.hdr.addr_type = 0x4,
> +.hdr.bus = 31,
> +.hdr.dev = 0,
> +.hdr.func = 2,
> +.hdr.reg = 0xD0,
> +.data = BIT(31) | BIT(30)
> +},
> +{
> +.hdr.msg_type = LOCAL_PCI_CFG,
> +.hdr.addr_type = 0x4,
> +.hdr.bus = 31,
> +.hdr.dev = 30,
> +.hdr.func = 6,
> +.hdr.reg = 0x84,
> +.data = 0x03FF
> +},
> +{
> +.hdr.msg_type = LOCAL_PCI_CFG,
> +.hdr.addr_type = 0x4,
> +.hdr.bus = 31,
> +.hdr.dev = 30,
> +.hdr.func = 6,
> +.hdr.reg = 0x80,
> +.data = 0x
> +},
> +{
> +.hdr.msg_type = LOCAL_PCI_CFG,
> +.hdr.addr_type = 0x4,
> +.hdr.bus = 31,
> +.hdr.dev = 30,
> +.hdr.func = 6,
> +.hdr.reg = 0x84,
> +.data = 0x03FF
> +},
> +{
> +.hdr.msg_type = LOCAL_PCI_CFG,
> +.hdr.addr_type = 0x4,
> +.hdr.bus = 31,
> +.hdr.dev = 30,
> +.hdr.func = 6,
> +.hdr.reg = 0x80,
> +.data = 0x
> +},
> +};
> +
>  static void peci_client_update_temps(PECIClientDevice *client)
>  {
>  uint8_t temp_cpu = 0;
> @@ -115,7 +173,12 @@ PECIClientDevice *peci_add_client(PECIBus *bus,
>  break;
>  
>  case FAM6_ICELAKE_X:
> +client->revision = 0x40;
> +break;
> +
>  case FAM6_SAPPHIRE_RAPIDS_X:
> +client->endpt_conf = spr_config;
> +client->num_entries = sizeof(spr_config) / sizeof(spr_config[0]);
>  client->revision = 0x40;
>  client->ucode = 0x8c0004a0;
>  break;
> diff --git a/hw/peci/peci-core.c b/hw/peci/peci-core.c
> index 8210bfa198..a961ae51f3 100644
> --- a/hw/peci/peci-core.c
> +++ b/hw/peci/peci-core.c
> @@ -22,6 +22,47 @@
>  #define PECI_FCS_OK 0
>  #define PECI_FCS_ERR1
>  
> +static PECIEndPtHeader peci_fmt_end_pt_header(PECICmd *pcmd)
> +{
> +uint32_t val = pcmd->rx[7] | (pcmd->rx[8] << 8) | (pcmd->rx[9] << 16) |
> +  (pcmd->rx[10] << 24);
> +
> +PECIEndPtHeader header = {
> +.msg_type = pcmd->rx[1],
> +.addr_type = pcmd->rx[5],
> +.bus = (val >> 20) & 0xFF,
> +.dev = (val >> 15) & 0x1F,
> +.func = (val >> 12) & 0x7,
> +.reg = val & 0xFFF,
> +};
> +
> +return header;
> +}
> +
> +static void peci_rd_endpt_cfg(PECIClientDevice *client, PECICmd *pcmd)
> +{
> +PECIPkgCfg *resp = (PECIPkgCfg *)pcmd->tx;
> +PECIEndPtHeader req = peci_fmt_end_pt_header(pcmd);
> +PECIEndPtConfig const *c;
> +
> +if (client->endpt_conf) {
> +for (size_t i = 0; i < client->num_entries; i++) {
> +c = &client->endpt_conf[i];
> +
> +if (!memcmp(&req, &c->hdr, sizeof(PECIEndPtHeader))) {
> +resp->data = c->data;
> +resp->cc = PECI_DEV_CC_SUCCESS;
> +return;
> +}
> +}
> +}
> +
> +qemu_log_mask(LOG_UNIMP,
> +  "%s: msg_type: 0x%x bus: %u, dev: %u, func: %u, reg: 
> 0x%x\n",
> +  __func__, req.msg_type, req.bus, req.dev, req.func, 
> req.reg);
> +
> +}
> +
>  static void peci_rd_pkg_cfg(PECIClientDevice *client, PECICmd *pcmd)
>  {
>  PECIPkgCfg *resp = (PECIPkgCfg *)pcmd->tx;
> @@ -153,8 +194,7 @@ int peci_handle_cmd(PECIBus *bus, PECICmd *pcmd)
>  break;
>  
>  case PECI_CMD_RD_END_PT_CFG:
> -qemu_log_mask(LOG_UNIMP, "%s: unimplemented CMD_RD_END_PT_CFG\n",
> -  __func__);
> +peci_rd_endpt_cfg(client, pcmd);
>  break;
>  
>  default:
> diff --git a/include/hw/peci/peci.h b/include/hw/peci/peci.h
> index 1a0abe65cd..4fb2fc236e 100644
> --- a/include/hw/peci/peci.h
> +++ b/include/hw/peci/peci.h
> @@ -112,6 +112,26 @@ typedef struct PECITempTarget {
>  uint8_t tjmax;
>  } PECITempTarget;
>  
> +typedef enum PECIEndPtType {
> +LOCAL_PCI_CFG = 3,
> +PCI_CFG,
> +MMIO_BDF,
> +} PECIEndPtType;
> +
>

Re: [PATCH 04/20] ppc4xx: Use Ppc4xxSdramBank in ppc4xx_sdram_banks()

2022-09-09 Thread BALATON Zoltan

On Fri, 2 Sep 2022, BALATON Zoltan wrote:

On Fri, 2 Sep 2022, Cédric Le Goater wrote:

On 8/19/22 18:55, BALATON Zoltan wrote:

Change ppc4xx_sdram_banks() to take one Ppc4xxSdramBank array instead
of the separate arrays and adjust ppc4xx_sdram_init() and
ppc440_sdram_init() accordingly as well as machines using these.

Signed-off-by: BALATON Zoltan 
---
  hw/ppc/ppc405.h |  4 +---
  hw/ppc/ppc405_uc.c  | 10 +-
  hw/ppc/ppc440.h |  5 ++---
  hw/ppc/ppc440_bamboo.c  | 15 ++-
  hw/ppc/ppc440_uc.c  |  9 -
  hw/ppc/ppc4xx_devs.c| 21 +
  hw/ppc/sam460ex.c   | 15 +--
  include/hw/ppc/ppc4xx.h |  9 +++--
  8 files changed, 35 insertions(+), 53 deletions(-)

diff --git a/hw/ppc/ppc405.h b/hw/ppc/ppc405.h
index 756865621b..ca0972b88b 100644
--- a/hw/ppc/ppc405.h
+++ b/hw/ppc/ppc405.h
@@ -167,9 +167,7 @@ struct Ppc405SoCState {
  DeviceState parent_obj;
/* Public */
-MemoryRegion ram_banks[2];
-hwaddr ram_bases[2], ram_sizes[2];
-
+Ppc4xxSdramBank ram_banks[2];
  MemoryRegion *dram_mr;
  hwaddr ram_size;
  diff --git a/hw/ppc/ppc405_uc.c b/hw/ppc/ppc405_uc.c
index 2833d0d538..461d18c8a5 100644
--- a/hw/ppc/ppc405_uc.c
+++ b/hw/ppc/ppc405_uc.c
@@ -1071,14 +1071,14 @@ static void ppc405_soc_realize(DeviceState *dev, 
Error **errp)

/* SDRAM controller */
  /* XXX 405EP has no ECC interrupt */
-s->ram_bases[0] = 0;
-s->ram_sizes[0] = s->ram_size;
-memory_region_init_alias(&s->ram_banks[0], OBJECT(s),
+s->ram_banks[0].base = 0;
+s->ram_banks[0].size = s->ram_size;
+memory_region_init_alias(&s->ram_banks[0].ram, OBJECT(s),
   "ppc405.sdram0", s->dram_mr,
- s->ram_bases[0], s->ram_sizes[0]);
+ s->ram_banks[0].base, s->ram_banks[0].size);
ppc4xx_sdram_init(env, qdev_get_gpio_in(DEVICE(&s->uic), 17), 1,
-  s->ram_banks, s->ram_bases, s->ram_sizes);
+  s->ram_banks);


Compile fails later on :

../hw/ppc/ppc405_uc.c: In function ‘ppc405_soc_realize’:
../hw/ppc/ppc405_uc.c:1083:5: error: ‘ppc4xx_sdram_init’ accessing 576 
bytes in a region of size 272 [-Werror=stringop-overflow=]

1083 | ppc4xx_sdram_init(env, qdev_get_gpio_in(DEVICE(&s->uic), 17), 1,
 | ^~~~
1084 |   s->ram_banks);
 |   ~
../hw/ppc/ppc405_uc.c:1083:5: note: referencing argument 4 of type 
‘Ppc4xxSdramBank[0]’


I don't understand this error. The 576 bytes seem to be 
sizeof(Ppc4xxSdramBank) and 272 is sizeof(MemoryRegion), but I don't see what 
the compiler means here or how this could be avoided. Also my compiler 
doesn't warn for this so I can't check any alternative solutions. Any ideas?


It would be good to know if this is a compiler bug or a valid error before 
I try to change it. We have a MemoryRegion within Ppc4xxSdramBank, not the 
other way around as the error seems to say. I'm still lost as to why we got 
this error with your compiler version.


Also, this part is removed again two patches later, so the best I could do is 
maybe try rearranging it to swap these patches, but if there's a simpler way 
I'd go for that instead.


I've also sent a few questions replying to your review. I waited for a 
reply for those before preparing a v2. Could you have a look please? It's 
not urgent, but I've already forgotten some of the details, so it will be more 
difficult to redo this later. Just changing the patch order to avoid this 
error would be a bigger task, so if there's some other way I'd choose that.


Regards,
BALATON Zoltan



I am using :

 gcc version 12.2.1 20220819 (Red Hat 12.2.1-1) (GCC)

Thanks,

C.




/* External bus controller */
  if (!ppc4xx_dcr_realize(PPC4xx_DCR_DEVICE(&s->ebc), &s->cpu, errp)) 
{

diff --git a/hw/ppc/ppc440.h b/hw/ppc/ppc440.h
index 7cef936125..5eb2f9a6b3 100644
--- a/hw/ppc/ppc440.h
+++ b/hw/ppc/ppc440.h
@@ -11,14 +11,13 @@
  #ifndef PPC440_H
  #define PPC440_H
  -#include "hw/ppc/ppc.h"
+#include "hw/ppc/ppc4xx.h"
void ppc4xx_l2sram_init(CPUPPCState *env);
  void ppc4xx_cpr_init(CPUPPCState *env);
  void ppc4xx_sdr_init(CPUPPCState *env);
  void ppc440_sdram_init(CPUPPCState *env, int nbanks,
-   MemoryRegion *ram_memories,
-   hwaddr *ram_bases, hwaddr *ram_sizes,
+   Ppc4xxSdramBank ram_banks[],
 int do_init);
  void ppc4xx_ahb_init(CPUPPCState *env);
  void ppc4xx_dma_init(CPUPPCState *env, int dcr_base);
diff --git a/hw/ppc/ppc440_bamboo.c b/hw/ppc/ppc440_bamboo.c
index e3412c4fcd..2aac8a3fe9 100644
--- a/hw/ppc/ppc440_bamboo.c
+++ b/hw/ppc/ppc440_bamboo.c
@@ -168,9 +168,8 @@ static void bamboo_init(MachineState *machine)
  unsigned int pci_irq_nrs[4] = { 28, 27, 26, 25 };
  MemoryRegion *address

Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

2022-09-09 Thread Andy Lutomirski



On Fri, Sep 9, 2022, at 7:32 AM, Kirill A . Shutemov wrote:
> On Thu, Sep 08, 2022 at 09:48:35PM -0700, Andy Lutomirski wrote:
>> On 8/19/22 17:27, Kirill A. Shutemov wrote:
>> > On Thu, Aug 18, 2022 at 08:00:41PM -0700, Hugh Dickins wrote:
>> > > On Thu, 18 Aug 2022, Kirill A . Shutemov wrote:
>> > > > On Wed, Aug 17, 2022 at 10:40:12PM -0700, Hugh Dickins wrote:
>> > > > > 
>> > > > > If your memory could be swapped, that would be enough of a good 
>> > > > > reason
>> > > > > to make use of shmem.c: but it cannot be swapped; and although there
>> > > > > are some references in the mailthreads to it perhaps being swappable
>> > > > > in future, I get the impression that will not happen soon if ever.
>> > > > > 
>> > > > > If your memory could be migrated, that would be some reason to use
>> > > > > filesystem page cache (because page migration happens to understand
>> > > > > that type of memory): but it cannot be migrated.
>> > > > 
>> > > > Migration support is in pipeline. It is part of TDX 1.5 [1]. And 
>> > > > swapping
>> > > > theoretically possible, but I'm not aware of any plans as of now.
>> > > > 
>> > > > [1] 
>> > > > https://www.intel.com/content/www/us/en/developer/articles/technical/intel-trust-domain-extensions.html
>> > > 
>> > > I always forget, migration means different things to different audiences.
>> > > As an mm person, I was meaning page migration, whereas a virtualization
>> > > person thinks VM live migration (which that reference appears to be 
>> > > about),
>> > > a scheduler person task migration, an ornithologist bird migration, etc.
>> > > 
>> > > But you're an mm person too: you may have cited that reference in the
>> > > knowledge that TDX 1.5 Live Migration will entail page migration of the
>> > > kind I'm thinking of.  (Anyway, it's not important to clarify that here.)
>> > 
>> > TDX 1.5 brings both.
>> > 
>> > In TDX speak, mm migration called relocation. See TDH.MEM.PAGE.RELOCATE.
>> > 
>> 
>> This seems to be a pretty bad fit for the way that the core mm migrates
>> pages.  The core mm unmaps the page, then moves (in software) the contents
>> to a new address, then faults it in.  TDH.MEM.PAGE.RELOCATE doesn't fit into
>> that workflow very well.  I'm not saying it can't be done, but it won't just
>> work.
>
> Hm. From what I see we have all necessary infrastructure in place.
>
> Unmaping is NOP for inaccessible pages as it is never mapped and we have
> mapping->a_ops->migrate_folio() callback that allows to replace software
> copying with whatever is needed, like TDH.MEM.PAGE.RELOCATE.
>
> What do I miss?

Hmm, maybe this isn't as bad as I thought.

Right now, unless I've missed something, the migration workflow is to unmap 
(via try_to_migrate) all mappings, then migrate the backing store (with 
->migrate_folio(), although it seems like most callers expect the actual copy 
to happen outside of ->migrate_folio()), and then make new mappings.  With the 
*current* (vma-based, not fd-based) model for KVM memory, this won't work -- we 
can't unmap before calling TDH.MEM.PAGE.RELOCATE.

But maybe it's actually okay with some care or maybe mild modifications with 
the fd-based model.  We don't have any mmaps, per se, to unmap for secret / 
INACCESSIBLE memory.  So maybe we can get all the way to ->migrate_folio() 
without zapping anything in the secure EPT and just call TDH-MEM.PAGE.RELOCATE 
from inside migrate_folio().  And there will be nothing to fault back in.  From 
the core code's perspective, it's like migrating a memfd that doesn't happen to 
have my mappings at the time.

--Andy



Re: [PATCH 10/11] RISC-V: Adding T-Head FMemIdx extension

2022-09-09 Thread Christoph Müllner
On Thu, Sep 8, 2022 at 9:45 AM Richard Henderson <
richard.hender...@linaro.org> wrote:

> On 9/6/22 13:22, Christoph Muellner wrote:
> > @@ -732,6 +733,7 @@ static int ex_rvc_shifti(DisasContext *ctx, int imm)
> >   #include "decode-xtheadbs.c.inc"
> >   #include "decode-xtheadcmo.c.inc"
> >   #include "decode-xtheadcondmov.c.inc"
> > +#include "decode-xtheadfmemidx.c.inc"
> >   #include "decode-xtheadmac.c.inc"
> >   #include "decode-xtheadmemidx.c.inc"
> >   #include "decode-xtheadmempair.c.inc"
> > @@ -1061,6 +1063,7 @@ static void decode_opc(CPURISCVState *env,
> DisasContext *ctx, uint16_t opcode)
> >   { has_xtheadbs_p, decode_xtheadbs },
> >   { has_xtheadcmo_p, decode_xtheadcmo },
> >   { has_xtheadcondmov_p, decode_xtheadcondmov },
> > +{ has_xtheadfmemidx_p, decode_xtheadfmemidx },
> >   { has_xtheadmac_p, decode_xtheadmac },
> >   { has_xtheadmemidx_p, decode_xtheadmemidx },
> >   { has_xtheadmempair_p, decode_xtheadmempair },
>
> I think you should have a single decoder for all of the xthread
> extensions, and each
> translate function should test for the individual extension.  You know
> up-front that these
> extensions do not conflict.
>
>
Ok, I will restructure the series and use a single decoder.

Thanks!


>
> r~
>


Re: [PATCH 03/11] RISC-V: Adding T-Head SYNC instructions

2022-09-09 Thread Christoph Müllner
On Thu, Sep 8, 2022 at 9:30 AM Richard Henderson <
richard.hender...@linaro.org> wrote:

> On 9/6/22 13:22, Christoph Muellner wrote:
> > +NOP_PRIVCHECK(th_sfence_vmas, REQUIRE_PRIV_MHS)
> > +NOP_PRIVCHECK(th_sync, REQUIRE_PRIV_MHSU)
> > +NOP_PRIVCHECK(th_sync_i, REQUIRE_PRIV_MHSU)
> > +NOP_PRIVCHECK(th_sync_is, REQUIRE_PRIV_MHSU)
> > +NOP_PRIVCHECK(th_sync_s, REQUIRE_PRIV_MHSU)
>
> These should not be nops: th_sfence_vmas requires a tlb flush;
> th.sync{,.i} needs to end
> the current translation block; th.sync.{s,is} needs multiprocessor sync,
> which involves a
> call to async_safe_run_on_cpu.
>

Understood.
For a new revision, I'll do the following:
* th_sfence_vmas: async_safe_run_on_cpu() with run_on_cpu_func which
flushes TLB on all CPUs (similar to trans_sfence_vma())
* th_sync/th_sync_i: end the TB (similar to trans_fence_i())
* th_sync_s/th_sync_is: async_safe_run_on_cpu() with run_on_cpu_func which
ends the TB (similar to trans_fence_i())

Thanks!


>
>
> r~
>


Re: [PATCH 3/7] scripts/nsis.py: Automatically package required DLLs of QEMU executables

2022-09-09 Thread Mark Cave-Ayland

On 08/09/2022 14:28, Bin Meng wrote:


From: Bin Meng 

At present packaging the required DLLs of QEMU executables is a
manual process, and error prone.

Actually build/config-host.mak contains a GLIB_BINDIR variable
which is the directory where glib and other DLLs reside. This
works for both Windows native build and cross-build on Linux.
We can use it as the search directory for DLLs and automate
the whole DLL packaging process.

Signed-off-by: Bin Meng 
---

  meson.build |  1 +
  scripts/nsis.py | 46 ++
  2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/meson.build b/meson.build
index c2adb7caf4..4c03850f9f 100644
--- a/meson.build
+++ b/meson.build
@@ -3657,6 +3657,7 @@ if host_machine.system() == 'windows'
  '@OUTPUT@',
  get_option('prefix'),
  meson.current_source_dir(),
+config_host['GLIB_BINDIR'],
  host_machine.cpu(),
  '--',
  '-DDISPLAYVERSION=' + meson.project_version(),
diff --git a/scripts/nsis.py b/scripts/nsis.py
index baa6ef9594..03ed7608a2 100644
--- a/scripts/nsis.py
+++ b/scripts/nsis.py
@@ -18,12 +18,36 @@ def signcode(path):
  return
  subprocess.run([cmd, path])
  
+def find_deps(exe_or_dll, search_path, analyzed_deps):

+deps = [exe_or_dll]
+output = subprocess.check_output(["objdump", "-p", exe_or_dll], text=True)
+output = output.split("\n")
+for line in output:
+if not line.startswith("\tDLL Name: "):
+continue
+
+dep = line.split("DLL Name: ")[1].strip()
+if dep in analyzed_deps:
+continue
+
+dll = os.path.join(search_path, dep)
+if not os.path.exists(dll):
+# assume it's a Windows provided dll, skip it
+continue
+
+analyzed_deps.add(dep)
+# locate the dll dependencies recursively
+rdeps = find_deps(dll, search_path, analyzed_deps)
+deps.extend(rdeps)
+
+return deps
  
  def main():

  parser = argparse.ArgumentParser(description="QEMU NSIS build helper.")
  parser.add_argument("outfile")
  parser.add_argument("prefix")
  parser.add_argument("srcdir")
+parser.add_argument("dlldir")
  parser.add_argument("cpu")
  parser.add_argument("nsisargs", nargs="*")
  args = parser.parse_args()
@@ -63,9 +87,26 @@ def main():
  !insertmacro MUI_DESCRIPTION_TEXT ${{Section_{0}}} "{1}"
  """.format(arch, desc))
  
+search_path = args.dlldir

+print("Searching '%s' for the dependent dlls ..." % search_path)
+dlldir = os.path.join(destdir + prefix, "dll")
+os.mkdir(dlldir)
+
  for exe in glob.glob(os.path.join(destdir + prefix, "*.exe")):
  signcode(exe)
  
+# find all dll dependencies

+deps = set(find_deps(exe, search_path, set()))
+deps.remove(exe)
+
+# copy all dlls to the DLLDIR
+for dep in deps:
+dllfile = os.path.join(dlldir, os.path.basename(dep))
+if (os.path.exists(dllfile)):
+continue
+print("Copying '%s' to '%s'" % (dep, dllfile))
+shutil.copy(dep, dllfile)
+
  makensis = [
  "makensis",
  "-V2",
@@ -73,12 +114,9 @@ def main():
  "-DSRCDIR=" + args.srcdir,
  "-DBINDIR=" + destdir + prefix,
  ]
-dlldir = "w32"
  if args.cpu == "x86_64":
-dlldir = "w64"
  makensis += ["-DW64"]
-if os.path.exists(os.path.join(args.srcdir, "dll")):
-makensis += ["-DDLLDIR={0}/dll/{1}".format(args.srcdir, dlldir)]
+makensis += ["-DDLLDIR=" + dlldir]
  
  makensis += ["-DOUTFILE=" + args.outfile] + args.nsisargs

  subprocess.run(makensis)


FWIW I wrote a similar script a while back to help package a custom Windows build for 
a client, however I used ldd instead of objdump since it provided the full paths for 
DLLs installed in the msys2/mingw-w64 environment via pacman which were outside the 
QEMU build tree.


Once the complete list of DLLs was obtained, it was a simple matter of filtering out 
those DLLs that started with the %WINDIR% prefix before copying them to the final 
distribution directory.



ATB,

Mark.



Re: [PATCH v9 08/10] target/s390x: interception of PTF instruction

2022-09-09 Thread Janis Schoetterl-Glausch
On Fri, 2022-09-02 at 09:55 +0200, Pierre Morel wrote:
> When the host supports the CPU topology facility, the PTF
> instruction with function code 2 is interpreted by the SIE,
> provided that the userland hypervizor activates the interpretation
> by using the KVM_CAP_S390_CPU_TOPOLOGY KVM extension.
> 
> The PTF instructions with function code 0 and 1 are intercepted
> and must be emulated by the userland hypervisor.
> 
> Signed-off-by: Pierre Morel 

Reviewed-by: Janis Schoetterl-Glausch 

See note below.
> ---
>  hw/s390x/cpu-topology.c| 52 ++
>  include/hw/s390x/s390-virtio-ccw.h |  6 
>  target/s390x/kvm/kvm.c | 13 
>  3 files changed, 71 insertions(+)
> 
> diff --git a/hw/s390x/cpu-topology.c b/hw/s390x/cpu-topology.c
> index b6bf839e40..7dcaa28ca3 100644
> --- a/hw/s390x/cpu-topology.c
> +++ b/hw/s390x/cpu-topology.c
> @@ -20,6 +20,58 @@
>  #include "hw/s390x/s390-virtio-ccw.h"
>  #include "hw/s390x/cpu-topology.h"
>  #include "migration/vmstate.h"
> +#include "target/s390x/cpu.h"
> +#include "hw/s390x/s390-virtio-ccw.h"
> +
> +/*
> + * s390_handle_ptf:
> + *
> + * @register 1: contains the function code
> + *
> + * Function codes 0 and 1 handle the CPU polarization.
> + * We assume an horizontal topology, the only one supported currently
> + * by Linux, consequently we answer to function code 0, requesting
> + * horizontal polarization that it is already the current polarization
> + * and reject vertical polarization request without further explanation.
> + *
> + * Function code 2 is handling topology changes and is interpreted
> + * by the SIE.
> + */
> +void s390_handle_ptf(S390CPU *cpu, uint8_t r1, uintptr_t ra)
> +{
> +CPUS390XState *env = &cpu->env;
> +uint64_t reg = env->regs[r1];
> +uint8_t fc = reg & S390_TOPO_FC_MASK;
> +
> +if (!s390_has_feat(S390_FEAT_CONFIGURATION_TOPOLOGY)) {
> +s390_program_interrupt(env, PGM_OPERATION, ra);
> +return;

I'm either expecting this function to return -1 here...
> +}
> +
> +if (env->psw.mask & PSW_MASK_PSTATE) {
> +s390_program_interrupt(env, PGM_PRIVILEGED, ra);
> +return;
> +}
> +
> +if (reg & ~S390_TOPO_FC_MASK) {
> +s390_program_interrupt(env, PGM_SPECIFICATION, ra);
> +return;
> +}
> +
> +switch (fc) {
> +case 0:/* Horizontal polarization is already set */
> +env->regs[r1] |= S390_PTF_REASON_DONE;
> +setcc(cpu, 2);
> +break;
> +case 1:/* Vertical polarization is not supported */
> +env->regs[r1] |= S390_PTF_REASON_NONE;
> +setcc(cpu, 2);
> +break;
> +default:
> +/* Note that fc == 2 is interpreted by the SIE */
> +s390_program_interrupt(env, PGM_SPECIFICATION, ra);
> +}
> +}

[...]
>  
> +static int kvm_handle_ptf(S390CPU *cpu, struct kvm_run *run)
> +{
> +uint8_t r1 = (run->s390_sieic.ipb >> 20) & 0x0f;
> +
> +s390_handle_ptf(cpu, r1, RA_IGNORED);

... and this being returned here...
> +
> +return 0;

... or this function being void.
> +}
> +
>  static int handle_b9(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
>  {
>  int r = 0;
> @@ -1480,6 +1490,9 @@ static int handle_b9(S390CPU *cpu, struct kvm_run *run, 
> uint8_t ipa1)
>  case PRIV_B9_RPCIT:
>  r = kvm_rpcit_service_call(cpu, run);
>  break;
> +case PRIV_B9_PTF:
> +r = kvm_handle_ptf(cpu, run);
> +break;
>  case PRIV_B9_EQBS:
>  /* just inject exception */
>  r = -1;




Call for Outreachy Dec-Mar internship project ideas

2022-09-09 Thread Stefan Hajnoczi
Dear QEMU & KVM community,
The Outreachy open source internship program
(https://www.outreachy.org/) is running again from December-March. If
you have a project idea you'd like to mentor and are a regular
contributor to QEMU or KVM, please reply to this email by September
22nd.

I have CCed active contributors based on git-log(1) but you don't need
to be CCed to become a mentor.

Mentoring an intern is a great way to give back for the support you
received along the way of your open source journey. You'll get
experience with interviewing and running projects. And most of all,
it's fun to work with talented contributors excited about open source!

You must be willing to commit around 5 hours per week during the
application phase and coding period.

Project ideas should be:
- 12 weeks of full-time work for a competent programmer without prior
exposure to the code base.
- Well-defined: scope is clear.
- Self-contained: has few dependencies.
- Uncontroversial: acceptable to the community.
- Incremental: produces deliverables along the way.

Project description template:
 === TITLE ===
 '''Summary:''' Short description of the project

 Detailed description of the project for someone who is not familiar
with the code base.

 '''Links:'''
 * Wiki links to relevant material
 * External links to mailing lists or web sites

 '''Details:'''
 * Skill level: beginner or intermediate or advanced
 * Language: C/Rust/Python
 * Mentors: Email address and IRC nick

Thank you,
Stefan



Re: [PATCH 6/7] .gitlab-ci.d/windows.yml: Unify the prerequisite packages

2022-09-09 Thread Thomas Huth

On 08/09/2022 15.28, Bin Meng wrote:

From: Bin Meng 

At present the prerequisite packages for 64-bit and 32-bit builds
are slightly different. Let's use the same packages for both.


Not sure whether that's a good idea ... I did that on purpose to save some 
time during compilation (since the Windows jobs are running very long 
already) ... did you check whether it makes a difference in the run time now?


 Thomas





Re: [PATCH 4/7] .gitlab-ci.d/windows.yml: Drop the sed processing in the 64-bit build

2022-09-09 Thread Thomas Huth

On 08/09/2022 16.04, Marc-André Lureau wrote:

Hi

On Thu, Sep 8, 2022 at 5:33 PM Bin Meng > wrote:


From: Bin Meng mailto:bin.m...@windriver.com>>

The sed processing of build/config-host.mak seems to be no longer
needed, and there is no such in the 32-bit build too. Drop it.

Signed-off-by: Bin Meng mailto:bin.m...@windriver.com>>
---

  .gitlab-ci.d/windows.yml | 1 -
  1 file changed, 1 deletion(-)

diff --git a/.gitlab-ci.d/windows.yml b/.gitlab-ci.d/windows.yml
index da6013904a..86a4339c48 100644
--- a/.gitlab-ci.d/windows.yml
+++ b/.gitlab-ci.d/windows.yml
@@ -60,7 +60,6 @@ msys2-64bit:
    - $env:MSYS = 'winsymlinks:native' # Enable native Windows symlink
    - .\msys64\usr\bin\bash -lc './configure --target-list=x86_64-softmmu
        --enable-capstone --without-default-devices'
-  - .\msys64\usr\bin\bash -lc "sed -i '/^ROMS=/d' build/config-host.mak"


It looks like it is there to remove the ROMS from the make build. No idea if 
that still makes sense. Thomas, do you remember?


I originally had to add this sed statement since there was a compile error 
otherwise in the ROMS ... if it now works fine without this line, this 
should be fine, of course, too.


Reviewed-by: Thomas Huth 




Re: [PATCH v5 3/3] i386: Add notify VM exit support

2022-09-09 Thread Peter Xu
On Wed, Aug 17, 2022 at 10:08:45AM +0800, Chenyi Qiang wrote:
> There are cases that malicious virtual machine can cause CPU stuck (due
> to event windows don't open up), e.g., infinite loop in microcode when
> nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and
> IRQ) can be delivered. It leads the CPU to be unavailable to host or
> other VMs. Notify VM exit is introduced to mitigate such kind of
> attacks, which will generate a VM exit if no event window occurs in VM
> non-root mode for a specified amount of time (notify window).
> 
> A new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT is exposed to user space
> so that the user can query the capability and set the expected notify
> window when creating VMs. The format of the argument when enabling this
> capability is as follows:
>   Bit 63:32 - notify window specified in qemu command
>   Bit 31:0  - some flags (e.g. KVM_X86_NOTIFY_VMEXIT_ENABLED is set to
>   enable the feature.)
> 
> Because there are some concerns, e.g. a notify VM exit may happen with
> VM_CONTEXT_INVALID set in exit qualification (no cases are anticipated
> that would set this bit), which means VM context is corrupted. To avoid
> the false positive and a well-behaved guest gets killed, make this
> feature disabled by default. Users can enable the feature by a new
> machine property:
> qemu -machine notify_vmexit=on,notify_window=0 ...

The patch looks sane to me; I only read the KVM interface, though.  Worth
adding a section to qemu-options.hx?  It'll also be worthwhile to mention the
valid range of notify_window and meaning of zero (IIUC that's also a valid
input, just use the hardware default window size).
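
For reference, IIUC the userspace enablement boils down to something like
the sketch below (untested, following only the bit layout described above;
it assumes kernel headers new enough to define KVM_CAP_X86_NOTIFY_VMEXIT
and KVM_X86_NOTIFY_VMEXIT_ENABLED):

/*
 * Untested sketch, following the bit layout described above:
 * bits 63:32 carry the notify window, bits 31:0 carry the flags.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int enable_notify_vmexit(int vm_fd, uint32_t notify_window)
{
    struct kvm_enable_cap cap = {
        .cap = KVM_CAP_X86_NOTIFY_VMEXIT,
        .args[0] = ((uint64_t)notify_window << 32) |
                   KVM_X86_NOTIFY_VMEXIT_ENABLED,
    };

    return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}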

Thanks,

> 
> A new KVM exit reason KVM_EXIT_NOTIFY is defined for notify VM exit. If
> it happens with VM_INVALID_CONTEXT, hypervisor exits to user space to
> inform the fatal case. Then user space can inject a SHUTDOWN event to
> the target vcpu. This is implemented by injecting a synthesized triple
> fault event.

-- 
Peter Xu




Re: [PATCH v5 2/3] i386: kvm: extend kvm_{get, put}_vcpu_events to support pending triple fault

2022-09-09 Thread Peter Xu
On Wed, Aug 17, 2022 at 10:08:44AM +0800, Chenyi Qiang wrote:
> For the direct triple faults, i.e. hardware detected and KVM morphed
> to VM-Exit, KVM will never lose them. But for triple faults sythesized
> by KVM, e.g. the RSM path, if KVM exits to userspace before the request
> is serviced, userspace could migrate the VM and lose the triple fault.
> 
> A new flag KVM_VCPUEVENT_VALID_TRIPLE_FAULT is defined to signal that
> the event.triple_fault_pending field contains a valid state if the
> KVM_CAP_X86_TRIPLE_FAULT_EVENT capability is enabled.
> 
> Signed-off-by: Chenyi Qiang 

Acked-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH v3] target/i386: Set maximum APIC ID to KVM prior to vCPU creation

2022-09-09 Thread Peter Xu
On Thu, Aug 25, 2022 at 10:52:46AM +0800, Zeng Guang wrote:
> Specify maximum possible APIC ID assigned for current VM session to KVM
> prior to the creation of vCPUs. By this setting, KVM can set up VM-scoped
> data structure indexed by the APIC ID, e.g. Posted-Interrupt Descriptor
> pointer table to support Intel IPI virtualization, with the most optimal
> memory footprint.
> 
> It can be achieved by calling KVM_ENABLE_CAP for KVM_CAP_MAX_VCPU_ID
> capability once KVM has enabled it. Ignoring the return error if KVM
> doesn't support this capability yet.
> 
> Signed-off-by: Zeng Guang 

Acked-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

2022-09-09 Thread Michael Roth
On Wed, Jul 06, 2022 at 04:20:02PM +0800, Chao Peng wrote:
> This is the v7 of this series which tries to implement the fd-based KVM
> guest private memory. The patches are based on latest kvm/queue branch
> commit:
> 
>   b9b71f43683a (kvm/queue) KVM: x86/mmu: Buffer nested MMU
> split_desc_cache only by default capacity
> 
> Introduction
> 
> In general this patch series introduce fd-based memslot which provides
> guest memory through memory file descriptor fd[offset,size] instead of
> hva/size. The fd can be created from a supported memory filesystem
> like tmpfs/hugetlbfs etc. which we refer as memory backing store. KVM
> and the the memory backing store exchange callbacks when such memslot
> gets created. At runtime KVM will call into callbacks provided by the
> backing store to get the pfn with the fd+offset. Memory backing store
> will also call into KVM callbacks when userspace punch hole on the fd
> to notify KVM to unmap secondary MMU page table entries.
> 
> Comparing to existing hva-based memslot, this new type of memslot allows
> guest memory unmapped from host userspace like QEMU and even the kernel
> itself, therefore reduce attack surface and prevent bugs.
> 
> Based on this fd-based memslot, we can build guest private memory that
> is going to be used in confidential computing environments such as Intel
> TDX and AMD SEV. When supported, the memory backing store can provide
> more enforcement on the fd and KVM can use a single memslot to hold both
> the private and shared part of the guest memory. 

Hi everyone,

Just wanted to let you all know that I reserved a slot at the LPC
Confidential Computing Microconference to discuss some topics related
to unmapped/inaccessible private memory support:

  "Unmapped Private Memory for Confidential Guests"
  Tuesday, Sep 13th, 10:00am (Dublin time)
  https://lpc.events/event/16/sessions/133/#20220913

The discussion agenda is still a bit in flux, but one topic I really
wanted to cover is how we intend to deal with the kernel directmap
for TDX/SNP, where there is a need to either remove or split mappings
so that KVM or other kernel threads writing to non-private pages
don't run into issues due mappings overlapping with private pages.[1]

Other possible discussion topics:

  - guarding against shared->private conversions while KVM is
attempting to access a shared page (separate PFN pools for
shared/private seems to resolve this nicely, but may not be
compatible with things like pKVM where the underlying PFN
is the same for shared/private)[2]

  - extending KVM_EXIT_MEMORY_FAULT to handle batched requests to
better handle things like explicit batched conversions initiated
by the guest

It's a short session so not sure how much time we'll actually have
to discuss things in detail, but maybe this can at least be a good
jumping off point for other discussions.

Thanks, and hope to see you there!

[1] https://lore.kernel.org/all/ywb8wg6ravbs1...@google.com/
[2] 
https://lore.kernel.org/lkml/CA+EHjTy6NF=bkcqk0vhxldtkzmahp55jumsfxn96-nt3yim...@mail.gmail.com/



[PATCH] target/riscv/pmp: fix non-translated page size address checks w/ MPU

2022-09-09 Thread leon
From: Leon Schuermann 

This commit fixes PMP address access checks with non page-aligned PMP
regions on harts with MPU enabled. Without this change, the presence
of an MPU in the virtual CPU model would influence the PMP address
check behavior when an access size was unknown (`size == 0`),
regardless of whether virtual memory has actually been enabled by the
guest.

The RISC-V Privileged Spec Version 20211203[1] states in 4.3.1
Addressing and Memory Protection that "[...]  [w]hen Sv32 virtual
memory mode is selected in the MODE field of the satp register,
supervisor virtual addresses are translated into supervisor physical
addresses via a two-level page table. The 20-bit VPN is translated
into a 22-bit physical page number (PPN), while the 12-bit page offset
is untranslated. The resulting supervisor-level physical addresses are
then checked using any physical memory protection structures (Sections
3.7), before being directly converted to machine-level physical
addresses. [...]" and "[...] [w]hen the value of satp.MODE is Bare,
the 32-bit virtual address is translated (unmodified) into a 32-bit
physical address [...]". Other modes such as Sv39, Sv48 and Sv57 are
said to behave similar in this regard.

From this specification it can be inferred that any access made when
virtual memory is disabled, which is the case when satp.MODE is set to
"Bare" (0), should behave identically with respect to PMP checks as if
no MPU were present in the system at all. The current implementation,
however, degrades any PMP address checks of unknown access size (which
seems to be the case for instruction fetches at least) to be of
page-granularity, just based on the fact that the hart has MPU support
enabled. This causes systems that rely on 4-byte aligned PMP regions
to incur access faults, which are not occurring with the MPU disabled,
independent of any runtime guest configuration.

While there possibly are other unhandled edge cases in which
page-granularity access checks might not be appropriate, this commit
appears to be a strict improvement over the current implementation's
behavior. It has been tested using Tock OS, but not with other
systems (e.g., Linux) yet.

[1]: 
https://github.com/riscv/riscv-isa-manual/releases/download/Priv-v1.12/riscv-privileged-20211203.pdf

Signed-off-by: Leon Schuermann 
---

This patch is a resubmission to include all maintainers of the
modified files and main QEMU mailing list, as determined through the
`get_maintainer.pl` script.

Also, one particular example of an additional edge case not handled
through this patch might be a hart operating in M-mode. Given that
virtual memory through {Sv32,Sv39,Sv48,Sv57} is only supported for
S-mode and U-mode respectively, enabling virtual memory in the satp
CSR should not have any effect on the behavior of memory accesses
w.r.t. PMP checks for harts operating in M-mode.

I'm going to defer adding this additional check, as I'd appreciate some
feedback as to whether my reasoning is correct here at all first.
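
For illustration, the additional condition I have in mind would look roughly
like this (untested sketch written against the hunk below; PRV_M and
env->priv are assumed to be usable here the same way they are elsewhere in
target/riscv, and whether env->priv or some other mode indication is the
right thing to check is exactly the kind of feedback I'm after):

/*
 * Untested sketch of the deferred M-mode check: satp has no effect in
 * M-mode, so only degrade to page-granularity checks when the hart can
 * actually be subject to address translation.
 */
if (riscv_feature(env, RISCV_FEATURE_MMU)
    && env->priv != PRV_M
    && get_field(env->satp, satp_mode)) {
    /* Same page-granularity assumption as in the hunk below. */
    pmp_size = -(addr | TARGET_PAGE_MASK);
}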

Thanks!

-Leon

---
 target/riscv/pmp.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/target/riscv/pmp.c b/target/riscv/pmp.c
index ea2b67d947..48f64a4aef 100644
--- a/target/riscv/pmp.c
+++ b/target/riscv/pmp.c
@@ -300,6 +300,7 @@ bool pmp_hart_has_privs(CPURISCVState *env, target_ulong 
addr,
 int i = 0;
 int ret = -1;
 int pmp_size = 0;
+uint64_t satp_mode;
 target_ulong s = 0;
 target_ulong e = 0;
 
@@ -310,10 +311,17 @@ bool pmp_hart_has_privs(CPURISCVState *env, target_ulong 
addr,
 }
 
 if (size == 0) {
-if (riscv_feature(env, RISCV_FEATURE_MMU)) {
+if (riscv_cpu_mxl(env) == MXL_RV32) {
+satp_mode = SATP32_MODE;
+} else {
+satp_mode = SATP64_MODE;
+}
+
+if (riscv_feature(env, RISCV_FEATURE_MMU)
+&& get_field(env->satp, satp_mode)) {
 /*
- * If size is unknown (0), assume that all bytes
- * from addr to the end of the page will be accessed.
+ * If size is unknown (0) and virtual memory is enabled, assume 
that
+ * all bytes from addr to the end of the page will be accessed.
  */
 pmp_size = -(addr | TARGET_PAGE_MASK);
 } else {

base-commit: 61fd710b8da8aedcea9b4f197283dc38638e4b60
-- 
2.36.0




Re: [PATCH v3] audio: add help option for -audio and -audiodev

2022-09-09 Thread Daniel P . Berrangé
On Fri, Sep 09, 2022 at 04:27:34PM +0200, Claudio Fontana wrote:
> On 9/9/22 15:50, Daniel P. Berrangé wrote:
> > On Fri, Sep 09, 2022 at 03:41:22PM +0200, Claudio Fontana wrote:
> >> On 9/9/22 00:05, Paolo Bonzini wrote:
> >>> Il gio 8 set 2022, 15:47 Claudio Fontana  ha scritto:
> >>>
>  On 9/8/22 11:39, Paolo Bonzini wrote:
> > Queued, thanks.
> >
> > Paolo
> >
> >
> 
>  Thanks. When it comes to programmatic checks about what QEMU supports in
>  terms of audio,
> 
>  is there something that can be done with QMP?
> 
>  I checked the QMP manual at:
> 
> 
>  https://qemu.readthedocs.io/en/latest/interop/qemu-qmp-ref.html#qapidoc-2948
> 
>  but in the "Audio" section there is a bunch of Objects and enums defined,
>  but no command to query them...
> 
> >>> No, there's nothing yet.
> > 
> > You're now reminding me of the patch I sent a while ago for reporting
> > audiodev backends and then completely forgot to followup on
> > 
> >   https://mail.gnu.org/archive/html/qemu-devel/2021-03/msg00656.html
> > 
> > 
> >> Interesting. What about Display (ie ui-*) ? I mean how do I figure out 
> >> from, say, libvirt,
> >> everything that QEMU can do in terms of display, which drivers are 
> >> actually installed?
> >>
> >> Same for block...
> >>
> >> with the increasing modularization of QEMU we should I presume strengthen 
> >> the discoverability of QEMU capabilities right?
> >> This way we can configure once, and install just what is needed to match 
> >> the user requirements, or distro variant.
> >>
> >> As Markus mentioned maybe a more general solution would be to have these 
> >> things as qom objects so that a
> >>
> >> qom-list-types
> >> can be used to get all 'audiodev' types, or all 'display' types, or all 
> >> 'block' types and solve the problem this way?
> > 
> >> Is there a more general problem / solution that I am not seeing?
> > 
> > In an idealized world (where we can ignore the reality of our
> > existing legacy codebase) I think all backends would simply
> > be QOM objects, and created with -object, avoiding the need for
> > any backend type specific CLI args like -audiodev / -netdev / etc.
> > 
> > This would actually also extend to frontends, devices, cpus,
> > machine types etc all being objects, ought to be creatable via
> > -object, not requiring -device, -machine.
> > 
> > If we lived in the world where everything was a QOM Object,
> > then qom-list-types would serve as the universal detection
> > mechanism for everything.
> > 
> > Back in our current reality of pre-existing legacy code though,
> we have to be a little more pragmatic. If we can make things
> > into QOM objects that work with -object / qom-list-types that's
> > great, but if it is too much work we'll just have to create other
> > QMP commands for querying, such as the query-audiodev patch.
> > 
> > 
> > With regards,
> > Daniel
> 
> Hmm, the patch I am seeing, though, says that it
> 
> "reflects back the list of configured -audiodev command line options".
> 
> Maybe we are saying the same thing, but maybe we aren't: what we are 
> actually trying to achieve is to probe which audiodev drivers are 
> available, either built-in or loaded as modules.
> Same for display, block, etc.
> 
> You are instead trying to fetch the command line options?

Sort of, it's a gross hack. By adding query-audiodev for querying the
config of -audiodev command line options, the audiodev backend types
get added to the QMP schema. You can thus query for whether a backend
exists using query-qmp-schema.


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

2022-09-09 Thread Kirill A . Shutemov
On Thu, Sep 08, 2022 at 09:48:35PM -0700, Andy Lutomirski wrote:
> On 8/19/22 17:27, Kirill A. Shutemov wrote:
> > On Thu, Aug 18, 2022 at 08:00:41PM -0700, Hugh Dickins wrote:
> > > On Thu, 18 Aug 2022, Kirill A . Shutemov wrote:
> > > > On Wed, Aug 17, 2022 at 10:40:12PM -0700, Hugh Dickins wrote:
> > > > > 
> > > > > If your memory could be swapped, that would be enough of a good reason
> > > > > to make use of shmem.c: but it cannot be swapped; and although there
> > > > > are some references in the mailthreads to it perhaps being swappable
> > > > > in future, I get the impression that will not happen soon if ever.
> > > > > 
> > > > > If your memory could be migrated, that would be some reason to use
> > > > > filesystem page cache (because page migration happens to understand
> > > > > that type of memory): but it cannot be migrated.
> > > > 
> > > > Migration support is in pipeline. It is part of TDX 1.5 [1]. And 
> > > > swapping
> > > > theoretically possible, but I'm not aware of any plans as of now.
> > > > 
> > > > [1] 
> > > > https://www.intel.com/content/www/us/en/developer/articles/technical/intel-trust-domain-extensions.html
> > > 
> > > I always forget, migration means different things to different audiences.
> > > As an mm person, I was meaning page migration, whereas a virtualization
> > > person thinks VM live migration (which that reference appears to be 
> > > about),
> > > a scheduler person task migration, an ornithologist bird migration, etc.
> > > 
> > > But you're an mm person too: you may have cited that reference in the
> > > knowledge that TDX 1.5 Live Migration will entail page migration of the
> > > kind I'm thinking of.  (Anyway, it's not important to clarify that here.)
> > 
> > TDX 1.5 brings both.
> > 
> > In TDX speak, mm migration called relocation. See TDH.MEM.PAGE.RELOCATE.
> > 
> 
> This seems to be a pretty bad fit for the way that the core mm migrates
> pages.  The core mm unmaps the page, then moves (in software) the contents
> to a new address, then faults it in.  TDH.MEM.PAGE.RELOCATE doesn't fit into
> that workflow very well.  I'm not saying it can't be done, but it won't just
> work.

Hm. From what I see we have all necessary infrastructure in place.

Unmaping is NOP for inaccessible pages as it is never mapped and we have
mapping->a_ops->migrate_folio() callback that allows to replace software
copying with whatever is needed, like TDH.MEM.PAGE.RELOCATE.

What do I miss?

-- 
  Kiryl Shutsemau / Kirill A. Shutemov



Re: [PATCH v3] audio: add help option for -audio and -audiodev

2022-09-09 Thread Claudio Fontana
On 9/9/22 15:50, Daniel P. Berrangé wrote:
> On Fri, Sep 09, 2022 at 03:41:22PM +0200, Claudio Fontana wrote:
>> On 9/9/22 00:05, Paolo Bonzini wrote:
>>> Il gio 8 set 2022, 15:47 Claudio Fontana  ha scritto:
>>>
 On 9/8/22 11:39, Paolo Bonzini wrote:
> Queued, thanks.
>
> Paolo
>
>

 Thanks. When it comes to programmatic checks about what QEMU supports in
 terms of audio,

 is there something that can be done with QMP?

 I checked the QMP manual at:


 https://qemu.readthedocs.io/en/latest/interop/qemu-qmp-ref.html#qapidoc-2948

 but in the "Audio" section there is a bunch of Objects and enums defined,
 but no command to query them...

>>> No, there's nothing yet.
> 
> You're now reminding me of the patch I sent a while ago for reporting
> audiodev backends and then completely forgot to followup on
> 
>   https://mail.gnu.org/archive/html/qemu-devel/2021-03/msg00656.html
> 
> 
>> Interesting. What about Display (ie ui-*) ? I mean how do I figure out from, 
>> say, libvirt,
>> everything that QEMU can do in terms of display, which drivers are actually 
>> installed?
>>
>> Same for block...
>>
>> with the increasing modularization of QEMU we should I presume strengthen 
>> the discoverability of QEMU capabilities right?
>> This way we can configure once, and install just what is needed to match the 
>> user requirements, or distro variant.
>>
>> As Markus mentioned maybe a more general solution would be to have these 
>> things as qom objects so that a
>>
>> qom-list-types
>> can be used to get all 'audiodev' types, or all 'display' types, or all 
>> 'block' types and solve the problem this way?
> 
>> Is there a more general problem / solution that I am not seeing?
> 
> In an idealized world (where we can ignore the reality of our
> existing legacy codebase) I think all backends would simply
> be QOM objects, and created with -object, avoiding the need for
> any backend type specific CLI args like -audiodev / -netdev / etc.
> 
> This would actually also extend to frontends, devices, cpus,
> machine types etc all being objects, ought to be creatable via
> -object, not requiring -device, -machine.
> 
> If we lived in the world where everything was a QOM Object,
> then qom-list-types would serve as the universal detection
> mechanism for everything.
> 
> Back in our current reality of pre-existing legacy code though,
> we have to be a little more pragmatic. If we can make things
> into QOM objects that work with -object / qom-list-types that's
> great, but if it is too much work we'll just have to create other
> QMP commands for querying, such as the query-audiodev patch.
> 
> 
> With regards,
> Daniel

Hmm, the patch I am seeing, though, says that it

"reflects back the list of configured -audiodev command line options".

Maybe we are saying the same thing, but maybe we aren't: what we are actually
trying to achieve is to probe which audiodev drivers are available, either
built-in or loaded as modules.
Same for display, block, etc.

You are instead trying to fetch the command line options?



Re: [PATCH v3 4/5] acpi/nvdimm: Implement ACPI NVDIMM Label Methods

2022-09-09 Thread Robert Hoo
On Fri, 2022-09-09 at 15:39 +0200, Igor Mammedov wrote:
> On Thu,  1 Sep 2022 11:27:20 +0800
> Robert Hoo  wrote:
> 
> > The recent ACPI spec [1] has defined the NVDIMM Label Methods _LS{I,R,W},
> > which deprecate the corresponding _DSM Functions defined by the PMEM _DSM
> > Interface spec [2].
> > 
> > Since the semantics of the new Label Methods are the same as the old _DSM
> > methods, the implementations here simply wrap the old ones.
> > 
> > The ASL form diff can be found in the next patch, which updates the golden
> > master binaries.
> > 
> > [1] ACPI Spec v6.4, 6.5.10 NVDIMM Label Methods
> > https://uefi.org/sites/default/files/resources/ACPI_Spec_6_4_Jan22.pdf
> > [2] Intel PMEM _DSM Interface Spec v2.0, 3.10 Deprecated Functions
> > https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf
> > 
> > Signed-off-by: Robert Hoo 
> > Reviewed-by: Jingqi Liu 
> 
> looks more or less fine except for the excessive use of named variables,
> which creates global-scope variables.
> 
> I'd suggest storing temporary buffers/packages in LocalX variables;
> you should be able to do that for everything modulo
> aml_create_dword_field().
> 
> see an example below
> 
> > ---
> >  hw/acpi/nvdimm.c | 91
> > 
> >  1 file changed, 91 insertions(+)
> > 
> > diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
> > index afff911c1e..516acfe53b 100644
> > --- a/hw/acpi/nvdimm.c
> > +++ b/hw/acpi/nvdimm.c
> > @@ -1243,6 +1243,7 @@ static void nvdimm_build_fit(Aml *dev)
> >  static void nvdimm_build_nvdimm_devices(Aml *root_dev, uint32_t
> > ram_slots)
> >  {
> >  uint32_t slot;
> > +Aml *method, *pkg, *field, *com_call;
> >  
> >  for (slot = 0; slot < ram_slots; slot++) {
> >  uint32_t handle = nvdimm_slot_to_handle(slot);
> > @@ -1260,6 +1261,96 @@ static void nvdimm_build_nvdimm_devices(Aml
> > *root_dev, uint32_t ram_slots)
> >   */
> >  aml_append(nvdimm_dev, aml_name_decl("_ADR",
> > aml_int(handle)));
> >  
> > +/*
> > + * ACPI v6.4: Section 6.5.10 NVDIMM Label Methods
> > + */
> > +/* _LSI */
> > +method = aml_method("_LSI", 0, AML_SERIALIZED);
> > +com_call = aml_call5(NVDIMM_COMMON_DSM,
> > +aml_touuid(NVDIMM_DEVICE_DSM_UUID),
> > +aml_int(1), aml_int(4), aml_int(0),
> > +aml_int(handle));
> > +aml_append(method, aml_store(com_call, aml_local(0)));
> > +
> > +aml_append(method, aml_create_dword_field(aml_local(0),
> > +  aml_int(0),
> > "STTS"));
> > +aml_append(method, aml_create_dword_field(aml_local(0),
> > aml_int(4),
> > +  "SLSA"));
> > +aml_append(method, aml_create_dword_field(aml_local(0),
> > aml_int(8),
> > +  "MAXT"));
> > +
> > +pkg = aml_package(3);
> > +aml_append(pkg, aml_name("STTS"));
> > +aml_append(pkg, aml_name("SLSA"));
> > +aml_append(pkg, aml_name("MAXT"));
> > +aml_append(method, aml_name_decl("RET", pkg));
> 
> ex: put it in a local instead of a named variable and return that;
> the same applies to the other temporary named variables.

Could you help provide the example in the form of ASL?
Thanks.
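
One possible reading of that suggestion, sketched with QEMU's aml_* builders
rather than raw ASL (untested, and reusing the variables from the hunk above):
build the return package into a LocalX and return that, so no named RET
object is emitted:

    /* _LSI, returning the package via Local1 instead of a named "RET" */
    method = aml_method("_LSI", 0, AML_SERIALIZED);
    aml_append(method, aml_store(aml_call5(NVDIMM_COMMON_DSM,
                                           aml_touuid(NVDIMM_DEVICE_DSM_UUID),
                                           aml_int(1), aml_int(4), aml_int(0),
                                           aml_int(handle)),
                                 aml_local(0)));

    aml_append(method, aml_create_dword_field(aml_local(0), aml_int(0), "STTS"));
    aml_append(method, aml_create_dword_field(aml_local(0), aml_int(4), "SLSA"));
    aml_append(method, aml_create_dword_field(aml_local(0), aml_int(8), "MAXT"));

    pkg = aml_package(3);
    aml_append(pkg, aml_name("STTS"));
    aml_append(pkg, aml_name("SLSA"));
    aml_append(pkg, aml_name("MAXT"));

    /* Store the package into Local1 and return it; no global name is created. */
    aml_append(method, aml_store(pkg, aml_local(1)));
    aml_append(method, aml_return(aml_local(1)));

    aml_append(nvdimm_dev, method);
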
> 
> > +aml_append(method, aml_return(aml_name("RET")));
> > +
> > +aml_append(nvdimm_dev, method);
> > +
> > +/* _LSR */
> > +method = aml_method("_LSR", 2, AML_SERIALIZED);
> > +aml_append(method, aml_name_decl("INPT", aml_buffer(8,
> > NULL)));
> > +
> > +aml_append(method,
> > aml_create_dword_field(aml_name("INPT"),
> > +  aml_int(0),
> > "OFST"));
> > +aml_append(method,
> > aml_create_dword_field(aml_name("INPT"),
> > +  aml_int(4),
> > "LEN"));
> > +aml_append(method, aml_store(aml_arg(0),
> > aml_name("OFST")));
> > +aml_append(method, aml_store(aml_arg(1),
> > aml_name("LEN")));
> > +
> > +pkg = aml_package(1);
> > +aml_append(pkg, aml_name("INPT"));
> > +aml_append(method, aml_name_decl("PKG1", pkg));
> > +
> > +com_call = aml_call5(NVDIMM_COMMON_DSM,
> > +aml_touuid(NVDIMM_DEVICE_DSM_UUID),
> > +aml_int(1), aml_int(5),
> > aml_name("PKG1"),
> > +aml_int(handle));
> > +aml_append(method, aml_store(com_call, aml_local(3)));
> > +field = aml_create_dword_field(aml_local(3), aml_int(0),
> > "STTS");
> > +aml_append(method, field);
> > +field = aml_create_field(aml_local(3), aml_int(32),
> > + aml_shiftleft(aml_name("LEN"),
> > aml_int(3)),
> > + "LDAT");
> > +aml_append(method, field);
> > +aml_append(method, am

[PATCH v2 5/8] target/riscv: debug: Introduce tinfo CSR

2022-09-09 Thread Bin Meng
From: Frank Chang 

tinfo.info:
  One bit for each possible type enumerated in tdata1.
  If the bit is set, then that type is supported by the currently
  selected trigger.
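
As a concrete example of the encoding (matching the implementation below,
which only advertises the type 2 address/data match trigger):

    /* Standalone illustration of the tinfo encoding. */
    #include <stdint.h>
    #include <stdio.h>

    enum { TRIGGER_TYPE_AD_MATCH = 2 };   /* tdata1.type value for mcontrol */

    int main(void)
    {
        /* One bit per supported tdata1.type: only type 2 here, so bit 2. */
        uint32_t tinfo = 1u << TRIGGER_TYPE_AD_MATCH;

        printf("tinfo = 0x%x\n", tinfo);  /* prints: tinfo = 0x4 */
        return 0;
    }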

Signed-off-by: Frank Chang 
Reviewed-by: Bin Meng 
Signed-off-by: Bin Meng 
---

(no changes since v1)

 target/riscv/cpu_bits.h |  1 +
 target/riscv/debug.h|  2 ++
 target/riscv/csr.c  |  8 
 target/riscv/debug.c| 10 +++---
 4 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
index 7be12cac2e..1972aee3bb 100644
--- a/target/riscv/cpu_bits.h
+++ b/target/riscv/cpu_bits.h
@@ -321,6 +321,7 @@
 #define CSR_TDATA1  0x7a1
 #define CSR_TDATA2  0x7a2
 #define CSR_TDATA3  0x7a3
+#define CSR_TINFO   0x7a4
 
 /* Debug Mode Registers */
 #define CSR_DCSR0x7b0
diff --git a/target/riscv/debug.h b/target/riscv/debug.h
index 76146f373a..9f69c64591 100644
--- a/target/riscv/debug.h
+++ b/target/riscv/debug.h
@@ -95,6 +95,8 @@ void tselect_csr_write(CPURISCVState *env, target_ulong val);
 target_ulong tdata_csr_read(CPURISCVState *env, int tdata_index);
 void tdata_csr_write(CPURISCVState *env, int tdata_index, target_ulong val);
 
+target_ulong tinfo_csr_read(CPURISCVState *env);
+
 void riscv_cpu_debug_excp_handler(CPUState *cs);
 bool riscv_cpu_debug_check_breakpoint(CPUState *cs);
 bool riscv_cpu_debug_check_watchpoint(CPUState *cs, CPUWatchpoint *wp);
diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index 3d0d8e0340..e66019048d 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -3089,6 +3089,13 @@ static RISCVException write_tdata(CPURISCVState *env, 
int csrno,
 return RISCV_EXCP_NONE;
 }
 
+static RISCVException read_tinfo(CPURISCVState *env, int csrno,
+ target_ulong *val)
+{
+*val = tinfo_csr_read(env);
+return RISCV_EXCP_NONE;
+}
+
 /*
  * Functions to access Pointer Masking feature registers
  * We have to check if current priv lvl could modify
@@ -3893,6 +3900,7 @@ riscv_csr_operations csr_ops[CSR_TABLE_SIZE] = {
 [CSR_TDATA1]=  { "tdata1",  debug, read_tdata,   write_tdata   },
 [CSR_TDATA2]=  { "tdata2",  debug, read_tdata,   write_tdata   },
 [CSR_TDATA3]=  { "tdata3",  debug, read_tdata,   write_tdata   },
+[CSR_TINFO] =  { "tinfo",   debug, read_tinfo,   write_ignore  },
 
 /* User Pointer Masking */
 [CSR_UMTE]={ "umte",pointer_masking, read_umte,  write_umte },
diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index d164cd..7d546ace42 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -37,9 +37,7 @@
  * - tdata1
  * - tdata2
  * - tdata3
- *
- * We don't support writable 'type' field in the tdata1 register, so there is
- * no need to implement the "tinfo" CSR.
+ * - tinfo
  *
  * The following triggers are implemented:
  *
@@ -372,6 +370,12 @@ void tdata_csr_write(CPURISCVState *env, int tdata_index, 
target_ulong val)
 }
 }
 
+target_ulong tinfo_csr_read(CPURISCVState *env)
+{
+/* assume all triggers support the same types of triggers */
+return BIT(TRIGGER_TYPE_AD_MATCH);
+}
+
 void riscv_cpu_debug_excp_handler(CPUState *cs)
 {
 RISCVCPU *cpu = RISCV_CPU(cs);
-- 
2.34.1




Re: [PATCH v3] audio: add help option for -audio and -audiodev

2022-09-09 Thread Daniel P . Berrangé
On Fri, Sep 09, 2022 at 03:41:22PM +0200, Claudio Fontana wrote:
> On 9/9/22 00:05, Paolo Bonzini wrote:
> > Il gio 8 set 2022, 15:47 Claudio Fontana  ha scritto:
> > 
> >> On 9/8/22 11:39, Paolo Bonzini wrote:
> >>> Queued, thanks.
> >>>
> >>> Paolo
> >>>
> >>>
> >>
> >> Thanks. When it comes to programmatic checks about what QEMU supports in
> >> terms of audio,
> >>
> >> is there something that can be done with QMP?
> >>
> >> I checked the QMP manual at:
> >>
> >>
> >> https://qemu.readthedocs.io/en/latest/interop/qemu-qmp-ref.html#qapidoc-2948
> >>
> >> but in the "Audio" section there is a bunch of Objects and enums defined,
> >> but no command to query them...
> >>
> > No, there's nothing yet.

You're now reminding me of the patch I sent a while ago for reporting
audiodev backends and then completely forgot to followup on

  https://mail.gnu.org/archive/html/qemu-devel/2021-03/msg00656.html


> Interesting. What about Display (ie ui-*) ? I mean how do I figure out from, 
> say, libvirt,
> everything that QEMU can do in terms of display, which drivers are actually 
> installed?
> 
> Same for block...
> 
> with the increasing modularization of QEMU we should I presume strengthen the 
> discoverability of QEMU capabilities right?
> This way we can configure once, and install just what is needed to match the 
> user requirements, or distro variant.
> 
> As Markus mentioned maybe a more general solution would be to have these 
> things as qom objects so that a
> 
> qom-list-types
> can be used to get all 'audiodev' types, or all 'display' types, or all 
> 'block' types and solve the problem this way?

> Is there a more general problem / solution that I am not seeing?

In an idealized world (where we can ignore the reality of our
existing legacy codebase) I think all backends would simply
be QOM objects, and created with -object, avoiding the need for
any backend type specific CLI args like -audiodev / -netdev / etc.

This would actually also extend to frontends, devices, CPUs,
machine types, etc., all being objects that ought to be creatable via
-object, not requiring -device or -machine.

If we lived in the world where everything was a QOM Object,
then qom-list-types would serve as the universal detection
mechanism for everything.

Back in our current reality of pre-existing legacy code though,
we have to be a little more pragmatic. If we can make things
into QOM objects that work with -object / qom-list-types that's
great, but if it is too much work we'll just have to create other
QMP commands for querying, such as the query-audiodev patch.


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




[PATCH v2 3/8] target/riscv: debug: Introduce tdata1, tdata2, and tdata3 CSRs

2022-09-09 Thread Bin Meng
From: Frank Chang 

Replace type2_trigger_t with the real tdata1, tdata2, and tdata3 CSRs,
which allows us to support more types of triggers in the future.

Signed-off-by: Frank Chang 
Reviewed-by: Bin Meng 
Signed-off-by: Bin Meng 
---

(no changes since v1)

 target/riscv/cpu.h |   6 ++-
 target/riscv/debug.h   |   7 ---
 target/riscv/debug.c   | 103 +++--
 target/riscv/machine.c |  20 ++--
 4 files changed, 48 insertions(+), 88 deletions(-)

diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 4d82a3250b..b0b05c19ca 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -324,7 +324,11 @@ struct CPUArchState {
 
 /* trigger module */
 target_ulong trigger_cur;
-type2_trigger_t type2_trig[RV_MAX_TRIGGERS];
+target_ulong tdata1[RV_MAX_TRIGGERS];
+target_ulong tdata2[RV_MAX_TRIGGERS];
+target_ulong tdata3[RV_MAX_TRIGGERS];
+struct CPUBreakpoint *cpu_breakpoint[RV_MAX_TRIGGERS];
+struct CPUWatchpoint *cpu_watchpoint[RV_MAX_TRIGGERS];
 
 /* machine specific rdtime callback */
 uint64_t (*rdtime_fn)(void *);
diff --git a/target/riscv/debug.h b/target/riscv/debug.h
index c422553c27..76146f373a 100644
--- a/target/riscv/debug.h
+++ b/target/riscv/debug.h
@@ -44,13 +44,6 @@ typedef enum {
 TRIGGER_TYPE_NUM
 } trigger_type_t;
 
-typedef struct {
-target_ulong mcontrol;
-target_ulong maddress;
-struct CPUBreakpoint *bp;
-struct CPUWatchpoint *wp;
-} type2_trigger_t;
-
 /* tdata1 field masks */
 
 #define RV32_TYPE(t)((uint32_t)(t) << 28)
diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index 45aae87ec3..06feef7d67 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -91,8 +91,7 @@ static inline target_ulong extract_trigger_type(CPURISCVState 
*env,
 static inline target_ulong get_trigger_type(CPURISCVState *env,
 target_ulong trigger_index)
 {
-target_ulong tdata1 = env->type2_trig[trigger_index].mcontrol;
-return extract_trigger_type(env, tdata1);
+return extract_trigger_type(env, env->tdata1[trigger_index]);
 }
 
 static inline target_ulong build_tdata1(CPURISCVState *env,
@@ -188,6 +187,8 @@ static inline void warn_always_zero_bit(target_ulong val, 
target_ulong mask,
 }
 }
 
+/* type 2 trigger */
+
 static uint32_t type2_breakpoint_size(CPURISCVState *env, target_ulong ctrl)
 {
 uint32_t size, sizelo, sizehi = 0;
@@ -247,8 +248,8 @@ static target_ulong type2_mcontrol_validate(CPURISCVState 
*env,
 
 static void type2_breakpoint_insert(CPURISCVState *env, target_ulong index)
 {
-target_ulong ctrl = env->type2_trig[index].mcontrol;
-target_ulong addr = env->type2_trig[index].maddress;
+target_ulong ctrl = env->tdata1[index];
+target_ulong addr = env->tdata2[index];
 bool enabled = type2_breakpoint_enabled(ctrl);
 CPUState *cs = env_cpu(env);
 int flags = BP_CPU | BP_STOP_BEFORE_ACCESS;
@@ -259,7 +260,7 @@ static void type2_breakpoint_insert(CPURISCVState *env, 
target_ulong index)
 }
 
 if (ctrl & TYPE2_EXEC) {
-cpu_breakpoint_insert(cs, addr, flags, &env->type2_trig[index].bp);
+cpu_breakpoint_insert(cs, addr, flags, &env->cpu_breakpoint[index]);
 }
 
 if (ctrl & TYPE2_LOAD) {
@@ -273,10 +274,10 @@ static void type2_breakpoint_insert(CPURISCVState *env, 
target_ulong index)
 size = type2_breakpoint_size(env, ctrl);
 if (size != 0) {
 cpu_watchpoint_insert(cs, addr, size, flags,
-  &env->type2_trig[index].wp);
+  &env->cpu_watchpoint[index]);
 } else {
 cpu_watchpoint_insert(cs, addr, 8, flags,
-  &env->type2_trig[index].wp);
+  &env->cpu_watchpoint[index]);
 }
 }
 }
@@ -285,36 +286,17 @@ static void type2_breakpoint_remove(CPURISCVState *env, 
target_ulong index)
 {
 CPUState *cs = env_cpu(env);
 
-if (env->type2_trig[index].bp) {
-cpu_breakpoint_remove_by_ref(cs, env->type2_trig[index].bp);
-env->type2_trig[index].bp = NULL;
+if (env->cpu_breakpoint[index]) {
+cpu_breakpoint_remove_by_ref(cs, env->cpu_breakpoint[index]);
+env->cpu_breakpoint[index] = NULL;
 }
 
-if (env->type2_trig[index].wp) {
-cpu_watchpoint_remove_by_ref(cs, env->type2_trig[index].wp);
-env->type2_trig[index].wp = NULL;
+if (env->cpu_watchpoint[index]) {
+cpu_watchpoint_remove_by_ref(cs, env->cpu_watchpoint[index]);
+env->cpu_watchpoint[index] = NULL;
 }
 }
 
-static target_ulong type2_reg_read(CPURISCVState *env,
-   target_ulong index, int tdata_index)
-{
-target_ulong tdata;
-
-switch (tdata_index) {
-case TDATA1:
-tdata = env->type2_trig[index].mcontrol;
-break;
-case TDATA2:
-tdata = env->type2_trig[index].maddress;
-break;
- 

[PATCH v2 4/8] target/riscv: debug: Restrict the range of tselect value can be written

2022-09-09 Thread Bin Meng
From: Frank Chang 

The value that can be written to the tselect CSR should be limited to the
range of supported trigger numbers.

Signed-off-by: Frank Chang 
Reviewed-by: Bin Meng 
Signed-off-by: Bin Meng 
---

(no changes since v1)

 target/riscv/debug.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index 06feef7d67..d164cd 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -127,10 +127,6 @@ bool tdata_available(CPURISCVState *env, int tdata_index)
 return false;
 }
 
-if (unlikely(env->trigger_cur >= RV_MAX_TRIGGERS)) {
-return false;
-}
-
 return tdata_mapping[trigger_type][tdata_index];
 }
 
@@ -141,8 +137,9 @@ target_ulong tselect_csr_read(CPURISCVState *env)
 
 void tselect_csr_write(CPURISCVState *env, target_ulong val)
 {
-/* all target_ulong bits of tselect are implemented */
-env->trigger_cur = val;
+if (val < RV_MAX_TRIGGERS) {
+env->trigger_cur = val;
+}
 }
 
 static target_ulong tdata1_validate(CPURISCVState *env, target_ulong val,
-- 
2.34.1




[PATCH v2 8/8] target/riscv: debug: Add initial support of type 6 trigger

2022-09-09 Thread Bin Meng
From: Frank Chang 

Type 6 trigger is similar to a type 2 trigger, but provides additional
functionality and should be used instead of type 2 in newer
implementations.

Signed-off-by: Frank Chang 
Reviewed-by: Bin Meng 
Signed-off-by: Bin Meng 

---

(no changes since v1)

 target/riscv/debug.h |  18 +
 target/riscv/debug.c | 174 ++-
 2 files changed, 188 insertions(+), 4 deletions(-)

diff --git a/target/riscv/debug.h b/target/riscv/debug.h
index 0e4859cf74..a1226b4d29 100644
--- a/target/riscv/debug.h
+++ b/target/riscv/debug.h
@@ -85,6 +85,24 @@ typedef enum {
 #define TYPE2_HIT   BIT(20)
 #define TYPE2_SIZEHI(0x3 << 21) /* RV64 only */
 
+/* mcontrol6 field masks */
+
+#define TYPE6_LOAD  BIT(0)
+#define TYPE6_STORE BIT(1)
+#define TYPE6_EXEC  BIT(2)
+#define TYPE6_U BIT(3)
+#define TYPE6_S BIT(4)
+#define TYPE6_M BIT(6)
+#define TYPE6_MATCH (0xf << 7)
+#define TYPE6_CHAIN BIT(11)
+#define TYPE6_ACTION(0xf << 12)
+#define TYPE6_SIZE  (0xf << 16)
+#define TYPE6_TIMINGBIT(20)
+#define TYPE6_SELECTBIT(21)
+#define TYPE6_HIT   BIT(22)
+#define TYPE6_VUBIT(23)
+#define TYPE6_VSBIT(24)
+
 /* access size */
 enum {
 SIZE_ANY = 0,
diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index e16d5c070a..26ea764407 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -39,7 +39,7 @@
  * - tdata3
  * - tinfo
  *
- * The following triggers are implemented:
+ * The following triggers are initialized by default:
  *
  * Index | Type |  tdata mapping | Description
  * --+--++
@@ -103,10 +103,12 @@ static trigger_action_t get_trigger_action(CPURISCVState 
*env,
 case TRIGGER_TYPE_AD_MATCH:
 action = (tdata1 & TYPE2_ACTION) >> 12;
 break;
+case TRIGGER_TYPE_AD_MATCH6:
+action = (tdata1 & TYPE6_ACTION) >> 12;
+break;
 case TRIGGER_TYPE_INST_CNT:
 case TRIGGER_TYPE_INT:
 case TRIGGER_TYPE_EXCP:
-case TRIGGER_TYPE_AD_MATCH6:
 case TRIGGER_TYPE_EXT_SRC:
 qemu_log_mask(LOG_UNIMP, "trigger type: %d is not supported\n",
   trigger_type);
@@ -379,6 +381,123 @@ static void type2_reg_write(CPURISCVState *env, 
target_ulong index,
 return;
 }
 
+/* type 6 trigger */
+
+static inline bool type6_breakpoint_enabled(target_ulong ctrl)
+{
+bool mode = !!(ctrl & (TYPE6_VU | TYPE6_VS | TYPE6_U | TYPE6_S | TYPE6_M));
+bool rwx = !!(ctrl & (TYPE6_LOAD | TYPE6_STORE | TYPE6_EXEC));
+
+return mode && rwx;
+}
+
+static target_ulong type6_mcontrol6_validate(CPURISCVState *env,
+ target_ulong ctrl)
+{
+target_ulong val;
+uint32_t size;
+
+/* validate the generic part first */
+val = tdata1_validate(env, ctrl, TRIGGER_TYPE_AD_MATCH6);
+
+/* validate unimplemented (always zero) bits */
+warn_always_zero_bit(ctrl, TYPE6_MATCH, "match");
+warn_always_zero_bit(ctrl, TYPE6_CHAIN, "chain");
+warn_always_zero_bit(ctrl, TYPE6_ACTION, "action");
+warn_always_zero_bit(ctrl, TYPE6_TIMING, "timing");
+warn_always_zero_bit(ctrl, TYPE6_SELECT, "select");
+warn_always_zero_bit(ctrl, TYPE6_HIT, "hit");
+
+/* validate size encoding */
+size = extract32(ctrl, 16, 4);
+if (access_size[size] == -1) {
+qemu_log_mask(LOG_UNIMP, "access size %d is not supported, using 
SIZE_ANY\n",
+  size);
+} else {
+val |= (ctrl & TYPE6_SIZE);
+}
+
+/* keep the mode and attribute bits */
+val |= (ctrl & (TYPE6_VU | TYPE6_VS | TYPE6_U | TYPE6_S | TYPE6_M |
+TYPE6_LOAD | TYPE6_STORE | TYPE6_EXEC));
+
+return val;
+}
+
+static void type6_breakpoint_insert(CPURISCVState *env, target_ulong index)
+{
+target_ulong ctrl = env->tdata1[index];
+target_ulong addr = env->tdata2[index];
+bool enabled = type6_breakpoint_enabled(ctrl);
+CPUState *cs = env_cpu(env);
+int flags = BP_CPU | BP_STOP_BEFORE_ACCESS;
+uint32_t size;
+
+if (!enabled) {
+return;
+}
+
+if (ctrl & TYPE6_EXEC) {
+cpu_breakpoint_insert(cs, addr, flags, &env->cpu_breakpoint[index]);
+}
+
+if (ctrl & TYPE6_LOAD) {
+flags |= BP_MEM_READ;
+}
+
+if (ctrl & TYPE6_STORE) {
+flags |= BP_MEM_WRITE;
+}
+
+if (flags & BP_MEM_ACCESS) {
+size = extract32(ctrl, 16, 4);
+if (size != 0) {
+cpu_watchpoint_insert(cs, addr, size, flags,
+  &env->cpu_watchpoint[index]);
+} else {
+cpu_watchpoint_insert(cs, addr, 8, flags,
+  &env->cpu_watchpoint[index]);
+}
+}
+}
+
+static void type6_breakpoint_remove(CPURISCVState *env, target_ulong index)
+{
+type2_breakpoint_remove(env, index);
+}
+
+static void type6_reg_write(CPURISCVState *env, target_ulong index,
+   

[PATCH v2 0/8] target/riscv: Improve RISC-V Debug support

2022-09-09 Thread Bin Meng
This patchset refactors RISC-V Debug support so that it can be extended
with more types of triggers.

The initial support of type 6 trigger, which is similar to type 2
trigger with additional functionality, is also introduced in this
patchset.

This is a v2 respin of the previous patches originally done by Frank Chang
at SiFive. I've incorporated my review comments in v2 and rebased
against QEMU mainline.

Changes in v2:
- fixed MXL_RV128 case
- moved macros to patch#2
- added log guest errors for TRIGGER_TYPE_{NO_EXIST,UNAVAIL}
- moved RV{32,64}_DATA_MASK definition to this patch
- add handling of the DBG_ACTION_NONE case in do_trigger_action()
- drop patch: "target/riscv: debug: Return 0 if previous value written to 
tselect >= number of triggers"

Frank Chang (8):
  target/riscv: debug: Determine the trigger type from tdata1.type
  target/riscv: debug: Introduce build_tdata1() to build tdata1 register
content
  target/riscv: debug: Introduce tdata1, tdata2, and tdata3 CSRs
  target/riscv: debug: Restrict the range of tselect value can be
written
  target/riscv: debug: Introduce tinfo CSR
  target/riscv: debug: Create common trigger actions function
  target/riscv: debug: Check VU/VS modes for type 2 trigger
  target/riscv: debug: Add initial support of type 6 trigger

 target/riscv/cpu.h  |   6 +-
 target/riscv/cpu_bits.h |   1 +
 target/riscv/debug.h|  55 +++--
 target/riscv/csr.c  |  10 +-
 target/riscv/debug.c| 484 
 target/riscv/machine.c  |  20 +-
 6 files changed, 445 insertions(+), 131 deletions(-)

-- 
2.34.1




[PATCH v2 2/8] target/riscv: debug: Introduce build_tdata1() to build tdata1 register content

2022-09-09 Thread Bin Meng
From: Frank Chang 

Introduce build_tdata1() to build tdata1 register content, which can be
shared among all types of triggers.
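
For example, with the RV64 masks used by build_tdata1(), the initial type 2
trigger value (dmode clear, no extra data bits) is just the type field placed
in bits 63:60:

    /* Standalone illustration of the RV64 tdata1 layout. */
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define RV64_TYPE(t)   ((uint64_t)(t) << 60)

    int main(void)
    {
        /* build_tdata1(env, TRIGGER_TYPE_AD_MATCH, false, 0) on RV64 */
        uint64_t tdata1 = RV64_TYPE(2);   /* type 2, dmode = 0, data = 0 */

        printf("tdata1 = 0x%016" PRIx64 "\n", tdata1);  /* 0x2000000000000000 */
        return 0;
    }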

Signed-off-by: Frank Chang 
Reviewed-by: Bin Meng 
[bmeng: moved RV{32,64}_DATA_MASK definition to this patch]
Signed-off-by: Bin Meng 

---

Changes in v2:
- moved RV{32,64}_DATA_MASK definition to this patch

 target/riscv/debug.h |  2 ++
 target/riscv/debug.c | 15 ++-
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/target/riscv/debug.h b/target/riscv/debug.h
index 72e4edcd8c..c422553c27 100644
--- a/target/riscv/debug.h
+++ b/target/riscv/debug.h
@@ -56,9 +56,11 @@ typedef struct {
 #define RV32_TYPE(t)((uint32_t)(t) << 28)
 #define RV32_TYPE_MASK  (0xf << 28)
 #define RV32_DMODE  BIT(27)
+#define RV32_DATA_MASK  0x7ff
 #define RV64_TYPE(t)((uint64_t)(t) << 60)
 #define RV64_TYPE_MASK  (0xfULL << 60)
 #define RV64_DMODE  BIT_ULL(59)
+#define RV64_DATA_MASK  0x7ff
 
 /* mcontrol field masks */
 
diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index 9dd468753a..45aae87ec3 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -95,18 +95,23 @@ static inline target_ulong get_trigger_type(CPURISCVState 
*env,
 return extract_trigger_type(env, tdata1);
 }
 
-static inline target_ulong trigger_type(CPURISCVState *env,
-trigger_type_t type)
+static inline target_ulong build_tdata1(CPURISCVState *env,
+trigger_type_t type,
+bool dmode, target_ulong data)
 {
 target_ulong tdata1;
 
 switch (riscv_cpu_mxl(env)) {
 case MXL_RV32:
-tdata1 = RV32_TYPE(type);
+tdata1 = RV32_TYPE(type) |
+ (dmode ? RV32_DMODE : 0) |
+ (data & RV32_DATA_MASK);
 break;
 case MXL_RV64:
 case MXL_RV128:
-tdata1 = RV64_TYPE(type);
+tdata1 = RV64_TYPE(type) |
+ (dmode ? RV64_DMODE : 0) |
+ (data & RV64_DATA_MASK);
 break;
 default:
 g_assert_not_reached();
@@ -495,7 +500,7 @@ bool riscv_cpu_debug_check_watchpoint(CPUState *cs, 
CPUWatchpoint *wp)
 
 void riscv_trigger_init(CPURISCVState *env)
 {
-target_ulong tdata1 = trigger_type(env, TRIGGER_TYPE_AD_MATCH);
+target_ulong tdata1 = build_tdata1(env, TRIGGER_TYPE_AD_MATCH, 0, 0);
 int i;
 
 /* init to type 2 triggers */
-- 
2.34.1




[PATCH v2 6/8] target/riscv: debug: Create common trigger actions function

2022-09-09 Thread Bin Meng
From: Frank Chang 

Trigger actions are shared among all triggers. Extract to a common
function.

Signed-off-by: Frank Chang 
Reviewed-by: Bin Meng 
[bmeng: handle the DBG_ACTION_NONE case]
Signed-off-by: Bin Meng 

---

Changes in v2:
- add handling of the DBG_ACTION_NONE case in do_trigger_action()

 target/riscv/debug.h | 13 ++
 target/riscv/debug.c | 59 ++--
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/target/riscv/debug.h b/target/riscv/debug.h
index 9f69c64591..0e4859cf74 100644
--- a/target/riscv/debug.h
+++ b/target/riscv/debug.h
@@ -44,6 +44,19 @@ typedef enum {
 TRIGGER_TYPE_NUM
 } trigger_type_t;
 
+/* actions */
+typedef enum {
+DBG_ACTION_NONE = -1,   /* sentinel value */
+DBG_ACTION_BP = 0,
+DBG_ACTION_DBG_MODE,
+DBG_ACTION_TRACE0,
+DBG_ACTION_TRACE1,
+DBG_ACTION_TRACE2,
+DBG_ACTION_TRACE3,
+DBG_ACTION_EXT_DBG0 = 8,
+DBG_ACTION_EXT_DBG1
+} trigger_action_t;
+
 /* tdata1 field masks */
 
 #define RV32_TYPE(t)((uint32_t)(t) << 28)
diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index 7d546ace42..7a8910f980 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -92,6 +92,37 @@ static inline target_ulong get_trigger_type(CPURISCVState 
*env,
 return extract_trigger_type(env, env->tdata1[trigger_index]);
 }
 
+static trigger_action_t get_trigger_action(CPURISCVState *env,
+   target_ulong trigger_index)
+{
+target_ulong tdata1 = env->tdata1[trigger_index];
+int trigger_type = get_trigger_type(env, trigger_index);
+trigger_action_t action = DBG_ACTION_NONE;
+
+switch (trigger_type) {
+case TRIGGER_TYPE_AD_MATCH:
+action = (tdata1 & TYPE2_ACTION) >> 12;
+break;
+case TRIGGER_TYPE_INST_CNT:
+case TRIGGER_TYPE_INT:
+case TRIGGER_TYPE_EXCP:
+case TRIGGER_TYPE_AD_MATCH6:
+case TRIGGER_TYPE_EXT_SRC:
+qemu_log_mask(LOG_UNIMP, "trigger type: %d is not supported\n",
+  trigger_type);
+break;
+case TRIGGER_TYPE_NO_EXIST:
+case TRIGGER_TYPE_UNAVAIL:
+qemu_log_mask(LOG_GUEST_ERROR, "trigger type: %d does not exit\n",
+  trigger_type);
+break;
+default:
+g_assert_not_reached();
+}
+
+return action;
+}
+
 static inline target_ulong build_tdata1(CPURISCVState *env,
 trigger_type_t type,
 bool dmode, target_ulong data)
@@ -182,6 +213,30 @@ static inline void warn_always_zero_bit(target_ulong val, 
target_ulong mask,
 }
 }
 
+static void do_trigger_action(CPURISCVState *env, target_ulong trigger_index)
+{
+trigger_action_t action = get_trigger_action(env, trigger_index);
+
+switch (action) {
+case DBG_ACTION_NONE:
+break;
+case DBG_ACTION_BP:
+riscv_raise_exception(env, RISCV_EXCP_BREAKPOINT, 0);
+break;
+case DBG_ACTION_DBG_MODE:
+case DBG_ACTION_TRACE0:
+case DBG_ACTION_TRACE1:
+case DBG_ACTION_TRACE2:
+case DBG_ACTION_TRACE3:
+case DBG_ACTION_EXT_DBG0:
+case DBG_ACTION_EXT_DBG1:
+qemu_log_mask(LOG_UNIMP, "action: %d is not supported\n", action);
+break;
+default:
+g_assert_not_reached();
+}
+}
+
 /* type 2 trigger */
 
 static uint32_t type2_breakpoint_size(CPURISCVState *env, target_ulong ctrl)
@@ -384,11 +439,11 @@ void riscv_cpu_debug_excp_handler(CPUState *cs)
 if (cs->watchpoint_hit) {
 if (cs->watchpoint_hit->flags & BP_CPU) {
 cs->watchpoint_hit = NULL;
-riscv_raise_exception(env, RISCV_EXCP_BREAKPOINT, 0);
+do_trigger_action(env, DBG_ACTION_BP);
 }
 } else {
 if (cpu_breakpoint_test(cs, env->pc, BP_CPU)) {
-riscv_raise_exception(env, RISCV_EXCP_BREAKPOINT, 0);
+do_trigger_action(env, DBG_ACTION_BP);
 }
 }
 }
-- 
2.34.1




[PATCH v2 1/8] target/riscv: debug: Determine the trigger type from tdata1.type

2022-09-09 Thread Bin Meng
From: Frank Chang 

The current RISC-V debug code assumes that only the type 2 trigger is
supported. To allow more types of triggers to be supported in the future
(e.g. the type 6 trigger, which is similar to the type 2 trigger with
additional functionality), we should determine the trigger type from
tdata1.type.

RV_MAX_TRIGGERS is also introduced in replacement of TRIGGER_TYPE2_NUM.

Signed-off-by: Frank Chang 
Reviewed-by: Bin Meng 
[bmeng: fixed MXL_RV128 case, and moved macros to the following patch]
Signed-off-by: Bin Meng 

---

Changes in v2:
- fixed MXL_RV128 case
- moved macros to patch#2
- added log guest errors for TRIGGER_TYPE_{NO_EXIST,UNAVAIL}

 target/riscv/cpu.h |   2 +-
 target/riscv/debug.h   |  13 +--
 target/riscv/csr.c |   2 +-
 target/riscv/debug.c   | 188 +
 target/riscv/machine.c |   2 +-
 5 files changed, 140 insertions(+), 67 deletions(-)

diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 06751e1e3e..4d82a3250b 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -324,7 +324,7 @@ struct CPUArchState {
 
 /* trigger module */
 target_ulong trigger_cur;
-type2_trigger_t type2_trig[TRIGGER_TYPE2_NUM];
+type2_trigger_t type2_trig[RV_MAX_TRIGGERS];
 
 /* machine specific rdtime callback */
 uint64_t (*rdtime_fn)(void *);
diff --git a/target/riscv/debug.h b/target/riscv/debug.h
index 27b9cac6b4..72e4edcd8c 100644
--- a/target/riscv/debug.h
+++ b/target/riscv/debug.h
@@ -22,13 +22,7 @@
 #ifndef RISCV_DEBUG_H
 #define RISCV_DEBUG_H
 
-/* trigger indexes implemented */
-enum {
-TRIGGER_TYPE2_IDX_0 = 0,
-TRIGGER_TYPE2_IDX_1,
-TRIGGER_TYPE2_NUM,
-TRIGGER_NUM = TRIGGER_TYPE2_NUM
-};
+#define RV_MAX_TRIGGERS 2
 
 /* register index of tdata CSRs */
 enum {
@@ -46,7 +40,8 @@ typedef enum {
 TRIGGER_TYPE_EXCP = 5,  /* exception trigger */
 TRIGGER_TYPE_AD_MATCH6 = 6, /* new address/data match trigger */
 TRIGGER_TYPE_EXT_SRC = 7,   /* external source trigger */
-TRIGGER_TYPE_UNAVAIL = 15   /* trigger exists, but unavailable */
+TRIGGER_TYPE_UNAVAIL = 15,  /* trigger exists, but unavailable */
+TRIGGER_TYPE_NUM
 } trigger_type_t;
 
 typedef struct {
@@ -56,7 +51,7 @@ typedef struct {
 struct CPUWatchpoint *wp;
 } type2_trigger_t;
 
-/* tdata field masks */
+/* tdata1 field masks */
 
 #define RV32_TYPE(t)((uint32_t)(t) << 28)
 #define RV32_TYPE_MASK  (0xf << 28)
diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index b96db1b62b..3d0d8e0340 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -3065,7 +3065,7 @@ static RISCVException read_tdata(CPURISCVState *env, int 
csrno,
  target_ulong *val)
 {
 /* return 0 in tdata1 to end the trigger enumeration */
-if (env->trigger_cur >= TRIGGER_NUM && csrno == CSR_TDATA1) {
+if (env->trigger_cur >= RV_MAX_TRIGGERS && csrno == CSR_TDATA1) {
 *val = 0;
 return RISCV_EXCP_NONE;
 }
diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index fc6e13222f..9dd468753a 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -52,8 +52,15 @@
 /* tdata availability of a trigger */
 typedef bool tdata_avail[TDATA_NUM];
 
-static tdata_avail tdata_mapping[TRIGGER_NUM] = {
-[TRIGGER_TYPE2_IDX_0 ... TRIGGER_TYPE2_IDX_1] = { true, true, false },
+static tdata_avail tdata_mapping[TRIGGER_TYPE_NUM] = {
+[TRIGGER_TYPE_NO_EXIST] = { false, false, false },
+[TRIGGER_TYPE_AD_MATCH] = { true, true, true },
+[TRIGGER_TYPE_INST_CNT] = { true, false, true },
+[TRIGGER_TYPE_INT] = { true, true, true },
+[TRIGGER_TYPE_EXCP] = { true, true, true },
+[TRIGGER_TYPE_AD_MATCH6] = { true, true, true },
+[TRIGGER_TYPE_EXT_SRC] = { true, false, false },
+[TRIGGER_TYPE_UNAVAIL] = { true, true, true }
 };
 
 /* only breakpoint size 1/2/4/8 supported */
@@ -67,6 +74,27 @@ static int access_size[SIZE_NUM] = {
 [6 ... 15] = -1,
 };
 
+static inline target_ulong extract_trigger_type(CPURISCVState *env,
+target_ulong tdata1)
+{
+switch (riscv_cpu_mxl(env)) {
+case MXL_RV32:
+return extract32(tdata1, 28, 4);
+case MXL_RV64:
+case MXL_RV128:
+return extract64(tdata1, 60, 4);
+default:
+g_assert_not_reached();
+}
+}
+
+static inline target_ulong get_trigger_type(CPURISCVState *env,
+target_ulong trigger_index)
+{
+target_ulong tdata1 = env->type2_trig[trigger_index].mcontrol;
+return extract_trigger_type(env, tdata1);
+}
+
 static inline target_ulong trigger_type(CPURISCVState *env,
 trigger_type_t type)
 {
@@ -89,15 +117,17 @@ static inline target_ulong trigger_type(CPURISCVState *env,
 
 bool tdata_available(CPURISCVState *env, int tdata_index)
 {
+int trigger_type = get_trigger_type(env, env->trigger_cur);
+
 if (unlikely(tdata_index >= TD

[PATCH v2 7/8] target/riscv: debug: Check VU/VS modes for type 2 trigger

2022-09-09 Thread Bin Meng
From: Frank Chang 

Type 2 trigger cannot be fired in VU/VS modes.

Signed-off-by: Frank Chang 
Reviewed-by: Bin Meng 
Signed-off-by: Bin Meng 
---

(no changes since v1)

 target/riscv/debug.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/target/riscv/debug.c b/target/riscv/debug.c
index 7a8910f980..e16d5c070a 100644
--- a/target/riscv/debug.c
+++ b/target/riscv/debug.c
@@ -464,6 +464,11 @@ bool riscv_cpu_debug_check_breakpoint(CPUState *cs)
 
 switch (trigger_type) {
 case TRIGGER_TYPE_AD_MATCH:
+/* type 2 trigger cannot be fired in VU/VS mode */
+if (riscv_cpu_virt_enabled(env)) {
+return false;
+}
+
 ctrl = env->tdata1[i];
 pc = env->tdata2[i];
 
@@ -499,6 +504,11 @@ bool riscv_cpu_debug_check_watchpoint(CPUState *cs, 
CPUWatchpoint *wp)
 
 switch (trigger_type) {
 case TRIGGER_TYPE_AD_MATCH:
+/* type 2 trigger cannot be fired in VU/VS mode */
+if (riscv_cpu_virt_enabled(env)) {
+return false;
+}
+
 ctrl = env->tdata1[i];
 addr = env->tdata2[i];
 flags = 0;
-- 
2.34.1




Re: [PATCH v3] audio: add help option for -audio and -audiodev

2022-09-09 Thread Claudio Fontana
On 9/9/22 00:05, Paolo Bonzini wrote:
> Il gio 8 set 2022, 15:47 Claudio Fontana  ha scritto:
> 
>> On 9/8/22 11:39, Paolo Bonzini wrote:
>>> Queued, thanks.
>>>
>>> Paolo
>>>
>>>
>>
>> Thanks. When it comes to programmatic checks about what QEMU supports in
>> terms of audio,
>>
>> is there something that can be done with QMP?
>>
>> I checked the QMP manual at:
>>
>>
>> https://qemu.readthedocs.io/en/latest/interop/qemu-qmp-ref.html#qapidoc-2948
>>
>> but in the "Audio" section there is a bunch of Objects and enums defined,
>> but no command to query them...
>>
>> Thanks,
>>
>> Claudio
>>
>>
> No, there's nothing yet.
> 
> Paolo
> 
 
Interesting. What about display (i.e. ui-*)? I mean, how do I figure out
from, say, libvirt, everything that QEMU can do in terms of display, and
which drivers are actually installed?

Same for block...

With the increasing modularization of QEMU, we should, I presume, strengthen
the discoverability of QEMU capabilities, right?
This way we can configure once and install just what is needed to match the
user requirements or distro variant.

As Markus mentioned maybe a more general solution would be to have these things 
as qom objects so that a

qom-list-types

can be used to get all 'audiodev' types, or all 'display' types, or all 'block' 
types and solve the problem this way?

Is there a more general problem / solution that I am not seeing?

Thanks,

Claudio



Re: [PATCH v3 4/5] acpi/nvdimm: Implement ACPI NVDIMM Label Methods

2022-09-09 Thread Igor Mammedov
On Thu,  1 Sep 2022 11:27:20 +0800
Robert Hoo  wrote:

> The recent ACPI spec [1] has defined the NVDIMM Label Methods _LS{I,R,W},
> which deprecate the corresponding _DSM Functions defined by the PMEM _DSM
> Interface spec [2].
> 
> Since the semantics of the new Label Methods are the same as the old _DSM
> methods, the implementations here simply wrap the old ones.
> 
> The ASL form diff can be found in the next patch, which updates the golden
> master binaries.
> 
> [1] ACPI Spec v6.4, 6.5.10 NVDIMM Label Methods
> https://uefi.org/sites/default/files/resources/ACPI_Spec_6_4_Jan22.pdf
> [2] Intel PMEM _DSM Interface Spec v2.0, 3.10 Deprecated Functions
> https://pmem.io/documents/IntelOptanePMem_DSM_Interface-V2.0.pdf
> 
> Signed-off-by: Robert Hoo 
> Reviewed-by: Jingqi Liu 

looks more or less fine except for the excessive use of named variables,
which creates global-scope variables.

I'd suggest storing temporary buffers/packages in LocalX variables;
you should be able to do that for everything modulo aml_create_dword_field().

see an example below

> ---
>  hw/acpi/nvdimm.c | 91 
>  1 file changed, 91 insertions(+)
> 
> diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
> index afff911c1e..516acfe53b 100644
> --- a/hw/acpi/nvdimm.c
> +++ b/hw/acpi/nvdimm.c
> @@ -1243,6 +1243,7 @@ static void nvdimm_build_fit(Aml *dev)
>  static void nvdimm_build_nvdimm_devices(Aml *root_dev, uint32_t ram_slots)
>  {
>  uint32_t slot;
> +Aml *method, *pkg, *field, *com_call;
>  
>  for (slot = 0; slot < ram_slots; slot++) {
>  uint32_t handle = nvdimm_slot_to_handle(slot);
> @@ -1260,6 +1261,96 @@ static void nvdimm_build_nvdimm_devices(Aml *root_dev, 
> uint32_t ram_slots)
>   */
>  aml_append(nvdimm_dev, aml_name_decl("_ADR", aml_int(handle)));
>  
> +/*
> + * ACPI v6.4: Section 6.5.10 NVDIMM Label Methods
> + */
> +/* _LSI */
> +method = aml_method("_LSI", 0, AML_SERIALIZED);
> +com_call = aml_call5(NVDIMM_COMMON_DSM,
> +aml_touuid(NVDIMM_DEVICE_DSM_UUID),
> +aml_int(1), aml_int(4), aml_int(0),
> +aml_int(handle));
> +aml_append(method, aml_store(com_call, aml_local(0)));
> +
> +aml_append(method, aml_create_dword_field(aml_local(0),
> +  aml_int(0), "STTS"));
> +aml_append(method, aml_create_dword_field(aml_local(0), aml_int(4),
> +  "SLSA"));
> +aml_append(method, aml_create_dword_field(aml_local(0), aml_int(8),
> +  "MAXT"));
> +
> +pkg = aml_package(3);
> +aml_append(pkg, aml_name("STTS"));
> +aml_append(pkg, aml_name("SLSA"));
> +aml_append(pkg, aml_name("MAXT"));
> +aml_append(method, aml_name_decl("RET", pkg));
ex: put it in a local instead of a named variable and return that;
the same applies to the other temporary named variables.

> +aml_append(method, aml_return(aml_name("RET")));
> +
> +aml_append(nvdimm_dev, method);
> +
> +/* _LSR */
> +method = aml_method("_LSR", 2, AML_SERIALIZED);
> +aml_append(method, aml_name_decl("INPT", aml_buffer(8, NULL)));
> +
> +aml_append(method, aml_create_dword_field(aml_name("INPT"),
> +  aml_int(0), "OFST"));
> +aml_append(method, aml_create_dword_field(aml_name("INPT"),
> +  aml_int(4), "LEN"));
> +aml_append(method, aml_store(aml_arg(0), aml_name("OFST")));
> +aml_append(method, aml_store(aml_arg(1), aml_name("LEN")));
> +
> +pkg = aml_package(1);
> +aml_append(pkg, aml_name("INPT"));
> +aml_append(method, aml_name_decl("PKG1", pkg));
> +
> +com_call = aml_call5(NVDIMM_COMMON_DSM,
> +aml_touuid(NVDIMM_DEVICE_DSM_UUID),
> +aml_int(1), aml_int(5), aml_name("PKG1"),
> +aml_int(handle));
> +aml_append(method, aml_store(com_call, aml_local(3)));
> +field = aml_create_dword_field(aml_local(3), aml_int(0), "STTS");
> +aml_append(method, field);
> +field = aml_create_field(aml_local(3), aml_int(32),
> + aml_shiftleft(aml_name("LEN"), aml_int(3)),
> + "LDAT");
> +aml_append(method, field);
> +aml_append(method, aml_name_decl("LSA", aml_buffer(0, NULL)));
> +aml_append(method, aml_to_buffer(aml_name("LDAT"), aml_name("LSA")));
> +pkg = aml_package(2);
> +aml_append(pkg, aml_name("STTS"));
> +aml_append(pkg, aml_name("LSA"));
> +aml_append(method, aml_name_decl("RET", pkg));
> +aml_append(method, aml_return(aml_name("RET")));
> +aml_append(nvdimm_dev, method)

Re: [PATCH v2 4/5] msmouse: Add pnp data

2022-09-09 Thread Marc-André Lureau
On Thu, Sep 8, 2022 at 9:34 PM Arwed Meyer  wrote:

> Make msmouse send serial PnP data.
> This enables you to see a nice QEMU device name in Win9x.
>
> Signed-off-by: Arwed Meyer 
>

lgtm
Reviewed-by: Marc-André Lureau 


> ---
>  chardev/msmouse.c | 58 ++-
>  1 file changed, 47 insertions(+), 11 deletions(-)
>
> diff --git a/chardev/msmouse.c b/chardev/msmouse.c
> index e9aa3eab55..29eb97fedc 100644
> --- a/chardev/msmouse.c
> +++ b/chardev/msmouse.c
> @@ -35,11 +35,24 @@
>  #define MSMOUSE_HI2(n)  (((n) & 0xc0) >> 6)
>  #define MSMOUSE_PWR(cm) (cm & (CHR_TIOCM_RTS | CHR_TIOCM_DTR))
>
> +/* Serial PnP for 6 bit devices/mice sends all ASCII chars - 0x20 */
> +#define M(c) (c - 0x20)
>  /* Serial fifo size. */
>  #define MSMOUSE_BUF_SZ 64
>
>  /* Mouse ID: Send "M3" cause we behave like a 3 button logitech mouse. */
>  const uint8_t mouse_id[] = {'M', '3'};
> +/*
> + * PnP start "(", PnP version (1.0), vendor ID, product ID, '\\',
> + * serial ID (omitted), '\\', MS class name, '\\', driver ID (omitted),
> '\\',
> + * product description, checksum, ")"
> + * Missing parts are inserted later.
> + */
> +const uint8_t pnp_data[] = {M('('), 1, '$', M('Q'), M('M'), M('U'),
> + M('0'), M('0'), M('0'), M('1'),
> + M('\\'), M('\\'),
> + M('M'), M('O'), M('U'), M('S'), M('E'),
> + M('\\'), M('\\')};
>
>  struct MouseChardev {
>  Chardev parent;
> @@ -154,11 +167,22 @@ static int msmouse_chr_write(struct Chardev *s,
> const uint8_t *buf, int len)
>  return len;
>  }
>
> +static QemuInputHandler msmouse_handler = {
> +.name  = "QEMU Microsoft Mouse",
> +.mask  = INPUT_EVENT_MASK_BTN | INPUT_EVENT_MASK_REL,
> +.event = msmouse_input_event,
> +.sync  = msmouse_input_sync,
> +};
> +
>  static int msmouse_ioctl(Chardev *chr, int cmd, void *arg)
>  {
>  MouseChardev *mouse = MOUSE_CHARDEV(chr);
> -int c;
> +int c, i, j;
> +uint8_t bytes[MSMOUSE_BUF_SZ / 2];
>  int *targ = (int *)arg;
> +const uint8_t hexchr[16] = {M('0'), M('1'), M('2'), M('3'), M('4'),
> M('5'),
> + M('6'), M('7'), M('8'), M('9'), M('A'),
> M('B'),
> + M('C'), M('D'), M('E'), M('F')};
>
>  switch (cmd) {
>  case CHR_IOCTL_SERIAL_SET_TIOCM:
> @@ -167,11 +191,30 @@ static int msmouse_ioctl(Chardev *chr, int cmd, void
> *arg)
>  if (MSMOUSE_PWR(mouse->tiocm)) {
>  if (!MSMOUSE_PWR(c)) {
>  /*
> - * Power on after reset: send "M3"
> - * cause we behave like a 3 button logitech
> - * mouse.
> + * Power on after reset: Send ID and PnP data
> + * No need to check fifo space as it is empty at this
> point.
>   */
>  fifo8_push_all(&mouse->outbuf, mouse_id,
> sizeof(mouse_id));
> +/* Add PnP data: */
> +fifo8_push_all(&mouse->outbuf, pnp_data,
> sizeof(pnp_data));
> +/*
> + * Add device description from qemu handler name.
> + * Make sure this all fits into the queue beforehand!
> + */
> +c = M(')');
> +for (i = 0; msmouse_handler.name[i]; i++) {
> +bytes[i] = M(msmouse_handler.name[i]);
> +c += bytes[i];
> +}
> +/* Calc more of checksum */
> +for (j = 0; j < sizeof(pnp_data); j++) {
> +c += pnp_data[j];
> +}
> +c &= 0xff;
> +bytes[i++] = hexchr[c >> 4];
> +bytes[i++] = hexchr[c & 0x0f];
> +bytes[i++] = M(')');
> +fifo8_push_all(&mouse->outbuf, bytes, i);
>  /* Start sending data to serial. */
>  msmouse_chr_accept_input(chr);
>  }
> @@ -204,13 +247,6 @@ static void char_msmouse_finalize(Object *obj)
>  fifo8_destroy(&mouse->outbuf);
>  }
>
> -static QemuInputHandler msmouse_handler = {
> -.name  = "QEMU Microsoft Mouse",
> -.mask  = INPUT_EVENT_MASK_BTN | INPUT_EVENT_MASK_REL,
> -.event = msmouse_input_event,
> -.sync  = msmouse_input_sync,
> -};
> -
>  static void msmouse_chr_open(Chardev *chr,
>   ChardevBackend *backend,
>   bool *be_opened,
> --
> 2.34.1
>
>
>

-- 
Marc-André Lureau


Re: [PATCH v2 3/5] msmouse: Use fifo8 instead of array

2022-09-09 Thread Marc-André Lureau
Hi

On Thu, Sep 8, 2022 at 9:38 PM Arwed Meyer  wrote:

> Make use of the fifo8 functions instead of implementing our own FIFO code.
> This makes the code more readable and reduces the risk of bugs.
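
For reference, the basic Fifo8 idiom the patch switches to looks roughly like
this (a sketch against include/qemu/fifo8.h, to be built inside the QEMU
tree; not the patch itself):

    #include "qemu/osdep.h"
    #include "qemu/fifo8.h"

    static void fifo8_demo(void)
    {
        Fifo8 out;
        const uint8_t id[] = { 'M', '3' };
        uint32_t avail;

        fifo8_create(&out, 64);                   /* allocate a 64-byte FIFO */

        if (fifo8_num_free(&out) >= sizeof(id)) {
            fifo8_push_all(&out, id, sizeof(id)); /* enqueue, or drop if full */
        }

        if (!fifo8_is_empty(&out)) {
            /* dequeue up to 16 bytes; 'avail' returns the contiguous length */
            const uint8_t *buf = fifo8_pop_buf(&out, 16, &avail);
            (void)buf;                            /* hand buf/avail to consumer */
        }

        fifo8_destroy(&out);
    }
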
>
> Signed-off-by: Arwed Meyer 
>

Reviewed-by: Marc-André Lureau 


> ---
>  chardev/msmouse.c | 43 +--
>  1 file changed, 21 insertions(+), 22 deletions(-)
>
> diff --git a/chardev/msmouse.c b/chardev/msmouse.c
> index 95fa488339..e9aa3eab55 100644
> --- a/chardev/msmouse.c
> +++ b/chardev/msmouse.c
> @@ -24,6 +24,7 @@
>
>  #include "qemu/osdep.h"
>  #include "qemu/module.h"
> +#include "qemu/fifo8.h"
>  #include "chardev/char.h"
>  #include "chardev/char-serial.h"
>  #include "ui/console.h"
> @@ -34,6 +35,12 @@
>  #define MSMOUSE_HI2(n)  (((n) & 0xc0) >> 6)
>  #define MSMOUSE_PWR(cm) (cm & (CHR_TIOCM_RTS | CHR_TIOCM_DTR))
>
> +/* Serial fifo size. */
> +#define MSMOUSE_BUF_SZ 64
>

Why bump to 64 bytes?


> +
> +/* Mouse ID: Send "M3" cause we behave like a 3 button logitech mouse. */
> +const uint8_t mouse_id[] = {'M', '3'};
> +
>  struct MouseChardev {
>  Chardev parent;
>
> @@ -42,8 +49,7 @@ struct MouseChardev {
>  int axis[INPUT_AXIS__MAX];
>  bool btns[INPUT_BUTTON__MAX];
>  bool btnc[INPUT_BUTTON__MAX];
> -uint8_t outbuf[32];
> -int outlen;
> +Fifo8 outbuf;
>  };
>  typedef struct MouseChardev MouseChardev;
>
> @@ -54,21 +60,15 @@ DECLARE_INSTANCE_CHECKER(MouseChardev, MOUSE_CHARDEV,
>  static void msmouse_chr_accept_input(Chardev *chr)
>  {
>  MouseChardev *mouse = MOUSE_CHARDEV(chr);
> -int len;
> +uint32_t len_out, len;
>
> -len = qemu_chr_be_can_write(chr);
> -if (len > mouse->outlen) {
> -len = mouse->outlen;
> -}
> -if (!len) {
> +len_out = qemu_chr_be_can_write(chr);
> +if (!len_out || fifo8_is_empty(&mouse->outbuf)) {
>  return;
>  }
> -
> -qemu_chr_be_write(chr, mouse->outbuf, len);
> -mouse->outlen -= len;
> -if (mouse->outlen) {
> -memmove(mouse->outbuf, mouse->outbuf + len, mouse->outlen);
> -}
> +len = MIN(fifo8_num_used(&mouse->outbuf), len_out);
> +qemu_chr_be_write(chr, fifo8_pop_buf(&mouse->outbuf, len, &len_out),
> +len_out);
>  }
>
>  static void msmouse_queue_event(MouseChardev *mouse)
> @@ -94,12 +94,11 @@ static void msmouse_queue_event(MouseChardev *mouse)
>  mouse->btnc[INPUT_BUTTON_MIDDLE]) {
>  bytes[3] |= (mouse->btns[INPUT_BUTTON_MIDDLE] ? 0x20 : 0x00);
>  mouse->btnc[INPUT_BUTTON_MIDDLE] = false;
> -count = 4;
> +count++;
>

a bit superfluous,  ok

 }
>
> -if (mouse->outlen <= sizeof(mouse->outbuf) - count) {
> -memcpy(mouse->outbuf + mouse->outlen, bytes, count);
> -mouse->outlen += count;
> +if (fifo8_num_free(&mouse->outbuf) >= count) {
> +fifo8_push_all(&mouse->outbuf, bytes, count);
>  } else {
>  /* queue full -> drop event */
>  }
> @@ -172,9 +171,7 @@ static int msmouse_ioctl(Chardev *chr, int cmd, void
> *arg)
>   * cause we behave like a 3 button logitech
>   * mouse.
>   */
> -mouse->outbuf[0] = 'M';
> -mouse->outbuf[1] = '3';
> -mouse->outlen = 2;
> +fifo8_push_all(&mouse->outbuf, mouse_id,
> sizeof(mouse_id));
>  /* Start sending data to serial. */
>  msmouse_chr_accept_input(chr);
>  }
> @@ -184,7 +181,7 @@ static int msmouse_ioctl(Chardev *chr, int cmd, void
> *arg)
>   * Reset mouse buffers on power down.
>   * Mouse won't send anything without power.
>   */
> -mouse->outlen = 0;
> +fifo8_reset(&mouse->outbuf);
>  memset(mouse->axis, 0, sizeof(mouse->axis));
>  memset(mouse->btns, false, sizeof(mouse->btns));
>  memset(mouse->btnc, false, sizeof(mouse->btns));
> @@ -204,6 +201,7 @@ static void char_msmouse_finalize(Object *obj)
>  MouseChardev *mouse = MOUSE_CHARDEV(obj);
>
>  qemu_input_handler_unregister(mouse->hs);
> +fifo8_destroy(&mouse->outbuf);
>  }
>
>  static QemuInputHandler msmouse_handler = {
> @@ -224,6 +222,7 @@ static void msmouse_chr_open(Chardev *chr,
>  mouse->hs = qemu_input_handler_register((DeviceState *)mouse,
>  &msmouse_handler);
>  mouse->tiocm = 0;
> +fifo8_create(&mouse->outbuf, MSMOUSE_BUF_SZ);
>  }
>
>  static void char_msmouse_class_init(ObjectClass *oc, void *data)
> --
> 2.34.1
>
>
>

-- 
Marc-André Lureau


Re: [PATCH v2 2/5] chardev: src buffer const for write functions

2022-09-09 Thread Marc-André Lureau
Hi

On Thu, Sep 8, 2022 at 9:44 PM Arwed Meyer  wrote:

> Make the source buffers const for the chardev backend write functions.
> This allows buffers returned by the FIFO to be used as the buf parameter;
> the source buffer should not be changed by write functions anyway.
>
> Signed-off-by: Arwed Meyer 
>

Reviewed-by: Marc-André Lureau 


> ---
>  chardev/char.c  | 4 ++--
>  include/chardev/char.h  | 4 ++--
>  include/sysemu/replay.h | 2 +-
>  replay/replay-char.c| 2 +-
>  stubs/replay-tools.c| 2 +-
>  5 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/chardev/char.c b/chardev/char.c
> index 0169d8dde4..b005df3ccf 100644
> --- a/chardev/char.c
> +++ b/chardev/char.c
> @@ -193,7 +193,7 @@ int qemu_chr_be_can_write(Chardev *s)
>  return be->chr_can_read(be->opaque);
>  }
>
> -void qemu_chr_be_write_impl(Chardev *s, uint8_t *buf, int len)
> +void qemu_chr_be_write_impl(Chardev *s, const uint8_t *buf, int len)
>  {
>  CharBackend *be = s->be;
>
> @@ -202,7 +202,7 @@ void qemu_chr_be_write_impl(Chardev *s, uint8_t *buf,
> int len)
>  }
>  }
>
> -void qemu_chr_be_write(Chardev *s, uint8_t *buf, int len)
> +void qemu_chr_be_write(Chardev *s, const uint8_t *buf, int len)
>  {
>  if (qemu_chr_replay(s)) {
>  if (replay_mode == REPLAY_MODE_PLAY) {
> diff --git a/include/chardev/char.h b/include/chardev/char.h
> index a319b5fdff..44cd82e405 100644
> --- a/include/chardev/char.h
> +++ b/include/chardev/char.h
> @@ -186,7 +186,7 @@ int qemu_chr_be_can_write(Chardev *s);
>   * the caller should call @qemu_chr_be_can_write to determine how much
> data
>   * the front end can currently accept.
>   */
> -void qemu_chr_be_write(Chardev *s, uint8_t *buf, int len);
> +void qemu_chr_be_write(Chardev *s, const uint8_t *buf, int len);
>
>  /**
>   * qemu_chr_be_write_impl:
> @@ -195,7 +195,7 @@ void qemu_chr_be_write(Chardev *s, uint8_t *buf, int
> len);
>   *
>   * Implementation of back end writing. Used by replay module.
>   */
> -void qemu_chr_be_write_impl(Chardev *s, uint8_t *buf, int len);
> +void qemu_chr_be_write_impl(Chardev *s, const uint8_t *buf, int len);
>
>  /**
>   * qemu_chr_be_update_read_handlers:
> diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
> index 73dee9ccdf..7ec0882b50 100644
> --- a/include/sysemu/replay.h
> +++ b/include/sysemu/replay.h
> @@ -198,7 +198,7 @@ uint64_t blkreplay_next_id(void);
>  /*! Registers char driver to save it's events */
>  void replay_register_char_driver(struct Chardev *chr);
>  /*! Saves write to char device event to the log */
> -void replay_chr_be_write(struct Chardev *s, uint8_t *buf, int len);
> +void replay_chr_be_write(struct Chardev *s, const uint8_t *buf, int len);
>  /*! Writes char write return value to the replay log. */
>  void replay_char_write_event_save(int res, int offset);
>  /*! Reads char write return value from the replay log. */
> diff --git a/replay/replay-char.c b/replay/replay-char.c
> index d2025948cf..a31aded032 100644
> --- a/replay/replay-char.c
> +++ b/replay/replay-char.c
> @@ -48,7 +48,7 @@ void replay_register_char_driver(Chardev *chr)
>  char_drivers[drivers_count++] = chr;
>  }
>
> -void replay_chr_be_write(Chardev *s, uint8_t *buf, int len)
> +void replay_chr_be_write(Chardev *s, const uint8_t *buf, int len)
>  {
>  CharEvent *event = g_new0(CharEvent, 1);
>
> diff --git a/stubs/replay-tools.c b/stubs/replay-tools.c
> index f2e72bb225..3e8ca3212d 100644
> --- a/stubs/replay-tools.c
> +++ b/stubs/replay-tools.c
> @@ -53,7 +53,7 @@ void replay_register_char_driver(struct Chardev *chr)
>  {
>  }
>
> -void replay_chr_be_write(struct Chardev *s, uint8_t *buf, int len)
> +void replay_chr_be_write(struct Chardev *s, const uint8_t *buf, int len)
>  {
>  abort();
>  }
> --
> 2.34.1
>
>
>

-- 
Marc-André Lureau


Re: [PATCH v3] 9pfs: use GHashTable for fid table

2022-09-09 Thread Christian Schoenebeck
On Donnerstag, 8. September 2022 13:23:53 CEST Linus Heckemann wrote:
> The previous implementation would iterate over the fid table for
> lookup operations, resulting in an operation with O(n) complexity on
> the number of open files and poor cache locality -- for every open,
> stat, read, write, etc operation.
> 
> This change uses a hashtable for this instead, significantly improving
> the performance of the 9p filesystem. The runtime of NixOS's simple
> installer test, which copies ~122k files totalling ~1.8GiB from 9p,
> decreased by a factor of about 10.
> 
> Signed-off-by: Linus Heckemann 
> Reviewed-by: Philippe Mathieu-Daudé 
> Reviewed-by: Greg Kurz 
> ---
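
For readers unfamiliar with the approach, the core of the change boils down
to something like this (a simplified sketch, not the actual patch;
V9fsFidState stands in for the real fid structure):

    #include <glib.h>
    #include <stdint.h>

    typedef struct V9fsFidState V9fsFidState;   /* opaque here */

    static GHashTable *fid_table;

    static void fid_table_init(void)
    {
        /* NULL hash/equal funcs mean direct (pointer) hashing, which is
         * fine for small integer keys stored via GINT_TO_POINTER(). */
        fid_table = g_hash_table_new(NULL, NULL);
    }

    static V9fsFidState *fid_lookup(uint32_t fid)
    {
        return g_hash_table_lookup(fid_table, GINT_TO_POINTER(fid)); /* O(1) */
    }

    static void fid_insert(uint32_t fid, V9fsFidState *fidp)
    {
        g_hash_table_insert(fid_table, GINT_TO_POINTER(fid), fidp);
    }

    static void fid_remove(uint32_t fid)
    {
        g_hash_table_remove(fid_table, GINT_TO_POINTER(fid));
    }
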

Queued on 9p.next:
https://github.com/cschoenebeck/qemu/commits/9p.next

I retained the BUG_ON() in get_fid(); Greg had a point there that continuing
to work on a clunked fid would still be a bug.

I also added the suggested TODO comment for g_hash_table_steal_extended(), the 
actual change would be outside the scope of this patch.

And finally I gave this patch a whirl, and what can I say: that's just sick!
Compiling sources with 9p is boosted by around a factor of 6..7 here! And
running 9p as the root fs also no longer feels as sluggish as before. I mean,
I knew that this fid list traversal performance issue existed and had it on
my TODO list, but the actual impact exceeded my expectations by far.

Thanks!

Best regards,
Christian Schoenebeck





Re: [PATCH v2 00/11] Introduce new acpi/smbios python tests using biosbits

2022-09-09 Thread Ani Sinha
+alexb

On Tue, Sep 6, 2022 at 7:00 PM Daniel P. Berrangé  wrote:
>
> On Tue, Sep 06, 2022 at 06:58:02PM +0530, Ani Sinha wrote:
> > On Tue, Sep 6, 2022 at 18:45 Daniel P. Berrangé  wrote:
> >
> > > On Thu, Jul 14, 2022 at 02:24:18PM +0100, Peter Maydell wrote:
> > > > On Mon, 11 Jul 2022 at 10:34, Michael S. Tsirkin  
> > > > wrote:
> > > > >
> > > > > On Sun, Jul 10, 2022 at 10:30:03PM +0530, Ani Sinha wrote:
> > > > > > Changelog:
> > > > > > v2:
> > > > > >  - a new class of python based tests introduced that is separate
> > > from avocado
> > > > > >tests or qtests. Can be run by using "make check-pytest".
> > > > > >  - acpi biosbits tests are the first tests to use pytest 
> > > > > > environment.
> > > > > >  - bios bits tests now download the bits binary archives from a
> > > remote
> > > > > >repository if they are not found locally. The test skips if
> > > download
> > > > > >fails.
> > > > > >  - A new environment variable is introduced that can be passed by
> > > the tester
> > > > > >to specify the location of the bits archives locally. test skips
> > > if the
> > > > > >bits binaries are not found in that location.
> > > > > >  - if pip install of python module fails for whatever reason, the
> > > test skips.
> > > > > >  - misc code fixes including spell check of the README doc. README
> > > has been
> > > > > >updated as well.
> > > > > >  - addition of SPDX license headers to bits test files.
> > > > > >  - update MAINTAINERS to reflect the new pytest test class.
> > > > > >
> > > > > > For biosbits repo:
> > > > > >  - added Dockerfile and build script. Made bios bits build on gcc 
> > > > > > 11.
> > > > > >
> > > https://github.com/ani-sinha/bits/blob/bits-qemu-logging/Dockerfile
> > > > > >
> > > https://github.com/ani-sinha/bits/blob/bits-qemu-logging/build-artifacts.sh
> > > > > >The build script generates the zip archive and tarball used by
> > > the test.
> > > > >
> > > > > So far so good, I think it's ok for a start. It's probably a good idea
> > > > > to host the source on qemu.org. Peter - any objection to this?
> > > >
> > > > Dan was looking at v1 from the point of view of how we handle the
> > > > guest binary blobs for these tests -- I'd rather defer to him rather
> > > > than taking the time to get up to speed on the issue myself.
> > >
> > > Storing the *source* git repo for biosbits on gitlab.com/qemu-project
> > > is sensible, as that's what we've done for other 3rd party bits that
> > > we bundle/depend on git repo access for.
> >
> >
> > Great. Can you or Peter please create a git repo cloned from the official
> > bios bits repo? You don’t have to clone mine. Please provide me with
> > push access so that I can push fixes that I have made in order to build it.
>
> I don't have any admin privileges for qemu infra, so can't do this
> myself.
>
> With regards,
> Daniel
> --
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
>



Re: [PATCH v2 10/10] hw/isa/vt82c686: Create rtc-time alias in boards instead

2022-09-09 Thread Bernhard Beschow
Am 30. August 2022 21:46:57 UTC schrieb "Philippe Mathieu-Daudé" 
:
>On 30/8/22 21:00, Bernhard Beschow wrote:
>> According to good QOM practice, an object should only deal with objects
>> of its own sub tree. Having devices create an alias on the machine
>> object doesn't respect this good practice. To resolve this, create the
>> alias in the machine's code.
>
>IIUC, this is only true for Pegasos II, not (yet) for the Fuloong 2E.
>
>> Signed-off-by: Bernhard Beschow 
>> ---
>>   hw/isa/vt82c686.c   | 2 --
>>   hw/mips/fuloong2e.c | 4 
>>   hw/ppc/pegasos2.c   | 4 
>>   3 files changed, 8 insertions(+), 2 deletions(-)
>> 
>> diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
>> index 0ef9446374..a23ffbb3ff 100644
>> --- a/hw/isa/vt82c686.c
>> +++ b/hw/isa/vt82c686.c
>> @@ -631,8 +631,6 @@ static void via_isa_realize(PCIDevice *d, Error **errp)
>>   if (!qdev_realize(DEVICE(&s->rtc), BUS(isa_bus), errp)) {
>>   return;
>>   }
>> -object_property_add_alias(qdev_get_machine(), "rtc-time", 
>> OBJECT(&s->rtc),
>> -  "date");
>>   isa_connect_gpio_out(ISA_DEVICE(&s->rtc), 0, s->rtc.isairq);
>> for (i = 0; i < PCI_CONFIG_HEADER_SIZE; i++) {
>> diff --git a/hw/mips/fuloong2e.c b/hw/mips/fuloong2e.c
>> index 2d8723ab74..0f4cfe1188 100644
>> --- a/hw/mips/fuloong2e.c
>> +++ b/hw/mips/fuloong2e.c
>> @@ -203,6 +203,10 @@ static void vt82c686b_southbridge_init(PCIBus *pci_bus, 
>> int slot, qemu_irq intc,
>> via = pci_create_simple_multifunction(pci_bus, PCI_DEVFN(slot, 0), 
>> true,
>> TYPE_VT82C686B_ISA);
>> +object_property_add_alias(qdev_get_machine(), "rtc-time",
>> +  object_resolve_path_component(OBJECT(via),
>> +"rtc"),
>> +  "date");
>>   qdev_connect_gpio_out(DEVICE(via), 0, intc);
>> dev = PCI_DEVICE(object_resolve_path_component(OBJECT(via), "ide"));
>> diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c
>> index 09fdb7557f..f50e1d8b3f 100644
>> --- a/hw/ppc/pegasos2.c
>> +++ b/hw/ppc/pegasos2.c
>> @@ -161,6 +161,10 @@ static void pegasos2_init(MachineState *machine)
>>   /* VIA VT8231 South Bridge (multifunction PCI device) */
>>   via = pci_create_simple_multifunction(pci_bus, PCI_DEVFN(12, 0), true,
>> TYPE_VT8231_ISA);
>> +object_property_add_alias(qdev_get_machine(), "rtc-time",
>
>We already have a 'machine' pointer.

Fixed in v5.

>
>> +  object_resolve_path_component(OBJECT(via),
>> +"rtc"),
>> +  "date");
>>   qdev_connect_gpio_out(DEVICE(via), 0,
>> qdev_get_gpio_in_named(pm->mv, "gpp", 31));
>>   
>




Re: [PATCH] tcg/ppc: Optimize 26-bit jumps

2022-09-09 Thread Leandro Lupori

On 9/8/22 18:44, Richard Henderson wrote:

On 9/8/22 22:18, Leandro Lupori wrote:

PowerPC64 processors handle direct branches better than indirect
ones, resulting in fewer stalled cycles and branch misses.

However, PPC's tb_target_set_jmp_target() was only using direct
branches for 16-bit jumps, while PowerPC64's unconditional branch
instructions are able to handle displacements of up to 26 bits.
To take advantage of this, jumps whose displacements fit in
17 to 26 bits are now also converted to direct branches.
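
(A standalone sketch, not code from the patch, of what "fits in 26 bits" means
here: a PPC direct branch encodes a signed, word-aligned 26-bit displacement,
roughly +/-32 MiB.)

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: decide whether a displacement could be patched as a
 * direct branch instead of an indirect one. */
static bool fits_in_direct_branch(intptr_t disp)
{
    return (disp & 3) == 0 &&        /* word aligned */
           disp >= -(1 << 25) &&     /* lower bound of signed 26 bits */
           disp < (1 << 25);         /* upper bound of signed 26 bits */
}

int main(void)
{
    printf("%d\n", fits_in_direct_branch(0x1000000));  /* 16 MiB: fits */
    printf("%d\n", fits_in_direct_branch(0x4000000));  /* 64 MiB: does not */
    return 0;
}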


This doesn't work because you have to be able to unset the jump as well,
and your two step sequence doesn't handle that.  (You wind up with the two
insn address load reset, but the jump continuing to the previous target -- boom.)


Hello Richard, thanks for your review!
Right, I hadn't noticed this issue.


For v2.07+, you could use stq to update 4 insns atomically.

I'll try this alternative in v2, so that more CPUs can benefit from this 
change.


For v3.1+, you can eliminate TCG_REG_TB, using prefixed pc-relative
addressing instead.  Which brings you back to only needing to update 8 bytes
atomically (select either paddi to compute address to feed to following
mtctr+bcctr, or direct branch + nop leaving the mtctr+bcctr alone and
unreachable).

(Actually, there are lots of updates one could make to tcg/ppc for v3.1...)


r~





Re: [PATCH] hw/virtio/vhost-shadow-virtqueue: Silence GCC error "maybe-uninitialized"

2022-09-09 Thread Bernhard Beschow
On 6 September 2022 17:52:24 UTC, "Philippe Mathieu-Daudé" wrote:
>On Tue, Sep 6, 2022 at 7:12 PM Bernhard Beschow  wrote:
>>
>> GCC issues a false positive warning, resulting in build failure with -Werror:
>>
>>   In file included from /usr/include/glib-2.0/glib.h:114,
>>from 
>> /home/zcone-pisint/Projects/qemu/src/include/glib-compat.h:32,
>>from 
>> /home/zcone-pisint/Projects/qemu/src/include/qemu/osdep.h:144,
>>from ../src/hw/virtio/vhost-shadow-virtqueue.c:10:
>>   In function ‘g_autoptr_cleanup_generic_gfree’,
>>   inlined from ‘vhost_handle_guest_kick’ at 
>> ../src/hw/virtio/vhost-shadow-virtqueue.c:292:42:
>>   /usr/include/glib-2.0/glib/glib-autocleanups.h:28:3: error: ‘elem’ may be 
>> used uninitialized [-Werror=maybe-uninitialized]
>>  28 |   g_free (*pp);
>> |   ^~~~
>>   ../src/hw/virtio/vhost-shadow-virtqueue.c: In function 
>> ‘vhost_handle_guest_kick’:
>>   ../src/hw/virtio/vhost-shadow-virtqueue.c:292:42: note: ‘elem’ was 
>> declared here
>> 292 | g_autofree VirtQueueElement *elem;
>> |  ^~~~
>>   cc1: all warnings being treated as errors
>>
>> There is actually no problem since "elem" is initialized in both branches.
>> Silence the warning by initializing it with "NULL".
>
>Could you amend the first line of `gcc --version`?

Hi Phil,

Yes, will do in v2.

Sorry for getting off-topic, but have you noticed my vt82c686 series? Your 
opinion would matter there, and I'm not sure if my mail bounced. In my case 
Gmail likes to put your mail in the spam box :/

Regards,
Bernhard
>
>Reviewed-by: Philippe Mathieu-Daudé 
>
>> Fixes: 9c2ab2f1ec333be8614cc12272d4b91960704dbe ("vhost: stop transfer elem 
>> ownership in vhost_handle_guest_kick")
>> Signed-off-by: Bernhard Beschow 
>> ---
>>  hw/virtio/vhost-shadow-virtqueue.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c 
>> b/hw/virtio/vhost-shadow-virtqueue.c
>> index e8e5bbc368..596d4434d2 100644
>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>> @@ -289,7 +289,7 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue 
>> *svq)
>>  virtio_queue_set_notification(svq->vq, false);
>>
>>  while (true) {
>> -g_autofree VirtQueueElement *elem;
>> +g_autofree VirtQueueElement *elem = NULL;
>>  int r;
>>
>>  if (svq->next_guest_avail_elem) {
>> --
>> 2.37.3
>>
>>




Re: [PATCH] e1000e: set RX desc status with DD flag in a separate operation

2022-09-09 Thread dinghui

On 2022/9/9 10:40, Jason Wang wrote:

On Sat, Aug 27, 2022 at 12:06 AM Ding Hui  wrote:


Like commit 034d00d48581 ("e1000: set RX descriptor status in
a separate operation"), the same issue also exists in e1000e, which
can cause lost packets or stop packet delivery to a VM running DPDK.

Apply a similar fix in e1000e.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/402
Signed-off-by: Ding Hui 
---
  hw/net/e1000e_core.c | 54 +++-
  1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index 208e3e0d79..b8038e4014 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -1364,6 +1364,58 @@ struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
  }
  }

+static inline void
+e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
+ uint8_t *desc, dma_addr_t len)
+{
+PCIDevice *dev = core->owner;
+
+if (e1000e_rx_use_legacy_descriptor(core)) {
+struct e1000_rx_desc *d = (struct e1000_rx_desc *) desc;
+size_t offset = offsetof(struct e1000_rx_desc, status);
+typeof(d->status) status = d->status;
+
+d->status &= ~E1000_RXD_STAT_DD;
+pci_dma_write(dev, addr, desc, len);
+
+if (status & E1000_RXD_STAT_DD) {
+d->status = status;
+pci_dma_write(dev, addr + offset, &status, sizeof(status));
+}
+} else {
+if (core->mac[RCTL] & E1000_RCTL_DTYP_PS) {
+union e1000_rx_desc_packet_split *d =
+(union e1000_rx_desc_packet_split *) desc;
+size_t offset = offsetof(union e1000_rx_desc_packet_split,
+wb.middle.status_error);
+typeof(d->wb.middle.status_error) status =
+d->wb.middle.status_error;


Any reason to use typeof here? Its type is known to be uint32_t?



My intention was to use exactly the same type as the struct member
status_error, which is indeed uint32_t now. If the type of status_error in
the struct is ever changed, we would not need to change it everywhere.


Maybe I worry too much; the struct follows the h/w spec, so it is
unlikely to change in the future.


Should I send v2 to use uint32_t directly? I'm also OK with it.
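
(A tiny standalone example, not the e1000e code, of the typeof idiom under
discussion: the local snapshot variable automatically follows the type of the
struct member, so a future type change would not require edits here. typeof
is a GNU C extension.)

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct desc {
    uint32_t status_error;
};

int main(void)
{
    struct desc d = { .status_error = 0x5u };
    /* snapshot with a type that tracks the struct member */
    typeof(d.status_error) status = d.status_error;

    d.status_error &= ~0x1u;   /* clear the "done" bit first, as the patch does */
    printf("saved=0x%" PRIx32 " now=0x%" PRIx32 "\n", status, d.status_error);
    return 0;
}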


+
+d->wb.middle.status_error &= ~E1000_RXD_STAT_DD;
+pci_dma_write(dev, addr, desc, len);
+
+if (status & E1000_RXD_STAT_DD) {
+d->wb.middle.status_error = status;
+pci_dma_write(dev, addr + offset, &status, sizeof(status));
+}
+} else {
+union e1000_rx_desc_extended *d =
+(union e1000_rx_desc_extended *) desc;
+size_t offset = offsetof(union e1000_rx_desc_extended,
+wb.upper.status_error);
+typeof(d->wb.upper.status_error) status = d->wb.upper.status_error;


So did here.

Thanks


+
+d->wb.upper.status_error &= ~E1000_RXD_STAT_DD;
+pci_dma_write(dev, addr, desc, len);
+
+if (status & E1000_RXD_STAT_DD) {
+d->wb.upper.status_error = status;
+pci_dma_write(dev, addr + offset, &status, sizeof(status));
+}
+}
+}
+}
+
  typedef struct e1000e_ba_state_st {
  uint16_t written[MAX_PS_BUFFERS];
  uint8_t cur_idx;
@@ -1600,7 +1652,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct 
NetRxPkt *pkt,

  e1000e_write_rx_descr(core, desc, is_last ? core->rx_pkt : NULL,
 rss_info, do_ps ? ps_hdr_len : 0, 
&bastate.written);
-pci_dma_write(d, base, &desc, core->rx_desc_len);
+e1000e_pci_dma_write_rx_desc(core, base, desc, core->rx_desc_len);

  e1000e_ring_advance(core, rxi,
  core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
--
2.17.1







--
Thanks,
- Ding Hui



Re: [RFC PATCH 2/2] kvm/kvm-all.c: listener should delay kvm_vm_ioctl to the commit phase

2022-09-09 Thread David Hildenbrand

On 09.09.22 10:02, Emanuele Giuseppe Esposito wrote:



One thing I forgot to ask: iirc we used to have a workaround to kick all
vcpus out, update memory slots, then continue all vcpus.  Would that work
for us too for the problem you're working on?


As reference, here is one such approach for region resizes only:

https://lore.kernel.org/qemu-devel/20200312161217.3590-1-da...@redhat.com/

which notes:

"Instead of inhibiting during the region_resize(), we could inhibit for
> the whole memory transaction (from begin() to commit()). This could be
nice, because also splitting of memory regions would be atomic (I
remember there was a BUG report regarding that), however, I am not sure
if that might impact any RT users."



I read:

"Using pause_all_vcpus()/resume_all_vcpus() is not possible, as it will
temporarily drop the BQL - something most callers can't handle (esp.
when called from vcpu context e.g., in virtio code)."


... that's why the patch takes a different approach? :)

--
Thanks,

David / dhildenb




[RFC PATCH 1/1] kvm/kvm-all.c: implement KVM_SET_USER_MEMORY_REGION_LIST ioctl

2022-09-09 Thread Emanuele Giuseppe Esposito
Instead of sending memslot updates in each callback, the kvm listener
already takes care of sending them in the commit phase as multiple
ioctls.

Using the new KVM_SET_USER_MEMORY_REGION_LIST, we just need a single
call containing all memory regions to update.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 accel/kvm/kvm-all.c | 25 ++---
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 9780f3d2da..6a7f7b4567 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1547,30 +1547,25 @@ static void kvm_commit(MemoryListener *listener)
 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
   listener);
 KVMState *s = kvm_state;
-int i;
+int i, ret;
 
 for (i = 0; i < kml->mem_array.list->nent; i++) {
 struct kvm_userspace_memory_region_entry *mem;
-int ret;
 
 mem = &kml->mem_array.list->entries[i];
 
-/*
- * Note that mem is struct kvm_userspace_memory_region_entry, while the
- * kernel expects a kvm_userspace_memory_region, so it will currently
- * ignore mem->invalidate_slot and mem->padding.
- */
-ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem);
-
 trace_kvm_set_user_memory(mem->slot, mem->flags, mem->guest_phys_addr,
   mem->memory_size, mem->userspace_addr, 0);
+}
 
-if (ret < 0) {
-error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
- " start=0x%" PRIx64 ": %s",
- __func__, mem->slot,
- (uint64_t)mem->memory_size, strerror(errno));
-}
+ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION_LIST, 
kml->mem_array.list);
+
+if (ret < 0) {
+error_report("%s: KVM_SET_USER_MEMORY_REGION_LIST failed, size=0x%"
+ PRIx64 " flags=0x%" PRIx64 ": %s",
+ __func__, (uint64_t)kml->mem_array.list->nent,
+ (uint64_t)kml->mem_array.list->flags,
+ strerror(errno));
 }
 
 kml->mem_array.list->nent = 0;
-- 
2.31.1




[RFC PATCH 0/1] accel/kvm: implement KVM_SET_USER_MEMORY_REGION_LIST

2022-09-09 Thread Emanuele Giuseppe Esposito
Use the new KVM ioctl KVM_SET_USER_MEMORY_REGION_LIST.
Based on my QEMU series "[RFC PATCH v2 0/3] accel/kvm: extend kvm memory 
listener to support".

Based-on: 20220909081150.709060-1-eespo...@redhat.com

Requires my KVM series "[RFC PATCH 0/9] kvm: implement atomic memslot updates"
https://lkml.org/lkml/2022/9/9/533

Emanuele Giuseppe Esposito (1):
  kvm/kvm-all.c: implement KVM_SET_USER_MEMORY_REGION_LIST ioctl

 accel/kvm/kvm-all.c | 25 ++---
 1 file changed, 10 insertions(+), 15 deletions(-)

-- 
2.31.1




Re: [PATCH v2 2/2] Update linux headers to v6.0-rc4

2022-09-09 Thread Cornelia Huck
On Fri, Sep 09 2022, Chenyi Qiang  wrote:

> commit 7e18e42e4b280c85b76967a9106a13ca61c16179
>
> Signed-off-by: Chenyi Qiang 
> ---
>  include/standard-headers/asm-x86/bootparam.h  |   7 +-
>  include/standard-headers/drm/drm_fourcc.h |  73 +++-
>  include/standard-headers/linux/ethtool.h  |  29 +--
>  include/standard-headers/linux/input.h|  12 +-
>  include/standard-headers/linux/pci_regs.h |  30 ++-
>  include/standard-headers/linux/vhost_types.h  |  17 +-
>  include/standard-headers/linux/virtio_9p.h|   2 +-
>  .../standard-headers/linux/virtio_config.h|   7 +-
>  include/standard-headers/linux/virtio_ids.h   |  14 +-
>  include/standard-headers/linux/virtio_net.h   |  34 +++-
>  include/standard-headers/linux/virtio_pci.h   |   2 +
>  include/standard-headers/linux/virtio_ring.h  |  16 +-
>  linux-headers/asm-arm64/kvm.h |  33 +++-
>  linux-headers/asm-generic/unistd.h|   4 +-
>  linux-headers/asm-riscv/kvm.h |  22 +++
>  linux-headers/asm-riscv/unistd.h  |   3 +-
>  linux-headers/asm-s390/kvm.h  |   1 +
>  linux-headers/asm-x86/kvm.h   |  33 ++--
>  linux-headers/asm-x86/mman.h  |  14 --
>  linux-headers/linux/kvm.h | 172 +-
>  linux-headers/linux/userfaultfd.h |  10 +-
>  linux-headers/linux/vduse.h   |  47 +
>  linux-headers/linux/vfio.h|   4 +-
>  linux-headers/linux/vfio_zdev.h   |   7 +
>  linux-headers/linux/vhost.h   |  35 +++-
>  25 files changed, 538 insertions(+), 90 deletions(-)

Reviewed-by: Cornelia Huck 




Re: [PATCH v2 1/2] configure: Add -Wno-gnu-variable-sized-type-not-at-end

2022-09-09 Thread Cornelia Huck
On Fri, Sep 09 2022, Chenyi Qiang  wrote:

> In recent linux headers update to v6.0-rc, it switched GNU

Maybe

"A Linux headers update to v6.0-rc switches some definitions from the
GNU..."

?

> 'zero-length-array' extension to the C-standard-defined flexible array
> member. e.g.
>
> struct kvm_msrs {
> __u32 nmsrs; /* number of msrs in entries */
> __u32 pad;
>
> -   struct kvm_msr_entry entries[0];
> +   struct kvm_msr_entry entries[];
> };
>
> Those (unlike the GNU zero-length-array) have some extra restrictions like
> 'this must be put at the end of a struct', which clang build would complain
> about. e.g. the current code
>
> struct {
> struct kvm_msrs info;
> struct kvm_msr_entry entries[1];
> } msr_data = { }
>
> generates the warning like:
>
> target/i386/kvm/kvm.c:2868:25: error: field 'info' with variable sized
> type 'struct kvm_msrs' not at the end of a struct or class is a GNU
> extension [-Werror,-Wgnu-variable-sized-type-not-at-end]
> struct kvm_msrs info;
> ^
> In fact, the variable-length 'entries[]' field in 'info' is zero-sized in
> GNU-defined semantics, which gives a predictable offset for 'entries[1]'
> in the local msr_data. The locally defined struct is just there to force a
> stack allocation large enough for 1 kvm_msr_entry, a clever trick, but it
> requires turning off this clang warning.
>
> Suggested-by: Daniel P. Berrangé 
> Reviewed-by: Richard Henderson 
> Signed-off-by: Chenyi Qiang 
> ---
>  configure | 1 +
>  1 file changed, 1 insertion(+)

Reviewed-by: Cornelia Huck 




Re: [PATCH] hw/nvme: remove param zoned.auto_transition

2022-09-09 Thread Klaus Jensen
On Aug 12 13:01, Niklas Cassel wrote:
> The intention of the Zoned Namespace Command Set Specification was
> never to make an automatic zone transition optional.
> 
> Excerpt from the nvmexpress.org zns mailing list:
> """
> A question came up internally on the differences between ZNS and ZAC/ZBC
> that asked about when a controller should transition a specific zone in
> the Implicitly Opened state to Closed state.
> 
> For example, consider a ZNS SSD that supports a max of 20 active zones
> and a max of 10 open zones, and suppose the following actions occur:
> 
> First, the host writes to ten empty zones, thereby transitioning 10 zones
> to the Implicitly Opened state.
> 
> Second, the host issues a write to an 11th empty zone.
> 
> Given that state, my understanding of the second part is that the ZNS SSD
> chooses one of the 10 previously opened zones, and transitions the chosen zone
> to the Closed state, and then proceeds to write to the new zone, which also
> implicitly transitions it from the Empty state to the Impl. Open state.
> After this, there would be 11 active zones in total, 10 in impl. Open
> state, and one in closed state.
> 
> The above assumes that a ZNS SSD will always transition an implicitly
> opened zone to closed state when required to free up resources when
> another zone is opened. However, it isn’t strictly said in the ZNS spec.
> 
> The paragraph that should cover it is defined in section
> 2.1.1.4.1 – Managing Resources:
> The controller may transition zones in the ZSIO:Implicitly Opened state
> to the ZSC:Closed state for resource management purposes.
> 
> However, it doesn’t say “when” it should occur. Thus, as the text stands,
> it could be misinterpreted that the controller shouldn’t close a zone
> to make room for a new zone. The issue with this is that it makes the
> point of having implicitly managed zones moot.
> 
> The ZAC/ZBC specs are more specific and clarify when a zone should be
> closed. I think it would be natural to do the same here.
> """
> 
> While the Zoned Namespace Command Set Specification hasn't received an
> errata yet, it is quite clear that the intention was that an automatic
> zone transition was never supposed to be optional, as then the whole
> point of having implicitly open zones would be pointless. Therefore,
> remove the param zoned.auto_transition, as this was never supposed to
> be controller implementation specific.
> 
> Signed-off-by: Niklas Cassel 
> ---
>  hw/nvme/ctrl.c | 35 ---
>  hw/nvme/nvme.h |  1 -
>  2 files changed, 12 insertions(+), 24 deletions(-)
> 

No quarrel from me, but we need to deprecate the parameter first.

However, I agree that it is a misinterpretation of the spec, so I'm ok
with removing the code using the parameter so it doesn't do anything.

Please do that, but retain the parameter and add it in
docs/about/deprecated.rst and remove the documentation related to the
option.




Re: [PATCH 2/3] vdpa: load vlan configuration at NIC startup

2022-09-09 Thread Michael S. Tsirkin
On Fri, Sep 09, 2022 at 10:01:16AM +0200, Eugenio Perez Martin wrote:
> On Fri, Sep 9, 2022 at 8:40 AM Jason Wang  wrote:
> >
> > On Fri, Sep 9, 2022 at 2:38 PM Jason Wang  wrote:
> > >
> > > On Wed, Sep 7, 2022 at 12:36 AM Eugenio Pérez  wrote:
> > > >
> > > > Vlans may already be enabled at device startup, e.g. in the destination
> > > > of a live migration, so this configuration must be restored.
> > > >
> > > > At this moment the code is not accessible, since SVQ refuses to start if
> > > > the vlan feature is exposed by the device.
> > > >
> > > > Signed-off-by: Eugenio Pérez 
> > > > ---
> > > >  net/vhost-vdpa.c | 46 --
> > > >  1 file changed, 44 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> > > > index 4bc3fd01a8..ecbfd08eb9 100644
> > > > --- a/net/vhost-vdpa.c
> > > > +++ b/net/vhost-vdpa.c
> > > > @@ -100,6 +100,8 @@ static const uint64_t vdpa_svq_device_features =
> > > >  BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
> > > >  BIT_ULL(VIRTIO_NET_F_STANDBY);
> > > >
> > > > +#define MAX_VLAN(1 << 12)   /* Per 802.1Q definition */
> > > > +
> > > >  VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
> > > >  {
> > > >  VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> > > > @@ -423,6 +425,47 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState 
> > > > *s,
> > > >  return *s->status != VIRTIO_NET_OK;
> > > >  }
> > > >
> > > > +static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
> > > > +   const VirtIONet *n,
> > > > +   uint16_t vid)
> > > > +{
> > > > +ssize_t dev_written = vhost_vdpa_net_load_cmd(s, 
> > > > VIRTIO_NET_CTRL_VLAN,
> > > > +  
> > > > VIRTIO_NET_CTRL_VLAN_ADD,
> > > > +  &vid, sizeof(vid));
> > > > +if (unlikely(dev_written < 0)) {
> > > > +return dev_written;
> > > > +}
> > > > +
> > > > +if (unlikely(*s->status != VIRTIO_NET_OK)) {
> > > > +return -EINVAL;
> > > > +}
> > > > +
> > > > +return 0;
> > > > +}
> > > > +
> > > > +static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
> > > > +const VirtIONet *n)
> > > > +{
> > > > +uint64_t features = n->parent_obj.guest_features;
> > > > +
> > > > +if (!(features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN))) {
> > > > +return 0;
> > > > +}
> > > > +
> > > > +for (int i = 0; i < MAX_VLAN >> 5; i++) {
> > > > +for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
> > > > +if (n->vlans[i] & (1U << j)) {
> > > > +int r = vhost_vdpa_net_load_single_vlan(s, n, (i << 5) 
> > > > + j);
> > >
> > > This seems to cause a lot of latency if the driver has a lot of vlans.
> > >
> > > I wonder if it's simply to let all vlan traffic go by disabling
> > > CTRL_VLAN feature at vDPA layer.
> >
> 
> The guest will not be able to recover that vlan configuration.
> 
> > Another idea is to extend the spec to allow us to accept a bitmap of
> > the vlan ids via a single command, then we will be fine.
> >
> 
> I'm not sure if adding more ways to configure something is the answer,
> but I'd be ok to implement it.
> 
> Another idea is to allow the sending of many CVQ commands in parallel.
> It shouldn't be very hard to achieve using exposed buffers as ring
> buffers, and it will shorten the startup of devices with many
> features.

This would seem like a reasonable second step.  A first step would be to
measure the overhead to make sure there's actually a need to optimize
this.


> Thanks!
> 
> > Thanks
> >
> > >
> > > Thanks
> > >
> > > > +if (unlikely(r != 0)) {
> > > > +return r;
> > > > +}
> > > > +}
> > > > +}
> > > > +}
> > > > +
> > > > +return 0;
> > > > +}
> > > > +
> > > >  static int vhost_vdpa_net_load(NetClientState *nc)
> > > >  {
> > > >  VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> > > > @@ -445,8 +488,7 @@ static int vhost_vdpa_net_load(NetClientState *nc)
> > > >  if (unlikely(r)) {
> > > >  return r;
> > > >  }
> > > > -
> > > > -return 0;
> > > > +return vhost_vdpa_net_load_vlan(s, n);
> > > >  }
> > > >
> > > >  static NetClientInfo net_vhost_vdpa_cvq_info = {
> > > > --
> > > > 2.31.1
> > > >
> >




[RFC PATCH v2 3/3] kvm/kvm-all.c: listener should delay kvm_vm_ioctl to the commit phase

2022-09-09 Thread Emanuele Giuseppe Esposito
Instead of sending a single ioctl every time ->region_* or ->log_*
callbacks are called, "queue" all memory regions in a
kvm_userspace_memory_region_list that will be sent only when committing.

This allows the KVM kernel API to be extended to support multiple
memslot updates in a single call.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 accel/kvm/kvm-all.c  | 131 ++-
 include/sysemu/kvm_int.h |   8 +++
 2 files changed, 96 insertions(+), 43 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index e9947ec18b..9780f3d2da 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -357,56 +357,50 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void 
*ram,
 return ret;
 }
 
+static struct kvm_userspace_memory_region_entry *kvm_memory_region_entry_get(
+KVMMemoryListener *kml)
+{
+struct MemoryRegionNodeArray *arr = &kml->mem_array;
+struct kvm_userspace_memory_region_list *list = arr->list;
+
+if (list->nent == arr->max_entries) {
+arr->max_entries += DEFAULT_KVM_MEMORY_REGION_ARRAY_GROW;
+list = g_realloc(list,
+  sizeof(struct kvm_userspace_memory_region_list) +
+  arr->max_entries *
+  sizeof(struct 
kvm_userspace_memory_region_entry));
+}
+
+return &list->entries[list->nent++];
+}
+
 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, 
bool new)
 {
-KVMState *s = kvm_state;
-struct kvm_userspace_memory_region_entry mem;
-int ret;
+struct kvm_userspace_memory_region_entry *mem;
+
+mem = kvm_memory_region_entry_get(kml);
+
+mem->slot = slot->slot | (kml->as_id << 16);
+mem->guest_phys_addr = slot->start_addr;
+mem->userspace_addr = (unsigned long)slot->ram;
+mem->flags = slot->flags;
 
-mem.slot = slot->slot | (kml->as_id << 16);
-mem.guest_phys_addr = slot->start_addr;
-mem.userspace_addr = (unsigned long)slot->ram;
-mem.flags = slot->flags;
+if (slot->memory_size && !new && (mem->flags ^ slot->old_flags) &
+KVM_MEM_READONLY) {
+struct kvm_userspace_memory_region_entry *mem2 = mem;
 
-if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & 
KVM_MEM_READONLY) {
+mem = kvm_memory_region_entry_get(kml);
+memcpy(mem, mem2, sizeof(struct kvm_userspace_memory_region_entry));
 /* Set the slot size to 0 before setting the slot to the desired
  * value. This is needed based on KVM commit 75d61fbc. */
-mem.memory_size = 0;
-mem.invalidate_slot = 1;
-/*
- * Note that mem is struct kvm_userspace_memory_region_entry, while the
- * kernel expects a kvm_userspace_memory_region, so it will currently
- * ignore mem->invalidate_slot and mem->padding.
- */
-ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
-if (ret < 0) {
-goto err;
-}
+mem2->memory_size = 0;
+mem2->invalidate_slot = 1;
 }
-mem.memory_size = slot->memory_size;
-/*
- * Invalidate if it's a kvm memslot MOVE or DELETE operation, but
- * currently QEMU does not perform any memslot MOVE operation.
- */
-mem.invalidate_slot = slot->memory_size == 0;
+mem->memory_size = slot->memory_size;
+mem->invalidate_slot = slot->memory_size == 0;
 
-/*
- * Note that mem is struct kvm_userspace_memory_region_entry, while the
- * kernel expects a kvm_userspace_memory_region, so it will currently
- * ignore mem->invalidate_slot and mem->padding.
- */
-ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
-slot->old_flags = mem.flags;
-err:
-trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
-  mem.memory_size, mem.userspace_addr, ret);
-if (ret < 0) {
-error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
- " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
- __func__, mem.slot, slot->start_addr,
- (uint64_t)mem.memory_size, strerror(errno));
-}
-return ret;
+slot->old_flags = mem->flags;
+return 0;
 }
 
 static int do_kvm_destroy_vcpu(CPUState *cpu)
@@ -1534,12 +1528,54 @@ static void kvm_region_add(MemoryListener *listener,
 static void kvm_region_del(MemoryListener *listener,
MemoryRegionSection *section)
 {
-KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, 
listener);
+KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
+  listener);
 
 kvm_set_phys_mem(kml, section, false);
 memory_region_unref(section->mr);
 }
 
+static void kvm_begin(MemoryListener *listener)
+{
+KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
+  

[RFC PATCH v2 0/3] accel/kvm: extend kvm memory listener to support

2022-09-09 Thread Emanuele Giuseppe Esposito
The aim of this series is to prepare the kvm memory listener to support atomic
memslot updates. In order to do that, QEMU should take care of sending all
memslot updates in a single ioctl, so that they can all be processed
atomically.

To achieve this, implement the kml->begin() and kml->commit() callbacks, and
change the logic by replacing every ioctl invocation in ->region_* and ->log_*
so that the struct kvm_userspace_memory_region entries are queued in a linked
list that is then traversed and processed in ->commit.

Patch 1 ensures that ->region_* and ->log_* are always wrapped by ->begin and
->commit.
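
(A rough standalone sketch of the queue-then-flush idea above; the types and
names are simplified stand-ins, not the real QEMU/KVM structures.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct slot_update {
    unsigned slot;
    unsigned long long gpa, size, uaddr;
};

struct update_batch {
    size_t nent, cap;
    struct slot_update *entries;
};

/* ->region_add()/->region_del() analogue: only record the change. */
static void queue_update(struct update_batch *b, unsigned slot,
                         unsigned long long gpa, unsigned long long size)
{
    if (b->nent == b->cap) {
        b->cap = b->cap ? b->cap * 2 : 8;
        b->entries = realloc(b->entries, b->cap * sizeof(*b->entries));
    }
    memset(&b->entries[b->nent], 0, sizeof(b->entries[b->nent]));
    b->entries[b->nent].slot = slot;
    b->entries[b->nent].gpa = gpa;
    b->entries[b->nent].size = size;
    b->nent++;
}

/* ->commit() analogue: one call covering every queued update. */
static void commit_batch(struct update_batch *b)
{
    printf("flushing %zu slot updates in a single call\n", b->nent);
    b->nent = 0;
}

int main(void)
{
    struct update_batch b = { 0 };

    queue_update(&b, 0, 0x0, 0x80000000ULL);   /* create slot 0 */
    queue_update(&b, 1, 0xc0000000ULL, 0);     /* delete slot 1 (size 0) */
    commit_batch(&b);
    free(b.entries);
    return 0;
}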

---
v2:
- remove patch 1, as it is useless
- patch 2: instead of a linked list, use kvm_userspace_memory_region_list
- kvm_userspace_memory_region_list: add padding

Emanuele Giuseppe Esposito (3):
  linux-headers/linux/kvm.h: introduce kvm_userspace_memory_region_list
ioctl
  accel/kvm/kvm-all.c: pass kvm_userspace_memory_region_entry instead
  kvm/kvm-all.c: listener should delay kvm_vm_ioctl to the commit phase

 accel/kvm/kvm-all.c   | 116 +-
 include/sysemu/kvm_int.h  |   8 +++
 linux-headers/linux/kvm.h |  20 +++
 3 files changed, 117 insertions(+), 27 deletions(-)

-- 
2.31.1




[RFC PATCH v2 1/3] linux-headers/linux/kvm.h: introduce kvm_userspace_memory_region_list ioctl

2022-09-09 Thread Emanuele Giuseppe Esposito
Introduce a new KVM_SET_USER_MEMORY_REGION_LIST ioctl and a
kvm_userspace_memory_region_list struct that will be used to pass
multiple memory region updates to KVM at once.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 linux-headers/linux/kvm.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index f089349149..671cdfb8de 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -103,6 +103,24 @@ struct kvm_userspace_memory_region {
__u64 userspace_addr; /* start of the userspace allocated memory */
 };
 
+/* for KVM_SET_USER_MEMORY_REGION_LIST */
+struct kvm_userspace_memory_region_entry {
+   __u32 slot;
+   __u32 flags;
+   __u64 guest_phys_addr;
+   __u64 memory_size; /* bytes */
+   __u64 userspace_addr; /* start of the userspace allocated memory */
+   __u8 invalidate_slot;
+   __u8 padding[31];
+};
+
+/* for KVM_SET_USER_MEMORY_REGION_LIST */
+struct kvm_userspace_memory_region_list {
+   __u32 nent;
+   __u32 flags;
+   struct kvm_userspace_memory_region_entry entries[0];
+};
+
 /*
  * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace,
  * other bits are reserved for kvm internal use which are defined in
@@ -1426,6 +1444,8 @@ struct kvm_vfio_spapr_tce {
struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR  _IO(KVMIO,   0x47)
 #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+#define KVM_SET_USER_MEMORY_REGION_LIST _IOW(KVMIO, 0x49, \
+   struct kvm_userspace_memory_region_list)
 
 /* enable ucontrol for s390 */
 struct kvm_s390_ucas_mapping {
-- 
2.31.1




Re: [RFC PATCH 2/2] kvm/kvm-all.c: listener should delay kvm_vm_ioctl to the commit phase

2022-09-09 Thread Emanuele Giuseppe Esposito



On 26/08/2022 at 16:44, David Hildenbrand wrote:
> On 26.08.22 16:32, Emanuele Giuseppe Esposito wrote:
>>
>>
>> On 26/08/2022 at 16:15, David Hildenbrand wrote:
>>> On 16.08.22 12:12, Emanuele Giuseppe Esposito wrote:
 Instead of sending a single ioctl every time ->region_* or ->log_*
 callbacks are called, "queue" all memory regions in a list that will
 be emptied only when committing.

>>>
>>> Out of interest, how many such regions does the ioctl support? As many
>>> as KVM theoretically supports? (32k IIRC)
>>>
>>
>> I assume you mean for the new ioctl, but yes that's a good question.
>>
>> The problem here is that we could have more than a single update per
>> memory region. So we are not limited anymore to the number of regions,
>> but by the number of operations * the number of regions.
>>
>> I was thinking, maybe when pre-processing QEMU could divide a single
>> transaction into multiple atomic operations (ie operations on the same
>> memory region)? That way we avoid sending a single ioctl with 32k *
>> #operation elements. Is that what you mean?
> 
> Oh, so we're effectively collecting slot updates and not the complete
> "slot" view, got it. Was the kernel series already sent so I can have a
> look?

I am going to send it today. I got something working, but it's a little
bit messy on the invalid slots part.

> 
> Note that there are some possible slot updates (like a split, or a
> merge) that involve multiple slots and that would have to be part of the
> same "transaction" to be atomic.
> 
> 

Limiting the size of operations in the IOCTL can be something for the
future. Currently it's already pretty complicated as it is (on the KVM
side), plus I don't see ioctls with more than 8 requests.

Thank you,
Emanuele




[RFC PATCH v2 2/3] accel/kvm/kvm-all.c: pass kvm_userspace_memory_region_entry instead

2022-09-09 Thread Emanuele Giuseppe Esposito
It won't change anything on the kernel side, but it prepares the logic
for the KVM_SET_USER_MEMORY_REGION_LIST ioctl, where all requests are sent
at once.

Because QEMU does not send any memslot MOVE request to KVM, simplify
mem.invalidate_slot logic to only detect DELETE requests.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 accel/kvm/kvm-all.c | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 645f0a249a..e9947ec18b 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -360,7 +360,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void 
*ram,
 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, 
bool new)
 {
 KVMState *s = kvm_state;
-struct kvm_userspace_memory_region mem;
+struct kvm_userspace_memory_region_entry mem;
 int ret;
 
 mem.slot = slot->slot | (kml->as_id << 16);
@@ -372,12 +372,29 @@ static int kvm_set_user_memory_region(KVMMemoryListener 
*kml, KVMSlot *slot, boo
 /* Set the slot size to 0 before setting the slot to the desired
  * value. This is needed based on KVM commit 75d61fbc. */
 mem.memory_size = 0;
+mem.invalidate_slot = 1;
+/*
+ * Note that mem is struct kvm_userspace_memory_region_entry, while the
+ * kernel expects a kvm_userspace_memory_region, so it will currently
+ * ignore mem->invalidate_slot and mem->padding.
+ */
 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 if (ret < 0) {
 goto err;
 }
 }
 mem.memory_size = slot->memory_size;
+/*
+ * Invalidate if it's a kvm memslot MOVE or DELETE operation, but
+ * currently QEMU does not perform any memslot MOVE operation.
+ */
+mem.invalidate_slot = slot->memory_size == 0;
+
+/*
+ * Note that mem is struct kvm_userspace_memory_region_entry, while the
+ * kernel expects a kvm_userspace_memory_region, so it will currently
+ * ignore mem->invalidate_slot and mem->padding.
+ */
 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 slot->old_flags = mem.flags;
 err:
-- 
2.31.1




Re: [PATCH 2/3] vdpa: load vlan configuration at NIC startup

2022-09-09 Thread Eugenio Perez Martin
On Fri, Sep 9, 2022 at 8:40 AM Jason Wang  wrote:
>
> On Fri, Sep 9, 2022 at 2:38 PM Jason Wang  wrote:
> >
> > On Wed, Sep 7, 2022 at 12:36 AM Eugenio Pérez  wrote:
> > >
> > > Vlans may already be enabled at device startup, e.g. in the destination
> > > of a live migration, so this configuration must be restored.
> > >
> > > At this moment the code is not accessible, since SVQ refuses to start if
> > > the vlan feature is exposed by the device.
> > >
> > > Signed-off-by: Eugenio Pérez 
> > > ---
> > >  net/vhost-vdpa.c | 46 --
> > >  1 file changed, 44 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> > > index 4bc3fd01a8..ecbfd08eb9 100644
> > > --- a/net/vhost-vdpa.c
> > > +++ b/net/vhost-vdpa.c
> > > @@ -100,6 +100,8 @@ static const uint64_t vdpa_svq_device_features =
> > >  BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
> > >  BIT_ULL(VIRTIO_NET_F_STANDBY);
> > >
> > > +#define MAX_VLAN(1 << 12)   /* Per 802.1Q definition */
> > > +
> > >  VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
> > >  {
> > >  VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> > > @@ -423,6 +425,47 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
> > >  return *s->status != VIRTIO_NET_OK;
> > >  }
> > >
> > > +static int vhost_vdpa_net_load_single_vlan(VhostVDPAState *s,
> > > +   const VirtIONet *n,
> > > +   uint16_t vid)
> > > +{
> > > +ssize_t dev_written = vhost_vdpa_net_load_cmd(s, 
> > > VIRTIO_NET_CTRL_VLAN,
> > > +  
> > > VIRTIO_NET_CTRL_VLAN_ADD,
> > > +  &vid, sizeof(vid));
> > > +if (unlikely(dev_written < 0)) {
> > > +return dev_written;
> > > +}
> > > +
> > > +if (unlikely(*s->status != VIRTIO_NET_OK)) {
> > > +return -EINVAL;
> > > +}
> > > +
> > > +return 0;
> > > +}
> > > +
> > > +static int vhost_vdpa_net_load_vlan(VhostVDPAState *s,
> > > +const VirtIONet *n)
> > > +{
> > > +uint64_t features = n->parent_obj.guest_features;
> > > +
> > > +if (!(features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN))) {
> > > +return 0;
> > > +}
> > > +
> > > +for (int i = 0; i < MAX_VLAN >> 5; i++) {
> > > +for (int j = 0; n->vlans[i] && j <= 0x1f; j++) {
> > > +if (n->vlans[i] & (1U << j)) {
> > > +int r = vhost_vdpa_net_load_single_vlan(s, n, (i << 5) + 
> > > j);
> >
> > This seems to cause a lot of latency if the driver has a lot of vlans.
> >
> > I wonder if it's simply to let all vlan traffic go by disabling
> > CTRL_VLAN feature at vDPA layer.
>

The guest will not be able to recover that vlan configuration.

> Another idea is to extend the spec to allow us to accept a bitmap of
> the vlan ids via a single command, then we will be fine.
>

I'm not sure if adding more ways to configure something is the answer,
but I'd be ok to implement it.

Another idea is to allow the sending of many CVQ commands in parallel.
It shouldn't be very hard to achieve using exposed buffers as ring
buffers, and it will shorten the startup of devices with many
features.

Thanks!

> Thanks
>
> >
> > Thanks
> >
> > > +if (unlikely(r != 0)) {
> > > +return r;
> > > +}
> > > +}
> > > +}
> > > +}
> > > +
> > > +return 0;
> > > +}
> > > +
> > >  static int vhost_vdpa_net_load(NetClientState *nc)
> > >  {
> > >  VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
> > > @@ -445,8 +488,7 @@ static int vhost_vdpa_net_load(NetClientState *nc)
> > >  if (unlikely(r)) {
> > >  return r;
> > >  }
> > > -
> > > -return 0;
> > > +return vhost_vdpa_net_load_vlan(s, n);
> > >  }
> > >
> > >  static NetClientInfo net_vhost_vdpa_cvq_info = {
> > > --
> > > 2.31.1
> > >
>




Re: [RFC PATCH 2/2] kvm/kvm-all.c: listener should delay kvm_vm_ioctl to the commit phase

2022-09-09 Thread Emanuele Giuseppe Esposito


>> One thing I forgot to ask: iirc we used to have a workaround to kick all
>> vcpus out, update memory slots, then continue all vcpus.  Would that work
>> for us too for the problem you're working on?
> 
> As reference, here is one such approach for region resizes only:
> 
> https://lore.kernel.org/qemu-devel/20200312161217.3590-1-da...@redhat.com/
> 
> which notes:
> 
> "Instead of inhibiting during the region_resize(), we could inhibit for
> the whole memory transaction (from begin() to commit()). This could be
> nice, because also splitting of memory regions would be atomic (I
> remember there was a BUG report regarding that), however, I am not sure
> if that might impact any RT users."
> 
> 
I read:

"Using pause_all_vcpus()/resume_all_vcpus() is not possible, as it will
temporarily drop the BQL - something most callers can't handle (esp.
when called from vcpu context e.g., in virtio code)."

Thank you,
Emanuele




Re: [RFC PATCH 1/2] softmmu/memory: add missing begin/commit callback calls

2022-09-09 Thread Emanuele Giuseppe Esposito



On 27/08/2022 at 23:03, Peter Xu wrote:
> On Fri, Aug 26, 2022 at 10:13:47AM -0400, Peter Xu wrote:
>> On Fri, Aug 26, 2022 at 03:53:09PM +0200, Emanuele Giuseppe Esposito wrote:
>>> What do you mean "will empty all regions with those listeners"?
>>> But yes theoretically vhost-vdpa and physmem have commit callbacks that
>>> are independent from whether region_add or other callbacks have been called.
>>> For kvm and probably vhost it would be no problem, since there won't be
>>> any list to iterate on.
>>
>> Right, begin()/commit() is for address space update, so it should be fine
>> to have nothing to commit, sorry.
> 
> Hold on..
> 
> When I was replying to your patch 2 and reading the code around, I found
> that this patch does affect vhost..  see region_nop() hook and also vhost's
> version of vhost_region_addnop().  I think vhost will sync its memory
> layout for each commit(), and any newly created AS could empty the
> vhost memory list even if registered on address_space_memory.
> 
> The other thing is address_space_update_topology() seems to be only used by
> address_space_init().  It means I don't think there should be any
> listener registered to this AS anyway.. :) So iiuc this patch (even if
> converting to loop over per-as memory listeners) is not needed.
> 
Agree, dropping this patch :)

Emanuele




Re: [RFC PATCH 2/2] kvm/kvm-all.c: listener should delay kvm_vm_ioctl to the commit phase

2022-09-09 Thread Emanuele Giuseppe Esposito
Hi Peter,

Apologies for the delay in answering you.

[...]
>>
>> - Doing the above is still not enough, as KVM figures out what operation to
>> do depending on the current state of the memslots.
>> Assuming we have an already existing MR y, and now we get the
>> list DELETE(y) CREATE(y/2) (ie reducing y to half its size).
>> In this case the interval tree can't do anything, but it's very hard to
>> understand that the second request in the list is a CREATE, because when
>> KVM performs the check to understand which type of operation it is
>> (before doing any modification at all in the memslot list), it finds
>> that y (memslot with same id) exists, but has a different size than what
>> was provided from userspace, therefore it could either fail or misinterpret
>> it as another operation (currently fails with -EINVAL).
> 
> Another good question..  I think that can be partly solved by not allowing
> ID duplication in the same batched ioctl, or maybe we can fail it.  From
> qemu perspective, we may need to change the memslot id allocation to not
> reuse IDs of being-deleted memslots until the batch is processed.
> 
> Something like adding similar INVALID tags to kvm memslots in QEMU when we
> are preparing the batch, then we should only reset memory_size==0 and clear
> INVALID tag after the ioctl returns.  Then during the batch, any new slots
> to be added from kvm_get_free_slot() will not return any duplication of a
> deleting one.

First of all, you're right. No interval tree is needed.

I think the approach I am currently using is something like what you
described above: when a DELETE operation is created in QEMU (there is no
MOVE), I set the invalid_slot flag to 1.
Then KVM will first process the requests with invalid_slot == 1, mark
the memslots invalid, and then process all the others (invalid ones included,
as they need the actual DELETE/MOVE operation).
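
(A minimal standalone sketch of that two-pass ordering; simplified stand-in
types, not the actual KVM code.)

#include <stdio.h>

struct slot_req {
    unsigned slot;
    unsigned long long size;   /* 0 means DELETE */
    int invalidate_slot;       /* set by userspace for DELETE/MOVE */
};

static void process_batch(const struct slot_req *reqs, int n)
{
    /* Pass 1: hide the affected slots by marking them invalid. */
    for (int i = 0; i < n; i++) {
        if (reqs[i].invalidate_slot) {
            printf("mark slot %u invalid\n", reqs[i].slot);
        }
    }
    /* Pass 2: apply every request in order on the inactive copy. */
    for (int i = 0; i < n; i++) {
        printf("apply %s on slot %u\n",
               reqs[i].size ? "create/update" : "delete", reqs[i].slot);
    }
    /* Finally the result becomes visible with a single swap. */
    printf("swap memslot lists atomically\n");
}

int main(void)
{
    const struct slot_req reqs[] = {
        { .slot = 3, .size = 0,           .invalidate_slot = 1 },  /* DELETE */
        { .slot = 3, .size = 0x100000ULL, .invalidate_slot = 0 },  /* CREATE */
    };

    process_batch(reqs, 2);
    return 0;
}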

> 
>> If we instead already provide the labels, then we can:
>>  1. substitute the memslots pointed by DELETE/MOVE with invalid & swap
>> (so it is visible, non-atomic. But we don't care, as we are not deleting
>> anything)
>>  2. delete the invalid memslot (in the inactive memslot list)
>>  3. process the other requests (in the inactive memslot list)
>>  4. single and atomic swap (step 2 and 3 are now visible).
>>
>> What do you think?
> 
> Adding some limitation to the new ioctl makes sense to me, especially
> moving the DELETE/MOVE to be handled earlier, rather than a complete
> mixture of all of them.
> 
> I'm wondering whether the batch ioctl can be laid out separately on the
> operations, then it can be clearly documented on the sequence of handling
> each op:
> 
>   struct kvm_userspace_memory_region_batch {
>  struct kvm_userspace_memory_region deletes[];
>  struct kvm_userspace_memory_region moves[];
>  struct kvm_userspace_memory_region creates[];
>  struct kvm_userspace_memory_region flags_only[];
>   };
> 
> So that the ordering can be very obvious.  But I didn't really think deeper
> than that.

No, I think it doesn't work. Order needs to be preserved; I see many
DELETE+CREATE operations on the same slot id.
> 
> Side note: do we use MOVE at all in QEMU?

Nope :)

> 
>>
>> Bonus question: with this atomic operation, do we really need the
>> invalid memslot logic for MOVE/DELETE?
> 
> I think so.  Not an expert on that, but iiuc that's to make sure we'll zap
> all shadow paging that maps to the slots being deleted/moved.
> 
> Paolo can always help to keep me honest above.

Yes, we need to keep that logic.

v2 is coming soon.

Thank you,
Emanuele




Re: [PATCH 3/3] vdpa: Support VLAN on nic control shadow virtqueue

2022-09-09 Thread Eugenio Perez Martin
On Fri, Sep 9, 2022 at 8:39 AM Jason Wang  wrote:
>
> On Wed, Sep 7, 2022 at 12:36 AM Eugenio Pérez  wrote:
> >
> > Update the virtio-net device model with each guest update of vlan state
> > through the control virtqueue, and accept creating an SVQ with a device
> > exposing the vlan feature bit.
> >
> > Done in the same commit since a malicious guest could send vlan
> > commands otherwise.
> >
> > Signed-off-by: Eugenio Pérez 
> > ---
> >  net/vhost-vdpa.c | 11 +++
> >  1 file changed, 11 insertions(+)
> >
> > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> > index ecbfd08eb9..40f7c60399 100644
> > --- a/net/vhost-vdpa.c
> > +++ b/net/vhost-vdpa.c
> > @@ -94,6 +94,7 @@ static const uint64_t vdpa_svq_device_features =
> >  BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
> >  BIT_ULL(VIRTIO_NET_F_STATUS) |
> >  BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
> > +BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |
> >  BIT_ULL(VIRTIO_NET_F_MQ) |
> >  BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
> >  BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
> > @@ -538,6 +539,16 @@ static bool vhost_vdpa_net_cvq_validate_cmd(const void 
> > *out_buf, size_t len)
> >__func__, ctrl.cmd);
> >  };
> >  break;
> > +case VIRTIO_NET_CTRL_VLAN:
> > +switch (ctrl->cmd) {

Rebase mistake on my side: this must be ctrl.cmd.

> > +case VIRTIO_NET_CTRL_VLAN_ADD:
> > +case VIRTIO_NET_CTRL_VLAN_DEL:
> > +return true;
> > +default:
> > +qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid vlan cmd %u\n",
> > +  __func__, ctrl->cmd);

Same here (s/ctrl->cmd/ctrl.cmd/).


> > +};
>
> Considering we may add more features here, is it still worthwhile to
> keep a whitelist like this?
>

I guess we can remove it, let me test without it.

Thanks!

> Thanks
>
> > +break;
> >  default:
> >  qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid control class %u\n",
> >__func__, ctrl.class);
> > --
> > 2.31.1
> >
>




Re: [PATCH 3/6] parallels: Add checking and repairing duplicate offsets in BAT

2022-09-09 Thread Alexander Ivanov



On 08.09.2022 19:45, Denis V. Lunev wrote:

On 9/8/22 19:15, Denis V. Lunev wrote:

On 9/2/22 10:52, Alexander Ivanov wrote:

Cluster offsets must be unique among all BAT entries.
Find duplicate offsets in the BAT.

If a duplicated offset is found, fix it by copying the content
of the relevant cluster to a newly allocated cluster and
setting the new cluster offset in the duplicated entry.

Add host_cluster_index() helper to deduplicate the code.
Add highest_offset() helper. It will be used for code deduplication
in the next patch.

Signed-off-by: Alexander Ivanov 
---
  block/parallels.c | 136 
++

  1 file changed, 136 insertions(+)

diff --git a/block/parallels.c b/block/parallels.c
index dbcaf5d310..339ce45634 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -136,6 +136,26 @@ static int cluster_remainder(BDRVParallelsState 
*s, int64_t sector_num,

  return MIN(nb_sectors, ret);
  }
  +static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t 
off)

+{
+    off -= s->header->data_off << BDRV_SECTOR_BITS;
+    return off / s->cluster_size;
+}
+
+static int64_t highest_offset(BDRVParallelsState *s)
+{
+    int64_t off, high_off = 0;
+    int i;
+
+    for (i = 0; i < s->bat_size; i++) {
+    off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+    if (off > high_off) {
+    high_off = off;
+    }
+    }
+    return high_off;
+}
+
  static int64_t block_status(BDRVParallelsState *s, int64_t 
sector_num,

  int nb_sectors, int *pnum)
  {
@@ -547,6 +567,114 @@ static int 
parallels_check_leak(BlockDriverState *bs,

  return 0;
  }
  +static int parallels_check_duplicate(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    QEMUIOVector qiov;
+    int64_t off, high_off, sector;
+    unsigned long *bitmap;
+    uint32_t i, bitmap_size, cluster_index;
+    int n, ret = 0;
+    uint64_t *buf = NULL;
+    bool new_allocations = false;
+
+    high_off = highest_offset(s);
+    if (high_off == 0) {
+    return 0;
+    }
+
+    /*
+ * Create a bitmap of used clusters.
+ * If a bit is set, there is a BAT entry pointing to this cluster.
+ * Loop through the BAT entries, check bits relevant to an 
entry offset.

+ * If bit is set, this entry is duplicated. Otherwise set the bit.
+ */
+    bitmap_size = host_cluster_index(s, high_off) + 1;
+    bitmap = bitmap_new(bitmap_size);
+
+    buf = g_malloc(s->cluster_size);
+    qemu_iovec_init(&qiov, 0);
+    qemu_iovec_add(&qiov, buf, s->cluster_size);
+
+    for (i = 0; i < s->bat_size; i++) {
+    off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+    if (off == 0) {
+    continue;
+    }
+
+    cluster_index = host_cluster_index(s, off);
+    if (test_bit(cluster_index, bitmap)) {
+    /* this cluster duplicates another one */
+    fprintf(stderr,
+    "%s duplicate offset in BAT entry %u\n",
+    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
+
+    res->corruptions++;
+
+    if (fix & BDRV_FIX_ERRORS) {
+    /*
+ * Reset the entry and allocate a new cluster
+ * for the relevant guest offset. In this way we let
+ * the lower layer to place the new cluster properly.
+ * Copy the original cluster to the allocated one.
+ */
+    parallels_set_bat_entry(s, i, 0);
+
+    ret = bdrv_pread(bs->file, off, s->cluster_size, 
buf, 0);

+    if (ret < 0) {
+    res->check_errors++;
+    goto out;
+    }
+
+    sector = (i * s->cluster_size) >> BDRV_SECTOR_BITS;
+    off = allocate_clusters(bs, sector, s->tracks, &n);
+    if (off < 0) {
+    res->check_errors++;
+    ret = off;
+    goto out;
+    }
+    off <<= BDRV_SECTOR_BITS;
+    if (off > high_off) {
+    high_off = off;
+    }
+
+    ret = bdrv_co_pwritev(bs->file, off, 
s->cluster_size, &qiov, 0);

+    if (ret < 0) {
+    res->check_errors++;
+    goto out;
+    }
+
+    new_allocations = true;
+    res->corruptions_fixed++;
+    }
+
+    } else {
+    bitmap_set(bitmap, cluster_index, 1);
+    }
+    }
+
+    if (new_allocations) {
+    /*
+ * When new clusters are allocated, file size increases
+ * by 128 Mb blocks. We need to truncate the file to the
+ * right size.
+ */
+    ret = parallels_handle_leak(bs, res, high_off, true);
+    if (ret < 0) {
+    res->check_errors++;
+    goto out;
+  

[RISU PATCH] risu: Use alternate stack

2022-09-09 Thread Song Gao
We can use an alternate signal stack, so that we can use the sp register as an
input/output register.
I have tested the aarch64 and LoongArch architectures.

Signed-off-by: Song Gao 
---
 risu.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/risu.c b/risu.c
index 1c096a8..98dba94 100644
--- a/risu.c
+++ b/risu.c
@@ -329,7 +329,7 @@ static void set_sigill_handler(void (*fn) (int, siginfo_t 
*, void *))
 memset(&sa, 0, sizeof(struct sigaction));
 
 sa.sa_sigaction = fn;
-sa.sa_flags = SA_SIGINFO;
+sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
 sigemptyset(&sa.sa_mask);
 if (sigaction(SIGILL, &sa, 0) != 0) {
 perror("sigaction");
@@ -550,6 +550,7 @@ int main(int argc, char **argv)
 char *trace_fn = NULL;
 struct option *longopts;
 char *shortopts;
+stack_t ss;
 
 longopts = setup_options(&shortopts);
 
@@ -617,6 +618,19 @@ int main(int argc, char **argv)
 
 load_image(imgfile);
 
+/* create alternate stack */
+ss.ss_sp = malloc(SIGSTKSZ);
+if (ss.ss_sp == NULL) {
+perror("malloc");
+exit(EXIT_FAILURE);
+}
+ss.ss_size = SIGSTKSZ;
+ss.ss_flags = 0;
+if (sigaltstack(&ss, NULL) == -1) {
+perror("sigaltstac");
+exit(EXIT_FAILURE);
+}
+
 /* E.g. select requested SVE vector length. */
 arch_init();
 
-- 
2.31.1