[PULL 21/23] block: rename former bdrv_file_open callbacks

2024-06-21 Thread Paolo Bonzini
Since there is no bdrv_file_open callback anymore, rename the implementations
so that they end with "_open" instead of "_file_open".  NFS is the exception
because all the functions are named nfs_file_*.

Suggested-by: Kevin Wolf 
Signed-off-by: Paolo Bonzini 
---
 block/blkio.c | 8 
 block/null.c  | 8 
 block/nvme.c  | 8 
 block/ssh.c   | 6 +++---
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/block/blkio.c b/block/blkio.c
index 1a38064ce76..3d9a2e764c3 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -713,7 +713,7 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, 
QDict *options,
  * for example will fail.
  *
  * In order to open the device read-only, we are using the `read-only`
- * property of the libblkio driver in blkio_file_open().
+ * property of the libblkio driver in blkio_open().
  */
 fd = qemu_open(path, O_RDWR, NULL);
 if (fd < 0) {
@@ -791,8 +791,8 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, 
QDict *options,
 return 0;
 }
 
-static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
-   Error **errp)
+static int blkio_open(BlockDriverState *bs, QDict *options, int flags,
+  Error **errp)
 {
 const char *blkio_driver = bs->drv->protocol_name;
 BDRVBlkioState *s = bs->opaque;
@@ -1088,7 +1088,7 @@ static void blkio_refresh_limits(BlockDriverState *bs, 
Error **errp)
  */
 #define BLKIO_DRIVER_COMMON \
 .instance_size   = sizeof(BDRVBlkioState), \
-.bdrv_open   = blkio_file_open, \
+.bdrv_open   = blkio_open, \
 .bdrv_close  = blkio_close, \
 .bdrv_co_getlength   = blkio_co_getlength, \
 .bdrv_co_truncate= blkio_truncate, \
diff --git a/block/null.c b/block/null.c
index 6fa64d20d86..4730acc1eb2 100644
--- a/block/null.c
+++ b/block/null.c
@@ -77,8 +77,8 @@ static void null_aio_parse_filename(const char *filename, 
QDict *options,
 }
 }
 
-static int null_file_open(BlockDriverState *bs, QDict *options, int flags,
-  Error **errp)
+static int null_open(BlockDriverState *bs, QDict *options, int flags,
+ Error **errp)
 {
 QemuOpts *opts;
 BDRVNullState *s = bs->opaque;
@@ -283,7 +283,7 @@ static BlockDriver bdrv_null_co = {
 .protocol_name  = "null-co",
 .instance_size  = sizeof(BDRVNullState),
 
-.bdrv_open  = null_file_open,
+.bdrv_open  = null_open,
 .bdrv_parse_filename= null_co_parse_filename,
 .bdrv_co_getlength  = null_co_getlength,
 .bdrv_co_get_allocated_file_size = null_co_get_allocated_file_size,
@@ -304,7 +304,7 @@ static BlockDriver bdrv_null_aio = {
 .protocol_name  = "null-aio",
 .instance_size  = sizeof(BDRVNullState),
 
-.bdrv_open  = null_file_open,
+.bdrv_open  = null_open,
 .bdrv_parse_filename= null_aio_parse_filename,
 .bdrv_co_getlength  = null_co_getlength,
 .bdrv_co_get_allocated_file_size = null_co_get_allocated_file_size,
diff --git a/block/nvme.c b/block/nvme.c
index c84914af6dd..3b588b139f6 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -889,7 +889,7 @@ out:
 qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
 }
 
-/* Cleaning up is done in nvme_file_open() upon error. */
+/* Cleaning up is done in nvme_open() upon error. */
 return ret;
 }
 
@@ -967,8 +967,8 @@ static void nvme_close(BlockDriverState *bs)
 g_free(s->device);
 }
 
-static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
-  Error **errp)
+static int nvme_open(BlockDriverState *bs, QDict *options, int flags,
+ Error **errp)
 {
 const char *device;
 QemuOpts *opts;
@@ -1630,7 +1630,7 @@ static BlockDriver bdrv_nvme = {
 .create_opts  = &bdrv_create_opts_simple,
 
 .bdrv_parse_filename  = nvme_parse_filename,
-.bdrv_open= nvme_file_open,
+.bdrv_open= nvme_open,
 .bdrv_close   = nvme_close,
 .bdrv_co_getlength= nvme_co_getlength,
 .bdrv_probe_blocksizes= nvme_probe_blocksizes,
diff --git a/block/ssh.c b/block/ssh.c
index 1344822ed85..27d582e0e3d 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -837,8 +837,8 @@ static int connect_to_ssh(BDRVSSHState *s, 
BlockdevOptionsSsh *opts,
 return ret;
 }
 
-static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags,
- Error **errp)
+static int ssh_open(BlockDriverState *bs, QDict *options, int bdrv_flags,
+Error **errp)
 {
 BDRVSSHState *s = bs->opaque;
 BlockdevOptionsSsh *opts;
@@ -1362,7 +1362,7 @@ static BlockDriver bdrv_ssh = {
 .protocol_name= "ssh",
 .instance_siz

[PULL 15/23] Revert "host/i386: assume presence of SSE2"

2024-06-21 Thread Paolo Bonzini
This reverts commit b18236897ca15c3db1506d8edb9a191dfe51429c.
The x86-64 instruction set can now be tuned down to x86-64 v1
or i386 Pentium Pro.

Signed-off-by: Paolo Bonzini 
---
 host/include/i386/host/cpuinfo.h | 1 +
 util/bufferiszero.c  | 4 ++--
 util/cpuinfo-i386.c  | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
index 72f6fad61e5..81771733eaa 100644
--- a/host/include/i386/host/cpuinfo.h
+++ b/host/include/i386/host/cpuinfo.h
@@ -14,6 +14,7 @@
 #define CPUINFO_POPCNT  (1u << 4)
 #define CPUINFO_BMI1(1u << 5)
 #define CPUINFO_BMI2(1u << 6)
+#define CPUINFO_SSE2(1u << 7)
 #define CPUINFO_AVX1(1u << 9)
 #define CPUINFO_AVX2(1u << 10)
 #define CPUINFO_AVX512F (1u << 11)
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 11c080e02cf..74864f7b782 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -188,14 +188,14 @@ static biz_accel_fn const accel_table[] = {
 
 static unsigned best_accel(void)
 {
-#ifdef CONFIG_AVX2_OPT
 unsigned info = cpuinfo_init();
 
+#ifdef CONFIG_AVX2_OPT
 if (info & CPUINFO_AVX2) {
 return 2;
 }
 #endif
-return 1;
+return info & CPUINFO_SSE2 ? 1 : 0;
 }
 
 #elif defined(__aarch64__) && defined(__ARM_NEON)
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
index ca74ef04f54..90f92a42dc8 100644
--- a/util/cpuinfo-i386.c
+++ b/util/cpuinfo-i386.c
@@ -34,6 +34,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
 if (max >= 1) {
 __cpuid(1, a, b, c, d);
 
+info |= (d & bit_SSE2 ? CPUINFO_SSE2 : 0);
 info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
 info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
 info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0);
-- 
2.45.2




[PULL 17/23] meson: remove dead optimization option

2024-06-21 Thread Paolo Bonzini
Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 meson.build   | 13 -
 meson_options.txt |  2 --
 scripts/meson-buildoptions.sh |  3 ---
 3 files changed, 18 deletions(-)

diff --git a/meson.build b/meson.build
index 6e694ecd9fe..54e6b09f4fb 100644
--- a/meson.build
+++ b/meson.build
@@ -2874,18 +2874,6 @@ config_host_data.set('CONFIG_AVX2_OPT', 
get_option('avx2') \
 int main(int argc, char *argv[]) { return bar(argv[argc - 1]); }
   '''), error_message: 'AVX2 not available').allowed())
 
-config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
-  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512F') \
-  .require(cc.links('''
-#include 
-#include 
-static int __attribute__((target("avx512f"))) bar(void *a) {
-  __m512i x = *(__m512i *)a;
-  return _mm512_test_epi64_mask(x, x);
-}
-int main(int argc, char *argv[]) { return bar(argv[argc - 1]); }
-  '''), error_message: 'AVX512F not available').allowed())
-
 config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
   .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
   .require(cc.links('''
@@ -4283,7 +4271,6 @@ summary_info += {'mutex debugging':   
get_option('debug_mutex')}
 summary_info += {'memory allocator':  get_option('malloc')}
 summary_info += {'avx2 optimization': config_host_data.get('CONFIG_AVX2_OPT')}
 summary_info += {'avx512bw optimization': 
config_host_data.get('CONFIG_AVX512BW_OPT')}
-summary_info += {'avx512f optimization': 
config_host_data.get('CONFIG_AVX512F_OPT')}
 summary_info += {'gcov':  get_option('b_coverage')}
 summary_info += {'thread sanitizer':  get_option('tsan')}
 summary_info += {'CFI support':   get_option('cfi')}
diff --git a/meson_options.txt b/meson_options.txt
index 6065ed2d352..0269fa0f16e 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -119,8 +119,6 @@ option('membarrier', type: 'feature', value: 'disabled',
 
 option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')
-option('avx512f', type: 'feature', value: 'disabled',
-   description: 'AVX512F optimizations')
 option('avx512bw', type: 'feature', value: 'auto',
description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 62842d47e88..cfadb5ea86a 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -95,7 +95,6 @@ meson_options_help() {
   printf "%s\n" '  auth-pamPAM access control'
   printf "%s\n" '  avx2AVX2 optimizations'
   printf "%s\n" '  avx512bwAVX512BW optimizations'
-  printf "%s\n" '  avx512f AVX512F optimizations'
   printf "%s\n" '  blkio   libblkio block device driver'
   printf "%s\n" '  bochs   bochs image format support'
   printf "%s\n" '  bpf eBPF support'
@@ -240,8 +239,6 @@ _meson_option_parse() {
 --disable-avx2) printf "%s" -Davx2=disabled ;;
 --enable-avx512bw) printf "%s" -Davx512bw=enabled ;;
 --disable-avx512bw) printf "%s" -Davx512bw=disabled ;;
---enable-avx512f) printf "%s" -Davx512f=enabled ;;
---disable-avx512f) printf "%s" -Davx512f=disabled ;;
 --enable-gcov) printf "%s" -Db_coverage=true ;;
 --disable-gcov) printf "%s" -Db_coverage=false ;;
 --enable-lto) printf "%s" -Db_lto=true ;;
-- 
2.45.2




[PULL 02/23] target/i386: fix CC_OP dump

2024-06-21 Thread Paolo Bonzini
POPCNT was missing, and the entries were all out of order after
ADCX/ADOX/ADCOX were moved close to EFLAGS.  Just use designated
initializers.

Fixes: 4885c3c4953 ("target-i386: Use ctpop helper", 2017-01-10)
Fixes: cc155f19717 ("target/i386: rewrite flags writeback for ADCX/ADOX", 
2024-06-11)
Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu-dump.c | 101 +
 1 file changed, 51 insertions(+), 50 deletions(-)

diff --git a/target/i386/cpu-dump.c b/target/i386/cpu-dump.c
index 40697064d92..3bb8e440916 100644
--- a/target/i386/cpu-dump.c
+++ b/target/i386/cpu-dump.c
@@ -28,69 +28,70 @@
 /* x86 debug */
 
 static const char *cc_op_str[CC_OP_NB] = {
-"DYNAMIC",
-"EFLAGS",
+[CC_OP_DYNAMIC] = "DYNAMIC",
 
-"MULB",
-"MULW",
-"MULL",
-"MULQ",
+[CC_OP_EFLAGS] = "EFLAGS",
+[CC_OP_ADCX] = "ADCX",
+[CC_OP_ADOX] = "ADOX",
+[CC_OP_ADCOX] = "ADCOX",
 
-"ADDB",
-"ADDW",
-"ADDL",
-"ADDQ",
+[CC_OP_MULB] = "MULB",
+[CC_OP_MULW] = "MULW",
+[CC_OP_MULL] = "MULL",
+[CC_OP_MULQ] = "MULQ",
 
-"ADCB",
-"ADCW",
-"ADCL",
-"ADCQ",
+[CC_OP_ADDB] = "ADDB",
+[CC_OP_ADDW] = "ADDW",
+[CC_OP_ADDL] = "ADDL",
+[CC_OP_ADDQ] = "ADDQ",
 
-"SUBB",
-"SUBW",
-"SUBL",
-"SUBQ",
+[CC_OP_ADCB] = "ADCB",
+[CC_OP_ADCW] = "ADCW",
+[CC_OP_ADCL] = "ADCL",
+[CC_OP_ADCQ] = "ADCQ",
 
-"SBBB",
-"SBBW",
-"SBBL",
-"SBBQ",
+[CC_OP_SUBB] = "SUBB",
+[CC_OP_SUBW] = "SUBW",
+[CC_OP_SUBL] = "SUBL",
+[CC_OP_SUBQ] = "SUBQ",
 
-"LOGICB",
-"LOGICW",
-"LOGICL",
-"LOGICQ",
+[CC_OP_SBBB] = "SBBB",
+[CC_OP_SBBW] = "SBBW",
+[CC_OP_SBBL] = "SBBL",
+[CC_OP_SBBQ] = "SBBQ",
 
-"INCB",
-"INCW",
-"INCL",
-"INCQ",
+[CC_OP_LOGICB] = "LOGICB",
+[CC_OP_LOGICW] = "LOGICW",
+[CC_OP_LOGICL] = "LOGICL",
+[CC_OP_LOGICQ] = "LOGICQ",
 
-"DECB",
-"DECW",
-"DECL",
-"DECQ",
+[CC_OP_INCB] = "INCB",
+[CC_OP_INCW] = "INCW",
+[CC_OP_INCL] = "INCL",
+[CC_OP_INCQ] = "INCQ",
 
-"SHLB",
-"SHLW",
-"SHLL",
-"SHLQ",
+[CC_OP_DECB] = "DECB",
+[CC_OP_DECW] = "DECW",
+[CC_OP_DECL] = "DECL",
+[CC_OP_DECQ] = "DECQ",
 
-"SARB",
-"SARW",
-"SARL",
-"SARQ",
+[CC_OP_SHLB] = "SHLB",
+[CC_OP_SHLW] = "SHLW",
+[CC_OP_SHLL] = "SHLL",
+[CC_OP_SHLQ] = "SHLQ",
 
-"BMILGB",
-"BMILGW",
-"BMILGL",
-"BMILGQ",
+[CC_OP_SARB] = "SARB",
+[CC_OP_SARW] = "SARW",
+[CC_OP_SARL] = "SARL",
+[CC_OP_SARQ] = "SARQ",
 
-"ADCX",
-"ADOX",
-"ADCOX",
+[CC_OP_BMILGB] = "BMILGB",
+[CC_OP_BMILGW] = "BMILGW",
+[CC_OP_BMILGL] = "BMILGL",
+[CC_OP_BMILGQ] = "BMILGQ",
 
-"CLR",
+[CC_OP_POPCNT] = "POPCNT",
+[CC_OP_CLR] = "CLR",
 };
 
 static void
-- 
2.45.2




[PULL 07/23] target/i386: decode address before going back to translate.c

2024-06-21 Thread Paolo Bonzini
There are now relatively few unconverted opcodes in translate.c (there
are 13 of them including 8 for x87), and all of them have the same
format with a mod/rm byte and no immediate.  A good next step is
to remove the early bail out to disas_insn_x87/disas_insn_old,
instead giving these legacy translator functions the same prototype
as the other gen_* functions.

To do this, the X86DecodeInsn can be passed down to the places that
used to fetch address bytes from the instruction stream.  To make
sure that everything is done cleanly, the CPUX86State* argument is
removed.

As part of the unification, the gen_lea_modrm() name is now free,
so rename gen_load_ea() to gen_lea_modrm().  The new name is just as
good, and it makes the changes to translate.c easier to review.

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/decode-new.h |  14 ++-
 target/i386/tcg/translate.c  | 152 +--
 target/i386/tcg/decode-new.c.inc |  53 ++-
 target/i386/tcg/emit.c.inc   |   2 +-
 4 files changed, 103 insertions(+), 118 deletions(-)

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index e4cdf5e3c4f..bebc77bd54b 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -264,12 +264,13 @@ typedef enum X86VEXSpecial {
 
 typedef struct X86OpEntry  X86OpEntry;
 typedef struct X86DecodedInsn X86DecodedInsn;
+struct DisasContext;
 
 /* Decode function for multibyte opcodes.  */
-typedef void (*X86DecodeFunc)(DisasContext *s, CPUX86State *env, X86OpEntry 
*entry, uint8_t *b);
+typedef void (*X86DecodeFunc)(struct DisasContext *s, CPUX86State *env, 
X86OpEntry *entry, uint8_t *b);
 
 /* Code generation function.  */
-typedef void (*X86GenFunc)(DisasContext *s, X86DecodedInsn *decode);
+typedef void (*X86GenFunc)(struct DisasContext *s, X86DecodedInsn *decode);
 
 struct X86OpEntry {
 /* Based on the is_decode flags.  */
@@ -316,6 +317,14 @@ typedef struct X86DecodedOp {
 };
 } X86DecodedOp;
 
+typedef struct AddressParts {
+int def_seg;
+int base;
+int index;
+int scale;
+target_long disp;
+} AddressParts;
+
 struct X86DecodedInsn {
 X86OpEntry e;
 X86DecodedOp op[3];
@@ -333,3 +342,4 @@ struct X86DecodedInsn {
 uint8_t b;
 };
 
+static void gen_lea_modrm(struct DisasContext *s, X86DecodedInsn *decode);
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 08db40681fa..1d845ff66bb 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -29,6 +29,7 @@
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
 #include "helper-tcg.h"
+#include "decode-new.h"
 
 #include "exec/log.h"
 
@@ -1529,14 +1530,6 @@ static inline uint64_t x86_ldq_code(CPUX86State *env, 
DisasContext *s)
 
 /* Decompose an address.  */
 
-typedef struct AddressParts {
-int def_seg;
-int base;
-int index;
-int scale;
-target_long disp;
-} AddressParts;
-
 static AddressParts gen_lea_modrm_0(CPUX86State *env, DisasContext *s,
 int modrm)
 {
@@ -1695,24 +1688,11 @@ static TCGv gen_lea_modrm_1(DisasContext *s, 
AddressParts a, bool is_vsib)
 return ea;
 }
 
-static void gen_lea_modrm(CPUX86State *env, DisasContext *s, int modrm)
-{
-AddressParts a = gen_lea_modrm_0(env, s, modrm);
-TCGv ea = gen_lea_modrm_1(s, a, false);
-gen_lea_v_seg(s, ea, a.def_seg, s->override);
-}
-
-static void gen_nop_modrm(CPUX86State *env, DisasContext *s, int modrm)
-{
-(void)gen_lea_modrm_0(env, s, modrm);
-}
-
 /* Used for BNDCL, BNDCU, BNDCN.  */
-static void gen_bndck(CPUX86State *env, DisasContext *s, int modrm,
+static void gen_bndck(DisasContext *s, X86DecodedInsn *decode,
   TCGCond cond, TCGv_i64 bndv)
 {
-AddressParts a = gen_lea_modrm_0(env, s, modrm);
-TCGv ea = gen_lea_modrm_1(s, a, false);
+TCGv ea = gen_lea_modrm_1(s, decode->mem, false);
 
 tcg_gen_extu_tl_i64(s->tmp1_i64, ea);
 if (!CODE64(s)) {
@@ -1724,8 +1704,9 @@ static void gen_bndck(CPUX86State *env, DisasContext *s, 
int modrm,
 }
 
 /* generate modrm load of memory or register. */
-static void gen_ld_modrm(CPUX86State *env, DisasContext *s, int modrm, MemOp 
ot)
+static void gen_ld_modrm(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
 {
+int modrm = s->modrm;
 int mod, rm;
 
 mod = (modrm >> 6) & 3;
@@ -1733,14 +1714,15 @@ static void gen_ld_modrm(CPUX86State *env, DisasContext 
*s, int modrm, MemOp ot)
 if (mod == 3) {
 gen_op_mov_v_reg(s, ot, s->T0, rm);
 } else {
-gen_lea_modrm(env, s, modrm);
+gen_lea_modrm(s, decode);
 gen_op_ld_v(s, ot, s->T0, s->A0);
 }
 }
 
 /* generate modrm store of memory or register. */
-static void gen_st_modrm(CPUX86State *env, DisasContext *s, int modrm, MemOp 
ot)
+static void gen_st_modrm(DisasContext *s, X86DecodedInsn *decode, MemOp ot)
 {
+int modrm = s->modrm;
 int mod, rm;
 
 mod = (modrm

[PULL 01/23] configure: detect --cpu=mipsisa64r6

2024-06-21 Thread Paolo Bonzini
Treat it as a MIPS64 machine.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Thomas Huth 
Signed-off-by: Paolo Bonzini 
---
 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index 5ad1674ca5f..8b6a2f16ceb 100755
--- a/configure
+++ b/configure
@@ -450,7 +450,7 @@ case "$cpu" in
 linux_arch=loongarch
 ;;
 
-  mips64*)
+  mips64*|mipsisa64*)
 cpu=mips64
 host_arch=mips
 linux_arch=mips
-- 
2.45.2




[PULL 14/23] Revert "host/i386: assume presence of SSSE3"

2024-06-21 Thread Paolo Bonzini
This reverts commit 433cd6d94a8256af70a5200f236dc8047c3c1468.
The x86-64 instruction set can now be tuned down to x86-64 v1
or i386 Pentium Pro.

Signed-off-by: Paolo Bonzini 
---
 util/cpuinfo-i386.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
index 6d474a6259a..ca74ef04f54 100644
--- a/util/cpuinfo-i386.c
+++ b/util/cpuinfo-i386.c
@@ -38,8 +38,8 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
 info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
 info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0);
 
-/* NOTE: our AES support requires SSSE3 (PSHUFB) as well. */
-info |= (c & bit_AES) ? CPUINFO_AES : 0;
+/* Our AES support requires PSHUFB as well. */
+info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0);
 
 /* For AVX features, we must check available and usable. */
 if ((c & bit_AVX) && (c & bit_OSXSAVE)) {
-- 
2.45.2




[PULL 22/23] exec: avoid using C++ keywords in function parameters

2024-06-21 Thread Paolo Bonzini
From: Roman Kiryanov 

Rename the parameters so that the QEMU headers can be used with a C++ compiler.

Signed-off-by: Roman Kiryanov 
Link: https://lore.kernel.org/r/20240618224553.878869-1-r...@google.com
Signed-off-by: Paolo Bonzini 
---
 include/exec/memory.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 1be58f694c9..d7591a60d9f 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -945,7 +945,7 @@ struct MemoryListener {
  * the current transaction.
  */
 void (*log_start)(MemoryListener *listener, MemoryRegionSection *section,
-  int old, int new);
+  int old_val, int new_val);
 
 /**
  * @log_stop:
@@ -964,7 +964,7 @@ struct MemoryListener {
  * the current transaction.
  */
 void (*log_stop)(MemoryListener *listener, MemoryRegionSection *section,
- int old, int new);
+ int old_val, int new_val);
 
 /**
  * @log_sync:
-- 
2.45.2




[PULL 10/23] target/i386: list instructions still in translate.c

2024-06-21 Thread Paolo Bonzini
Group them so that it is easier to figure out which two-byte opcodes to
tackle together.

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/decode-new.c.inc | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index fa51aadfcf2..f01a4f1f1fe 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -129,6 +129,37 @@
  *
  *(^)  these are the two cases in which Intel and AMD disagree on the
  * primary exception class
+ *
+ * Instructions still in translate.c
+ * ---------------------------------
+ * Generation of TCG opcodes for almost all instructions is in emit.c.inc;
+ * this file interprets the prefixes and opcode bytes down to individual
+ * instruction mnemonics.  There is only a handful of opcodes still using
+ * a switch statement to decode modrm bits 3-5 and prefixes after decoding
+ * is complete; these are relics of the older x86 decoder and their code
+ * generation is performed in translate.c.
+ *
+ * These unconverted opcodes also perform their own effective address
+ * generation using the gen_lea_modrm() function.
+ *
+ * There is nothing particularly complicated about them; simply, they don't
+ * need any nasty hacks in the decoder, and they shouldn't get in the way
+ * of the implementation of new x86 instructions, so they are left alone
+ * for the time being.
+ *
+ * x87:
+ * 0xD8 - 0xDF
+ *
+ * privileged/system:
+ * 0x0F 0x00   group 6 (SLDT, STR, LLDT, LTR, VERR, VERW)
+ * 0x0F 0x01   group 7 (SGDT, SIDT, LGDT, LIDT, SMSW, LMSW, INVLPG,
+ *  MONITOR, MWAIT, CLAC, STAC, XGETBV, XSETBV,
+ *  SWAPGS, RDTSCP)
+ * 0x0F 0xC7 (reg operand) group 9 (RDRAND, RDSEED, RDPID)
+ *
+ * MPX:
+ * 0x0F 0x1A   BNDLDX, BNDMOV, BNDCL, BNDCU
+ * 0x0F 0x1B   BNDSTX, BNDMOV, BNDMK, BNDCN
  */
 
 #define X86_OP_NONE { 0 },
-- 
2.45.2




[PULL 04/23] target/i386: give CC_OP_POPCNT low bits corresponding to MO_TL

2024-06-21 Thread Paolo Bonzini
Handle it like the other arithmetic cc_ops.  This slightly simplifies
the implementation of bit test instructions.

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.h   | 13 +++--
 target/i386/tcg/translate.c |  3 +--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index f54cd93b3f9..8504a7998fd 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1275,6 +1275,7 @@ typedef enum {
 CC_OP_ADCX, /* CC_DST = C, CC_SRC = rest.  */
 CC_OP_ADOX, /* CC_SRC2 = O, CC_SRC = rest.  */
 CC_OP_ADCOX, /* CC_DST = C, CC_SRC2 = O, CC_SRC = rest.  */
+CC_OP_CLR, /* Z and P set, all other flags clear.  */
 
 CC_OP_MULB, /* modify all flags, C, O = (CC_SRC != 0) */
 CC_OP_MULW,
@@ -1331,8 +1332,16 @@ typedef enum {
 CC_OP_BMILGL,
 CC_OP_BMILGQ,
 
-CC_OP_CLR, /* Z set, all other flags clear.  */
-CC_OP_POPCNT, /* Z via CC_DST, all other flags clear.  */
+/*
+ * Note that only CC_OP_POPCNT (i.e. the one with MO_TL size)
+ * is used or implemented, because the translation needs
+ * to zero-extend CC_DST anyway.
+ */
+CC_OP_POPCNTB__, /* Z via CC_DST, all other flags clear.  */
+CC_OP_POPCNTW__,
+CC_OP_POPCNTL__,
+CC_OP_POPCNTQ__,
+CC_OP_POPCNT = sizeof(target_ulong) == 8 ? CC_OP_POPCNTQ__ : 
CC_OP_POPCNTL__,
 
 CC_OP_NB,
 } CCOp;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index eb353dc3c9f..934c514e64f 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -1019,8 +1019,6 @@ static CCPrepare gen_prepare_eflags_z(DisasContext *s, 
TCGv reg)
  .imm = CC_Z };
 case CC_OP_CLR:
 return (CCPrepare) { .cond = TCG_COND_ALWAYS };
-case CC_OP_POPCNT:
-return (CCPrepare) { .cond = TCG_COND_EQ, .reg = cpu_cc_dst };
 default:
 {
 MemOp size = (s->cc_op - CC_OP_ADDB) & 3;
@@ -3177,6 +3175,7 @@ static void disas_insn_old(DisasContext *s, CPUState 
*cpu, int b)
 case CC_OP_SHLB ... CC_OP_SHLQ:
 case CC_OP_SARB ... CC_OP_SARQ:
 case CC_OP_BMILGB ... CC_OP_BMILGQ:
+case CC_OP_POPCNT:
 /* Z was going to be computed from the non-zero status of CC_DST.
We can get that same Z value (and the new C value) by leaving
CC_DST alone, setting CC_SRC, and using a CC_OP_SAR of the
-- 
2.45.2




[PULL 00/23] Misc changes for 2024-06-22

2024-06-21 Thread Paolo Bonzini
The following changes since commit 223696363bb117241ad9c2facbff0c474afa4104:

  Merge tag 'edgar/xilinx-queue-2024-06-17.for-upstream' of 
https://gitlab.com/edgar.iglesias/qemu into staging (2024-06-18 13:08:01 -0700)

are available in the Git repository at:

  https://gitlab.com/bonzini/qemu.git tags/for-upstream

for you to fetch changes up to b9b51004033983589e00fb4697f620b903cfcf0e:

  exec: don't use void* in pointer arithmetic in headers (2024-06-21 18:32:18 
+0200)


* configure: detect --cpu=mipsisa64r6
* target/i386: decode address before going back to translate.c
* meson: allow configuring the x86-64 baseline
* meson: remove dead optimization option
* exec: small changes to allow compilation with C++ in Android emulator


Paolo Bonzini (21):
  configure: detect --cpu=mipsisa64r6
  target/i386: fix CC_OP dump
  target/i386: use cpu_cc_dst for CC_OP_POPCNT
  target/i386: give CC_OP_POPCNT low bits corresponding to MO_TL
  target/i386: convert bit test instructions to new decoder
  target/i386: try not to force EFLAGS computation for CC_OP_ADOX/ADCX
  target/i386: decode address before going back to translate.c
  target/i386: convert CMPXCHG8B/CMPXCHG16B to new decoder
  target/i386: do not check PREFIX_LOCK in old-style decoder
  target/i386: list instructions still in translate.c
  target/i386: assert that cc_op* and pc_save are preserved
  target/i386: remove gen_ext_tl
  Revert "host/i386: assume presence of POPCNT"
  Revert "host/i386: assume presence of SSSE3"
  Revert "host/i386: assume presence of SSE2"
  meson: allow configuring the x86-64 baseline
  meson: remove dead optimization option
  block: make assertion more generic
  block: do not check bdrv_file_open
  block: remove separate bdrv_file_open callback
  block: rename former bdrv_file_open callbacks

Roman Kiryanov (2):
  exec: avoid using C++ keywords in function parameters
  exec: don't use void* in pointer arithmetic in headers

 configure|   2 +-
 meson.build  |  54 +++--
 host/include/i386/host/cpuinfo.h |   2 +
 include/block/block_int-common.h |   3 -
 include/exec/memory.h|   6 +-
 target/i386/cpu.h|  13 +-
 target/i386/tcg/decode-new.h |  19 +-
 tcg/i386/tcg-target.h|   5 +-
 block.c  |  17 +-
 block/blkdebug.c |   2 +-
 block/blkio.c|   8 +-
 block/blkverify.c|   2 +-
 block/curl.c |   8 +-
 block/file-posix.c   |   8 +-
 block/file-win32.c   |   4 +-
 block/gluster.c  |   6 +-
 block/iscsi.c|   4 +-
 block/nbd.c  |   6 +-
 block/nfs.c  |   2 +-
 block/null.c |   8 +-
 block/nvme.c |   8 +-
 block/rbd.c  |   3 +-
 block/ssh.c  |   6 +-
 block/vvfat.c|   2 +-
 target/i386/cpu-dump.c   | 101 
 target/i386/tcg/cc_helper.c  |   2 +-
 target/i386/tcg/translate.c  | 492 ---
 util/bufferiszero.c  |   4 +-
 util/cpuinfo-i386.c  |   6 +-
 target/i386/tcg/decode-new.c.inc | 136 ---
 target/i386/tcg/emit.c.inc   | 249 +++-
 meson_options.txt|   5 +-
 scripts/meson-buildoptions.sh|   6 +-
 33 files changed, 618 insertions(+), 581 deletions(-)
-- 
2.45.2




[PULL 05/23] target/i386: convert bit test instructions to new decoder

2024-06-21 Thread Paolo Bonzini
Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/decode-new.h |   3 +
 target/i386/tcg/translate.c  | 147 +-
 target/i386/tcg/decode-new.c.inc |  40 ++---
 target/i386/tcg/emit.c.inc   | 149 ++-
 4 files changed, 181 insertions(+), 158 deletions(-)

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index f9bf9a60411..e4cdf5e3c4f 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -190,6 +190,9 @@ typedef enum X86InsnSpecial {
 /* Always locked if it has a memory operand (XCHG) */
 X86_SPECIAL_Locked,
 
+/* Like HasLock, but also operand 2 provides bit displacement into memory. 
 */
+X86_SPECIAL_BitTest,
+
 /* Do not load effective address in s->A0 */
 X86_SPECIAL_NoLoadEA,
 
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 934c514e64f..257110ac703 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -708,11 +708,6 @@ static TCGv gen_ext_tl(TCGv dst, TCGv src, MemOp size, 
bool sign)
 return dst;
 }
 
-static void gen_exts(MemOp ot, TCGv reg)
-{
-gen_ext_tl(reg, reg, ot, true);
-}
-
 static void gen_op_j_ecx(DisasContext *s, TCGCond cond, TCGLabel *label1)
 {
 TCGv tmp = gen_ext_tl(NULL, cpu_regs[R_ECX], s->aflag, false);
@@ -2985,7 +2980,7 @@ static void disas_insn_old(DisasContext *s, CPUState 
*cpu, int b)
 int prefixes = s->prefix;
 MemOp dflag = s->dflag;
 MemOp ot;
-int modrm, reg, rm, mod, op, val;
+int modrm, reg, rm, mod, op;
 
 /* now check op code */
 switch (b) {
@@ -3051,146 +3046,6 @@ static void disas_insn_old(DisasContext *s, CPUState 
*cpu, int b)
 }
 break;
 
-//
-/* bit operations */
-case 0x1ba: /* bt/bts/btr/btc Gv, im */
-ot = dflag;
-modrm = x86_ldub_code(env, s);
-op = (modrm >> 3) & 7;
-mod = (modrm >> 6) & 3;
-rm = (modrm & 7) | REX_B(s);
-if (mod != 3) {
-s->rip_offset = 1;
-gen_lea_modrm(env, s, modrm);
-if (!(s->prefix & PREFIX_LOCK)) {
-gen_op_ld_v(s, ot, s->T0, s->A0);
-}
-} else {
-gen_op_mov_v_reg(s, ot, s->T0, rm);
-}
-/* load shift */
-val = x86_ldub_code(env, s);
-tcg_gen_movi_tl(s->T1, val);
-if (op < 4)
-goto unknown_op;
-op -= 4;
-goto bt_op;
-case 0x1a3: /* bt Gv, Ev */
-op = 0;
-goto do_btx;
-case 0x1ab: /* bts */
-op = 1;
-goto do_btx;
-case 0x1b3: /* btr */
-op = 2;
-goto do_btx;
-case 0x1bb: /* btc */
-op = 3;
-do_btx:
-ot = dflag;
-modrm = x86_ldub_code(env, s);
-reg = ((modrm >> 3) & 7) | REX_R(s);
-mod = (modrm >> 6) & 3;
-rm = (modrm & 7) | REX_B(s);
-gen_op_mov_v_reg(s, MO_32, s->T1, reg);
-if (mod != 3) {
-AddressParts a = gen_lea_modrm_0(env, s, modrm);
-/* specific case: we need to add a displacement */
-gen_exts(ot, s->T1);
-tcg_gen_sari_tl(s->tmp0, s->T1, 3 + ot);
-tcg_gen_shli_tl(s->tmp0, s->tmp0, ot);
-tcg_gen_add_tl(s->A0, gen_lea_modrm_1(s, a, false), s->tmp0);
-gen_lea_v_seg(s, s->A0, a.def_seg, s->override);
-if (!(s->prefix & PREFIX_LOCK)) {
-gen_op_ld_v(s, ot, s->T0, s->A0);
-}
-} else {
-gen_op_mov_v_reg(s, ot, s->T0, rm);
-}
-bt_op:
-tcg_gen_andi_tl(s->T1, s->T1, (1 << (3 + ot)) - 1);
-tcg_gen_movi_tl(s->tmp0, 1);
-tcg_gen_shl_tl(s->tmp0, s->tmp0, s->T1);
-if (s->prefix & PREFIX_LOCK) {
-switch (op) {
-case 0: /* bt */
-/* Needs no atomic ops; we suppressed the normal
-   memory load for LOCK above so do it now.  */
-gen_op_ld_v(s, ot, s->T0, s->A0);
-break;
-case 1: /* bts */
-tcg_gen_atomic_fetch_or_tl(s->T0, s->A0, s->tmp0,
-   s->mem_index, ot | MO_LE);
-break;
-case 2: /* btr */
-tcg_gen_not_tl(s->tmp0, s->tmp0);
-tcg_gen_atomic_fetch_and_tl(s->T0, s->A0, s->tmp0,
-s->mem_index, ot | MO_LE);
-break;
-default:
-case 3: /* btc */
-tcg_gen_atomic_fetch_xor_tl(s->T0, s->A0, s->tmp0,
-s->mem_index, ot | MO_LE);
-break;
-}
-tcg_gen_shr_tl(s->tmp4, s->T0, s->T1);
-} else {
-tcg_gen_shr_tl(s->tmp4, s->T0, s->T1);
-switch (op) {
-case 0: /* bt */
-/* Data alre

[PULL 08/23] target/i386: convert CMPXCHG8B/CMPXCHG16B to new decoder

2024-06-21 Thread Paolo Bonzini
This moves the last LOCK-enabled instructions to the new decoder.  It is now
possible to assume that gen_multi0F is called only after checking that
PREFIX_LOCK was not specified.

The gen_cmpxchg8b and gen_cmpxchg16b functions even have the correct
prototype already; the only thing that needs to be done is removing the
gen_lea_modrm() call.

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/decode-new.h |   2 +
 target/i386/tcg/translate.c  | 121 +--
 target/i386/tcg/decode-new.c.inc |  34 ++---
 target/i386/tcg/emit.c.inc   |  96 
 4 files changed, 124 insertions(+), 129 deletions(-)

diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index bebc77bd54b..7f23d373ea7 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -114,6 +114,8 @@ typedef enum X86CPUIDFeature {
 X86_FEAT_CLWB,
 X86_FEAT_CMOV,
 X86_FEAT_CMPCCXADD,
+X86_FEAT_CX8,
+X86_FEAT_CX16,
 X86_FEAT_F16C,
 X86_FEAT_FMA,
 X86_FEAT_FSGSBASE,
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 1d845ff66bb..c60f18c7482 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2298,104 +2298,6 @@ static void gen_sty_env_A0(DisasContext *s, int offset, 
bool align)
 tcg_gen_qemu_st_i128(t, s->tmp0, mem_index, mop);
 }
 
-static void gen_cmpxchg8b(DisasContext *s, X86DecodedInsn *decode)
-{
-TCGv_i64 cmp, val, old;
-TCGv Z;
-
-gen_lea_modrm(s, decode);
-
-cmp = tcg_temp_new_i64();
-val = tcg_temp_new_i64();
-old = tcg_temp_new_i64();
-
-/* Construct the comparison values from the register pair. */
-tcg_gen_concat_tl_i64(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
-tcg_gen_concat_tl_i64(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
-
-/* Only require atomic with LOCK; non-parallel handled in generator. */
-if (s->prefix & PREFIX_LOCK) {
-tcg_gen_atomic_cmpxchg_i64(old, s->A0, cmp, val, s->mem_index, 
MO_TEUQ);
-} else {
-tcg_gen_nonatomic_cmpxchg_i64(old, s->A0, cmp, val,
-  s->mem_index, MO_TEUQ);
-}
-
-/* Set tmp0 to match the required value of Z. */
-tcg_gen_setcond_i64(TCG_COND_EQ, cmp, old, cmp);
-Z = tcg_temp_new();
-tcg_gen_trunc_i64_tl(Z, cmp);
-
-/*
- * Extract the result values for the register pair.
- * For 32-bit, we may do this unconditionally, because on success (Z=1),
- * the old value matches the previous value in EDX:EAX.  For x86_64,
- * the store must be conditional, because we must leave the source
- * registers unchanged on success, and zero-extend the writeback
- * on failure (Z=0).
- */
-if (TARGET_LONG_BITS == 32) {
-tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], old);
-} else {
-TCGv zero = tcg_constant_tl(0);
-
-tcg_gen_extr_i64_tl(s->T0, s->T1, old);
-tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EAX], Z, zero,
-   s->T0, cpu_regs[R_EAX]);
-tcg_gen_movcond_tl(TCG_COND_EQ, cpu_regs[R_EDX], Z, zero,
-   s->T1, cpu_regs[R_EDX]);
-}
-
-/* Update Z. */
-gen_compute_eflags(s);
-tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, Z, ctz32(CC_Z), 1);
-}
-
-#ifdef TARGET_X86_64
-static void gen_cmpxchg16b(DisasContext *s, X86DecodedInsn *decode)
-{
-MemOp mop = MO_TE | MO_128 | MO_ALIGN;
-TCGv_i64 t0, t1;
-TCGv_i128 cmp, val;
-
-gen_lea_modrm(s, decode);
-
-cmp = tcg_temp_new_i128();
-val = tcg_temp_new_i128();
-tcg_gen_concat_i64_i128(cmp, cpu_regs[R_EAX], cpu_regs[R_EDX]);
-tcg_gen_concat_i64_i128(val, cpu_regs[R_EBX], cpu_regs[R_ECX]);
-
-/* Only require atomic with LOCK; non-parallel handled in generator. */
-if (s->prefix & PREFIX_LOCK) {
-tcg_gen_atomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, mop);
-} else {
-tcg_gen_nonatomic_cmpxchg_i128(val, s->A0, cmp, val, s->mem_index, 
mop);
-}
-
-tcg_gen_extr_i128_i64(s->T0, s->T1, val);
-
-/* Determine success after the fact. */
-t0 = tcg_temp_new_i64();
-t1 = tcg_temp_new_i64();
-tcg_gen_xor_i64(t0, s->T0, cpu_regs[R_EAX]);
-tcg_gen_xor_i64(t1, s->T1, cpu_regs[R_EDX]);
-tcg_gen_or_i64(t0, t0, t1);
-
-/* Update Z. */
-gen_compute_eflags(s);
-tcg_gen_setcondi_i64(TCG_COND_EQ, t0, t0, 0);
-tcg_gen_deposit_tl(cpu_cc_src, cpu_cc_src, t0, ctz32(CC_Z), 1);
-
-/*
- * Extract the result values for the register pair.  We may do this
- * unconditionally, because on success (Z=1), the old value matches
- * the previous value in RDX:RAX.
- */
-tcg_gen_mov_i64(cpu_regs[R_EAX], s->T0);
-tcg_gen_mov_i64(cpu_regs[R_EDX], s->T1);
-}
-#endif
-
 #include "emit.c.inc"
 
 static void gen_x87(DisasContext *s, X86DecodedInsn *decode)
@@ -2971,29 +2873,10 @@ static void gen_multi0F(DisasContext 

[PULL 23/23] exec: don't use void* in pointer arithmetic in headers

2024-06-21 Thread Paolo Bonzini
From: Roman Kiryanov 

void* pointer arithmetic is a GCC extension which might not be
available in other build tools (e.g. C++). This change removes this
assumption.

Signed-off-by: Roman Kiryanov 
Suggested-by: Paolo Bonzini 
Link: https://lore.kernel.org/r/20240620201654.598024-1-r...@google.com
Signed-off-by: Paolo Bonzini 
---
 include/exec/memory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index d7591a60d9f..08ecd7e195d 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2796,7 +2796,7 @@ MemTxResult address_space_write_rom(AddressSpace *as, 
hwaddr addr,
 #include "exec/memory_ldst_phys.h.inc"
 
 struct MemoryRegionCache {
-void *ptr;
+uint8_t *ptr;
 hwaddr xlat;
 hwaddr len;
 FlatView *fv;
-- 
2.45.2




[PULL 20/23] block: remove separate bdrv_file_open callback

2024-06-21 Thread Paolo Bonzini
bdrv_file_open and bdrv_open are completely equivalent, they are
never checked except to see which one to invoke.  So merge them
into a single one.

Signed-off-by: Paolo Bonzini 
---
 include/block/block_int-common.h | 3 ---
 block.c  | 4 +---
 block/blkdebug.c | 2 +-
 block/blkio.c| 2 +-
 block/blkverify.c| 2 +-
 block/curl.c | 8 
 block/file-posix.c   | 8 
 block/file-win32.c   | 4 ++--
 block/gluster.c  | 6 +++---
 block/iscsi.c| 4 ++--
 block/nbd.c  | 6 +++---
 block/nfs.c  | 2 +-
 block/null.c | 4 ++--
 block/nvme.c | 2 +-
 block/rbd.c  | 3 ++-
 block/ssh.c  | 2 +-
 block/vvfat.c| 2 +-
 17 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 761276127ed..ebb4e56a503 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -248,9 +248,6 @@ struct BlockDriver {
 int GRAPH_UNLOCKED_PTR (*bdrv_open)(
 BlockDriverState *bs, QDict *options, int flags, Error **errp);
 
-/* Protocol drivers should implement this instead of bdrv_open */
-int GRAPH_UNLOCKED_PTR (*bdrv_file_open)(
-BlockDriverState *bs, QDict *options, int flags, Error **errp);
 void (*bdrv_close)(BlockDriverState *bs);
 
 int coroutine_fn GRAPH_UNLOCKED_PTR (*bdrv_co_create)(
diff --git a/block.c b/block.c
index dd14ba85fc3..c1cc313d216 100644
--- a/block.c
+++ b/block.c
@@ -1655,9 +1655,7 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, 
const char *node_name,
 bs->opaque = g_malloc0(drv->instance_size);
 
 assert(!drv->bdrv_needs_filename || bs->filename[0]);
-if (drv->bdrv_file_open) {
-ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
-} else if (drv->bdrv_open) {
+if (drv->bdrv_open) {
 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
 } else {
 ret = 0;
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 9da8c9eddc2..c95c818c388 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -1073,7 +1073,7 @@ static BlockDriver bdrv_blkdebug = {
 .is_filter  = true,
 
 .bdrv_parse_filename= blkdebug_parse_filename,
-.bdrv_file_open = blkdebug_open,
+.bdrv_open  = blkdebug_open,
 .bdrv_close = blkdebug_close,
 .bdrv_reopen_prepare= blkdebug_reopen_prepare,
 .bdrv_child_perm= blkdebug_child_perm,
diff --git a/block/blkio.c b/block/blkio.c
index 882e1c297b4..1a38064ce76 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -1088,7 +1088,7 @@ static void blkio_refresh_limits(BlockDriverState *bs, 
Error **errp)
  */
 #define BLKIO_DRIVER_COMMON \
 .instance_size   = sizeof(BDRVBlkioState), \
-.bdrv_file_open  = blkio_file_open, \
+.bdrv_open   = blkio_file_open, \
 .bdrv_close  = blkio_close, \
 .bdrv_co_getlength   = blkio_co_getlength, \
 .bdrv_co_truncate= blkio_truncate, \
diff --git a/block/blkverify.c b/block/blkverify.c
index ec45d8335ed..5a9bf674d9c 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -321,7 +321,7 @@ static BlockDriver bdrv_blkverify = {
 .instance_size= sizeof(BDRVBlkverifyState),
 
 .bdrv_parse_filename  = blkverify_parse_filename,
-.bdrv_file_open   = blkverify_open,
+.bdrv_open= blkverify_open,
 .bdrv_close   = blkverify_close,
 .bdrv_child_perm  = bdrv_default_perms,
 .bdrv_co_getlength= blkverify_co_getlength,
diff --git a/block/curl.c b/block/curl.c
index 419f7c89ef2..ef5252d00b5 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -1034,7 +1034,7 @@ static BlockDriver bdrv_http = {
 
 .instance_size  = sizeof(BDRVCURLState),
 .bdrv_parse_filename= curl_parse_filename,
-.bdrv_file_open = curl_open,
+.bdrv_open  = curl_open,
 .bdrv_close = curl_close,
 .bdrv_co_getlength  = curl_co_getlength,
 
@@ -1053,7 +1053,7 @@ static BlockDriver bdrv_https = {
 
 .instance_size  = sizeof(BDRVCURLState),
 .bdrv_parse_filename= curl_parse_filename,
-.bdrv_file_open = curl_open,
+.bdrv_open  = curl_open,
 .bdrv_close = curl_close,
 .bdrv_co_getlength  = curl_co_getlength,
 
@@ -1072,7 +1072,7 @@ static BlockDriver bdrv_ftp = {
 
 .instance_size  = sizeof(BDRVCURLState),
 .bdrv_parse_filename= curl_parse_filename,
-.bdrv_file_open = curl_open,
+.bdrv_op

[PULL 19/23] block: do not check bdrv_file_open

2024-06-21 Thread Paolo Bonzini
The set of BlockDrivers that have .bdrv_file_open coincides with those
that have .protocol_name and guess what---checking drv->bdrv_file_open
is done to see if the driver is a protocol.  So check drv->protocol_name
instead.

Signed-off-by: Paolo Bonzini 
---
 block.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 69a2905178a..dd14ba85fc3 100644
--- a/block.c
+++ b/block.c
@@ -926,7 +926,6 @@ BlockDriver *bdrv_find_protocol(const char *filename,
 int i;
 
 GLOBAL_STATE_CODE();
-/* TODO Drivers without bdrv_file_open must be specified explicitly */
 
 /*
  * XXX(hch): we really should not let host device detection
@@ -1983,7 +1982,7 @@ static int bdrv_open_common(BlockDriverState *bs, 
BlockBackend *file,
 open_flags = bdrv_open_flags(bs, bs->open_flags);
 node_name = qemu_opt_get(opts, "node-name");
 
-assert(!drv->bdrv_file_open || file == NULL);
+assert(!drv->protocol_name || file == NULL);
 ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
 if (ret < 0) {
 goto fail_opts;
@@ -2084,7 +2083,7 @@ static int bdrv_fill_options(QDict **options, const char 
*filename,
 }
 /* If the user has explicitly specified the driver, this choice should
  * override the BDRV_O_PROTOCOL flag */
-protocol = drv->bdrv_file_open;
+protocol = drv->protocol_name;
 }
 
 if (protocol) {
@@ -4123,7 +4122,7 @@ bdrv_open_inherit(const char *filename, const char 
*reference, QDict *options,
 }
 
 /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
-assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
+assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->protocol_name);
 /* file must be NULL if a protocol BDS is about to be created
  * (the inverse results in an error message from bdrv_open_common()) */
 assert(!(flags & BDRV_O_PROTOCOL) || !file);
@@ -5971,7 +5970,7 @@ int64_t coroutine_fn 
bdrv_co_get_allocated_file_size(BlockDriverState *bs)
 return drv->bdrv_co_get_allocated_file_size(bs);
 }
 
-if (drv->bdrv_file_open) {
+if (drv->protocol_name) {
 /*
  * Protocol drivers default to -ENOTSUP (most of their data is
  * not stored in any of their children (if they even have any),
@@ -8030,7 +8029,7 @@ void bdrv_refresh_filename(BlockDriverState *bs)
  *   Both of these conditions are represented by 
generate_json_filename.
  */
 if (primary_child_bs->exact_filename[0] &&
-primary_child_bs->drv->bdrv_file_open &&
+primary_child_bs->drv->protocol_name &&
 !drv->is_filter && !generate_json_filename)
 {
 strcpy(bs->exact_filename, primary_child_bs->exact_filename);
-- 
2.45.2




[PULL 18/23] block: make assertion more generic

2024-06-21 Thread Paolo Bonzini
.bdrv_needs_filename is only set for drivers that also set bdrv_file_open,
i.e. protocol drivers.

So we can make the assertion unconditional; it will always pass for those
drivers that use bdrv_open.

Signed-off-by: Paolo Bonzini 
---
 block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 468cf5e67d7..69a2905178a 100644
--- a/block.c
+++ b/block.c
@@ -1655,8 +1655,8 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, 
const char *node_name,
 bs->drv = drv;
 bs->opaque = g_malloc0(drv->instance_size);
 
+assert(!drv->bdrv_needs_filename || bs->filename[0]);
 if (drv->bdrv_file_open) {
-assert(!drv->bdrv_needs_filename || bs->filename[0]);
 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
 } else if (drv->bdrv_open) {
 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
-- 
2.45.2




[PULL 12/23] target/i386: remove gen_ext_tl

2024-06-21 Thread Paolo Bonzini
With the introduction of tcg_gen_ext_tl, most uses can be converted directly
because they do not have a NULL destination.  tcg_gen_ext_tl is able to drop
no-ops like "tcg_gen_ext_tl(tcgv, tcgv, MO_TL)" just fine, and the only thing
that gen_ext_tl was adding on top was avoiding the creation of a useless
temporary.  This can be done in the only place where it matters, which is
gen_op_j_ecx.

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/translate.c | 41 +++--
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index d11c5e1dc13..5c9c992400e 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -697,23 +697,16 @@ static inline TCGv gen_compute_Dshift(DisasContext *s, 
MemOp ot)
 return dshift;
 };
 
-static TCGv gen_ext_tl(TCGv dst, TCGv src, MemOp size, bool sign)
-{
-if (size == MO_TL) {
-return src;
-}
-if (!dst) {
-dst = tcg_temp_new();
-}
-tcg_gen_ext_tl(dst, src, size | (sign ? MO_SIGN : 0));
-return dst;
-}
-
 static void gen_op_j_ecx(DisasContext *s, TCGCond cond, TCGLabel *label1)
 {
-TCGv tmp = gen_ext_tl(NULL, cpu_regs[R_ECX], s->aflag, false);
-
-tcg_gen_brcondi_tl(cond, tmp, 0, label1);
+TCGv lhs;
+if (s->aflag == MO_TL) {
+lhs = cpu_regs[R_ECX];
+} else {
+lhs = tcg_temp_new();
+tcg_gen_ext_tl(lhs, cpu_regs[R_ECX], s->aflag);
+}
+tcg_gen_brcondi_tl(cond, lhs, 0, label1);
 }
 
 static inline void gen_op_jz_ecx(DisasContext *s, TCGLabel *label1)
@@ -886,16 +879,16 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, 
TCGv reg)
 case CC_OP_SUBB ... CC_OP_SUBQ:
 /* (DATA_TYPE)CC_SRCT < (DATA_TYPE)CC_SRC */
 size = s->cc_op - CC_OP_SUBB;
-gen_ext_tl(s->cc_srcT, s->cc_srcT, size, false);
-gen_ext_tl(cpu_cc_src, cpu_cc_src, size, false);
+tcg_gen_ext_tl(s->cc_srcT, s->cc_srcT, size);
+tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size);
 return (CCPrepare) { .cond = TCG_COND_LTU, .reg = s->cc_srcT,
  .reg2 = cpu_cc_src, .use_reg2 = true };
 
 case CC_OP_ADDB ... CC_OP_ADDQ:
 /* (DATA_TYPE)CC_DST < (DATA_TYPE)CC_SRC */
 size = s->cc_op - CC_OP_ADDB;
-gen_ext_tl(cpu_cc_dst, cpu_cc_dst, size, false);
-gen_ext_tl(cpu_cc_src, cpu_cc_src, size, false);
+tcg_gen_ext_tl(cpu_cc_dst, cpu_cc_dst, size);
+tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size);
 return (CCPrepare) { .cond = TCG_COND_LTU, .reg = cpu_cc_dst,
  .reg2 = cpu_cc_src, .use_reg2 = true };
 
@@ -920,7 +913,7 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv 
reg)
 
 case CC_OP_BMILGB ... CC_OP_BMILGQ:
 size = s->cc_op - CC_OP_BMILGB;
-gen_ext_tl(cpu_cc_src, cpu_cc_src, size, false);
+tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size);
 return (CCPrepare) { .cond = TCG_COND_EQ, .reg = cpu_cc_src };
 
 case CC_OP_ADCX:
@@ -1050,8 +1043,8 @@ static CCPrepare gen_prepare_cc(DisasContext *s, int b, 
TCGv reg)
 size = s->cc_op - CC_OP_SUBB;
 switch (jcc_op) {
 case JCC_BE:
-gen_ext_tl(s->cc_srcT, s->cc_srcT, size, false);
-gen_ext_tl(cpu_cc_src, cpu_cc_src, size, false);
+tcg_gen_ext_tl(s->cc_srcT, s->cc_srcT, size);
+tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size);
 cc = (CCPrepare) { .cond = TCG_COND_LEU, .reg = s->cc_srcT,
.reg2 = cpu_cc_src, .use_reg2 = true };
 break;
@@ -1061,8 +1054,8 @@ static CCPrepare gen_prepare_cc(DisasContext *s, int b, 
TCGv reg)
 case JCC_LE:
 cond = TCG_COND_LE;
 fast_jcc_l:
-gen_ext_tl(s->cc_srcT, s->cc_srcT, size, true);
-gen_ext_tl(cpu_cc_src, cpu_cc_src, size, true);
+tcg_gen_ext_tl(s->cc_srcT, s->cc_srcT, size | MO_SIGN);
+tcg_gen_ext_tl(cpu_cc_src, cpu_cc_src, size | MO_SIGN);
 cc = (CCPrepare) { .cond = cond, .reg = s->cc_srcT,
.reg2 = cpu_cc_src, .use_reg2 = true };
 break;
-- 
2.45.2




[PULL 16/23] meson: allow configuring the x86-64 baseline

2024-06-21 Thread Paolo Bonzini
Add a Meson option to configure which x86-64 instruction
set to use.  QEMU will now default to x86-64-v1 + cmpxchg16b for
64-bit builds (that corresponds to a Pentium 4 for 32-bit builds).

The baseline can be tuned down to Pentium Pro for 32-bit builds (with
-Dx86_version=0), or up as desired.

Acked-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 meson.build   | 41 ---
 meson_options.txt |  3 +++
 scripts/meson-buildoptions.sh |  3 +++
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/meson.build b/meson.build
index 97e00d6f59b..6e694ecd9fe 100644
--- a/meson.build
+++ b/meson.build
@@ -336,15 +336,40 @@ if host_arch == 'i386' and not cc.links('''
   qemu_common_flags = ['-march=i486'] + qemu_common_flags
 endif
 
-# Assume x86-64-v2 (minus CMPXCHG16B for 32-bit code)
-if host_arch == 'i386'
-  qemu_common_flags = ['-mfpmath=sse'] + qemu_common_flags
-endif
+# Pick x86-64 baseline version
 if host_arch in ['i386', 'x86_64']
-  qemu_common_flags = ['-mpopcnt', '-msse4.2'] + qemu_common_flags
-endif
-if host_arch == 'x86_64'
-  qemu_common_flags = ['-mcx16'] + qemu_common_flags
+  if get_option('x86_version') == '0' and host_arch == 'x86_64'
+error('x86_64-v1 required for x86-64 hosts')
+  endif
+
+  # add flags for individual instruction set extensions
+  if get_option('x86_version') >= '1'
+if host_arch == 'i386'
+  qemu_common_flags = ['-mfpmath=sse'] + qemu_common_flags
+else
+  # present on basically all processors but technically not part of
+  # x86-64-v1, so only include -mneeded for x86-64 version 2 and above
+  qemu_common_flags = ['-mcx16'] + qemu_common_flags
+endif
+  endif
+  if get_option('x86_version') >= '2'
+qemu_common_flags = ['-mpopcnt'] + qemu_common_flags
+qemu_common_flags = cc.get_supported_arguments('-mneeded') + 
qemu_common_flags
+  endif
+  if get_option('x86_version') >= '3'
+qemu_common_flags = ['-mmovbe', '-mabm', '-mbmi1', '-mbmi2', '-mfma', 
'-mf16c'] + qemu_common_flags
+  endif
+
+  # add required vector instruction set (each level implies those below)
+  if get_option('x86_version') == '1'
+qemu_common_flags = ['-msse2'] + qemu_common_flags
+  elif get_option('x86_version') == '2'
+qemu_common_flags = ['-msse4.2'] + qemu_common_flags
+  elif get_option('x86_version') == '3'
+qemu_common_flags = ['-mavx2'] + qemu_common_flags
+  elif get_option('x86_version') == '4'
+qemu_common_flags = ['-mavx512f', '-mavx512bw', '-mavx512cd', 
'-mavx512dq', '-mavx512vl'] + qemu_common_flags
+  endif
 endif
 
 if get_option('prefer_static')
diff --git a/meson_options.txt b/meson_options.txt
index 7a79dd89706..6065ed2d352 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -370,3 +370,6 @@ option('qemu_ga_version', type: 'string', value: '',
 
 option('hexagon_idef_parser', type : 'boolean', value : true,
description: 'use idef-parser to automatically generate TCG code for 
the Hexagon frontend')
+
+option('x86_version', type : 'combo', choices : ['0', '1', '2', '3', '4'], 
value: '1',
+   description: 'tweak required x86_64 architecture version beyond 
compiler default')
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 58d49a447d5..62842d47e88 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -82,6 +82,8 @@ meson_options_help() {
   printf "%s\n" '  --with-suffix=VALUE  Suffix for QEMU 
data/modules/config directories'
   printf "%s\n" '   (can be empty) [qemu]'
   printf "%s\n" '  --with-trace-file=VALUE  Trace file prefix for simple 
backend [trace]'
+  printf "%s\n" '  --x86-version=CHOICE tweak required x86_64 architecture 
version beyond'
+  printf "%s\n" '   compiler default [1] (choices: 
0/1/2/3)'
   printf "%s\n" ''
   printf "%s\n" 'Optional features, enabled with --enable-FEATURE and'
   printf "%s\n" 'disabled with --disable-FEATURE, default is enabled if 
available'
@@ -552,6 +554,7 @@ _meson_option_parse() {
 --disable-werror) printf "%s" -Dwerror=false ;;
 --enable-whpx) printf "%s" -Dwhpx=enabled ;;
 --disable-whpx) printf "%s" -Dwhpx=disabled ;;
+--x86-version=*) quote_sh "-Dx86_version=$2" ;;
 --enable-xen) printf "%s" -Dxen=enabled ;;
 --disable-xen) printf "%s" -Dxen=disabled ;;
 --enable-xen-pci-passthrough) printf "%s" -Dxen_pci_passthrough=enabled ;;
-- 
2.45.2




[PULL 13/23] Revert "host/i386: assume presence of POPCNT"

2024-06-21 Thread Paolo Bonzini
This reverts commit 45ccdbcb24baf99667997fac5cf60318e5e7db51.
The x86-64 instruction set can now be tuned down to x86-64 v1
or i386 Pentium Pro.

Signed-off-by: Paolo Bonzini 
---
 host/include/i386/host/cpuinfo.h | 1 +
 tcg/i386/tcg-target.h| 5 +++--
 util/cpuinfo-i386.c  | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/host/include/i386/host/cpuinfo.h b/host/include/i386/host/cpuinfo.h
index c1e94d75ce1..72f6fad61e5 100644
--- a/host/include/i386/host/cpuinfo.h
+++ b/host/include/i386/host/cpuinfo.h
@@ -11,6 +11,7 @@
 #define CPUINFO_ALWAYS  (1u << 0)  /* so cpuinfo is nonzero */
 #define CPUINFO_MOVBE   (1u << 2)
 #define CPUINFO_LZCNT   (1u << 3)
+#define CPUINFO_POPCNT  (1u << 4)
 #define CPUINFO_BMI1(1u << 5)
 #define CPUINFO_BMI2(1u << 6)
 #define CPUINFO_AVX1(1u << 9)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index ecc69827287..2f67a97e059 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -111,6 +111,7 @@ typedef enum {
 #endif
 
 #define have_bmi1 (cpuinfo & CPUINFO_BMI1)
+#define have_popcnt   (cpuinfo & CPUINFO_POPCNT)
 #define have_avx1 (cpuinfo & CPUINFO_AVX1)
 #define have_avx2 (cpuinfo & CPUINFO_AVX2)
 #define have_movbe(cpuinfo & CPUINFO_MOVBE)
@@ -142,7 +143,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nor_i32  0
 #define TCG_TARGET_HAS_clz_i32  1
 #define TCG_TARGET_HAS_ctz_i32  1
-#define TCG_TARGET_HAS_ctpop_i321
+#define TCG_TARGET_HAS_ctpop_i32have_popcnt
 #define TCG_TARGET_HAS_deposit_i32  1
 #define TCG_TARGET_HAS_extract_i32  1
 #define TCG_TARGET_HAS_sextract_i32 1
@@ -177,7 +178,7 @@ typedef enum {
 #define TCG_TARGET_HAS_nor_i64  0
 #define TCG_TARGET_HAS_clz_i64  1
 #define TCG_TARGET_HAS_ctz_i64  1
-#define TCG_TARGET_HAS_ctpop_i641
+#define TCG_TARGET_HAS_ctpop_i64have_popcnt
 #define TCG_TARGET_HAS_deposit_i64  1
 #define TCG_TARGET_HAS_extract_i64  1
 #define TCG_TARGET_HAS_sextract_i64 0
diff --git a/util/cpuinfo-i386.c b/util/cpuinfo-i386.c
index 8f2694d88f2..6d474a6259a 100644
--- a/util/cpuinfo-i386.c
+++ b/util/cpuinfo-i386.c
@@ -35,6 +35,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
 __cpuid(1, a, b, c, d);
 
 info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
+info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
 info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0);
 
 /* NOTE: our AES support requires SSSE3 (PSHUFB) as well. */
-- 
2.45.2




[PULL 03/23] target/i386: use cpu_cc_dst for CC_OP_POPCNT

2024-06-21 Thread Paolo Bonzini
It is the only CCOp, among those that compute ZF from one of the cc_op_*
registers, that uses cpu_cc_src.  Do not make it the odd one out;
instead use cpu_cc_dst like the others.

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/cpu.h   | 2 +-
 target/i386/tcg/cc_helper.c | 2 +-
 target/i386/tcg/translate.c | 4 ++--
 target/i386/tcg/emit.c.inc  | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 7e2a9b56aea..f54cd93b3f9 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1332,7 +1332,7 @@ typedef enum {
 CC_OP_BMILGQ,
 
 CC_OP_CLR, /* Z set, all other flags clear.  */
-CC_OP_POPCNT, /* Z via CC_SRC, all other flags clear.  */
+CC_OP_POPCNT, /* Z via CC_DST, all other flags clear.  */
 
 CC_OP_NB,
 } CCOp;
diff --git a/target/i386/tcg/cc_helper.c b/target/i386/tcg/cc_helper.c
index f76e9cb8cfb..301ed954064 100644
--- a/target/i386/tcg/cc_helper.c
+++ b/target/i386/tcg/cc_helper.c
@@ -107,7 +107,7 @@ target_ulong helper_cc_compute_all(target_ulong dst, 
target_ulong src1,
 case CC_OP_CLR:
 return CC_Z | CC_P;
 case CC_OP_POPCNT:
-return src1 ? 0 : CC_Z;
+return dst ? 0 : CC_Z;
 
 case CC_OP_MULB:
 return compute_all_mulb(dst, src1);
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index ad1819815ab..eb353dc3c9f 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -324,7 +324,7 @@ static const uint8_t cc_op_live[CC_OP_NB] = {
 [CC_OP_ADOX] = USES_CC_SRC | USES_CC_SRC2,
 [CC_OP_ADCOX] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2,
 [CC_OP_CLR] = 0,
-[CC_OP_POPCNT] = USES_CC_SRC,
+[CC_OP_POPCNT] = USES_CC_DST,
 };
 
 static void set_cc_op_1(DisasContext *s, CCOp op, bool dirty)
@@ -1020,7 +1020,7 @@ static CCPrepare gen_prepare_eflags_z(DisasContext *s, 
TCGv reg)
 case CC_OP_CLR:
 return (CCPrepare) { .cond = TCG_COND_ALWAYS };
 case CC_OP_POPCNT:
-return (CCPrepare) { .cond = TCG_COND_EQ, .reg = cpu_cc_src };
+return (CCPrepare) { .cond = TCG_COND_EQ, .reg = cpu_cc_dst };
 default:
 {
 MemOp size = (s->cc_op - CC_OP_ADDB) & 3;
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 11faa70b5e2..fc7477833bc 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -2804,10 +2804,10 @@ static void gen_POPA(DisasContext *s, X86DecodedInsn 
*decode)
 
 static void gen_POPCNT(DisasContext *s, X86DecodedInsn *decode)
 {
-decode->cc_src = tcg_temp_new();
+decode->cc_dst = tcg_temp_new();
 decode->cc_op = CC_OP_POPCNT;
 
-tcg_gen_mov_tl(decode->cc_src, s->T0);
+tcg_gen_mov_tl(decode->cc_dst, s->T0);
 tcg_gen_ctpop_tl(s->T0, s->T0);
 }
 
-- 
2.45.2




[PULL 09/23] target/i386: do not check PREFIX_LOCK in old-style decoder

2024-06-21 Thread Paolo Bonzini
It is already checked before getting there.

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/translate.c | 26 --
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index c60f18c7482..501a1ef9313 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2878,7 +2878,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 switch ((modrm >> 3) & 7) {
 case 7:
 if (mod != 3 ||
-(s->prefix & (PREFIX_LOCK | PREFIX_REPNZ))) {
+(s->prefix & PREFIX_REPNZ)) {
 goto illegal_op;
 }
 if (s->prefix & PREFIX_REPZ) {
@@ -2898,7 +2898,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 
 case 6: /* RDRAND */
 if (mod != 3 ||
-(s->prefix & (PREFIX_LOCK | PREFIX_REPZ | PREFIX_REPNZ)) ||
+(s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) ||
 !(s->cpuid_ext_features & CPUID_EXT_RDRAND)) {
 goto illegal_op;
 }
@@ -3058,8 +3058,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 
 case 0xd0: /* xgetbv */
 if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
-|| (s->prefix & (PREFIX_LOCK | PREFIX_DATA
- | PREFIX_REPZ | PREFIX_REPNZ))) {
+|| (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
 goto illegal_op;
 }
 tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[R_ECX]);
@@ -3069,8 +3068,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 
 case 0xd1: /* xsetbv */
 if ((s->cpuid_ext_features & CPUID_EXT_XSAVE) == 0
-|| (s->prefix & (PREFIX_LOCK | PREFIX_DATA
- | PREFIX_REPZ | PREFIX_REPNZ))) {
+|| (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
 goto illegal_op;
 }
 gen_svm_check_intercept(s, SVM_EXIT_XSETBV);
@@ -3237,8 +3235,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 gen_st_modrm(s, decode, ot);
 break;
 case 0xee: /* rdpkru */
-if (s->prefix & (PREFIX_LOCK | PREFIX_DATA
- | PREFIX_REPZ | PREFIX_REPNZ)) {
+if (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ)) {
 goto illegal_op;
 }
 tcg_gen_trunc_tl_i32(s->tmp2_i32, cpu_regs[R_ECX]);
@@ -3246,8 +3243,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 tcg_gen_extr_i64_tl(cpu_regs[R_EAX], cpu_regs[R_EDX], s->tmp1_i64);
 break;
 case 0xef: /* wrpkru */
-if (s->prefix & (PREFIX_LOCK | PREFIX_DATA
- | PREFIX_REPZ | PREFIX_REPNZ)) {
+if (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ)) {
 goto illegal_op;
 }
 tcg_gen_concat_tl_i64(s->tmp1_i64, cpu_regs[R_EAX],
@@ -3323,7 +3319,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 if (prefixes & PREFIX_REPZ) {
 /* bndcl */
 if (reg >= 4
-|| (prefixes & PREFIX_LOCK)
 || s->aflag == MO_16) {
 goto illegal_op;
 }
@@ -3331,7 +3326,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 } else if (prefixes & PREFIX_REPNZ) {
 /* bndcu */
 if (reg >= 4
-|| (prefixes & PREFIX_LOCK)
 || s->aflag == MO_16) {
 goto illegal_op;
 }
@@ -3345,7 +3339,7 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 }
 if (mod == 3) {
 int reg2 = (modrm & 7) | REX_B(s);
-if (reg2 >= 4 || (prefixes & PREFIX_LOCK)) {
+if (reg2 >= 4) {
 goto illegal_op;
 }
 if (s->flags & HF_MPX_IU_MASK) {
@@ -3374,7 +3368,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 /* bndldx */
 AddressParts a = decode->mem;
 if (reg >= 4
-|| (prefixes & PREFIX_LOCK)
 || s->aflag == MO_16
 || a.base < -1) {
 goto illegal_op;
@@ -3410,7 +3403,6 @@ static void gen_multi0F(DisasContext *s, X86DecodedInsn 
*decode)
 if (mod != 3 && (prefixes & PREFIX_REPZ)) {
 /* bndmk */
 if (reg >= 4
-|| (prefixes & PREFIX_LOCK)
 || s->aflag == MO_16) {
 

[PULL 11/23] target/i386: assert that cc_op* and pc_save are preserved

2024-06-21 Thread Paolo Bonzini
Now all decoding has been done before any code generation.
There is no need anymore to save and restore cc_op* and
pc_save but, for the time being, assert that this is indeed
the case.

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/translate.c | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 501a1ef9313..d11c5e1dc13 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3709,15 +3709,9 @@ static void i386_tr_translate_insn(DisasContextBase 
*dcbase, CPUState *cpu)
 case 2:
 /* Restore state that may affect the next instruction. */
 dc->pc = dc->base.pc_next;
-/*
- * TODO: These save/restore can be removed after the table-based
- * decoder is complete; we will be decoding the insn completely
- * before any code generation that might affect these variables.
- */
-dc->cc_op_dirty = orig_cc_op_dirty;
-dc->cc_op = orig_cc_op;
-dc->pc_save = orig_pc_save;
-/* END TODO */
+assert(dc->cc_op_dirty == orig_cc_op_dirty);
+assert(dc->cc_op == orig_cc_op);
+assert(dc->pc_save == orig_pc_save);
 dc->base.num_insns--;
 tcg_remove_ops_after(dc->prev_insn_end);
 dc->base.insn_start = dc->prev_insn_start;
-- 
2.45.2




[PULL 06/23] target/i386: try not to force EFLAGS computation for CC_OP_ADOX/ADCX

2024-06-21 Thread Paolo Bonzini
When computing the "other" flag (CF for CC_OP_ADOX, OF for CC_OP_ADCX),
take into account that it is already in the right position of cpu_cc_src,
just like for CC_OP_EFLAGS.  There is no need to call gen_compute_eflags().

Reviewed-by: Richard Henderson 
Signed-off-by: Paolo Bonzini 
---
 target/i386/tcg/translate.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 257110ac703..08db40681fa 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -928,6 +928,7 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv 
reg)
  .no_setcond = true };
 
 case CC_OP_EFLAGS:
+case CC_OP_ADOX:
 case CC_OP_SARB ... CC_OP_SARQ:
 /* CC_SRC & 1 */
 return (CCPrepare) { .cond = TCG_COND_TSTNE,
@@ -994,6 +995,9 @@ static CCPrepare gen_prepare_eflags_o(DisasContext *s, TCGv 
reg)
 return (CCPrepare) { .cond = TCG_COND_NE, .reg = cpu_cc_src };
 default:
 gen_compute_eflags(s);
+/* fallthrough */
+case CC_OP_EFLAGS:
+case CC_OP_ADCX:
 return (CCPrepare) { .cond = TCG_COND_TSTNE, .reg = cpu_cc_src,
  .imm = CC_O };
 }
-- 
2.45.2




Re: [RFC] vhost: Introduce packed vq and add buffer elements

2024-06-21 Thread Sahil
Hi,

On Wednesday, June 19, 2024 3:49:29 PM GMT+5:30 Eugenio Perez Martin wrote:
> [...]
> Hi Sahil,
> 
> Just some nitpicks here and there,
> 
> > [1] https://wiki.qemu.org/Internships/ProjectIdeas/PackedShadowVirtqueue
> > 
> >  hw/virtio/vhost-shadow-virtqueue.c | 124 -
> >  hw/virtio/vhost-shadow-virtqueue.h |  66 ++-
> >  2 files changed, 167 insertions(+), 23 deletions(-)
> > 
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c
> > b/hw/virtio/vhost-shadow-virtqueue.c index fc5f408f77..e3b276a9e9 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -217,6 +217,122 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue 
> > *svq,
> >  return true;
> >  }
> > 
> > +/**
> > + * Write descriptors to SVQ packed vring
> > + *
> > + * @svq: The shadow virtqueue
> > + * @sg: Cache for hwaddr
> > + * @out_sg: The iovec from the guest that is read-only for device
> > + * @out_num: iovec length
> > + * @in_sg: The iovec from the guest that is write-only for device
> > + * @in_num: iovec length
> > + * @head_flags: flags for first descriptor in list
> > + *
> > + * Return true if success, false otherwise and print error.
> > + */
> > +static bool vhost_svq_vring_write_descs_packed(VhostShadowVirtqueue *svq, 
> > hwaddr *sg,
> > +const struct iovec *out_sg, size_t 
> > out_num,
> > +const struct iovec *in_sg, size_t 
> > in_num,
> > +uint16_t *head_flags)
> > +{
> > +uint16_t id, curr, head, i;
> > +unsigned n;
> > +struct vring_packed_desc *descs = svq->vring_packed.vring.desc;
> > +bool ok;
> > +
> > +head = svq->vring_packed.next_avail_idx;
> > +i = head;
> > +id = svq->free_head;
> > +curr = id;
> > +
> > +size_t num = out_num + in_num;
> > +
> > +if (num == 0) {
> > +return true;
> > +}
> 
> num == 0 is impossible now, the caller checks for that.

Oh yes, I missed that.

> 
> > +
> > +ok = vhost_svq_translate_addr(svq, sg, out_sg, out_num);
> > +if (unlikely(!ok)) {
> > +return false;
> > +}
> > +
> > +ok = vhost_svq_translate_addr(svq, sg + out_num, in_sg, in_num);
> > +if (unlikely(!ok)) {
> > +return false;
> > +}
> > +
> > +for (n = 0; n < num; n++) {
> > +uint16_t flags = cpu_to_le16(svq->vring_packed.avail_used_flags |
> > +(n < out_num ? 0 : VRING_DESC_F_WRITE) |
> > +(n + 1 == num ? 0 : VRING_DESC_F_NEXT));
> > +if (i == head) {
> > +*head_flags = flags;
> > +} else {
> > +descs[i].flags = flags;
> > +}
> > +
> > +descs[i].addr = cpu_to_le64(sg[n]);
> > +descs[i].id = id;
> > +if (n < out_num) {
> > +descs[i].len = cpu_to_le32(out_sg[n].iov_len);
> > +} else {
> > +descs[i].len = cpu_to_le32(in_sg[n - out_num].iov_len);
> > +}
> > +
> > +curr = cpu_to_le16(svq->desc_next[curr]);
> > +
> > +if (++i >= svq->vring_packed.vring.num) {
> > +i = 0;
> > +svq->vring_packed.avail_used_flags ^=
> > +1 << VRING_PACKED_DESC_F_AVAIL |
> > +1 << VRING_PACKED_DESC_F_USED;
> > +}
> > +}
> > +
> > +if (i <= head) {
> > +svq->vring_packed.avail_wrap_counter ^= 1;
> > +}
> > +
> > +svq->vring_packed.next_avail_idx = i;
> > +svq->free_head = curr;
> > +return true;
> > +}
> > +
> > +static bool vhost_svq_add_packed(VhostShadowVirtqueue *svq,
> > +const struct iovec *out_sg, size_t out_num,
> > +const struct iovec *in_sg, size_t in_num,
> > +unsigned *head)
> > +{
> > +bool ok;
> > +uint16_t head_flags = 0;
> > +g_autofree hwaddr *sgs = g_new(hwaddr, out_num + in_num);
> > +
> > +*head = svq->vring_packed.next_avail_idx;
> > +
> > +/* We need some descriptors here */
> > +if (unlikely(!out_num && !in_num)) {
> > +qemu_log_mask(LOG_GUEST_ERROR,
> > +  "Guest provided element with no descriptors");
> > +return false;
> > +}
> > +
> > +ok = vhost_svq_vring_write_descs_packed(svq, sgs, out_sg, out_num,
> > +in_sg, in_num, &head_flags);
> > +if (unlikely(!ok)) {
> > +return false;
> > +}
> > +
> 
> Ok now I see why you switched sgs length from MAX to sum. But if we're
> here, why not just embed all vhost_svq_vring_write_descs_packed here?
> vhost_svq_vring_write_descs makes sense to split as we repeat the
> operation, but I think it adds nothing here. What do you think?

You're right. The function is called only once and there is nothing to reuse.
I'll move "vhost_svq_vring_write_descs_packed" to "vhost_svq_add_packed".

> > [...]
> 

Re: [PATCH v3 02/16] migration: Fix file migration with fdset

2024-06-21 Thread Michael Tokarev

17.06.2024 21:57, Fabiano Rosas wrote:

When the "file:" migration support was added we missed the special
case in the qemu_open_old implementation that allows for a particular
file name format to be used to refer to a set of file descriptors that
have been previously provided to QEMU via the add-fd QMP command.

When using this fdset feature, we should not truncate the migration
file because being given an fd means that the management layer is in
control of the file and will likely already have some data written to
it. This is further indicated by the presence of the 'offset'
argument, which indicates the start of the region where QEMU is
allowed to write.

Fix the issue by replacing the O_TRUNC flag on open by an ftruncate
call, which will take the offset into consideration.

Fixes: 385f510df5 ("migration: file URI offset")
Suggested-by: Daniel P. Berrangé 
Reviewed-by: Prasad Pandit 
Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
Signed-off-by: Fabiano Rosas 
---
  migration/file.c | 11 +--
  1 file changed, 9 insertions(+), 2 deletions(-)


Is this stable material?

Thanks,

/mjt


diff --git a/migration/file.c b/migration/file.c
index 2bb8c64092..a903710f06 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -84,12 +84,19 @@ void file_start_outgoing_migration(MigrationState *s,
  
  trace_migration_file_outgoing(filename);
  
-fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC,

- 0600, errp);
+fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY, 0600, errp);
  if (!fioc) {
  return;
  }
  
+if (ftruncate(fioc->fd, offset)) {

+error_setg_errno(errp, errno,
+ "failed to truncate migration file to offset %" 
PRIx64,
+ offset);
+object_unref(OBJECT(fioc));
+return;
+}
+
  outgoing_args.fname = g_strdup(filename);
  
  ioc = QIO_CHANNEL(fioc);


--
GPG Key transition (from rsa2048 to rsa4096) since 2024-04-24.
New key: rsa4096/61AD3D98ECDF2C8E  9D8B E14E 3F2A 9DD7 9199  28F1 61AD 3D98 
ECDF 2C8E
Old key: rsa2048/457CE0A0804465C5  6EE1 95D1 886E 8FFB 810D  4324 457C E0A0 
8044 65C5
Transition statement: http://www.corpit.ru/mjt/gpg-transition-2024.txt




Re: [RFC] vhost: Introduce packed vq and add buffer elements

2024-06-21 Thread Sahil
Hi,

Thank you for your reply.

On Wednesday, June 19, 2024 1:07:54 PM GMT+5:30 Eugenio Perez Martin wrote:
> [...]
> > "curr" is being updated here, but descs[i].id is always set to id which
> > doesn't change in the loop. So all the descriptors in the chain will have
> > the same id. I can't find anything in the virtio specification [1] that
> > suggests that all descriptors in the chain have the same id. Also, going
> > by the figure captioned "Three chained descriptors available" in the blog
> > post on packed virtqueues [2], it looks like the descriptors in the chain
> > have different buffer ids.
> > 
> > The virtio implementation in Linux also reuses the same id value for all
> > the descriptors in a single chain. I am not sure if I am missing
> > something here.
> 
> The code is right, the id that identifies the whole chain is just the
> one on the last descriptor. The key is that all the tail descriptors
> of the chains will have a different id, the rest ids are ignored so it
> is easier this way. I got it wrong in a recent mail in the list, where
> you can find more information. Let me know if you cannot find it :).

I found the mail here [1] :)

> In the split vq is different as a chained descriptor can go back and
> forth in the descriptor ring with the next id. So all of them must be
> different. But in the packed vq, the device knows the next descriptor
> is placed at the next entry in the descriptor ring, so the only
> important id is the last one.

Ok, this makes sense now.

> > > +if (++i >= svq->vring_packed.vring.num) {
> > > +i = 0;
> > > +svq->vring_packed.avail_used_flags ^=
> > > +1 << VRING_PACKED_DESC_F_AVAIL |
> > > +1 << VRING_PACKED_DESC_F_USED;
> > > +}
> > > +}
> > > +
> > > +if (i <= head) {
> > > +svq->vring_packed.avail_wrap_counter ^= 1;
> > > +}
> > > +
> > > +svq->vring_packed.next_avail_idx = i;
> > > +svq->free_head = curr;
> > 
> > Even though the same id is used, curr will not be id+1 here.
> 
> curr is not the descriptor index, but the id. They're used in a stack
> format: One available chain pops an id and one used id pushes its id
> in the stack.
> 
> Maybe I'm wrong, but I think the main reason is to reuse the same
> memory region of the descriptor state etc so less memory is changed to
> be used in all the operations.

Right, curr is the id. I didn't really understand the popping and pushing
part.

In the implementation, the possible ids are stored in svq.desc_next. And
it's implemented in a way that the next id is (id + 1) % queue_size.

By the following line:
> Even though the same id is used, curr will not be id+1 here.

I meant that if, for example,  there is a chain of 3 descriptors and the
current id is 1, then all 3 descriptors will have 1 as their id. If the vring
size is 5, then the value of curr that will be filled in the 4th descriptor
will be 4 instead of 2.

> > > +return true;
> > > +}
> > > +
> > > +static bool vhost_svq_add_packed(VhostShadowVirtqueue *svq,
> > > +const struct iovec *out_sg, size_t
> > > out_num, +const struct iovec *in_sg,
> > > size_t in_num, +unsigned *head)
> > > +{
> > > +bool ok;
> > > +uint16_t head_flags = 0;
> > > +g_autofree hwaddr *sgs = g_new(hwaddr, out_num + in_num);
> > 
> > > I chose to use out_num+in_num as the size instead of MAX(out_num,
> > in_num). I found it easier to implement
> > "vhost_svq_vring_write_descs_packed()" like this. Please let me know if
> > this isn't feasible or ideal.
> 
> Not a big deal, I picked the MAX just because it is all the
> hwaddresses the function needs at the same time. Addition should work
> too, and AFAIK chains are usually short. We should get rid of this
> dynamic allocation in the future anyway.

Ok, understood.

> [...]
> > In "struct VhostShadowVirtqueue", I rearranged the order in which some
> > members appear. I tried to keep the members common to split and packed
> > virtqueues above the union and the rest below the union. I haven't
> > entirely understood the role of some of the members (for example,
> > VhostShadowVirtqueueOps *ops). I'll change this ordering if need be as I
> > continue to understand them better.
> 
> That's fine, but do it in a separate patch for future series, so it is
> easier to review.

Sure, I'll do that.

> ops is used when a kind of device wants specialized handling for the
> descriptors forwarded. vdpa-net uses it when QEMU also needs to
> inspect the descriptors. Feel free to ask more about it, but adding
> packed format to SVQ should not affect the ops member.

Got it. I don't have any other questions related to ops.

> > For the next step, I think I should work on "vhost_svq_start()" which is
> > where members of the struct are actually initialized. At the moment, only
> > the split ring part of the structure is initialize

Re: [PULL 00/28] Migration patches for 2024-06-21

2024-06-21 Thread Richard Henderson

On 6/21/24 10:54, Fabiano Rosas wrote:

The following changes since commit 02d9c38236cf8c9826e5c5be61780ccb4ae0:

   Merge tag 'pull-tcg-20240619' of https://gitlab.com/rth7680/qemu  into 
staging (2024-06-19 14:00:39 -0700)

are available in the Git repository at:

   https://gitlab.com/farosas/qemu.git  tags/migration-20240621-pull-request

for you to fetch changes up to 04b09de16d78cf2d163ca65d7c6d161bf2baceb6:

   migration: Remove unused VMSTATE_ARRAY_TEST() macro (2024-06-21 14:37:58 
-0300)


Migration pull request

- Fabiano's fix for fdset + file migration truncating the migration
   file

- Fabiano's fdset + direct-io support for mapped-ram

- Peter's various cleanups (multifd sync, thread names, migration
   states, tests)

- Peter's new migration state postcopy-recover-setup

- Philippe's unused vmstate macro cleanup


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




Re: [PATCH v1] memory tier: consolidate the initialization of memory tiers

2024-06-21 Thread Andrew Morton
On Fri, 21 Jun 2024 04:48:30 + "Ho-Ren (Jack) Chuang" 
 wrote:

> If we simply move the set_node_memory_tier() from memory_tier_init() to
> late_initcall(), it will result in HMAT not registering the
> mt_adistance_algorithm callback function,

Immediate reaction: then don't do that!

> because set_node_memory_tier()
> is not performed during the memory tiering initialization phase,
> leading to a lack of correct default_dram information.
> 
> Therefore, we introduced a nodemask to pass the information of the
> default DRAM nodes. The reason for not choosing to reuse
> default_dram_type->nodes is that it is not clean enough. So in the end,
> we use a __initdata variable, which is a variable that is released once
> initialization is complete, including both CPU and memory nodes for HMAT
> to iterate through.
> 
> Besides, since default_dram_type may be checked/used during the
> initialization process of HMAT and drivers, it is better to keep the
> allocation of default_dram_type in memory_tier_init().

What is this patch actually aiming to do?  Is it merely a code cleanup,
or are there functional changes?

> Signed-off-by: Ho-Ren (Jack) Chuang 
> ---
> Hi all,
> 
> The current memory tier initialization process is distributed across two
> different functions, memory_tier_init() and memory_tier_late_init(). This
> design is hard to maintain. Thus, this patch is proposed to reduce the
> possible code paths by consolidating different initialization patches into 
> one.

Ah, there it is.  Please make this the opening paragraph, not an aside
buried below the ^---$.

I'll await review input before proceeding with this, thanks.



Re: [PATCH v14 00/14] Support blob memory and venus on qemu

2024-06-21 Thread Dmitry Osipenko
On 6/21/24 11:59, Alex Bennée wrote:
> Dmitry Osipenko  writes:
> 
>> On 6/19/24 20:37, Alex Bennée wrote:
>>> So I've been experimenting with Aarch64 TCG with an Intel backend like
>>> this:
>>>
>>> ./qemu-system-aarch64 \
>>>-M virt -cpu cortex-a76 \
>>>-device virtio-net-pci,netdev=unet \
>>>-netdev user,id=unet,hostfwd=tcp::-:22 \
>>>-m 8192 \
>>>-object memory-backend-memfd,id=mem,size=8G,share=on \
>>>-serial mon:stdio \
>>>-kernel 
>>> ~/lsrc/linux.git/builds/arm64.initramfs/arch/arm64/boot/Image \
>>>-append "console=ttyAMA0" \
>>>-device qemu-xhci -device usb-kbd -device usb-tablet \
>>>-device virtio-gpu-gl-pci,blob=true,venus=true,hostmem=4G \
>>>-display sdl,gl=on -d 
>>> plugin,guest_errors,trace:virtio_gpu_cmd_res_create_blob,trace:virtio_gpu_cmd_res_back_\*,trace:virtio_gpu_cmd_res_xfer_toh_3d,trace:virtio_gpu_cmd_res_xfer_fromh_3d,trace:address_space_map
>>>  
>>>
>>> And I've noticed a couple of things. First trying to launch vkmark to
>>> run a KMS mode test fails with:
>>>
>> ...
>>>   virgl_render_server[1875931]: vkr: failed to import resource: invalid 
>>> res_id 5
>>>   virgl_render_server[1875931]: vkr: vkAllocateMemory resulted in CS error 
>>>   virgl_render_server[1875931]: vkr: ring_submit_cmd: vn_dispatch_command 
>>> failed
>>>
>>> More interestingly when shutting stuff down we see weirdness like:
>>>
>>>   address_space_map as:0x561b48ec48c0 addr 0x1008ac4b0:18 write:1 attrs:0x1 
>>> 
>>>
>>>   virgl_render_server[1875931]: vkr: destroying context 3 (vkmark) with a 
>>> valid instance  
>>>  
>>>   virgl_render_server[1875931]: vkr: destroying device with valid objects   
>>> 
>>>
>>>   vkr_context_remove_object: -7438602987017907480   
>>> 
>>>
>>>   vkr_context_remove_object: 7  
>>> 
>>>
>>>   vkr_context_remove_object: 5   
>>>
>>> which indicates something has gone very wrong. I'm not super familiar
>>> with the memory allocation patterns but should stuff that is done as
>>> virtio_gpu_cmd_res_back_attach() be find-able in the list of resources?
>>
>> This is expected to fail. Vkmark creates shmem virgl GBM FB BO on guest
>> that isn't exportable on host. AFAICT, more code changes should be
>> needed to support this case.
> 
> There are a lot of acronyms there. If this is pure guest memory why
> isn't it exportable to the host? Or should the underlying mesa library
> be making sure the allocation happens from the shared region?
> 
> Is vkmark particularly special here?

Actually, you could get it to work to some degree if you compile
virglrenderer with -Dminigbm_allocation=true. On host use GTK/Wayland
display.

Vkmark isn't special. It's virglrenderer that has room for
improvement. ChromeOS doesn't use KMS in VMs, proper KMS support was
never a priority for Venus.

>> Note that "destroying device with valid objects" msg is fine, won't hurt
>> to silence it in Venus to avoid confusion. It will happen every time
>> guest application is closed without explicitly releasing every VK
>> object.
> 
> I was more concerned with:
> 
>>>   vkr_context_remove_object: -7438602987017907480   
>>> 
>>>
> 
> which looks like a corruption of the object ids (or maybe an off-by-one)

At first this appeared to be a valid value, otherwise venus should've
crashed Qemu with a debug-assert if ID was invalid. But I never see such
odd IDs with my testing.

>>> I tried running under RR to further debug but weirdly I can't get
>>> working graphics with that. I did try running under threadsan which
>>> complained about a potential data race:
>>>
>>>   vkr_context_add_object: 1 -> 0x7b2c0288
>>>   vkr_context_add_object: 2 -> 0x7b2c0270
>>>   vkr_context_add_object: 3 -> 0x7b387f28
>>>   vkr_context_add_object: 4 -> 0x7b387fa0
>>>   vkr_context_add_object: 5 -> 0x7b48000103f8
>>>   vkr_context_add_object: 6 -> 0x7b48000104a0
>>>   vkr_context_add_object: 7 -> 0x7b4800010440
>>>   virtio_gpu_cmd_res_back_attach res 0x5
>>>   virtio_gpu_cmd_res_back_attach res 0x6
>>>   vkr_context_add_object: 8 -> 0x7b48000103e0
>>>   virgl_render_server[1751430]: vkr: failed to import resource: invalid 
>>> res_id 5
>>>   virgl_render_server[1751430]: vkr: vkAllocateMemory resulted in CS error

Re: [PATCH 05/20] qapi/parser: adjust info location for doc body section

2024-06-21 Thread John Snow
On Mon, May 27, 2024 at 7:58 AM Markus Armbruster  wrote:

> John Snow  writes:
>
> > On Thu, May 16, 2024, 1:58 AM Markus Armbruster 
> wrote:
> >
> >> John Snow  writes:
> >>
> >> > Instead of using the info object for the doc block as a whole, update
> >> > the info pointer for each call to ensure_untagged_section when the
> >> > existing section is otherwise empty. This way, Sphinx error
> information
> >> > will match precisely to where the text actually starts.
> >> >
> >> > Signed-off-by: John Snow 
> >> > ---
> >> >  scripts/qapi/parser.py | 9 +++--
> >> >  1 file changed, 7 insertions(+), 2 deletions(-)
> >> >
> >> > diff --git a/scripts/qapi/parser.py b/scripts/qapi/parser.py
> >> > index 8cdd5334ec6..41b9319e5cb 100644
> >> > --- a/scripts/qapi/parser.py
> >> > +++ b/scripts/qapi/parser.py
> >> > @@ -662,8 +662,13 @@ def end(self) -> None:
> >> >
> >> >  def ensure_untagged_section(self, info: QAPISourceInfo) -> None:
> >> >  if self.all_sections and not self.all_sections[-1].tag:
> >> > -# extend current section
> >> > -self.all_sections[-1].text += '\n'
> >>
> >> Before, we always append a newline.
> >>
> >> > +section = self.all_sections[-1]
> >> > +# Section is empty so far; update info to start *here*.
> >> > +if not section.text:
> >> > +section.info = info
> >> > +else:
> >> > +# extend current section
> >> > +self.all_sections[-1].text += '\n'
> >>
> >> Afterwards, we append it only when the section already has some text.
> >>
> >> The commit message claims the patch only adjusts section.info.  That's
> a
> >> lie :)
> >>
> >
> > Well. It wasn't intentional, so it wasn't a lie... it was just wrong :)
> >
> >
> >> I believe the change makes no difference because .end() strips leading
> >> and trailing newline.
> >>
> >> >  return
> >> >  # start new section
> >> >  section = self.Section(info)
> >>
> >> You could fix the commit message, but I think backing out the
> >> no-difference change is easier.  The appended patch works in my testing.
> >>
> >> Next one.  Your patch changes the meaning of section.info.  Here's its
> >> initialization:
> >>
> >> class Section:
> >> # pylint: disable=too-few-public-methods
> >> def __init__(self, info: QAPISourceInfo,
> >>  tag: Optional[str] = None):
> >> ---># section source info, i.e. where it begins
> >> self.info = info
> >> # section tag, if any ('Returns', '@name', ...)
> >> self.tag = tag
> >> # section text without tag
> >> self.text = ''
> >>
> >> The comment is now wrong.  Calls for a thorough review of .info's uses.
> >>
> >
> > Hmm... Did I really change its meaning? I guess it's debatable what
> "where
> > it begins" means. Does the tagless section start...
> >
> > ## <-- Here?
> > # Hello! <-- Or here?
> > ##
> >
> > I assert the *section* starts wherever the first line of text it contains
> > starts. Nothing else makes any sense.
> >
> > There is value in recording where the doc block starts, but that's not a
> > task for the *section* info.
> >
> > I don't think I understand your feedback.
>
> This was before my vacation, and my memory is foggy, ...  I may have
> gotten confused back then.  Let me have a fresh look now.
>
> self.info gets initialized in Section.__init__() to whatever info it
> gets passed.
>
> Your patch makes .ensure_untagged_section() overwrite this Section.info
> when it extends an untagged section that is still empty.  Hmmm.  I'd
> prefer .info to remain constant after initialization.
>

but, we don't have the right info when we initialize the entire QAPIDoc
object, because the section hasn't truly actually started yet, so I don't
think I can actually achieve your preference.


>
> I figure this overwrite can happen only when extending the body section
> QAPIDoc.__init__() creates.  In that case, it adjusts .info from
> beginning of doc comment to first non-blank line.
>

Yes, that's the intended effect and in practice, the only time it actually
happens.

This patch is necessary for accurate error reporting info, otherwise we get
off-by-ones (or more, maybe). I believe the problem actually affects the
current generator too (I don't see why it wouldn't), but I didn't test that
because I'm trying to replace it.


>
> Thoughts?
>
>
I think this patch is fine?.


> >> The alternative to changing .info's meaning is to add another member
> >> with the meaning you need.  Then we have to review .info's uses to find
> >> out which ones to switch to the new one.
> >
> >
> >> Left for later.
> >>
> >>
> >> diff --git a/scripts/qapi/parser.py b/scripts/qapi/parser.py
> >> index 8cdd5334ec..abeae1ca77 100644
> >> --- a/scripts/qapi/parser.py
> >> +++ b/scripts/qapi/parser.py
> >> @@ -663,7 +663,10 @@ def end(self) -> None:
> >>  def ensure_untagged_section

Re: [RFC PATCH 0/7] migration/multifd: Introduce storage slots

2024-06-21 Thread Peter Xu
On Fri, Jun 21, 2024 at 07:40:01PM +0200, Maciej S. Szmigiero wrote:
> On 21.06.2024 17:56, Peter Xu wrote:
> > On Fri, Jun 21, 2024 at 05:31:54PM +0200, Maciej S. Szmigiero wrote:
> > > On 21.06.2024 17:04, Fabiano Rosas wrote:
> > > > "Maciej S. Szmigiero"  writes:
> > > > 
> > > > > Hi Fabiano,
> > > > > 
> > > > > On 20.06.2024 23:21, Fabiano Rosas wrote:
> > > > > > Hi folks,
> > > > > > 
> > > > > > First of all, apologies for the roughness of the series. I'm off for
> > > > > > the next couple of weeks and wanted to put something together early
> > > > > > for your consideration.
> > > > > > 
> > > > > > This series is a refactoring (based on an earlier, off-list
> > > > > > attempt[0]), aimed to remove the usage of the MultiFDPages_t type in
> > > > > > the multifd core. If we're going to add support for more data types 
> > > > > > to
> > > > > > multifd, we first need to clean that up.
> > > > > > 
> > > > > > This time around this work was prompted by Maciej's series[1]. I see
> > > > > > you're having to add a bunch of is_device_state checks to work 
> > > > > > around
> > > > > > the rigidity of the code.
> > > > > > 
> > > > > > Aside from the VFIO work, there is also the intent (coming back from
> > > > > > Juan's ideas) to make multifd the default code path for migration,
> > > > > > which will have to include the vmstate migration and anything else 
> > > > > > we
> > > > > > put on the stream via QEMUFile.
> > > > > > 
> > > > > > I have long since been bothered by having 'pages' sprinkled all over
> > > > > > the code, so I might be coming at this with a bit of a narrow focus,
> > > > > > but I believe in order to support more types of payloads in multifd,
> > > > > > we need to first allow the scheduling at multifd_send_pages() to be
> > > > > > independent of MultiFDPages_t. So here it is. Let me know what you
> > > > > > think.
> > > > > 
> > > > > Thanks for the patch set, I quickly glanced at these patches and they
> > > > > definitely make sense to me.
> > > > > 
> > > (..)
> > > > > > (as I said, I'll be off for a couple of weeks, so feel free to
> > > > > > incorporate any of this code if it's useful. Or to ignore it
> > > > > > completely).
> > > > > 
> > > > > I guess you are targeting QEMU 9.2 rather than 9.1 since 9.1 has
> > > > > feature freeze in about a month, correct?
> > > > > 
> > > > 
> > > > For general code improvements like this I'm not thinking about QEMU
> > > > releases at all. But this series is not super complex, so I could
> > > > imagine us merging it in time for 9.1 if we reach an agreement.
> > > > 
> > > > Are you thinking your series might miss the target? Or have concerns
> > > > over the stability of the refactoring? We can within reason merge code
> > > > based on the current framework and improve things on top, we already did
> > > > something similar when merging zero-page support. I don't have an issue
> > > > with that.
> > > 
> > > The reason that I asked whether you are targeting 9.1 is because my
> > > patch set is definitely targeting that release.
> > > 
> > > At the same time my patch set will need to be rebased/refactored on top
> > > of this patch set if it is supposed to be merged for 9.1 too.
> > > 
> > > If this patch set gets merged quickly that's not really a problem.
> > > 
> > > On the other hand, if another iteration(s) is/are needed AND you are
> > > not available in the coming weeks to work on them then there's a
> > > question whether we will make the required deadline.
> > 
> > I think it's a bit rushed to merge the vfio series in this release.  I'm not
> > sure it has enough time to be properly reviewed, reposted, retested, etc.
> > 
> > I've already started looking at it, and so far I think I have doubt not
> > only on agreement with Fabiano on the device_state thing which I prefer to
> > avoid, but also I'm thinking of any possible way to at least make the
> > worker threads generic too: a direct impact could be vDPA in the near
> > future if anyone cared, while I don't want modules to create threads
> > randomly during migration.
> > 
> > Meanwhile I'm also thinking whether that "the thread needs to dump all
> > data, and during iteration we can't do that" is the good reason to not
> > support that during iterations.
> > 
> > I haven't replied yet because I don't think I've thought everything through, but
> > I'll get there.
> > 
> > So I'm not saying that the design is problematic, but IMHO it's just not
> > mature enough to assume it will land in 9.1, considering it's still a large
> > one, and the first non-rfc version just posted two days ago.
> 
> 
> The RFC version was posted more than 2 months ago.
> 
> It has received some review comments from multiple people,
> all of which were addressed in this patch set version.

I thought it was mostly me who reviewed it, am I right?  Or do you have
another thread where such discussion happened, and where the design review
was properly done and reached an agreement?

IMHO that is also not ho

Re: [PATCH 0/7] pc-bios/s390-ccw: Merge the netboot loader into s390-ccw.img

2024-06-21 Thread Eric Farman
On Fri, 2024-06-21 at 10:24 +0200, Thomas Huth wrote:
> We originally built a separate binary for the netboot code since it
> was considered as experimental and we could not be sure that the
> necessary SLOF module had been checked out. Time passed, the netboot
> code proved its usefulness, and the build system nowadays makes sure
> that the SLOF module is checked out if you have a s390x compiler
> available
> for building the s390-ccw bios. In fact, the possibility to build the
> s390-ccw.img without s390-netboot.img has been removed in commit
> bf6903f6944f ("pc-bios/s390-ccw: always build network bootloader")
> already.
> 
> So it does not make too much sense anymore to keep the netboot code
> in a separate binary. To make it easier to support a more flexible
> boot process soon that supports more than one boot device via the
> bootindex properties, let's finally merge the netboot code into the
> main s390-ccw.img binary now.

Hi Thomas,

I find myself wondering about the side effects of the
s/sclp_print/printf/ changes, but I haven't come up with anything I can
put my finger on. Maybe something will come to me over the weekend, but
all-in-all I like the looks of this. So, FWIW:

Reviewed-by: Eric Farman 

> 
> Thomas Huth (7):
>   pc-bios/s390-ccw: Remove duplicated LDFLAGS
>   hw/s390x/ipl: Provide more memory to the s390-ccw.img firmware
>   pc-bios/s390-ccw: Use the libc from SLOF for the main s390-ccw.img
>     binary, too
>   pc-bios/s390-ccw: Link the netboot code into the main s390-ccw.img
>     binary
>   hw/s390x: Remove the possibility to load the s390-netboot.img
> binary
>   pc-bios/s390-ccw: Merge netboot.mak into the main Makefile
>   docs/system/s390x/bootdevices: Update the documentation about
> network
>     booting
> 
>  docs/system/s390x/bootdevices.rst |  20 +++
>  pc-bios/s390-ccw/netboot.mak  |  62 -
>  hw/s390x/ipl.h    |  12 ++--
>  pc-bios/s390-ccw/cio.h    |   2 +
>  pc-bios/s390-ccw/iplb.h   |   4 +-
>  pc-bios/s390-ccw/libc.h   |  89 
> --
>  pc-bios/s390-ccw/s390-ccw.h   |  10 +++-
>  pc-bios/s390-ccw/virtio.h |   1 -
>  hw/s390x/ipl.c    |  65 +++---
>  hw/s390x/s390-virtio-ccw.c    |  10 +---
>  pc-bios/s390-ccw/bootmap.c    |   4 +-
>  pc-bios/s390-ccw/cio.c    |   2 +-
>  pc-bios/s390-ccw/dasd-ipl.c   |   2 +-
>  pc-bios/s390-ccw/jump2ipl.c   |   2 +-
>  pc-bios/s390-ccw/libc.c   |  88 
> -
>  pc-bios/s390-ccw/main.c   |  15 +++--
>  pc-bios/s390-ccw/menu.c   |  25 -
>  pc-bios/s390-ccw/netmain.c    |  15 +
>  pc-bios/s390-ccw/sclp.c   |   2 +-
>  pc-bios/s390-ccw/virtio-blkdev.c  |   1 -
>  pc-bios/s390-ccw/virtio-scsi.c    |   2 +-
>  pc-bios/s390-ccw/virtio.c |   2 +-
>  pc-bios/meson.build   |   1 -
>  pc-bios/s390-ccw/Makefile |  69 +++
>  pc-bios/s390-netboot.img  | Bin 67232 -> 0 bytes
>  25 files changed, 122 insertions(+), 383 deletions(-)
>  delete mode 100644 pc-bios/s390-ccw/netboot.mak
>  delete mode 100644 pc-bios/s390-ccw/libc.h
>  delete mode 100644 pc-bios/s390-ccw/libc.c
>  delete mode 100644 pc-bios/s390-netboot.img
> 




RE: [RFC PATCH] cxl: avoid duplicating report from MCE & device

2024-06-21 Thread Luck, Tony
> So who actually cares about recovering poisoned volatile memory?
> I'd like to understand more on how significant a use case this is.
> Whilst I can conjecture that it's an extreme case of wanting to avoid
> losing the ability to create 1GiB or larger pages due to poison
> is that a real problem for anyone today?  Note this is just the case
> where you've reached an actual uncorrectable error and probably
> / possibly killed something, not the more common soft offlining
> of memory due to correctable errors being detected.

I guess you really need a reply from someone with a data center
with thousands of machines, since that's where this question
may be important.

My humble opinion is that, outside of the huge page issue, nobody
should try to recover a poisoned page. Systems that can report
and recover from poison have tens, hundreds, or more GBytes
of memory. Dropping 4K pages will not have any measurable
impact on a system (even if there are hundreds of pages dropped).

There's no reliable way to determine whether the poisoned page
was due to some transient issue, or a permanent defect. Recovering
a poisoned page runs the risk that the poison will re-occur. Perhaps
next use of the page will be in some unrecoverable (kernel) context.

So recovery has some risk, but very little upside benefit.

-Tony



Re: [RFC PATCH] cxl: avoid duplicating report from MCE & device

2024-06-21 Thread Jonathan Cameron via
On Fri, 21 Jun 2024 10:59:46 -0700
Dan Williams  wrote:

> Jonathan Cameron wrote:
> > On Wed, 19 Jun 2024 00:53:10 +0800
> > Shiyang Ruan  wrote:
> >   
> > > Background:
> > > Since CXL device is a memory device, while CPU consumes a poison page of 
> > > CXL device, it always triggers a MCE by interrupt (INT18), no matter 
> > > which-First path is configured.  This is the first report.  Then 
> > > currently, in FW-First path, the poison event is transferred according 
> > > to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES   
> > >  -> CPER -> trace report.  This is the second one.  These two reports
> > > are indicating the same poisoning page, which is the so-called "duplicate
> > > report"[1].  And the memory_failure() handling I'm trying to add in
> > > OS-First path could also be another duplicate report.
> > > 
> > > Hope the flow below could make it easier to understand:
> > > CPU accesses bad memory on CXL device, then  
> > >  -> MCE (INT18), *always* report (1)
> > >  -> * FW-First (implemented now)
> > >   -> CXL device -> FW
> > > -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
> > > * OS-First (not implemented yet, I'm working on it)  
> > >   -> CXL device -> MSI
> > > -> OS:CXL driver -> memory_failure() (2.b)
> > > so, the (1) and (2.a/b) are duplicated.
> > > 
> > > (I didn't get response in my reply for [1] while I have to make patch to
> > > solve this problem, so please correct me if my understanding is wrong.)
> > > 
> > > This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> > > to check whether the current poison page has been reported (if yes,
> > > stop the notifier chain, won't call the following memory_failure()
> > > to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> > > page is already handled (recorded and reported) in (1) or (2), the other one
> > > won't duplicate the report.  The record could be cleared when
> > > cxl_clear_poison() is called.
> > > 
> > > [1] 
> > > https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be29...@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> > > 
> > > Signed-off-by: Shiyang Ruan   
> > 
> > So poison can be cleared in a number of ways and a CXL poison clear command
> > is unfortunately only one of them.  Some architectures have instructions
> > that guarantee to write a whole cacheline and can clear things as well.
> > I believe x86 does for starters.  
> 
> Yes, movdir64b.

Equivalent arm64 instruction is not valid for normal memory. Let's say
no more on that :(

So who actually cares about recovering poisoned volatile memory?
I'd like to understand more on how significant a use case this is.
Whilst I can conjecture that it's an extreme case of wanting to avoid
losing the ability to create 1GiB or larger pages due to poison
is that a real problem for anyone today?  Note this is just the case
where you've reached an actual uncorrectable error and probably
/ possibly killed something, not the more common soft offlining
of memory due to correctable errors being detected.

> 
> > +CC linux-edac and related maintainers / reviewers.
> > linux-mm and hwpoison maintainer.
> > 
> > So I think this needs a more general solution that encompasses 
> > more general cleanup of poison.  
> 
> I think unless the device has "List Poison" coverage for volatile ranges
> that the kernel should not worry about tracking this itself.

Maybe.  I think you can still get a media event for this as well
as synchronous poison so there may be a path to a double report, just
a more timely one hopefully.

> 
> Perhaps what is needed is that after successful memory_failure()
> handling when the page is known to be offline the device backing the
> memory can be notified that it is safe to repair the page and put it
> back into service, but I expect that would be comparison of the device's
> own poison tracking relative to the notification of successful page
> offline.

That would work. Elide the error handling if the page is known to
be offline due to poison.  Might be racy though but does it
really hurt if we occasionally report twice?

> > > + rec = kmalloc(sizeof(struct cxl_mce_record), GFP_KERNEL);
> > > + rec->hpa = hpa;
> > > + list_add(&cxl_mce_records, &rec->node);
> > > +
> > > + mutex_unlock(&cxl_mce_mutex);
> > > +
> > > + return false;
> > > +}
> > > +
> > > +void cxl_mce_clear(u64 hpa)
> > > +{
> > > + struct cxl_mce_record *cur, *next;
> > > + int rc;
> > > +
> > > + rc = mutex_lock_interruptible(&cxl_mce_mutex);  
> > 
> > Maybe cond_guard().  
> 
> cond_guard() was rejected, you meant scoped_cond_guard()? But, then I
> think _interruptible is not appropriate here.

Ah yes.  Indeed scoped_cond_guard() but fair enough on the
interruptible point!


> 




Re: [RFC PATCH 1/1] hw/arm: FW first ARM processor error injection.

2024-06-21 Thread Jonathan Cameron via
On Fri, 21 Jun 2024 17:51:15 +0100
 wrote:

> From: Shiju Jose 
Thanks for posting this.

Given this is going to linux-edac, probably should mention
this is QEMU based error injection.  For cross postings
between kernel related and qemu lists I tend to stick
qemu in the [] of the patch description.

> 
> Add support for FW first ARM processor error injection.
> 
> Compliance with N.2.4.4 ARM Processor Error Section in
> UEFI 2.9A/2.10 specs.
> 
> Examples,
> { "execute": "arm-inject-error",
>   "arguments": {
> "errortypes": ['cache-error']
>   }
> }
> 
> { "execute": "arm-inject-error",
>   "arguments": {
> "errortypes": ['tlb-error']
>   }
> }
> 
> { "execute": "arm-inject-error",
>   "arguments": {
> "errortypes": ['bus-error']
>   }
> }
> 
> { "execute": "arm-inject-error",
>   "arguments": {
> "errortypes": ['cache-error', 'tlb-error']
>   }
> }
> 
> { "execute": "arm-inject-error",
>   "arguments": {
> "errortypes": ['cache-error', 'tlb-error', 'bus-error', 
> 'micro-arch-error']
>   }
> }
> etc.
> 
> Signed-off-by: Shiju Jose 

With a few minor tweaks to build files, this is on my cxl gitlab qemu staging
branch as that happens to have other injection stuff (this obviously has little
to do with CXL!)

http://gitlab.com/jic23/qemu/ cxl-2024-06-21

Note I haven't tested that branch beyond a quick boot test, so it might get
some revisions early next week.

Jonathan



[PULL 27/28] tests/migration-tests: Cover postcopy failure on reconnect

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

Make sure there will be an event for postcopy recovery, regardless of
whether the reconnect will succeed, or when the failure happens.

The added new case is to fail early in postcopy recovery, in which case it
didn't even reach RECOVER stage on src (and in real life it'll be the same
to dest, but the test case is just slightly more involved due to the dual
socketpair setup).

To do that, rename the postcopy_recovery_test_fail to reflect either stage
to fail, instead of a boolean.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-test.c | 95 +---
 1 file changed, 77 insertions(+), 18 deletions(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index e61096adfe..571fc1334c 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -73,6 +73,17 @@ static QTestMigrationState dst_state;
 #define QEMU_ENV_SRC "QTEST_QEMU_BINARY_SRC"
 #define QEMU_ENV_DST "QTEST_QEMU_BINARY_DST"
 
+typedef enum PostcopyRecoveryFailStage {
+/*
+ * "no failure" must be 0 as it's the default.  OTOH, real failure
+ * cases must be >0 to make sure they trigger by a "if" test.
+ */
+POSTCOPY_FAIL_NONE = 0,
+POSTCOPY_FAIL_CHANNEL_ESTABLISH,
+POSTCOPY_FAIL_RECOVERY,
+POSTCOPY_FAIL_MAX
+} PostcopyRecoveryFailStage;
+
 #if defined(__linux__)
 #include 
 #include 
@@ -693,7 +704,7 @@ typedef struct {
 /* Postcopy specific fields */
 void *postcopy_data;
 bool postcopy_preempt;
-bool postcopy_recovery_test_fail;
+PostcopyRecoveryFailStage postcopy_recovery_fail_stage;
 } MigrateCommon;
 
 static int test_migrate_start(QTestState **from, QTestState **to,
@@ -1371,12 +1382,16 @@ static void wait_for_postcopy_status(QTestState *one, 
const char *status)
   "completed", NULL });
 }
 
-static void postcopy_recover_fail(QTestState *from, QTestState *to)
+static void postcopy_recover_fail(QTestState *from, QTestState *to,
+  PostcopyRecoveryFailStage stage)
 {
 #ifndef _WIN32
+bool fail_early = (stage == POSTCOPY_FAIL_CHANNEL_ESTABLISH);
 int ret, pair1[2], pair2[2];
 char c;
 
+g_assert(stage > POSTCOPY_FAIL_NONE && stage < POSTCOPY_FAIL_MAX);
+
 /* Create two unrelated socketpairs */
 ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair1);
 g_assert_cmpint(ret, ==, 0);
@@ -1410,6 +1425,14 @@ static void postcopy_recover_fail(QTestState *from, 
QTestState *to)
 ret = send(pair2[1], &c, 1, 0);
 g_assert_cmpint(ret, ==, 1);
 
+if (stage == POSTCOPY_FAIL_CHANNEL_ESTABLISH) {
+/*
+ * This will make src QEMU to fail at an early stage when trying to
+ * resume later, where it shouldn't reach RECOVER stage at all.
+ */
+close(pair1[1]);
+}
+
 migrate_recover(to, "fd:fd-mig");
 migrate_qmp(from, to, "fd:fd-mig", NULL, "{'resume': true}");
 
@@ -1419,28 +1442,53 @@ static void postcopy_recover_fail(QTestState *from, 
QTestState *to)
  */
 migration_event_wait(from, "postcopy-recover-setup");
 
+if (fail_early) {
+/*
+ * When fails at reconnection, src QEMU will automatically goes
+ * back to PAUSED state.  Making sure there is an event in this
+ * case: Libvirt relies on this to detect early reconnection
+ * errors.
+ */
+migration_event_wait(from, "postcopy-paused");
+} else {
+/*
+ * We want to test "fail later" at RECOVER stage here.  Make sure
+ * both QEMU instances will go into RECOVER stage first, then test
+ * kicking them out using migrate-pause.
+ *
+ * Explicitly check the RECOVER event on src, that's what Libvirt
+ * relies on, rather than polling.
+ */
+migration_event_wait(from, "postcopy-recover");
+wait_for_postcopy_status(from, "postcopy-recover");
+
+/* Need an explicit kick on src QEMU in this case */
+migrate_pause(from);
+}
+
 /*
- * Make sure both QEMU instances will go into RECOVER stage, then test
- * kicking them out using migrate-pause.
+ * For all failure cases, we'll reach such states on both sides now.
+ * Check them.
  */
-wait_for_postcopy_status(from, "postcopy-recover");
+wait_for_postcopy_status(from, "postcopy-paused");
 wait_for_postcopy_status(to, "postcopy-recover");
 
 /*
- * This would be issued by the admin upon noticing the hang, we should
- * make sure we're able to kick this out.
+ * Kick dest QEMU out too. This is normally not needed in reality
+ * because when the channel is shutdown it should also happen on src.
+ * However here we used separate socket pairs so we need to do that
+ * explicitly.
  */
-migrate_pause(from);
-wait_for_postcopy_status(from, "postcopy-paused");
-
-/* Do the sa

[PULL 06/28] monitor: Introduce monitor_fdset_*free

2024-06-21 Thread Fabiano Rosas
Introduce new functions to remove and free no longer used fds and
fdsets.

We need those to decouple the remove/free routines from
monitor_fdset_cleanup() which will go away in the next patches.

The new functions:

- monitor_fdset_free/_if_empty() will be used when a monitor
  connection closes and when an fd is removed to cleanup any fdset
  that is now empty.

- monitor_fdset_fd_free() will be used to remove one or more fds that
  have been explicitly targeted by qmp_remove_fd().

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 monitor/fds.c | 31 +++
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/monitor/fds.c b/monitor/fds.c
index fb9f58c056..bd45a26368 100644
--- a/monitor/fds.c
+++ b/monitor/fds.c
@@ -167,6 +167,27 @@ int monitor_get_fd(Monitor *mon, const char *fdname, Error 
**errp)
 return -1;
 }
 
+static void monitor_fdset_free(MonFdset *mon_fdset)
+{
+QLIST_REMOVE(mon_fdset, next);
+g_free(mon_fdset);
+}
+
+static void monitor_fdset_free_if_empty(MonFdset *mon_fdset)
+{
+if (QLIST_EMPTY(&mon_fdset->fds) && QLIST_EMPTY(&mon_fdset->dup_fds)) {
+monitor_fdset_free(mon_fdset);
+}
+}
+
+static void monitor_fdset_fd_free(MonFdsetFd *mon_fdset_fd)
+{
+close(mon_fdset_fd->fd);
+g_free(mon_fdset_fd->opaque);
+QLIST_REMOVE(mon_fdset_fd, next);
+g_free(mon_fdset_fd);
+}
+
 static void monitor_fdset_cleanup(MonFdset *mon_fdset)
 {
 MonFdsetFd *mon_fdset_fd;
@@ -176,17 +197,11 @@ static void monitor_fdset_cleanup(MonFdset *mon_fdset)
 if ((mon_fdset_fd->removed ||
 (QLIST_EMPTY(&mon_fdset->dup_fds) && mon_refcount == 0)) &&
 runstate_is_running()) {
-close(mon_fdset_fd->fd);
-g_free(mon_fdset_fd->opaque);
-QLIST_REMOVE(mon_fdset_fd, next);
-g_free(mon_fdset_fd);
+monitor_fdset_fd_free(mon_fdset_fd);
 }
 }
 
-if (QLIST_EMPTY(&mon_fdset->fds) && QLIST_EMPTY(&mon_fdset->dup_fds)) {
-QLIST_REMOVE(mon_fdset, next);
-g_free(mon_fdset);
-}
+monitor_fdset_free_if_empty(mon_fdset);
 }
 
 void monitor_fdsets_cleanup(void)
-- 
2.35.3




[PULL 03/28] tests/qtest/migration: Fix file migration offset check

2024-06-21 Thread Fabiano Rosas
When doing file migration, QEMU accepts an offset that should be
skipped when writing the migration stream to the file. The purpose of
the offset is to allow the management layer to put its own metadata at
the start of the file.

We have tests for this in migration-test, but only testing that the
migration stream starts at the correct offset and not that it actually
leaves the data intact. Unsurprisingly, there's been a bug in that
area that the tests didn't catch.

Fix the tests to write some data to the offset region and check that
it's actually there after the migration.

While here, switch to using g_get_file_contents() which is more
portable than mmap().

Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-test.c | 79 ++--
 1 file changed, 48 insertions(+), 31 deletions(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 0dccb4beff..0a529a527b 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -68,6 +68,7 @@ static QTestMigrationState dst_state;
 #define QEMU_VM_FILE_MAGIC 0x5145564d
 #define FILE_TEST_FILENAME "migfile"
 #define FILE_TEST_OFFSET 0x1000
+#define FILE_TEST_MARKER 'X'
 #define QEMU_ENV_SRC "QTEST_QEMU_BINARY_SRC"
 #define QEMU_ENV_DST "QTEST_QEMU_BINARY_DST"
 
@@ -1693,10 +1694,43 @@ finish:
 test_migrate_end(from, to, args->result == MIG_TEST_SUCCEED);
 }
 
+static void file_dirty_offset_region(void)
+{
+g_autofree char *path = g_strdup_printf("%s/%s", tmpfs, 
FILE_TEST_FILENAME);
+size_t size = FILE_TEST_OFFSET;
+g_autofree char *data = g_new0(char, size);
+
+memset(data, FILE_TEST_MARKER, size);
+g_assert(g_file_set_contents(path, data, size, NULL));
+}
+
+static void file_check_offset_region(void)
+{
+g_autofree char *path = g_strdup_printf("%s/%s", tmpfs, 
FILE_TEST_FILENAME);
+size_t size = FILE_TEST_OFFSET;
+g_autofree char *expected = g_new0(char, size);
+g_autofree char *actual = NULL;
+uint64_t *stream_start;
+
+/*
+ * Ensure the skipped offset region's data has not been touched
+ * and the migration stream starts at the right place.
+ */
+
+memset(expected, FILE_TEST_MARKER, size);
+
+g_assert(g_file_get_contents(path, &actual, NULL, NULL));
+g_assert(!memcmp(actual, expected, size));
+
+stream_start = (uint64_t *)(actual + size);
+g_assert_cmpint(cpu_to_be64(*stream_start) >> 32, ==, QEMU_VM_FILE_MAGIC);
+}
+
 static void test_file_common(MigrateCommon *args, bool stop_src)
 {
 QTestState *from, *to;
 void *data_hook = NULL;
+bool check_offset = false;
 
 if (test_migrate_start(&from, &to, args->listen_uri, &args->start)) {
 return;
@@ -1709,6 +1743,16 @@ static void test_file_common(MigrateCommon *args, bool 
stop_src)
  */
 g_assert_false(args->live);
 
+if (g_strrstr(args->connect_uri, "offset=")) {
+check_offset = true;
+/*
+ * This comes before the start_hook because it's equivalent to
+ * a management application creating the file and writing to
+ * it so hooks should expect the file to be already present.
+ */
+file_dirty_offset_region();
+}
+
 if (args->start_hook) {
 data_hook = args->start_hook(from, to);
 }
@@ -1743,6 +1787,10 @@ static void test_file_common(MigrateCommon *args, bool 
stop_src)
 
 wait_for_serial("dest_serial");
 
+if (check_offset) {
+file_check_offset_region();
+}
+
 finish:
 if (args->finish_hook) {
 args->finish_hook(from, to, data_hook);
@@ -1942,36 +1990,6 @@ static void test_precopy_file(void)
 test_file_common(&args, true);
 }
 
-static void file_offset_finish_hook(QTestState *from, QTestState *to,
-void *opaque)
-{
-#if defined(__linux__)
-g_autofree char *path = g_strdup_printf("%s/%s", tmpfs, 
FILE_TEST_FILENAME);
-size_t size = FILE_TEST_OFFSET + sizeof(QEMU_VM_FILE_MAGIC);
-uintptr_t *addr, *p;
-int fd;
-
-fd = open(path, O_RDONLY);
-g_assert(fd != -1);
-addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
-g_assert(addr != MAP_FAILED);
-
-/*
- * Ensure the skipped offset contains zeros and the migration
- * stream starts at the right place.
- */
-p = addr;
-while (p < addr + FILE_TEST_OFFSET / sizeof(uintptr_t)) {
-g_assert(*p == 0);
-p++;
-}
-g_assert_cmpint(cpu_to_be64(*p) >> 32, ==, QEMU_VM_FILE_MAGIC);
-
-munmap(addr, size);
-close(fd);
-#endif
-}
-
 static void test_precopy_file_offset(void)
 {
 g_autofree char *uri = g_strdup_printf("file:%s/%s,offset=%d", tmpfs,
@@ -1980,7 +1998,6 @@ static void test_precopy_file_offset(void)
 MigrateCommon args = {
 .connect_uri = uri,
 .listen_uri = "defer",
-.finish_hook = file_offset_finish_hook,
 };
 
 test_file_common(&args, false);
-- 
2.35.3




[PULL 02/28] migration: Fix file migration with fdset

2024-06-21 Thread Fabiano Rosas
When the "file:" migration support was added we missed the special
case in the qemu_open_old implementation that allows for a particular
file name format to be used to refer to a set of file descriptors that
have been previously provided to QEMU via the add-fd QMP command.

When using this fdset feature, we should not truncate the migration
file because being given an fd means that the management layer is in
control of the file and will likely already have some data written to
it. This is further indicated by the presence of the 'offset'
argument, which indicates the start of the region where QEMU is
allowed to write.

Fix the issue by replacing the O_TRUNC flag on open by an ftruncate
call, which will take the offset into consideration.

Fixes: 385f510df5 ("migration: file URI offset")
Suggested-by: Daniel P. Berrangé 
Reviewed-by: Prasad Pandit 
Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
Signed-off-by: Fabiano Rosas 
---
 migration/file.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/migration/file.c b/migration/file.c
index 2bb8c64092..a903710f06 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -84,12 +84,19 @@ void file_start_outgoing_migration(MigrationState *s,
 
 trace_migration_file_outgoing(filename);
 
-fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC,
- 0600, errp);
+fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY, 0600, errp);
 if (!fioc) {
 return;
 }
 
+if (ftruncate(fioc->fd, offset)) {
+error_setg_errno(errp, errno,
+ "failed to truncate migration file to offset %" 
PRIx64,
+ offset);
+object_unref(OBJECT(fioc));
+return;
+}
+
 outgoing_args.fname = g_strdup(filename);
 
 ioc = QIO_CHANNEL(fioc);
-- 
2.35.3




[PULL 23/28] tests/migration-tests: Drop most WIN32 ifdefs for postcopy failure tests

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

Most of them are not needed, we can stick with one ifdef inside
postcopy_recover_fail() so as to cover the scm right tricks only.
The tests won't run on Windows anyway because has_uffd is always false.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-test.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 6207305ff8..b7dea1aabb 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1364,9 +1364,9 @@ static void wait_for_postcopy_status(QTestState *one, 
const char *status)
   "completed", NULL });
 }
 
-#ifndef _WIN32
 static void postcopy_recover_fail(QTestState *from, QTestState *to)
 {
+#ifndef _WIN32
 int ret, pair1[2], pair2[2];
 char c;
 
@@ -1428,8 +1428,8 @@ static void postcopy_recover_fail(QTestState *from, 
QTestState *to)
 close(pair1[1]);
 close(pair2[0]);
 close(pair2[1]);
+#endif
 }
-#endif /* _WIN32 */
 
 static void test_postcopy_recovery_common(MigrateCommon *args)
 {
@@ -1469,7 +1469,6 @@ static void test_postcopy_recovery_common(MigrateCommon 
*args)
 wait_for_postcopy_status(to, "postcopy-paused");
 wait_for_postcopy_status(from, "postcopy-paused");
 
-#ifndef _WIN32
 if (args->postcopy_recovery_test_fail) {
 /*
  * Test when a wrong socket specified for recover, and then the
@@ -1478,7 +1477,6 @@ static void test_postcopy_recovery_common(MigrateCommon 
*args)
 postcopy_recover_fail(from, to);
 /* continue with a good recovery */
 }
-#endif /* _WIN32 */
 
 /*
  * Create a new socket to emulate a new channel that is different
@@ -1507,7 +1505,6 @@ static void test_postcopy_recovery(void)
 test_postcopy_recovery_common(&args);
 }
 
-#ifndef _WIN32
 static void test_postcopy_recovery_double_fail(void)
 {
 MigrateCommon args = {
@@ -1516,7 +1513,6 @@ static void test_postcopy_recovery_double_fail(void)
 
 test_postcopy_recovery_common(&args);
 }
-#endif /* _WIN32 */
 
 #ifdef CONFIG_GNUTLS
 static void test_postcopy_recovery_tls_psk(void)
@@ -3782,10 +3778,8 @@ int main(int argc, char **argv)
test_postcopy_preempt);
 migration_test_add("/migration/postcopy/preempt/recovery/plain",
test_postcopy_preempt_recovery);
-#ifndef _WIN32
 migration_test_add("/migration/postcopy/recovery/double-failures",
test_postcopy_recovery_double_fail);
-#endif /* _WIN32 */
 if (is_x86) {
 migration_test_add("/migration/postcopy/suspend",
test_postcopy_suspend);
-- 
2.35.3




[PULL 17/28] migration/multifd: Avoid the final FLUSH in complete()

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

We always do the flush when finishing one round of scan, and during
complete() phase we should scan one more round making sure no dirty page
existed.  In that case we shouldn't need one explicit FLUSH at the end of
complete(), as when reaching there all pages should have been flushed.

Reviewed-by: Fabiano Rosas 
Tested-by: Fabiano Rosas 
Signed-off-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 migration/ram.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index ceea586b06..edec1a2d07 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3300,10 +3300,6 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 }
 }
 
-if (migrate_multifd() && !migrate_multifd_flush_after_each_section() &&
-!migrate_mapped_ram()) {
-qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
-}
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 return qemu_fflush(f);
 }
-- 
2.35.3




[PULL 20/28] migration: Cleanup incoming migration setup state change

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

Destination QEMU can setup incoming ports for two purposes: either a fresh
new incoming migration, in which QEMU will switch to SETUP for channel
establishment, or a paused postcopy migration, in which QEMU will stay in
POSTCOPY_PAUSED until kicking off the RECOVER phase.

Currently the state machine works on the dest node for the latter only because
migrate_set_state() implicitly becomes a no-op if the current state
check fails.  That is not clear at all.

Clean it up by providing a helper migration_incoming_state_setup() doing
proper checks over current status.  Postcopy-paused will be explicitly
checked now, and then we can bail out for unknown states.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 migration/migration.c | 28 ++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 795b30f0d0..41a88fc50a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -618,6 +618,29 @@ bool migrate_uri_parse(const char *uri, MigrationChannel 
**channel,
 return true;
 }
 
+static bool
+migration_incoming_state_setup(MigrationIncomingState *mis, Error **errp)
+{
+MigrationStatus current = mis->state;
+
+if (current == MIGRATION_STATUS_POSTCOPY_PAUSED) {
+/*
+ * Incoming postcopy migration will stay in PAUSED state even if
+ * reconnection happened.
+ */
+return true;
+}
+
+if (current != MIGRATION_STATUS_NONE) {
+error_setg(errp, "Illegal migration incoming state: %s",
+   MigrationStatus_str(current));
+return false;
+}
+
+migrate_set_state(&mis->state, current, MIGRATION_STATUS_SETUP);
+return true;
+}
+
 static void qemu_start_incoming_migration(const char *uri, bool has_channels,
   MigrationChannelList *channels,
   Error **errp)
@@ -656,8 +679,9 @@ static void qemu_start_incoming_migration(const char *uri, 
bool has_channels,
 return;
 }
 
-migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
-  MIGRATION_STATUS_SETUP);
+if (!migration_incoming_state_setup(mis, errp)) {
+return;
+}
 
 if (addr->transport == MIGRATION_ADDRESS_TYPE_SOCKET) {
 SocketAddress *saddr = &addr->u.socket;
-- 
2.35.3




[PULL 01/28] migration: Drop reference to QIOChannel if file seeking fails

2024-06-21 Thread Fabiano Rosas
We forgot to drop the reference to the QIOChannel in the error path of
the offset adjustment. Do it now.

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 migration/file.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/migration/file.c b/migration/file.c
index ab18ba505a..2bb8c64092 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -94,6 +94,7 @@ void file_start_outgoing_migration(MigrationState *s,
 
 ioc = QIO_CHANNEL(fioc);
 if (offset && qio_channel_io_seek(ioc, offset, SEEK_SET, errp) < 0) {
+object_unref(OBJECT(fioc));
 return;
 }
 qio_channel_set_name(ioc, "migration-file-outgoing");
-- 
2.35.3




[PULL 08/28] monitor: Simplify fdset and fd removal

2024-06-21 Thread Fabiano Rosas
Remove fds right away instead of setting the ->removed flag. We don't
need the extra complexity of having a cleanup function reap the
removed entries at a later time.

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 monitor/fds.c | 27 ++-
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/monitor/fds.c b/monitor/fds.c
index 76199d4b3b..e7619a6103 100644
--- a/monitor/fds.c
+++ b/monitor/fds.c
@@ -43,7 +43,6 @@ struct mon_fd_t {
 typedef struct MonFdsetFd MonFdsetFd;
 struct MonFdsetFd {
 int fd;
-bool removed;
 char *opaque;
 QLIST_ENTRY(MonFdsetFd) next;
 };
@@ -193,20 +192,6 @@ static void monitor_fdset_fd_free(MonFdsetFd *mon_fdset_fd)
 g_free(mon_fdset_fd);
 }
 
-static void monitor_fdset_cleanup(MonFdset *mon_fdset)
-{
-MonFdsetFd *mon_fdset_fd;
-MonFdsetFd *mon_fdset_fd_next;
-
-QLIST_FOREACH_SAFE(mon_fdset_fd, &mon_fdset->fds, next, mon_fdset_fd_next) 
{
-if (mon_fdset_fd->removed) {
-monitor_fdset_fd_free(mon_fdset_fd);
-}
-}
-
-monitor_fdset_free_if_empty(mon_fdset);
-}
-
 void monitor_fdsets_cleanup(void)
 {
 MonFdset *mon_fdset;
@@ -281,7 +266,7 @@ void qmp_get_win32_socket(const char *infos, const char 
*fdname, Error **errp)
 void qmp_remove_fd(int64_t fdset_id, bool has_fd, int64_t fd, Error **errp)
 {
 MonFdset *mon_fdset;
-MonFdsetFd *mon_fdset_fd;
+MonFdsetFd *mon_fdset_fd, *mon_fdset_fd_next;
 char fd_str[60];
 
 QEMU_LOCK_GUARD(&mon_fdsets_lock);
@@ -289,21 +274,22 @@ void qmp_remove_fd(int64_t fdset_id, bool has_fd, int64_t 
fd, Error **errp)
 if (mon_fdset->id != fdset_id) {
 continue;
 }
-QLIST_FOREACH(mon_fdset_fd, &mon_fdset->fds, next) {
+QLIST_FOREACH_SAFE(mon_fdset_fd, &mon_fdset->fds, next,
+   mon_fdset_fd_next) {
 if (has_fd) {
 if (mon_fdset_fd->fd != fd) {
 continue;
 }
-mon_fdset_fd->removed = true;
+monitor_fdset_fd_free(mon_fdset_fd);
 break;
 } else {
-mon_fdset_fd->removed = true;
+monitor_fdset_fd_free(mon_fdset_fd);
 }
 }
 if (has_fd && !mon_fdset_fd) {
 goto error;
 }
-monitor_fdset_cleanup(mon_fdset);
+monitor_fdset_free_if_empty(mon_fdset);
 return;
 }
 
@@ -413,7 +399,6 @@ AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, 
int64_t fdset_id,
 
 mon_fdset_fd = g_malloc0(sizeof(*mon_fdset_fd));
 mon_fdset_fd->fd = fd;
-mon_fdset_fd->removed = false;
 mon_fdset_fd->opaque = g_strdup(opaque);
 QLIST_INSERT_HEAD(&mon_fdset->fds, mon_fdset_fd, next);
 
-- 
2.35.3




[PULL 05/28] monitor: Drop monitor_fdset_dup_fd_find/_remove()

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

Those functions are not needed, one remove function should already
work.  Clean it up.

Here the code doesn't really care about whether we need to keep that dupfd
around if close() failed: when that happens something got very wrong,
keeping the dup_fd around the fdsets may not help that situation so far.

Cc: Dr. David Alan Gilbert 
Cc: Markus Armbruster 
Cc: Philippe Mathieu-Daudé 
Cc: Paolo Bonzini 
Cc: Daniel P. Berrangé 
Signed-off-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
[add missing return statement, removal during traversal is not safe]
Signed-off-by: Fabiano Rosas 
---
 include/monitor/monitor.h |  1 -
 monitor/fds.c | 28 ++--
 stubs/fdset.c |  5 -
 util/osdep.c  | 15 +--
 4 files changed, 7 insertions(+), 42 deletions(-)

diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index 965f5d5450..fd9b3f538c 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -53,7 +53,6 @@ AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, 
int64_t fdset_id,
 const char *opaque, Error **errp);
 int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags);
 void monitor_fdset_dup_fd_remove(int dup_fd);
-int64_t monitor_fdset_dup_fd_find(int dup_fd);
 
 void monitor_register_hmp(const char *name, bool info,
   void (*cmd)(Monitor *mon, const QDict *qdict));
diff --git a/monitor/fds.c b/monitor/fds.c
index d86c2c674c..fb9f58c056 100644
--- a/monitor/fds.c
+++ b/monitor/fds.c
@@ -458,7 +458,7 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
 #endif
 }
 
-static int64_t monitor_fdset_dup_fd_find_remove(int dup_fd, bool remove)
+void monitor_fdset_dup_fd_remove(int dup_fd)
 {
 MonFdset *mon_fdset;
 MonFdsetFd *mon_fdset_fd_dup;
@@ -467,31 +467,15 @@ static int64_t monitor_fdset_dup_fd_find_remove(int 
dup_fd, bool remove)
 QLIST_FOREACH(mon_fdset, &mon_fdsets, next) {
 QLIST_FOREACH(mon_fdset_fd_dup, &mon_fdset->dup_fds, next) {
 if (mon_fdset_fd_dup->fd == dup_fd) {
-if (remove) {
-QLIST_REMOVE(mon_fdset_fd_dup, next);
-g_free(mon_fdset_fd_dup);
-if (QLIST_EMPTY(&mon_fdset->dup_fds)) {
-monitor_fdset_cleanup(mon_fdset);
-}
-return -1;
-} else {
-return mon_fdset->id;
+QLIST_REMOVE(mon_fdset_fd_dup, next);
+g_free(mon_fdset_fd_dup);
+if (QLIST_EMPTY(&mon_fdset->dup_fds)) {
+monitor_fdset_cleanup(mon_fdset);
 }
+return;
 }
 }
 }
-
-return -1;
-}
-
-int64_t monitor_fdset_dup_fd_find(int dup_fd)
-{
-return monitor_fdset_dup_fd_find_remove(dup_fd, false);
-}
-
-void monitor_fdset_dup_fd_remove(int dup_fd)
-{
-monitor_fdset_dup_fd_find_remove(dup_fd, true);
 }
 
 int monitor_fd_param(Monitor *mon, const char *fdname, Error **errp)
diff --git a/stubs/fdset.c b/stubs/fdset.c
index d7c39a28ac..389e368a29 100644
--- a/stubs/fdset.c
+++ b/stubs/fdset.c
@@ -9,11 +9,6 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
 return -1;
 }
 
-int64_t monitor_fdset_dup_fd_find(int dup_fd)
-{
-return -1;
-}
-
 void monitor_fdset_dup_fd_remove(int dupfd)
 {
 }
diff --git a/util/osdep.c b/util/osdep.c
index 5d23bbfbec..756de9a745 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -398,21 +398,8 @@ int qemu_open_old(const char *name, int flags, ...)
 
 int qemu_close(int fd)
 {
-int64_t fdset_id;
-
 /* Close fd that was dup'd from an fdset */
-fdset_id = monitor_fdset_dup_fd_find(fd);
-if (fdset_id != -1) {
-int ret;
-
-ret = close(fd);
-if (ret == 0) {
-monitor_fdset_dup_fd_remove(fd);
-}
-
-return ret;
-}
-
+monitor_fdset_dup_fd_remove(fd);
 return close(fd);
 }
 
-- 
2.35.3




[PULL 13/28] tests/qtest/migration: Add tests for file migration with direct-io

2024-06-21 Thread Fabiano Rosas
The tests are only allowed to run in systems that know about the
O_DIRECT flag and in filesystems which support it.

Note: this also brings back migrate_set_parameter_bool() which went
away when we removed the compression tests. I copied it verbatim.

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-helpers.c | 44 +++
 tests/qtest/migration-helpers.h |  8 +
 tests/qtest/migration-test.c| 62 +
 3 files changed, 114 insertions(+)

diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
index ce6d6615b5..0ac49ceb54 100644
--- a/tests/qtest/migration-helpers.c
+++ b/tests/qtest/migration-helpers.c
@@ -18,6 +18,7 @@
 #include "qapi/error.h"
 #include "qapi/qmp/qlist.h"
 #include "qemu/cutils.h"
+#include "qemu/memalign.h"
 
 #include "migration-helpers.h"
 
@@ -473,3 +474,46 @@ void migration_test_add(const char *path, void (*fn)(void))
 qtest_add_data_func_full(path, test, migration_test_wrapper,
  migration_test_destroy);
 }
+
+#ifdef O_DIRECT
+/*
+ * Probe for O_DIRECT support on the filesystem. Since this is used
+ * for tests, be conservative, if anything fails, assume it's
+ * unsupported.
+ */
+bool probe_o_direct_support(const char *tmpfs)
+{
+g_autofree char *filename = g_strdup_printf("%s/probe-o-direct", tmpfs);
+int fd, flags = O_CREAT | O_RDWR | O_TRUNC | O_DIRECT;
+void *buf;
+ssize_t ret, len;
+uint64_t offset;
+
+fd = open(filename, flags, 0660);
+if (fd < 0) {
+unlink(filename);
+return false;
+}
+
+/*
+ * Using 1MB alignment as conservative choice to satisfy any
+ * plausible architecture default page size, and/or filesystem
+ * alignment restrictions.
+ */
+len = 0x100000;
+offset = 0x100000;
+
+buf = qemu_try_memalign(len, len);
+g_assert(buf);
+
+ret = pwrite(fd, buf, len, offset);
+unlink(filename);
+g_free(buf);
+
+if (ret < 0) {
+return false;
+}
+
+return true;
+}
+#endif
diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h
index 1339835698..50095fca4a 100644
--- a/tests/qtest/migration-helpers.h
+++ b/tests/qtest/migration-helpers.h
@@ -54,5 +54,13 @@ char *find_common_machine_version(const char *mtype, const 
char *var1,
   const char *var2);
 char *resolve_machine_version(const char *alias, const char *var1,
   const char *var2);
+#ifdef O_DIRECT
+bool probe_o_direct_support(const char *tmpfs);
+#else
+static inline bool probe_o_direct_support(const char *tmpfs)
+{
+return false;
+}
+#endif
 void migration_test_add(const char *path, void (*fn)(void));
 #endif /* MIGRATION_HELPERS_H */
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index eb4d5948e0..5c41d1b70e 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -408,6 +408,38 @@ static void migrate_set_parameter_str(QTestState *who, 
const char *parameter,
 migrate_check_parameter_str(who, parameter, value);
 }
 
+static long long migrate_get_parameter_bool(QTestState *who,
+   const char *parameter)
+{
+QDict *rsp;
+int result;
+
+rsp = qtest_qmp_assert_success_ref(
+who, "{ 'execute': 'query-migrate-parameters' }");
+result = qdict_get_bool(rsp, parameter);
+qobject_unref(rsp);
+return !!result;
+}
+
+static void migrate_check_parameter_bool(QTestState *who, const char 
*parameter,
+int value)
+{
+int result;
+
+result = migrate_get_parameter_bool(who, parameter);
+g_assert_cmpint(result, ==, value);
+}
+
+static void migrate_set_parameter_bool(QTestState *who, const char *parameter,
+  int value)
+{
+qtest_qmp_assert_success(who,
+ "{ 'execute': 'migrate-set-parameters',"
+ "'arguments': { %s: %i } }",
+ parameter, value);
+migrate_check_parameter_bool(who, parameter, value);
+}
+
 static void migrate_ensure_non_converge(QTestState *who)
 {
 /* Can't converge with 1ms downtime + 3 mbs bandwidth limit */
@@ -2235,6 +2267,33 @@ static void test_multifd_file_mapped_ram(void)
 test_file_common(&args, true);
 }
 
+static void *multifd_mapped_ram_dio_start(QTestState *from, QTestState *to)
+{
+migrate_multifd_mapped_ram_start(from, to);
+
+migrate_set_parameter_bool(from, "direct-io", true);
+migrate_set_parameter_bool(to, "direct-io", true);
+
+return NULL;
+}
+
+static void test_multifd_file_mapped_ram_dio(void)
+{
+g_autofree char *uri = g_strdup_printf("file:%s/%s", tmpfs,
+   FILE_TEST_FILENAME);
+MigrateCommon args = {
+.connect_uri = uri,
+.listen_uri = "defer",
+.start_hook = multifd_ma

[PULL 24/28] tests/migration-tests: Always enable migration events

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

Libvirt should always enable it, so it would be nice for qtest to also cover
that for all tests on both sides.  migrate_incoming_qmp() used to enable it
only on dst; now we enable it on both, as we'll start to sanity check events
even on the src QEMU.

We'll need to leave the one in migrate_incoming_qmp(), because
virtio-net-failover test uses that one only, and it relies on the events to
work.

Signed-off-by: Peter Xu 
Reviewed-by: Fabiano Rosas 
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-helpers.c | 1 +
 tests/qtest/migration-test.c| 7 +++
 2 files changed, 8 insertions(+)

diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
index 0ac49ceb54..2ca4425d71 100644
--- a/tests/qtest/migration-helpers.c
+++ b/tests/qtest/migration-helpers.c
@@ -258,6 +258,7 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, 
const char *fmt, ...)
 g_assert(!qdict_haskey(args, "uri"));
 qdict_put_str(args, "uri", uri);
 
+/* This function relies on the event to work, make sure it's enabled */
 migrate_set_capability(to, "events", true);
 
 rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}",
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index b7dea1aabb..32e31fff86 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -852,6 +852,13 @@ static int test_migrate_start(QTestState **from, 
QTestState **to,
 unlink(shmem_path);
 }
 
+/*
+ * Always enable migration events.  Libvirt always uses it, let's try
+ * to mimic as closer as that.
+ */
+migrate_set_capability(*from, "events", true);
+migrate_set_capability(*to, "events", true);
+
 return 0;
 }
 
-- 
2.35.3




[PULL 14/28] monitor: fdset: Match against O_DIRECT

2024-06-21 Thread Fabiano Rosas
We're about to enable the use of O_DIRECT in the migration code and
due to the alignment restrictions imposed by filesystems we need to
make sure the flag is only used when doing aligned IO.

The migration will do parallel IO to different regions of a file, so
we need to use more than one file descriptor. Those cannot be obtained
by duplicating (dup()) since duplicated file descriptors share the
file status flags, including O_DIRECT. If one migration channel does
unaligned IO while another sets O_DIRECT to do aligned IO, the
filesystem would fail the unaligned operation.

The add-fd QMP command along with the fdset code are specifically
designed to allow the user to pass a set of file descriptors with
different access flags into QEMU to be later fetched by code that
needs to alternate between those flags when doing IO.

Extend the fdset matching to behave the same with the O_DIRECT flag.

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 monitor/fds.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/monitor/fds.c b/monitor/fds.c
index d8c6b395b0..b5416b5b5d 100644
--- a/monitor/fds.c
+++ b/monitor/fds.c
@@ -424,6 +424,11 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags, 
Error **errp)
 int fd = -1;
 int dup_fd;
 int mon_fd_flags;
+int mask = O_ACCMODE;
+
+#ifdef O_DIRECT
+mask |= O_DIRECT;
+#endif
 
 if (mon_fdset->id != fdset_id) {
 continue;
@@ -437,7 +442,7 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags, 
Error **errp)
 return -1;
 }
 
-if ((flags & O_ACCMODE) == (mon_fd_flags & O_ACCMODE)) {
+if ((flags & mask) == (mon_fd_flags & mask)) {
 fd = mon_fdset_fd->fd;
 break;
 }
-- 
2.35.3




[PULL 16/28] tests/qtest/migration: Add a test for mapped-ram with passing of fds

2024-06-21 Thread Fabiano Rosas
Add a multifd test for mapped-ram with passing of fds into QEMU. This
is how libvirt will consume the feature.

There are a couple of details to the fdset mechanism:

- multifd needs two distinct file descriptors (not duplicated with
  dup()) so it can enable O_DIRECT only on the channels that do
  aligned IO. The dup() system call creates file descriptors that
  share status flags, of which O_DIRECT is one.

- the open() access mode flags used for the fds passed into QEMU need
  to match the flags QEMU uses to open the file. Currently O_WRONLY
  for src and O_RDONLY for dst.

Note that the fdset code goes under #ifndef _WIN32 because fd passing is
not supported on Windows.

Reviewed-by: Peter Xu 
[brought back the qmp_remove_fd() call at the end of the tests]
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-test.c | 105 ++-
 1 file changed, 102 insertions(+), 3 deletions(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 5c41d1b70e..6207305ff8 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -2104,11 +2104,18 @@ static void test_precopy_file(void)
 
 #ifndef _WIN32
 static void fdset_add_fds(QTestState *qts, const char *file, int flags,
-  int num_fds)
+  int num_fds, bool direct_io)
 {
 for (int i = 0; i < num_fds; i++) {
 int fd;
 
+#ifdef O_DIRECT
+/* only secondary channels can use direct-io */
+if (direct_io && i != 0) {
+flags |= O_DIRECT;
+}
+#endif
+
 fd = open(file, flags, 0660);
 assert(fd != -1);
 
@@ -2122,8 +2129,8 @@ static void *file_offset_fdset_start_hook(QTestState 
*from, QTestState *to)
 {
 g_autofree char *file = g_strdup_printf("%s/%s", tmpfs, 
FILE_TEST_FILENAME);
 
-fdset_add_fds(from, file, O_WRONLY, 1);
-fdset_add_fds(to, file, O_RDONLY, 1);
+fdset_add_fds(from, file, O_WRONLY, 1, false);
+fdset_add_fds(to, file, O_RDONLY, 1, false);
 
 return NULL;
 }
@@ -2295,6 +2302,91 @@ static void test_multifd_file_mapped_ram_dio(void)
 test_file_common(&args, true);
 }
 
+#ifndef _WIN32
+static void multifd_mapped_ram_fdset_end(QTestState *from, QTestState *to,
+ void *opaque)
+{
+QDict *resp;
+QList *fdsets;
+
+/*
+ * Remove the fdsets after migration, otherwise a second migration
+ * would fail due fdset reuse.
+ */
+qtest_qmp_assert_success(from, "{'execute': 'remove-fd', "
+ "'arguments': { 'fdset-id': 1}}");
+
+/*
+ * Make sure no fdsets are left after migration, otherwise a
+ * second migration would fail due fdset reuse.
+ */
+resp = qtest_qmp(from, "{'execute': 'query-fdsets', "
+ "'arguments': {}}");
+g_assert(qdict_haskey(resp, "return"));
+fdsets = qdict_get_qlist(resp, "return");
+g_assert(fdsets && qlist_empty(fdsets));
+}
+
+static void *multifd_mapped_ram_fdset_dio(QTestState *from, QTestState *to)
+{
+g_autofree char *file = g_strdup_printf("%s/%s", tmpfs, 
FILE_TEST_FILENAME);
+
+fdset_add_fds(from, file, O_WRONLY, 2, true);
+fdset_add_fds(to, file, O_RDONLY, 2, true);
+
+migrate_multifd_mapped_ram_start(from, to);
+migrate_set_parameter_bool(from, "direct-io", true);
+migrate_set_parameter_bool(to, "direct-io", true);
+
+return NULL;
+}
+
+static void *multifd_mapped_ram_fdset(QTestState *from, QTestState *to)
+{
+g_autofree char *file = g_strdup_printf("%s/%s", tmpfs, 
FILE_TEST_FILENAME);
+
+fdset_add_fds(from, file, O_WRONLY, 2, false);
+fdset_add_fds(to, file, O_RDONLY, 2, false);
+
+migrate_multifd_mapped_ram_start(from, to);
+
+return NULL;
+}
+
+static void test_multifd_file_mapped_ram_fdset(void)
+{
+g_autofree char *uri = g_strdup_printf("file:/dev/fdset/1,offset=%d",
+   FILE_TEST_OFFSET);
+MigrateCommon args = {
+.connect_uri = uri,
+.listen_uri = "defer",
+.start_hook = multifd_mapped_ram_fdset,
+.finish_hook = multifd_mapped_ram_fdset_end,
+};
+
+test_file_common(&args, true);
+}
+
+static void test_multifd_file_mapped_ram_fdset_dio(void)
+{
+g_autofree char *uri = g_strdup_printf("file:/dev/fdset/1,offset=%d",
+   FILE_TEST_OFFSET);
+MigrateCommon args = {
+.connect_uri = uri,
+.listen_uri = "defer",
+.start_hook = multifd_mapped_ram_fdset_dio,
+.finish_hook = multifd_mapped_ram_fdset_end,
+};
+
+if (!probe_o_direct_support(tmpfs)) {
+g_test_skip("Filesystem does not support O_DIRECT");
+return;
+}
+
+test_file_common(&args, true);
+}
+#endif /* !_WIN32 */
+
 static void test_precopy_tcp_plain(void)
 {
 MigrateCommon args = {
@@ -3736,6 +3828,13 @@ int main(int argc, char **argv)
 migration_test_add("/migration/multifd/file/mapped-ram/

[PULL 00/28] Migration patches for 2024-06-21

2024-06-21 Thread Fabiano Rosas
The following changes since commit 02d9c38236cf8c9826e5c5be61780ccb4ae0:

  Merge tag 'pull-tcg-20240619' of https://gitlab.com/rth7680/qemu into staging 
(2024-06-19 14:00:39 -0700)

are available in the Git repository at:

  https://gitlab.com/farosas/qemu.git tags/migration-20240621-pull-request

for you to fetch changes up to 04b09de16d78cf2d163ca65d7c6d161bf2baceb6:

  migration: Remove unused VMSTATE_ARRAY_TEST() macro (2024-06-21 14:37:58 
-0300)


Migration pull request

- Fabiano's fix for fdset + file migration truncating the migration
  file

- Fabiano's fdset + direct-io support for mapped-ram

- Peter's various cleanups (multifd sync, thread names, migration
  states, tests)

- Peter's new migration state postcopy-recover-setup

- Philippe's unused vmstate macro cleanup



Fabiano Rosas (15):
  migration: Drop reference to QIOChannel if file seeking fails
  migration: Fix file migration with fdset
  tests/qtest/migration: Fix file migration offset check
  tests/qtest/migration: Add a precopy file test with fdset
  monitor: Introduce monitor_fdset_*free
  monitor: Stop removing non-duplicated fds
  monitor: Simplify fdset and fd removal
  monitor: Report errors from monitor_fdset_dup_fd_add
  io: Stop using qemu_open_old in channel-file
  migration: Add direct-io parameter
  migration/multifd: Add direct-io support
  tests/qtest/migration: Add tests for file migration with direct-io
  monitor: fdset: Match against O_DIRECT
  migration: Add documentation for fdset with multifd + file
  tests/qtest/migration: Add a test for mapped-ram with passing of fds

Peter Xu (12):
  monitor: Drop monitor_fdset_dup_fd_find/_remove()
  migration/multifd: Avoid the final FLUSH in complete()
  migration: Rename thread debug names
  migration: Use MigrationStatus instead of int
  migration: Cleanup incoming migration setup state change
  migration/postcopy: Add postcopy-recover-setup phase
  migration/docs: Update postcopy recover session for SETUP phase
  tests/migration-tests: Drop most WIN32 ifdefs for postcopy failure
tests
  tests/migration-tests: Always enable migration events
  tests/migration-tests: migration_event_wait()
  tests/migration-tests: Verify postcopy-recover-setup status
  tests/migration-tests: Cover postcopy failure on reconnect

Philippe Mathieu-Daudé (1):
  migration: Remove unused VMSTATE_ARRAY_TEST() macro

 docs/devel/migration/main.rst   |  24 +-
 docs/devel/migration/mapped-ram.rst |   6 +-
 docs/devel/migration/postcopy.rst   |  29 +-
 include/migration/vmstate.h |  10 -
 include/monitor/monitor.h   |   3 +-
 include/qemu/osdep.h|   2 +
 io/channel-file.c   |   8 +-
 migration/colo.c|   2 +-
 migration/file.c|  45 ++-
 migration/file.h|   1 -
 migration/migration-hmp-cmds.c  |  11 +
 migration/migration.c   | 121 +--
 migration/migration.h   |   9 +-
 migration/multifd.c |   6 +-
 migration/options.c |  35 +++
 migration/options.h |   1 +
 migration/postcopy-ram.c|  10 +-
 migration/postcopy-ram.h|   3 +
 migration/ram.c |   4 -
 migration/savevm.c  |   6 +-
 monitor/fds.c   |  96 +++---
 monitor/hmp.c   |   2 -
 monitor/monitor-internal.h  |   1 -
 monitor/monitor.c   |   1 -
 monitor/qmp.c   |   2 -
 qapi/migration.json |  25 +-
 stubs/fdset.c   |   7 +-
 tests/qtest/libqtest.c  |  15 +-
 tests/qtest/libqtest.h  |   2 +
 tests/qtest/migration-helpers.c |  76 -
 tests/qtest/migration-helpers.h |  10 +
 tests/qtest/migration-test.c| 470 +---
 util/osdep.c|  34 +-
 33 files changed, 838 insertions(+), 239 deletions(-)

-- 
2.35.3




[PULL 04/28] tests/qtest/migration: Add a precopy file test with fdset

2024-06-21 Thread Fabiano Rosas
Add a test for file migration using fdset. The passing of fds is more
complex than using a file path. This is also the scenario where it's
most important we ensure that the initial migration stream offset is
respected because the fdset interface is the one used by the
management layer when providing a non-empty migration file.

Note that fd passing is not available on Windows, so anything that
uses add-fd needs to exclude that platform.

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-test.c | 44 
 1 file changed, 44 insertions(+)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 0a529a527b..22b07bc0ec 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1990,6 +1990,46 @@ static void test_precopy_file(void)
 test_file_common(&args, true);
 }
 
+#ifndef _WIN32
+static void fdset_add_fds(QTestState *qts, const char *file, int flags,
+  int num_fds)
+{
+for (int i = 0; i < num_fds; i++) {
+int fd;
+
+fd = open(file, flags, 0660);
+assert(fd != -1);
+
+qtest_qmp_fds_assert_success(qts, &fd, 1, "{'execute': 'add-fd', "
+ "'arguments': {'fdset-id': 1}}");
+close(fd);
+}
+}
+
+static void *file_offset_fdset_start_hook(QTestState *from, QTestState *to)
+{
+g_autofree char *file = g_strdup_printf("%s/%s", tmpfs, 
FILE_TEST_FILENAME);
+
+fdset_add_fds(from, file, O_WRONLY, 1);
+fdset_add_fds(to, file, O_RDONLY, 1);
+
+return NULL;
+}
+
+static void test_precopy_file_offset_fdset(void)
+{
+g_autofree char *uri = g_strdup_printf("file:/dev/fdset/1,offset=%d",
+   FILE_TEST_OFFSET);
+MigrateCommon args = {
+.connect_uri = uri,
+.listen_uri = "defer",
+.start_hook = file_offset_fdset_start_hook,
+};
+
+test_file_common(&args, false);
+}
+#endif
+
 static void test_precopy_file_offset(void)
 {
 g_autofree char *uri = g_strdup_printf("file:%s/%s,offset=%d", tmpfs,
@@ -3527,6 +3567,10 @@ int main(int argc, char **argv)
test_precopy_file);
 migration_test_add("/migration/precopy/file/offset",
test_precopy_file_offset);
+#ifndef _WIN32
+migration_test_add("/migration/precopy/file/offset/fdset",
+   test_precopy_file_offset_fdset);
+#endif
 migration_test_add("/migration/precopy/file/offset/bad",
test_precopy_file_offset_bad);
 
-- 
2.35.3




[PULL 19/28] migration: Use MigrationStatus instead of int

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

QEMU uses "int" in most cases even if it stores MigrationStatus.  I don't
know why, so let's try to do that right and see what blows up..

Reviewed-by: Fabiano Rosas 
Signed-off-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 migration/migration.c | 24 +++-
 migration/migration.h |  9 +
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index f9b69af62f..795b30f0d0 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -413,7 +413,7 @@ void migration_incoming_state_destroy(void)
 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
 }
 
-static void migrate_generate_event(int new_state)
+static void migrate_generate_event(MigrationStatus new_state)
 {
 if (migrate_events()) {
 qapi_event_send_migration(new_state);
@@ -1296,8 +1296,6 @@ static void fill_destination_migration_info(MigrationInfo 
*info)
 }
 
 switch (mis->state) {
-case MIGRATION_STATUS_NONE:
-return;
 case MIGRATION_STATUS_SETUP:
 case MIGRATION_STATUS_CANCELLING:
 case MIGRATION_STATUS_CANCELLED:
@@ -1313,6 +1311,8 @@ static void fill_destination_migration_info(MigrationInfo 
*info)
 info->has_status = true;
 fill_destination_postcopy_migration_info(info);
 break;
+default:
+return;
 }
 info->status = mis->state;
 
@@ -1360,7 +1360,8 @@ void qmp_migrate_start_postcopy(Error **errp)
 
 /* shared migration helpers */
 
-void migrate_set_state(int *state, int old_state, int new_state)
+void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
+   MigrationStatus new_state)
 {
 assert(new_state < MIGRATION_STATUS__MAX);
 if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
@@ -1567,7 +1568,7 @@ bool migration_in_postcopy(void)
 }
 }
 
-bool migration_postcopy_is_alive(int state)
+bool migration_postcopy_is_alive(MigrationStatus state)
 {
 switch (state) {
 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
@@ -1612,20 +1613,9 @@ bool migration_is_idle(void)
 case MIGRATION_STATUS_COMPLETED:
 case MIGRATION_STATUS_FAILED:
 return true;
-case MIGRATION_STATUS_SETUP:
-case MIGRATION_STATUS_CANCELLING:
-case MIGRATION_STATUS_ACTIVE:
-case MIGRATION_STATUS_POSTCOPY_ACTIVE:
-case MIGRATION_STATUS_COLO:
-case MIGRATION_STATUS_PRE_SWITCHOVER:
-case MIGRATION_STATUS_DEVICE:
-case MIGRATION_STATUS_WAIT_UNPLUG:
+default:
 return false;
-case MIGRATION_STATUS__MAX:
-g_assert_not_reached();
 }
-
-return false;
 }
 
 bool migration_is_active(void)
diff --git a/migration/migration.h b/migration/migration.h
index 6af01362d4..38aa1402d5 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -160,7 +160,7 @@ struct MigrationIncomingState {
 /* PostCopyFD's for external userfaultfds & handlers of shared memory */
 GArray   *postcopy_remote_fds;
 
-int state;
+MigrationStatus state;
 
 /*
  * The incoming migration coroutine, non-NULL during qemu_loadvm_state().
@@ -301,7 +301,7 @@ struct MigrationState {
 /* params from 'migrate-set-parameters' */
 MigrationParameters parameters;
 
-int state;
+MigrationStatus state;
 
 /* State related to return path */
 struct {
@@ -459,7 +459,8 @@ struct MigrationState {
 bool rdma_migration;
 };
 
-void migrate_set_state(int *state, int old_state, int new_state);
+void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
+   MigrationStatus new_state);
 
 void migration_fd_process_incoming(QEMUFile *f);
 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
@@ -479,7 +480,7 @@ int migrate_init(MigrationState *s, Error **errp);
 bool migration_is_blocked(Error **errp);
 /* True if outgoing migration has entered postcopy phase */
 bool migration_in_postcopy(void);
-bool migration_postcopy_is_alive(int state);
+bool migration_postcopy_is_alive(MigrationStatus state);
 MigrationState *migrate_get_current(void);
 bool migration_has_failed(MigrationState *);
 bool migrate_mode_is_cpr(MigrationState *);
-- 
2.35.3




[PULL 26/28] tests/migration-tests: Verify postcopy-recover-setup status

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

Making sure the postcopy-recover-setup status is present in the postcopy
failure unit test.  Note that it only applies to src QEMU not dest.

Signed-off-by: Peter Xu 
Reviewed-by: Fabiano Rosas 
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-test.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 32e31fff86..e61096adfe 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1413,6 +1413,12 @@ static void postcopy_recover_fail(QTestState *from, 
QTestState *to)
 migrate_recover(to, "fd:fd-mig");
 migrate_qmp(from, to, "fd:fd-mig", NULL, "{'resume': true}");
 
+/*
+ * Source QEMU has an extra RECOVER_SETUP phase, dest doesn't have it.
+ * Make sure it appears along the way.
+ */
+migration_event_wait(from, "postcopy-recover-setup");
+
 /*
  * Make sure both QEMU instances will go into RECOVER stage, then test
  * kicking them out using migrate-pause.
-- 
2.35.3




[PULL 09/28] monitor: Report errors from monitor_fdset_dup_fd_add

2024-06-21 Thread Fabiano Rosas
I'm keeping the EACCES because callers expect to be able to look at
errno.

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 include/monitor/monitor.h |  2 +-
 monitor/fds.c | 10 +-
 stubs/fdset.c |  2 +-
 util/osdep.c  | 10 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index fd9b3f538c..c3740ec616 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -51,7 +51,7 @@ int monitor_read_password(MonitorHMP *mon, ReadLineFunc 
*readline_func,
 
 AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, int64_t fdset_id,
 const char *opaque, Error **errp);
-int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags);
+int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags, Error **errp);
 void monitor_fdset_dup_fd_remove(int dup_fd);
 
 void monitor_register_hmp(const char *name, bool info,
diff --git a/monitor/fds.c b/monitor/fds.c
index e7619a6103..d8c6b395b0 100644
--- a/monitor/fds.c
+++ b/monitor/fds.c
@@ -409,9 +409,10 @@ AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, 
int64_t fdset_id,
 return fdinfo;
 }
 
-int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
+int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags, Error **errp)
 {
 #ifdef _WIN32
+error_setg(errp, "Platform does not support fd passing (fdset)");
 return -ENOENT;
 #else
 MonFdset *mon_fdset;
@@ -431,6 +432,8 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
 QLIST_FOREACH(mon_fdset_fd, &mon_fdset->fds, next) {
 mon_fd_flags = fcntl(mon_fdset_fd->fd, F_GETFL);
 if (mon_fd_flags == -1) {
+error_setg(errp, "Failed to read file status flags for fd=%d",
+   mon_fdset_fd->fd);
 return -1;
 }
 
@@ -442,11 +445,15 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
 
 if (fd == -1) {
 errno = EACCES;
+error_setg(errp,
+   "Failed to find file descriptor with matching 
flags=0x%x",
+   flags);
 return -1;
 }
 
 dup_fd = qemu_dup_flags(fd, flags);
 if (dup_fd == -1) {
+error_setg(errp, "Failed to dup() given file descriptor fd=%d", 
fd);
 return -1;
 }
 
@@ -456,6 +463,7 @@ int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
 return dup_fd;
 }
 
+error_setg(errp, "Failed to find fdset /dev/fdset/%" PRId64, fdset_id);
 errno = ENOENT;
 return -1;
 #endif
diff --git a/stubs/fdset.c b/stubs/fdset.c
index 389e368a29..2950fd91fd 100644
--- a/stubs/fdset.c
+++ b/stubs/fdset.c
@@ -3,7 +3,7 @@
 #include "monitor/monitor.h"
 #include "../monitor/monitor-internal.h"
 
-int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
+int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags, Error **errp)
 {
 errno = ENOSYS;
 return -1;
diff --git a/util/osdep.c b/util/osdep.c
index 756de9a745..5bbfdfac7a 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -310,7 +310,6 @@ qemu_open_internal(const char *name, int flags, mode_t 
mode, Error **errp)
 /* Attempt dup of fd from fd set */
 if (strstart(name, "/dev/fdset/", &fdset_id_str)) {
 int64_t fdset_id;
-int dupfd;
 
 fdset_id = qemu_parse_fdset(fdset_id_str);
 if (fdset_id == -1) {
@@ -319,14 +318,7 @@ qemu_open_internal(const char *name, int flags, mode_t 
mode, Error **errp)
 return -1;
 }
 
-dupfd = monitor_fdset_dup_fd_add(fdset_id, flags);
-if (dupfd == -1) {
-error_setg_errno(errp, errno, "Could not dup FD for %s flags %x",
- name, flags);
-return -1;
-}
-
-return dupfd;
+return monitor_fdset_dup_fd_add(fdset_id, flags, errp);
 }
 #endif
 
-- 
2.35.3




[PULL 21/28] migration/postcopy: Add postcopy-recover-setup phase

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

This patch adds a migration state on src called "postcopy-recover-setup".
The new state will describe the intermediate step starting from when the
src QEMU received a postcopy recovery request, until the migration channels
are properly established, but before the recovery process takes place.

The request came from Libvirt, which currently relies on the migration
state events to detect migration state changes.  That works for most of the
migration process, except for postcopy recovery failures at the beginning.

Currently postcopy recovery only has two major states:

  - postcopy-paused: this is the state that both sides of QEMU will be in
for a long time as long as the migration channel was interrupted.

  - postcopy-recover: this is the state where both sides of QEMU handshake
with each other, preparing for a continuation of postcopy which used to
be interrupted.

The issue here is when the recovery port is invalid, the src QEMU will take
the URI/channels, noticing the ports are not valid, and it'll silently keep
in the postcopy-paused state, with no event sent to Libvirt.  In this case,
the only thing Libvirt can do is to poll the migration status with a proper
interval, however that's less optimal.

Considering that this is the only case where Libvirt won't get a
notification from QEMU on such events, let's add postcopy-recover-setup
state to mimic what we have with the "setup" state of a newly initialized
migration, describing the phase of connection establishment.

With that, postcopy recovery will have two paths to go now, and either path
will guarantee an event generated.  Now the events will look like this
during a recovery process on src QEMU:

  - Initially when the recovery is initiated on src, QEMU will go from
"postcopy-paused" -> "postcopy-recover-setup".  Old QEMUs don't have
this event.

  - Depending on whether the channel re-establishment succeeds:

    - In the success case, src QEMU will move from "postcopy-recover-setup"
      to "postcopy-recover".  Old QEMUs also have this event.

    - In the failure case, src QEMU will move from "postcopy-recover-setup"
      to "postcopy-paused" again.  Old QEMUs don't have this event.

This guarantees that Libvirt will always receive a notification for
recovery process properly.

One thing to mention is that the new status is only needed on src QEMU, not
on both.  On dest QEMU, the state machine doesn't change.  Hence the events
don't change either.  It's done this way because dest QEMU may not have an
explicit point of setup start.  E.g., it can happen that dest QEMU doesn't
use the migrate-recover command to set up a new URI/channel, and instead the
old URI/channels are reused in recovery, in which case the old ports can
simply work again after the network routes are fixed up.

Add a new helper postcopy_is_paused() detecting whether postcopy is still
paused, taking RECOVER_SETUP into account too.  When using it on both
src/dst, a slight change is done altogether to always wait for the
semaphore before checking the status, because for both sides a sem_post()
will be required for a recovery.

Cc: Jiri Denemark 
Cc: Prasad Pandit 
Reviewed-by: Fabiano Rosas 
Buglink: https://issues.redhat.com/browse/RHEL-38485
Signed-off-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 migration/migration.c| 40 ++--
 migration/postcopy-ram.c |  6 ++
 migration/postcopy-ram.h |  3 +++
 migration/savevm.c   |  4 ++--
 qapi/migration.json  |  4 
 5 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 41a88fc50a..3dea06d577 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1117,6 +1117,7 @@ bool migration_is_setup_or_active(void)
 case MIGRATION_STATUS_ACTIVE:
 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
 case MIGRATION_STATUS_POSTCOPY_PAUSED:
+case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
 case MIGRATION_STATUS_POSTCOPY_RECOVER:
 case MIGRATION_STATUS_SETUP:
 case MIGRATION_STATUS_PRE_SWITCHOVER:
@@ -1139,6 +1140,7 @@ bool migration_is_running(void)
 case MIGRATION_STATUS_ACTIVE:
 case MIGRATION_STATUS_POSTCOPY_ACTIVE:
 case MIGRATION_STATUS_POSTCOPY_PAUSED:
+case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
 case MIGRATION_STATUS_POSTCOPY_RECOVER:
 case MIGRATION_STATUS_SETUP:
 case MIGRATION_STATUS_PRE_SWITCHOVER:
@@ -1276,6 +1278,7 @@ static void fill_source_migration_info(MigrationInfo 
*info)
 case MIGRATION_STATUS_PRE_SWITCHOVER:
 case MIGRATION_STATUS_DEVICE:
 case MIGRATION_STATUS_POSTCOPY_PAUSED:
+case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
 case MIGRATION_STATUS_POSTCOPY_RECOVER:
 /* TODO add some postcopy stats */
 populate_time_info(info, s);
@@ -1482,9 +1485,30 @@ static void migrate_error_free(MigrationState *s)
 
 static void migrate_fd_error(MigrationState *s, const Error *error)
 {
+MigrationStatus current 

[PULL 25/28] tests/migration-tests: migration_event_wait()

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

Introduce a small helper to wait for a migration event, generalized from
the incoming migration path.  Make the helper easier to use by allowing it
to keep waiting until the expected event is received.

Signed-off-by: Peter Xu 
Reviewed-by: Fabiano Rosas 
Signed-off-by: Fabiano Rosas 
---
 tests/qtest/migration-helpers.c | 31 ++-
 tests/qtest/migration-helpers.h |  2 ++
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
index 2ca4425d71..84f49db85e 100644
--- a/tests/qtest/migration-helpers.c
+++ b/tests/qtest/migration-helpers.c
@@ -249,7 +249,7 @@ void migrate_set_capability(QTestState *who, const char 
*capability,
 void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, 
...)
 {
 va_list ap;
-QDict *args, *rsp, *data;
+QDict *args, *rsp;
 
 va_start(ap, fmt);
 args = qdict_from_vjsonf_nofail(fmt, ap);
@@ -272,14 +272,7 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, 
const char *fmt, ...)
 g_assert(qdict_haskey(rsp, "return"));
 qobject_unref(rsp);
 
-rsp = qtest_qmp_eventwait_ref(to, "MIGRATION");
-g_assert(qdict_haskey(rsp, "data"));
-
-data = qdict_get_qdict(rsp, "data");
-g_assert(qdict_haskey(data, "status"));
-g_assert_cmpstr(qdict_get_str(data, "status"), ==, "setup");
-
-qobject_unref(rsp);
+migration_event_wait(to, "setup");
 }
 
 /*
@@ -518,3 +511,23 @@ bool probe_o_direct_support(const char *tmpfs)
 return true;
 }
 #endif
+
+/*
+ * Wait for a "MIGRATION" event.  This is what Libvirt uses to track
+ * migration status changes.
+ */
+void migration_event_wait(QTestState *s, const char *target)
+{
+QDict *response, *data;
+const char *status;
+bool found;
+
+do {
+response = qtest_qmp_eventwait_ref(s, "MIGRATION");
+data = qdict_get_qdict(response, "data");
+g_assert(data);
+status = qdict_get_str(data, "status");
+found = (strcmp(status, target) == 0);
+qobject_unref(response);
+} while (!found);
+}
diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h
index 50095fca4a..72dba369fb 100644
--- a/tests/qtest/migration-helpers.h
+++ b/tests/qtest/migration-helpers.h
@@ -63,4 +63,6 @@ static inline bool probe_o_direct_support(const char *tmpfs)
 }
 #endif
 void migration_test_add(const char *path, void (*fn)(void));
+void migration_event_wait(QTestState *s, const char *target);
+
 #endif /* MIGRATION_HELPERS_H */
-- 
2.35.3




[PULL 22/28] migration/docs: Update postcopy recover session for SETUP phase

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

Firstly, the "Paused" state was added in the wrong place before. The state
machine section was describing PostcopyState, rather than MigrationStatus.
Drop the Paused state descriptions.

Then in the postcopy recover session, add more information on the state
machine for MigrationStatus in the lines.  Add the new RECOVER_SETUP phase.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Peter Xu 
[fix typo s/reconnects/reconnect]
Signed-off-by: Fabiano Rosas 
---
 docs/devel/migration/postcopy.rst | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/docs/devel/migration/postcopy.rst 
b/docs/devel/migration/postcopy.rst
index 6c51e96d79..82e7a848c6 100644
--- a/docs/devel/migration/postcopy.rst
+++ b/docs/devel/migration/postcopy.rst
@@ -99,17 +99,6 @@ ADVISE->DISCARD->LISTEN->RUNNING->END
 (although it can't do the cleanup it would do as it
 finishes a normal migration).
 
- - Paused
-
-Postcopy can run into a paused state (normally on both sides when
-happens), where all threads will be temporarily halted mostly due to
-network errors.  When reaching paused state, migration will make sure
-the qemu binary on both sides maintain the data without corrupting
-the VM.  To continue the migration, the admin needs to fix the
-migration channel using the QMP command 'migrate-recover' on the
-destination node, then resume the migration using QMP command 'migrate'
-again on source node, with resume=true flag set.
-
  - End
 
 The listen thread can now quit, and perform the cleanup of migration
@@ -221,7 +210,8 @@ paused postcopy migration.
 
 The recovery phase normally contains a few steps:
 
-  - When network issue occurs, both QEMU will go into PAUSED state
+  - When network issue occurs, both QEMU will go into **POSTCOPY_PAUSED**
+migration state.
 
   - When the network is recovered (or a new network is provided), the admin
 can setup the new channel for migration using QMP command
@@ -229,9 +219,20 @@ The recovery phase normally contains a few steps:
 
   - On source host, the admin can continue the interrupted postcopy
 migration using QMP command 'migrate' with resume=true flag set.
+Source QEMU will go into **POSTCOPY_RECOVER_SETUP** state trying to
+re-establish the channels.
 
-  - After the connection is re-established, QEMU will continue the postcopy
-migration on both sides.
+  - When both sides of QEMU successfully reconnect using a new or fixed up
+channel, they will go into **POSTCOPY_RECOVER** state, some handshake
+procedure will be needed to properly synchronize the VM states between
+the two QEMUs to continue the postcopy migration.  For example, there
+can be pages sent right during the window when the network is
+interrupted, then the handshake will guarantee pages lost in-flight
+will be resent again.
+
+  - After a proper handshake synchronization, QEMU will continue the
+postcopy migration on both sides and go back to **POSTCOPY_ACTIVE**
+state.  Postcopy migration will continue.
 
 During a paused postcopy migration, the VM can logically still continue
 running, and it will not be impacted from any page access to pages that
-- 
2.35.3




[PULL 18/28] migration: Rename thread debug names

2024-06-21 Thread Fabiano Rosas
From: Peter Xu 

The postcopy thread names on dest QEMU are slightly confusing, for which I
partly have to blame my own commit 36f62f11e4 ("migration: Postcopy preemption
preparation on channel creation").  E.g., "fault-fast" reads like a fast
version of "fault-default", but it's actually the fast version of
"postcopy/listen".

Taking this chance, rename all the migration threads with proper rules.
Considering we only have 15 chars usable, prefix all threads with "mig/",
meanwhile identify src/dst threads properly this time.  So now most thread
names will look like "mig/DIR/xxx", where DIR will be "src"/"dst", except
the bg-snapshot thread which doesn't have a direction.

For multifd threads, name them "mig/{src|dst}/{send|recv}_%d".

We used to have "live_migration" thread for a very long time, now it's
called "mig/src/main".  We may hope to have "mig/dst/main" soon but not
yet.

Reviewed-by: Fabiano Rosas 
Reviewed-by: Zhijian Li (Fujitsu) 
Signed-off-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 migration/colo.c | 2 +-
 migration/migration.c| 6 +++---
 migration/multifd.c  | 6 +++---
 migration/postcopy-ram.c | 4 ++--
 migration/savevm.c   | 2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/migration/colo.c b/migration/colo.c
index f96c2ee069..6449490221 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -935,7 +935,7 @@ void coroutine_fn colo_incoming_co(void)
 assert(bql_locked());
 assert(migration_incoming_colo_enabled());
 
-qemu_thread_create(&th, "COLO incoming", colo_process_incoming_thread,
+qemu_thread_create(&th, "mig/dst/colo", colo_process_incoming_thread,
mis, QEMU_THREAD_JOINABLE);
 
 mis->colo_incoming_co = qemu_coroutine_self();
diff --git a/migration/migration.c b/migration/migration.c
index e03c80b3aa..f9b69af62f 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2431,7 +2431,7 @@ static int open_return_path_on_source(MigrationState *ms)
 
 trace_open_return_path_on_source();
 
-qemu_thread_create(&ms->rp_state.rp_thread, "return path",
+qemu_thread_create(&ms->rp_state.rp_thread, "mig/src/rp-thr",
source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
 ms->rp_state.rp_thread_created = true;
 
@@ -3770,10 +3770,10 @@ void migrate_fd_connect(MigrationState *s, Error 
*error_in)
 }
 
 if (migrate_background_snapshot()) {
-qemu_thread_create(&s->thread, "bg_snapshot",
+qemu_thread_create(&s->thread, "mig/snapshot",
 bg_migration_thread, s, QEMU_THREAD_JOINABLE);
 } else {
-qemu_thread_create(&s->thread, "live_migration",
+qemu_thread_create(&s->thread, "mig/src/main",
 migration_thread, s, QEMU_THREAD_JOINABLE);
 }
 s->migration_thread_running = true;
diff --git a/migration/multifd.c b/migration/multifd.c
index d82885fdbb..0b4cbaddfe 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -1069,7 +1069,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams 
*p,
 args->p = p;
 
 p->tls_thread_created = true;
-qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker",
+qemu_thread_create(&p->tls_thread, "mig/src/tls",
multifd_tls_handshake_thread, args,
QEMU_THREAD_JOINABLE);
 return true;
@@ -1190,7 +1190,7 @@ bool multifd_send_setup(void)
 p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 p->packet->version = cpu_to_be32(MULTIFD_VERSION);
 }
-p->name = g_strdup_printf("multifdsend_%d", i);
+p->name = g_strdup_printf("mig/src/send_%d", i);
 p->page_size = qemu_target_page_size();
 p->page_count = page_count;
 p->write_flags = 0;
@@ -1604,7 +1604,7 @@ int multifd_recv_setup(Error **errp)
 + sizeof(uint64_t) * page_count;
 p->packet = g_malloc0(p->packet_len);
 }
-p->name = g_strdup_printf("multifdrecv_%d", i);
+p->name = g_strdup_printf("mig/dst/recv_%d", i);
 p->normal = g_new0(ram_addr_t, page_count);
 p->zero = g_new0(ram_addr_t, page_count);
 p->page_count = page_count;
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 3419779548..97701e6bb2 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -1238,7 +1238,7 @@ int postcopy_ram_incoming_setup(MigrationIncomingState 
*mis)
 return -1;
 }
 
-postcopy_thread_create(mis, &mis->fault_thread, "fault-default",
+postcopy_thread_create(mis, &mis->fault_thread, "mig/dst/fault",
postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
 mis->have_fault_thread = true;
 
@@ -1258,7 +1258,7 @@ int postcopy_ram_incoming_setup(MigrationIncomingState 
*mis)
  * This thread needs to be created after the temp pages because
  * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
  */
-   

[PULL 28/28] migration: Remove unused VMSTATE_ARRAY_TEST() macro

2024-06-21 Thread Fabiano Rosas
From: Philippe Mathieu-Daudé 

Last use of VMSTATE_ARRAY_TEST() was removed in commit 46baa9007f
("migration/i386: Remove old non-softfloat 64bit FP support"), we
can safely get rid of it.

Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Li Zhijian 
Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 include/migration/vmstate.h | 10 --
 1 file changed, 10 deletions(-)

diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 294d2d8486..f313f2f408 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -388,16 +388,6 @@ extern const VMStateInfo vmstate_info_qlist;
 .offset = vmstate_offset_varray(_state, _field, _type),  \
 }
 
-#define VMSTATE_ARRAY_TEST(_field, _state, _num, _test, _info, _type) {\
-.name = (stringify(_field)),  \
-.field_exists = (_test),  \
-.num  = (_num),   \
-.info = &(_info), \
-.size = sizeof(_type),\
-.flags= VMS_ARRAY,\
-.offset   = vmstate_offset_array(_state, _field, _type, _num),\
-}
-
 #define VMSTATE_SUB_ARRAY(_field, _state, _start, _num, _version, _info, 
_type) { \
 .name   = (stringify(_field)),   \
 .version_id = (_version),\
-- 
2.35.3




[PULL 12/28] migration/multifd: Add direct-io support

2024-06-21 Thread Fabiano Rosas
When multifd is used along with mapped-ram, we can take advantage of a
filesystem that supports the O_DIRECT flag and perform direct I/O in
the multifd threads. This brings a significant performance improvement
because direct-io writes bypass the page cache which would otherwise
be thrashed by the multifd data which is unlikely to be needed again
in a short period of time.

To be able to use a multifd channel opened with O_DIRECT, we must
ensure that a certain alignment is used. Filesystems usually require a
block-size alignment for direct I/O. The way to achieve this is by
enabling the mapped-ram feature, which already aligns its I/O properly
(see MAPPED_RAM_FILE_OFFSET_ALIGNMENT at ram.c).

By setting O_DIRECT on the multifd channels, all writes to the same
file descriptor need to be aligned as well, even the ones that come
from outside multifd, such as the QEMUFile I/O from the main migration
code. This makes it impossible to use the same file descriptor for the
QEMUFile and for the multifd channels. The various flags and metadata
written by the main migration code will always be unaligned by virtue
of their small size. To work around this issue, we'll require a second
file descriptor to be used exclusively for direct I/O.

The second file descriptor can be obtained by QEMU by re-opening the
migration file (already possible), or by being provided by the user or
management application (support to be added in future patches).

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 migration/file.c  | 33 -
 migration/file.h  |  1 -
 migration/migration.c | 23 +++
 3 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/migration/file.c b/migration/file.c
index a903710f06..db870f2cf0 100644
--- a/migration/file.c
+++ b/migration/file.c
@@ -50,12 +50,31 @@ void file_cleanup_outgoing_migration(void)
 outgoing_args.fname = NULL;
 }
 
+static void file_enable_direct_io(int *flags)
+{
+#ifdef O_DIRECT
+*flags |= O_DIRECT;
+#else
+/* it should have been rejected when setting the parameter */
+g_assert_not_reached();
+#endif
+}
+
 bool file_send_channel_create(gpointer opaque, Error **errp)
 {
 QIOChannelFile *ioc;
 int flags = O_WRONLY;
 bool ret = true;
 
+if (migrate_direct_io()) {
+/*
+ * Enable O_DIRECT for the secondary channels. These are used
+ * for sending ram pages and writes should be guaranteed to be
+ * aligned to at least page size.
+ */
+file_enable_direct_io(&flags);
+}
+
 ioc = qio_channel_file_new_path(outgoing_args.fname, flags, 0, errp);
 if (!ioc) {
 ret = false;
@@ -117,21 +136,25 @@ static gboolean file_accept_incoming_migration(QIOChannel 
*ioc,
 return G_SOURCE_REMOVE;
 }
 
-void file_create_incoming_channels(QIOChannel *ioc, Error **errp)
+static void file_create_incoming_channels(QIOChannel *ioc, char *filename,
+  Error **errp)
 {
-int i, fd, channels = 1;
+int i, channels = 1;
 g_autofree QIOChannel **iocs = NULL;
+int flags = O_RDONLY;
 
 if (migrate_multifd()) {
 channels += migrate_multifd_channels();
+if (migrate_direct_io()) {
+file_enable_direct_io(&flags);
+}
 }
 
 iocs = g_new0(QIOChannel *, channels);
-fd = QIO_CHANNEL_FILE(ioc)->fd;
 iocs[0] = ioc;
 
 for (i = 1; i < channels; i++) {
-QIOChannelFile *fioc = qio_channel_file_new_dupfd(fd, errp);
+QIOChannelFile *fioc = qio_channel_file_new_path(filename, flags, 0, 
errp);
 
 if (!fioc) {
 while (i) {
@@ -171,7 +194,7 @@ void file_start_incoming_migration(FileMigrationArgs 
*file_args, Error **errp)
 return;
 }
 
-file_create_incoming_channels(QIO_CHANNEL(fioc), errp);
+file_create_incoming_channels(QIO_CHANNEL(fioc), filename, errp);
 }
 
 int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov,
diff --git a/migration/file.h b/migration/file.h
index 7699c04677..9f71e87f74 100644
--- a/migration/file.h
+++ b/migration/file.h
@@ -20,7 +20,6 @@ void file_start_outgoing_migration(MigrationState *s,
 int file_parse_offset(char *filespec, uint64_t *offsetp, Error **errp);
 void file_cleanup_outgoing_migration(void);
 bool file_send_channel_create(gpointer opaque, Error **errp);
-void file_create_incoming_channels(QIOChannel *ioc, Error **errp);
 int file_write_ramblock_iov(QIOChannel *ioc, const struct iovec *iov,
 int niov, RAMBlock *block, Error **errp);
 int multifd_file_recv_data(MultiFDRecvParams *p, Error **errp);
diff --git a/migration/migration.c b/migration/migration.c
index e1b269624c..e03c80b3aa 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -155,6 +155,16 @@ static bool migration_needs_seekable_channel(void)
 return migrate_mapped_ram();
 }
 
+static bool migration_needs_extra_fds(void)
+{
+/*
+ * When doing dire

[PULL 11/28] migration: Add direct-io parameter

2024-06-21 Thread Fabiano Rosas
Add the direct-io migration parameter that tells the migration code to
use O_DIRECT when opening the migration stream file whenever possible.

This is currently only used with the mapped-ram migration that has a
clear window guaranteed to perform aligned writes.

Acked-by: Markus Armbruster 
Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 include/qemu/osdep.h   |  2 ++
 migration/migration-hmp-cmds.c | 11 +++
 migration/options.c| 35 ++
 migration/options.h|  1 +
 qapi/migration.json| 21 +---
 util/osdep.c   |  9 +
 6 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index f61edcfdc2..191916f38e 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -612,6 +612,8 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, 
bool exclusive);
 bool qemu_has_ofd_lock(void);
 #endif
 
+bool qemu_has_direct_io(void);
+
 #if defined(__HAIKU__) && defined(__i386__)
 #define FMT_pid "%ld"
 #elif defined(WIN64)
diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 9f0e8029e0..7d608d26e1 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -351,6 +351,13 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict 
*qdict)
 monitor_printf(mon, "%s: %s\n",
 MigrationParameter_str(MIGRATION_PARAMETER_MODE),
 qapi_enum_lookup(&MigMode_lookup, params->mode));
+
+if (params->has_direct_io) {
+monitor_printf(mon, "%s: %s\n",
+   MigrationParameter_str(
+   MIGRATION_PARAMETER_DIRECT_IO),
+   params->direct_io ? "on" : "off");
+}
 }
 
 qapi_free_MigrationParameters(params);
@@ -624,6 +631,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict 
*qdict)
 p->has_mode = true;
 visit_type_MigMode(v, param, &p->mode, &err);
 break;
+case MIGRATION_PARAMETER_DIRECT_IO:
+p->has_direct_io = true;
+visit_type_bool(v, param, &p->direct_io, &err);
+break;
 default:
 assert(0);
 }
diff --git a/migration/options.c b/migration/options.c
index 5ab5b6d85d..645f55003d 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -702,6 +702,25 @@ bool migrate_cpu_throttle_tailslow(void)
 return s->parameters.cpu_throttle_tailslow;
 }
 
+bool migrate_direct_io(void)
+{
+MigrationState *s = migrate_get_current();
+
+/*
+ * O_DIRECT is only supported with mapped-ram and multifd.
+ *
+ * mapped-ram is needed because filesystems impose restrictions on
+ * O_DIRECT IO alignment (see MAPPED_RAM_FILE_OFFSET_ALIGNMENT).
+ *
+ * multifd is needed to keep the unaligned portion of the stream
+ * isolated to the main migration thread while multifd channels
+ * process the aligned data with O_DIRECT enabled.
+ */
+return s->parameters.direct_io &&
+s->capabilities[MIGRATION_CAPABILITY_MAPPED_RAM] &&
+s->capabilities[MIGRATION_CAPABILITY_MULTIFD];
+}
+
 uint64_t migrate_downtime_limit(void)
 {
 MigrationState *s = migrate_get_current();
@@ -905,6 +924,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 params->mode = s->parameters.mode;
 params->has_zero_page_detection = true;
 params->zero_page_detection = s->parameters.zero_page_detection;
+params->has_direct_io = true;
+params->direct_io = s->parameters.direct_io;
 
 return params;
 }
@@ -937,6 +958,7 @@ void migrate_params_init(MigrationParameters *params)
 params->has_vcpu_dirty_limit = true;
 params->has_mode = true;
 params->has_zero_page_detection = true;
+params->has_direct_io = true;
 }
 
 /*
@@ -1110,6 +1132,11 @@ bool migrate_params_check(MigrationParameters *params, 
Error **errp)
 return false;
 }
 
+if (params->has_direct_io && params->direct_io && !qemu_has_direct_io()) {
+error_setg(errp, "No build-time support for direct-io");
+return false;
+}
+
 return true;
 }
 
@@ -1216,6 +1243,10 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 if (params->has_zero_page_detection) {
 dest->zero_page_detection = params->zero_page_detection;
 }
+
+if (params->has_direct_io) {
+dest->direct_io = params->direct_io;
+}
 }
 
 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
@@ -1341,6 +1372,10 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 if (params->has_zero_page_detection) {
 s->parameters.zero_page_detection = params->zero_page_detection;
 }
+
+if (params->has_direct_io) {
+s->parameters.direct_io = params->direct_io;
+}
 }
 
 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
diff --git a/

[PULL 10/28] io: Stop using qemu_open_old in channel-file

2024-06-21 Thread Fabiano Rosas
We want to make use of the Error object to report fdset errors from
qemu_open_internal() and passing the error pointer to qemu_open_old()
would require changing all callers. Move the file channel to the new
API instead.

Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 io/channel-file.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/io/channel-file.c b/io/channel-file.c
index 6436cfb6ae..2ea8d08360 100644
--- a/io/channel-file.c
+++ b/io/channel-file.c
@@ -68,11 +68,13 @@ qio_channel_file_new_path(const char *path,
 
 ioc = QIO_CHANNEL_FILE(object_new(TYPE_QIO_CHANNEL_FILE));
 
-ioc->fd = qemu_open_old(path, flags, mode);
+if (flags & O_CREAT) {
+ioc->fd = qemu_create(path, flags & ~O_CREAT, mode, errp);
+} else {
+ioc->fd = qemu_open(path, flags, errp);
+}
 if (ioc->fd < 0) {
 object_unref(OBJECT(ioc));
-error_setg_errno(errp, errno,
- "Unable to open %s", path);
 return NULL;
 }
 
-- 
2.35.3




[PULL 15/28] migration: Add documentation for fdset with multifd + file

2024-06-21 Thread Fabiano Rosas
With the last few changes to the fdset infrastructure, we now allow
multifd to use an fdset when migrating to a file. This is useful for
the scenario where the management layer wants to have control over the
migration file.

By receiving the file descriptors directly, QEMU can delegate some
high level operating system operations to the management layer (such
as mandatory access control). The management layer might also want to
add its own headers before the migration stream.

Document the "file:/dev/fdset/#" syntax for the multifd migration with
mapped-ram. The requirements for the fdset mechanism are:

- the fdset must contain two fds that are not duplicates between
  themselves;

- if direct-io is to be used, exactly one of the fds must have the
  O_DIRECT flag set;

- the file must be opened with WRONLY on the migration source side;

- the file must be opened with RDONLY on the migration destination
  side.

Reviewed-by: Peter Xu 
Signed-off-by: Fabiano Rosas 
---
 docs/devel/migration/main.rst   | 24 +++-
 docs/devel/migration/mapped-ram.rst |  6 +-
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst
index 495cdcb112..784c899dca 100644
--- a/docs/devel/migration/main.rst
+++ b/docs/devel/migration/main.rst
@@ -47,11 +47,25 @@ over any transport.
   QEMU interference. Note that QEMU does not flush cached file
   data/metadata at the end of migration.
 
-In addition, support is included for migration using RDMA, which
-transports the page data using ``RDMA``, where the hardware takes care of
-transporting the pages, and the load on the CPU is much lower.  While the
-internals of RDMA migration are a bit different, this isn't really visible
-outside the RAM migration code.
+  The file migration also supports using a file that has already been
+  opened. A set of file descriptors is passed to QEMU via an "fdset"
+  (see add-fd QMP command documentation). This method allows a
+  management application to have control over the migration file
+  opening operation. There are, however, strict requirements to this
+  interface if the multifd capability is enabled:
+
+- the fdset must contain two file descriptors that are not
+  duplicates between themselves;
+- if the direct-io capability is to be used, exactly one of the
+  file descriptors must have the O_DIRECT flag set;
+- the file must be opened with WRONLY on the migration source side
+  and RDONLY on the migration destination side.
+
+- rdma migration: support is included for migration using RDMA, which
+  transports the page data using ``RDMA``, where the hardware takes
+  care of transporting the pages, and the load on the CPU is much
+  lower.  While the internals of RDMA migration are a bit different,
+  this isn't really visible outside the RAM migration code.
 
 All these migration protocols use the same infrastructure to
 save/restore state devices.  This infrastructure is shared with the
diff --git a/docs/devel/migration/mapped-ram.rst 
b/docs/devel/migration/mapped-ram.rst
index fa4cefd9fc..d352b546e9 100644
--- a/docs/devel/migration/mapped-ram.rst
+++ b/docs/devel/migration/mapped-ram.rst
@@ -16,7 +16,7 @@ location in the file, rather than constantly being added to a
 sequential stream. Having the pages at fixed offsets also allows the
 usage of O_DIRECT for save/restore of the migration stream as the
 pages are ensured to be written respecting O_DIRECT alignment
-restrictions (direct-io support not yet implemented).
+restrictions.
 
 Usage
 -
@@ -35,6 +35,10 @@ Use a ``file:`` URL for migration:
 Mapped-ram migration is best done non-live, i.e. by stopping the VM on
 the source side before migrating.
 
+For best performance enable the ``direct-io`` parameter as well:
+
+``migrate_set_parameter direct-io on``
+
 Use-cases
 -
 
-- 
2.35.3




[PULL 07/28] monitor: Stop removing non-duplicated fds

2024-06-21 Thread Fabiano Rosas
monitor_fdsets_cleanup() currently has three responsibilities:

1- Remove the fds that have been marked for removal(->removed=true) by
   qmp_remove_fd(). This is overly complicated, but ok.

2- Remove any file descriptors that have been passed into QEMU and
   never duplicated[1,2]. A file descriptor without duplicates
   indicates that no part of QEMU has made use of it. This is
   problematic because the current implementation does it only if the
   guest is not running and the monitor is closed.

3- Remove/free fdsets that have become empty due to the above
   removals. This is ok.

The scenario described in (2) is starting to show some cracks now that
we're trying to consume fds from the migration code:

- Doing cleanup every time the last monitor connection closes works to
  reap unused fds, but also has the side effect of forcing the
  management layer to pass the file descriptors again in case of a
  disconnect/re-connect, if that happened to be the only monitor
  connection.

  Another side effect is that removing an fd with qmp_remove_fd() is
  effectively delayed until the last monitor connection closes.

  The usage of mon_refcount is also problematic because it's racy.

- Checking runstate_is_running() skips the cleanup unless the VM is
  running and avoids premature cleanup of the fds, but also has the
  side effect of blocking the legitimate removal of an fd via
  qmp_remove_fd() if the VM happens to be in another state.

  This affects qmp_remove_fd() and qmp_query_fdsets() in particular
  because requesting a removal at a bad time (guest stopped) might
  cause an fd to never be removed, or to be removed at a much later
  point in time, causing the query command to continue showing the
  supposedly removed fd/fdset.

Note that file descriptors that *have* been duplicated are owned by
the code that uses them and will be removed after qemu_close() is
called. Therefore we've decided that the best course of action to
avoid the undesired side-effects is to stop managing non-duplicated
file descriptors.

1- efb87c1697 ("monitor: Clean up fd sets on monitor disconnect")
2- ebe52b592d ("monitor: Prevent removing fd from set during init")

Reviewed-by: Peter Xu 
[fix logic mistake: s/fdset_free/fdset_free_if_empty]
Signed-off-by: Fabiano Rosas 
---
 monitor/fds.c| 15 ---
 monitor/hmp.c|  2 -
 monitor/monitor-internal.h   |  1 -
 monitor/monitor.c|  1 -
 monitor/qmp.c|  2 -
 tests/qtest/libqtest.c   | 15 ---
 tests/qtest/libqtest.h   |  2 +
 tests/qtest/migration-test.c | 82 
 8 files changed, 102 insertions(+), 18 deletions(-)

diff --git a/monitor/fds.c b/monitor/fds.c
index bd45a26368..76199d4b3b 100644
--- a/monitor/fds.c
+++ b/monitor/fds.c
@@ -175,6 +175,11 @@ static void monitor_fdset_free(MonFdset *mon_fdset)
 
 static void monitor_fdset_free_if_empty(MonFdset *mon_fdset)
 {
+/*
+ * Only remove an empty fdset. The fds are owned by the user and
+ * should have been removed with qmp_remove_fd(). The dup_fds are
+ * owned by QEMU and should have been removed with qemu_close().
+ */
 if (QLIST_EMPTY(&mon_fdset->fds) && QLIST_EMPTY(&mon_fdset->dup_fds)) {
 monitor_fdset_free(mon_fdset);
 }
@@ -194,9 +199,7 @@ static void monitor_fdset_cleanup(MonFdset *mon_fdset)
 MonFdsetFd *mon_fdset_fd_next;
 
 QLIST_FOREACH_SAFE(mon_fdset_fd, &mon_fdset->fds, next, mon_fdset_fd_next) 
{
-if ((mon_fdset_fd->removed ||
-(QLIST_EMPTY(&mon_fdset->dup_fds) && mon_refcount == 0)) &&
-runstate_is_running()) {
+if (mon_fdset_fd->removed) {
 monitor_fdset_fd_free(mon_fdset_fd);
 }
 }
@@ -211,7 +214,7 @@ void monitor_fdsets_cleanup(void)
 
 QEMU_LOCK_GUARD(&mon_fdsets_lock);
 QLIST_FOREACH_SAFE(mon_fdset, &mon_fdsets, next, mon_fdset_next) {
-monitor_fdset_cleanup(mon_fdset);
+monitor_fdset_free_if_empty(mon_fdset);
 }
 }
 
@@ -484,9 +487,7 @@ void monitor_fdset_dup_fd_remove(int dup_fd)
 if (mon_fdset_fd_dup->fd == dup_fd) {
 QLIST_REMOVE(mon_fdset_fd_dup, next);
 g_free(mon_fdset_fd_dup);
-if (QLIST_EMPTY(&mon_fdset->dup_fds)) {
-monitor_fdset_cleanup(mon_fdset);
-}
+monitor_fdset_free_if_empty(mon_fdset);
 return;
 }
 }
diff --git a/monitor/hmp.c b/monitor/hmp.c
index 69c1b7e98a..460e8832f6 100644
--- a/monitor/hmp.c
+++ b/monitor/hmp.c
@@ -1437,11 +1437,9 @@ static void monitor_event(void *opaque, QEMUChrEvent 
event)
 monitor_resume(mon);
 }
 qemu_mutex_unlock(&mon->mon_lock);
-mon_refcount++;
 break;
 
 case CHR_EVENT_CLOSED:
-mon_refcount--;
 monitor_fdsets_cleanup();
 break;
 
diff --git a/monitor/monitor-internal.h b/monitor/monitor-internal.h

Re: [RFC PATCH] cxl: avoid duplicating report from MCE & device

2024-06-21 Thread Dan Williams
Shiyang Ruan wrote:
> Background:
> Since CXL device is a memory device, while CPU consumes a poison page of 
> CXL device, it always triggers a MCE by interrupt (INT18), no matter 
> which-First path is configured.  This is the first report.  Then 
> currently, in FW-First path, the poison event is transferred according 
> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES 
>  -> CPER -> trace report.  This is the second one.  These two reports
> are indicating the same poisoning page, which is the so-called "duplicate
> report"[1].  And the memory_failure() handling I'm trying to add in
> OS-First path could also be another duplicate report.
> 
> Hope the flow below could make it easier to understand:
> CPU accesses bad memory on CXL device, then
>  -> MCE (INT18), *always* report (1)
>  -> * FW-First (implemented now)
>   -> CXL device -> FW
> -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)
> * OS-First (not implemented yet, I'm working on it)
>   -> CXL device -> MSI
> -> OS:CXL driver -> memory_failure() (2.b)
> so, the (1) and (2.a/b) are duplicated.
> 
> (I didn't get response in my reply for [1] while I have to make patch to
> solve this problem, so please correct me if my understanding is wrong.)

The CPU MCE may not be in the loop. Consider the case of patrol scrub,
or device-DMA accessing poison. In that case the device will signal a
component event and the CPU may never issue the MCE.

What is missing for me from this description is *why* does the duplicate
report matter in practice? If all that happens is that the kernel
repeats the lookup to offline the page and set the HWPoison bit, is that
duplicated work worth adding more tracking?

> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> to check whether the current poison page has been reported (if yes,
> stop the notifier chain, won't call the following memory_failure()
> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> page already handled(recorded and reported) in (1) or (2), the other one
> won't duplicate the report.  The record could be cleared when
> cxl_clear_poison() is called.
> 
> [1] 
> https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be29...@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> 
> Signed-off-by: Shiyang Ruan 
> ---
>  arch/x86/include/asm/mce.h |   1 +
>  drivers/cxl/core/mbox.c| 130 +
>  drivers/cxl/core/memdev.c  |   6 +-
>  drivers/cxl/cxlmem.h   |   3 +
>  4 files changed, 139 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index dfd2e9699bd7..d8109c48e7d9 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
>   MCE_PRIO_NFIT,
>   MCE_PRIO_EXTLOG,
>   MCE_PRIO_UC,
> + MCE_PRIO_CXL,
>   MCE_PRIO_EARLY,
>   MCE_PRIO_CEC,
>   MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 2626f3fff201..0eb3c5401e81 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -4,6 +4,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev 
> *cxlmd,
>   if (cxlr)
>   hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
>  
> + if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
> + return;
> +
>   if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
>   trace_cxl_general_media(cxlmd, type, cxlr, hpa,
>   &evt->gen_media);
> @@ -1408,6 +1413,127 @@ int cxl_poison_state_init(struct cxl_memdev_state 
> *mds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, CXL);
>  
> +struct cxl_mce_record {
> + struct list_head node;
> + u64 hpa;
> +};
> +LIST_HEAD(cxl_mce_records);
> +DEFINE_MUTEX(cxl_mce_mutex);

I would recommend an xarray for this use case as that already has its
own internal locking and efficient memory allocation for new nodes.

However, the "why" question needs to be answered first.



command line syntax for connecting a chardev to a CPU

2024-06-21 Thread Peter Maydell
Arm CPUs have a "debug communications channel" which on real hardware
is basically a way to talk to the debugger on the other end of a JTAG
connection; Linux supports using this as a console. This patchseries:
 https://patchew.org/QEMU/20240614093026.328271-1-sai.pavan.bo...@amd.com/
proposes implementing this in QEMU by wiring it up to a QEMU chardev.

I think this is useful (among other things, it lets the user sidestep
the "where is my UART?" question). But I'm not sure what the right way
to let the user enable it and pick the chardev on the command line is.
Do we have any relevant existing precedent?

The patchseries has the CPU look for a chardev by ID, so if the user
creates a chardev with id=dcc0 the first CPU will use that, if there's
a chardev with id=dcc1 the second CPU will use that, and so on. I
don't think we really want to make some ID string values be magic,
but maybe we do that already somewhere, and so it's OK to do here?

I thought also of having the CPU take a chardev property, but then the
question is how to specify that on the command line. AFAICT the -cpu
option (a) requires a CPU type first, which is a pain for cases where
otherwise the user has no need to care about the exact type of CPU
because the machine model creates the right one for them, and (b) for
the key=value properties in a -cpu option string it will set the same
property value for every CPU in the system (which obviously isn't what
we want for this chardev).

We could make it a machine property (so you would say eg
 -M xlnx-zcu102,dcc0=mychardev -chardev stdio,id=mychardev)
but then that would require plumbing code in every machine model to
create the property and set the value on the right CPU.

Do we have a neat way to specify per-cpu CPU properties that I'm missing?

thanks
-- PMM



Re: [PATCH 09/13] qapi: convert "Note" sections to plain rST

2024-06-21 Thread John Snow
On Fri, Jun 21, 2024 at 8:23 AM Markus Armbruster  wrote:

> John Snow  writes:
>
> > On Thu, Jun 20, 2024 at 11:46 AM John Snow  wrote:
> >
> >>
> >>
> >> On Thu, Jun 20, 2024, 9:35 AM Markus Armbruster 
> wrote:
> >>
> >>> Markus Armbruster  writes:
> >>>
> >>> > John Snow  writes:
> >>>
> >>> [...]
> >>>
> >>> >> diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
> >>> >> index b3de1fb6b3a..57598331c5c 100644
> >>> >> --- a/qga/qapi-schema.json
> >>> >> +++ b/qga/qapi-schema.json
> >>>
> >>> [...]
> >>>
> >>> >> @@ -631,8 +632,8 @@
> >>> >>  # Errors:
> >>> >>  # - If hybrid suspend is not supported, Unsupported
> >>> >>  #
> >>> >> -# Notes: It's strongly recommended to issue the guest-sync command
> >>> >> -# before sending commands when the guest resumes
> >>> >> +# .. note:: It's strongly recommended to issue the guest-sync
> command
> >>> >> +#before sending commands when the guest resumes.
> >>> >>  #
> >>> >>  # Since: 1.1
> >>> >>  ##
> >>> >> @@ -1461,16 +1462,15 @@
> >>> >>  # * POSIX: as defined by os-release(5)
> >>> >>  # * Windows: contains string "server" or "client"
> >>> >>  #
> >>> >> -# Notes: On POSIX systems the fields @id, @name, @pretty-name,
> >>> >> -# @version, @version-id, @variant and @variant-id follow the
> >>> >> -# definition specified in os-release(5). Refer to the manual
> page
> >>> >> -# for exact description of the fields.  Their values are taken
> >>> >> -# from the os-release file.  If the file is not present in the
> >>> >> -# system, or the values are not present in the file, the fields
> >>> >> -# are not included.
> >>> >> +# .. note:: On POSIX systems the fields @id, @name, @pretty-name,
> >>> >> +#@version, @version-id, @variant and @variant-id follow the
> >>> >> +#definition specified in os-release(5). Refer to the manual
> page
> >>> for
> >>> >> +#exact description of the fields.  Their values are taken from
> the
> >>> >> +#os-release file.  If the file is not present in the system, or
> >>> the
> >>> >> +#values are not present in the file, the fields are not
> included.
> >>> >>  #
> >>> >> -# On Windows the values are filled from information gathered
> from
> >>> >> -# the system.
> >>> >> +#On Windows the values are filled from information gathered
> from
> >>> >> +#the system.
> >>> >
> >>> > Please don't change the indentation here.  I get the same output with
> >>> >
> >>> >   @@ -1461,7 +1462,7 @@
> >>> ># * POSIX: as defined by os-release(5)
> >>> ># * Windows: contains string "server" or "client"
> >>> >#
> >>> >   -# Notes: On POSIX systems the fields @id, @name, @pretty-name,
> >>> >   +# .. note:: On POSIX systems the fields @id, @name, @pretty-name,
> >>> ># @version, @version-id, @variant and @variant-id follow the
> >>> ># definition specified in os-release(5). Refer to the manual
> page
> >>> ># for exact description of the fields.  Their values are taken
> >>>
> >>> I'm blind.  Actually, you change indentation of subsequent lines from 4
> >>> to 3 *everywhere*.  I guess you do that to make subsequent lines line
> up
> >>> with the directive, here "note".
> >>>
> >>> Everywhere else, we indent such lines by 4.  Hmm.  How terrible would
> it
> >>> be not to mess with the alignment?
> >>>
> >>> If we want to use 3 for directives, is it worth pointing out in the
> >>> commit message?
> >>>
> >>> [...]
> >>>
> >>
> >> Let me look up some conventions and see what's the most prominent... as
> >> well as testing what emacs does by default (or if emacs can be
> configured
> >> to do what we want with in-tree style config. Warning: I am functionally
> >> inept at emacs lisp. Warning 2x: [neo]vi[m] users, you're entirely on
> your
> >> own. I'm sorry.)
> >>
> >> I use three myself by force of habit and have some mild reluctance to
> >> respin for that reason, but ... guess we ought to be consistent if we
> can.
> >>
> >> (No idea where my habit came from. Maybe it is just because it looks
> nice
> >> to me and no other reason.)
> >>
> >> ((I have no plans, nor desire, to write any kind of checker to enforce
> >> this, though - sorry.))
> >>
> >
> > Sphinx doc uses three spaces:
> >
> https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#directives
> >
> > ... but it warns that it's arbitrary; but it seems common to align with
> the
> > directive.
> >
> > *
> >
> https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#footnotes
> >footnotes require "at least 3" spaces
> >
> > *
> >
> https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#directives
> >   directives are only required to be "indented" but the amount isn't
> > specified. rst docs use three.
> >
> > I'm happy with three; I don't believe we need to make it consistent with
> > e.g. our home-spun field list syntax (arguments, features) or with code
> > blocks. I think whatever looks good in the source is fine, and I 

Re: [RFC PATCH 0/7] migration/multifd: Introduce storage slots

2024-06-21 Thread Maciej S. Szmigiero

On 21.06.2024 17:56, Peter Xu wrote:

On Fri, Jun 21, 2024 at 05:31:54PM +0200, Maciej S. Szmigiero wrote:

On 21.06.2024 17:04, Fabiano Rosas wrote:

"Maciej S. Szmigiero"  writes:


Hi Fabiano,

On 20.06.2024 23:21, Fabiano Rosas wrote:

Hi folks,

First of all, apologies for the roughness of the series. I'm off for
the next couple of weeks and wanted to put something together early
for your consideration.

This series is a refactoring (based on an earlier, off-list
attempt[0]), aimed to remove the usage of the MultiFDPages_t type in
the multifd core. If we're going to add support for more data types to
multifd, we first need to clean that up.

This time around this work was prompted by Maciej's series[1]. I see
you're having to add a bunch of is_device_state checks to work around
the rigidity of the code.

Aside from the VFIO work, there is also the intent (coming back from
Juan's ideas) to make multifd the default code path for migration,
which will have to include the vmstate migration and anything else we
put on the stream via QEMUFile.

I have long since been bothered by having 'pages' sprinkled all over
the code, so I might be coming at this with a bit of a narrow focus,
but I believe in order to support more types of payloads in multifd,
we need to first allow the scheduling at multifd_send_pages() to be
independent of MultiFDPages_t. So here it is. Let me know what you
think.


Thanks for the patch set, I quickly glanced at these patches and they
definitely make sense to me.


(..)

(as I said, I'll be off for a couple of weeks, so feel free to
incorporate any of this code if it's useful. Or to ignore it
completely).


I guess you are targeting QEMU 9.2 rather than 9.1 since 9.1 has
feature freeze in about a month, correct?



For general code improvements like this I'm not thinking about QEMU
releases at all. But this series is not super complex, so I could
imagine us merging it in time for 9.1 if we reach an agreement.

Are you thinking your series might miss the target? Or have concerns
over the stability of the refactoring? We can within reason merge code
based on the current framework and improve things on top, we already did
something similar when merging zero-page support. I don't have an issue
with that.


The reason that I asked whether you are targeting 9.1 is because my
patch set is definitely targeting that release.

At the same time my patch set will need to be rebased/refactored on top
of this patch set if it is supposed to be merged for 9.1 too.

If this patch set gets merged quickly that's not really a problem.

On the other hand, if another iteration(s) is/are needed AND you are
not available in the coming weeks to work on them then there's a
question whether we will make the required deadline.


I think it's a bit rushed to merge the vfio series in this release.  I'm not
sure it has enough time to be properly reviewed, reposted, retested, etc.

I've already started looking at it, and so far I think I have doubt not
only on agreement with Fabiano on the device_state thing which I prefer to
avoid, but also I'm thinking of any possible way to at least make the
worker threads generic too: a direct impact could be vDPA in the near
future if anyone cared, while I don't want modules to create threads
randomly during migration.

Meanwhile I'm also thinking whether that "the thread needs to dump all
data, and during iteration we can't do that" is the good reason to not
support that during iterations.

I didn't yet reply because I don't think I've thought all things through, but
I'll get there.

So I'm not saying that the design is problematic, but IMHO it's just not
mature enough to assume it will land in 9.1, considering it's still a large
one, and the first non-rfc version just posted two days ago.



The RFC version was posted more than 2 months ago.

It has received some review comments from multiple people,
all of which were addressed in this patch set version.

I have not received any further comments during these 2 months, so I thought
the overall design is considered okay - if anything, there might be minor
code comments/issues but these can easily be improved/fixed in the 5 weeks
remaining to the soft code freeze for 9.1.


If anything, I think that the VM live phase (non-downtime) transfers
functionality should be deferred until 9.2 because:
* It wasn't a part of the RFC so even if implemented today would get much
less testing overall,

* It's orthogonal to the switchover time device state transfer functionality
introduced by this patch set and could be added on top of that without
changing the wire protocol for switchover time device state transfers,

* It doesn't impact the switchover downtime so in this case 9.1 would
already contain all that's necessary to improve it.


Thanks,
Maciej




Re: [PATCH v3 00/11] migration: New postcopy state, and some cleanups

2024-06-21 Thread Fabiano Rosas
On Wed, 19 Jun 2024 18:30:35 -0400, Peter Xu wrote:
> Based-on: <20240617185731.9725-1-faro...@suse.de>
> 
> v3:
> - Added one comment in patch 8 explaining why migrate_incoming_qmp() needs
>   to keep enabling "events" capability.
> - Split patch 9 into two patches, which makes migration_event_wait() to be
>   used also in migrate_incoming_qmp()
> - Rename the tests in last patch, and a spell fix
> - Rebased to "[PATCH v3 00/16] migration/mapped-ram: Add direct-io support"
> 
> [...]

Queued, thanks!



Re: [PATCH] migration: Remove unused VMSTATE_ARRAY_TEST() macro

2024-06-21 Thread Fabiano Rosas
On Fri, 21 Jun 2024 09:03:17 +0200, Philippe Mathieu-Daudé wrote:
> Last use of VMSTATE_ARRAY_TEST() was removed in commit 46baa9007f
> ("migration/i386: Remove old non-softfloat 64bit FP support"), we
> can safely get rid of it.
> 
> 

Queued, thanks!



Re: [PATCH 09/13] qapi: convert "Note" sections to plain rST

2024-06-21 Thread John Snow
On Fri, Jun 21, 2024 at 8:08 AM Markus Armbruster  wrote:

> John Snow  writes:
>
> > We do not need a dedicated section for notes. By eliminating a specially
> > parsed section, these notes can be treated as normal rST paragraphs in
> > the new QMP reference manual, and can be placed and styled much more
> > flexibly.
> >
> > Convert all existing "Note" and "Notes" sections to pure rST. As part of
> > the conversion, capitalize the first letter of each sentence and add
> > trailing punctuation where appropriate to ensure notes look sensible and
> > consistent in rendered HTML documentation.
> >
> > Update docs/devel/qapi-code-gen.rst to reflect the new paradigm, and ...
> >
> > ... Update the QAPI parser to prohibit "Note" sections while suggesting
> > a new syntax. The exact formatting to use is a matter of taste, but a
> > good candidate is simply:
> >
> > .. note:: lorem ipsum ...
> >
> > ... but there are other choices, too. The Sphinx readthedocs theme
> > offers theming for the following forms (capitalization unimportant); all
> > are adorned with a (!) symbol in the title bar for rendered HTML docs.
> >
> > See
> >
> https://sphinx-rtd-theme.readthedocs.io/en/stable/demo/demo.html#admonitions
> > for examples of each directive/admonition in use.
> >
> > These are rendered in orange:
> >
> > .. Attention:: ...
> > .. Caution:: ...
> > .. WARNING:: ...
> >
> > These are rendered in red:
> >
> > .. DANGER:: ...
> > .. Error:: ...
> >
> > These are rendered in green:
> >
> > .. Hint:: ...
> > .. Important:: ...
> > .. Tip:: ...
> >
> > These are rendered in blue:
> >
> > .. Note:: ...
> > .. admonition:: custom title
> >
> >admonition body text
> >
> > This patch uses ".. note::" almost everywhere, with just two "caution"
> > directives. ".. admonition:: notes" is used in a few places where we had
> > an ordered list of multiple notes that would not make sense as
> > standalone/separate admonitions.
> >
> > Signed-off-by: John Snow 
> > Acked-by: Stefan Hajnoczi  [for block*.json]
>
> [...]
>
> > diff --git a/qapi/qom.json b/qapi/qom.json
> > index 8bd299265e3..5bfa0ded42c 100644
> > --- a/qapi/qom.json
> > +++ b/qapi/qom.json
> > @@ -195,12 +195,12 @@
> >  #
> >  # @typename: the type name of an object
> >  #
> > -# Note: objects can create properties at runtime, for example to
> > -# describe links between different devices and/or objects.  These
> > -# properties are not included in the output of this command.
> > -#
> >  # Returns: a list of ObjectPropertyInfo describing object properties
> >  #
> > +# .. note:: Objects can create properties at runtime, for example to
> > +#describe links between different devices and/or objects.  These
> > +#properties are not included in the output of this command.
> > +#
> >  # Since: 2.12
> >  ##
>
> You move the note.  Commit message doesn't tell why.
>
> >  { 'command': 'qom-list-properties',
>
> [...]
>

"v2" of this series now declines to move the note in this patch and instead
moves it in a separate patch that also enforces source order more strictly
so that the move can be explained in detail.

Rendering order diverges from source order briefly as a result; I will
mention that in the commit message instead.

(I don't think it's easy or worth doing to re-order the patches such that
source and render order never diverge; too much engineering for so
temporary a minor issue. Not to mention the source and render order is
already divergent in many places, so I don't think it's a regression so
much as it is a temporary  lateralgression?)


Re: [PATCH 04/13] qapi/parser: preserve indentation in QAPIDoc sections

2024-06-21 Thread John Snow
On Fri, Jun 21, 2024 at 2:38 AM Markus Armbruster  wrote:

> John Snow  writes:
>
> > On Thu, Jun 20, 2024, 11:07 AM Markus Armbruster 
> wrote:
> >
> >> John Snow  writes:
> >>
> >> > On Wed, Jun 19, 2024, 8:03 AM Markus Armbruster 
> wrote:
> >> >
> >> >> John Snow  writes:
> >> >>
> >> >> > Change get_doc_indented() to preserve indentation on all
> subsequent text
> >> >> > lines, and create a compatibility dedent() function for qapidoc.py
> to
> >> >> > remove that indentation. This is being done for the benefit of a
> new
> >> >>
> >> >> Suggest "remove indentation the same way get_doc_indented() did."
> >> >>
> >> >
> >> > Aight.
> >> >
> >> >
> >> >> > qapidoc generator which requires that indentation in argument and
> >> >> > features sections are preserved.
> >> >> >
> >> >> > Prior to this patch, a section like this:
> >> >> >
> >> >> > ```
> >> >> > @name: lorem ipsum
> >> >> >dolor sit amet
> >> >> >  consectetur adipiscing elit
> >> >> > ```
> >> >> >
> >> >> > would have its body text be parsed as:
> >> >>
> >> >> Suggest "parsed into".
> >> >>
> >> >
> >> > Why? (I mean, I'll do it, but I don't see the semantic difference
> >> > personally)
> >> >
> >>
> >> "Parse as " vs. "Parse into ".
> >>
> >> >> > (first and final newline only for presentation)
> >> >> >
> >> >> > ```
> >> >> > lorem ipsum
> >> >> > dolor sit amet
> >> >> >   consectetur adipiscing elit
> >> >> > ```
> >> >> >
> >> >> > We want to preserve the indentation for even the first body line
> so that
> >> >> > the entire block can be parsed directly as rST. This patch would
> now
> >> >> > parse that segment as:
> >> >>
> >> >> If you change "parsed as" to "parsed into" above, then do it here,
> too.
> >> >>
> >> >> >
> >> >> > ```
> >> >> > lorem ipsum
> >> >> >dolor sit amet
> >> >> >  consectetur adipiscing elit
> >> >> > ```
> >> >> >
> >> >> > This is helpful for formatting arguments and features as field
> lists in
> >> >> > rST, where the new generator will format this information as:
> >> >> >
> >> >> > ```
> >> >> > :arg type name: lorem ipsum
> >> >> >dolor sit amet
> >> >> >  consectetur adipiscing elit
> >> >> > ```
> >> >> >
> >> >> > ...and can be formed by the simple concatenation of the field list
> >> >> > construct and the body text. The indents help preserve the
> continuation
> >> >> > of a block-level element, and further allow the use of additional
> rST
> >> >> > block-level constructs such as code blocks, lists, and other such
> >> >> > markup. Avoiding reflowing the text conditionally also helps
> preserve
> >> >> > source line context for better rST error reporting from sphinx
> through
> >> >> > generated source, too.
> >> >>
> >> >> What do you mean by "reflowing"?
> >> >>
> >> >
> >> > Poorly phrased, was thinking about emacs too much. I mean munging the
> text
> >> > post-hoc for the doc generator such that newlines are added or
> removed in
> >> > the process of re-formatting text to get the proper indentation for
> the new
> >> > rST form.
> >> >
> >> > In prototyping, this got messy very quickly and was difficult to
> correlate
> >> > source line numbers across the transformation.
> >> >
> >> > It was easier to just not munge the text at all instead of munging it
> and
> >> > then un-munging it.
> >> >
> >> > (semantic satiation: munge munge munge munge.)
> >>
> >> Is this about a possible alternative solution you explored?  Keeping
> >> .get_doc_indented() as is, and then try to undo its damage?
> >>
> >
> > precisamente. That solution was categorically worse.
>
> Since .get_doc_indented() removes N spaces of indentation, we'd want to
> add back N spaces of indentation.  But we can't know N, so I guess we'd
> make do with an arbitrary number.  Where would reflowing come in?
>
> I'd like you to express more clearly that you're talking about an
> alternative you rejected.  Perhaps like this:
>
>   block-level constructs such as code blocks, lists, and other such
>   markup.
>
>   The alternative would be to somehow undo .get_doc_indented()'s
>   indentation changes in the new generator.  Much messier.
>
> Feel free to add more detail to the last paragraph.
>

Eh, I just deleted it. I recall running into troubles but I can't
articulate the precise conditions because as you point out, it's a doomed
strategy for other reasons - you can't reconstruct the proper indentation.

This patch is still the correct way to go, so I don't have to explain my
failures at length in the commit message ... I just like giving people
clues for *why* I decided to implement things a certain way, because I
often find that more instructive than the "how". In this case, the "why" is
probably more properly summarized as "it's a total shitshow in that
direction, trust me"

--js


Re: [PATCH RFC v2 0/3] cxl: Multi-headed Single Logical Device (MHSLD)

2024-06-21 Thread Jonathan Cameron via
On Wed, 24 Apr 2024 13:04:05 -0700
Svetly Todorov  wrote:

> MHSLDs allow multiple hosts to access dynamic capacity on a single
> backing device. This complicates DC management because adds,
> removals, and accesses need to be vetted such that hosts don't
> stomp on each other's data.
> 
> This patchset proposes a set of hooks to be called in cxl_type3.c
> when each of the above events happens. The results of the hooks
> can be used to prevent illegal DC operations in the corresponding
> cxl_* functions. 

Hi,

I had a quick go at applying this to my gitlab tree cxl staging tree
to make it more generally available.  It unfortunately needs some updates
for the change to Extent Groups in the most recent DCD code.

Whilst I can probably sort that, it's not going to happen particularly soon.

I'll be pushing a new tree shortly if you have time to rebase.

Jonathan



Re: standardizing i2c device ids

2024-06-21 Thread Patrick Leis
On Thu, Jun 20, 2024 at 11:26 PM Philippe Mathieu-Daudé 
wrote:

> Hi Patrick,
>
> On 21/6/24 00:03, Patrick Leis wrote:
> > Corey and Peter,
> >
> > My team builds lots of configurations for Qemu boards, and one pain
> > point has been that the qom path for a device depends on the device
> > insertion order, child[0], child[1] and the like.  I noticed that the
> > qdev paths for devices also exist by their device id property.  By
> > default, this ends up being the device type name.  I was wondering if it
> > made sense to override this with the device type plus the smbus
> > address?  I did something similar with the i2c mux device, to resolve
> > part of this issue.
>
> Including Markus since we discussed this with him last year, but
> I don't remember correctly what was agreed / decided :S
>

Thanks :)

I'd really like to be able to access devices with paths that specify the
device I want specifically :)


>
> Regards,
>
> Phil.
>


Re: [RFC PATCH] cxl: avoid duplicating report from MCE & device

2024-06-21 Thread Jonathan Cameron via
On Fri, 21 Jun 2024 18:16:33 +0800
Shiyang Ruan  wrote:

> 在 2024/6/21 1:02, Jonathan Cameron 写道:
> > On Wed, 19 Jun 2024 00:53:10 +0800
> > Shiyang Ruan  wrote:
> >   
> >> Background:
> >> Since CXL device is a memory device, while CPU consumes a poison page of
> >> CXL device, it always triggers a MCE by interrupt (INT18), no matter
> >> which-First path is configured.  This is the first report.  Then
> >> currently, in FW-First path, the poison event is transferred according
> >> to the following process: CXL device -> firmware -> OS:ACPI->APEI->GHES  
> >>   -> CPER -> trace report.  This is the second one.  These two reports  
> >> are indicating the same poisoning page, which is the so-called "duplicate
> >> report"[1].  And the memory_failure() handling I'm trying to add in
> >> OS-First path could also be another duplicate report.
> >>
> >> Hope the flow below could make it easier to understand:
> >> CPU accesses bad memory on CXL device, then  
> >>   -> MCE (INT18), *always* report (1)
> >>   -> * FW-First (implemented now)
> >>-> CXL device -> FW
> >>  -> OS:ACPI->APEI->GHES->CPER -> trace report (2.a)  
> >>  * OS-First (not implemented yet, I'm working on it)  
> >>-> CXL device -> MSI
> >>  -> OS:CXL driver -> memory_failure() (2.b)  
> >> so, the (1) and (2.a/b) are duplicated.
> >>
> >> (I didn't get response in my reply for [1] while I have to make patch to
> >> solve this problem, so please correct me if my understanding is wrong.)
> >>
> >> This patch adds a new notifier_block and MCE_PRIO_CXL, for CXL memdev
> >> to check whether the current poison page has been reported (if yes,
> >> stop the notifier chain, won't call the following memory_failure()
> >> to report), into `x86_mce_decoder_chain`.  In this way, if the poison
> >> page already handled(recorded and reported) in (1) or (2), the other one
> >> won't duplicate the report.  The record could be cleared when
> >> cxl_clear_poison() is called.
> >>
> >> [1] 
> >> https://lore.kernel.org/linux-cxl/664d948fb86f0_e8be29...@dwillia2-mobl3.amr.corp.intel.com.notmuch/
> >>
> >> Signed-off-by: Shiyang Ruan   
> > 
> > So poison can be cleared in a number of ways and a CXL poison clear command
> > is unfortunately only one of them.  Some architectures have instructions
> > that guarantee to write a whole cacheline and can clear things as well.
> > I believe x86 does for starters.  
> 
> According to the CXL Spec, to clear an error record on device, an 
> explicit clear operation is required (I think this means sending a mbox 
> command).  I'm not sure if it is able to clear device error by just 
> writing a whole cacheline.
> 

Please give a spec reference.  The only one I'm immediately seeing is
in 8.3.9.9.4.1 Get Poison List (opcode 43000h)
which says
"When poison is cleared"
but doesn't talk about how.

For TSP cases Clear poison is not allowed, so if they want to clear it
they will have to do it via a suitable CPU arch approach, not that command
(which may not be implemented in a given device - I gather it is
 awkward to do and a backdoor from control path to datapath isn't
 a popular feature!).

+CC John Groves.  John, any info you can share on whether you expect all
devices with a poison list to support the clear poison command?



> > 
> > +CC linux-edac and related maintainers / reviewers.
> >  linux-mm and hwpoison maintainer.
> > 
> > So I think this needs a more general solution that encompasses
> > more general cleanup of poison.
> > 
> > Trivial comments inline.  
> 
> Thanks
> 
> > 
> > Jonathan
> > 
> >   
> >> ---
> >>   arch/x86/include/asm/mce.h |   1 +
> >>   drivers/cxl/core/mbox.c| 130 +
> >>   drivers/cxl/core/memdev.c  |   6 +-
> >>   drivers/cxl/cxlmem.h   |   3 +
> >>   4 files changed, 139 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> >> index dfd2e9699bd7..d8109c48e7d9 100644
> >> --- a/arch/x86/include/asm/mce.h
> >> +++ b/arch/x86/include/asm/mce.h
> >> @@ -182,6 +182,7 @@ enum mce_notifier_prios {
> >>MCE_PRIO_NFIT,
> >>MCE_PRIO_EXTLOG,
> >>MCE_PRIO_UC,
> >> +  MCE_PRIO_CXL,
> >>MCE_PRIO_EARLY,
> >>MCE_PRIO_CEC,
> >>MCE_PRIO_HIGHEST = MCE_PRIO_CEC
> >> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> >> index 2626f3fff201..0eb3c5401e81 100644
> >> --- a/drivers/cxl/core/mbox.c
> >> +++ b/drivers/cxl/core/mbox.c
> >> @@ -4,6 +4,8 @@
> >>   #include 
> >>   #include 
> >>   #include 
> >> +#include 
> >> +#include 
> >>   #include 
> >>   #include 
> >>   #include 
> >> @@ -880,6 +882,9 @@ void cxl_event_trace_record(const struct cxl_memdev 
> >> *cxlmd,
> >>if (cxlr)
> >>hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
> >>   
> >> +  if (hpa != ULLONG_MAX && cxl_mce_recorded(hpa))
> >> +  return;
> >> +
> >>if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
> >>  

Re: [PATCH] hw/usb/hcd-ohci: Fix ohci_service_td: accept valid TDs

2024-06-21 Thread Peter Maydell
On Fri, 21 Jun 2024 at 17:24, Cord Amfmgm  wrote:
>
>
> On Fri, Jun 21, 2024 at 10:21 AM Peter Maydell  
> wrote:
>> Thanks; I've picked up this patch for target-arm.next (as with
>> your previous one for hcd-ohci, adjusting the Author and
>> Signed-off-by lines to both read David Hubbard).
>>
>> I tweaked the commit message a little bit, so the middle part reads:
>>
>> What this patch does is loosen the qemu ohci implementation to allow a
>> zero-length packet if td.be (Buffer End) is set to td.cbp - 1, and with a
>> non-zero td.cbp value.
>>
>> The spec is unclear whether this is valid or not -- it is not the
>> clearly documented way to send a zero length TD (which is CBP=BE=0),
>> but it isn't specifically forbidden. Actual hw seems to be ok with it.
>
> That tweak looks great.
>
> Thank you for your patience working with me to get this patch into a good 
> shape.
>
> This was my first attempt to contribute to qemu - really appreciate it.

You're welcome -- thanks for the effort you've put in on your end
working through our code review process.

-- PMM



Re: [PATCH 2/3] target/ppc: Update VMX storage access insns to use tcg_gen_qemu_ld/st_i128.

2024-06-21 Thread Richard Henderson

On 6/21/24 09:34, Richard Henderson wrote:

On 6/21/24 04:46, Chinmay Rath wrote:

+    tcg_gen_qemu_ld_i128(avr, EA, ctx->mem_idx, DEF_MEMOP(MO_128));
+    set_avr_full(a->rt, avr);


This needs to specify atomicity as well.  This is much more important for 16-byte 
operations than smaller accesses, as this might require stop-the-world semantics depending 
on the host.


According to section 1.4 Storage Atomicity, we need no more than 8-byte atomicity for 
these vector operations, and then following the alignment bits down.


So: MO_128 | MO_ATOM_IFALIGN_PAIR,


Actually, you need MO_ATOM_SUBALIGN semantics, maxing out at MO_64, which hasn't been 
implemented.  But since none of the rest of target/ppc has been updated to use SUBALIGN, 
using IFALIGN is not a regression.



r~




[RFC PATCH 1/1] hw/arm: FW first ARM processor error injection.

2024-06-21 Thread shiju . jose--- via
From: Shiju Jose 

Add support for FW first ARM processor error injection.

Compliance with N.2.4.4 ARM Processor Error Section in
UEFI 2.9A/2.10 specs.

Examples,
{ "execute": "arm-inject-error",
  "arguments": {
"errortypes": ['cache-error']
  }
}

{ "execute": "arm-inject-error",
  "arguments": {
"errortypes": ['tlb-error']
  }
}

{ "execute": "arm-inject-error",
  "arguments": {
"errortypes": ['bus-error']
  }
}

{ "execute": "arm-inject-error",
  "arguments": {
"errortypes": ['cache-error', 'tlb-error']
  }
}

{ "execute": "arm-inject-error",
  "arguments": {
"errortypes": ['cache-error', 'tlb-error', 'bus-error', 
'micro-arch-error']
  }
}
etc.

Signed-off-by: Shiju Jose 
---
 configs/targets/aarch64-softmmu.mak |   1 +
 hw/acpi/ghes.c  | 170 +++-
 hw/arm/Kconfig  |   4 +
 hw/arm/arm_error_inject.c   |  49 
 hw/arm/arm_error_inject_stubs.c |  19 
 hw/arm/meson.build  |   2 +
 include/hw/acpi/ghes.h  |   1 +
 qapi/arm-error-inject.json  |  44 +++
 qapi/meson.build|   1 +
 qapi/qapi-schema.json   |   1 +
 10 files changed, 291 insertions(+), 1 deletion(-)
 create mode 100644 hw/arm/arm_error_inject.c
 create mode 100644 hw/arm/arm_error_inject_stubs.c
 create mode 100644 qapi/arm-error-inject.json

diff --git a/configs/targets/aarch64-softmmu.mak 
b/configs/targets/aarch64-softmmu.mak
index b4338e9568..6f9daa9b17 100644
--- a/configs/targets/aarch64-softmmu.mak
+++ b/configs/targets/aarch64-softmmu.mak
@@ -3,3 +3,4 @@ TARGET_BASE_ARCH=arm
 TARGET_SUPPORTS_MTTCG=y
 TARGET_XML_FILES= gdb-xml/aarch64-core.xml gdb-xml/aarch64-fpu.xml 
gdb-xml/arm-core.xml gdb-xml/arm-vfp.xml gdb-xml/arm-vfp3.xml 
gdb-xml/arm-vfp-sysregs.xml gdb-xml/arm-neon.xml gdb-xml/arm-m-profile.xml 
gdb-xml/arm-m-profile-mve.xml gdb-xml/aarch64-pauth.xml
 TARGET_NEED_FDT=y
+CONFIG_ARM_EINJ=y
diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
index 34d8b8a518..ba18e849e2 100644
--- a/hw/acpi/ghes.c
+++ b/hw/acpi/ghes.c
@@ -31,6 +31,7 @@
 #include "qemu/uuid.h"
 #include "hw/cxl/cxl_device.h"
 #include "hw/cxl/cxl.h"
+#include "qapi/qapi-types-arm-error-inject.h"
 
 #define ACPI_GHES_ERRORS_FW_CFG_FILE"etc/hardware_errors"
 #define ACPI_GHES_DATA_ADDR_FW_CFG_FILE "etc/hardware_errors_addr"
@@ -57,6 +58,7 @@
 /* The memory section CPER size, UEFI 2.6: N.2.5 Memory Error Section */
 #define ACPI_GHES_MEM_CPER_LENGTH   80
 #define ACPI_GHES_PCIE_CPER_LENGTH 208
+#define ACPI_GHES_ARM_CPER_LENGTH (72 + 600)
 
 /* Masks for block_status flags */
 #define ACPI_GEBS_UNCORRECTABLE 1
@@ -189,6 +191,94 @@ static void acpi_ghes_build_append_mem_cper(GArray *table,
 build_append_int_noprefix(table, 0, 7);
 }
 
+/* UEFI 2.9: N.2.4.4 ARM Processor Error Section */
+static void acpi_ghes_build_append_arm_cper(uint8_t error_types, GArray *table)
+{
+/*
+ * ARM Processor Error Record
+ */
+
+/* Validation Bits */
+build_append_int_noprefix(table,
+  (1ULL << 3) | /* Vendor specific info Valid */
+  (1ULL << 2) | /* Running status Valid */
+  (1ULL << 1) | /* Error affinity level Valid */
+  (1ULL << 0), /* MPIDR Valid */
+  4);
+/* Error Info Num */
+build_append_int_noprefix(table, 1, 2);
+/* Context Info Num */
+build_append_int_noprefix(table, 1, 2);
+/* Section length */
+build_append_int_noprefix(table, ACPI_GHES_ARM_CPER_LENGTH, 4);
+/* Error affinity level */
+build_append_int_noprefix(table, 2, 1);
+/* Reserved */
+build_append_int_noprefix(table, 0, 3);
+/* MPIDR_EL1 */
+build_append_int_noprefix(table, 0xAB12, 8);
+/* MIDR_EL1 */
+build_append_int_noprefix(table, 0xCD24, 8);
+/* Running state */
+build_append_int_noprefix(table, 0x1, 4);
+/* PSCI state */
+build_append_int_noprefix(table, 0x1234, 4);
+
+/* ARM Processor error information */
+/* Version */
+build_append_int_noprefix(table, 0, 1);
+/*  Length */
+build_append_int_noprefix(table, 32, 1);
+/* Validation Bits */
+build_append_int_noprefix(table,
+  (1ULL << 4) | /* Physical fault address Valid */
+ (1ULL << 3) | /* Virtual fault address Valid */
+ (1ULL << 2) | /* Error information Valid */
+  (1ULL << 1) | /* Flags Valid */
+  (1ULL << 0), /* Multiple error count Valid */
+  2);
+/* Type */
+if (error_types & BIT(ARM_PROCESSOR_ERROR_TYPE_CACHE_ERROR) ||
+error_types & BIT(ARM_PROCESSOR_ERROR_TYPE_TLB_ERROR) ||
+error_types & BIT(ARM_PROCESSOR_ERROR_TYPE_BUS_ERROR) ||
+   

Re: [PATCH v2 0/2] target/arm: Enable FEAT_Debugv8p8 for -cpu max

2024-06-21 Thread Richard Henderson

On 6/21/24 07:39, Gustavo Romero wrote:

Gustavo Romero (2):
   target/arm: Move initialization of debug ID registers
   target/arm: Enable FEAT_Debugv8p8 for -cpu max


Reviewed-by: Richard Henderson 

r~



Re: [PATCH] bswap: Add st24_be_p() to store 24 bits in big-endian order

2024-06-21 Thread Richard Henderson

On 6/21/24 00:56, Philippe Mathieu-Daudé wrote:

Commit 14180d6221 ("bswap: Add the ability to store to an
unaligned 24 bit field") added st24_le_p() for little
endianness; add the st24_be_p() equivalent for big endianness.

Signed-off-by: Philippe Mathieu-Daudé 
---
Some SD card registers are 3 bytes wide stored MSB first.
---
  include/qemu/bswap.h | 7 +++
  1 file changed, 7 insertions(+)


Reviewed-by: Richard Henderson 


r~



Re: [PATCH 2/3] target/ppc: Update VMX storage access insns to use tcg_gen_qemu_ld/st_i128.

2024-06-21 Thread Richard Henderson

On 6/21/24 04:46, Chinmay Rath wrote:

+tcg_gen_qemu_ld_i128(avr, EA, ctx->mem_idx, DEF_MEMOP(MO_128));
+set_avr_full(a->rt, avr);


This needs to specify atomicity as well.  This is much more important for 16-byte
operations than for smaller accesses, as this might require stop-the-world semantics depending
on the host.


According to section 1.4 Storage Atomicity, we need no more than 8-byte atomicity for
these vector operations, and then following the alignment bits down.


So: MO_128 | MO_ATOM_IFALIGN_PAIR,


r~



Re: [PATCH v2] exec: don't use void* in pointer arithmetic in headers

2024-06-21 Thread Paolo Bonzini

On 6/20/24 22:16, Roman Kiryanov wrote:

void* pointer arithmetic is a GCC extension
which might not be available in other build
tools (e.g. C++). This change removes this
assumption.

Google-Bug-Id: 331190993
Change-Id: I5a064853429f627c17a9213910811dea4ced6174
Signed-off-by: Roman Kiryanov 
Suggested-by: Paolo Bonzini 
---
v2: renamed from "use char* for pointer arithmetic"
 and removed all explicit extra cast with
 one typedef in memory.h.

  include/exec/memory.h | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index b1713f30b8..b616338f05 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2795,8 +2795,10 @@ MemTxResult address_space_write_rom(AddressSpace *as, 
hwaddr addr,
  #define ARG1_DECLAddressSpace *as
  #include "exec/memory_ldst_phys.h.inc"
  
+typedef uint8_t *MemoryRegionCachePtr;

+
  struct MemoryRegionCache {
-void *ptr;
+MemoryRegionCachePtr ptr;


Just "uint8_t *ptr" is enough; thanks for testing that it's enough.

Queued for the next pull request, thanks.

Paolo



  hwaddr xlat;
  hwaddr len;
  FlatView *fv;





Re: [PATCH] hw/usb/hcd-ohci: Fix ohci_service_td: accept valid TDs

2024-06-21 Thread Cord Amfmgm
On Fri, Jun 21, 2024 at 10:21 AM Peter Maydell 
wrote:

> On Wed, 12 Jun 2024 at 20:36, Alex Bennée  wrote:
> >
> > Cord Amfmgm  writes:
> >
> > > On Wed, Jun 12, 2024 at 9:21 AM Alex Bennée 
> wrote:
> > >
> > >  David Hubbard  writes:
> > >
> > >  > From: Cord Amfmgm 
> > >  >
> > >  > This changes the way the ohci emulation handles a Transfer
> Descriptor with
> > >  > "Current Buffer Pointer" set to "Buffer End" + 1.
> > >  >
> > >  > The OHCI spec 4.3.1.2 Table 4-2 allows td.cbp to be one byte more
> than td.be
> > >  > to signal the buffer has zero length. Currently qemu only accepts
> zero-length
> > >  > Transfer Descriptors if the td.cbp is equal to 0, while actual OHCI
> hardware
> > >  > accepts both cases.
> > >  >
> > >  > The qemu ohci emulation has a regression in ohci_service_td.
> Version 4.2
> > >  > and earlier matched the spec. (I haven't taken the time to bisect
> exactly
> > >  > where the logic was changed.)
> > >
> > >  I find it hard to characterise this as a regression because we've
> > >  basically gone from no checks to actually doing bounds checks:
> > >
> > >1328fe0c32 (hw: usb: hcd-ohci: check len and frame_number variables)
> > >
> > >  The argument here seems to be that real hardware is laxer than the
> specs
> > >  in what it accepts.
> > >
> > 
> > >
> > >  With the updated commit message:
> > >
> > >  Reviewed-by: Alex Bennée 
> > >
> > > Please forgive my lack of experience on this mailing list. I don't see
> a suggested commit message from Alex but in case that
> > > was what is being considered, here is one. Feedback welcome, also if
> this is not what is wanted, please just say it.
> > >
> >
> > Something along the lines of what you suggest here
>
> Thanks; I've picked up this patch for target-arm.next (as with
> your previous one for hcd-ohci, adjusting the Author and
> Signed-off-by lines to both read David Hubbard).
>
> I tweaked the commit message a little bit, so the middle part reads:
>
> What this patch does is loosen the qemu ohci implementation to allow a
> zero-length packet if td.be (Buffer End) is set to td.cbp - 1, and
> with a
> non-zero td.cbp value.
>
> The spec is unclear whether this is valid or not -- it is not the
> clearly documented way to send a zero length TD (which is CBP=BE=0),
> but it isn't specifically forbidden. Actual hw seems to be ok with it.
>
> thanks
> -- PMM
>

That tweak looks great.

Thank you for your patience working with me to get this patch into a good
shape.

This was my first attempt to contribute to qemu - really appreciate it.


Re: [PATCH 20/23] hw/sd/sdcard: Add comments around registers and commands

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

From: Philippe Mathieu-Daudé 

Signed-off-by: Philippe Mathieu-Daudé 
Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index ca2c903c5b..95e23abd30 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -317,6 +317,8 @@ static uint8_t sd_crc7(const void *message, size_t width)
  return shift_reg;
  }
  
+/* Operation Conditions register */

+
  #define OCR_POWER_DELAY_NS  500000 /* 0.5ms */
  
  FIELD(OCR, VDD_VOLTAGE_WINDOW,  0, 24)

@@ -366,6 +368,8 @@ static void sd_set_ocr(SDState *sd)
  }
  }
  
+/* SD Configuration register */

+
  static void sd_set_scr(SDState *sd)
  {
  sd->scr[0] = 0 << 4;/* SCR structure version 1.0 */
@@ -388,6 +392,8 @@ static void sd_set_scr(SDState *sd)
  sd->scr[7] = 0x00;
  }
  
+/* Card IDentification register */

+
  #define MID 0xaa
  #define OID "XY"
  #define PNM "QEMU!"
@@ -413,6 +419,8 @@ static void sd_set_cid(SDState *sd)
  sd->cid[15] = (sd_crc7(sd->cid, 15) << 1) | 1;
  }
  
+/* Card-Specific Data register */

+
  #define HWBLOCK_SHIFT   9/* 512 bytes */
  #define SECTOR_SHIFT5/* 16 kilobytes */
  #define WPGROUP_SHIFT   7/* 2 megs */
@@ -482,6 +490,8 @@ static void sd_set_csd(SDState *sd, uint64_t size)
  sd->csd[15] = (sd_crc7(sd->csd, 15) << 1) | 1;
  }
  
+/* Relative Card Address register */

+
  static uint16_t sd_req_get_rca(SDState *s, SDRequest req)
  {
  if (sd_cmd_type[req.cmd] == sd_ac || sd_cmd_type[req.cmd] == sd_adtc) {
@@ -490,6 +500,8 @@ static uint16_t sd_req_get_rca(SDState *s, SDRequest req)
  return 0;
  }
  
+/* Card Status register */

+
  FIELD(CSR, AKE_SEQ_ERROR,   3,  1)
  FIELD(CSR, APP_CMD, 5,  1)
  FIELD(CSR, FX_EVENT,6,  1)
@@ -620,6 +632,8 @@ static void sd_reset(DeviceState *dev)
  sect = sd_addr_to_wpnum(size) + 1;
  
  sd->state = sd_idle_state;

+
+/* card registers */
  sd->rca = 0x0000;
  sd->size = size;
  sd_set_ocr(sd);
@@ -1052,6 +1066,7 @@ static sd_rsp_type_t sd_cmd_unimplemented(SDState *sd, 
SDRequest req)
  return sd_illegal;
  }
  
+/* CMD0 */

  static sd_rsp_type_t sd_cmd_GO_IDLE_STATE(SDState *sd, SDRequest req)
  {
  if (sd->state != sd_inactive_state) {
@@ -1062,6 +1077,7 @@ static sd_rsp_type_t sd_cmd_GO_IDLE_STATE(SDState *sd, 
SDRequest req)
  return sd_is_spi(sd) ? sd_r1 : sd_r0;
  }
  
+/* CMD1 */

  static sd_rsp_type_t spi_cmd_SEND_OP_COND(SDState *sd, SDRequest req)
  {
  sd->state = sd_transfer_state;
@@ -1069,6 +1085,7 @@ static sd_rsp_type_t spi_cmd_SEND_OP_COND(SDState *sd, 
SDRequest req)
  return sd_r1;
  }
  
+/* CMD2 */

  static sd_rsp_type_t sd_cmd_ALL_SEND_CID(SDState *sd, SDRequest req)
  {
  switch (sd->state) {
@@ -1080,6 +1097,7 @@ static sd_rsp_type_t sd_cmd_ALL_SEND_CID(SDState *sd, 
SDRequest req)
  }
  }
  
+/* CMD3 */

  static sd_rsp_type_t sd_cmd_SEND_RELATIVE_ADDR(SDState *sd, SDRequest req)
  {
  switch (sd->state) {
@@ -1094,6 +1112,7 @@ static sd_rsp_type_t sd_cmd_SEND_RELATIVE_ADDR(SDState 
*sd, SDRequest req)
  }
  }
  
+/* CMD19 */

  static sd_rsp_type_t sd_cmd_SEND_TUNING_BLOCK(SDState *sd, SDRequest req)
  {
  if (sd->spec_version < SD_PHY_SPECv3_01_VERS) {
@@ -1110,6 +1129,7 @@ static sd_rsp_type_t sd_cmd_SEND_TUNING_BLOCK(SDState 
*sd, SDRequest req)
  return sd_r1;
  }
  
+/* CMD23 */

  static sd_rsp_type_t sd_cmd_SET_BLOCK_COUNT(SDState *sd, SDRequest req)
  {
  if (sd->spec_version < SD_PHY_SPECv3_01_VERS) {





Re: [PATCH 18/23] hw/sd/sdcard: Add sd_invalid_mode_for_cmd to report invalid mode switch

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

Having the mode switch displayed helps to track incomplete
command implementations.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 75 +-
  1 file changed, 41 insertions(+), 34 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 1df16ce6a2..8d63a39a54 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -180,6 +180,17 @@ static const char *sd_version_str(enum 
SDPhySpecificationVersion version)
  return sdphy_version[version];
  }
  
+static const char *sd_mode_name(enum SDCardModes mode)

+{
+static const char *mode_name[] = {
+[sd_inactive]   = "inactive",
+[sd_card_identification_mode]   = "identification",
+[sd_data_transfer_mode] = "transfer",
+};
+assert(mode < ARRAY_SIZE(mode_name));
+return mode_name[mode];
+}
+
  static const char *sd_state_name(enum SDCardStates state)
  {
  static const char *state_name[] = {
@@ -1015,6 +1026,15 @@ static sd_rsp_type_t sd_invalid_state_for_cmd(SDState 
*sd, SDRequest req)
  return sd_illegal;
  }
  
+static sd_rsp_type_t sd_invalid_mode_for_cmd(SDState *sd, SDRequest req)

+{
+qemu_log_mask(LOG_GUEST_ERROR, "%s: CMD%i in a wrong mode: %s (spec %s)\n",
+  sd_proto(sd)->name, req.cmd, sd_mode_name(sd->mode),
+  sd_version_str(sd->spec_version));
+
+return sd_illegal;
+}
+
  static sd_rsp_type_t sd_cmd_illegal(SDState *sd, SDRequest req)
  {
  qemu_log_mask(LOG_GUEST_ERROR, "%s: Unknown CMD%i for spec %s\n",
@@ -1154,18 +1174,14 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  break;
  
  case 6:  /* CMD6:   SWITCH_FUNCTION */

-switch (sd->mode) {
-case sd_data_transfer_mode:
-sd_function_switch(sd, req.arg);
-sd->state = sd_sendingdata_state;
-sd->data_start = 0;
-sd->data_offset = 0;
-return sd_r1;
-
-default:
-break;
+if (sd->mode != sd_data_transfer_mode) {
+return sd_invalid_mode_for_cmd(sd, req);
  }
-break;
+sd_function_switch(sd, req.arg);
+sd->state = sd_sendingdata_state;
+sd->data_start = 0;
+sd->data_offset = 0;
+return sd_r1;
  
  case 7:  /* CMD7:   SELECT/DESELECT_CARD */

  rca = sd_req_get_rca(sd, req);
@@ -1289,33 +1305,24 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  
  case 13:  /* CMD13:  SEND_STATUS */

  rca = sd_req_get_rca(sd, req);
-switch (sd->mode) {
-case sd_data_transfer_mode:
-if (!sd_is_spi(sd) && sd->rca != rca) {
-return sd_r0;
-}
-
-return sd_r1;
-
-default:
-break;
+if (sd->mode != sd_data_transfer_mode) {
+return sd_invalid_mode_for_cmd(sd, req);
  }
-break;
+if (!sd_is_spi(sd) && sd->rca != rca) {
+return sd_r0;
+}
+
+return sd_r1;
  
  case 15:  /* CMD15:  GO_INACTIVE_STATE */

-rca = sd_req_get_rca(sd, req);
-switch (sd->mode) {
-case sd_data_transfer_mode:
-if (sd->rca != rca)
-return sd_r0;
-
-sd->state = sd_inactive_state;
-return sd_r0;
-
-default:
-break;
+if (sd->mode != sd_data_transfer_mode) {
+return sd_invalid_mode_for_cmd(sd, req);
  }
-break;
+rca = sd_req_get_rca(sd, req);
+if (sd->rca == rca) {
+sd->state = sd_inactive_state;
+}
+return sd_r0;
  
  /* Block read commands (Class 2) */

  case 16:  /* CMD16:  SET_BLOCKLEN */





Re: [PATCH 19/23] hw/sd/sdcard: Inline BLK_READ_BLOCK / BLK_WRITE_BLOCK macros

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

These macros only save 3 chars and make the code harder
to maintain, simply remove them.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 15 +++
  1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 8d63a39a54..ca2c903c5b 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -816,8 +816,6 @@ static void sd_blk_write(SDState *sd, uint64_t addr, 
uint32_t len)
  }
  }
  
-#define BLK_READ_BLOCK(a, len)  sd_blk_read(sd, a, len)

-#define BLK_WRITE_BLOCK(a, len) sd_blk_write(sd, a, len)
  #define APP_READ_BLOCK(a, len)  memset(sd->data, 0xec, len)
  #define APP_WRITE_BLOCK(a, len)
  
@@ -869,7 +867,7 @@ static void sd_erase(SDState *sd)

  continue;
  }
  }
-BLK_WRITE_BLOCK(erase_addr, erase_len);
+sd_blk_write(sd, erase_addr, erase_len);
  }
  }
  
@@ -1901,7 +1899,7 @@ void sd_write_byte(SDState *sd, uint8_t value)

  if (sd->data_offset >= sd->blk_len) {
  /* TODO: Check CRC before committing */
  sd->state = sd_programming_state;
-BLK_WRITE_BLOCK(sd->data_start, sd->data_offset);
+sd_blk_write(sd, sd->data_start, sd->data_offset);
  sd->blk_written ++;
  sd->csd[14] |= 0x40;
  /* Bzzztt  Operation complete.  */
@@ -1927,7 +1925,7 @@ void sd_write_byte(SDState *sd, uint8_t value)
  if (sd->data_offset >= sd->blk_len) {
  /* TODO: Check CRC before committing */
  sd->state = sd_programming_state;
-BLK_WRITE_BLOCK(sd->data_start, sd->data_offset);
+sd_blk_write(sd, sd->data_start, sd->data_offset);
  sd->blk_written++;
  sd->data_start += sd->blk_len;
  sd->data_offset = 0;
@@ -2075,8 +2073,9 @@ uint8_t sd_read_byte(SDState *sd)
  break;
  
  case 17:  /* CMD17:  READ_SINGLE_BLOCK */

-if (sd->data_offset == 0)
-BLK_READ_BLOCK(sd->data_start, io_len);
+if (sd->data_offset == 0) {
+sd_blk_read(sd, sd->data_start, io_len);
+}
  ret = sd->data[sd->data_offset ++];
  
  if (sd->data_offset >= io_len)

@@ -2089,7 +2088,7 @@ uint8_t sd_read_byte(SDState *sd)
sd->data_start, io_len)) {
  return 0x00;
  }
-BLK_READ_BLOCK(sd->data_start, io_len);
+sd_blk_read(sd, sd->data_start, io_len);
  }
  ret = sd->data[sd->data_offset ++];
  





Re: [PATCH 17/23] hw/sd/sdcard: Only call sd_req_get_address() where address is used

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

It will be useful later to assert only ADTC commands
(Addressed point-to-point Data Transfer Commands, defined
as the 'sd_adtc' enum) extract the address value from the
command argument.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 14 --
  1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index a0193a46ea..1df16ce6a2 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1111,7 +1111,7 @@ static sd_rsp_type_t sd_cmd_SET_BLOCK_COUNT(SDState *sd, 
SDRequest req)
  static sd_rsp_type_t sd_normal_command(SDState *sd, SDRequest req)
  {
  uint16_t rca;
-uint64_t addr = sd_req_get_address(sd, req);
+uint64_t addr;
  
  sd->last_cmd_name = sd_cmd_name(req.cmd);

  /* CMD55 precedes an ACMD, so we are not interested in tracing it.
@@ -1237,7 +1237,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  }
  sd->state = sd_sendingdata_state;
  memcpy(sd->data, sd->csd, 16);
-sd->data_start = addr;
+sd->data_start = sd_req_get_address(sd, req);
  sd->data_offset = 0;
  return sd_r1;
  
@@ -1261,7 +1261,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, SDRequest req)

  }
  sd->state = sd_sendingdata_state;
  memcpy(sd->data, sd->cid, 16);
-sd->data_start = addr;
+sd->data_start = sd_req_get_address(sd, req);
  sd->data_offset = 0;
  return sd_r1;
  
@@ -1337,6 +1337,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, SDRequest req)
  
  case 17:  /* CMD17:  READ_SINGLE_BLOCK */

  case 18:  /* CMD18:  READ_MULTIPLE_BLOCK */
+addr = sd_req_get_address(sd, req);
  switch (sd->state) {
  case sd_transfer_state:
  
@@ -1357,6 +1358,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, SDRequest req)

  /* Block write commands (Class 4) */
  case 24:  /* CMD24:  WRITE_SINGLE_BLOCK */
  case 25:  /* CMD25:  WRITE_MULTIPLE_BLOCK */
+addr = sd_req_get_address(sd, req);
  switch (sd->state) {
  case sd_transfer_state:
  
@@ -1415,7 +1417,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, SDRequest req)

  if (sd->size > SDSC_MAX_CAPACITY) {
  return sd_illegal;
  }
-
+addr = sd_req_get_address(sd, req);
  switch (sd->state) {
  case sd_transfer_state:
  if (!address_in_range(sd, "SET_WRITE_PROT", addr, 1)) {
@@ -1437,7 +1439,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  if (sd->size > SDSC_MAX_CAPACITY) {
  return sd_illegal;
  }
-
+addr = sd_req_get_address(sd, req);
  switch (sd->state) {
  case sd_transfer_state:
  if (!address_in_range(sd, "CLR_WRITE_PROT", addr, 1)) {
@@ -1459,7 +1461,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  if (sd->size > SDSC_MAX_CAPACITY) {
  return sd_illegal;
  }
-
+addr = sd_req_get_address(sd, req);
  switch (sd->state) {
  case sd_transfer_state:
  if (!address_in_range(sd, "SEND_WRITE_PROT",





Re: [PATCH 16/23] hw/sd/sdcard: Factor sd_req_get_address() method out

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

Extract sd_req_get_address() so we can re-use it
in various SDProto handlers. Use CARD_CAPACITY and
HWBLOCK_SHIFT definitions instead of magic values.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 10 +-
  1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index cb9d85bb11..a0193a46ea 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -579,6 +579,14 @@ static void sd_response_r7_make(SDState *sd, uint8_t 
*response)
  stl_be_p(response, sd->vhs);
  }
  
+static uint64_t sd_req_get_address(SDState *sd, SDRequest req)

+{
+if (FIELD_EX32(sd->ocr, OCR, CARD_CAPACITY)) {
+return (uint64_t) req.arg << HWBLOCK_SHIFT;
+}
+return req.arg;
+}
+
  static inline uint64_t sd_addr_to_wpnum(uint64_t addr)
  {
  return addr >> (HWBLOCK_SHIFT + SECTOR_SHIFT + WPGROUP_SHIFT);
@@ -1103,7 +1111,7 @@ static sd_rsp_type_t sd_cmd_SET_BLOCK_COUNT(SDState *sd, 
SDRequest req)
  static sd_rsp_type_t sd_normal_command(SDState *sd, SDRequest req)
  {
  uint16_t rca;
-uint64_t addr = (sd->ocr & (1 << 30)) ? (uint64_t) req.arg << 9 : req.arg;
+uint64_t addr = sd_req_get_address(sd, req);
  
  sd->last_cmd_name = sd_cmd_name(req.cmd);

  /* CMD55 precedes an ACMD, so we are not interested in tracing it.





Re: [PATCH 15/23] hw/sd/sdcard: Only call sd_req_get_rca() where RCA is used

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

It will be useful later to assert only AC commands
(Addressed point-to-point Commands, defined as the
'sd_ac' enum) extract the RCA value from the command
argument.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index bc47ae36bc..cb9d85bb11 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1102,7 +1102,7 @@ static sd_rsp_type_t sd_cmd_SET_BLOCK_COUNT(SDState *sd, 
SDRequest req)
  
  static sd_rsp_type_t sd_normal_command(SDState *sd, SDRequest req)

  {
-uint16_t rca = sd_req_get_rca(sd, req);
+uint16_t rca;
  uint64_t addr = (sd->ocr & (1 << 30)) ? (uint64_t) req.arg << 9 : req.arg;
  
  sd->last_cmd_name = sd_cmd_name(req.cmd);

@@ -1160,6 +1160,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  break;
  
  case 7:  /* CMD7:   SELECT/DESELECT_CARD */

+rca = sd_req_get_rca(sd, req);
  switch (sd->state) {
  case sd_standby_state:
  if (sd->rca != rca)
@@ -1214,6 +1215,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  return sd_r7;
  
  case 9:  /* CMD9:   SEND_CSD */

+rca = sd_req_get_rca(sd, req);
  switch (sd->state) {
  case sd_standby_state:
  if (sd->rca != rca)
@@ -1237,6 +1239,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  break;
  
  case 10:  /* CMD10:  SEND_CID */

+rca = sd_req_get_rca(sd, req);
  switch (sd->state) {
  case sd_standby_state:
  if (sd->rca != rca)
@@ -1277,6 +1280,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  break;
  
  case 13:  /* CMD13:  SEND_STATUS */

+rca = sd_req_get_rca(sd, req);
  switch (sd->mode) {
  case sd_data_transfer_mode:
  if (!sd_is_spi(sd) && sd->rca != rca) {
@@ -1291,6 +1295,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  break;
  
  case 15:  /* CMD15:  GO_INACTIVE_STATE */

+rca = sd_req_get_rca(sd, req);
  switch (sd->mode) {
  case sd_data_transfer_mode:
  if (sd->rca != rca)
@@ -1523,6 +1528,7 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  
  /* Application specific commands (Class 8) */

  case 55:  /* CMD55:  APP_CMD */
+rca = sd_req_get_rca(sd, req);
  switch (sd->state) {
  case sd_ready_state:
  case sd_identification_state:





Re: [PATCH 14/23] hw/sd/sdcard: Factor sd_req_get_rca() method out

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

Extract sd_req_get_rca() so we can re-use it in various
SDProto handlers. Return a 16-bit value since RCA is 16-bit.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 15 +--
  1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 510784fc82..bc47ae36bc 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -471,6 +471,14 @@ static void sd_set_csd(SDState *sd, uint64_t size)
  sd->csd[15] = (sd_crc7(sd->csd, 15) << 1) | 1;
  }
  
+static uint16_t sd_req_get_rca(SDState *s, SDRequest req)

+{
+if (sd_cmd_type[req.cmd] == sd_ac || sd_cmd_type[req.cmd] == sd_adtc) {
+return req.arg >> 16;
+}
+return 0;
+}
+
  FIELD(CSR, AKE_SEQ_ERROR,   3,  1)
  FIELD(CSR, APP_CMD, 5,  1)
  FIELD(CSR, FX_EVENT,6,  1)
@@ -1094,7 +1102,7 @@ static sd_rsp_type_t sd_cmd_SET_BLOCK_COUNT(SDState *sd, 
SDRequest req)
  
  static sd_rsp_type_t sd_normal_command(SDState *sd, SDRequest req)

  {
-uint32_t rca = 0x0000;
+uint16_t rca = sd_req_get_rca(sd, req);
  uint64_t addr = (sd->ocr & (1 << 30)) ? (uint64_t) req.arg << 9 : req.arg;
  
  sd->last_cmd_name = sd_cmd_name(req.cmd);

@@ -1110,11 +1118,6 @@ static sd_rsp_type_t sd_normal_command(SDState *sd, 
SDRequest req)
  /* Not interpreting this as an app command */
  sd->card_status &= ~APP_CMD;
  
-if (sd_cmd_type[req.cmd] == sd_ac

-|| sd_cmd_type[req.cmd] == sd_adtc) {
-rca = req.arg >> 16;
-}
-
  /* CMD23 (set block count) must be immediately followed by CMD18 or CMD25
   * if not, its effects are cancelled */
  if (sd->multi_blk_cnt != 0 && !(req.cmd == 18 || req.cmd == 25)) {





Re: [PATCH 13/23] hw/sd/sdcard: Have cmd_valid_while_locked() return a boolean value

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index c6cc1bab11..510784fc82 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1716,7 +1716,7 @@ static sd_rsp_type_t sd_app_command(SDState *sd,
  return sd_illegal;
  }
  
-static int cmd_valid_while_locked(SDState *sd, const uint8_t cmd)

+static bool cmd_valid_while_locked(SDState *sd, unsigned cmd)
  {
  /* Valid commands in locked state:
   * basic class (0)
@@ -1730,7 +1730,7 @@ static int cmd_valid_while_locked(SDState *sd, const 
uint8_t cmd)
  return cmd == 41 || cmd == 42;
  }
  if (cmd == 16 || cmd == 55) {
-return 1;
+return true;
  }
  return sd_cmd_class[cmd] == 0 || sd_cmd_class[cmd] == 7;
  }





Re: [PATCH v3 qemu 00/11] acpi: NUMA nodes for CXL HB as GP + complex NUMA test

2024-06-21 Thread Jonathan Cameron via
On Thu, 20 Jun 2024 17:03:08 +0100
Jonathan Cameron  wrote:

> v3: Thanks to Richard for help debugging BE issue and to Igor for
> finding a bunch of other things to improve via the context in
> the fix patch.

I forgot to mention that this time I ran the bios tables test on
an emulated x86_64 machine on top of an emulated s390 (with the timeouts
massively increased as it took about 2 hours).

Hopefully no more surprises!




Re: [PATCH 12/23] hw/sd/sdcard: Trace block offset in READ/WRITE data accesses

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

Useful to detect out of bound accesses.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 4 ++--
  hw/sd/trace-events | 4 ++--
  2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 2586d15cbd..c6cc1bab11 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1868,7 +1868,7 @@ void sd_write_byte(SDState *sd, uint8_t value)
  
  trace_sdcard_write_data(sd_proto(sd)->name,

  sd->last_cmd_name,
-sd->current_cmd, value);
+sd->current_cmd, sd->data_offset, value);
  switch (sd->current_cmd) {
  case 24:  /* CMD24:  WRITE_SINGLE_BLOCK */
  sd->data[sd->data_offset ++] = value;
@@ -2024,7 +2024,7 @@ uint8_t sd_read_byte(SDState *sd)
  
  trace_sdcard_read_data(sd_proto(sd)->name,

 sd->last_cmd_name,
-   sd->current_cmd, io_len);
+   sd->current_cmd, sd->data_offset, io_len);
  switch (sd->current_cmd) {
  case 6:  /* CMD6:   SWITCH_FUNCTION */
  ret = sd->data[sd->data_offset ++];
diff --git a/hw/sd/trace-events b/hw/sd/trace-events
index 724365efc3..0eee98a646 100644
--- a/hw/sd/trace-events
+++ b/hw/sd/trace-events
@@ -52,8 +52,8 @@ sdcard_lock(void) ""
  sdcard_unlock(void) ""
  sdcard_read_block(uint64_t addr, uint32_t len) "addr 0x%" PRIx64 " size 0x%x"
  sdcard_write_block(uint64_t addr, uint32_t len) "addr 0x%" PRIx64 " size 0x%x"
-sdcard_write_data(const char *proto, const char *cmd_desc, uint8_t cmd, uint8_t value) 
"%s %20s/ CMD%02d value 0x%02x"
-sdcard_read_data(const char *proto, const char *cmd_desc, uint8_t cmd, uint32_t length) 
"%s %20s/ CMD%02d len %" PRIu32
+sdcard_write_data(const char *proto, const char *cmd_desc, uint8_t cmd, uint32_t offset, uint8_t 
value) "%s %20s/ CMD%02d ofs %"PRIu32" value 0x%02x"
+sdcard_read_data(const char *proto, const char *cmd_desc, uint8_t cmd, uint32_t offset, uint32_t 
length) "%s %20s/ CMD%02d ofs %"PRIu32" len %" PRIu32
  sdcard_set_voltage(uint16_t millivolts) "%u mV"
  
  # pxa2xx_mmci.c





Re: [PATCH 11/23] hw/sd/sdcard: Trace update of block count (CMD23)

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 1 +
  hw/sd/trace-events | 3 ++-
  2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 4e378f7cf7..2586d15cbd 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1087,6 +1087,7 @@ static sd_rsp_type_t sd_cmd_SET_BLOCK_COUNT(SDState *sd, 
SDRequest req)
  }
  
  sd->multi_blk_cnt = req.arg;

+trace_sdcard_set_block_count(sd->multi_blk_cnt);
  
  return sd_r1;

  }
diff --git a/hw/sd/trace-events b/hw/sd/trace-events
index 94a00557b2..724365efc3 100644
--- a/hw/sd/trace-events
+++ b/hw/sd/trace-events
@@ -43,7 +43,8 @@ sdcard_response(const char *rspdesc, int rsplen) "%s (sz:%d)"
  sdcard_powerup(void) ""
  sdcard_inquiry_cmd41(void) ""
  sdcard_reset(void) ""
-sdcard_set_blocklen(uint16_t length) "0x%03x"
+sdcard_set_blocklen(uint16_t length) "block len 0x%03x"
+sdcard_set_block_count(uint32_t cnt) "block cnt 0x%"PRIx32
  sdcard_inserted(bool readonly) "read_only: %u"
  sdcard_ejected(void) ""
  sdcard_erase(uint32_t first, uint32_t last) "addr first 0x%" PRIx32" last 0x%" PRIx32





Re: [PATCH 08/23] hw/sd/sdcard: Remove explicit entries for illegal commands

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

A NULL handler is already treated as an illegal command, so there is
no need for explicit sd_cmd_illegal entries (this keeps the arrays
simpler to maintain).

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 11 ---
  1 file changed, 11 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index e9af834a8c..30239b28bc 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -2146,12 +2146,6 @@ static const SDProto sd_proto_spi = {
  .cmd = {
  [0] = sd_cmd_GO_IDLE_STATE,
  [1] = spi_cmd_SEND_OP_COND,
-[2 ... 4]   = sd_cmd_illegal,
-[5] = sd_cmd_illegal,
-[7] = sd_cmd_illegal,
-[15]= sd_cmd_illegal,
-[26]= sd_cmd_illegal,
-[52 ... 54] = sd_cmd_illegal,
  },
  .acmd = {
  [41]= spi_cmd_SEND_OP_COND,
@@ -2162,15 +2156,10 @@ static const SDProto sd_proto_sd = {
  .name = "SD",
  .cmd = {
  [0] = sd_cmd_GO_IDLE_STATE,
-[1] = sd_cmd_illegal,
  [2] = sd_cmd_ALL_SEND_CID,
  [3] = sd_cmd_SEND_RELATIVE_ADDR,
-[5] = sd_cmd_illegal,
  [19]= sd_cmd_SEND_TUNING_BLOCK,
  [23]= sd_cmd_SET_BLOCK_COUNT,
-[52 ... 54] = sd_cmd_illegal,
-[58]= sd_cmd_illegal,
-[59]= sd_cmd_illegal,
  },
  };
  





Re: [PATCH 07/23] hw/sd/sdcard: Remove ACMD6 handler for SPI mode

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

There is no ACMD6 command in SPI mode, remove the pointless
handler introduced in commit 946897ce18 ("sdcard: handles
more commands in SPI mode"). Keep sd_cmd_unimplemented()
since we'll reuse it later.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index b0cd30c657..e9af834a8c 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1012,6 +1012,7 @@ static sd_rsp_type_t sd_cmd_illegal(SDState *sd, 
SDRequest req)
  }
  
  /* Commands that are recognised but not yet implemented. */

+__attribute__((unused))
  static sd_rsp_type_t sd_cmd_unimplemented(SDState *sd, SDRequest req)
  {
  qemu_log_mask(LOG_UNIMP, "%s: CMD%i not implemented\n",
@@ -2153,7 +2154,6 @@ static const SDProto sd_proto_spi = {
  [52 ... 54] = sd_cmd_illegal,
  },
  .acmd = {
-[6] = sd_cmd_unimplemented,
  [41]= spi_cmd_SEND_OP_COND,
  },
  };





Re: [PATCH 06/23] hw/sd/sdcard: Use Load/Store API to fill some CID/CSD registers

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

The ld/st API helps to notice when several CID or CSD bytes
refer to the same field. Multi-byte fields are stored MSB
first in CID / CSD.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 24415cb9f0..b0cd30c657 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -393,10 +393,7 @@ static void sd_set_cid(SDState *sd)
  sd->cid[6] = PNM[3];
  sd->cid[7] = PNM[4];
  sd->cid[8] = PRV;   /* Fake product revision (PRV) */
-sd->cid[9] = 0xde;  /* Fake serial number (PSN) */
-sd->cid[10] = 0xad;
-sd->cid[11] = 0xbe;
-sd->cid[12] = 0xef;
+stl_be_p(&sd->cid[9], 0xdeadbeef); /* Fake serial number (PSN) */
  sd->cid[13] = 0x00 |/* Manufacture date (MDT) */
  ((MDT_YR - 2000) / 10);
  sd->cid[14] = ((MDT_YR % 10) << 4) | MDT_MON;
@@ -462,9 +459,7 @@ static void sd_set_csd(SDState *sd, uint64_t size)
  sd->csd[4] = 0x5b;
  sd->csd[5] = 0x59;
  sd->csd[6] = 0x00;
-sd->csd[7] = (size >> 16) & 0xff;
-sd->csd[8] = (size >> 8) & 0xff;
-sd->csd[9] = (size & 0xff);
+st24_be_p(&sd->csd[7], size);
  sd->csd[10] = 0x7f;
  sd->csd[11] = 0x80;
  sd->csd[12] = 0x0a;





Re: [PATCH 05/23] hw/sd/sdcard: Use registerfield CSR::CURRENT_STATE definition

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

Use registerfield-generated definitions to update card_status.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index c528c30bcf..24415cb9f0 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1788,8 +1788,8 @@ int sd_do_command(SDState *sd, SDRequest *req,
   * (Do this now so they appear in r1 responses.)
   */
  sd->current_cmd = req->cmd;
-sd->card_status &= ~CURRENT_STATE;
-sd->card_status |= (last_state << 9);
+sd->card_status = FIELD_DP32(sd->card_status, CSR,
+ CURRENT_STATE, last_state);
  }
  
  send_response:





Re: [PATCH 03/23] hw/sd/sdcard: Fix typo in SEND_OP_COND command name

2024-06-21 Thread Cédric Le Goater

On 6/21/24 10:05 AM, Philippe Mathieu-Daudé wrote:

There is no SEND_OP_CMD command; the correct name is SEND_OP_COND.

Signed-off-by: Philippe Mathieu-Daudé 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/sd/sd.c | 6 +++---
  hw/sd/sdmmc-internal.c | 2 +-
  2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index addeb1940f..331cef5779 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -1035,7 +1035,7 @@ static sd_rsp_type_t sd_cmd_GO_IDLE_STATE(SDState *sd, 
SDRequest req)
  return sd_is_spi(sd) ? sd_r1 : sd_r0;
  }
  
-static sd_rsp_type_t sd_cmd_SEND_OP_CMD(SDState *sd, SDRequest req)

+static sd_rsp_type_t spi_cmd_SEND_OP_COND(SDState *sd, SDRequest req)
  {
  sd->state = sd_transfer_state;
  
@@ -2149,7 +2149,7 @@ static const SDProto sd_proto_spi = {

  .name = "SPI",
  .cmd = {
  [0] = sd_cmd_GO_IDLE_STATE,
-[1] = sd_cmd_SEND_OP_CMD,
+[1] = spi_cmd_SEND_OP_COND,
  [2 ... 4]   = sd_cmd_illegal,
  [5] = sd_cmd_illegal,
  [7] = sd_cmd_illegal,
@@ -2159,7 +2159,7 @@ static const SDProto sd_proto_spi = {
  },
  .acmd = {
  [6] = sd_cmd_unimplemented,
-[41]= sd_cmd_SEND_OP_CMD,
+[41]= spi_cmd_SEND_OP_COND,
  },
  };
  
diff --git a/hw/sd/sdmmc-internal.c b/hw/sd/sdmmc-internal.c

index 8648a7808d..c1d5508ae6 100644
--- a/hw/sd/sdmmc-internal.c
+++ b/hw/sd/sdmmc-internal.c
@@ -14,7 +14,7 @@
  const char *sd_cmd_name(uint8_t cmd)
  {
  static const char *cmd_abbrev[SDMMC_CMD_MAX] = {
- [0]= "GO_IDLE_STATE",   [1]= "SEND_OP_CMD",
+ [0]= "GO_IDLE_STATE",   [1]= "SEND_OP_COND",
   [2]= "ALL_SEND_CID",[3]= "SEND_RELATIVE_ADDR",
   [4]= "SET_DSR", [5]= "IO_SEND_OP_COND",
   [6]= "SWITCH_FUNC", [7]= "SELECT/DESELECT_CARD",





  1   2   3   >