Re: [PATCH v2] target/i386: Export GDS_NO bit to guests

2023-08-14 Thread Xiaoyao Li

On 8/15/2023 12:54 PM, Pawan Gupta wrote:

Gather Data Sampling (GDS) is a side-channel attack using Gather
instructions. Some Intel processors will set ARCH_CAP_GDS_NO bit in
MSR IA32_ARCH_CAPABILITIES to report that they are not vulnerable to
GDS.

Make this bit available to guests.

Closes: 
https://lore.kernel.org/qemu-devel/camgffemg6tnq0n3+4ojagxc8j0oevy60khzekxcbs3lok9v...@mail.gmail.com/
Reported-by: Jack Wang 
Signed-off-by: Pawan Gupta 
Tested-by: Jack Wang 
Tested-by: Daniel Sneddon 


Reviewed-by: Xiaoyao Li 


---
v2: Added commit tags

v1: 
https://lore.kernel.org/qemu-devel/c373f3f92b542b738f296d44bb6a916a1cded7bd.1691774049.git.pawan.kumar.gu...@linux.intel.com/

  target/i386/cpu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 97ad229d8ba3..48709b77689f 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1155,7 +1155,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
  NULL, "sbdr-ssdp-no", "fbsdp-no", "psdp-no",
  NULL, "fb-clear", NULL, NULL,
  NULL, NULL, NULL, NULL,
-"pbrsb-no", NULL, NULL, NULL,
+"pbrsb-no", NULL, "gds-no", NULL,
  NULL, NULL, NULL, NULL,
  },
  .msr = {





Re: [PULL 0/1] late tcg fix

2023-08-14 Thread Richard Henderson

On 8/14/23 19:10, Richard Henderson wrote:

The following changes since commit bb5f142cb320d45d3d8dee2c82dae003cad39da8:

   Merge tag 'pull-riscv-to-apply-20230811-3' of 
https://github.com/alistair23/qemu into staging (2023-08-11 14:47:23 -0700)

are available in the Git repository at:

   https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230814

for you to fetch changes up to d3b41127c205062ca6c29c89c9542c4112c39ca0:

   tcg/i386: Output %gs prefix in tcg_out_vex_opc (2023-08-12 08:51:12 -0700)


tcg/i386: Output %gs prefix in tcg_out_vex_opc


Richard Henderson (1):
   tcg/i386: Output %gs prefix in tcg_out_vex_opc

  tcg/i386/tcg-target.c.inc | 3 +++
  1 file changed, 3 insertions(+)


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/8.1 as 
appropriate.


r~




Re: [PATCH] trace-events: Fix the name of the tracing.rst file

2023-08-14 Thread Michael Tokarev

14.08.2023 20:12, Thomas Huth wrote:

The file was converted to .rst a while ago - make sure that the
references in the trace-events files are pointing to the right location
now.


Applied to my trivial-patches tree, thanks!

/mjt



[RFC PATCH] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB

2023-08-14 Thread Jordan Niethe
Direct branch patching was disabled when using TCG_REG_TB in commit
736a1588c1 ("tcg/ppc: Fix race in goto_tb implementation"). Commit
7502f89c74 ("tcg/ppc: Use prefixed instructions for tcg_out_goto_tb")
used the support for pc relative ISAv3.1 instructions to re-enable
direct branch patching on POWER10.

The issue with direct branch patching with TCG_REG_TB is the lack of
synchronization between the new TCG_REG_TB being established and the
direct branch being patched in.

If each translation block is responsible for establishing its own
TCG_REG_TB then there can be no synchronization issue.

Make each translation block begin by setting up its own TCG_REG_TB.
ISA v3.0 has addpcis so use that for getting the pc at the beginning of
a translation block (plus offset). For systems without addpcis, use
the preferred 'bcl 20,31,$+4' sequence.

When branching indirectly to a translation block the setup sequence can
be skipped if the caller sets up TCG_REG_TB as there is no possible race
in this case.
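
For reference, a sketch of the two "get pc" sequences this emits at the
start of each TB (standard Power mnemonics; rTB and rTMP stand in for
TCG_REG_TB and TCG_REG_TMP1):

    # ISA v3.0 and newer: addpcis yields the next instruction address
    lnia  rTB                # addpcis rTB,0
    addi  rTB,rTB,-4         # wind back to the start of the TB

    # Older ISAs: preferred bcl form, preserving the caller's LR
    mflr  rTMP
    bcl   20,31,$+4          # LR = address of the next instruction
    mflr  rTB
    addi  rTB,rTB,-8         # wind back to the start of the TB
    mtlr  rTMP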

Signed-off-by: Jordan Niethe 
---
This is just a proof of concept; I'm not sure this is the correct way
to do it, or even whether it is something we'd like to do.

Applies on top of Richard's series [1].

  [1] 
https://lore.kernel.org/qemu-devel/20230808030250.50602-1-richard.hender...@linaro.org/
---
 include/tcg/tcg.h|  1 +
 tcg/ppc/tcg-target.c.inc | 59 ++--
 tcg/tcg.c|  3 ++
 3 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 0875971719..337506fea0 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -518,6 +518,7 @@ struct TCGContext {
extension that allows arithmetic on void*.  */
 void *code_gen_buffer;
 size_t code_gen_buffer_size;
+size_t code_gen_entry_size;
 void *code_gen_ptr;
 void *data_gen_ptr;
 
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index b686a68247..4b55751051 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -382,6 +382,7 @@ static bool tcg_target_const_match(int64_t val, TCGType 
type, int ct)
 #define CRNAND XO19(225)
 #define CROR   XO19(449)
 #define CRNOR  XO19( 33)
+#define ADDPCIS  XO19( 2)
 
 #define EXTSB  XO31(954)
 #define EXTSH  XO31(922)
@@ -2635,6 +2636,30 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 tcg_out32(s, BCLR | BO_ALWAYS);
 }
 
+
+#define TCG_TARGET_NEED_ENTER_TB
+static void tcg_out_enter_tb(TCGContext *s)
+{
+if (!USE_REG_TB) {
+return;
+}
+
+if (have_isa_3_00) {
+/* lnia REG_TB */
+tcg_out32(s, ADDPCIS | RT(TCG_REG_TB));
+tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -4));
+} else {
+tcg_out32(s, MFSPR | RT(TCG_REG_TMP1) | LR);
+/* bcl 20,31,$+4 (Preferred form for getting nia.) */
+tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK);
+tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR);
+tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -8));
+tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | LR);
+}
+
+s->code_gen_entry_size = tcg_current_code_size(s);
+}
+
 static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg)
 {
 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R3, arg);
@@ -2645,23 +2670,6 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
 {
 uintptr_t ptr = get_jmp_target_addr(s, which);
 
-if (USE_REG_TB) {
-/*
- * With REG_TB, we must always use indirect branching,
- * so that the branch destination and TCG_REG_TB match.
- */
-ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
-tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
-tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR);
-tcg_out32(s, BCCTR | BO_ALWAYS);
-
-/* For the unlinked case, need to reset TCG_REG_TB.  */
-set_jmp_reset_offset(s, which);
-tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB,
- -tcg_current_code_size(s));
-return;
-}
-
 /* Direct branch will be patched by tb_target_set_jmp_target. */
 set_jmp_insn_offset(s, which);
 tcg_out32(s, NOP);
@@ -2670,6 +2678,13 @@ static void tcg_out_goto_tb(TCGContext *s, int which)
 if (have_isa_3_10) {
 ptrdiff_t offset = tcg_pcrel_diff_for_prefix(s, (void *)ptr);
 tcg_out_8ls_d(s, PLD, TCG_REG_TMP1, 0, offset, 1);
+} else if (USE_REG_TB) {
+ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr);
+
+tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset);
+/* Callee can skip establishing REG_TB for the indirect case. */
+tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, TCG_REG_TB,
+s->code_gen_entry_size));
 } else {
 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr);
 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr);
@@ -2678,6 +2693,12 @@ static void tcg_out_goto_tb(TCGContext *s

[PATCH v2] target/i386: Export GDS_NO bit to guests

2023-08-14 Thread Pawan Gupta
Gather Data Sampling (GDS) is a side-channel attack using Gather
instructions. Some Intel processors will set ARCH_CAP_GDS_NO bit in
MSR IA32_ARCH_CAPABILITIES to report that they are not vulnerable to
GDS.

Make this bit available to guests.
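
For illustration, a minimal guest-side check of this bit (a hedged sketch,
not part of the patch; assumes root and Linux's msr module, with
IA32_ARCH_CAPABILITIES at MSR 0x10a and GDS_NO at bit 26, matching the
feature-word slot in the diff below):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        uint64_t val = 0;
        int fd = open("/dev/cpu/0/msr", O_RDONLY); /* needs the msr module */

        if (fd < 0) {
            perror("open /dev/cpu/0/msr");
            return 1;
        }
        if (pread(fd, &val, sizeof(val), 0x10a) != sizeof(val)) {
            perror("read IA32_ARCH_CAPABILITIES"); /* MSR may be hidden */
            close(fd);
            return 1;
        }
        close(fd);
        printf("GDS_NO is %s\n", (val & (1ULL << 26)) ? "set" : "clear");
        return 0;
    }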

Closes: 
https://lore.kernel.org/qemu-devel/camgffemg6tnq0n3+4ojagxc8j0oevy60khzekxcbs3lok9v...@mail.gmail.com/
Reported-by: Jack Wang 
Signed-off-by: Pawan Gupta 
Tested-by: Jack Wang 
Tested-by: Daniel Sneddon 
---
v2: Added commit tags

v1: 
https://lore.kernel.org/qemu-devel/c373f3f92b542b738f296d44bb6a916a1cded7bd.1691774049.git.pawan.kumar.gu...@linux.intel.com/

 target/i386/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 97ad229d8ba3..48709b77689f 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1155,7 +1155,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 NULL, "sbdr-ssdp-no", "fbsdp-no", "psdp-no",
 NULL, "fb-clear", NULL, NULL,
 NULL, NULL, NULL, NULL,
-"pbrsb-no", NULL, NULL, NULL,
+"pbrsb-no", NULL, "gds-no", NULL,
 NULL, NULL, NULL, NULL,
 },
 .msr = {
-- 
2.34.1




[PATCH] target/riscv: Update CSR bits name for svadu extension

2023-08-14 Thread Weiwei Li
The Svadu specification updated the name of the *envcfg bit that enables
hardware updating of the PTE A/D bits from HADE to ADUE.

Signed-off-by: Weiwei Li 
Signed-off-by: Junqiang Wang 
---
 target/riscv/cpu.c|  4 ++--
 target/riscv/cpu_bits.h   |  8 
 target/riscv/cpu_helper.c |  4 ++--
 target/riscv/csr.c| 12 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 6b93b04453..f04a985d55 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -875,9 +875,9 @@ static void riscv_cpu_reset_hold(Object *obj)
 env->two_stage_lookup = false;
 
 env->menvcfg = (cpu->cfg.ext_svpbmt ? MENVCFG_PBMTE : 0) |
-   (cpu->cfg.ext_svadu ? MENVCFG_HADE : 0);
+   (cpu->cfg.ext_svadu ? MENVCFG_ADUE : 0);
 env->henvcfg = (cpu->cfg.ext_svpbmt ? HENVCFG_PBMTE : 0) |
-   (cpu->cfg.ext_svadu ? HENVCFG_HADE : 0);
+   (cpu->cfg.ext_svadu ? HENVCFG_ADUE : 0);
 
 /* Initialized default priorities of local interrupts. */
 for (i = 0; i < ARRAY_SIZE(env->miprio); i++) {
diff --git a/target/riscv/cpu_bits.h b/target/riscv/cpu_bits.h
index 59f0ffd9e1..1c2ffae883 100644
--- a/target/riscv/cpu_bits.h
+++ b/target/riscv/cpu_bits.h
@@ -745,12 +745,12 @@ typedef enum RISCVException {
 #define MENVCFG_CBIE   (3UL << 4)
 #define MENVCFG_CBCFE  BIT(6)
 #define MENVCFG_CBZE   BIT(7)
-#define MENVCFG_HADE   (1ULL << 61)
+#define MENVCFG_ADUE   (1ULL << 61)
 #define MENVCFG_PBMTE  (1ULL << 62)
 #define MENVCFG_STCE   (1ULL << 63)
 
 /* For RV32 */
-#define MENVCFGH_HADE  BIT(29)
+#define MENVCFGH_ADUE  BIT(29)
 #define MENVCFGH_PBMTE BIT(30)
 #define MENVCFGH_STCE  BIT(31)
 
@@ -763,12 +763,12 @@ typedef enum RISCVException {
 #define HENVCFG_CBIE   MENVCFG_CBIE
 #define HENVCFG_CBCFE  MENVCFG_CBCFE
 #define HENVCFG_CBZE   MENVCFG_CBZE
-#define HENVCFG_HADE   MENVCFG_HADE
+#define HENVCFG_ADUE   MENVCFG_ADUE
 #define HENVCFG_PBMTE  MENVCFG_PBMTE
 #define HENVCFG_STCE   MENVCFG_STCE
 
 /* For RV32 */
-#define HENVCFGH_HADE   MENVCFGH_HADE
+#define HENVCFGH_ADUE   MENVCFGH_ADUE
 #define HENVCFGH_PBMTE  MENVCFGH_PBMTE
 #define HENVCFGH_STCE   MENVCFGH_STCE
 
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index 9f611d89bb..0ff6b59cff 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -861,11 +861,11 @@ static int get_physical_address(CPURISCVState *env, 
hwaddr *physical,
 }
 
 bool pbmte = env->menvcfg & MENVCFG_PBMTE;
-bool hade = env->menvcfg & MENVCFG_HADE;
+bool hade = env->menvcfg & MENVCFG_ADUE;
 
 if (first_stage && two_stage && env->virt_enabled) {
 pbmte = pbmte && (env->henvcfg & HENVCFG_PBMTE);
-hade = hade && (env->henvcfg & HENVCFG_HADE);
+hade = hade && (env->henvcfg & HENVCFG_ADUE);
 }
 
 int ptshift = (levels - 1) * ptidxbits;
diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index ea7585329e..b4c66dc8ca 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -1951,7 +1951,7 @@ static RISCVException write_menvcfg(CPURISCVState *env, 
int csrno,
 if (riscv_cpu_mxl(env) == MXL_RV64) {
 mask |= (cfg->ext_svpbmt ? MENVCFG_PBMTE : 0) |
 (cfg->ext_sstc ? MENVCFG_STCE : 0) |
-(cfg->ext_svadu ? MENVCFG_HADE : 0);
+(cfg->ext_svadu ? MENVCFG_ADUE : 0);
 }
 env->menvcfg = (env->menvcfg & ~mask) | (val & mask);
 
@@ -1971,7 +1971,7 @@ static RISCVException write_menvcfgh(CPURISCVState *env, 
int csrno,
 const RISCVCPUConfig *cfg = riscv_cpu_cfg(env);
 uint64_t mask = (cfg->ext_svpbmt ? MENVCFG_PBMTE : 0) |
 (cfg->ext_sstc ? MENVCFG_STCE : 0) |
-(cfg->ext_svadu ? MENVCFG_HADE : 0);
+(cfg->ext_svadu ? MENVCFG_ADUE : 0);
 uint64_t valh = (uint64_t)val << 32;
 
 env->menvcfg = (env->menvcfg & ~mask) | (valh & mask);
@@ -2023,7 +2023,7 @@ static RISCVException read_henvcfg(CPURISCVState *env, 
int csrno,
  * henvcfg.stce is read_only 0 when menvcfg.stce = 0
  * henvcfg.hade is read_only 0 when menvcfg.hade = 0
  */
-*val = env->henvcfg & (~(HENVCFG_PBMTE | HENVCFG_STCE | HENVCFG_HADE) |
+*val = env->henvcfg & (~(HENVCFG_PBMTE | HENVCFG_STCE | HENVCFG_ADUE) |
env->menvcfg);
 return RISCV_EXCP_NONE;
 }
@@ -2040,7 +2040,7 @@ static RISCVException write_henvcfg(CPURISCVState *env, 
int csrno,
 }
 
 if (riscv_cpu_mxl(env) == MXL_RV64) {
-mask |= env->menvcfg & (HENVCFG_PBMTE | HENVCFG_STCE

[PULL 0/1] late tcg fix

2023-08-14 Thread Richard Henderson
The following changes since commit bb5f142cb320d45d3d8dee2c82dae003cad39da8:

  Merge tag 'pull-riscv-to-apply-20230811-3' of 
https://github.com/alistair23/qemu into staging (2023-08-11 14:47:23 -0700)

are available in the Git repository at:

  https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230814

for you to fetch changes up to d3b41127c205062ca6c29c89c9542c4112c39ca0:

  tcg/i386: Output %gs prefix in tcg_out_vex_opc (2023-08-12 08:51:12 -0700)


tcg/i386: Output %gs prefix in tcg_out_vex_opc


Richard Henderson (1):
  tcg/i386: Output %gs prefix in tcg_out_vex_opc

 tcg/i386/tcg-target.c.inc | 3 +++
 1 file changed, 3 insertions(+)



[PULL 1/1] tcg/i386: Output %gs prefix in tcg_out_vex_opc

2023-08-14 Thread Richard Henderson
Missing the segment prefix means that user-only fails
to add guest_base for some 128-bit load/store.
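
For context, 0x65 is the x86 segment-override prefix byte for %gs, so with
this fix a VEX-encoded guest access goes through the guest_base segment.
Illustrative AT&T-syntax sketch (not taken from the patch):

    vmovdqu (%rax), %xmm0        # before: guest_base not applied
    vmovdqu %gs:(%rax), %xmm0    # after: the 0x65 prefix selects %gs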

Fixes: 098d0fc10d2 ("tcg/i386: Support 128-bit load/store")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1763
Signed-off-by: Richard Henderson 
---
 tcg/i386/tcg-target.c.inc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 77482da070..a6b2eae995 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -595,6 +595,9 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, 
int v,
 {
 int tmp;
 
+if (opc & P_GS) {
+tcg_out8(s, 0x65);
+}
 /* Use the two byte form if possible, which cannot encode
VEX.W, VEX.B, VEX.X, or an m- field other than P_EXT.  */
 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
-- 
2.34.1




Re: [PATCH v2] virtio: don't zero out memory region cache for indirect descriptors

2023-08-14 Thread Jason Wang
On Fri, Aug 11, 2023 at 10:33 PM Ilya Maximets  wrote:
>
> Lots of virtio functions that are on a hot path in data transmission
> are initializing indirect descriptor cache at the point of stack
> allocation.  It's a 112 byte structure that is getting zeroed out on
> each call, adding unnecessary overhead.  It's going to be correctly
> initialized later via special init function.  The only reason to
> actually initialize right away is the ability to safely destruct it.
> Replace the designated initializer with a function that initializes only
> what is necessary.
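
A minimal sketch of such an init helper, matching the shape the diff below
adds to include/exec/memory.h (hedged; the final form may differ):

    static inline void address_space_cache_init_empty(MemoryRegionCache *cache)
    {
        /*
         * Only the field that address_space_cache_destroy() tests needs
         * initializing; the rest is set up by address_space_cache_init().
         */
        cache->mrs.mr = NULL;
    }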
>
> Removal of the unnecessary stack initializations improves throughput
> of virtio-net devices in terms of 64B packets per second by 6-14 %
> depending on the case.  Tested with a proposed af-xdp network backend
> and a dpdk testpmd application in the guest, but should be beneficial
> for other virtio devices as well.
>
> Signed-off-by: Ilya Maximets 

Acked-by: Jason Wang 

Thanks

> ---
>
> Version 2:
>
>   * Introduced an initialization function, so we don't need to compare
> pointers in the end. [Stefan]
>   * Removed the now unused macro. [Jason]
>
>  hw/virtio/virtio.c| 20 +++-
>  include/exec/memory.h | 16 +---
>  2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 309038fd46..3d768fda5a 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -1071,10 +1071,12 @@ static void virtqueue_split_get_avail_bytes(VirtQueue 
> *vq,
>  VirtIODevice *vdev = vq->vdev;
>  unsigned int idx;
>  unsigned int total_bufs, in_total, out_total;
> -MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
> +MemoryRegionCache indirect_desc_cache;
>  int64_t len = 0;
>  int rc;
>
> +address_space_cache_init_empty(&indirect_desc_cache);
> +
>  idx = vq->last_avail_idx;
>  total_bufs = in_total = out_total = 0;
>
> @@ -1207,12 +1209,14 @@ static void 
> virtqueue_packed_get_avail_bytes(VirtQueue *vq,
>  VirtIODevice *vdev = vq->vdev;
>  unsigned int idx;
>  unsigned int total_bufs, in_total, out_total;
> +MemoryRegionCache indirect_desc_cache;
>  MemoryRegionCache *desc_cache;
> -MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
>  int64_t len = 0;
>  VRingPackedDesc desc;
>  bool wrap_counter;
>
> +address_space_cache_init_empty(&indirect_desc_cache);
> +
>  idx = vq->last_avail_idx;
>  wrap_counter = vq->last_avail_wrap_counter;
>  total_bufs = in_total = out_total = 0;
> @@ -1487,7 +1491,7 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t 
> sz)
>  {
>  unsigned int i, head, max;
>  VRingMemoryRegionCaches *caches;
> -MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
> +MemoryRegionCache indirect_desc_cache;
>  MemoryRegionCache *desc_cache;
>  int64_t len;
>  VirtIODevice *vdev = vq->vdev;
> @@ -1498,6 +1502,8 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t 
> sz)
>  VRingDesc desc;
>  int rc;
>
> +address_space_cache_init_empty(&indirect_desc_cache);
> +
>  RCU_READ_LOCK_GUARD();
>  if (virtio_queue_empty_rcu(vq)) {
>  goto done;
> @@ -1624,7 +1630,7 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t 
> sz)
>  {
>  unsigned int i, max;
>  VRingMemoryRegionCaches *caches;
> -MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
> +MemoryRegionCache indirect_desc_cache;
>  MemoryRegionCache *desc_cache;
>  int64_t len;
>  VirtIODevice *vdev = vq->vdev;
> @@ -1636,6 +1642,8 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t 
> sz)
>  uint16_t id;
>  int rc;
>
> +address_space_cache_init_empty(&indirect_desc_cache);
> +
>  RCU_READ_LOCK_GUARD();
>  if (virtio_queue_packed_empty_rcu(vq)) {
>  goto done;
> @@ -3935,13 +3943,15 @@ VirtioQueueElement 
> *qmp_x_query_virtio_queue_element(const char *path,
>  } else {
>  unsigned int head, i, max;
>  VRingMemoryRegionCaches *caches;
> -MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
> +MemoryRegionCache indirect_desc_cache;
>  MemoryRegionCache *desc_cache;
>  VRingDesc desc;
>  VirtioRingDescList *list = NULL;
>  VirtioRingDescList *node;
>  int rc; int ndescs;
>
> +address_space_cache_init_empty(&indirect_desc_cache);
> +
>  RCU_READ_LOCK_GUARD();
>
>  max = vq->vring.num;
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 68284428f8..b1c4329d11 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -2664,9 +2664,6 @@ struct MemoryRegionCache {
>  bool is_write;
>  };
>
> -#define MEMORY_REGION_CACHE_INVALID ((MemoryRegionCache) { .mrs.mr = NULL })
> -
> -
>  /* address_space_ld*_cached: load from a cached #MemoryRegion
>   * address_space_st*_cached: store 

Re: [PATCH QEMU v2 0/3] provide a smooth upgrade solution for multi-queues disk

2023-08-14 Thread Yong Huang
On Tue, Aug 15, 2023 at 12:28 AM Stefan Hajnoczi 
wrote:

> On Fri, Aug 11, 2023 at 10:31:51AM +0800, Yong Huang wrote:
> > Hi, Stefan, thank you for your interest in this series.
> > I'm trying to explain my point; if you think my explanation
> > doesn't stand up, please let me know.
> >
> > On Fri, Aug 11, 2023 at 2:33 AM Stefan Hajnoczi 
> wrote:
> >
> > > On Thu, Aug 10, 2023 at 07:07:09AM +, ~hyman wrote:
> > > > Ping,
> > > >
> > > > This version is a copy of version 1 and is rebased
> > > > on the master. No functional changes.
> > > >
> > > > A 1:1 virtqueue:vCPU mapping implementation for virtio-*-pci disk
> > > > introduced since qemu >= 5.2.0, which improves IO performance
> > > > remarkably. To enjoy this feature for existing running VMs without
> > > > service interruption, the common solution is to migrate VMs from the
> > > > lower version of the hypervisor to the upgraded hypervisor, then wait
> > > > for the next cold reboot of the VM to enable this feature. That's the
> > > > way "discard" and "write-zeroes" features work.
> > > >
> > > > As for automatic multi-queue disk allocation, it's a little
> > > > different because the destination will allocate queues to match the
> > > > number of vCPUs automatically by default in the case of live
> migration,
> > > > and the VMs on the source side remain 1 queue by default, which
> results
> > > > in migration failure due to loading disk VMState incorrectly on the
> > > > destination side.
> > >
> > > Are you using QEMU's versioned machine types to freeze the VM
> > > configuration?
> >
> >
> > > If not, then live migration won't work reliably because you're
> migrating
> > > between two potentially different VM configurations. This issue is not
> > > specific to num-queues, it affects all device properties.
> > >
> > > In commit 9445e1e15e66c19e42bea942ba810db28052cd05 ("virtio-blk-pci:
> > > default num_queues to -smp N") the num_queues property is set to 1 for
> > > versioned machine types <=5.1:
> > >
> > > diff --git a/hw/core/machine.c b/hw/core/machine.c
> > > index 9ee2aa0f7b..7f65fa8743 100644
> > > --- a/hw/core/machine.c
> > > +++ b/hw/core/machine.c
> > > @@ -31,6 +31,7 @@
> > >  GlobalProperty hw_compat_5_1[] = {
> > >  { "vhost-scsi", "num_queues", "1"},
> > >  { "vhost-user-scsi", "num_queues", "1"},
> > > +{ "virtio-blk-device", "num-queues", "1"},
> > >  { "virtio-scsi-device", "num_queues", "1"},
> > >  };
> > >  const size_t hw_compat_5_1_len = G_N_ELEMENTS(hw_compat_5_1);
> > >
> > > Live migration works when the source and destination QEMU are launched
> > > with the same versioned machine type. You can check the "info qtree"
> > > output to confirm that starting a VM with -smp 4 -M pc-q35-5.1 results
> > > in num-queues=1 while -smp 4 -M pc-q35-5.2 results in num-queues=4.
> > >
> > > > This issue requires Qemu to provide a hint that shows
> > > > multi-queues disk allocation is automatically supported, and this
> allows
> > > > upper APPs, e.g., libvirt, to recognize the hypervisor's capability
> of
> > > > this. And upper APPs can ensure to allocate the same num-queues on
> the
> > > > destination side in case of migration failure.
> > > >
> > > > To fix the issue, we introduce the auto-num-queues property for
> > > > virtio-*-pci as a solution, which would be probed by APPs, e.g.,
> libvirt
> > > > by querying the device properties of QEMU. When launching live
> > > > migration, libvirt will send the auto-num-queues property as a
> migration
> > > > cookie to the destination, and thus the destination knows if the
> source
> > > > side supports auto-num-queues. If not, the destination would switch
> off
> > > > by building the command line with "auto-num-queues=off" when
> preparing
> > > > the incoming VM process. The following patches of libvirt show how it
> > > > roughly works:
> > > >
> > >
> https://github.com/newfriday/libvirt/commit/ce2bae2e1a6821afeb80756dc01f3680f525e506
> > > >
> > >
> https://github.com/newfriday/libvirt/commit/f546972b009458c88148fe079544db7e9e1f43c3
> > > >
> > >
> https://github.com/newfriday/libvirt/commit/5ee19c8646fdb4d87ab8b93f287c20925268ce83
> > > >
> > > > The smooth upgrade solution requires the introduction of the
> auto-num-
> > > > queues property on the QEMU side, which is what the patch set does.
> I'm
> > > > hoping for comments about the series.
> > >
> > > Please take a look at versioned machine types. I think auto-num-queues
> > > is not necessary if you use versioned machine types.
> > >
> > > If you do think auto-num-queues is needed, please explain the issue in
> > > more detail and state why versioned machine types don't help.
> >
> >
> > "Using the versioned machine types" is indeed the standard way to ensure
> > the proper functioning of live migration.
> >
> > However, a stable version is strongly advised to maintain function in our
> > production environment and perhaps practically all the production
> > environments in other businesses. As a

[PATCH v5 6/9] gfxstream + rutabaga: add initial support for gfxstream

2023-08-14 Thread Gurchetan Singh
This adds initial support for gfxstream and cross-domain.  Both
features rely on virtio-gpu blob resources and context types, which
are also implemented in this patch.

gfxstream has a long and illustrious history in Android graphics
paravirtualization.  It has been powering graphics in the Android
Studio Emulator for more than a decade, which is the main developer
platform.

Originally conceived by Jesse Hall, it was first known as "EmuGL" [a].
The key design characteristic was a 1:1 threading model and
auto-generation, which fit nicely with the OpenGLES spec.  It also
allowed easy layering with ANGLE on the host, which provides the GLES
implementations on Windows or MacOS environments.

gfxstream has traditionally been maintained by a single engineer, and
between 2015 to 2021, the goldfish throne passed to Frank Yang.
Historians often remark this glorious reign ("pax gfxstreama" is the
academic term) was comparable to that of Augustus and both Queen
Elizabeths.  Just to name a few accomplishments in a resplendent
panoply: higher versions of GLES, address space graphics, snapshot
support and CTS compliant Vulkan [b].

One major drawback was the use of out-of-tree goldfish drivers.
Android engineers didn't know much about DRM/KMS and especially TTM so
a simple guest to host pipe was conceived.

Luckily, virtio-gpu 3D started to emerge in 2016 due to the work of
the Mesa/virglrenderer communities.  In 2018, the initial virtio-gpu
port of gfxstream was done by Cuttlefish enthusiast Alistair Delva.
It was a symbol compatible replacement of virglrenderer [c] and named
"AVDVirglrenderer".  This implementation forms the basis of the
current gfxstream host implementation still in use today.

cross-domain support follows a similar arc.  Originally conceived by
Wayland aficionado David Reveman and crosvm enjoyer Zach Reizner in
2018, it initially relied on the downstream "virtio-wl" device.

In 2020 and 2021, virtio-gpu was extended to include blob resources
and multiple timelines by yours truly, features gfxstream/cross-domain
both require to function correctly.

Right now, we stand at the precipice of a truly fantastic possibility:
the Android Emulator powered by upstream QEMU and upstream Linux
kernel.  gfxstream will then be packaged properly, and app
developers can even fix gfxstream bugs on their own if they encounter
them.

It's been quite the ride, my friends.  Where will gfxstream head next,
nobody really knows.  I wouldn't be surprised if it's around for
another decade, maintained by a new generation of Android graphics
enthusiasts.

Technical details:
  - Very simple initial display integration: just used Pixman
  - Largely, 1:1 mapping of virtio-gpu hypercalls to rutabaga function
calls

Next steps for Android VMs:
  - The next step would be improving display integration and UI interfaces
with the goal of the QEMU upstream graphics being in an emulator
release [d].

Next steps for Linux VMs for display virtualization:
  - For widespread distribution, someone needs to package Sommelier or the
wayland-proxy-virtwl [e] ideally into Debian main. In addition, newer
versions of the Linux kernel come with DRM_VIRTIO_GPU_KMS option,
which allows disabling KMS hypercalls.  If anyone cares enough, it'll
probably be possible to build a custom VM variant that uses this display
virtualization strategy.

[a] https://android-review.googlesource.com/c/platform/development/+/34470
[b] 
https://android-review.googlesource.com/q/topic:%22vulkan-hostconnection-start%22
[c] 
https://android-review.googlesource.com/c/device/generic/goldfish-opengl/+/761927
[d] https://developer.android.com/studio/releases/emulator
[e] https://github.com/talex5/wayland-proxy-virtwl

Signed-off-by: Gurchetan Singh 
Tested-by: Alyssa Ross 
---
v1: Incorporated various suggestions by Akihiko Odaki and Bernard Berschow
- Removed GET_VIRTIO_GPU_GL / GET_RUTABAGA macros
- Used error_report(..)
- Used g_autofree to fix leaks on error paths
- Removed unnecessary casts
- added virtio-gpu-pci-rutabaga.c + virtio-vga-rutabaga.c files

v2: Incorporated various suggestions by Akihiko Odaki, Marc-André Lureau and
Bernard Berschow:
- Parenthesis in CHECK macro
- CHECK_RESULT(result, ..) --> CHECK(!result, ..)
- delay until g->parent_obj.enable = 1
- Additional cast fixes
- initialize directly in virtio_gpu_rutabaga_realize(..)
- add debug callback to hook into QEMU error's APIs

v3: Incorporated feedback from Akihiko Odaki and Alyssa Ross:
- Autodetect Wayland socket when not explicitly specified
- Fix map_blob error paths
- Add comment why we need both `res` and `resource` in create blob
- Cast and whitespace fixes
- Big endian check comes before virtio_gpu_rutabaga_init().
- VirtIOVGARUTABAGA --> VirtIOVGARutabaga

v4: Incorporated feedback from Akihiko Odaki and Alyssa Ross:
- Double checked all casts
- Remove unnecessary parenthesis
- Remo

[PATCH v5 8/9] gfxstream + rutabaga: enable rutabaga

2023-08-14 Thread Gurchetan Singh
This change enables rutabaga to receive virtio-gpu-3d hypercalls
when it is active.

Signed-off-by: Gurchetan Singh 
Tested-by: Alyssa Ross 
---
v3: Whitespace fix (Akihiko)

 hw/display/virtio-gpu-base.c | 3 ++-
 hw/display/virtio-gpu.c  | 5 +++--
 softmmu/qdev-monitor.c   | 3 +++
 softmmu/vl.c | 1 +
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c
index 4f2b0ba1f3..50c5373b65 100644
--- a/hw/display/virtio-gpu-base.c
+++ b/hw/display/virtio-gpu-base.c
@@ -223,7 +223,8 @@ virtio_gpu_base_get_features(VirtIODevice *vdev, uint64_t 
features,
 {
 VirtIOGPUBase *g = VIRTIO_GPU_BASE(vdev);
 
-if (virtio_gpu_virgl_enabled(g->conf)) {
+if (virtio_gpu_virgl_enabled(g->conf) ||
+virtio_gpu_rutabaga_enabled(g->conf)) {
 features |= (1 << VIRTIO_GPU_F_VIRGL);
 }
 if (virtio_gpu_edid_enabled(g->conf)) {
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 3e658f1fef..08e170e029 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -1361,8 +1361,9 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error 
**errp)
 VirtIOGPU *g = VIRTIO_GPU(qdev);
 
 if (virtio_gpu_blob_enabled(g->parent_obj.conf)) {
-if (!virtio_gpu_have_udmabuf()) {
-error_setg(errp, "cannot enable blob resources without udmabuf");
+if (!virtio_gpu_have_udmabuf() &&
+!virtio_gpu_rutabaga_enabled(g->parent_obj.conf)) {
+error_setg(errp, "need udmabuf or rutabaga for blob resources");
 return;
 }
 
diff --git a/softmmu/qdev-monitor.c b/softmmu/qdev-monitor.c
index 74f4e41338..1b8005ae55 100644
--- a/softmmu/qdev-monitor.c
+++ b/softmmu/qdev-monitor.c
@@ -86,6 +86,9 @@ static const QDevAlias qdev_alias_table[] = {
 { "virtio-gpu-pci", "virtio-gpu", QEMU_ARCH_VIRTIO_PCI },
 { "virtio-gpu-gl-device", "virtio-gpu-gl", QEMU_ARCH_VIRTIO_MMIO },
 { "virtio-gpu-gl-pci", "virtio-gpu-gl", QEMU_ARCH_VIRTIO_PCI },
+{ "virtio-gpu-rutabaga-device", "virtio-gpu-rutabaga",
+  QEMU_ARCH_VIRTIO_MMIO },
+{ "virtio-gpu-rutabaga-pci", "virtio-gpu-rutabaga", QEMU_ARCH_VIRTIO_PCI },
 { "virtio-input-host-device", "virtio-input-host", QEMU_ARCH_VIRTIO_MMIO },
 { "virtio-input-host-ccw", "virtio-input-host", QEMU_ARCH_VIRTIO_CCW },
 { "virtio-input-host-pci", "virtio-input-host", QEMU_ARCH_VIRTIO_PCI },
diff --git a/softmmu/vl.c b/softmmu/vl.c
index b0b96f67fa..2f98eefdf3 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -216,6 +216,7 @@ static struct {
 { .driver = "ati-vga",  .flag = &default_vga   },
 { .driver = "vhost-user-vga",   .flag = &default_vga   },
 { .driver = "virtio-vga-gl",.flag = &default_vga   },
+{ .driver = "virtio-vga-rutabaga",  .flag = &default_vga   },
 };
 
 static QemuOptsList qemu_rtc_opts = {
-- 
2.41.0.694.ge786442a9b-goog




[PATCH v5 7/9] gfxstream + rutabaga: meson support

2023-08-14 Thread Gurchetan Singh
- Add meson detection of rutabaga_gfx
- Build virtio-gpu-rutabaga.c + associated vga/pci files when
  present (see the build sketch below)
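
Assuming rutabaga_gfx_ffi is discoverable via pkg-config, a hedged build
sketch:

    meson setup build -Drutabaga_gfx=enabled
    # or, through the configure wrapper:
    ./configure --enable-rutabaga-gfx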

Signed-off-by: Gurchetan Singh 
Tested-by: Alyssa Ross 
---
v3: Fix alignment issues (Akihiko)

 hw/display/meson.build| 22 ++
 meson.build   |  7 +++
 meson_options.txt |  2 ++
 scripts/meson-buildoptions.sh |  3 +++
 4 files changed, 34 insertions(+)

diff --git a/hw/display/meson.build b/hw/display/meson.build
index 413ba4ab24..e362d625dd 100644
--- a/hw/display/meson.build
+++ b/hw/display/meson.build
@@ -79,6 +79,13 @@ if config_all_devices.has_key('CONFIG_VIRTIO_GPU')
  if_true: [files('virtio-gpu-gl.c', 
'virtio-gpu-virgl.c'), pixman, virgl])
 hw_display_modules += {'virtio-gpu-gl': virtio_gpu_gl_ss}
   endif
+
+  if rutabaga.found()
+virtio_gpu_rutabaga_ss = ss.source_set()
+virtio_gpu_rutabaga_ss.add(when: ['CONFIG_VIRTIO_GPU', rutabaga],
+   if_true: [files('virtio-gpu-rutabaga.c'), 
pixman])
+hw_display_modules += {'virtio-gpu-rutabaga': virtio_gpu_rutabaga_ss}
+  endif
 endif
 
 if config_all_devices.has_key('CONFIG_VIRTIO_PCI')
@@ -95,6 +102,12 @@ if config_all_devices.has_key('CONFIG_VIRTIO_PCI')
  if_true: [files('virtio-gpu-pci-gl.c'), pixman])
 hw_display_modules += {'virtio-gpu-pci-gl': virtio_gpu_pci_gl_ss}
   endif
+  if rutabaga.found()
+virtio_gpu_pci_rutabaga_ss = ss.source_set()
+virtio_gpu_pci_rutabaga_ss.add(when: ['CONFIG_VIRTIO_GPU', 
'CONFIG_VIRTIO_PCI', rutabaga],
+   if_true: 
[files('virtio-gpu-pci-rutabaga.c'), pixman])
+hw_display_modules += {'virtio-gpu-pci-rutabaga': 
virtio_gpu_pci_rutabaga_ss}
+  endif
 endif
 
 if config_all_devices.has_key('CONFIG_VIRTIO_VGA')
@@ -113,6 +126,15 @@ if config_all_devices.has_key('CONFIG_VIRTIO_VGA')
   virtio_vga_gl_ss.add(when: 'CONFIG_ACPI', if_true: files('acpi-vga.c'),
 if_false: files('acpi-vga-stub.c'))
   hw_display_modules += {'virtio-vga-gl': virtio_vga_gl_ss}
+
+  if rutabaga.found()
+virtio_vga_rutabaga_ss = ss.source_set()
+virtio_vga_rutabaga_ss.add(when: ['CONFIG_VIRTIO_VGA', rutabaga],
+   if_true: [files('virtio-vga-rutabaga.c'), 
pixman])
+virtio_vga_rutabaga_ss.add(when: 'CONFIG_ACPI', if_true: 
files('acpi-vga.c'),
+if_false: 
files('acpi-vga-stub.c'))
+hw_display_modules += {'virtio-vga-rutabaga': virtio_vga_rutabaga_ss}
+  endif
 endif
 
 system_ss.add(when: 'CONFIG_OMAP', if_true: files('omap_lcdc.c'))
diff --git a/meson.build b/meson.build
index 98e68ef0b1..293f388e53 100644
--- a/meson.build
+++ b/meson.build
@@ -1069,6 +1069,12 @@ if not get_option('virglrenderer').auto() or have_system 
or have_vhost_user_gpu
dependencies: virgl))
   endif
 endif
+rutabaga = not_found
+if not get_option('rutabaga_gfx').auto() or have_system or have_vhost_user_gpu
+  rutabaga = dependency('rutabaga_gfx_ffi',
+ method: 'pkg-config',
+ required: get_option('rutabaga_gfx'))
+endif
 blkio = not_found
 if not get_option('blkio').auto() or have_block
   blkio = dependency('blkio',
@@ -4272,6 +4278,7 @@ summary_info += {'libtasn1':  tasn1}
 summary_info += {'PAM':   pam}
 summary_info += {'iconv support': iconv}
 summary_info += {'virgl support': virgl}
+summary_info += {'rutabaga support':  rutabaga}
 summary_info += {'blkio support': blkio}
 summary_info += {'curl support':  curl}
 summary_info += {'Multipath support': mpathpersist}
diff --git a/meson_options.txt b/meson_options.txt
index aaea5ddd77..dea3bf7d9c 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -224,6 +224,8 @@ option('vmnet', type : 'feature', value : 'auto',
description: 'vmnet.framework network backend support')
 option('virglrenderer', type : 'feature', value : 'auto',
description: 'virgl rendering support')
+option('rutabaga_gfx', type : 'feature', value : 'auto',
+   description: 'rutabaga_gfx support')
 option('png', type : 'feature', value : 'auto',
description: 'PNG support with libpng')
 option('vnc', type : 'feature', value : 'auto',
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 9da3fe299b..9a95b4f782 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -154,6 +154,7 @@ meson_options_help() {
   printf "%s\n" '  rbd Ceph block device driver'
   printf "%s\n" '  rdmaEnable RDMA-based migration'
   printf "%s\n" '  replication replication support'
+  printf "%s\n" '  rutabaga-gfx    rutabaga_gfx support'
   printf "%s\n" '  sdl SDL user interface'
   printf "%s\n" '  sdl-image   SDL Image support for icons'
   printf "%s\n" '  seccomp seccom

[PATCH v5 0/9] gfxstream + rutabaga_gfx

2023-08-14 Thread Gurchetan Singh
From: Gurchetan Singh 

Prior versions:

v4:
https://lists.gnu.org/archive/html/qemu-devel/2023-08/msg01566.html

v3:
https://lists.gnu.org/archive/html/qemu-devel/2023-08/msg00565.html

v2:
https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg05801.html

v1:
https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg02341.html

RFC:
https://patchew.org/QEMU/20230421011223.718-1-gurchetansi...@chromium.org/

Changes since v4:
- Incorporated review feedback

There are a few things I didn't fully incorporate; here are the reasons
why:

"Perhaps it's better to add another parameter to CHECK() and pass values
other than VIRTIO_GPU_RESP_ERR_UNSPEC where appropriate although I doubt
anyone cares" - Akihiko Odaki

The rutabaga_debug_cb will give a more detailed description of the error
in most cases anyway, so I didn't change the macro for simplicity.

"I think it's better to make a hard error if the user requested
RUTABAGA_CAPSET_CROSS_DOMAIN but the default socket path does not fit in
wayland_socket_path." - Akihiko Odaki

It is possible to use cross-domain without enabling Wayland, for guest
swapchain allocation [a] which often needs special handling.  We might
want to use this capability on MacOS/Windows in the future without
Wayland, hence we shouldn't hard error if the default wayland socket
isn't found.

[a] 
https://android.googlesource.com/platform/external/minigbm/+/refs/heads/main/virtgpu_cross_domain.c

"Originally, Antonio Caggiano implemented the virgl_cmd_resource_xxx_blob
in virtio-gpu-virgl.c. Could you have a way to re-use those kinds of
functions in your implementation?" - Ray Huang

Those patches haven't landed yet.  Once a blob-enabled series does land, I
do think it makes sense to factor out generic helpers wherever possible.

Otherwise, all other suggestions are in v5.

How to build both rutabaga and gfxstream guest/host libs:

https://crosvm.dev/book/appendix/rutabaga_gfx.html

Branch containing this patch series:

https://gitlab.freedesktop.org/gurchetansingh/qemu-gfxstream/-/commits/qemu-gfxstream-v5

Antonio Caggiano (2):
  virtio-gpu: CONTEXT_INIT feature
  virtio-gpu: blob prep

Dr. David Alan Gilbert (1):
  virtio: Add shared memory capability

Gerd Hoffmann (1):
  virtio-gpu: hostmem

Gurchetan Singh (5):
  gfxstream + rutabaga prep: add needed definitions, fields, and options
  gfxstream + rutabaga: add initial support for gfxstream
  gfxstream + rutabaga: meson support
  gfxstream + rutabaga: enable rutabaga
  docs/system: add basic virtio-gpu documentation

 docs/system/device-emulation.rst |1 +
 docs/system/devices/virtio-gpu.rst   |  113 +++
 hw/display/meson.build   |   22 +
 hw/display/virtio-gpu-base.c |6 +-
 hw/display/virtio-gpu-pci-rutabaga.c |   48 ++
 hw/display/virtio-gpu-pci.c  |   14 +
 hw/display/virtio-gpu-rutabaga.c | 1114 ++
 hw/display/virtio-gpu.c  |   16 +-
 hw/display/virtio-vga-rutabaga.c |   51 ++
 hw/display/virtio-vga.c  |   33 +-
 hw/virtio/virtio-pci.c   |   18 +
 include/hw/virtio/virtio-gpu-bswap.h |   18 +
 include/hw/virtio/virtio-gpu.h   |   41 +
 include/hw/virtio/virtio-pci.h   |4 +
 meson.build  |7 +
 meson_options.txt|2 +
 scripts/meson-buildoptions.sh|3 +
 softmmu/qdev-monitor.c   |3 +
 softmmu/vl.c |1 +
 19 files changed, 1496 insertions(+), 19 deletions(-)
 create mode 100644 docs/system/devices/virtio-gpu.rst
 create mode 100644 hw/display/virtio-gpu-pci-rutabaga.c
 create mode 100644 hw/display/virtio-gpu-rutabaga.c
 create mode 100644 hw/display/virtio-vga-rutabaga.c

-- 
2.41.0.694.ge786442a9b-goog




[PATCH v5 3/9] virtio-gpu: hostmem

2023-08-14 Thread Gurchetan Singh
From: Gerd Hoffmann 

Use VIRTIO_GPU_SHM_ID_HOST_VISIBLE as id for virtio-gpu.
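
A usage sketch with an illustrative size (the rutabaga device added later
in this series is the main consumer of this window):

    -device virtio-gpu-pci,hostmem=4G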

Signed-off-by: Antonio Caggiano 
Tested-by: Alyssa Ross 
Acked-by: Michael S. Tsirkin 
---
 hw/display/virtio-gpu-pci.c| 14 ++
 hw/display/virtio-gpu.c|  1 +
 hw/display/virtio-vga.c| 33 -
 include/hw/virtio/virtio-gpu.h |  5 +
 4 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c
index 93f214ff58..da6a99f038 100644
--- a/hw/display/virtio-gpu-pci.c
+++ b/hw/display/virtio-gpu-pci.c
@@ -33,6 +33,20 @@ static void virtio_gpu_pci_base_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 DeviceState *vdev = DEVICE(g);
 int i;
 
+if (virtio_gpu_hostmem_enabled(g->conf)) {
+vpci_dev->msix_bar_idx = 1;
+vpci_dev->modern_mem_bar_idx = 2;
+memory_region_init(&g->hostmem, OBJECT(g), "virtio-gpu-hostmem",
+   g->conf.hostmem);
+pci_register_bar(&vpci_dev->pci_dev, 4,
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_PREFETCH |
+ PCI_BASE_ADDRESS_MEM_TYPE_64,
+ &g->hostmem);
+virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem,
+   VIRTIO_GPU_SHM_ID_HOST_VISIBLE);
+}
+
 virtio_pci_force_virtio_1(vpci_dev);
 if (!qdev_realize(vdev, BUS(&vpci_dev->bus), errp)) {
 return;
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index bbd5c6561a..48ef0d9fad 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -1509,6 +1509,7 @@ static Property virtio_gpu_properties[] = {
  256 * MiB),
 DEFINE_PROP_BIT("blob", VirtIOGPU, parent_obj.conf.flags,
 VIRTIO_GPU_FLAG_BLOB_ENABLED, false),
+DEFINE_PROP_SIZE("hostmem", VirtIOGPU, parent_obj.conf.hostmem, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c
index e6fb0aa876..c8552ff760 100644
--- a/hw/display/virtio-vga.c
+++ b/hw/display/virtio-vga.c
@@ -115,17 +115,32 @@ static void virtio_vga_base_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 pci_register_bar(&vpci_dev->pci_dev, 0,
  PCI_BASE_ADDRESS_MEM_PREFETCH, &vga->vram);
 
-/*
- * Configure virtio bar and regions
- *
- * We use bar #2 for the mmio regions, to be compatible with stdvga.
- * virtio regions are moved to the end of bar #2, to make room for
- * the stdvga mmio registers at the start of bar #2.
- */
-vpci_dev->modern_mem_bar_idx = 2;
-vpci_dev->msix_bar_idx = 4;
 vpci_dev->modern_io_bar_idx = 5;
 
+if (!virtio_gpu_hostmem_enabled(g->conf)) {
+/*
+ * Configure virtio bar and regions
+ *
+ * We use bar #2 for the mmio regions, to be compatible with stdvga.
+ * virtio regions are moved to the end of bar #2, to make room for
+ * the stdvga mmio registers at the start of bar #2.
+ */
+vpci_dev->modern_mem_bar_idx = 2;
+vpci_dev->msix_bar_idx = 4;
+} else {
+vpci_dev->msix_bar_idx = 1;
+vpci_dev->modern_mem_bar_idx = 2;
+memory_region_init(&g->hostmem, OBJECT(g), "virtio-gpu-hostmem",
+   g->conf.hostmem);
+pci_register_bar(&vpci_dev->pci_dev, 4,
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_PREFETCH |
+ PCI_BASE_ADDRESS_MEM_TYPE_64,
+ &g->hostmem);
+virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem,
+   VIRTIO_GPU_SHM_ID_HOST_VISIBLE);
+}
+
 if (!(vpci_dev->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)) {
 /*
  * with page-per-vq=off there is no padding space we can use
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index 8377c365ef..de4f624e94 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -108,12 +108,15 @@ enum virtio_gpu_base_conf_flags {
 (_cfg.flags & (1 << VIRTIO_GPU_FLAG_BLOB_ENABLED))
 #define virtio_gpu_context_init_enabled(_cfg) \
 (_cfg.flags & (1 << VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED))
+#define virtio_gpu_hostmem_enabled(_cfg) \
+(_cfg.hostmem > 0)
 
 struct virtio_gpu_base_conf {
 uint32_t max_outputs;
 uint32_t flags;
 uint32_t xres;
 uint32_t yres;
+uint64_t hostmem;
 };
 
 struct virtio_gpu_ctrl_command {
@@ -137,6 +140,8 @@ struct VirtIOGPUBase {
 int renderer_blocked;
 int enable;
 
+MemoryRegion hostmem;
+
 struct virtio_gpu_scanout scanout[VIRTIO_GPU_MAX_SCANOUTS];
 
 int enabled_output_bitmask;
-- 
2.41.0.694.ge786442a9b-goog




[PATCH v5 9/9] docs/system: add basic virtio-gpu documentation

2023-08-14 Thread Gurchetan Singh
This adds basic documentation for virtio-gpu.

Suggested-by: Akihiko Odaki 
Signed-off-by: Gurchetan Singh 
---
v2: - Incorporated suggestions by Akihiko Odaki
- Listed the currently supported capset_names (Bernard)

v3: - Incorporated suggestions by Akihiko Odaki and Alyssa Ross

v4: - Incorporated suggestions by Akihiko Odaki

v5: - Removed pci suffix from examples
- Verified that -device virtio-gpu-rutabaga works.  Strangely
  enough, I don't remember changing anything, and I remember
  it not working.  I did rebase to top of tree though.
- Fixed meson examples in crosvm docs
 docs/system/device-emulation.rst   |   1 +
 docs/system/devices/virtio-gpu.rst | 113 +
 2 files changed, 114 insertions(+)
 create mode 100644 docs/system/devices/virtio-gpu.rst

diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst
index 4491c4cbf7..1167f3a9f2 100644
--- a/docs/system/device-emulation.rst
+++ b/docs/system/device-emulation.rst
@@ -91,6 +91,7 @@ Emulated Devices
devices/nvme.rst
devices/usb.rst
devices/vhost-user.rst
+   devices/virtio-gpu.rst
devices/virtio-pmem.rst
devices/vhost-user-rng.rst
devices/canokey.rst
diff --git a/docs/system/devices/virtio-gpu.rst 
b/docs/system/devices/virtio-gpu.rst
new file mode 100644
index 00..8c5c708272
--- /dev/null
+++ b/docs/system/devices/virtio-gpu.rst
@@ -0,0 +1,113 @@
+..
+   SPDX-License-Identifier: GPL-2.0
+
+virtio-gpu
+==========
+
+This document explains the setup and usage of the virtio-gpu device.
+The virtio-gpu device paravirtualizes the GPU and display controller.
+
+Linux kernel support
+--------------------
+
+virtio-gpu requires a guest Linux kernel built with the
+``CONFIG_DRM_VIRTIO_GPU`` option.
+
+QEMU virtio-gpu variants
+------------------------
+
+QEMU virtio-gpu device variants come in the following form:
+
+ * ``virtio-vga[-BACKEND]``
+ * ``virtio-gpu[-BACKEND][-INTERFACE]``
+ * ``vhost-user-vga``
+ * ``vhost-user-gpu[-INTERFACE]``
+
+**Backends:** QEMU provides a 2D virtio-gpu backend, and two accelerated
+backends: virglrenderer ('gl' device label) and rutabaga_gfx ('rutabaga'
+device label).  There is a vhost-user backend that runs the graphics stack
+in a separate process for improved isolation.
+
+**Interfaces:** QEMU further categorizes virtio-gpu device variants based
+on the interface exposed to the guest. The interfaces can be classified
+into VGA and non-VGA variants. The VGA ones are prefixed with virtio-vga
+or vhost-user-vga while the non-VGA ones are prefixed with virtio-gpu or
+vhost-user-gpu.
+
+The VGA ones always use the PCI interface, but for the non-VGA ones, the
+user can further pick between MMIO or PCI. For MMIO, the user can suffix
+the device name with -device, though vhost-user-gpu does not support MMIO.
+For PCI, the user can suffix it with -pci. Without these suffixes, the
+platform default will be chosen.
+
+virtio-gpu 2d
+-------------
+
+The default 2D backend only performs 2D operations. The guest needs to
+employ a software renderer for 3D graphics.
+
+Typically, the software renderer is provided by `Mesa`_ or `SwiftShader`_.
+Mesa's implementations (LLVMpipe, Lavapipe and virgl below) work out of the box
+on typical modern Linux distributions.
+
+.. parsed-literal::
+-device virtio-gpu
+
+.. _Mesa: https://www.mesa3d.org/
+.. _SwiftShader: https://github.com/google/swiftshader
+
+virtio-gpu virglrenderer
+------------------------
+
+When using virgl accelerated graphics mode in the guest, OpenGL API calls
+are translated into an intermediate representation (see `Gallium3D`_). The
+intermediate representation is communicated to the host and the
+`virglrenderer`_ library on the host translates the intermediate
+representation back to OpenGL API calls.
+
+.. parsed-literal::
+-device virtio-gpu-gl
+
+.. _Gallium3D: https://www.freedesktop.org/wiki/Software/gallium/
+.. _virglrenderer: https://gitlab.freedesktop.org/virgl/virglrenderer/
+
+virtio-gpu rutabaga
+-------------------
+
+virtio-gpu can also leverage `rutabaga_gfx`_ to provide `gfxstream`_
+rendering and `Wayland display passthrough`_.  With the gfxstream rendering
+mode, GLES and Vulkan calls are forwarded to the host with minimal
+modification.
+
+The crosvm book provides directions on how to build a `gfxstream-enabled
+rutabaga`_ and launch a `guest Wayland proxy`_.
+
+This device does require host blob support (``hostmem`` field below). The
+``hostmem`` field specifies the size of virtio-gpu host memory window.
+This is typically between 256M and 8G.
+
+At least one capset (see the colon-separated ``capset_names`` below) must be
+specified when starting the device.  The currently supported
+``capset_names`` are ``gfxstream-vulkan`` and ``cross-domain`` on Linux
+guests. For Android guests, ``gfxstream-gles`` is also supported.
+
+The device will try to auto-detect the wayland socket path if the
+``cross-domain`` capset name is set.  The user may optionally specif

[PATCH v5 1/9] virtio: Add shared memory capability

2023-08-14 Thread Gurchetan Singh
From: "Dr. David Alan Gilbert" 

Define a new capability type 'VIRTIO_PCI_CAP_SHARED_MEMORY_CFG' to allow
defining shared memory regions with sizes and offsets of 2^32 and more.
Multiple instances of the capability are allowed and distinguished
by a device-specific 'id'.
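
A usage sketch, mirroring how patch 3/9 in this series exposes a
host-visible region on BAR 4:

    /* Expose g->conf.hostmem bytes at offset 0 of BAR 4 as the
     * VIRTIO_GPU_SHM_ID_HOST_VISIBLE shared memory region. */
    virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem,
                           VIRTIO_GPU_SHM_ID_HOST_VISIBLE);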

Signed-off-by: Dr. David Alan Gilbert 
Signed-off-by: Antonio Caggiano 
Reviewed-by: Gurchetan Singh 
Signed-off-by: Gurchetan Singh 
Tested-by: Alyssa Ross 
Acked-by: Huang Rui 
Tested-by: Huang Rui 
Reviewed-by: Akihiko Odaki 
---
 hw/virtio/virtio-pci.c | 18 ++
 include/hw/virtio/virtio-pci.h |  4 
 2 files changed, 22 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index edbc0daa18..da8c9ea12d 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1435,6 +1435,24 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
 return offset;
 }
 
+int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy,
+   uint8_t bar, uint64_t offset, uint64_t length,
+   uint8_t id)
+{
+struct virtio_pci_cap64 cap = {
+.cap.cap_len = sizeof cap,
+.cap.cfg_type = VIRTIO_PCI_CAP_SHARED_MEMORY_CFG,
+};
+
+cap.cap.bar = bar;
+cap.cap.length = cpu_to_le32(length);
+cap.length_hi = cpu_to_le32(length >> 32);
+cap.cap.offset = cpu_to_le32(offset);
+cap.offset_hi = cpu_to_le32(offset >> 32);
+cap.cap.id = id;
+return virtio_pci_add_mem_cap(proxy, &cap.cap);
+}
+
 static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr,
unsigned size)
 {
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index ab2051b64b..5a3f182f99 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -264,4 +264,8 @@ unsigned virtio_pci_optimal_num_queues(unsigned 
fixed_queues);
 void virtio_pci_set_guest_notifier_fd_handler(VirtIODevice *vdev, VirtQueue 
*vq,
   int n, bool assign,
   bool with_irqfd);
+
+int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, uint8_t bar, uint64_t offset,
+   uint64_t length, uint8_t id);
+
 #endif
-- 
2.41.0.694.ge786442a9b-goog




[PATCH v5 5/9] gfxstream + rutabaga prep: add needed definitions, fields, and options

2023-08-14 Thread Gurchetan Singh
This modifies the common virtio-gpu.h file to have the fields and
definitions needed by gfxstream/rutabaga, used by VirtIOGPURutabaga.

An example command line using these options is shown below, after the
'---' separator:

Signed-off-by: Gurchetan Singh 
Tested-by: Alyssa Ross 
---
-device virtio-vga-rutabaga,capset_names=gfxstream-vulkan:cross-domain, \
wayland_socket_path=/run/user/1000/wayland-0,hostmem=8G  \

v1: void *rutabaga --> struct rutabaga *rutabaga (Akihiko)
have a separate rutabaga device instead of using GL device (Bernard)

v2: VirtioGpuRutabaga --> VirtIOGPURutabaga (Akihiko)
move MemoryRegionInfo into VirtIOGPURutabaga (Akihiko)
remove 'ctx' field (Akihiko)
remove 'rutabaga_active'

 include/hw/virtio/virtio-gpu.h | 28 
 1 file changed, 28 insertions(+)

diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index 55973e112f..e2a07e68d9 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -38,6 +38,9 @@ OBJECT_DECLARE_SIMPLE_TYPE(VirtIOGPUGL, VIRTIO_GPU_GL)
 #define TYPE_VHOST_USER_GPU "vhost-user-gpu"
 OBJECT_DECLARE_SIMPLE_TYPE(VhostUserGPU, VHOST_USER_GPU)
 
+#define TYPE_VIRTIO_GPU_RUTABAGA "virtio-gpu-rutabaga-device"
+OBJECT_DECLARE_SIMPLE_TYPE(VirtIOGPURutabaga, VIRTIO_GPU_RUTABAGA)
+
 struct virtio_gpu_simple_resource {
 uint32_t resource_id;
 uint32_t width;
@@ -94,6 +97,7 @@ enum virtio_gpu_base_conf_flags {
 VIRTIO_GPU_FLAG_DMABUF_ENABLED,
 VIRTIO_GPU_FLAG_BLOB_ENABLED,
 VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED,
+VIRTIO_GPU_FLAG_RUTABAGA_ENABLED,
 };
 
 #define virtio_gpu_virgl_enabled(_cfg) \
@@ -108,6 +112,8 @@ enum virtio_gpu_base_conf_flags {
 (_cfg.flags & (1 << VIRTIO_GPU_FLAG_BLOB_ENABLED))
 #define virtio_gpu_context_init_enabled(_cfg) \
 (_cfg.flags & (1 << VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED))
+#define virtio_gpu_rutabaga_enabled(_cfg) \
+(_cfg.flags & (1 << VIRTIO_GPU_FLAG_RUTABAGA_ENABLED))
 #define virtio_gpu_hostmem_enabled(_cfg) \
 (_cfg.hostmem > 0)
 
@@ -232,6 +238,28 @@ struct VhostUserGPU {
 bool backend_blocked;
 };
 
+#define MAX_SLOTS 4096
+
+struct MemoryRegionInfo {
+int used;
+MemoryRegion mr;
+uint32_t resource_id;
+};
+
+struct rutabaga;
+
+struct VirtIOGPURutabaga {
+struct VirtIOGPU parent_obj;
+
+struct MemoryRegionInfo memory_regions[MAX_SLOTS];
+char *capset_names;
+char *wayland_socket_path;
+char *wsi;
+bool headless;
+uint32_t num_capsets;
+struct rutabaga *rutabaga;
+};
+
 #define VIRTIO_GPU_FILL_CMD(out) do {   \
 size_t s;   \
 s = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, 0,  \
-- 
2.41.0.694.ge786442a9b-goog




[PATCH v5 2/9] virtio-gpu: CONTEXT_INIT feature

2023-08-14 Thread Gurchetan Singh
From: Antonio Caggiano 

The feature can be enabled when a backend wants it.

Signed-off-by: Antonio Caggiano 
Reviewed-by: Marc-André Lureau 
Signed-off-by: Gurchetan Singh 
Tested-by: Alyssa Ross 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Akihiko Odaki 
---
 hw/display/virtio-gpu-base.c   | 3 +++
 include/hw/virtio/virtio-gpu.h | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/hw/display/virtio-gpu-base.c b/hw/display/virtio-gpu-base.c
index ca1fb7b16f..4f2b0ba1f3 100644
--- a/hw/display/virtio-gpu-base.c
+++ b/hw/display/virtio-gpu-base.c
@@ -232,6 +232,9 @@ virtio_gpu_base_get_features(VirtIODevice *vdev, uint64_t 
features,
 if (virtio_gpu_blob_enabled(g->conf)) {
 features |= (1 << VIRTIO_GPU_F_RESOURCE_BLOB);
 }
+if (virtio_gpu_context_init_enabled(g->conf)) {
+features |= (1 << VIRTIO_GPU_F_CONTEXT_INIT);
+}
 
 return features;
 }
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index 390c4642b8..8377c365ef 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -93,6 +93,7 @@ enum virtio_gpu_base_conf_flags {
 VIRTIO_GPU_FLAG_EDID_ENABLED,
 VIRTIO_GPU_FLAG_DMABUF_ENABLED,
 VIRTIO_GPU_FLAG_BLOB_ENABLED,
+VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED,
 };
 
 #define virtio_gpu_virgl_enabled(_cfg) \
@@ -105,6 +106,8 @@ enum virtio_gpu_base_conf_flags {
 (_cfg.flags & (1 << VIRTIO_GPU_FLAG_DMABUF_ENABLED))
 #define virtio_gpu_blob_enabled(_cfg) \
 (_cfg.flags & (1 << VIRTIO_GPU_FLAG_BLOB_ENABLED))
+#define virtio_gpu_context_init_enabled(_cfg) \
+(_cfg.flags & (1 << VIRTIO_GPU_FLAG_CONTEXT_INIT_ENABLED))
 
 struct virtio_gpu_base_conf {
 uint32_t max_outputs;
-- 
2.41.0.694.ge786442a9b-goog




[PATCH v5 4/9] virtio-gpu: blob prep

2023-08-14 Thread Gurchetan Singh
From: Antonio Caggiano 

This adds preparatory functions needed to:

 - decode blob commands
 - track iovecs (see the sketch below)
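
A sketch of how a command handler would combine these helpers (hypothetical
caller, not part of this patch; assumes the usual cmd variable that
VIRTIO_GPU_FILL_CMD() expects in scope):

    struct virtio_gpu_resource_create_blob cblob;

    VIRTIO_GPU_FILL_CMD(cblob);            /* copy request from guest iovecs */
    virtio_gpu_create_blob_bswap(&cblob);  /* guest LE -> host byte order */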

Signed-off-by: Antonio Caggiano 
Signed-off-by: Dmitry Osipenko 
Signed-off-by: Gurchetan Singh 
Tested-by: Alyssa Ross 
---
 hw/display/virtio-gpu.c  | 10 +++---
 include/hw/virtio/virtio-gpu-bswap.h | 18 ++
 include/hw/virtio/virtio-gpu.h   |  5 +
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 48ef0d9fad..3e658f1fef 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -33,15 +33,11 @@
 
 #define VIRTIO_GPU_VM_VERSION 1
 
-static struct virtio_gpu_simple_resource*
-virtio_gpu_find_resource(VirtIOGPU *g, uint32_t resource_id);
 static struct virtio_gpu_simple_resource *
 virtio_gpu_find_check_resource(VirtIOGPU *g, uint32_t resource_id,
bool require_backing,
const char *caller, uint32_t *error);
 
-static void virtio_gpu_cleanup_mapping(VirtIOGPU *g,
-   struct virtio_gpu_simple_resource *res);
 static void virtio_gpu_reset_bh(void *opaque);
 
 void virtio_gpu_update_cursor_data(VirtIOGPU *g,
@@ -116,7 +112,7 @@ static void update_cursor(VirtIOGPU *g, struct 
virtio_gpu_update_cursor *cursor)
   cursor->resource_id ? 1 : 0);
 }
 
-static struct virtio_gpu_simple_resource *
+struct virtio_gpu_simple_resource *
 virtio_gpu_find_resource(VirtIOGPU *g, uint32_t resource_id)
 {
 struct virtio_gpu_simple_resource *res;
@@ -904,8 +900,8 @@ void virtio_gpu_cleanup_mapping_iov(VirtIOGPU *g,
 g_free(iov);
 }
 
-static void virtio_gpu_cleanup_mapping(VirtIOGPU *g,
-   struct virtio_gpu_simple_resource *res)
+void virtio_gpu_cleanup_mapping(VirtIOGPU *g,
+struct virtio_gpu_simple_resource *res)
 {
 virtio_gpu_cleanup_mapping_iov(g, res->iov, res->iov_cnt);
 res->iov = NULL;
diff --git a/include/hw/virtio/virtio-gpu-bswap.h 
b/include/hw/virtio/virtio-gpu-bswap.h
index 9124108485..dd1975e2d4 100644
--- a/include/hw/virtio/virtio-gpu-bswap.h
+++ b/include/hw/virtio/virtio-gpu-bswap.h
@@ -63,10 +63,28 @@ virtio_gpu_create_blob_bswap(struct 
virtio_gpu_resource_create_blob *cblob)
 {
 virtio_gpu_ctrl_hdr_bswap(&cblob->hdr);
 le32_to_cpus(&cblob->resource_id);
+le32_to_cpus(&cblob->blob_mem);
 le32_to_cpus(&cblob->blob_flags);
+le32_to_cpus(&cblob->nr_entries);
+le64_to_cpus(&cblob->blob_id);
 le64_to_cpus(&cblob->size);
 }
 
+static inline void
+virtio_gpu_map_blob_bswap(struct virtio_gpu_resource_map_blob *mblob)
+{
+virtio_gpu_ctrl_hdr_bswap(&mblob->hdr);
+le32_to_cpus(&mblob->resource_id);
+le64_to_cpus(&mblob->offset);
+}
+
+static inline void
+virtio_gpu_unmap_blob_bswap(struct virtio_gpu_resource_unmap_blob *ublob)
+{
+virtio_gpu_ctrl_hdr_bswap(&ublob->hdr);
+le32_to_cpus(&ublob->resource_id);
+}
+
 static inline void
 virtio_gpu_scanout_blob_bswap(struct virtio_gpu_set_scanout_blob *ssb)
 {
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index de4f624e94..55973e112f 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -257,6 +257,9 @@ void virtio_gpu_base_fill_display_info(VirtIOGPUBase *g,
 void virtio_gpu_base_generate_edid(VirtIOGPUBase *g, int scanout,
struct virtio_gpu_resp_edid *edid);
 /* virtio-gpu.c */
+struct virtio_gpu_simple_resource *
+virtio_gpu_find_resource(VirtIOGPU *g, uint32_t resource_id);
+
 void virtio_gpu_ctrl_response(VirtIOGPU *g,
   struct virtio_gpu_ctrl_command *cmd,
   struct virtio_gpu_ctrl_hdr *resp,
@@ -275,6 +278,8 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g,
   uint32_t *niov);
 void virtio_gpu_cleanup_mapping_iov(VirtIOGPU *g,
 struct iovec *iov, uint32_t count);
+void virtio_gpu_cleanup_mapping(VirtIOGPU *g,
+struct virtio_gpu_simple_resource *res);
 void virtio_gpu_process_cmdq(VirtIOGPU *g);
 void virtio_gpu_device_realize(DeviceState *qdev, Error **errp);
 void virtio_gpu_reset(VirtIODevice *vdev);
-- 
2.41.0.694.ge786442a9b-goog
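
The helpers slot into the usual command-decode pattern (sketch of an
assumed caller from a later patch, inside a command handler where 'cmd'
is in scope; VIRTIO_GPU_FILL_CMD is the macro from virtio-gpu.h):

    struct virtio_gpu_resource_map_blob mblob;

    VIRTIO_GPU_FILL_CMD(mblob);          /* copy from cmd->elem.out_sg */
    virtio_gpu_map_blob_bswap(&mblob);   /* le32/le64 fields -> host order */
    /* ... look up resource by mblob.resource_id, map at mblob.offset ... */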




Re: [PATCH for-8.2] hw/s390x/s390-virtio-ccw: Remove superfluous code to set the NIC model

2023-08-14 Thread Halil Pasic
On Fri,  4 Aug 2023 09:35:25 +0200
Thomas Huth  wrote:

> The check for nd->model being NULL was originally required, but in
> commit e11f463295d95aba ("s390x/virtio: use qemu_check_nic_model()")
> the corresponding code had been replaced by a call to the function
> qemu_check_nic_model() - and this in turn calls qemu_find_nic_model()
> which contains the same check for nd->model being NULL again. So we
> can remove this from the calling site now.
> 
> Signed-off-by: Thomas Huth 

Reviewed-by: Halil Pasic 



Re: [PATCH for-8.2 v2 1/2] qapi/migration: Deduplicate migration parameter field comments

2023-08-14 Thread Peter Xu
On Tue, Aug 08, 2023 at 04:03:46PM -0400, Peter Xu wrote:
> On Sun, Aug 06, 2023 at 11:49:46AM -0400, Peter Xu wrote:
> > > I think we have a tradeoff here.  If perpetuating the unclean and ugly
> > > use of "" is what it takes to de-triplicate migration parameters, we may
> > > decide to accept that.
> > 
> > I don't think it's a must.  As Dan raised, we can convert str -> StrOrNull
> > for MigrationParameters. I assume it won't affect query-migrate-parameters
> > anyway OTOH.
> > 
> > I assume it means there's nothing yet obvious that we overlooked on the
> > whole idea.  Let me propose the formal patchset early next week.  It'll be
> > mostly the patch I attached but just add those extra logics for StrOrNull,
> > so the diffstat might be less attractive but hopefully still good enough to
> > be accepted.
> 
> The new StrOrNull approach doesn't work with the current migration object
> properties, as StrOrNull must be a pointer in @MigrationParameters rather
> than statically allocated, and it stops working with offsetof():
> 
> ../migration/options.c:218:5: error: cannot apply ‘offsetof’ to a non 
> constant address
>   218 | DEFINE_PROP_STRING("tls-creds", MigrationState, 
> parameters.tls_creds->u.s),
>   | ^~
> ../migration/options.c:219:5: error: cannot apply ‘offsetof’ to a non 
> constant address
>   219 | DEFINE_PROP_STRING("tls-hostname", MigrationState, 
> parameters.tls_hostname->u.s),
>   | ^~
> ../migration/options.c:220:5: error: cannot apply ‘offsetof’ to a non 
> constant address
>   220 | DEFINE_PROP_STRING("tls-authz", MigrationState, 
> parameters.tls_authz->u.s),
>   | ^~
> 
> Any easy way to fix this?  I.e., is there a way to declare StrOrNull (in
> MigrationParameters of qapi/migration.json) to be statically allocated
> rather than a pointer (just like default behavior of any uint* types)?

Posted a version with 'str' replacing 'StrOrNull'.  Let's move the
discussion there:

https://lore.kernel.org/r/20230814221947.353093-1-pet...@redhat.com

-- 
Peter Xu




[PATCH for-8.2 3/4] migration/qapi: Replace @MigrateSetParameters with @MigrationParameters

2023-08-14 Thread Peter Xu
These two structs are mostly identical apart from a few fields (quoting
Daniel P. Berrangé's reply):

  1c1
  < { 'struct': 'MigrationParameters',
  ---
  > { 'struct': 'MigrateSetParameters',
  14,16c14,16
  < '*tls-creds': 'str',
  < '*tls-hostname': 'str',
  < '*tls-authz': 'str',
  ---
  > '*tls-creds': 'StrOrNull',
  > '*tls-hostname': 'StrOrNull',
  > '*tls-authz': 'StrOrNull',

Here the difference is that the @MigrateSetParameters object allows 'null'
values for any of the tls-* fields passed in.

Markus described why it ended up being StrOrNull, and also his concern
that a pure "str" type is problematic as the reset indicator, in commit
01fa559826 ("migration: Use JSON null instead of "" to reset parameter to
default").  There, "null" was introduced for the tls fields even though it
is treated as "" (empty string) internally, to match the code.

To deduplicate the two objects, it is logically safe only to replace the
"str" type with "StrOrNull", not vice versa.  However, we face difficulty
using StrOrNull as part of MigrationState.parameters [1] when converting
the existing @MigrationParameters to StrOrNull.  Given that nobody seems
to be using "null" for the tls-* fields (see the long-standing qemu crash
bug on tls-authz when "null" was passed in), let's use "str" to represent
both objects.

This greatly deduplicates the code not only in qapi/migration.json, but
also in the generic migration code on handling transitions between
StrOrNull <-> str types.

[1] https://lore.kernel.org/all/ZNKfoqM0V6pcvrz%2F@x1n/

Signed-off-by: Peter Xu 
---
 qapi/migration.json| 185 +
 migration/migration-hmp-cmds.c |  16 +--
 migration/options.c| 145 ++
 3 files changed, 12 insertions(+), 334 deletions(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 8843e74b59..0416da65b5 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -851,189 +851,6 @@
{ 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] },
'vcpu-dirty-limit'] }
 
-##
-# @MigrateSetParameters:
-#
-# @announce-initial: Initial delay (in milliseconds) before sending
-# the first announce (Since 4.0)
-#
-# @announce-max: Maximum delay (in milliseconds) between packets in
-# the announcement (Since 4.0)
-#
-# @announce-rounds: Number of self-announce packets sent after
-# migration (Since 4.0)
-#
-# @announce-step: Increase in delay (in milliseconds) between
-# subsequent packets in the announcement (Since 4.0)
-#
-# @compress-level: compression level
-#
-# @compress-threads: compression thread count
-#
-# @compress-wait-thread: Controls behavior when all compression
-# threads are currently busy.  If true (default), wait for a free
-# compression thread to become available; otherwise, send the page
-# uncompressed.  (Since 3.1)
-#
-# @decompress-threads: decompression thread count
-#
-# @throttle-trigger-threshold: The ratio of bytes_dirty_period and
-# bytes_xfer_period to trigger throttling.  It is expressed as
-# percentage.  The default value is 50. (Since 5.0)
-#
-# @cpu-throttle-initial: Initial percentage of time guest cpus are
-# throttled when migration auto-converge is activated.  The
-# default value is 20. (Since 2.7)
-#
-# @cpu-throttle-increment: throttle percentage increase each time
-# auto-converge detects that migration is not making progress.
-# The default value is 10. (Since 2.7)
-#
-# @cpu-throttle-tailslow: Make CPU throttling slower at tail stage At
-# the tail stage of throttling, the Guest is very sensitive to CPU
-# percentage while the @cpu-throttle -increment is excessive
-# usually at tail stage.  If this parameter is true, we will
-# compute the ideal CPU percentage used by the Guest, which may
-# exactly make the dirty rate match the dirty rate threshold.
-# Then we will choose a smaller throttle increment between the one
-# specified by @cpu-throttle-increment and the one generated by
-# ideal CPU percentage.  Therefore, it is compatible to
-# traditional throttling, meanwhile the throttle increment won't
-# be excessive at tail stage.  The default value is false.  (Since
-# 5.1)
-#
-# @tls-creds: ID of the 'tls-creds' object that provides credentials
-# for establishing a TLS connection over the migration data
-# channel.  On the outgoing side of the migration, the credentials
-# must be for a 'client' endpoint, while for the incoming side the
-# credentials must be for a 'server' endpoint.  Setting this to a
-# non-empty string enables TLS for all migrations.  An empty
-# string means that QEMU will use plain text mode for migration,
-# rather than TLS (Since 2.9) Previously (since 2.7), this was
-# reported by omitting tls-creds instead.
-#
-# @tls-hostname: hostname o

[PATCH for-8.2 2/4] tests/migration-test: Add a test for null parameter setups

2023-08-14 Thread Peter Xu
Add a test for StrOrNull parameters (tls-*).

Signed-off-by: Peter Xu 
---
 tests/qtest/migration-test.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 62d3f37021..12e72580a6 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1471,6 +1471,29 @@ static void test_postcopy_preempt_all(void)
 
 #endif
 
+/*
+ * We have a few parameters that allow null as input; test them to make
+ * sure they won't crash (where some used to).
+ */
+static void test_null_parameters(void)
+{
+const char *null_params[] = {
+"tls-authz", "tls-hostname", "tls-creds"};
+QTestState *vm = qtest_init("");
+QDict *response;
+int i;
+
+for (i = 0; i < sizeof(null_params) / sizeof(const char *); i++) {
+response = qtest_qmp(vm, "{ 'execute': 'migrate-set-parameters',"
+ "'arguments': { %s: null } }",
+ null_params[i]);
+/* Succeed or fail; as long as not crashing */
+qobject_unref(response);
+}
+
+qtest_quit(vm);
+}
+
 static void test_baddest(void)
 {
 MigrateStart args = {
@@ -2827,6 +2850,7 @@ int main(int argc, char **argv)
 }
 }
 
+qtest_add_func("/migration/null_parameters", test_null_parameters);
 qtest_add_func("/migration/bad_dest", test_baddest);
 qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain);
 qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle);
-- 
2.41.0




[PATCH for-8.2 4/4] migration/qapi: Drop @MigrationParameter enum

2023-08-14 Thread Peter Xu
Drop the enum in qapi because it is never used in QMP APIs.  Instead, make
it an internal definition for QEMU so that we can decouple it from QAPI.
One important gain is that we can deduplicate the documentation of the
various migration parameters.

Signed-off-by: Peter Xu 
---
 qapi/migration.json| 179 -
 migration/options.h|  47 +
 migration/migration-hmp-cmds.c |   3 +-
 migration/options.c|  51 ++
 4 files changed, 100 insertions(+), 180 deletions(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 0416da65b5..4846b2a98e 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -672,185 +672,6 @@
   'bitmaps': [ 'BitmapMigrationBitmapAlias' ]
   } }
 
-##
-# @MigrationParameter:
-#
-# Migration parameters enumeration
-#
-# @announce-initial: Initial delay (in milliseconds) before sending
-# the first announce (Since 4.0)
-#
-# @announce-max: Maximum delay (in milliseconds) between packets in
-# the announcement (Since 4.0)
-#
-# @announce-rounds: Number of self-announce packets sent after
-# migration (Since 4.0)
-#
-# @announce-step: Increase in delay (in milliseconds) between
-# subsequent packets in the announcement (Since 4.0)
-#
-# @compress-level: Set the compression level to be used in live
-# migration, the compression level is an integer between 0 and 9,
-# where 0 means no compression, 1 means the best compression
-# speed, and 9 means best compression ratio which will consume
-# more CPU.
-#
-# @compress-threads: Set compression thread count to be used in live
-# migration, the compression thread count is an integer between 1
-# and 255.
-#
-# @compress-wait-thread: Controls behavior when all compression
-# threads are currently busy.  If true (default), wait for a free
-# compression thread to become available; otherwise, send the page
-# uncompressed.  (Since 3.1)
-#
-# @decompress-threads: Set decompression thread count to be used in
-# live migration, the decompression thread count is an integer
-# between 1 and 255. Usually, decompression is at least 4 times as
-# fast as compression, so set the decompress-threads to the number
-# about 1/4 of compress-threads is adequate.
-#
-# @throttle-trigger-threshold: The ratio of bytes_dirty_period and
-# bytes_xfer_period to trigger throttling.  It is expressed as
-# percentage.  The default value is 50. (Since 5.0)
-#
-# @cpu-throttle-initial: Initial percentage of time guest cpus are
-# throttled when migration auto-converge is activated.  The
-# default value is 20. (Since 2.7)
-#
-# @cpu-throttle-increment: throttle percentage increase each time
-# auto-converge detects that migration is not making progress.
-# The default value is 10. (Since 2.7)
-#
-# @cpu-throttle-tailslow: Make CPU throttling slower at tail stage At
-# the tail stage of throttling, the Guest is very sensitive to CPU
-# percentage while the @cpu-throttle -increment is excessive
-# usually at tail stage.  If this parameter is true, we will
-# compute the ideal CPU percentage used by the Guest, which may
-# exactly make the dirty rate match the dirty rate threshold.
-# Then we will choose a smaller throttle increment between the one
-# specified by @cpu-throttle-increment and the one generated by
-# ideal CPU percentage.  Therefore, it is compatible to
-# traditional throttling, meanwhile the throttle increment won't
-# be excessive at tail stage.  The default value is false.  (Since
-# 5.1)
-#
-# @tls-creds: ID of the 'tls-creds' object that provides credentials
-# for establishing a TLS connection over the migration data
-# channel.  On the outgoing side of the migration, the credentials
-# must be for a 'client' endpoint, while for the incoming side the
-# credentials must be for a 'server' endpoint.  Setting this will
-# enable TLS for all migrations.  The default is unset, resulting
-# in unsecured migration at the QEMU level.  (Since 2.7)
-#
-# @tls-hostname: hostname of the target host for the migration.  This
-# is required when using x509 based TLS credentials and the
-# migration URI does not already include a hostname.  For example
-# if using fd: or exec: based migration, the hostname must be
-# provided so that the server's x509 certificate identity can be
-# validated.  (Since 2.7)
-#
-# @tls-authz: ID of the 'authz' object subclass that provides access
-# control checking of the TLS x509 certificate distinguished name.
-# This object is only resolved at time of use, so can be deleted
-# and recreated on the fly while the migration server is active.
-# If missing, it will default to denying access (Since 4.0)
-#
-# @max-bandwidth: to set maximum speed for migration.  maximum speed
-# in bytes per second.  (Since 2.8)
-#
-# @downtime-limit: set maximum tol

[PATCH for-8.2 1/4] migration/qmp: Fix crash on setting tls-authz with null

2023-08-14 Thread Peter Xu
QEMU will crash if anyone tries to set tls-authz (which is of type
StrOrNull) with a 'null' value.  Fix it the easy way by converting it to a
qstring, just like the other two tls parameters.

Cc: qemu-sta...@nongnu.org # v4.0+
Fixes: d2f1d29b95 ("migration: add support for a "tls-authz" migration 
parameter")
Signed-off-by: Peter Xu 
---
 migration/options.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/migration/options.c b/migration/options.c
index 1d1e1321b0..6bbfd4853d 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -1408,20 +1408,25 @@ void qmp_migrate_set_parameters(MigrateSetParameters 
*params, Error **errp)
 {
 MigrationParameters tmp;
 
-/* TODO Rewrite "" to null instead */
+/* TODO Rewrite "" to null instead for all three tls_* parameters */
 if (params->tls_creds
 && params->tls_creds->type == QTYPE_QNULL) {
 qobject_unref(params->tls_creds->u.n);
 params->tls_creds->type = QTYPE_QSTRING;
 params->tls_creds->u.s = strdup("");
 }
-/* TODO Rewrite "" to null instead */
 if (params->tls_hostname
 && params->tls_hostname->type == QTYPE_QNULL) {
 qobject_unref(params->tls_hostname->u.n);
 params->tls_hostname->type = QTYPE_QSTRING;
 params->tls_hostname->u.s = strdup("");
 }
+if (params->tls_authz
+&& params->tls_authz->type == QTYPE_QNULL) {
+qobject_unref(params->tls_authz->u.n);
+params->tls_authz->type = QTYPE_QSTRING;
+params->tls_authz->u.s = strdup("");
+}
 
 migrate_params_test_apply(params, &tmp);
 
-- 
2.41.0
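
For reference, the crash can be triggered from a qtest with a single QMP
command (minimal sketch, mirroring the test added in patch 2):

    QTestState *vm = qtest_init("");
    QDict *rsp = qtest_qmp(vm, "{ 'execute': 'migrate-set-parameters',"
                               " 'arguments': { 'tls-authz': null } }");
    qobject_unref(rsp);
    qtest_quit(vm);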




[PATCH for-8.2 0/4] qapi/migration: Dedup migration parameter objects and fix tls-authz crash

2023-08-14 Thread Peter Xu
Patch 1 fixes the tls-authz crashing when someone specifies "null"
parameter for tls-authz.

Patch 2 adds a test case that passes "null" for all three tls-*
parameters, to make sure nothing will ever crash with 'null' passed in.

Patches 3-4 are the proposed patches to deduplicate the three migration
parameter objects in qapi/migration.json.  Note that in this version (patch
3) we use 'str' to replace 'StrOrNull' for the tls-* parameters to make
them deduplicable.

Please review, thanks.

Peter Xu (4):
  migration/qmp: Fix crash on setting tls-authz with null
  tests/migration-test: Add a test for null parameter setups
  migration/qapi: Replace @MigrateSetParameters with
@MigrationParameters
  migration/qapi: Drop @MigrationParameter enum

 qapi/migration.json| 364 +
 migration/options.h|  47 +
 migration/migration-hmp-cmds.c |  19 +-
 migration/options.c| 191 ++---
 tests/qtest/migration-test.c   |  24 +++
 5 files changed, 136 insertions(+), 509 deletions(-)

-- 
2.41.0




Re: [PATCH v2] migration: refactor migration_completion

2023-08-14 Thread Isaku Yamahata
On Fri, Aug 04, 2023 at 05:30:53PM +0800,
Wei Wang  wrote:

> Current migration_completion function is a bit long. Refactor the long
> implementation into different subfunctions:
> - migration_completion_precopy: completion code related to precopy
> - migration_completion_postcopy: completion code related to postcopy
> - close_return_path_on_source: rp thread related cleanup on migration
> completion. It is named to match with open_return_path_on_source.
> 
> This improves readability and is easier for future updates (e.g. add new
> subfunctions when completion code related to new features are needed). No
> functional changes intended.
> 
> Signed-off-by: Wei Wang 
> ---
> Changelog:
> - Merge await_return_path_close_on_source into
>   close_return_path_on_source as the later basically just calls the
>   previous;
> - make migration_completion_postcopy "void" as it doesn't return a
>   value.

Reviewed-by: Isaku Yamahata 
-- 
Isaku Yamahata 



Re: [RFC PATCH 00/19] QEMU gmem implemention

2023-08-14 Thread Isaku Yamahata
On Thu, Aug 10, 2023 at 10:58:09AM -0500,
Michael Roth via  wrote:

> On Tue, Aug 01, 2023 at 09:45:41AM +0800, Xiaoyao Li wrote:
> > On 8/1/2023 12:51 AM, Daniel P. Berrangé wrote:
> > > On Mon, Jul 31, 2023 at 12:21:42PM -0400, Xiaoyao Li wrote:
> > > > This is the first RFC version of enabling KVM gmem[1] as the backend for
> > > > private memory of KVM_X86_PROTECTED_VM.
> > > > 
> > > > It adds the support to create a specific KVM_X86_PROTECTED_VM type VM,
> > > > and introduces 'private' property for memory backend. When the vm type
> > > > is KVM_X86_PROTECTED_VM and memory backend has private enabled as below,
> > > > it will call KVM gmem ioctl to allocate private memory for the backend.
> > > > 
> > > >  $qemu -object memory-backend-ram,id=mem0,size=1G,private=on \
> > > >-machine q35,kvm-type=sw-protected-vm,memory-backend=mem0 \
> > > >   ...
> > > > 
> > > > Unfortunately this patch series fails the boot of OVMF at very early
> > > > stage due to triple fault because KVM doesn't support emulate string IO
> > > > to private memory. We leave it as an open to be discussed.
> > > > 
> > > > There are following design opens that need to be discussed:
> > > > 
> > > > 1. how to determine the vm type?
> > > > 
> > > > a. like this series, specify the vm type via machine property
> > > >'kvm-type'
> > > > b. check the memory backend, if any backend has 'private' property
> > > >set, the vm-type is set to KVM_X86_PROTECTED_VM.
> > > > 
> > > > 2. whether 'private' property is needed if we choose 1.b as design
> > > > 
> > > > with 1.b, QEMU can decide whether the memory region needs to be
> > > > private (allocates gmem fd for it) or not, on its own.
> > > > 
> > > > 3. What is KVM_X86_SW_PROTECTED_VM going to look like? What's the
> > > > purose of it and what's the requirement on it. I think it's the
> > > > questions for KVM folks than QEMU folks.
> > > > 
> > > > Any other idea/open/question is welcomed.
> > > > 
> > > > 
> > > > Beside, TDX QEMU implemetation is based on this series to provide
> > > > private gmem for TD private memory, which can be found at [2].
> > > > And it can work corresponding KVM [3] to boot TDX guest.
> > > 
> > > We already have a general purpose configuration mechanism for
> > > confidential guests.  The -machine argument has a property
> > > confidential-guest-support=$OBJECT-ID, for pointing to an
> > > object that implements the TYPE_CONFIDENTIAL_GUEST_SUPPORT
> > > interface in QEMU. This is implemented with SEV, PPC PEF
> > > mode, and s390 protvirt.
> > > 
> > > I would expect TDX to follow this same design ie
> > > 
> > >  qemu-system-x86_64 \
> > >-object tdx-guest,id=tdx0,. \
> > >-machine q35,confidential-guest-support=tdx0 \
> > >...
> > > 
> > > and not require inventing the new 'kvm-type' attribute at least.
> > 
> > yes.
> > 
> > TDX is initialized exactly as the above.
> > 
> > This RFC series introduces the 'kvm-type' for KVM_X86_SW_PROTECTED_VM. It's
> > my fault that forgot to list the option of introducing sw_protected_vm
> > object with CONFIDENTIAL_GUEST_SUPPORT interface.
> > Thanks for Isaku to raise it 
> > https://lore.kernel.org/qemu-devel/20230731171041.gb1807...@ls.amr.corp.intel.com/
> > 
> > we can specify KVM_X86_SW_PROTECTED_VM this way:
> > 
> > qemu  \
> >   -object sw-protected,id=swp0,... \
> >   -machine confidential-guest-support=swp0 \
> >   ...
> > 
> > > For the memory backend though, I'm not so sure - possibly that
> > > might be something that still wants an extra property to identify
> > > the type of memory to allocate, since we use memory-backend-ram
> > > for a variety of use cases.  Or it could be an entirely new object
> > > type such as "memory-backend-gmem"
> > 
> > What I want to discuss is whether providing the interface to users to allow
> > them configuring which memory is/can be private. For example, QEMU can do it
> > internally. If users wants a confidential guest, QEMU allocates private gmem
> > for normal RAM automatically.
> 
> I think handling it automatically simplifies things a good deal on the
> QEMU side. I think it's still worthwhile to still allow:
> 
>  -object memory-backend-memfd-private,...
> 
> because it provides a nice mechanism to set up a pair of shared/private
> memfd's to enable hole-punching via fallocate() to avoid doubling memory
> allocations for shared/private. It's also a nice place to control
> potentially-configurable things like:
> 
>  - whether or not to enable discard/hole-punching
>  - if discard is enabled, whether or not to register the range via
>RamDiscardManager interface so that VFIO/IOMMU mappings get updated
>when doing PCI passthrough. SNP relies on this for PCI passthrough
>when discard is enabled, otherwise DMA occurs to stale mappings of
>discarded bounce-buffer pages:
> 
>  
> https://github.com/AMDESE/qemu/blob/snp-latest/backends/hostmem-memfd-private.c

Re: [PATCH V1 2/3] migration: fix suspended runstate

2023-08-14 Thread Peter Xu
On Mon, Aug 14, 2023 at 02:53:56PM -0400, Steven Sistare wrote:
> > Can we just call vm_state_notify() earlier?
> 
> We cannot.  The guest is not running yet, and will not be until later.
> We cannot call notifiers that perform actions that complete, or react to, 
> the guest entering a running state.

I tried to look at a few examples of the notifiees, and most of those I
read react to "vm running" rather than "vcpu running" (in which case I
think the "suspended" mode falls into the "vm running" case); most of them
do not care about the RunState parameter passed in, only the bool
"running".

In reality, when running=true, it must be RUNNING so far.

In that case, does it mean we should notify right after the switchover,
since after migration the vm is indeed running, even if the vcpus are
still suspended?

One example (of a possible issue) is vfio_vmstate_change(), where IIUC if
we try to suspend a VM the device should stay in VFIO_DEVICE_STATE_RUNNING;
this kind of proves to me that SUSPEND is actually one of the running=true
states.

If we postpone all notifiers here until the next suspend wakeup, even
after we have switched over to the dest qemu, I think it means these
devices will not be in VFIO_DEVICE_STATE_RUNNING after switchover but
perhaps VFIO_DEVICE_STATE_STOP.

Ideally I think we should call vm_state_notify() here with running=true
and state=SUSPEND, but since I do see some hooks that are not well prepared
for SUSPEND with running=true, I'd say we should, on the safe side, call
vm_state_notify(running=true, state=RUNNING) even for SUSPEND at the
switchover phase.  With that, IIUC, it'll naturally work (e.g. when we
wake up again later we just need to call no notifiers).
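
To illustrate the point, most notifiees have this shape (hand-written
sketch; real hooks are registered via qemu_add_vm_change_state_handler()):

    static void example_vm_state_change(void *opaque, bool running,
                                        RunState state)
    {
        if (running) {
            /* today this only ever sees state == RUN_STATE_RUNNING */
        } else {
            /* quiesce; state tells why (paused, suspended, ...) */
        }
    }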

-- 
Peter Xu




[PATCH] qxl: don't assert() if device isn't yet initialized

2023-08-14 Thread marcandre . lureau
From: Marc-André Lureau 

If the PCI BAR isn't yet mapped or was unmapped, QXL_IO_SET_MODE will
assert(). Instead, report a guest bug and keep going.

This can be reproduced with:

cat << EOF | ./qemu-system-x86_64 -vga qxl -m 2048 -nodefaults -qtest stdio
outl 0xcf8 0x8000101c
outl 0xcfc 0xc000
outl 0xcf8 0x80001001
outl 0xcfc 0x0100
outl 0xc006 0x00
EOF

Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1829

Signed-off-by: Marc-André Lureau 
---
 hw/display/qxl.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/display/qxl.c b/hw/display/qxl.c
index f1c0eb7dfc..70b73820b2 100644
--- a/hw/display/qxl.c
+++ b/hw/display/qxl.c
@@ -1591,7 +1591,10 @@ static void qxl_set_mode(PCIQXLDevice *d, unsigned int 
modenr, int loadvm)
 }
 
 d->guest_slots[0].slot = slot;
-assert(qxl_add_memslot(d, 0, devmem, QXL_SYNC) == 0);
+if (qxl_add_memslot(d, 0, devmem, QXL_SYNC) != 0) {
+qxl_set_guest_bug(d, "device isn't initialized yet");
+return;
+}
 
 d->guest_primary.surface = surface;
 qxl_create_guest_primary(d, 0, QXL_SYNC);
-- 
2.41.0




[PATCH for-8.2] target/s390x/kvm: Simplify the GPRs, ACRs, CRs and prefix synchronization code

2023-08-14 Thread Thomas Huth
KVM_SYNC_GPRS, KVM_SYNC_ACRS, KVM_SYNC_CRS and KVM_SYNC_PREFIX are
available since kernel 3.10. Since we already require at least kernel
3.15 in the s390x KVM code, we can assume that the KVM_CAP_SYNC_REGS
sync code is always possible for these registers, and remove the
related checks and fallbacks via KVM_SET_REGS and KVM_GET_REGS.

Signed-off-by: Thomas Huth 
---
 target/s390x/kvm/kvm.c | 119 -
 1 file changed, 34 insertions(+), 85 deletions(-)

diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index a9e5880349..ff415f7b30 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -148,7 +148,6 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 KVM_CAP_LAST_INFO
 };
 
-static int cap_sync_regs;
 static int cap_async_pf;
 static int cap_mem_op;
 static int cap_mem_op_extension;
@@ -342,21 +341,28 @@ static void ccw_machine_class_foreach(ObjectClass *oc, 
void *opaque)
 
 int kvm_arch_init(MachineState *ms, KVMState *s)
 {
+int required_caps[] = {
+KVM_CAP_DEVICE_CTRL,
+KVM_CAP_SYNC_REGS,
+};
+
+for (int i = 0; i < ARRAY_SIZE(required_caps); i++) {
+if (!kvm_check_extension(s, required_caps[i])) {
+error_report("KVM is missing capability #%d - "
+ "please use kernel 3.15 or newer", required_caps[i]);
+return -1;
+}
+}
+
 object_class_foreach(ccw_machine_class_foreach, TYPE_S390_CCW_MACHINE,
  false, NULL);
 
-if (!kvm_check_extension(kvm_state, KVM_CAP_DEVICE_CTRL)) {
-error_report("KVM is missing capability KVM_CAP_DEVICE_CTRL - "
- "please use kernel 3.15 or newer");
-return -1;
-}
 if (!kvm_check_extension(s, KVM_CAP_S390_COW)) {
 error_report("KVM is missing capability KVM_CAP_S390_COW - "
  "unsupported environment");
 return -1;
 }
 
-cap_sync_regs = kvm_check_extension(s, KVM_CAP_SYNC_REGS);
 cap_async_pf = kvm_check_extension(s, KVM_CAP_ASYNC_PF);
 cap_mem_op = kvm_check_extension(s, KVM_CAP_S390_MEM_OP);
 cap_mem_op_extension = kvm_check_extension(s, 
KVM_CAP_S390_MEM_OP_EXTENSION);
@@ -463,15 +469,15 @@ void kvm_s390_reset_vcpu_normal(S390CPU *cpu)
 
 static int can_sync_regs(CPUState *cs, int regs)
 {
-return cap_sync_regs && (cs->kvm_run->kvm_valid_regs & regs) == regs;
+return (cs->kvm_run->kvm_valid_regs & regs) == regs;
 }
 
 int kvm_arch_put_registers(CPUState *cs, int level)
 {
+const int required_syncs = KVM_SYNC_GPRS | KVM_SYNC_ACRS |
+   KVM_SYNC_CRS | KVM_SYNC_PREFIX;
 S390CPU *cpu = S390_CPU(cs);
 CPUS390XState *env = &cpu->env;
-struct kvm_sregs sregs;
-struct kvm_regs regs;
 struct kvm_fpu fpu = {};
 int r;
 int i;
@@ -480,21 +486,16 @@ int kvm_arch_put_registers(CPUState *cs, int level)
 cs->kvm_run->psw_addr = env->psw.addr;
 cs->kvm_run->psw_mask = env->psw.mask;
 
-if (can_sync_regs(cs, KVM_SYNC_GPRS)) {
-for (i = 0; i < 16; i++) {
-cs->kvm_run->s.regs.gprs[i] = env->regs[i];
-cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_GPRS;
-}
-} else {
-for (i = 0; i < 16; i++) {
-regs.gprs[i] = env->regs[i];
-}
-r = kvm_vcpu_ioctl(cs, KVM_SET_REGS, ®s);
-if (r < 0) {
-return r;
-}
+g_assert((cs->kvm_run->kvm_valid_regs & required_syncs) == required_syncs);
+cs->kvm_run->kvm_dirty_regs |= required_syncs;
+for (i = 0; i < 16; i++) {
+cs->kvm_run->s.regs.gprs[i] = env->regs[i];
+cs->kvm_run->s.regs.acrs[i] = env->aregs[i];
+cs->kvm_run->s.regs.crs[i] = env->cregs[i];
 }
 
+cs->kvm_run->s.regs.prefix = env->psa;
+
 if (can_sync_regs(cs, KVM_SYNC_VRS)) {
 for (i = 0; i < 32; i++) {
 cs->kvm_run->s.regs.vrs[i][0] = env->vregs[i][0];
@@ -572,25 +573,6 @@ int kvm_arch_put_registers(CPUState *cs, int level)
 }
 }
 
-/* access registers and control registers*/
-if (can_sync_regs(cs, KVM_SYNC_ACRS | KVM_SYNC_CRS)) {
-for (i = 0; i < 16; i++) {
-cs->kvm_run->s.regs.acrs[i] = env->aregs[i];
-cs->kvm_run->s.regs.crs[i] = env->cregs[i];
-}
-cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_ACRS;
-cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_CRS;
-} else {
-for (i = 0; i < 16; i++) {
-sregs.acrs[i] = env->aregs[i];
-sregs.crs[i] = env->cregs[i];
-}
-r = kvm_vcpu_ioctl(cs, KVM_SET_SREGS, &sregs);
-if (r < 0) {
-return r;
-}
-}
-
 if (can_sync_regs(cs, KVM_SYNC_GSCB)) {
 memcpy(cs->kvm_run->s.regs.gscb, env->gscb, 32);
 cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_GSCB;
@@ -612,22 +594,15 @@ int kvm_arch_put_registers(CPUState *cs, int level)
 cs->kvm_run->kvm_dirty_regs |= KVM_SYNC_DIAG318;
 

[PATCH V3 06/10] tests/qtest: migration events

2023-08-14 Thread Steve Sistare
Define a state object to capture events seen by migration tests, to allow
more events to be captured in a subsequent patch, and simplify event
checking in wait_for_migration_pass.  No functional change.

Signed-off-by: Steve Sistare 
Reviewed-by: Fabiano Rosas 
---
 tests/qtest/migration-helpers.c | 24 +--
 tests/qtest/migration-helpers.h |  8 +++--
 tests/qtest/migration-test.c| 68 +++--
 3 files changed, 44 insertions(+), 56 deletions(-)

diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
index be00c52..b541108 100644
--- a/tests/qtest/migration-helpers.c
+++ b/tests/qtest/migration-helpers.c
@@ -23,26 +23,16 @@
  */
 #define MIGRATION_STATUS_WAIT_TIMEOUT 120
 
-bool migrate_watch_for_stop(QTestState *who, const char *name,
-QDict *event, void *opaque)
-{
-bool *seen = opaque;
-
-if (g_str_equal(name, "STOP")) {
-*seen = true;
-return true;
-}
-
-return false;
-}
-
-bool migrate_watch_for_resume(QTestState *who, const char *name,
+bool migrate_watch_for_events(QTestState *who, const char *name,
   QDict *event, void *opaque)
 {
-bool *seen = opaque;
+QTestMigrationState *state = opaque;
 
-if (g_str_equal(name, "RESUME")) {
-*seen = true;
+if (g_str_equal(name, "STOP")) {
+state->stop_seen = true;
+return true;
+} else if (g_str_equal(name, "RESUME")) {
+state->resume_seen = true;
 return true;
 }
 
diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h
index 009e250..59fbb83 100644
--- a/tests/qtest/migration-helpers.h
+++ b/tests/qtest/migration-helpers.h
@@ -15,9 +15,11 @@
 
 #include "libqtest.h"
 
-bool migrate_watch_for_stop(QTestState *who, const char *name,
-QDict *event, void *opaque);
-bool migrate_watch_for_resume(QTestState *who, const char *name,
+typedef struct QTestMigrationState {
+bool stop_seen, resume_seen;
+} QTestMigrationState;
+
+bool migrate_watch_for_events(QTestState *who, const char *name,
   QDict *event, void *opaque);
 
 G_GNUC_PRINTF(3, 4)
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 62d3f37..526a1b7 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -43,8 +43,8 @@
 unsigned start_address;
 unsigned end_address;
 static bool uffd_feature_thread_id;
-static bool got_src_stop;
-static bool got_dst_resume;
+static QTestMigrationState src_state;
+static QTestMigrationState dst_state;
 
 /*
  * An initial 3 MB offset is used as that corresponds
@@ -188,6 +188,13 @@ static void wait_for_serial(const char *side)
 } while (true);
 }
 
+static void wait_for_stop(QTestState *who, QTestMigrationState *state)
+{
+if (!state->stop_seen) {
+qtest_qmp_eventwait(who, "STOP");
+}
+}
+
 /*
  * It's tricky to use qemu's migration event capability with qtest,
  * events suddenly appearing confuse the qmp()/hmp() responses.
@@ -235,21 +242,19 @@ static void read_blocktime(QTestState *who)
 qobject_unref(rsp_return);
 }
 
+/*
+ * Wait for two changes in the migration pass count, but bail if we stop.
+ */
 static void wait_for_migration_pass(QTestState *who)
 {
-uint64_t initial_pass = get_migration_pass(who);
-uint64_t pass;
+uint64_t pass, prev_pass = 0, changes = 0;
 
-/* Wait for the 1st sync */
-while (!got_src_stop && !initial_pass) {
-usleep(1000);
-initial_pass = get_migration_pass(who);
-}
-
-do {
+while (changes < 2 && !src_state.stop_seen) {
 usleep(1000);
 pass = get_migration_pass(who);
-} while (pass == initial_pass && !got_src_stop);
+changes += (pass != prev_pass);
+prev_pass = pass;
+}
 }
 
 static void check_guests_ram(QTestState *who)
@@ -586,10 +591,7 @@ static void migrate_postcopy_start(QTestState *from, 
QTestState *to)
 {
 qtest_qmp_assert_success(from, "{ 'execute': 'migrate-start-postcopy' }");
 
-if (!got_src_stop) {
-qtest_qmp_eventwait(from, "STOP");
-}
-
+wait_for_stop(from, &src_state);
 qtest_qmp_eventwait(to, "RESUME");
 }
 
@@ -720,8 +722,9 @@ static int test_migrate_start(QTestState **from, QTestState 
**to,
 }
 }
 
-got_src_stop = false;
-got_dst_resume = false;
+dst_state = (QTestMigrationState) { };
+src_state = (QTestMigrationState) { };
+
 bootpath = g_strdup_printf("%s/bootsect", tmpfs);
 if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
 /* the assembled x86 boot sector should be exactly one sector large */
@@ -801,8 +804,8 @@ static int test_migrate_start(QTestState **from, QTestState 
**to,
 if (!args->only_target) {
 *from = qtest_init(cmd_source);
 qtest_qmp_set_event_callback(*from,
- migrate_watch_for_stop,
-

[PATCH V3 05/10] migration: preserve suspended for bg_migration

2023-08-14 Thread Steve Sistare
Do not wake a suspended guest during bg_migration.

Signed-off-by: Steve Sistare 
---
 migration/migration.c | 12 +---
 softmmu/runstate.c|  1 +
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 1a5484a..e6b8024 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3069,7 +3069,9 @@ static void bg_migration_vm_start_bh(void *opaque)
 qemu_bh_delete(s->vm_start_bh);
 s->vm_start_bh = NULL;
 
-vm_start();
+if (!runstate_check(RUN_STATE_SUSPENDED)) {
+vm_start();
+}
 s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
 }
 
@@ -3139,16 +3141,12 @@ static void *bg_migration_thread(void *opaque)
 
 qemu_mutex_lock_iothread();
 
-/*
- * If VM is currently in suspended state, then, to make a valid runstate
- * transition in vm_stop_force_state() we need to wakeup it up.
- */
-qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
 s->vm_old_state = runstate_get();
 
 global_state_store();
 /* Forcibly stop VM before saving state of vCPUs and devices */
-if (vm_stop_force_state(RUN_STATE_PAUSED)) {
+if (!runstate_check(RUN_STATE_SUSPENDED) &&
+vm_stop_force_state(RUN_STATE_PAUSED)) {
 goto fail;
 }
 /*
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 2f70c07..c683dcf 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -163,6 +163,7 @@ static const RunStateTransition runstate_transitions_def[] 
= {
 { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
 { RUN_STATE_SUSPENDED, RUN_STATE_PRELAUNCH },
 { RUN_STATE_SUSPENDED, RUN_STATE_COLO},
+{ RUN_STATE_SUSPENDED, RUN_STATE_PAUSED },
 
 { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING },
 { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE },
-- 
1.8.3.1




[PATCH V3 00/10] fix migration of suspended runstate

2023-08-14 Thread Steve Sistare
Migration of a guest in the suspended runstate is broken.  The incoming
migration code automatically tries to wake the guest, which is wrong;
the guest should end migration in the same runstate it started.  Further,
for a restored snapshot, the automatic wakeup fails.  The runstate is
RUNNING, but the guest is not.  See the commit messages for the details.

Changes in V2:
  * simplify "start on wakeup request"
  * fix postcopy, snapshot, and background migration
  * refactor fixes for each type of migration
  * explicitly handled suspended events and runstate in tests
  * add test for postcopy and background migration

Changes in V3:
  * rebase to tip
  * fix hang in new function migrate_wait_for_dirty_mem

Steve Sistare (10):
  vl: start on wakeup request
  migration: preserve suspended runstate
  migration: add runstate function
  migration: preserve suspended for snapshot
  migration: preserve suspended for bg_migration
  tests/qtest: migration events
  tests/qtest: option to suspend during migration
  tests/qtest: precopy migration with suspend
  tests/qtest: postcopy migration with suspend
  tests/qtest: background migration with suspend

 include/sysemu/runstate.h|   1 +
 migration/migration.c|  28 --
 migration/migration.h|   1 +
 migration/savevm.c   |   9 +-
 softmmu/cpus.c   |  12 +++
 softmmu/runstate.c   |   5 +-
 tests/migration/i386/Makefile|   5 +-
 tests/migration/i386/a-b-bootblock.S |  51 +-
 tests/migration/i386/a-b-bootblock.h |  22 +++--
 tests/qtest/migration-helpers.c  |  27 ++
 tests/qtest/migration-helpers.h  |   9 +-
 tests/qtest/migration-test.c | 177 +++
 12 files changed, 258 insertions(+), 89 deletions(-)

-- 
1.8.3.1




[PATCH V3 08/10] tests/qtest: precopy migration with suspend

2023-08-14 Thread Steve Sistare
Add a test case to verify that the suspended state is handled correctly
during live migration precopy.  The test suspends the src, migrates, then
wakes the dest.

Signed-off-by: Steve Sistare 
---
 tests/qtest/migration-helpers.c |  3 ++
 tests/qtest/migration-helpers.h |  3 +-
 tests/qtest/migration-test.c| 72 +
 3 files changed, 70 insertions(+), 8 deletions(-)

diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
index b541108..d1fec49 100644
--- a/tests/qtest/migration-helpers.c
+++ b/tests/qtest/migration-helpers.c
@@ -31,6 +31,9 @@ bool migrate_watch_for_events(QTestState *who, const char 
*name,
 if (g_str_equal(name, "STOP")) {
 state->stop_seen = true;
 return true;
+} else if (g_str_equal(name, "SUSPEND")) {
+state->suspend_seen = true;
+return true;
 } else if (g_str_equal(name, "RESUME")) {
 state->resume_seen = true;
 return true;
diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h
index 59fbb83..bac8699 100644
--- a/tests/qtest/migration-helpers.h
+++ b/tests/qtest/migration-helpers.h
@@ -16,7 +16,8 @@
 #include "libqtest.h"
 
 typedef struct QTestMigrationState {
-bool stop_seen, resume_seen;
+bool suspend_me;
+bool stop_seen, suspend_seen, resume_seen;
 } QTestMigrationState;
 
 bool migrate_watch_for_events(QTestState *who, const char *name,
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 32fea73..e3fc71e 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -135,7 +135,7 @@ static void init_bootfile(const char *bootpath, void 
*content, size_t len)
 /*
  * Wait for some output in the serial output file,
  * we get an 'A' followed by an endless string of 'B's
- * but on the destination we won't have the A.
+ * but on the destination we won't have the A (unless we enabled 
suspend/resume)
  */
 static void wait_for_serial(const char *side)
 {
@@ -195,6 +195,13 @@ static void wait_for_stop(QTestState *who, 
QTestMigrationState *state)
 }
 }
 
+static void wait_for_suspend(QTestState *who, QTestMigrationState *state)
+{
+if (!state->suspend_seen) {
+qtest_qmp_eventwait(who, "SUSPEND");
+}
+}
+
 /*
  * It's tricky to use qemu's migration event capability with qtest,
  * events suddenly appearing confuse the qmp()/hmp() responses.
@@ -249,7 +256,7 @@ static void wait_for_migration_pass(QTestState *who)
 {
 uint64_t pass, prev_pass = 0, changes = 0;
 
-while (changes < 2 && !src_state.stop_seen) {
+while (changes < 2 && !src_state.stop_seen && !src_state.suspend_seen) {
 usleep(1000);
 pass = get_migration_pass(who);
 changes += (pass != prev_pass);
@@ -545,7 +552,8 @@ static void migrate_wait_for_dirty_mem(QTestState *from,
 watch_byte = qtest_readb(from, watch_address);
 do {
 usleep(1000 * 10);
-} while (qtest_readb(from, watch_address) == watch_byte);
+} while (qtest_readb(from, watch_address) == watch_byte &&
+ !src_state.suspend_seen);
 }
 
 
@@ -727,6 +735,7 @@ static int test_migrate_start(QTestState **from, QTestState 
**to,
 dst_state = (QTestMigrationState) { };
 src_state = (QTestMigrationState) { };
 
+src_state.suspend_me = args->suspend_me;
 x86_bootsect[SYM_suspend_me - SYM_start] = args->suspend_me;
 
 bootpath = g_strdup_printf("%s/bootsect", tmpfs);
@@ -1522,8 +1531,12 @@ static void test_precopy_common(MigrateCommon *args)
  * change anything.
  */
 if (args->result == MIG_TEST_SUCCEED) {
-qtest_qmp_assert_success(from, "{ 'execute' : 'stop'}");
-wait_for_stop(from, &src_state);
+if (src_state.suspend_me) {
+wait_for_suspend(from, &src_state);
+} else {
+qtest_qmp_assert_success(from, "{ 'execute' : 'stop'}");
+wait_for_stop(from, &src_state);
+}
 migrate_ensure_converge(from);
 }
 }
@@ -1565,7 +1578,11 @@ static void test_precopy_common(MigrateCommon *args)
  */
 wait_for_migration_complete(from);
 
-wait_for_stop(from, &src_state);
+if (src_state.suspend_me) {
+wait_for_suspend(from, &src_state);
+} else {
+wait_for_stop(from, &src_state);
+}
 
 } else {
 wait_for_migration_complete(from);
@@ -1579,6 +1596,11 @@ static void test_precopy_common(MigrateCommon *args)
 qtest_qmp_assert_success(to, "{ 'execute' : 'cont'}");
 }
 
+if (args->start.suspend_me) {
+/* wakeup succeeds only if guest is suspended */
+qtest_qmp_assert_success(to, "{'execute': 'system_wakeup'}");
+}
+
 if (!dst_state.resume_seen) {
 qtest_qmp_eventwait(to, "RESUME");
 }
@@ -1609,6 +1631,34 @@ static void test_pre

[PATCH V3 03/10] migration: add runstate function

2023-08-14 Thread Steve Sistare
Create a subroutine for preserving the runstate after migration,
to be used in a subsequent patch.  No functional change.

Signed-off-by: Steve Sistare 
Reviewed-by: Fabiano Rosas 
---
 migration/migration.c | 14 ++
 migration/migration.h |  1 +
 migration/savevm.c| 11 +--
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 51ace82..1a5484a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1141,6 +1141,20 @@ void migrate_set_state(int *state, int old_state, int 
new_state)
 }
 }
 
+void migrate_set_runstate(void)
+{
+if (!global_state_received() ||
+global_state_get_runstate() == RUN_STATE_RUNNING) {
+if (autostart) {
+vm_start();
+} else {
+runstate_set(RUN_STATE_PAUSED);
+}
+} else {
+runstate_set(global_state_get_runstate());
+}
+}
+
 static void migrate_fd_cleanup(MigrationState *s)
 {
 qemu_bh_delete(s->cleanup_bh);
diff --git a/migration/migration.h b/migration/migration.h
index 6eea18d..45e9805 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -456,6 +456,7 @@ struct MigrationState {
 };
 
 void migrate_set_state(int *state, int old_state, int new_state);
+void migrate_set_runstate(void);
 
 void migration_fd_process_incoming(QEMUFile *f, Error **errp);
 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
diff --git a/migration/savevm.c b/migration/savevm.c
index be42d0a..eba3653 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2070,16 +2070,7 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
 
 dirty_bitmap_mig_before_vm_start();
 
-if (!global_state_received() ||
-global_state_get_runstate() == RUN_STATE_RUNNING) {
-if (autostart) {
-vm_start();
-} else {
-runstate_set(RUN_STATE_PAUSED);
-}
-} else {
-runstate_set(global_state_get_runstate());
-}
+migrate_set_runstate();
 
 qemu_bh_delete(mis->bh);
 
-- 
1.8.3.1




[PATCH V3 02/10] migration: preserve suspended runstate

2023-08-14 Thread Steve Sistare
A guest that is migrated in the suspended state automatically wakes and
continues execution.  This is wrong; the guest should end migration in
the same state it started.  The root cause is that the outgoing migration
code automatically wakes the guest, then saves the RUNNING runstate in
global_state_store(), hence the incoming migration code thinks the guest is
running and continues the guest if autostart is true.

On the outgoing side, do not call qemu_system_wakeup_request().  That
alone fixes precopy migration, as process_incoming_migration_bh correctly
sets runstate from global_state_get_runstate().

On the incoming side for postcopy, do not wake the guest, and apply
the same logic as found in precopy: if autostart and the runstate is
RUNNING, then vm_start, else merely restore the runstate.

In both cases, if the restored state is SUSPENDED, then a later wakeup
request will resume the guest, courtesy of the previous "start on wakeup"
patch.

Signed-off-by: Steve Sistare 
Reviewed-by: Fabiano Rosas 
---
 migration/migration.c |  2 --
 migration/savevm.c| 13 -
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 5528acb..51ace82 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2109,7 +2109,6 @@ static int postcopy_start(MigrationState *ms, Error 
**errp)
 qemu_mutex_lock_iothread();
 trace_postcopy_start_set_run();
 
-qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
 global_state_store();
 ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 if (ret < 0) {
@@ -2315,7 +2314,6 @@ static void migration_completion(MigrationState *s)
 if (s->state == MIGRATION_STATUS_ACTIVE) {
 qemu_mutex_lock_iothread();
 s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
 
 s->vm_old_state = runstate_get();
 global_state_store();
diff --git a/migration/savevm.c b/migration/savevm.c
index a2cb885..be42d0a 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2070,12 +2070,15 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
 
 dirty_bitmap_mig_before_vm_start();
 
-if (autostart) {
-/* Hold onto your hats, starting the CPU */
-vm_start();
+if (!global_state_received() ||
+global_state_get_runstate() == RUN_STATE_RUNNING) {
+if (autostart) {
+vm_start();
+} else {
+runstate_set(RUN_STATE_PAUSED);
+}
 } else {
-/* leave it paused and let management decide when to start the CPU */
-runstate_set(RUN_STATE_PAUSED);
+runstate_set(global_state_get_runstate());
 }
 
 qemu_bh_delete(mis->bh);
-- 
1.8.3.1




[PATCH V3 10/10] tests/qtest: background migration with suspend

2023-08-14 Thread Steve Sistare
Add a test case to verify that the suspended state is handled correctly by
a background migration.  The test suspends the src, migrates, then wakes
the dest.

Signed-off-by: Steve Sistare 
---
 tests/qtest/migration-test.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index d99620a..3c9e487 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1678,6 +1678,26 @@ static void test_precopy_unix_suspend_notlive(void)
 test_precopy_common(&args);
 }
 
+static void *test_bg_suspend_start(QTestState *from, QTestState *to)
+{
+migrate_set_capability(from, "background-snapshot", true);
+return NULL;
+}
+
+static void test_bg_suspend(void)
+{
+g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
+MigrateCommon args = {
+.listen_uri = uri,
+.connect_uri = uri,
+.live = true,   /* runs fast, the src suspends immediately. */
+.start.suspend_me = true,
+.start_hook = test_bg_suspend_start
+};
+
+test_precopy_common(&args);
+}
+
 static void test_precopy_unix_dirty_ring(void)
 {
 g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
@@ -2904,6 +2924,7 @@ int main(int argc, char **argv)
 if (is_x86) {
 qtest_add_func("/migration/postcopy/suspend",
test_postcopy_suspend);
+qtest_add_func("/migration/bg/suspend", test_bg_suspend);
 }
 }
 
-- 
1.8.3.1




[PATCH V3 07/10] tests/qtest: option to suspend during migration

2023-08-14 Thread Steve Sistare
Add an option to suspend the src in a-b-bootblock.S, which puts the guest
in S3 state after one round of writing to memory.  The option is enabled by
poking a 1 into the suspend_me word in the boot block prior to starting the
src vm.  Generate symbol offsets in a-b-bootblock.h so that the suspend_me
offset is known.

Signed-off-by: Steve Sistare 
---
 tests/migration/i386/Makefile|  5 ++--
 tests/migration/i386/a-b-bootblock.S | 51 +---
 tests/migration/i386/a-b-bootblock.h | 22 ++--
 tests/qtest/migration-test.c |  4 +++
 4 files changed, 68 insertions(+), 14 deletions(-)
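
The "poke" mentioned above is a one-byte store into the assembled boot
sector before the source VM starts, using the generated SYM_ offsets (this
is the line added to migration-test.c by a later patch in this series):

    x86_bootsect[SYM_suspend_me - SYM_start] = args->suspend_me;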

diff --git a/tests/migration/i386/Makefile b/tests/migration/i386/Makefile
index 5c03241..37a72ae 100644
--- a/tests/migration/i386/Makefile
+++ b/tests/migration/i386/Makefile
@@ -4,9 +4,10 @@
 .PHONY: all clean
 all: a-b-bootblock.h
 
-a-b-bootblock.h: x86.bootsect
+a-b-bootblock.h: x86.bootsect x86.o
echo "$$__note" > header.tmp
xxd -i $< | sed -e 's/.*int.*//' >> header.tmp
+   nm x86.o | awk '{print "#define SYM_"$$3" 0x"$$1}' >> header.tmp
mv header.tmp $@
 
 x86.bootsect: x86.boot
@@ -16,7 +17,7 @@ x86.boot: x86.o
$(CROSS_PREFIX)objcopy -O binary $< $@
 
 x86.o: a-b-bootblock.S
-   $(CROSS_PREFIX)gcc -m32 -march=i486 -c $< -o $@
+   $(CROSS_PREFIX)gcc -I.. -m32 -march=i486 -c $< -o $@
 
 clean:
@rm -rf *.boot *.o *.bootsect
diff --git a/tests/migration/i386/a-b-bootblock.S 
b/tests/migration/i386/a-b-bootblock.S
index 3d464c7..62d79b2 100644
--- a/tests/migration/i386/a-b-bootblock.S
+++ b/tests/migration/i386/a-b-bootblock.S
@@ -9,6 +9,23 @@
 #
 # Author: dgilb...@redhat.com
 
+#include "migration-test.h"
+
+#define ACPI_ENABLE 0xf1
+#define ACPI_PORT_SMI_CMD   0xb2
+#define ACPI_PM_BASE0x600
+#define PM1A_CNT_OFFSET 4
+
+#define ACPI_SCI_ENABLE 0x0001
+#define ACPI_SLEEP_TYPE 0x0400
+#define ACPI_SLEEP_ENABLE   0x2000
+#define SLEEP (ACPI_SCI_ENABLE + ACPI_SLEEP_TYPE + ACPI_SLEEP_ENABLE)
+
+#define LOW_ADDRX86_TEST_MEM_START
+#define HIGH_ADDR   X86_TEST_MEM_END
+
+/* Save the suspended status at an address that is not written in the loop. */
+#define suspended   (X86_TEST_MEM_START + 4)
 
 .code16
 .org 0x7c00
@@ -41,12 +58,11 @@ start: # at 0x7c00 ?
 # bl keeps a counter so we limit the output speed
 mov $0, %bl
 mainloop:
-# Start from 1MB
-mov $(1024*1024),%eax
+mov $LOW_ADDR,%eax
 innerloop:
 incb (%eax)
 add $4096,%eax
-cmp $(100*1024*1024),%eax
+cmp $HIGH_ADDR,%eax
 jl innerloop
 
 inc %bl
@@ -57,7 +73,30 @@ innerloop:
 mov $0x3f8,%dx
 outb %al,%dx
 
-jmp mainloop
+# should this test suspend?
+mov (suspend_me),%eax
+cmp $0,%eax
+je mainloop
+
+# are we waking after suspend?  do not suspend again.
+mov $suspended,%eax
+mov (%eax),%eax
+cmp $1,%eax
+je mainloop
+
+# enable acpi
+mov $ACPI_ENABLE,%al
+outb %al,$ACPI_PORT_SMI_CMD
+
+# suspend to ram
+mov $suspended,%eax
+movl $1,(%eax)
+mov $SLEEP,%ax
+mov $(ACPI_PM_BASE + PM1A_CNT_OFFSET),%dx
+outw %ax,%dx
+# not reached.  The wakeup causes reset and restart at 0x7c00, and we
+# do not save and restore registers as a real kernel would do.
+
 
 # GDT magic from old (GPLv2)  Grub startup.S
 .p2align2   /* force 4-byte alignment */
@@ -83,6 +122,10 @@ gdtdesc:
 .word   0x27/* limit */
 .long   gdt /* addr */
 
+/* test launcher can poke a 1 here to exercise suspend */
+suspend_me:
+.int  0
+
 /* I'm a bootable disk */
 .org 0x7dfe
 .byte 0x55
diff --git a/tests/migration/i386/a-b-bootblock.h 
b/tests/migration/i386/a-b-bootblock.h
index b7b0fce..4d46873 100644
--- a/tests/migration/i386/a-b-bootblock.h
+++ b/tests/migration/i386/a-b-bootblock.h
@@ -4,20 +4,20 @@
  * the header and the assembler differences in your patch submission.
  */
 unsigned char x86_bootsect[] = {
-  0xfa, 0x0f, 0x01, 0x16, 0x78, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00,
+  0xfa, 0x0f, 0x01, 0x16, 0xa4, 0x7c, 0x66, 0xb8, 0x01, 0x00, 0x00, 0x00,
   0x0f, 0x22, 0xc0, 0x66, 0xea, 0x20, 0x7c, 0x00, 0x00, 0x08, 0x00, 0x00,
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x92, 0x0c, 0x02,
   0xe6, 0x92, 0xb8, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xd8, 0x66, 0xb8, 0x41,
   0x00, 0x66, 0xba, 0xf8, 0x03, 0xee, 0xb3, 0x00, 0xb8, 0x00, 0x00, 0x10,
   0x00, 0xfe, 0x00, 0x05, 0x00, 0x10, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x40,
   0x06, 0x7c, 0xf2, 0xfe, 0xc3, 0x80, 0xe3, 0x3f, 0x75, 0xe6, 0x66, 0xb8,
-  0x42, 0x00, 0x66, 0xba, 0xf8, 0x03, 0xee, 0xeb, 0xdb, 0x8d, 0x76, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00,
-  0x00, 0x9a, 0xcf, 0

[PATCH V3 09/10] tests/qtest: postcopy migration with suspend

2023-08-14 Thread Steve Sistare
Add a test case to verify that the suspended state is handled correctly by
live migration postcopy.  The test suspends the src, migrates, then wakes
the dest.

Signed-off-by: Steve Sistare 
---
 tests/qtest/migration-test.c | 26 --
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index e3fc71e..d99620a 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -599,8 +599,12 @@ static void migrate_postcopy_start(QTestState *from, 
QTestState *to)
 {
 qtest_qmp_assert_success(from, "{ 'execute': 'migrate-start-postcopy' }");
 
-wait_for_stop(from, &src_state);
-qtest_qmp_eventwait(to, "RESUME");
+if (src_state.suspend_me) {
+wait_for_suspend(from, &src_state);
+} else {
+wait_for_stop(from, &src_state);
+qtest_qmp_eventwait(to, "RESUME");
+}
 }
 
 typedef struct {
@@ -1299,6 +1303,11 @@ static void migrate_postcopy_complete(QTestState *from, QTestState *to,
 {
 wait_for_migration_complete(from);
 
+if (args->start.suspend_me) {
+/* wakeup succeeds only if guest is suspended */
+qtest_qmp_assert_success(to, "{'execute': 'system_wakeup'}");
+}
+
 /* Make sure we get at least one "B" on destination */
 wait_for_serial("dest_serial");
 
@@ -1332,6 +1341,15 @@ static void test_postcopy(void)
 test_postcopy_common(&args);
 }
 
+static void test_postcopy_suspend(void)
+{
+MigrateCommon args = {
+.start.suspend_me = true,
+};
+
+test_postcopy_common(&args);
+}
+
 static void test_postcopy_compress(void)
 {
 MigrateCommon args = {
@@ -2883,6 +2901,10 @@ int main(int argc, char **argv)
 qtest_add_func("/migration/postcopy/recovery/compress/plain",
test_postcopy_recovery_compress);
 }
+if (is_x86) {
+qtest_add_func("/migration/postcopy/suspend",
+   test_postcopy_suspend);
+}
 }
 
 qtest_add_func("/migration/bad_dest", test_baddest);
-- 
1.8.3.1
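
wait_for_suspend() is added by an earlier patch in this series; its shape is
roughly the following sketch (struct and field names are assumptions based
on how it is called above):

    static void wait_for_suspend(QTestState *who, QTestMigrationState *state)
    {
        if (state->suspend_me && !state->suspend_seen) {
            qtest_qmp_eventwait(who, "SUSPEND");
            state->suspend_seen = true;
        }
    }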




[PATCH V3 01/10] vl: start on wakeup request

2023-08-14 Thread Steve Sistare
If qemu starts and loads a VM in the suspended state, then a later wakeup
request directly sets the state to running.  This skips vm_start() and its
initialization steps, which is fatal for the guest.  See
qemu_system_wakeup_request(), and qemu_system_wakeup() in
main_loop_should_exit().

Remember if vm_start has been called.  If not, then call vm_start from
qemu_system_wakeup_request.

Signed-off-by: Steve Sistare 
Reviewed-by: Fabiano Rosas 
---
 include/sysemu/runstate.h |  1 +
 softmmu/cpus.c| 12 
 softmmu/runstate.c|  2 +-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h
index 7beb29c..42ddf83 100644
--- a/include/sysemu/runstate.h
+++ b/include/sysemu/runstate.h
@@ -34,6 +34,7 @@ static inline bool shutdown_caused_by_guest(ShutdownCause cause)
 }
 
 void vm_start(void);
+void vm_wakeup(void);
 
 /**
  * vm_prepare_start: Prepare for starting/resuming the VM
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index fed20ff..fa9e5ba 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -66,6 +66,7 @@
 #endif /* CONFIG_LINUX */
 
 static QemuMutex qemu_global_mutex;
+static bool vm_started;
 
 /*
  * The chosen accelerator is supposed to register this.
@@ -264,6 +265,7 @@ static int do_vm_stop(RunState state, bool send_stop)
 if (send_stop) {
 qapi_event_send_stop();
 }
+vm_started = false;
 }
 
 bdrv_drain_all();
@@ -722,6 +724,16 @@ void vm_start(void)
 {
 if (!vm_prepare_start(false)) {
 resume_all_vcpus();
+vm_started = true;
+}
+}
+
+void vm_wakeup(void)
+{
+if (!vm_started) {
+vm_start();
+} else {
+runstate_set(RUN_STATE_RUNNING);
 }
 }
 
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index f3bd862..95c6ae7 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -580,7 +580,7 @@ void qemu_system_wakeup_request(WakeupReason reason, Error **errp)
 if (!(wakeup_reason_mask & (1 << reason))) {
 return;
 }
-runstate_set(RUN_STATE_RUNNING);
+vm_wakeup();
 wakeup_reason = reason;
 qemu_notify_event();
 }
-- 
1.8.3.1




[PATCH V3 04/10] migration: preserve suspended for snapshot

2023-08-14 Thread Steve Sistare
Restoring a snapshot can break a suspended guest.

If a guest is suspended and saved to a snapshot using savevm, and qemu
is terminated and restarted with the -S option, then loadvm does not
restore the guest.  The runstate is running, but the guest is not, because
vm_start was not called.  The root cause is that loadvm does not restore
the runstate (eg suspended) from global_state loaded from the state file.

Restore the runstate, and allow the new state transitions that are possible.

Signed-off-by: Steve Sistare 
---
 migration/savevm.c | 1 +
 softmmu/runstate.c | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/migration/savevm.c b/migration/savevm.c
index eba3653..7b9c477 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -3194,6 +3194,7 @@ bool load_snapshot(const char *name, const char *vmstate,
 }
 aio_context_acquire(aio_context);
 ret = qemu_loadvm_state(f);
+migrate_set_runstate();
 migration_incoming_state_destroy();
 aio_context_release(aio_context);
 
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 95c6ae7..2f70c07 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -77,6 +77,8 @@ typedef struct {
 
 static const RunStateTransition runstate_transitions_def[] = {
 { RUN_STATE_PRELAUNCH, RUN_STATE_INMIGRATE },
+{ RUN_STATE_PRELAUNCH, RUN_STATE_PAUSED },
+{ RUN_STATE_PRELAUNCH, RUN_STATE_SUSPENDED },
 
 { RUN_STATE_DEBUG, RUN_STATE_RUNNING },
 { RUN_STATE_DEBUG, RUN_STATE_FINISH_MIGRATE },
-- 
1.8.3.1
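
migrate_set_runstate() is introduced by an earlier patch in this series; for
readers of this patch in isolation, its effect is roughly (a sketch, not the
exact implementation):

    /* apply the runstate captured in global_state on the save side */
    void migrate_set_runstate(void)
    {
        RunState state = global_state_get_runstate();

        if (state == RUN_STATE_RUNNING) {
            if (autostart) {
                vm_start();
            } else {
                runstate_set(RUN_STATE_PAUSED);
            }
        } else {
            runstate_set(state);    /* e.g. RUN_STATE_SUSPENDED */
        }
    }

With the two PRELAUNCH transitions added above, a -S restart followed by
loadvm can now land directly in PAUSED or SUSPENDED.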




Re: [PATCH V1 2/3] migration: fix suspended runstate

2023-08-14 Thread Steven Sistare
On 7/26/2023 4:18 PM, Peter Xu wrote:
> On Fri, Jun 30, 2023 at 09:50:41AM -0400, Steven Sistare wrote:
>> On 6/26/2023 2:27 PM, Peter Xu wrote:
>>> On Fri, Jun 23, 2023 at 02:25:05PM -0400, Steven Sistare wrote:
 On 6/21/2023 4:28 PM, Peter Xu wrote:
> On Wed, Jun 21, 2023 at 03:15:42PM -0400, Steven Sistare wrote:
>> On 6/20/2023 5:46 PM, Peter Xu wrote:
>>> On Thu, Jun 15, 2023 at 01:26:39PM -0700, Steve Sistare wrote:
 Migration of a guest in the suspended state is broken.  The incoming
 migration code automatically tries to wake the guest, which IMO is
 wrong -- the guest should end migration in the same state it started.
 Further, the wakeup is done by calling qemu_system_wakeup_request(), 
 which
 bypasses vm_start().  The guest appears to be in the running state, but
 it is not.

 To fix, leave the guest in the suspended state, but call
 qemu_system_start_on_wakeup_request() so the guest is properly resumed
 later, when the client sends a system_wakeup command.

 Signed-off-by: Steve Sistare 
 ---
  migration/migration.c | 11 ---
  softmmu/runstate.c|  1 +
  2 files changed, 5 insertions(+), 7 deletions(-)

 diff --git a/migration/migration.c b/migration/migration.c
 index 17b4b47..851fe6d 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
 @@ -496,6 +496,10 @@ static void process_incoming_migration_bh(void 
 *opaque)
  vm_start();
  } else {
  runstate_set(global_state_get_runstate());
 +if (runstate_check(RUN_STATE_SUSPENDED)) {
 +/* Force vm_start to be called later. */
 +qemu_system_start_on_wakeup_request();
 +}
>>>
>>> Is this really needed, along with patch 1?
>>>
>>> I have a very limited knowledge on suspension, so I'm prone to making
>>> mistakes..
>>>
>>> But from what I read this, qemu_system_wakeup_request() (existing one, 
>>> not
>>> after patch 1 applied) will setup wakeup_reason and kick the main thread
>>> using qemu_notify_event().  Then IIUC the e.g. vcpu wakeups will be 
>>> done in
>>> the main thread later on after qemu_wakeup_requested() returns true.
>>
>> Correct, here:
>>
>> if (qemu_wakeup_requested()) {
>> pause_all_vcpus();
>> qemu_system_wakeup();
>> notifier_list_notify(&wakeup_notifiers, &wakeup_reason);
>> wakeup_reason = QEMU_WAKEUP_REASON_NONE;
>> resume_all_vcpus();
>> qapi_event_send_wakeup();
>> }
>>
>> However, that is not sufficient, because vm_start() was never called on 
>> the incoming
>> side.  vm_start calls the vm state notifiers for RUN_STATE_RUNNING, 
>> among other things.
>>
>>
>> Without my fixes, it "works" because the outgoing migration 
>> automatically wakes a suspended
>> guest, which sets the state to running, which is saved in global state:
>>
>> void migration_completion(MigrationState *s)
>> qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
>> global_state_store()
>>
>> Then the incoming migration calls vm_start here:
>>
>> migration/migration.c
>> if (!global_state_received() ||
>> global_state_get_runstate() == RUN_STATE_RUNNING) {
>> if (autostart) {
>> vm_start();
>>
>> vm_start must be called for correctness.
>
> I see.  Though I had a feeling that this is still not the right way to do,
> at least not as clean.
>
> One question is, would above work for postcopy when VM is suspended during
> the switchover?

 Good catch, that is broken.
 I added qemu_system_start_on_wakeup_request to 
 loadvm_postcopy_handle_run_bh
 and now it works.

 if (global_state_get_runstate() == RUN_STATE_RUNNING) {
 if (autostart) {
 vm_start();
 } else {
 runstate_set(RUN_STATE_PAUSED);
 }
 } else {
 runstate_set(global_state_get_runstate());
 if (runstate_check(RUN_STATE_SUSPENDED)) {
 qemu_system_start_on_wakeup_request();
 }
 }

> I think I see your point that vm_start() (mostly vm_prepare_start())
> contains a bunch of operations that maybe we must have before starting the
> VM, but then.. should we just make that vm_start() unconditional when
> loading VM completes?  I just don't see anything won't need it (besides
> -S), even COLO.
>
> So I'm wondering about something like this:
>
> ===8<===
> --- a/migration/migration.c
> +++ b/migration/migration.c
>>>

[PATCH] trace-events: Fix the name of the tracing.rst file

2023-08-14 Thread Thomas Huth
The file has been converted to .rst a while ago - make sure that the
references in the trace-events files are pointing to the right location
now.

Signed-off-by: Thomas Huth 
---
 bsd-user/trace-events | 2 +-
 ebpf/trace-events | 2 +-
 hw/nubus/trace-events | 2 +-
 target/s390x/kvm/trace-events | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bsd-user/trace-events b/bsd-user/trace-events
index 843896f627..2c1cb66726 100644
--- a/bsd-user/trace-events
+++ b/bsd-user/trace-events
@@ -1,4 +1,4 @@
-# See docs/tracing.txt for syntax documentation.
+# See docs/devel/tracing.rst for syntax documentation.
 
 # bsd-user/signal.c
 user_setup_frame(void *env, uint64_t frame_addr) "env=%p frame_addr=0x%"PRIx64
diff --git a/ebpf/trace-events b/ebpf/trace-events
index 411b1e2be3..b3ad1a35f2 100644
--- a/ebpf/trace-events
+++ b/ebpf/trace-events
@@ -1,4 +1,4 @@
-# See docs/devel/tracing.txt for syntax documentation.
+# See docs/devel/tracing.rst for syntax documentation.
 
 # ebpf-rss.c
 ebpf_error(const char *s1, const char *s2) "error in %s: %s"
diff --git a/hw/nubus/trace-events b/hw/nubus/trace-events
index e31833d694..9259d66725 100644
--- a/hw/nubus/trace-events
+++ b/hw/nubus/trace-events
@@ -1,4 +1,4 @@
-# See docs/devel/tracing.txt for syntax documentation.
+# See docs/devel/tracing.rst for syntax documentation.
 
 # nubus-bus.c
 nubus_slot_read(uint64_t addr, int size) "reading unassigned addr 0x%"PRIx64 " size %d"
diff --git a/target/s390x/kvm/trace-events b/target/s390x/kvm/trace-events
index 5289f5f675..380976b225 100644
--- a/target/s390x/kvm/trace-events
+++ b/target/s390x/kvm/trace-events
@@ -1,4 +1,4 @@
-# See docs/devel/tracing.txt for syntax documentation.
+# See docs/devel/tracing.rst for syntax documentation.
 
 # kvm.c
 kvm_enable_cmma(int rc) "CMMA: enabling with result code %d"
-- 
2.39.3




[PATCH 3/4] replay: allow runstate shutdown->running when replaying trace

2023-08-14 Thread Nicholas Piggin
When replaying a trace, it is possible to go from shutdown to
running with a reverse-debugging step. This can be useful if the
problem being debugged triggers a reset or shutdown.

Signed-off-by: Nicholas Piggin 
---
 include/sysemu/runstate.h |  1 +
 replay/replay.c   |  2 ++
 softmmu/runstate.c| 19 +++
 3 files changed, 22 insertions(+)

diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h
index 7beb29c2e2..85a1167ccb 100644
--- a/include/sysemu/runstate.h
+++ b/include/sysemu/runstate.h
@@ -9,6 +9,7 @@ void runstate_set(RunState new_state);
 RunState runstate_get(void);
 bool runstate_is_running(void);
 bool runstate_needs_reset(void);
+void runstate_replay_enable(void);
 
 typedef void VMChangeStateHandler(void *opaque, bool running, RunState state);
 
diff --git a/replay/replay.c b/replay/replay.c
index 0f7d766efe..e64f71209a 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -272,6 +272,8 @@ static void replay_enable(const char *fname, int mode)
 /* go to the beginning */
 fseek(replay_file, HEADER_SIZE, SEEK_SET);
 replay_fetch_data_kind();
+
+runstate_replay_enable();
 }
 
 replay_init_events();
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index f3bd862818..9fd3e57485 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -174,6 +174,12 @@ static const RunStateTransition runstate_transitions_def[] = {
 { RUN_STATE__MAX, RUN_STATE__MAX },
 };
 
+static const RunStateTransition replay_runstate_transitions_def[] = {
+{ RUN_STATE_SHUTDOWN, RUN_STATE_RUNNING},
+
+{ RUN_STATE__MAX, RUN_STATE__MAX },
+};
+
 static bool runstate_valid_transitions[RUN_STATE__MAX][RUN_STATE__MAX];
 
 bool runstate_check(RunState state)
@@ -181,6 +187,19 @@ bool runstate_check(RunState state)
 return current_run_state == state;
 }
 
+void runstate_replay_enable(void)
+{
+const RunStateTransition *p;
+
+assert(replay_mode == REPLAY_MODE_PLAY);
+
+for (p = &replay_runstate_transitions_def[0]; p->from != RUN_STATE__MAX;
+ p++) {
+runstate_valid_transitions[p->from][p->to] = true;
+}
+
+}
+
 static void runstate_init(void)
 {
 const RunStateTransition *p;
-- 
2.40.1




[PATCH 1/4] scripts/replay_dump.sh: Update to current rr record format

2023-08-14 Thread Nicholas Piggin
This script seems to have fallen by the wayside. This gets it working with
the current record format, although it does not quite implement all events.

Signed-off-by: Nicholas Piggin 
---
My python skills are not good. Any help on this or patch 2 is
appreciated.

Thanks,
Nick

 scripts/replay-dump.py | 107 ++---
 1 file changed, 101 insertions(+), 6 deletions(-)

diff --git a/scripts/replay-dump.py b/scripts/replay-dump.py
index 3ba97a6d30..937ae19ff1 100755
--- a/scripts/replay-dump.py
+++ b/scripts/replay-dump.py
@@ -20,6 +20,7 @@
 
 import argparse
 import struct
+import os
 from collections import namedtuple
 
 # This mirrors some of the global replay state which some of the
@@ -62,6 +63,10 @@ def read_byte(fin):
 "Read a single byte"
 return struct.unpack('>B', fin.read(1))[0]
 
+def read_bytes(fin, nr):
+"Read a nr bytes"
+return fin.read(nr)
+
 def read_event(fin):
 "Read a single byte event, but save some state"
 if replay_state.already_read:
@@ -122,12 +127,18 @@ def swallow_async_qword(eid, name, dumpfile):
 print("  %s(%d) @ %d" % (name, eid, step_id))
 return True
 
+def swallow_bytes(eid, name, dumpfile, nr):
+"Swallow nr bytes of data without looking at it"
+dumpfile.seek(nr, os.SEEK_CUR)
+return True
+
 async_decode_table = [ Decoder(0, "REPLAY_ASYNC_EVENT_BH", swallow_async_qword),
-   Decoder(1, "REPLAY_ASYNC_INPUT", decode_unimp),
-   Decoder(2, "REPLAY_ASYNC_INPUT_SYNC", decode_unimp),
-   Decoder(3, "REPLAY_ASYNC_CHAR_READ", decode_unimp),
-   Decoder(4, "REPLAY_ASYNC_EVENT_BLOCK", decode_unimp),
-   Decoder(5, "REPLAY_ASYNC_EVENT_NET", decode_unimp),
+   Decoder(1, "REPLAY_ASYNC_BH_ONESHOT", decode_unimp),
+   Decoder(2, "REPLAY_ASYNC_INPUT", decode_unimp),
+   Decoder(3, "REPLAY_ASYNC_INPUT_SYNC", decode_unimp),
+   Decoder(4, "REPLAY_ASYNC_CHAR_READ", decode_unimp),
+   Decoder(5, "REPLAY_ASYNC_EVENT_BLOCK", decode_unimp),
+   Decoder(6, "REPLAY_ASYNC_EVENT_NET", decode_unimp),
 ]
 # See replay_read_events/replay_read_event
 def decode_async(eid, name, dumpfile):
@@ -156,6 +167,13 @@ def decode_audio_out(eid, name, dumpfile):
 print_event(eid, name, "%d" % (audio_data))
 return True
 
+def decode_random(eid, name, dumpfile):
+ret = read_dword(dumpfile)
+size = read_dword(dumpfile)
+swallow_bytes(eid, name, dumpfile, size)
+print_event(eid, name, "%d %d" % (ret, size))
+return True
+
 def decode_checkpoint(eid, name, dumpfile):
 """Decode a checkpoint.
 
@@ -184,6 +202,38 @@ def decode_interrupt(eid, name, dumpfile):
 print_event(eid, name)
 return True
 
+def decode_exception(eid, name, dumpfile):
+print_event(eid, name)
+return True
+
+def decode_shutdown(eid, name, dumpfile):
+print_event(eid, name)
+return True
+
+def decode_end(eid, name, dumpfile):
+print_event(eid, name)
+return False
+
+def decode_char_write(eid, name, dumpfile):
+res = read_dword(dumpfile)
+offset = read_dword(dumpfile)
+print_event(eid, name)
+return True
+
+def decode_async_char_read(eid, name, dumpfile):
+char_id = read_byte(dumpfile)
+size = read_dword(dumpfile)
+print_event(eid, name, "device:%x chars:%s" % (char_id, 
read_bytes(dumpfile, size)))
+return True
+
+def decode_async_net(eid, name, dumpfile):
+net_id = read_byte(dumpfile)
+flags = read_dword(dumpfile)
+size = read_dword(dumpfile)
+swallow_bytes(eid, name, dumpfile, size)
+print_event(eid, name, "net:%x flags:%x bytes:%d" % (net_id, flags, size))
+return True
+
 def decode_clock(eid, name, dumpfile):
 clock_data = read_qword(dumpfile)
 print_event(eid, name, "0x%x" % (clock_data))
@@ -268,6 +318,48 @@ def decode_clock(eid, name, dumpfile):
   Decoder(28, "EVENT_CP_RESET", decode_checkpoint),
 ]
 
+v12_event_table = [Decoder(0, "EVENT_INSTRUCTION", decode_instruction),
+  Decoder(1, "EVENT_INTERRUPT", decode_interrupt),
+  Decoder(2, "EVENT_EXCEPTION", decode_exception),
+  Decoder(3, "EVENT_ASYNC_BH", swallow_async_qword),
+  Decoder(4, "EVENT_ASYNC_BH_ONESHOT", swallow_async_qword),
+  Decoder(5, "EVENT_ASYNC_INPUT", decode_unimp),
+  Decoder(6, "EVENT_ASYNC_INPUT_SYNC", decode_unimp),
+  Decoder(7, "EVENT_ASYNC_CHAR_READ", decode_async_char_read),
+  Decoder(8, "EVENT_ASYNC_BLOCK", swallow_async_qword),
+  Decoder(9, "EVENT_ASYNC_NET", decode_async_net),
+  Decoder(10, "EVENT_SHUTDOWN", decode_unimp),
+  Decoder(11, "EVENT_SHUTDOWN_HOST_ERR", decode_shutdown),
+  Decoder(12, "EVENT_SHUTDOWN_HOST_QMP_QUIT", decode_shutdown),
+  

[PATCH 4/4] replay: simple auto-snapshot mode for record

2023-08-14 Thread Nicholas Piggin
Record mode makes an initial snapshot when the machine is created, to enable
reverse-debugging. Often the issue being debugged appears near the end of
the trace, so it is important for performance to keep snapshots close to
the end.

This implements a periodic snapshot mode that keeps a rolling set of
recent snapshots.

Arguably this should be done by the debugger or a program that talks to
QMP, but for setting up simple scenarios and tests, it is convenient to
have this feature.

Signed-off-by: Nicholas Piggin 
---
 docs/system/replay.rst   |  5 
 include/sysemu/replay.h  | 11 
 qemu-options.hx  |  9 +--
 replay/replay-snapshot.c | 57 
 replay/replay.c  | 25 ++
 softmmu/vl.c |  9 +++
 6 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/docs/system/replay.rst b/docs/system/replay.rst
index 3105327423..bef9ea4171 100644
--- a/docs/system/replay.rst
+++ b/docs/system/replay.rst
@@ -156,6 +156,11 @@ for storing VM snapshots. Here is the example of the command line for this:
 ``empty.qcow2`` drive does not connected to any virtual block device and used
 for VM snapshots only.
 
+``rrsnapmode`` can be used to select just an initial snapshot or periodic
+snapshots, with ``rrsnapcount`` specifying the number of periodic snapshots
+to maintain, and ``rrsnaptime`` the amount of run time in seconds between
+periodic snapshots.
+
 .. _network-label:
 
 Network devices
diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
index 08aae5869f..a83e54afc6 100644
--- a/include/sysemu/replay.h
+++ b/include/sysemu/replay.h
@@ -45,6 +45,17 @@ typedef enum ReplayCheckpoint ReplayCheckpoint;
 
 typedef struct ReplayNetState ReplayNetState;
 
+enum ReplaySnapshotMode {
+REPLAY_SNAPSHOT_MODE_INITIAL,
+REPLAY_SNAPSHOT_MODE_PERIODIC,
+};
+typedef enum ReplaySnapshotMode ReplaySnapshotMode;
+
+extern ReplaySnapshotMode replay_snapshot_mode;
+
+extern uint64_t replay_snapshot_periodic_delay;
+extern int replay_snapshot_periodic_nr_keep;
+
 /* Name of the initial VM snapshot */
 extern char *replay_snapshot;
 
diff --git a/qemu-options.hx b/qemu-options.hx
index 29b98c3d4c..0dce93eeab 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4530,13 +4530,13 @@ SRST
 ERST
 
 DEF("icount", HAS_ARG, QEMU_OPTION_icount, \
-"-icount 
[shift=N|auto][,align=on|off][,sleep=on|off][,rr=record|replay,rrfile=[,rrsnapshot=]]\n"
 \
+"-icount 
[shift=N|auto][,align=on|off][,sleep=on|off][,rr=record|replay,rrfile=[,rrsnapshot=][,rrsnapmode=initial|periodic][,rrsnaptime=secs][,rrsnapcount=N]\n"
 \
 "enable virtual instruction counter with 2^N clock ticks 
per\n" \
 "instruction, enable aligning the host and virtual 
clocks\n" \
 "or disable real time cpu sleeping, and optionally 
enable\n" \
 "record-and-replay mode\n", QEMU_ARCH_ALL)
 SRST
-``-icount 
[shift=N|auto][,align=on|off][,sleep=on|off][,rr=record|replay,rrfile=filename[,rrsnapshot=snapshot]]``
+``-icount 
[shift=N|auto][,align=on|off][,sleep=on|off][,rr=record|replay,rrfile=filename[,rrsnapshot=snapshot][,rrsnapmode=initial|periodic][,rrsnaptime=secs][,rrsnapcount=N]]``
 Enable virtual instruction counter. The virtual cpu will execute one
 instruction every 2^N ns of virtual time. If ``auto`` is specified
 then the virtual cpu speed will be automatically adjusted to keep
@@ -4578,6 +4578,11 @@ SRST
 name. In record mode, a new VM snapshot with the given name is created
 at the start of execution recording. In replay mode this option
 specifies the snapshot name used to load the initial VM state.
+``rrsnapmode=periodic`` will additionally cause a periodic snapshot to
+be created after ``rrsnaptime=secs`` seconds of real runtime. The last
+``rrsnapcount=N`` periodic snapshots (not including the initial) will
+be kept (0 for infinite). Periodic snapshots are useful to speed
+reverse debugging operations near the end of the recorded trace.
 ERST
 
 DEF("watchdog-action", HAS_ARG, QEMU_OPTION_watchdog_action, \
diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c
index 10a7cf7992..38eac61c43 100644
--- a/replay/replay-snapshot.c
+++ b/replay/replay-snapshot.c
@@ -69,6 +69,53 @@ void replay_vmstate_register(void)
 vmstate_register(NULL, 0, &vmstate_replay, &replay_state);
 }
 
+static QEMUTimer *replay_snapshot_timer;
+static int replay_snapshot_count;
+
+static void replay_snapshot_timer_cb(void *opaque)
+{
+Error *err = NULL;
+char *name;
+
+if (!replay_can_snapshot()) {
+/* Try again soon */
+timer_mod(replay_snapshot_timer,
+  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
+  replay_snapshot_periodic_delay / 10);
+return;
+}
+
+name = g_strdup_printf("%s-%d", replay_snapshot, replay_snapshot_count);
+if (!save_snapshot(name,
+  
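
The rolling window described in the option text boils down to a deletion
step after each periodic save, approximately (a sketch using QEMU's
existing delete_snapshot() helper; exact variable names are assumptions):

    /* after saving snapshot N, drop snapshot N - keep (keep == 0: keep all) */
    if (replay_snapshot_periodic_nr_keep > 0 &&
        replay_snapshot_count >= replay_snapshot_periodic_nr_keep) {
        g_autofree char *old =
            g_strdup_printf("%s-%d", replay_snapshot,
                            replay_snapshot_count -
                            replay_snapshot_periodic_nr_keep);
        delete_snapshot(old, false, NULL, &err);
    }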

[PATCH 2/4] tests/avocado: replay_linux.py add replay-dump.py test

2023-08-14 Thread Nicholas Piggin
This runs replay-dump.py after recording a trace, and fails the test if
the script fails.

replay-dump.py is modified to exit with non-zero if an error is
encountered while parsing.

Signed-off-by: Nicholas Piggin 
---
It's possible this could introduce failures in the existing test if an
unimplemented event gets recorded. I would make a new test for this, but
it takes quite a while to record a trace long enough to include some
block and net events to exercise the script.

Thanks,
Nick

 scripts/replay-dump.py|  6 --
 tests/avocado/replay_linux.py | 16 +++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/scripts/replay-dump.py b/scripts/replay-dump.py
index 937ae19ff1..8f4715632a 100755
--- a/scripts/replay-dump.py
+++ b/scripts/replay-dump.py
@@ -21,6 +21,7 @@
 import argparse
 import struct
 import os
+import sys
 from collections import namedtuple
 
 # This mirrors some of the global replay state which some of the
@@ -97,7 +98,7 @@ def call_decode(table, index, dumpfile):
 print("Could not decode index: %d" % (index))
 print("Entry is: %s" % (decoder))
 print("Decode Table is:\n%s" % (table))
-return False
+sys.exit(1)
 else:
 return decoder.fn(decoder.eid, decoder.name, dumpfile)
 
@@ -118,7 +119,7 @@ def print_event(eid, name, string=None, event_count=None):
 def decode_unimp(eid, name, _unused_dumpfile):
 "Unimplimented decoder, will trigger exit"
 print("%s not handled - will now stop" % (name))
-return False
+sys.exit(1)
 
 # Checkpoint decoder
 def swallow_async_qword(eid, name, dumpfile):
@@ -401,3 +402,4 @@ def decode_file(filename):
 if __name__ == "__main__":
 args = parse_arguments()
 decode_file(args.file)
+sys.exit(0)
diff --git a/tests/avocado/replay_linux.py b/tests/avocado/replay_linux.py
index a76dd507fc..12937ce0ec 100644
--- a/tests/avocado/replay_linux.py
+++ b/tests/avocado/replay_linux.py
@@ -11,6 +11,7 @@
 import os
 import logging
 import time
+import subprocess
 
 from avocado import skipUnless
 from avocado_qemu import BUILD_DIR
@@ -21,6 +22,11 @@
 from avocado.utils.path import find_command
 from avocado_qemu import LinuxTest
 
+from pathlib import Path
+
+self_dir = Path(__file__).parent
+src_dir = self_dir.parent.parent
+
 class ReplayLinux(LinuxTest):
 """
 Boots a Linux system, checking for a successful initialization
@@ -94,7 +100,7 @@ def launch_and_wait(self, record, args, shift):
 else:
 vm.event_wait('SHUTDOWN', self.timeout)
 vm.shutdown(True)
-logger.info('successfully fihished the replay')
+logger.info('successfully finished the replay')
 elapsed = time.time() - start_time
 logger.info('elapsed time %.2f sec' % elapsed)
 return elapsed
@@ -105,6 +111,14 @@ def run_rr(self, args=None, shift=7):
 logger = logging.getLogger('replay')
 logger.info('replay overhead {:.2%}'.format(t2 / t1 - 1))
 
+try:
+replay_path = os.path.join(self.workdir, 'replay.bin')
+subprocess.check_call(["./scripts/replay-dump.py",
+   "-f", replay_path],
+  cwd=src_dir, stdout=subprocess.DEVNULL)
+except subprocess.CalledProcessError:
+self.fail('replay-dump.py failed')
+
 @skipUnless(os.getenv('AVOCADO_TIMEOUT_EXPECTED'), 'Test might timeout')
 class ReplayLinuxX8664(ReplayLinux):
 """
-- 
2.40.1




[PATCH for-8.2 0/4] Assorted replay patches

2023-08-14 Thread Nicholas Piggin
Hi,

These are a few small things I have found helpful while trying to
implement and test rr changes. Patch 2 depends on 1, but otherwise
the patches are independent.

Thanks,
Nick

Nicholas Piggin (4):
  scripts/replay_dump.sh: Update to current rr record format
  tests/avocado: replay_linux.py add replay-dump.py test
  replay: allow runstate shutdown->running when replaying trace
  replay: simple auto-snapshot mode for record

 docs/system/replay.rst|   5 ++
 include/sysemu/replay.h   |  11 
 include/sysemu/runstate.h |   1 +
 qemu-options.hx   |   9 ++-
 replay/replay-snapshot.c  |  57 +
 replay/replay.c   |  27 
 scripts/replay-dump.py| 113 +++---
 softmmu/runstate.c|  19 ++
 softmmu/vl.c  |   9 +++
 tests/avocado/replay_linux.py |  16 -
 10 files changed, 256 insertions(+), 11 deletions(-)

-- 
2.40.1




Re: [PATCH v2] virtio: don't zero out memory region cache for indirect descriptors

2023-08-14 Thread Stefan Hajnoczi
On Fri, Aug 11, 2023 at 04:34:23PM +0200, Ilya Maximets wrote:
> Lots of virtio functions that are on a hot path in data transmission
> are initializing indirect descriptor cache at the point of stack
> allocation.  It's a 112 byte structure that is getting zeroed out on
> each call adding unnecessary overhead.  It's going to be correctly
> initialized later via special init function.  The only reason to
> actually initialize right away is the ability to safely destruct it.
> Replacing a designated initializer with a function to only initialize
> what is necessary.
> 
> Removal of the unnecessary stack initializations improves throughput
> of virtio-net devices in terms of 64B packets per second by 6-14 %
> depending on the case.  Tested with a proposed af-xdp network backend
> and a dpdk testpmd application in the guest, but should be beneficial
> for other virtio devices as well.
> 
> Signed-off-by: Ilya Maximets 
> ---
> 
> Version 2:
> 
>   * Introduced an initialization function, so we don't need to compare
> pointers in the end. [Stefan]
>   * Removed the now unused macro. [Jason]
> 
>  hw/virtio/virtio.c| 20 +++-
>  include/exec/memory.h | 16 +---
>  2 files changed, 28 insertions(+), 8 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature
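
The pattern under discussion, reduced to a minimal sketch (hypothetical
struct and function names, not the actual MemoryRegionCache definition):

    typedef struct Cache {
        void *mr;            /* non-NULL only once really initialized */
        uint8_t body[104];   /* the bulk of the 112 bytes */
    } Cache;

    void cache_destroy(Cache *c)
    {
        if (c->mr) {
            /* drop the reference taken by real initialization */
        }
    }

    /* before: every hot-path caller zeroes the whole struct */
    void hot_path_before(void)
    {
        Cache indirect = { 0 };      /* compiler emits a full memset */
        /* ... indirect may or may not get really initialized ... */
        cache_destroy(&indirect);
    }

    /* after: initialize only the field cache_destroy() inspects */
    static inline void cache_init_empty(Cache *c)
    {
        c->mr = NULL;
    }

    void hot_path_after(void)
    {
        Cache indirect;
        cache_init_empty(&indirect); /* one store instead of a memset */
        /* ... */
        cache_destroy(&indirect);
    }

The win comes purely from not touching the ~104 payload bytes on every
call, which is the overhead the commit message measures at 6-14 %.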


Re: [PATCH QEMU v2 0/3] provide a smooth upgrade solution for multi-queues disk

2023-08-14 Thread Stefan Hajnoczi
On Fri, Aug 11, 2023 at 10:31:51AM +0800, Yong Huang wrote:
> Hi, Stefan, thank you for your interest in this series.
> I'm trying to explain my point,  if you think my explanation
> doesn't stand up, please let me know.
> 
> On Fri, Aug 11, 2023 at 2:33 AM Stefan Hajnoczi  wrote:
> 
> > On Thu, Aug 10, 2023 at 07:07:09AM +, ~hyman wrote:
> > > Ping,
> > >
> > > This version is a copy of version 1 and is rebased
> > > on the master. No functional changes.
> > >
> > > A 1:1 virtqueue:vCPU mapping implementation for virtio-*-pci disk
> > > introduced since qemu >= 5.2.0, which improves IO performance
> > > remarkably. To enjoy this feature for exiting running VMs without
> > > service interruption, the common solution is to migrate VMs from the
> > > lower version of the hypervisor to the upgraded hypervisor, then wait
> > > for the next cold reboot of the VM to enable this feature. That's the
> > > way "discard" and "write-zeroes" features work.
> > >
> > > As to multi-queues disk allocation automatically, it's a little
> > > different because the destination will allocate queues to match the
> > > number of vCPUs automatically by default in the case of live migration,
> > > and the VMs on the source side remain 1 queue by default, which results
> > > in migration failure due to loading disk VMState incorrectly on the
> > > destination side.
> >
> > Are you using QEMU's versioned machine types to freeze the VM
> > configuration?
> 
> 
> > If not, then live migration won't work reliably because you're migrating
> > between two potentially different VM configurations. This issue is not
> > specific to num-queues, it affects all device properties.
> >
> > In commit 9445e1e15e66c19e42bea942ba810db28052cd05 ("virtio-blk-pci:
> > default num_queues to -smp N") the num_queues property is set to 1 for
> > versioned machine types <=5.1:
> >
> > diff --git a/hw/core/machine.c b/hw/core/machine.c
> > index 9ee2aa0f7b..7f65fa8743 100644
> > --- a/hw/core/machine.c
> > +++ b/hw/core/machine.c
> > @@ -31,6 +31,7 @@
> >  GlobalProperty hw_compat_5_1[] = {
> >  { "vhost-scsi", "num_queues", "1"},
> >  { "vhost-user-scsi", "num_queues", "1"},
> > +{ "virtio-blk-device", "num-queues", "1"},
> >  { "virtio-scsi-device", "num_queues", "1"},
> >  };
> >  const size_t hw_compat_5_1_len = G_N_ELEMENTS(hw_compat_5_1);
> >
> > Live migration works when the source and destination QEMU are launched
> > with the same versioned machine type. You can check the "info qtree"
> > output to confirm that starting a VM with -smp 4 -M pc-q35-5.1 results
> > in num-queues=1 while -smp 4 -M pc-q35-5.2 results in num-queues=4.
> >
> > > This issue requires Qemu to provide a hint that shows
> > > multi-queues disk allocation is automatically supported, and this allows
> > > upper APPs, e.g., libvirt, to recognize the hypervisor's capability of
> > > this. And upper APPs can ensure to allocate the same num-queues on the
> > > destination side in case of migration failure.
> > >
> > > To fix the issue, we introduce the auto-num-queues property for
> > > virtio-*-pci as a solution, which would be probed by APPs, e.g., libvirt
> > > by querying the device properties of QEMU. When launching live
> > > migration, libvirt will send the auto-num-queues property as a migration
> > > cookie to the destination, and thus the destination knows if the source
> > > side supports auto-num-queues. If not, the destination would switch off
> > > by building the command line with "auto-num-queues=off" when preparing
> > > the incoming VM process. The following patches of libvirt show how it
> > > roughly works:
> > >
> > https://github.com/newfriday/libvirt/commit/ce2bae2e1a6821afeb80756dc01f3680f525e506
> > >
> > https://github.com/newfriday/libvirt/commit/f546972b009458c88148fe079544db7e9e1f43c3
> > >
> > https://github.com/newfriday/libvirt/commit/5ee19c8646fdb4d87ab8b93f287c20925268ce83
> > >
> > > The smooth upgrade solution requires the introduction of the auto-num-
> > > queues property on the QEMU side, which is what the patch set does. I'm
> > > hoping for comments about the series.
> >
> > Please take a look at versioned machine types. I think auto-num-queues
> > is not necessary if you use versioned machine types.
> >
> > If you do think auto-num-queues is needed, please explain the issue in
> > more detail and state why versioned machine types don't help.
> 
> 
> "Using the versioned machine types" is indeed the standard way to ensure
> the proper functioning of live migration.
> 
> However, a stable version is strongly advised to maintain function in our
> production environment and perhaps practically all the production
> environments in other businesses. As a result, we must backport features
> like "auto-allocation num-queues" while keeping the machine type the same.
> 
> This patch set is posted for that reason. The "feature-backport" scenario
> is its target. I'm not sure if the upstream development strategy should
> take this sc

Re: [PATCH v3 14/17] i386: Use CPUCacheInfo.share_level to encode CPUID[4]

2023-08-14 Thread Moger, Babu
Hi Zhao,


On 8/14/23 03:22, Zhao Liu wrote:
> Hi Babu,
> 
> On Fri, Aug 04, 2023 at 10:48:29AM -0500, Moger, Babu wrote:
>> Date: Fri, 4 Aug 2023 10:48:29 -0500
>> From: "Moger, Babu" 
>> Subject: Re: [PATCH v3 14/17] i386: Use CPUCacheInfo.share_level to encode
>>  CPUID[4]
>>
>> Hi Zhao,
>>
>> On 8/4/23 04:48, Zhao Liu wrote:
>>> Hi Babu,
>>>
>>> On Thu, Aug 03, 2023 at 11:41:40AM -0500, Moger, Babu wrote:
 Date: Thu, 3 Aug 2023 11:41:40 -0500
 From: "Moger, Babu" 
 Subject: Re: [PATCH v3 14/17] i386: Use CPUCacheInfo.share_level to encode
  CPUID[4]

 Hi Zhao,

 On 8/2/23 18:49, Moger, Babu wrote:
> Hi Zhao,
>
> Hitting this error after this patch.
>
> ERROR:../target/i386/cpu.c:257:max_processor_ids_for_cache: code should
> not be reached
> Bail out! ERROR:../target/i386/cpu.c:257:max_processor_ids_for_cache: code
> should not be reached
> Aborted (core dumped)
>
> Looks like share_level for all the caches for AMD is not initialized.
>>>
>>> I missed these change when I rebase. Sorry for that.
>>>
>>> BTW, could I ask a question? From a previous discussion[1], I understand
>>> that the cache info is used to show the correct cache information in
>>> new machine. And from [2], the wrong cache info may cause "compatibility
>>> issues".
>>>
>>> Is this "compatibility issues" AMD specific? I'm not sure if Intel should
>>> update the cache info like that. thanks!
>>
>> I was going to comment about that. Good that you asked me.
>>
>> Compatibility is qemu requirement.  Otherwise the migrations will fail.
>>
>> Any changes in the topology is going to cause migration problems.
> 
> Could you please educate me more about the details of the "migration
> problem"?
> 
> I didn't understand why it was causing the problem and wasn't sure if I
> was missing any cases.
> 

I am not an expert on migration but I test VM migration sometimes.
Here are some guidelines.
https://developers.redhat.com/blog/2015/03/24/live-migrating-qemu-kvm-virtual-machines

When you migrate a VM to a newer qemu using the same CPU type, migration
should work seamlessly. That means the list of CPU features should stay
compatible when you move to a newer qemu version with the same CPU type.

Thanks
Babu




Re: [PATCH for-8.2] s390x: Convert DPRINTF to trace events

2023-08-14 Thread Thomas Huth

On 04/08/2023 10.04, Cédric Le Goater wrote:

Output message are slightly modified to ease selection with wildcards
and to report extra parameters.

Signed-off-by: Cédric Le Goater 
---

...

diff --git a/hw/s390x/trace-events b/hw/s390x/trace-events
index 8b9213eab90c31d1eb37816d350bf76e902ccd10..34da5ea3230a0ac82f72a4d7a2aee047194be493 100644
--- a/hw/s390x/trace-events
+++ b/hw/s390x/trace-events
@@ -19,3 +19,20 @@ virtio_ccw_set_ind(uint64_t ind_loc, uint8_t ind_old, uint8_t ind_new) "VIRTIO-C
 s390_pci_clp_cap(const char *id, uint32_t cap) "PCI: %s: missing expected CLP capability %u"
 s390_pci_clp_cap_size(const char *id, uint32_t size, uint32_t cap) "PCI: %s: bad size (%u) for CLP capability %u"
  s390_pci_clp_dev_info(const char *id) "PCI: %s: cannot read vfio device info"
+
+# s390-pci-bus.c
+s390_pci_sclp_nodev(const char *str, uint32_t aid) "%s no dev found aid 0x%x"
+s390_pci_iommu_xlate(uint64_t addr) "iommu trans addr 0x%" PRIx64
+s390_pci_msi_ctrl_write(uint64_t data, uint32_t idx, uint32_t vec) "write_msix data 0x%" PRIx64 " idx %d vec 0x%x"
+s390_pcihost(const char *msg) "%s"
+
+# s390-pci-inst.c
+s390_pci_irqs(const char *str, uint32_t id) "%s irqs for adapter id %d"
+s390_pci_kvm_aif(const char *str) "Failed to %s interrupt forwarding"
+


Remove the empty line here?


+s390_pci_list_entry(uint32_t g_l2, uint32_t vid, uint32_t did, uint32_t fid, uint32_t fh) "g_l2 %d vendor id 0x%x device id 0x%x fid 0x%x fh 0x%x"
+s390_pci_list(uint32_t rc) "failed rc 0x%x"
+s390_pci_unknown(const char *msg, uint32_t cmd) "%s unknown command 0x%x"
+s390_pci_bar(uint32_t bar, uint32_t addr, uint64_t size, uint32_t barsize) "bar %d addr 0x%x size 0x%" PRIx64 " barsize 0x%x"
+s390_pci_nodev(const char *cmd, uint32_t fh) "%s no pci dev fh 0x%x"
+s390_pci_invalid(const char *cmd, uint32_t fh) "%s invalid space fh 0x%x"
diff --git a/target/s390x/kvm/trace-events b/target/s390x/kvm/trace-events
index 5289f5f6750e763c2e84f4d1626f70901a93d0ff..818f1a37a1525707ed0f61179ff03acb23fdf22c 100644
--- a/target/s390x/kvm/trace-events
+++ b/target/s390x/kvm/trace-events
@@ -5,3 +5,10 @@ kvm_enable_cmma(int rc) "CMMA: enabling with result code %d"
  kvm_clear_cmma(int rc) "CMMA: clearing with result code %d"
 kvm_failed_cpu_state_set(int cpu_index, uint8_t state, const char *msg) "Warning: Unable to set cpu %d state %" PRIu8 " to KVM: %s"
 kvm_assign_subch_ioeventfd(int fd, uint32_t addr, bool assign, int datamatch) "fd: %d sch: @0x%x assign: %d vq: %d"
+
+kvm_sw_breakpoint(uint32_t n) "KVM: will use %d-byte sw breakpoints"
+kvm_insn_unhandled_priv(uint32_t x) "KVM: unhandled PRIV: 0x%x"
+kvm_insn_diag(uint32_t x) "KVM: unknown DIAG: 0x%x"
+kvm_insn(uint32_t ipa, uint32_t ipb) "handle_instruction 0x%x 0x%x"
+kvm_intercept(uint32_t icpt_code, uint64_t psw_addr) "intercept: 0x%x (at 0x%"PRIx64"lx)"
+kvm_msi_route_fixup(const char* msg) "%s"


Apart from the nit with the empty line:
Reviewed-by: Thomas Huth 
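
For readers new to the tracing subsystem: each line in a trace-events file
generates a trace_<name>() helper, so the converted call sites look like,
for example (arguments illustrative):

    /* replaces a DPRINTF() at the same spot in s390-pci-inst.c */
    trace_s390_pci_bar(bar, addr, size, barsize);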




Re: [RFC PATCH 00/24] plugins: Allow to read registers

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> I and other people in the University of Tokyo, where I research processor
> design, found TCG plugins are very useful for processor design
> exploration.

Thanks for the submission - I've finished my initial review pass.

I think introducing register introspection into the plugins subsystem is
a very worthwhile addition. I'm also happy (for now) to use the
underlying gdb support for it in lieu of a greater refactoring of QEMU's
multiple register introspection features.

> The feature we find missing is the capability to read registers from
> plugins. In this series, I propose to add such a capability by reusing
> gdbstub code.
>
> The reuse of gdbstub code ensures the long-term stability of the TCG plugin
> interface for register access without incurring a burden to maintain yet
> another interface for register access.

However I don't want to expose the gdb details to plugins, so as to leave
us a free hand in further internal clean-ups later on.

> This process to add TCG plugin involves four major changes. The first one
> is to add GDBFeature structure that represents a GDB feature, which usually
> includes registers. GDBFeature can be generated from static XML files or
> dynamically generated by architecture-specific code. In fact, this is a
> refactoring independent of the feature this series adds, and potentially
> it's benefitial even without the plugin feature. The plugin feature will
> utilize this new structure to describe registers exposed to plugins.

I think we can get cleanups to this handling in ahead of the wider
plugin feature. Ideally it would be nice to push the XML generation into
gdbstub itself but that might be more of a refactor than you are willing
to pursue for the time being.

> The second one is to make gdb_read_register/gdb_write_register usable
> outside of gdbstub context.
>
> The third one is to actually make registers readable for plugins.

Modulo isolating the plugin API from gdb specifics I'm happy with this
approach.

> The last one is to allow to implement a QEMU plugin in C++. A plugin that
> I'll describe later is written in C++.

I would want a more compelling reason than a hello-world plugin for
this, if only because QEMU has removed a bunch of C++ dependencies over
the last few years, so I don't think we are in a rush to re-introduce them.

Are you OK to do a re-spin addressing the comments so far?

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 24/24] contrib/plugins: Add cc plugin

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> This demonstrates how to write a plugin in C++.
>
> Signed-off-by: Akihiko Odaki 
> ---
>  docs/devel/tcg-plugins.rst |  8 
>  configure  | 15 ---
>  contrib/plugins/Makefile   |  5 +
>  contrib/plugins/cc.cc  | 15 +++
>  tests/tcg/Makefile.target  |  3 +++
>  5 files changed, 43 insertions(+), 3 deletions(-)
>  create mode 100644 contrib/plugins/cc.cc
>
> diff --git a/docs/devel/tcg-plugins.rst b/docs/devel/tcg-plugins.rst
> index c9f8b27590..0a11f8036c 100644
> --- a/docs/devel/tcg-plugins.rst
> +++ b/docs/devel/tcg-plugins.rst
> @@ -584,6 +584,14 @@ The plugin has a number of arguments, all of them are 
> optional:
>configuration arguments implies ``l2=on``.
>(default: N = 2097152 (2MB), B = 64, A = 16)
>  
> +- contrib/plugins/cc.cc
> +
> +cc plugin demonstrates how to write a plugin in C++. It simply outputs
> +"hello, world" to the plugin log::
> +
> +  $ qemu-system-arm $(QEMU_ARGS) \
> +-plugin ./contrib/plugins/libcc.so -d plugin
> +

I'm going to assume this is useful because you have some out of tree C++
plugins? I'd drop the last two patches for now until there is a slightly
more compelling use case.

The C++ compiler detection moved into meson in b485458e00 (configure,
meson: move C++ compiler detection to meson.build) so I don't think
there is currently a compelling reason to bring this back into
configure.

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 22/24] contrib/plugins: Allow to log registers

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> This demonstrates how a register can be read from a plugin.

I think it would be a little more useful as a demo if it tracked changes
to the register state rather than dumping it for every line executed.

>
> Signed-off-by: Akihiko Odaki 
> ---
>  docs/devel/tcg-plugins.rst |  10 ++-
>  contrib/plugins/execlog.c  | 130 -
>  2 files changed, 108 insertions(+), 32 deletions(-)
>
> diff --git a/docs/devel/tcg-plugins.rst b/docs/devel/tcg-plugins.rst
> index 81dcd43a61..c9f8b27590 100644
> --- a/docs/devel/tcg-plugins.rst
> +++ b/docs/devel/tcg-plugins.rst
> @@ -497,6 +497,15 @@ arguments if required::
>$ qemu-system-arm $(QEMU_ARGS) \
>  -plugin ./contrib/plugins/libexeclog.so,ifilter=st1w,afilter=0x40001808 
> -d plugin
>  
> +This plugin can also dump a specified register. The specification of register
> +follows `GDB standard target features 
> `__.
> +
> +Specify the name of the feature that contains the register and the name of 
> the
> +register with ``rfile`` and ``reg`` options, respectively::
> +
> +  $ qemu-system-arm $(QEMU_ARGS) \
> +-plugin 
> ./contrib/plugins/libexeclog.so,rfile=org.gnu.gdb.arm.core,reg=sp -d plugin
> +
>  - contrib/plugins/cache.c
>  
>  Cache modelling plugin that measures the performance of a given L1 cache
> @@ -583,4 +592,3 @@ The following API is generated from the inline 
> documentation in
>  include the full kernel-doc annotations.
>  
>  .. kernel-doc:: include/qemu/qemu-plugin.h
> -
> diff --git a/contrib/plugins/execlog.c b/contrib/plugins/execlog.c
> index ce67acf145..031ad67fbb 100644
> --- a/contrib/plugins/execlog.c
> +++ b/contrib/plugins/execlog.c
> @@ -15,27 +15,42 @@
>  
>  #include 
>  
> +typedef struct CPU {
> +/* Store last executed instruction on each vCPU as a GString */
> +GString *last_exec;
> +GByteArray *reg_buf;
> +
> +int reg;
> +} CPU;
> +
>  QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
>  
> -/* Store last executed instruction on each vCPU as a GString */
> -static GPtrArray *last_exec;
> +static CPU *cpus;
> +static int num_cpus;
>  static GRWLock expand_array_lock;
>  
>  static GPtrArray *imatches;
>  static GArray *amatches;
>  
> +static char *rfile_name;
> +static char *reg_name;
> +
>  /*
> - * Expand last_exec array.
> + * Expand cpu array.
>   *
>   * As we could have multiple threads trying to do this we need to
>   * serialise the expansion under a lock.
>   */
> -static void expand_last_exec(int cpu_index)
> +static void expand_cpu(int cpu_index)
>  {
> -g_rw_lock_writer_unlock(&expand_array_lock);
> -while (cpu_index >= last_exec->len) {
> -GString *s = g_string_new(NULL);
> -g_ptr_array_add(last_exec, s);
> +g_rw_lock_writer_lock(&expand_array_lock);
> +if (cpu_index >= num_cpus) {
> +cpus = g_realloc_n(cpus, cpu_index + 1, sizeof(*cpus));
> +while (cpu_index >= num_cpus) {
> +cpus[num_cpus].last_exec = g_string_new(NULL);
> +cpus[num_cpus].reg_buf = g_byte_array_new();
> +num_cpus++;
> +}
>  }
>  g_rw_lock_writer_unlock(&expand_array_lock);
>  }
> @@ -50,8 +65,8 @@ static void vcpu_mem(unsigned int cpu_index, 
> qemu_plugin_meminfo_t info,
>  
>  /* Find vCPU in array */
>  g_rw_lock_reader_lock(&expand_array_lock);
> -g_assert(cpu_index < last_exec->len);
> -s = g_ptr_array_index(last_exec, cpu_index);
> +g_assert(cpu_index < num_cpus);
> +s = cpus[cpu_index].last_exec;
>  g_rw_lock_reader_unlock(&expand_array_lock);
>  
>  /* Indicate type of memory access */
> @@ -77,28 +92,35 @@ static void vcpu_mem(unsigned int cpu_index, 
> qemu_plugin_meminfo_t info,
>   */
>  static void vcpu_insn_exec(unsigned int cpu_index, void *udata)
>  {
> -GString *s;
> +CPU cpu;
> +int n;
> +int i;
>  
>  /* Find or create vCPU in array */
>  g_rw_lock_reader_lock(&expand_array_lock);
> -if (cpu_index >= last_exec->len) {
> -g_rw_lock_reader_unlock(&expand_array_lock);
> -expand_last_exec(cpu_index);
> -g_rw_lock_reader_lock(&expand_array_lock);
> -}
> -s = g_ptr_array_index(last_exec, cpu_index);
> +cpu = cpus[cpu_index];
>  g_rw_lock_reader_unlock(&expand_array_lock);
>  
>  /* Print previous instruction in cache */
> -if (s->len) {
> -qemu_plugin_outs(s->str);
> +if (cpu.last_exec->len) {
> +qemu_plugin_outs(cpu.last_exec->str);
>  qemu_plugin_outs("\n");
>  }
>  
>  /* Store new instruction in cache */
>  /* vcpu_mem will add memory access information to last_exec */
> -g_string_printf(s, "%u, ", cpu_index);
> -g_string_append(s, (char *)udata);
> +g_string_printf(cpu.last_exec, "%u, ", cpu_index);
> +g_string_append(cpu.last_exec, (char *)udata);
> +
> +if (cpu.reg >= 0) {
> +g_string_appe

Re: [RFC PATCH 21/24] plugins: Allow to read registers

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> It is based on GDB protocol to ensure interface stability.

See comments below.

> The timing of the vcpu init hook is also changed so that the hook will
> get called after GDB features are initialized.

This might be worth splitting to a separate patch for cleaner bisecting.

>
> Signed-off-by: Akihiko Odaki 
> ---
>  include/qemu/qemu-plugin.h   | 65 ++--
>  cpu.c| 11 --
>  hw/core/cpu-common.c | 10 ++
>  plugins/api.c| 40 ++
>  plugins/qemu-plugins.symbols |  2 ++
>  5 files changed, 114 insertions(+), 14 deletions(-)
>
> diff --git a/include/qemu/qemu-plugin.h b/include/qemu/qemu-plugin.h
> index 50a9957279..214b12bfd6 100644
> --- a/include/qemu/qemu-plugin.h
> +++ b/include/qemu/qemu-plugin.h
> @@ -11,6 +11,7 @@
>  #ifndef QEMU_QEMU_PLUGIN_H
>  #define QEMU_QEMU_PLUGIN_H
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -51,7 +52,7 @@ typedef uint64_t qemu_plugin_id_t;
>  
>  extern QEMU_PLUGIN_EXPORT int qemu_plugin_version;
>  
> -#define QEMU_PLUGIN_VERSION 1
> +#define QEMU_PLUGIN_VERSION 2
>  
>  /**
>   * struct qemu_info_t - system information for plugins
> @@ -218,8 +219,8 @@ struct qemu_plugin_insn;
>   * @QEMU_PLUGIN_CB_R_REGS: callback reads the CPU's regs
>   * @QEMU_PLUGIN_CB_RW_REGS: callback reads and writes the CPU's regs
>   *
> - * Note: currently unused, plugins cannot read or change system
> - * register state.
> + * Note: currently QEMU_PLUGIN_CB_RW_REGS is unused, plugins cannot change
> + * system register state.
>   */
>  enum qemu_plugin_cb_flags {
>  QEMU_PLUGIN_CB_NO_REGS,
> @@ -664,4 +665,62 @@ uint64_t qemu_plugin_end_code(void);
>   */
>  uint64_t qemu_plugin_entry_code(void);
>  
> +/**
> + * struct qemu_plugin_register_file_t - register information
> + *
> + * This structure identifies registers. The identifiers included in this
> + * structure are identical with names used in GDB's standard target features
> + * with some extensions. For details, see:
> + *
> https://sourceware.org/gdb/onlinedocs/gdb/Standard-Target-Features.html

I'm not super keen on baking GDB-isms into the plugin register
interface.

> + *
> + * A register is uniquely identified with the combination of a feature name
> + * and a register name or a register number. It is recommended to derive
> + * register numbers from feature names and register names each time a new 
> vcpu
> + * starts.

Do you have examples of clashing register names from different feature
sets? 

> + *
> + * To derive the register number from a feature name and a register name,
> + * first look up qemu_plugin_register_file_t with the feature name, and then
> + * look up the register name in its @regs. The sum of the @base_reg and the
> + * index in the @reg is the register number.
> + *
> + * Note that @regs may have holes; some elements of @regs may be NULL.
> + */
> +typedef struct qemu_plugin_register_file_t {
> +/** @name: feature name */
> +const char *name;
> +/** @regs: register names */
> +const char * const *regs;
> +/** @base_reg: the base identified number */
> +int base_reg;
> +/** @num_regs: the number of elements in @regs */
> +int num_regs;
> +} qemu_plugin_register_file_t;
> +
> +/**
> + * qemu_plugin_get_register_files() - returns register information
> + *
> + * @vcpu_index: the index of the vcpu context
> + * @size: the pointer to the variable to hold the number of returned elements
> + *
> + * Returns an array of qemu_plugin_register_file_t. The user should g_free()
> + * the array once no longer needed.
> + */
> +qemu_plugin_register_file_t *
> +qemu_plugin_get_register_files(unsigned int vcpu_index, int *size);

I think I'd rather have a simpler interface that returns an anonymous
handle to the plugin. For example:

  struct qemu_plugin_register;
  struct qemu_plugin_register *qemu_plugin_find_register(const char *name);

> +
> +/**
> + * qemu_plugin_read_register() - read register
> + *
> + * @buf: the byte array to append the read register content to.
> + * @reg: the register identifier determined with
> + *   qemu_plugin_get_register_files().
> + *
> + * This function is only available in a context that register read access is
> + * explicitly requested.
> + *
> + * Returns the size of the read register. The content of @buf is in target 
> byte
> + * order.
> + */
> +int qemu_plugin_read_register(GByteArray *buf, int reg);

and this then becomes:

  int qemu_plugin_read_register(GByteArray *buf, struct qemu_plugin_register *reg);

in practice these can become anonymous pointers which hide the
implementation details from the plugin itself. Then the details of
mapping the register to a gdb regnum can be kept in the plugin code
keeping us free to further re-factor the code as we go.
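
With that shape a plugin would look roughly like this (usage of the
interface as proposed above, nothing committed yet):

    static struct qemu_plugin_register *reg_sp;

    static void vcpu_init(qemu_plugin_id_t id, unsigned int vcpu_index)
    {
        reg_sp = qemu_plugin_find_register("sp");
    }

    static void vcpu_insn_exec(unsigned int cpu_index, void *udata)
    {
        GByteArray *buf = g_byte_array_new();
        int size = qemu_plugin_read_register(buf, reg_sp);

        /* buf->data now holds `size` bytes in target byte order */
        g_byte_array_free(buf, TRUE);
    }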

The plugin code works quite hard to try and avoid leaking implementation
details to plugins so as not to tie QEMU's hands in re-factoring. While

Re: [PATCH v2] ui/dbus: implement damage regions for GL

2023-08-14 Thread Marc-André Lureau
Hi

On Mon, Aug 14, 2023 at 5:08 PM Bilal Elmoussaoui  wrote:
>
> That is fine, it is not something we need urgently as we still need a kernel 
> patch to make virtio gpu use the KMS atomic API in Mutter. Am I supposed to 
> send a new patch with the "Reviewed-by" part in the commit message or?
>

That's not necessary. Either the maintainer will add it, or they can pick
it up from patchew, which does that for us (ex:
https://patchew.org/QEMU/20230814125802.102160-1-belmo...@redhat.com/mbox)

thanks

-- 
Marc-André Lureau



Re: [RFC PATCH 15/24] target/arm: Fill new members of GDBFeature

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> These members will be used to help plugins to identify registers.
>
> Signed-off-by: Akihiko Odaki 
> ---
>  target/arm/gdbstub.c   | 46 +++---
>  target/arm/gdbstub64.c | 42 +-
>  2 files changed, 58 insertions(+), 30 deletions(-)
>
> diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c
> index 100a6eed15..56d24028f6 100644
> --- a/target/arm/gdbstub.c
> +++ b/target/arm/gdbstub.c
> @@ -270,6 +270,7 @@ static void arm_gen_one_feature_sysreg(GString *s,
>  g_string_append_printf(s, " regnum=\"%d\"", regnum);
>  g_string_append_printf(s, " group=\"cp_regs\"/>");
>  dyn_feature->data.cpregs.keys[dyn_feature->desc.num_regs] = ri_key;
> +((const char **)dyn_feature->desc.regs)[dyn_feature->desc.num_regs] = 
> ri->name;
>  dyn_feature->desc.num_regs++;
>  }
>  
> @@ -316,6 +317,8 @@ static GDBFeature 
> *arm_gen_dynamic_sysreg_feature(CPUState *cs, int base_reg)
>  DynamicGDBFeatureInfo *dyn_feature = &cpu->dyn_sysreg_feature;
>  gsize num_regs = g_hash_table_size(cpu->cp_regs);
>  
> +dyn_feature->desc.name = "org.qemu.gdb.arm.sys.regs";
> +dyn_feature->desc.regs = g_new(const char *, num_regs);

AIUI this means we now have an array of register names which mirrors the
names embedded in the XML. This smells like a few steps away from just
abstracting the whole XML away from the targets and generating them
inside gdbstub when we need them. As per my stalled attempt I referenced
earlier.


>  dyn_feature->desc.num_regs = 0;
>  dyn_feature->data.cpregs.keys = g_new(uint32_t, num_regs);
>  g_string_printf(s, "");
> @@ -418,30 +421,34 @@ static int arm_gdb_set_m_systemreg(CPUARMState *env, 
> uint8_t *buf, int reg)
>  }
>  
>  static GDBFeature *arm_gen_dynamic_m_systemreg_feature(CPUState *cs,
> -   int orig_base_reg)
> +   int base_reg)
>  {
>  ARMCPU *cpu = ARM_CPU(cs);
>  CPUARMState *env = &cpu->env;
>  GString *s = g_string_new(NULL);
> -int base_reg = orig_base_reg;
> -int i;
> +const char **regs = g_new(const char *, ARRAY_SIZE(m_sysreg_def));
> +int i = 0;
> +int j;
>  
>  g_string_printf(s, "");
>  g_string_append_printf(s, "");
>  g_string_append_printf(s, " name=\"org.gnu.gdb.arm.m-system\">\n");
>  
> -for (i = 0; i < ARRAY_SIZE(m_sysreg_def); i++) {
> -if (arm_feature(env, m_sysreg_def[i].feature)) {
> +for (j = 0; j < ARRAY_SIZE(m_sysreg_def); j++) {
> +if (arm_feature(env, m_sysreg_def[j].feature)) {
> +regs[i] = m_sysreg_def[j].name;
>  g_string_append_printf(s,
>  "\n",
> -m_sysreg_def[i].name, base_reg++);
> +m_sysreg_def[j].name, base_reg + i++);
>  }
>  }
>  
>  g_string_append_printf(s, "");
> +cpu->dyn_m_systemreg_feature.desc.name = "org.gnu.gdb.arm.m-system";
>  cpu->dyn_m_systemreg_feature.desc.xmlname = "arm-m-system.xml";
>  cpu->dyn_m_systemreg_feature.desc.xml = g_string_free(s, false);
> -cpu->dyn_m_systemreg_feature.desc.num_regs = base_reg - orig_base_reg;
> +cpu->dyn_m_systemreg_feature.desc.regs = regs;
> +cpu->dyn_m_systemreg_feature.desc.num_regs = i;
>  
>  return &cpu->dyn_m_systemreg_feature.desc;
>  }
> @@ -462,30 +469,37 @@ static int arm_gdb_set_m_secextreg(CPUARMState *env, 
> uint8_t *buf, int reg)
>  }
>  
>  static GDBFeature *arm_gen_dynamic_m_secextreg_feature(CPUState *cs,
> -   int orig_base_reg)
> +   int base_reg)
>  {
>  ARMCPU *cpu = ARM_CPU(cs);
>  GString *s = g_string_new(NULL);
> -int base_reg = orig_base_reg;
> -int i;
> +const char **regs = g_new(const char *, ARRAY_SIZE(m_sysreg_def) * 2);
> +int i = 0;
> +int j;
>  
>  g_string_printf(s, "");
>  g_string_append_printf(s, "");
>  g_string_append_printf(s, "\n");
>  
> -for (i = 0; i < ARRAY_SIZE(m_sysreg_def); i++) {
> +for (j = 0; j < ARRAY_SIZE(m_sysreg_def); j++) {
> +regs[i] = g_strconcat(m_sysreg_def[j].name, "_ns", NULL);
>  g_string_append_printf(s,
> -"\n",
> -m_sysreg_def[i].name, base_reg++);
> +"\n",
> +regs[i], base_reg + i);
> +i++;
> +regs[i] = g_strconcat(m_sysreg_def[j].name, "_s", NULL);
>  g_string_append_printf(s,
> -"\n",
> -m_sysreg_def[i].name, base_reg++);
> +"\n",
> +regs[i], base_reg + i);
> +i++;
>  }
>  
>  g_string_append_printf(s, "");
> +cpu->dyn_m_secextreg_feature.desc.name = "org.gnu.gdb.arm.secext";
>  cpu->dyn_m_secextreg_feature.desc.xmlname = "arm-m-secext.xml";
>  cpu->dyn_m_secextreg_feature.desc.xml = g_string_free(s, false)

Re: [PATCH] migrate/multifd: fix coredump when the multifd thread cleanup

2023-08-14 Thread Fabiano Rosas
"chenyuhui (A)" via  writes:

> On 2023/7/26 0:53, Peter Xu wrote:
>> On Tue, Jul 25, 2023 at 04:43:28PM +0800, chenyuhui (A) wrote:
>>> @Peter Xu @Fabiano Rosas
>>> Kindly ping on this.
>> 
>> Ah I see what's missing - please copy maintainer (Juan) for any migration
>> patches, especially multifd ones..  I'm doing that for this one, but I'd
>> suggest you repost with a whole patch and information put into commit msg.
>> 
>> Thanks.
>> 
> Hi, Juan
> This is a patch for migration, please take a look.
>
> From: Yuhui Chen 
> Date: Tue, 25 Jul 2023 10:45:48 +0800
> Subject: [PATCH] migrate/multifd: fix coredump when the multifd thread cleanup
>
> There is a coredump while trying to destroy a mutex when
> p->running is false but p->mutex is not unlocked.
> Make sure all mutexes have been released before destroying them.
>
> Signed-off-by: Yuhui Chen 
> ---
>  migration/multifd.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/migration/multifd.c b/migration/multifd.c
> index 3387d8277f..3a085bf3ec 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -521,9 +521,7 @@ void multifd_save_cleanup(void)
>  for (i = 0; i < migrate_multifd_channels(); i++) {
>  MultiFDSendParams *p = &multifd_send_state->params[i];
>
> -if (p->running) {
> -qemu_thread_join(&p->thread);
> -}
> +qemu_thread_join(&p->thread);
>  }
>  for (i = 0; i < migrate_multifd_channels(); i++) {
>  MultiFDSendParams *p = &multifd_send_state->params[i];

Hi,

I took another look at this and noticed that we're not even holding the
lock in the other threads when accessing p->running.

Also, the save_cleanup() function can be called before
qemu_thread_create(), which means p->thread.thread would have a bogus
value and qemu_thread_join() would abort the QEMU process.

What we need here is to stop setting p->running from inside the thread
altogether. The flag is only effectively protecting the
qemu_thread_join() from being called before the thread's creation. We
can post to sem_sync regardless of the thread state because we only
destroy it a few lines down in the same function. And there's already
p->quit which is being accessed properly and sends the thread out of the
loop.

So I suggest this pattern:

qemu_sem_post(&p->sem_sync);
if (p->running) {
qemu_thread_join(&p->thread);
p->running = false;
}

It would also be nice to move the p->running = true closer to the
qemu_thread_create() call at multifd_channel_connect().
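
Putting it together, the join loop in multifd_save_cleanup() could then
look roughly like this (a sketch, assuming p->running is only set once
qemu_thread_create() has succeeded, and cleared here):

for (i = 0; i < migrate_multifd_channels(); i++) {
    MultiFDSendParams *p = &multifd_send_state->params[i];

    /* Safe even if the thread never ran: sem_sync is only destroyed
     * further down in this same function. */
    qemu_sem_post(&p->sem_sync);
    if (p->running) {
        qemu_thread_join(&p->thread);
        p->running = false;
    }
}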




[PATCH] tests/qemu-iotests/183: Fix timeout issue when running tests in parallel

2023-08-14 Thread Thomas Huth
When running "make check-block SPEED=slow -j$(nproc)", the test 183
fails due to the very low default timeout value since the system is
quite loaded and thus slower in this case. We need a much higher
value when running tests in parallel, so let's try to detect this
situation by looking for "-j..." in the MAKEFLAGS environment variable.

Signed-off-by: Thomas Huth 
---
 PS: Yeah, I know, this is a little bit ugly ... if someone has a better
 idea how to fix this, please let me know.

 tests/qemu-iotests/183 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/183 b/tests/qemu-iotests/183
index ee62939e72..b50e0d2b85 100755
--- a/tests/qemu-iotests/183
+++ b/tests/qemu-iotests/183
@@ -97,7 +97,9 @@ if echo "$reply" | grep "compiled without old-style" > 
/dev/null; then
 fi
 
 timeout_comm=$QEMU_COMM_TIMEOUT
-if [ "${VALGRIND_QEMU}" == "y" ]; then
+if echo "$MAKEFLAGS" | grep -q "\-j"; then
+QEMU_COMM_TIMEOUT=10
+elif [ "${VALGRIND_QEMU}" == "y" ]; then
 QEMU_COMM_TIMEOUT=4
 else
 QEMU_COMM_TIMEOUT=0.1
-- 
2.39.3




Re: [PATCH v2 1/3] linux-headers: update asm-s390/kvm.h

2023-08-14 Thread Thomas Huth



Sorry, I should have mentioned this for v1 already, but better late than 
never: We need to replace this patch with a proper header update later (via 
the scripts/update-linux-headers.sh script) - so in case you respin, please 
mark it with NOTFORMERGE or PLACEHOLDER or something similar in the subject, 
and mention the kernel version in the description that will likely contain 
the update.


 Thanks,
  Thomas


On 10/08/2023 14.47, Steffen Eiden wrote:

Signed-off-by: Steffen Eiden 
---
  linux-headers/asm-s390/kvm.h | 16 
  1 file changed, 16 insertions(+)

diff --git a/linux-headers/asm-s390/kvm.h b/linux-headers/asm-s390/kvm.h
index e2afd95420..023a2763a9 100644
--- a/linux-headers/asm-s390/kvm.h
+++ b/linux-headers/asm-s390/kvm.h
@@ -159,6 +159,22 @@ struct kvm_s390_vm_cpu_subfunc {
__u8 reserved[1728];
  };
  
+#define KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST 6
+#define KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST  7
+
+#define KVM_S390_VM_CPU_UV_FEAT_NR_BITS 64
+struct kvm_s390_vm_cpu_uv_feat {
+   union {
+   struct {
+   __u64 : 4;
+   __u64 ap : 1;   /* bit 4 */
+   __u64 ap_intr : 1;  /* bit 5 */
+   __u64 : 58;
+   };
+   __u64 feat;
+   };
+};
+
  /* kvm attributes for crypto */
  #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW  0
  #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW  1





Re: [PATCH 2/5] qemu-nbd: fix regression with qemu-nbd --fork run over ssh

2023-08-14 Thread Kevin Wolf
Am 17.07.2023 um 16:55 hat Denis V. Lunev geschrieben:
> Commit e6df58a5578fee7a50bbf36f4a50a2781cff855d
> Author: Hanna Reitz 
> Date:   Wed May 8 23:18:18 2019 +0200
> qemu-nbd: Do not close stderr
> has introduced an interesting regression. Original behavior of
> ssh somehost qemu-nbd /home/den/tmp/file -f raw --fork
> was the following:
>  * qemu-nbd was started as a daemon
>  * the command execution is done and ssh exited with success
> 
> The patch has changed this behavior and 'ssh' command now hangs forever.
> 
> According to the normal specification of the daemon() call, we should
> end up with STDERR pointing to /dev/null. That should be done at the
> very end of the successful startup sequence when the pipe to the
> bootstrap process (used for diagnostics) is no longer needed.
> 
> This could be achieved in the same way as done for the 'qemu-nbd -c' case.
> That was commit 0eaf453e, also fixing up e6df58a5. STDOUT copying to
> STDERR does the trick.
> 
> This also leads to proper 'ssh' connection closing which fixes my
> original problem.
> 
> Signed-off-by: Denis V. Lunev 
> CC: Eric Blake 
> CC: Vladimir Sementsov-Ogievskiy 
> CC: Hanna Reitz 
> CC: 

This broke qemu-iotests 233 (Eric, please make sure to run the full
qemu-iotests suite before sending block related pull requests):

--- /home/kwolf/source/qemu/tests/qemu-iotests/233.out
+++ /home/kwolf/source/qemu/build-clang/scratch/raw-file-233/233.out.bad
@@ -99,14 +99,4 @@
 qemu-nbd: TLS handshake failed: The TLS connection was non-properly terminated.

 == final server log ==
-qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
-qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
-qemu-nbd: option negotiation failed: Verify failed: No certificate was found.
-qemu-nbd: option negotiation failed: Verify failed: No certificate was found.
-qemu-nbd: option negotiation failed: TLS x509 authz check for 
DISTINGUISHED-NAME is denied
-qemu-nbd: option negotiation failed: TLS x509 authz check for 
DISTINGUISHED-NAME is denied
-qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
-qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
-qemu-nbd: option negotiation failed: TLS handshake failed: An illegal 
parameter has been received.
-qemu-nbd: option negotiation failed: TLS handshake failed: An illegal 
parameter has been received.
 *** done

Do we really want to lose these error messages? This looks wrong to me.

Kevin




Re: [PATCH] qemu_cleanup: begin drained section after vm_shutdown()

2023-08-14 Thread Kevin Wolf
Am 06.07.2023 um 16:43 hat Paolo Bonzini geschrieben:
> Queued, thanks.

This patch broke qemu-iotests 109 (for raw images): some block jobs now
get paused once more. This is probably okay, but please double check and
fix either the reference output or the code.

Kevin




Re: [RFC PATCH 14/24] gdbstub: Add members to identify registers to GDBFeature

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> These members will be used to help plugins to identify registers.

I'm wary of exposing gdb'isms directly to plugins. However making stuff
easier for the gdbstub internals is ok. I shall reserve judgement until
I've read the rest of the series.


> The added members in instances of GDBFeature dynamically generated by
> CPUs will be filled in later changes.
>
> Signed-off-by: Akihiko Odaki 
> ---
>  include/exec/gdbstub.h  |  2 ++
>  scripts/feature_to_c.py | 14 +-
>  2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h
> index 9b3da5b257..6da4af9612 100644
> --- a/include/exec/gdbstub.h
> +++ b/include/exec/gdbstub.h
> @@ -13,6 +13,8 @@
>  typedef struct GDBFeature {
>  const char *xmlname;
>  const char *xml;
> +const char *name;
> +const char * const *regs;
>  int num_regs;
>  } GDBFeature;
>  
> diff --git a/scripts/feature_to_c.py b/scripts/feature_to_c.py
> index 8eb8c81cf8..11b1bc05c9 100755
> --- a/scripts/feature_to_c.py
> +++ b/scripts/feature_to_c.py
> @@ -46,7 +46,9 @@ def writeliteral(indent, bytes):
>  sys.stderr.write(f'unexpected start tag: {element.tag}\n')
>  exit(1)
>  
> +feature_name = element.attrib['name']
>  regnum = 0
> +regnames = []
>  regnums = []
>  tags = ['feature']
>  for event, element in events:
> @@ -63,6 +65,7 @@ def writeliteral(indent, bytes):
>  if 'regnum' in element.attrib:
>  regnum = int(element.attrib['regnum'])
>  
> +regnames.append(element.attrib['name'])
>  regnums.append(regnum)
>  regnum += 1
>  
> @@ -81,6 +84,15 @@ def writeliteral(indent, bytes):
>  writeliteral(8, bytes(os.path.basename(input), 'utf-8'))
>  sys.stdout.write(',\n')
>  writeliteral(8, read)
> -sys.stdout.write(f',\n{num_regs},\n}},\n')
> +sys.stdout.write(',\n')
> +writeliteral(8, bytes(feature_name, 'utf-8'))
> +sys.stdout.write(',\n(const char * const []) {\n')
> +
> +for index, regname in enumerate(regnames):
> +sys.stdout.write(f'[{regnums[index] - base_reg}] =\n')
> +writeliteral(16, bytes(regname, 'utf-8'))
> +sys.stdout.write(',\n')
> +
> +sys.stdout.write(f'}},\n{num_regs},\n}},\n')
>  
>  sys.stdout.write('{ NULL }\n};\n')


-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 12/24] gdbstub: Simplify XML lookup

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> Now we know all instances of GDBFeature that is used in CPU so we can
> traverse them to find XML. This removes the need for a CPU-specific
> lookup function for dynamic XMLs.
>
> Signed-off-by: Akihiko Odaki 
> ---
>  gdbstub/gdbstub.c | 28 +---
>  1 file changed, 9 insertions(+), 19 deletions(-)
>
> diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c
> index 182efe7e0f..e5bb2c89ba 100644
> --- a/gdbstub/gdbstub.c
> +++ b/gdbstub/gdbstub.c
> @@ -354,8 +354,7 @@ static const char *get_feature_xml(const char *p, const 
> char **newp,
> GDBProcess *process)
>  {
>  size_t len;
> -int i;
> -const char *name;
> +GDBRegisterState *r;
>  CPUState *cpu = gdb_get_first_cpu_in_process(process);
>  CPUClass *cc = CPU_GET_CLASS(cpu);
>  
> @@ -364,15 +363,12 @@ static const char *get_feature_xml(const char *p, const 
> char **newp,
>  len++;
>  *newp = p + len;
>  
> -name = NULL;
>  if (strncmp(p, "target.xml", len) == 0) {
>  char *buf = process->target_xml;
>  const size_t buf_sz = sizeof(process->target_xml);
>  
>  /* Generate the XML description for this CPU.  */
>  if (!buf[0]) {
> -GDBRegisterState *r;
> -
>  pstrcat(buf, buf_sz,
>  "<?xml version=\"1.0\"?>"
>  "<!DOCTYPE target SYSTEM \"gdb-target.dtd\">"
> @@ -389,28 +385,22 @@ static const char *get_feature_xml(const char *p, const 
> char **newp,
>  pstrcat(buf, buf_sz, "\"/>");
>  for (r = cpu->gdb_regs; r; r = r->next) {
>  pstrcat(buf, buf_sz, "<xi:include href=\"");
> -pstrcat(buf, buf_sz, r->feature->xml);
> +pstrcat(buf, buf_sz, r->feature->xmlname);
>  pstrcat(buf, buf_sz, "\"/>");
>  }
>  pstrcat(buf, buf_sz, "");
>  }
>  return buf;
>  }

It would be nice to modernise this code before adding to it. The static
target_xml buffer and use of pstrcat could be replaced by GString code
that is less sketchy.
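
Something along these lines, purely as a sketch (assumes target_xml
becomes a heap-allocated char * instead of the fixed-size buffer):

static char *generate_target_xml(CPUState *cpu)
{
    CPUClass *cc = CPU_GET_CLASS(cpu);
    GDBRegisterState *r;
    GString *xml = g_string_new("<?xml version=\"1.0\"?>"
                                "<!DOCTYPE target SYSTEM \"gdb-target.dtd\">"
                                "<target>");

    if (cc->gdb_arch_name) {
        g_autofree gchar *arch = cc->gdb_arch_name(cpu);
        g_string_append_printf(xml, "<architecture>%s</architecture>", arch);
    }
    g_string_append_printf(xml, "<xi:include href=\"%s\"/>",
                           cc->gdb_core_feature->xmlname);
    for (r = cpu->gdb_regs; r; r = r->next) {
        g_string_append_printf(xml, "<xi:include href=\"%s\"/>",
                               r->feature->xmlname);
    }
    g_string_append(xml, "</target>");

    return g_string_free(xml, false);
}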


> -if (cc->gdb_get_dynamic_xml) {
> -char *xmlname = g_strndup(p, len);
> -const char *xml = cc->gdb_get_dynamic_xml(cpu, xmlname);
> -
> -g_free(xmlname);
> -if (xml) {
> -return xml;
> -}
> +if (strncmp(p, cc->gdb_core_feature->xmlname, len) == 0) {
> +return cc->gdb_core_feature->xml;
>  }
> -for (i = 0; ; i++) {
> -name = gdb_features[i].xmlname;
> -if (!name || (strncmp(name, p, len) == 0 && strlen(name) == len))
> -break;
> +for (r = cpu->gdb_regs; r; r = r->next) {
> +if (strncmp(p, r->feature->xmlname, len) == 0) {
> +return r->feature->xml;
> +}
>  }
> -return name ? gdb_features[i].xml : NULL;
> +return NULL;
>  }
>  
>  const GDBFeature *gdb_find_static_feature(const char *xmlname)


-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 13/24] hw/core/cpu: Remove gdb_get_dynamic_xml member

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> This function is no longer used.
>
> Signed-off-by: Akihiko Odaki 

Reviewed-by: Alex Bennée 


-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 06/24] hw/core/cpu: Replace gdb_core_xml_file with gdb_core_feature

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> This is a tree-wide change to replace gdb_core_xml_file, the path to
> GDB XML file with gdb_core_feature, the pointer to GDBFeature. This
> also replaces the values assigned to gdb_num_core_regs with the
> num_regs member of GDBFeature where applicable to remove magic numbers.
>
> A following change will utilize additional information provided by
> GDBFeature to simplify XML file lookup.

re: other comment about assert(). Maybe gdb_find_static_feature() needs to
assert success because:

Thread 1 "qemu-loongarch6" received signal SIGSEGV, Segmentation fault.
loongarch_cpu_class_init (c=, data=) at 
../../target/loongarch/cpu.c:726
726 cc->gdb_num_core_regs = cc->gdb_core_feature->num_regs;
(gdb) p/x cc->gdb_core_feature 
$1 = 0x0
(gdb) l
721 cc->disas_set_info = loongarch_cpu_disas_set_info;
722 cc->gdb_read_register = loongarch_cpu_gdb_read_register;
723 cc->gdb_write_register = loongarch_cpu_gdb_write_register;
724 cc->disas_set_info = loongarch_cpu_disas_set_info;
725 cc->gdb_core_feature = 
gdb_find_static_feature("loongarch-base64.xml");
726 cc->gdb_num_core_regs = cc->gdb_core_feature->num_regs;
727 cc->gdb_stop_before_watchpoint = true;
728 cc->gdb_arch_name = loongarch_gdb_arch_name;
729
730 #ifdef CONFIG_TCG
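
Something like this, say (a sketch; gdb_features is the table generated
by feature_to_c.py):

const GDBFeature *gdb_find_static_feature(const char *xmlname)
{
    const GDBFeature *feature;

    for (feature = gdb_features; feature->xmlname; feature++) {
        if (!strcmp(feature->xmlname, xmlname)) {
            return feature;
        }
    }

    /* Fail loudly at class-init time instead of leaving
     * gdb_core_feature NULL and crashing later. */
    g_assert_not_reached();
}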

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 10/24] gdbstub: Use GDBFeature for gdb_register_coprocessor

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> This is a tree-wide change to introduce GDBFeature parameter to
> gdb_register_coprocessor(). The new parameter just replaces num_regs
> and xml parameters for now. GDBFeature will be utilized to simplify XML
> lookup in a following change.
>
> Signed-off-by: Akihiko Odaki 
> ---
>  include/exec/gdbstub.h |  2 +-
>  gdbstub/gdbstub.c  | 13 +++--
>  target/arm/gdbstub.c   | 36 
>  target/hexagon/cpu.c   |  3 +--
>  target/loongarch/gdbstub.c |  2 +-
>  target/m68k/helper.c   |  6 +++---
>  target/microblaze/cpu.c|  5 +++--
>  target/ppc/gdbstub.c   | 11 ++-
>  target/riscv/gdbstub.c | 20 
>  target/s390x/gdbstub.c | 28 +++-
>  10 files changed, 61 insertions(+), 65 deletions(-)
>
> diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h
> index 3115dc21c0..9b3da5b257 100644
> --- a/include/exec/gdbstub.h
> +++ b/include/exec/gdbstub.h
> @@ -22,7 +22,7 @@ typedef int (*gdb_get_reg_cb)(CPUArchState *env, GByteArray 
> *buf, int reg);
>  typedef int (*gdb_set_reg_cb)(CPUArchState *env, uint8_t *buf, int reg);
>  void gdb_register_coprocessor(CPUState *cpu,
>gdb_get_reg_cb get_reg, gdb_set_reg_cb set_reg,
> -  int num_regs, const char *xml, int g_pos);
> +  const GDBFeature *feature, int g_pos);
>  
>  /**
>   * gdbserver_start: start the gdb server
> diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c
> index 6f2e0cb06f..ab75f6686b 100644
> --- a/gdbstub/gdbstub.c
> +++ b/gdbstub/gdbstub.c
> @@ -471,7 +471,7 @@ static int gdb_write_register(CPUState *cpu, uint8_t 
> *mem_buf, int reg)
>  
>  void gdb_register_coprocessor(CPUState *cpu,
>gdb_get_reg_cb get_reg, gdb_set_reg_cb set_reg,
> -  int num_regs, const char *xml, int g_pos)
> +  const GDBFeature *feature, int g_pos)
>  {
>  GDBRegisterState *s;
>  GDBRegisterState **p;

As we are expecting gdb_find_static_feature to always succeed should we
maybe:

 g_assert(feature);

to avoid errors creeping in later (although I guess we'll seg almost
immediately after)?

> @@ -479,25 +479,26 @@ void gdb_register_coprocessor(CPUState *cpu,
>  p = &cpu->gdb_regs;
>  while (*p) {
>  /* Check for duplicates.  */
> -if (strcmp((*p)->xml, xml) == 0)
> +if (strcmp((*p)->xml, feature->xmlname) == 0)
>  return;
>  p = &(*p)->next;
>  }
>  
>  s = g_new0(GDBRegisterState, 1);
>  s->base_reg = cpu->gdb_num_regs;
> -s->num_regs = num_regs;
> +s->num_regs = feature->num_regs;
>  s->get_reg = get_reg;
>  s->set_reg = set_reg;
> -s->xml = xml;
> +s->xml = feature->xml;
>  
>  /* Add to end of list.  */
> -cpu->gdb_num_regs += num_regs;
> +cpu->gdb_num_regs += feature->num_regs;
>  *p = s;
>  if (g_pos) {
>  if (g_pos != s->base_reg) {
>  error_report("Error: Bad gdb register numbering for '%s', "
> - "expected %d got %d", xml, g_pos, s->base_reg);
> + "expected %d got %d", feature->xml,
> + g_pos, s->base_reg);
>  } else {
>  cpu->gdb_num_g_regs = cpu->gdb_num_regs;
>  }
> diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c
> index cd35bac013..ab4ffe6264 100644
> --- a/target/arm/gdbstub.c
> +++ b/target/arm/gdbstub.c
> @@ -522,14 +522,15 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
>   */
>  #ifdef TARGET_AARCH64
>  if (isar_feature_aa64_sve(&cpu->isar)) {
> -int nreg = arm_gen_dynamic_svereg_feature(cs, 
> cs->gdb_num_regs)->num_regs;
> +GDBFeature *feature =
> +arm_gen_dynamic_svereg_feature(cs, cs->gdb_num_regs);
>  gdb_register_coprocessor(cs, aarch64_gdb_get_sve_reg,
> - aarch64_gdb_set_sve_reg, nreg,
> - "sve-registers.xml", 0);
> + aarch64_gdb_set_sve_reg, feature, 0);
>  } else {
>  gdb_register_coprocessor(cs, aarch64_gdb_get_fpu_reg,
>   aarch64_gdb_set_fpu_reg,
> - 34, "aarch64-fpu.xml", 0);
> + 
> gdb_find_static_feature("aarch64-fpu.xml"),
> + 0);
>  }
>  /*
>   * Note that we report pauth information via the feature name
> @@ -540,19 +541,22 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
>  if (isar_feature_aa64_pauth(&cpu->isar)) {
>  gdb_register_coprocessor(cs, aarch64_gdb_get_pauth_reg,
>   aarch64_gdb_set_pauth_reg,
> -   

Re: [RFC PATCH 07/24] target/arm: Use GDBFeature for dynamic XML

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> In preparation for a change to use GDBFeature as a parameter of
> gdb_register_coprocessor(), convert the internal representation of
> dynamic feature from plain XML to GDBFeature.

FWIW one of the aims I had with my stalled rewrite of the register API
was to move all this XML generation into common code:

  
https://github.com/qemu/qemu/compare/master...stsquad:qemu:introspection/registers#diff-f6409265629976beb19cc9b8d96889b67c006a265586615f491e7d59dd83dc44R68

to avoid each of the targets having to mess with constructing their own
XML and just concentrate of the semantics of each register type.

>
> Signed-off-by: Akihiko Odaki 
> ---
>  target/arm/cpu.h   | 20 +--
>  target/arm/internals.h |  2 +-
>  target/arm/gdbstub.c   | 80 +++---
>  target/arm/gdbstub64.c | 11 +++---
>  4 files changed, 60 insertions(+), 53 deletions(-)
>
> diff --git a/target/arm/cpu.h b/target/arm/cpu.h
> index 88e5accda6..d6c2378d05 100644
> --- a/target/arm/cpu.h
> +++ b/target/arm/cpu.h
> @@ -136,23 +136,21 @@ enum {
>   */
>  
>  /**
> - * DynamicGDBXMLInfo:
> - * @desc: Contains the XML descriptions.
> - * @num: Number of the registers in this XML seen by GDB.
> + * DynamicGDBFeatureInfo:
> + * @desc: Contains the feature descriptions.
>   * @data: A union with data specific to the set of registers
>   *@cpregs_keys: Array that contains the corresponding Key of
>   *  a given cpreg with the same order of the cpreg
>   *  in the XML description.
>   */
> -typedef struct DynamicGDBXMLInfo {
> -char *desc;
> -int num;
> +typedef struct DynamicGDBFeatureInfo {
> +GDBFeature desc;
>  union {
>  struct {
>  uint32_t *keys;
>  } cpregs;
>  } data;
> -} DynamicGDBXMLInfo;
> +} DynamicGDBFeatureInfo;
>  
>  /* CPU state for each instance of a generic timer (in cp15 c14) */
>  typedef struct ARMGenericTimer {
> @@ -881,10 +879,10 @@ struct ArchCPU {
>  uint64_t *cpreg_vmstate_values;
>  int32_t cpreg_vmstate_array_len;
>  
> -DynamicGDBXMLInfo dyn_sysreg_xml;
> -DynamicGDBXMLInfo dyn_svereg_xml;
> -DynamicGDBXMLInfo dyn_m_systemreg_xml;
> -DynamicGDBXMLInfo dyn_m_secextreg_xml;
> +DynamicGDBFeatureInfo dyn_sysreg_feature;
> +DynamicGDBFeatureInfo dyn_svereg_feature;
> +DynamicGDBFeatureInfo dyn_m_systemreg_feature;
> +DynamicGDBFeatureInfo dyn_m_secextreg_feature;
>  
>  /* Timers used by the generic (architected) timer */
>  QEMUTimer *gt_timer[NUM_GTIMERS];
> diff --git a/target/arm/internals.h b/target/arm/internals.h
> index 0f01bc32a8..8421a755af 100644
> --- a/target/arm/internals.h
> +++ b/target/arm/internals.h
> @@ -1388,7 +1388,7 @@ static inline uint64_t pmu_counter_mask(CPUARMState 
> *env)
>  }
>  
>  #ifdef TARGET_AARCH64
> -int arm_gen_dynamic_svereg_xml(CPUState *cpu, int base_reg);
> +GDBFeature *arm_gen_dynamic_svereg_feature(CPUState *cpu, int base_reg);
>  int aarch64_gdb_get_sve_reg(CPUARMState *env, GByteArray *buf, int reg);
>  int aarch64_gdb_set_sve_reg(CPUARMState *env, uint8_t *buf, int reg);
>  int aarch64_gdb_get_fpu_reg(CPUARMState *env, GByteArray *buf, int reg);
> diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c
> index f421c5d041..cd35bac013 100644
> --- a/target/arm/gdbstub.c
> +++ b/target/arm/gdbstub.c
> @@ -25,11 +25,11 @@
>  #include "internals.h"
>  #include "cpregs.h"
>  
> -typedef struct RegisterSysregXmlParam {
> +typedef struct RegisterSysregFeatureParam {
>  CPUState *cs;
>  GString *s;
>  int n;
> -} RegisterSysregXmlParam;
> +} RegisterSysregFeatureParam;
>  
>  /* Old gdb always expect FPA registers.  Newer (xml-aware) gdb only expect
> whatever the target description contains.  Due to a historical mishap
> @@ -243,7 +243,7 @@ static int arm_gdb_get_sysreg(CPUARMState *env, 
> GByteArray *buf, int reg)
>  const ARMCPRegInfo *ri;
>  uint32_t key;
>  
> -key = cpu->dyn_sysreg_xml.data.cpregs.keys[reg];
> +key = cpu->dyn_sysreg_feature.data.cpregs.keys[reg];
>  ri = get_arm_cp_reginfo(cpu->cp_regs, key);
>  if (ri) {
>  if (cpreg_field_is_64bit(ri)) {
> @@ -260,7 +260,8 @@ static int arm_gdb_set_sysreg(CPUARMState *env, uint8_t 
> *buf, int reg)
>  return 0;
>  }
>  
> -static void arm_gen_one_xml_sysreg_tag(GString *s, DynamicGDBXMLInfo 
> *dyn_xml,
> +static void arm_gen_one_feature_sysreg(GString *s,
> +   DynamicGDBFeatureInfo *dyn_feature,
> ARMCPRegInfo *ri, uint32_t ri_key,
> int bitsize, int regnum)
>  {
> @@ -268,25 +269,25 @@ static void arm_gen_one_xml_sysreg_tag(GString *s, 
> DynamicGDBXMLInfo *dyn_xml,
>  g_string_append_printf(s, " bitsize=\"%d\"", bitsize);
>  g_string_append_printf(s, " regnum=\"%d\"", regnum);
>  g_string_append_printf(s, " group=\"cp_regs\"/>");
> -dyn_xml->data

Re: [PATCH v2] ui/dbus: implement damage regions for GL

2023-08-14 Thread Bilal Elmoussaoui
That is fine; it is not something we need urgently, as we still need a
kernel patch to make virtio-gpu use the KMS atomic API in Mutter. Am I
supposed to send a new patch with the "Reviewed-by" tag in the commit
message, or not?

On Mon, Aug 14, 2023 at 3:02 PM Marc-André Lureau <
marcandre.lur...@redhat.com> wrote:

> Hi
>
> On Mon, Aug 14, 2023 at 4:58 PM Bilal Elmoussaoui 
> wrote:
> >
> > Currently, when using `-display dbus,gl=on` all updates to the client
> > become "full scanout" updates, meaning there is no way for the client to
> > limit damage regions to the display server.
> >
> > Instead of using an "update count", this patch tracks the damage region
> > and propagates it to the client.
> >
> > This was less of an issue when clients were using GtkGLArea for
> > rendering,
> > as you'd be doing full-surface redraw. To be efficient, the client needs
> > both a DMA-BUF and the damage region to be updated.
> >
> > Co-authored-by: Christian Hergert 
> > Signed-off-by: Bilal Elmoussaoui 
>
> Reviewed-by: Marc-André Lureau 
>
> It could be considered as a fix, but I think we can delay it for the
> next release. Fine with you?
>
> > ---
> >  ui/dbus-listener.c | 32 +---
> >  1 file changed, 25 insertions(+), 7 deletions(-)
> >
> > diff --git a/ui/dbus-listener.c b/ui/dbus-listener.c
> > index 30917271ab..36548a7f52 100644
> > --- a/ui/dbus-listener.c
> > +++ b/ui/dbus-listener.c
> > @@ -26,6 +26,9 @@
> >  #include "qapi/error.h"
> >  #include "sysemu/sysemu.h"
> >  #include "dbus.h"
> > +#ifdef CONFIG_OPENGL
> +#include <pixman.h>
> +#endif
>  #ifdef G_OS_UNIX
>  #include <gio/gunixfdlist.h>
> >  #endif
> > @@ -59,12 +62,15 @@ struct _DBusDisplayListener {
> >
> >  QemuDBusDisplay1Listener *proxy;
> >
> > +#ifdef CONFIG_OPENGL
> > +/* Keep track of the damage region */
> > +pixman_region32_t gl_damage;
> > +#endif
> > +
> >  DisplayChangeListener dcl;
> >  DisplaySurface *ds;
> >  enum share_kind ds_share;
> >
> > -int gl_updates;
> > -
> >  bool ds_mapped;
> >  bool can_share_map;
> >
> > @@ -539,11 +545,16 @@ static void dbus_gl_refresh(DisplayChangeListener
> *dcl)
> >  return;
> >  }
> >
> > -if (ddl->gl_updates) {
> > -dbus_call_update_gl(dcl, 0, 0,
> > -surface_width(ddl->ds),
> surface_height(ddl->ds));
> > -ddl->gl_updates = 0;
> > +int n_rects = pixman_region32_n_rects(&ddl->gl_damage);
> > +
> > +for (int i = 0; i < n_rects; i++) {
> > +pixman_box32_t *box;
> > +box = pixman_region32_rectangles(&ddl->gl_damage, NULL) + i;
> > +/* TODO: Add a UpdateList call to send multiple updates at once
> */
> > +dbus_call_update_gl(dcl, box->x1, box->y1,
> > +box->x2 - box->x1, box->y2 - box->y1);
> >  }
> > +pixman_region32_clear(&ddl->gl_damage);
> >  }
> >  #endif /* OPENGL */
> >
> > @@ -558,7 +569,10 @@ static void
> dbus_gl_gfx_update(DisplayChangeListener *dcl,
> >  {
> >  DBusDisplayListener *ddl = container_of(dcl, DBusDisplayListener,
> dcl);
> >
> > -ddl->gl_updates++;
> > +pixman_region32_t rect_region;
> > +pixman_region32_init_rect(&rect_region, x, y, w, h);
> > +pixman_region32_union(&ddl->gl_damage, &ddl->gl_damage,
> &rect_region);
> > +pixman_region32_fini(&rect_region);
> >  }
> >  #endif
> >
> > @@ -738,6 +752,7 @@ dbus_display_listener_dispose(GObject *object)
> >  g_clear_object(&ddl->d3d11_proxy);
> >  g_clear_pointer(&ddl->peer_process, CloseHandle);
> >  #ifdef CONFIG_OPENGL
> > +pixman_region32_fini(&ddl->gl_damage);
> >  egl_fb_destroy(&ddl->fb);
> >  #endif
> >  #endif
> > @@ -772,6 +787,9 @@
> dbus_display_listener_class_init(DBusDisplayListenerClass *klass)
> >  static void
> >  dbus_display_listener_init(DBusDisplayListener *ddl)
> >  {
> > +#ifdef CONFIG_OPENGL
> > +pixman_region32_init(&ddl->gl_damage);
> > +#endif
> >  }
> >
> >  const char *
> > --
> > 2.41.0
> >
>
>


Re: [PATCH v2] ui/dbus: implement damage regions for GL

2023-08-14 Thread Marc-André Lureau
Hi

On Mon, Aug 14, 2023 at 4:58 PM Bilal Elmoussaoui  wrote:
>
> Currently, when using `-display dbus,gl=on` all updates to the client
> become "full scanout" updates, meaning there is no way for the client to
> limit damage regions to the display server.
>
> Instead of using an "update count", this patch tracks the damage region
> and propagates it to the client.
>
> This was less of an issue when clients were using GtkGLArea for
> rendering,
> as you'd be doing full-surface redraw. To be efficient, the client needs
> both a DMA-BUF and the damage region to be updated.
>
> Co-authored-by: Christian Hergert 
> Signed-off-by: Bilal Elmoussaoui 

Reviewed-by: Marc-André Lureau 

It could be considered as a fix, but I think we can delay it for the
next release. Fine with you?

> ---
>  ui/dbus-listener.c | 32 +---
>  1 file changed, 25 insertions(+), 7 deletions(-)
>
> diff --git a/ui/dbus-listener.c b/ui/dbus-listener.c
> index 30917271ab..36548a7f52 100644
> --- a/ui/dbus-listener.c
> +++ b/ui/dbus-listener.c
> @@ -26,6 +26,9 @@
>  #include "qapi/error.h"
>  #include "sysemu/sysemu.h"
>  #include "dbus.h"
> +#ifdef CONFIG_OPENGL
> +#include <pixman.h>
> +#endif
>  #ifdef G_OS_UNIX
>  #include <gio/gunixfdlist.h>
>  #endif
> @@ -59,12 +62,15 @@ struct _DBusDisplayListener {
>
>  QemuDBusDisplay1Listener *proxy;
>
> +#ifdef CONFIG_OPENGL
> +/* Keep track of the damage region */
> +pixman_region32_t gl_damage;
> +#endif
> +
>  DisplayChangeListener dcl;
>  DisplaySurface *ds;
>  enum share_kind ds_share;
>
> -int gl_updates;
> -
>  bool ds_mapped;
>  bool can_share_map;
>
> @@ -539,11 +545,16 @@ static void dbus_gl_refresh(DisplayChangeListener *dcl)
>  return;
>  }
>
> -if (ddl->gl_updates) {
> -dbus_call_update_gl(dcl, 0, 0,
> -surface_width(ddl->ds), surface_height(ddl->ds));
> -ddl->gl_updates = 0;
> +int n_rects = pixman_region32_n_rects(&ddl->gl_damage);
> +
> +for (int i = 0; i < n_rects; i++) {
> +pixman_box32_t *box;
> +box = pixman_region32_rectangles(&ddl->gl_damage, NULL) + i;
> +/* TODO: Add a UpdateList call to send multiple updates at once */
> +dbus_call_update_gl(dcl, box->x1, box->y1,
> +box->x2 - box->x1, box->y2 - box->y1);
>  }
> +pixman_region32_clear(&ddl->gl_damage);
>  }
>  #endif /* OPENGL */
>
> @@ -558,7 +569,10 @@ static void dbus_gl_gfx_update(DisplayChangeListener 
> *dcl,
>  {
>  DBusDisplayListener *ddl = container_of(dcl, DBusDisplayListener, dcl);
>
> -ddl->gl_updates++;
> +pixman_region32_t rect_region;
> +pixman_region32_init_rect(&rect_region, x, y, w, h);
> +pixman_region32_union(&ddl->gl_damage, &ddl->gl_damage, &rect_region);
> +pixman_region32_fini(&rect_region);
>  }
>  #endif
>
> @@ -738,6 +752,7 @@ dbus_display_listener_dispose(GObject *object)
>  g_clear_object(&ddl->d3d11_proxy);
>  g_clear_pointer(&ddl->peer_process, CloseHandle);
>  #ifdef CONFIG_OPENGL
> +pixman_region32_fini(&ddl->gl_damage);
>  egl_fb_destroy(&ddl->fb);
>  #endif
>  #endif
> @@ -772,6 +787,9 @@ dbus_display_listener_class_init(DBusDisplayListenerClass 
> *klass)
>  static void
>  dbus_display_listener_init(DBusDisplayListener *ddl)
>  {
> +#ifdef CONFIG_OPENGL
> +pixman_region32_init(&ddl->gl_damage);
> +#endif
>  }
>
>  const char *
> --
> 2.41.0
>




Re: [PATCH] ui/dbus: implement damage regions for GL

2023-08-14 Thread Bilal Elmoussaoui
Thanks for the quick review! I have sent a v2 with the requested changes.

On Mon, Aug 14, 2023 at 2:41 PM Marc-André Lureau <
marcandre.lur...@gmail.com> wrote:

> Hi
>
> On Mon, Aug 14, 2023 at 4:10 PM Bilal Elmoussaoui 
> wrote:
> >
> > Currently, when using `-display dbus,gl=on` all updates to the client
> > become "full scanout" updates, meaning there is no way for the client to
> > limit damage regions to the display server.
> >
> > Instead of using an "update count", this patch tracks the damage region
> > and propagates it to the client.
> >
> > This was less of an issue when clients were using GtkGLArea for
> > rendering,
> > as you'd be doing full-surface redraw. To be efficient, the client needs
> > both a DMA-BUF and the damage region to be updated.
> >
> > Co-authored-by: Christian Hergert 
> > Signed-off-by: Bilal Elmoussaoui 
> > ---
> >  ui/dbus-listener.c | 32 
> >  1 file changed, 24 insertions(+), 8 deletions(-)
> >
> > diff --git a/ui/dbus-listener.c b/ui/dbus-listener.c
> > index 30917271ab..d015e8d759 100644
> > --- a/ui/dbus-listener.c
> > +++ b/ui/dbus-listener.c
> > @@ -26,6 +26,9 @@
> >  #include "qapi/error.h"
> >  #include "sysemu/sysemu.h"
> >  #include "dbus.h"
> > +#ifdef CONFIG_OPENGL
> +#include <pixman.h>
> +#endif
>  #ifdef G_OS_UNIX
>  #include <gio/gunixfdlist.h>
> >  #endif
> > @@ -59,12 +62,15 @@ struct _DBusDisplayListener {
> >
> >  QemuDBusDisplay1Listener *proxy;
> >
> > +#ifdef CONFIG_OPENGL
> > +/* Keep track of the damage region */
> > +pixman_region32_t gl_damage;
> > +#endif
>
> I think it should call pixman_region32_init() in
> dbus_display_listener_new(), & _fini() in _dispose()
>
> > +
> >  DisplayChangeListener dcl;
> >  DisplaySurface *ds;
> >  enum share_kind ds_share;
> >
> > -int gl_updates;
> > -
> >  bool ds_mapped;
> >  bool can_share_map;
> >
> > @@ -539,11 +545,16 @@ static void dbus_gl_refresh(DisplayChangeListener
> *dcl)
> >  return;
> >  }
> >
> > -if (ddl->gl_updates) {
> > -dbus_call_update_gl(dcl, 0, 0,
> > -surface_width(ddl->ds),
> surface_height(ddl->ds));
> > -ddl->gl_updates = 0;
> > +int n_rects = pixman_region32_n_rects(&ddl->gl_damage);
> > +
> > +for (int i = 0; i < n_rects; i++) {
> > +pixman_box32_t *box;
> > +box = pixman_region32_rectangles(&ddl->gl_damage, NULL) + i;
> > +
> > +dbus_call_update_gl(dcl, box->x1, box->y1,
> > +box->x2 - box->x1, box->y2 - box->y1);
>
> May be worth to add a "TODO: add Update*List methods" ?
>
> >  }
> > +pixman_region32_clear(&ddl->gl_damage);
> >  }
> >  #endif /* OPENGL */
> >
> > @@ -558,7 +569,10 @@ static void
> dbus_gl_gfx_update(DisplayChangeListener *dcl,
> >  {
> >  DBusDisplayListener *ddl = container_of(dcl, DBusDisplayListener,
> dcl);
> >
> > -ddl->gl_updates++;
> > +pixman_region32_t rect_region;
> > +pixman_region32_init_rect(&rect_region, x, y, w, h);
> > +pixman_region32_union(&ddl->gl_damage, &ddl->gl_damage,
> &rect_region);
> > +pixman_region32_fini(&rect_region);
> >  }
> >  #endif
> >
> > @@ -933,7 +947,9 @@ dbus_display_listener_new(const char *bus_name,
> >  g_object_unref(ddl);
> >  return NULL;
> >  }
> > -
> > +#ifdef CONFIG_OPENGL
> > +pixman_region32_init(&ddl->gl_damage);
> > +#endif
> >  ddl->bus_name = g_strdup(bus_name);
> >  ddl->conn = conn;
> >  ddl->console = console;
> > --
> > 2.41.0
> >
> >
>
> otherwise, lgtm
>
> --
> Marc-André Lureau
>
>


[PATCH v2] ui/dbus: implement damage regions for GL

2023-08-14 Thread Bilal Elmoussaoui
Currently, when using `-display dbus,gl=on` all updates to the client
become "full scanout" updates, meaning there is no way for the client to
limit damage regions to the display server.

Instead of using an "update count", this patch tracks the damage region
and propagates it to the client.

This was less of an issue when clients were using GtkGLArea for
rendering,
as you'd be doing full-surface redraw. To be efficient, the client needs
both a DMA-BUF and the damage region to be updated.

Co-authored-by: Christian Hergert 
Signed-off-by: Bilal Elmoussaoui 
---
 ui/dbus-listener.c | 32 +---
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/ui/dbus-listener.c b/ui/dbus-listener.c
index 30917271ab..36548a7f52 100644
--- a/ui/dbus-listener.c
+++ b/ui/dbus-listener.c
@@ -26,6 +26,9 @@
 #include "qapi/error.h"
 #include "sysemu/sysemu.h"
 #include "dbus.h"
+#ifdef CONFIG_OPENGL
+#include <pixman.h>
+#endif
 #ifdef G_OS_UNIX
 #include <gio/gunixfdlist.h>
 #endif
@@ -59,12 +62,15 @@ struct _DBusDisplayListener {
 
 QemuDBusDisplay1Listener *proxy;
 
+#ifdef CONFIG_OPENGL
+/* Keep track of the damage region */
+pixman_region32_t gl_damage;
+#endif
+
 DisplayChangeListener dcl;
 DisplaySurface *ds;
 enum share_kind ds_share;
 
-int gl_updates;
-
 bool ds_mapped;
 bool can_share_map;
 
@@ -539,11 +545,16 @@ static void dbus_gl_refresh(DisplayChangeListener *dcl)
 return;
 }
 
-if (ddl->gl_updates) {
-dbus_call_update_gl(dcl, 0, 0,
-surface_width(ddl->ds), surface_height(ddl->ds));
-ddl->gl_updates = 0;
+int n_rects = pixman_region32_n_rects(&ddl->gl_damage);
+
+for (int i = 0; i < n_rects; i++) {
+pixman_box32_t *box;
+box = pixman_region32_rectangles(&ddl->gl_damage, NULL) + i;
+/* TODO: Add a UpdateList call to send multiple updates at once */
+dbus_call_update_gl(dcl, box->x1, box->y1,
+box->x2 - box->x1, box->y2 - box->y1);
 }
+pixman_region32_clear(&ddl->gl_damage);
 }
 #endif /* OPENGL */
 
@@ -558,7 +569,10 @@ static void dbus_gl_gfx_update(DisplayChangeListener *dcl,
 {
 DBusDisplayListener *ddl = container_of(dcl, DBusDisplayListener, dcl);
 
-ddl->gl_updates++;
+pixman_region32_t rect_region;
+pixman_region32_init_rect(&rect_region, x, y, w, h);
+pixman_region32_union(&ddl->gl_damage, &ddl->gl_damage, &rect_region);
+pixman_region32_fini(&rect_region);
 }
 #endif
 
@@ -738,6 +752,7 @@ dbus_display_listener_dispose(GObject *object)
 g_clear_object(&ddl->d3d11_proxy);
 g_clear_pointer(&ddl->peer_process, CloseHandle);
 #ifdef CONFIG_OPENGL
+pixman_region32_fini(&ddl->gl_damage);
 egl_fb_destroy(&ddl->fb);
 #endif
 #endif
@@ -772,6 +787,9 @@ dbus_display_listener_class_init(DBusDisplayListenerClass 
*klass)
 static void
 dbus_display_listener_init(DBusDisplayListener *ddl)
 {
+#ifdef CONFIG_OPENGL
+pixman_region32_init(&ddl->gl_damage);
+#endif
 }
 
 const char *
-- 
2.41.0
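
For reference, a minimal standalone sketch of the accumulate-then-flush
pattern the patch uses, with nothing but the pixman API (illustrative
only; the QEMU specifics are omitted):

#include <pixman.h>
#include <stdio.h>

int main(void)
{
    pixman_region32_t damage, rect;

    pixman_region32_init(&damage);

    /* Two gfx_update()-style calls merge into one region... */
    pixman_region32_init_rect(&rect, 0, 0, 100, 100);
    pixman_region32_union(&damage, &damage, &rect);
    pixman_region32_fini(&rect);

    pixman_region32_init_rect(&rect, 50, 50, 100, 100);
    pixman_region32_union(&damage, &damage, &rect);
    pixman_region32_fini(&rect);

    /* ...and the refresh pass walks the minimal set of boxes. */
    int n = pixman_region32_n_rects(&damage);
    pixman_box32_t *boxes = pixman_region32_rectangles(&damage, NULL);
    for (int i = 0; i < n; i++) {
        printf("update %d,%d %dx%d\n", boxes[i].x1, boxes[i].y1,
               boxes[i].x2 - boxes[i].x1, boxes[i].y2 - boxes[i].y1);
    }

    pixman_region32_clear(&damage);
    pixman_region32_fini(&damage);
    return 0;
}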




Re: [RFC PATCH 06/24] hw/core/cpu: Replace gdb_core_xml_file with gdb_core_feature

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> This is a tree-wide change to replace gdb_core_xml_file, the path to
> GDB XML file with gdb_core_feature, the pointer to GDBFeature. This
> also replaces the values assigned to gdb_num_core_regs with the
> num_regs member of GDBFeature where applicable to remove magic numbers.
>
> A following change will utilize additional information provided by
> GDBFeature to simplify XML file lookup.
>
> Signed-off-by: Akihiko Odaki 
> ---
>  include/hw/core/cpu.h   | 5 +++--
>  target/s390x/cpu.h  | 2 --
>  gdbstub/gdbstub.c   | 6 +++---
>  target/arm/cpu.c| 4 ++--
>  target/arm/cpu64.c  | 4 ++--
>  target/arm/tcg/cpu32.c  | 3 ++-
>  target/avr/cpu.c| 4 ++--
>  target/hexagon/cpu.c| 2 +-
>  target/i386/cpu.c   | 7 +++
>  target/loongarch/cpu.c  | 4 ++--
>  target/m68k/cpu.c   | 7 ---
>  target/microblaze/cpu.c | 4 ++--
>  target/ppc/cpu_init.c   | 4 ++--
>  target/riscv/cpu.c  | 7 ---
>  target/rx/cpu.c | 4 ++--
>  target/s390x/cpu.c  | 4 ++--
>  16 files changed, 36 insertions(+), 35 deletions(-)
>
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index fdcbe87352..84219c1885 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -23,6 +23,7 @@
>  #include "hw/qdev-core.h"
>  #include "disas/dis-asm.h"
>  #include "exec/cpu-common.h"
> +#include "exec/gdbstub.h"
>  #include "exec/hwaddr.h"
>  #include "exec/memattrs.h"
>  #include "qapi/qapi-types-run-state.h"
> @@ -127,7 +128,7 @@ struct SysemuCPUOps;
>   *   breakpoint.  Used by AVR to handle a gdb mis-feature with
>   *   its Harvard architecture split code and data.
>   * @gdb_num_core_regs: Number of core registers accessible to GDB.

It seems redundant to have this when gdb_core_feature already
encapsulates it, especially since...

> - * @gdb_core_xml_file: File name for core registers GDB XML description.
> + * @gdb_core_feature: GDB core feature description.
>   * @gdb_stop_before_watchpoint: Indicates whether GDB expects the CPU to stop
>   *   before the insn which triggers a watchpoint rather than after 
> it.
>   * @gdb_arch_name: Optional callback that returns the architecture name known
> @@ -163,7 +164,7 @@ struct CPUClass {
>  int (*gdb_write_register)(CPUState *cpu, uint8_t *buf, int reg);
>  vaddr (*gdb_adjust_breakpoint)(CPUState *cpu, vaddr addr);
>  
> -const char *gdb_core_xml_file;
> +const GDBFeature *gdb_core_feature;
>  gchar * (*gdb_arch_name)(CPUState *cpu);
>  const char * (*gdb_get_dynamic_xml)(CPUState *cpu, const char *xmlname);
>  

> diff --git a/target/arm/cpu.c b/target/arm/cpu.c
> index d71a162070..a206ab6b1b 100644
> --- a/target/arm/cpu.c
> +++ b/target/arm/cpu.c
> @@ -2353,7 +2353,6 @@ static void arm_cpu_class_init(ObjectClass *oc, void 
> *data)
>  #ifndef CONFIG_USER_ONLY
>  cc->sysemu_ops = &arm_sysemu_ops;
>  #endif
> -cc->gdb_num_core_regs = 26;
>  cc->gdb_arch_name = arm_gdb_arch_name;
>  cc->gdb_get_dynamic_xml = arm_gdb_get_dynamic_xml;
>  cc->gdb_stop_before_watchpoint = true;
> @@ -2378,7 +2377,8 @@ static void cpu_register_class_init(ObjectClass *oc, 
> void *data)
>  CPUClass *cc = CPU_CLASS(acc);
>  
>  acc->info = data;
> -cc->gdb_core_xml_file = "arm-core.xml";
> +cc->gdb_core_feature = gdb_find_static_feature("arm-core.xml");
> +cc->gdb_num_core_regs = cc->gdb_core_feature->num_regs;

You are doing assignments like this. I think something like this in
gdbstub:

modified   gdbstub/gdbstub.c
@@ -440,7 +440,7 @@ int gdb_read_register(CPUState *cpu, GByteArray *buf, int 
reg, bool has_xml)
 CPUArchState *env = cpu->env_ptr;
 GDBRegisterState *r;
 
-if (reg < cc->gdb_num_core_regs) {
+if (reg < cc->gdb_core_feature->num_regs) {
 return cc->gdb_read_register(cpu, buf, reg, has_xml);
 }
 
@@ -459,7 +459,7 @@ static int gdb_write_register(CPUState *cpu, uint8_t 
*mem_buf, int reg,
 CPUArchState *env = cpu->env_ptr;
 GDBRegisterState *r;
 
-if (reg < cc->gdb_num_core_regs) {
+if (reg < cc->gdb_core_feature->num_regs) {
 return cc->gdb_write_register(cpu, mem_buf, reg, has_xml);
 }

makes most of the uses go away. Some of the other arches might need
target specific tweaks.



-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



qemu-system-riscv64 -cpu host uses wrong privilege spec version

2023-08-14 Thread Andreas Schwab
When running a KVM enabled qemu inside a qemu instance, I get these warnings:

$ qemu-system-riscv64 -cpu host -machine virt,accel=kvm
qemu-system-riscv64: warning: disabling h extension for hart 0x 
because privilege spec version does not match
qemu-system-riscv64: warning: disabling zicbom extension for hart 
0x because privilege spec version does not match
qemu-system-riscv64: warning: disabling zicboz extension for hart 
0x because privilege spec version does not match
qemu-system-riscv64: warning: disabling zawrs extension for hart 
0x because privilege spec version does not match
qemu-system-riscv64: warning: disabling zba extension for hart 
0x because privilege spec version does not match
qemu-system-riscv64: warning: disabling zbb extension for hart 
0x because privilege spec version does not match
qemu-system-riscv64: warning: disabling zbc extension for hart 
0x because privilege spec version does not match
qemu-system-riscv64: warning: disabling zbs extension for hart 
0x because privilege spec version does not match
qemu-system-riscv64: warning: disabling sstc extension for hart 
0x because privilege spec version does not match
qemu-system-riscv64: warning: disabling svadu extension for hart 
0x because privilege spec version does not match

The qemu instance is running openSUSE Tumbleweed with -cpu rv64,h=on,sv48=on.

-- 
Andreas Schwab, SUSE Labs, sch...@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."



Re: [PATCH] ui/dbus: implement damage regions for GL

2023-08-14 Thread Marc-André Lureau
Hi

On Mon, Aug 14, 2023 at 4:10 PM Bilal Elmoussaoui  wrote:
>
> Currently, when using `-display dbus,gl=on` all updates to the client
> become "full scanout" updates, meaning there is no way for the client to
> limit damage regions to the display server.
>
> Instead of using an "update count", this patch tracks the damage region
> and propagates it to the client.
>
> This was less of an issue when clients were using GtkGLArea for
> rendering,
> as you'd be doing full-surface redraw. To be efficient, the client needs
> both a DMA-BUF and the damage region to be updated.
>
> Co-authored-by: Christian Hergert 
> Signed-off-by: Bilal Elmoussaoui 
> ---
>  ui/dbus-listener.c | 32 
>  1 file changed, 24 insertions(+), 8 deletions(-)
>
> diff --git a/ui/dbus-listener.c b/ui/dbus-listener.c
> index 30917271ab..d015e8d759 100644
> --- a/ui/dbus-listener.c
> +++ b/ui/dbus-listener.c
> @@ -26,6 +26,9 @@
>  #include "qapi/error.h"
>  #include "sysemu/sysemu.h"
>  #include "dbus.h"
> +#ifdef CONFIG_OPENGL
> +#include 
> +#endif
>  #ifdef G_OS_UNIX
>  #include 
>  #endif
> @@ -59,12 +62,15 @@ struct _DBusDisplayListener {
>
>  QemuDBusDisplay1Listener *proxy;
>
> +#ifdef CONFIG_OPENGL
> +/* Keep track of the damage region */
> +pixman_region32_t gl_damage;
> +#endif

I think it should call pixman_region32_init() in
dbus_display_listener_new(), & _fini() in _dispose()

> +
>  DisplayChangeListener dcl;
>  DisplaySurface *ds;
>  enum share_kind ds_share;
>
> -int gl_updates;
> -
>  bool ds_mapped;
>  bool can_share_map;
>
> @@ -539,11 +545,16 @@ static void dbus_gl_refresh(DisplayChangeListener *dcl)
>  return;
>  }
>
> -if (ddl->gl_updates) {
> -dbus_call_update_gl(dcl, 0, 0,
> -surface_width(ddl->ds), surface_height(ddl->ds));
> -ddl->gl_updates = 0;
> +int n_rects = pixman_region32_n_rects(&ddl->gl_damage);
> +
> +for (int i = 0; i < n_rects; i++) {
> +pixman_box32_t *box;
> +box = pixman_region32_rectangles(&ddl->gl_damage, NULL) + i;
> +
> +dbus_call_update_gl(dcl, box->x1, box->y1,
> +box->x2 - box->x1, box->y2 - box->y1);

May be worth to add a "TODO: add Update*List methods" ?

>  }
> +pixman_region32_clear(&ddl->gl_damage);
>  }
>  #endif /* OPENGL */
>
> @@ -558,7 +569,10 @@ static void dbus_gl_gfx_update(DisplayChangeListener 
> *dcl,
>  {
>  DBusDisplayListener *ddl = container_of(dcl, DBusDisplayListener, dcl);
>
> -ddl->gl_updates++;
> +pixman_region32_t rect_region;
> +pixman_region32_init_rect(&rect_region, x, y, w, h);
> +pixman_region32_union(&ddl->gl_damage, &ddl->gl_damage, &rect_region);
> +pixman_region32_fini(&rect_region);
>  }
>  #endif
>
> @@ -933,7 +947,9 @@ dbus_display_listener_new(const char *bus_name,
>  g_object_unref(ddl);
>  return NULL;
>  }
> -
> +#ifdef CONFIG_OPENGL
> +pixman_region32_init(&ddl->gl_damage);
> +#endif
>  ddl->bus_name = g_strdup(bus_name);
>  ddl->conn = conn;
>  ddl->console = console;
> --
> 2.41.0
>
>

otherwise, lgtm

-- 
Marc-André Lureau



Re: [PATCH] i386/xen: Don't advertise XENFEAT_supervisor_mode_kernel

2023-08-14 Thread Paul Durrant

On 08/08/2023 18:08, David Woodhouse wrote:

From: David Woodhouse 

XENFEAT_supervisor_mode_kernel shouldn't be set for HVM guests. It
confuses lscpu into thinking it's running in PVH mode.

No non-cosmetic effects have been observed so far.

Signed-off-by: David Woodhouse 
---
Only really cosmetic. Don't feel strongly about whether it makes 8.1.

  target/i386/kvm/xen-emu.c | 1 -
  1 file changed, 1 deletion(-)



Reviewed-by: Paul Durrant 




Re: [RFC PATCH post-8.1] hw/xen: Clean up event channel 'type_val' handling to use union

2023-08-14 Thread Paul Durrant

On 03/08/2023 16:28, David Woodhouse wrote:

From: David Woodhouse 

A previous implementation of this stuff used a 64-bit field for all of
the port information (vcpu/type/type_val) and did atomic exchanges on
them. When I implemented that in Qemu I regretted my life choices and
just kept it simple with locking instead.

So there's no need for the XenEvtchnPort to be so simplistic. We can
use a union for the pirq/virq/interdomain information, which lets us
keep a separate bit for the 'remote domain' in interdomain ports. A
single bit is enough since the only possible targets are loopback or
qemu itself.

So now we can ditch PORT_INFO_TYPEVAL_REMOTE_QEMU and the horrid
manual masking, although the in-memory representation is identical
so there's no change in the saved state ABI.

Signed-off-by: David Woodhouse 
---
Thought this would be a nice cleanup to avoid abusing `type_val` for
various different purposes, and especially the top bit of it for
interdomain ports. But having done it I find myself fairly ambivalent
about it. Does anyone feel strongly either way?

  hw/i386/kvm/xen_evtchn.c | 124 ---
  1 file changed, 64 insertions(+), 60 deletions(-)



I don't feel that strongly, but using the union+bitfield approach is a 
little nicer to read and only makes the code 4 lines longer.




diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index a731738411..446ae46022 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -58,7 +58,15 @@ OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
  typedef struct XenEvtchnPort {
  uint32_t vcpu;  /* Xen/ACPI vcpu_id */
  uint16_t type;  /* EVTCHNSTAT_ */
-uint16_t type_val;  /* pirq# / virq# / remote port according to type */
+union {
+uint16_t type_val;  /* pirq# / virq# / remote port according to type */


Not sure the comment is that valuable any more... and maybe just 'val' 
now rather than 'type_val'?



+uint16_t pirq;
+uint16_t virq;
+struct {
+uint16_t port:15;
+uint16_t to_qemu:1; /* Only two targets; qemu or loopback */


I'd have switched the sense and called this 'loopback'... since it's the
less likely case.



+} interdomain;
+} u;
  } XenEvtchnPort;
  
  /* 32-bit compatibility definitions, also used natively in 32-bit build */

@@ -210,16 +218,16 @@ static int xen_evtchn_post_load(void *opaque, int 
version_id)
  XenEvtchnPort *p = &s->port_table[i];
  
  if (p->type == EVTCHNSTAT_pirq) {

-assert(p->type_val);
-assert(p->type_val < s->nr_pirqs);
+assert(p->u.pirq);
+assert(p->u.pirq < s->nr_pirqs);
  
  /*

   * Set the gsi to IRQ_UNBOUND; it may be changed to an actual
   * GSI# below, or to IRQ_MSI_EMU when the MSI table snooping
   * catches up with it.
   */
-s->pirq[p->type_val].gsi = IRQ_UNBOUND;
-s->pirq[p->type_val].port = i;
+s->pirq[p->u.pirq].gsi = IRQ_UNBOUND;
+s->pirq[p->u.pirq].port = i;
  }
  }
  /* Rebuild s->pirq[].gsi mapping */
@@ -243,7 +251,7 @@ static const VMStateDescription xen_evtchn_port_vmstate = {
  .fields = (VMStateField[]) {
  VMSTATE_UINT32(vcpu, XenEvtchnPort),
  VMSTATE_UINT16(type, XenEvtchnPort),
-VMSTATE_UINT16(type_val, XenEvtchnPort),
+VMSTATE_UINT16(u.type_val, XenEvtchnPort),
  VMSTATE_END_OF_LIST()
  }
  };
@@ -599,14 +607,13 @@ static void unbind_backend_ports(XenEvtchnState *s)
  
  for (i = 1; i < s->nr_ports; i++) {

  p = &s->port_table[i];
-if (p->type == EVTCHNSTAT_interdomain &&
-(p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU)) {
-evtchn_port_t be_port = p->type_val & 
PORT_INFO_TYPEVAL_REMOTE_PORT_MASK;
+if (p->type == EVTCHNSTAT_interdomain && p->u.interdomain.to_qemu) {
+evtchn_port_t be_port = p->u.interdomain.port;
  
  if (s->be_handles[be_port]) {

  /* This part will be overwritten on the load anyway. */
  p->type = EVTCHNSTAT_unbound;
-p->type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+p->u.interdomain.port = 0;
  
  /* Leave the backend port open and unbound too. */

  if (kvm_xen_has_cap(EVTCHN_SEND)) {
@@ -644,7 +651,7 @@ int xen_evtchn_status_op(struct evtchn_status *status)
  
  switch (p->type) {

  case EVTCHNSTAT_unbound:
-if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+if (p->u.interdomain.to_qemu) {
  status->u.unbound.dom = DOMID_QEMU;
  } else {
  status->u.unbound.dom = xen_domid;
@@ -652,22 +659,21 @@ int xen_evtchn_status_op(struct evtchn_status *status)
  break;
  
  case EVTCHNSTAT_interdomain:

-if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_
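
To illustrate the "in-memory representation is identical" point, here is a
standalone sketch (names shortened; note that bit-field ordering within a
storage unit is implementation-defined, so the layout matches the old
0x8000 mask by ABI convention, not by the C standard):

#include <assert.h>
#include <stdint.h>

typedef union {
    uint16_t type_val;
    uint16_t pirq;
    uint16_t virq;
    struct {
        uint16_t port:15;
        uint16_t to_qemu:1;   /* was PORT_INFO_TYPEVAL_REMOTE_QEMU, bit 15 */
    } interdomain;
} PortVal;

_Static_assert(sizeof(PortVal) == sizeof(uint16_t),
               "union must not change the in-memory representation");

int main(void)
{
    PortVal v = { .type_val = 0 };

    v.interdomain.to_qemu = 1;
    v.interdomain.port = 42;

    /* Holds on the little-endian ABIs QEMU's Xen emulation targets, where
     * bit-fields are allocated from the least significant bit upwards. */
    assert(v.type_val == (0x8000 | 42));
    return 0;
}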

Re: [PATCH 1/2] vhost-user: fix lost reconnect

2023-08-14 Thread Raphael Norwitz
Why can’t we rather fix this by adding an “event_cb” param to 
vhost_user_async_close and then call qemu_chr_fe_set_handlers in 
vhost_user_async_close_bh()?

Even if calling vhost_dev_cleanup() twice is safe today, I worry future changes 
may easily stumble over the reconnect case and introduce crashes or double 
frees.
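
Roughly like this, as a sketch (the event_cb field is illustrative, and
the qemu_chr_fe_set_handlers() arguments would have to match what each
device registered in the first place):

typedef struct {
    vu_async_close_fn cb;
    DeviceState *dev;
    CharBackend *cd;
    struct vhost_dev *vhost;
    IOEventHandler *event_cb;      /* new: e.g. vhost_user_blk_event */
} VhostAsyncCallback;

static void vhost_user_async_close_bh(void *opaque)
{
    VhostAsyncCallback *data = opaque;

    data->cb(data->dev);

    /* Re-install the chardev event handler so a later CHR_EVENT_OPENED
     * can trigger the reconnect again. */
    qemu_chr_fe_set_handlers(data->cd, NULL, NULL, data->event_cb,
                             NULL, data->dev, NULL, true);

    g_free(data);
}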


> On Aug 4, 2023, at 1:29 AM, Li Feng  wrote:
> 
> When the vhost-user is reconnecting to the backend, and the vhost-user
> fails at get_features in vhost_dev_init(), the reconnect will fail
> and will never be retriggered.
> 
> The reason is:
> When the vhost-user fail at get_features, the vhost_dev_cleanup will be called
> immediately.
> 
> vhost_dev_cleanup calls 'memset(hdev, 0, sizeof(struct vhost_dev))'.
> 
> The reconnect path is:
> vhost_user_blk_event
>   vhost_user_async_close(.. vhost_user_blk_disconnect ..)
> qemu_chr_fe_set_handlers <- clear the notifier callback
>   schedule vhost_user_async_close_bh
> 
> The vhost->vdev is null, so the vhost_user_blk_disconnect will not be
> called, then the event fd callback will not be reinstalled.
> 
> With this patch, the vhost_user_blk_disconnect will call the
> vhost_dev_cleanup() again, it's safe.
> 
> All vhost-user devices have this issue, including vhost-user-blk/scsi.
> 
> Fixes: 71e076a07d ("hw/virtio: generalise CHR_EVENT_CLOSED handling")
> 
> Signed-off-by: Li Feng 
> ---
> hw/virtio/vhost-user.c | 10 +-
> 1 file changed, 1 insertion(+), 9 deletions(-)
> 
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index 8dcf049d42..697b403fe2 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -2648,16 +2648,8 @@ typedef struct {
> static void vhost_user_async_close_bh(void *opaque)
> {
> VhostAsyncCallback *data = opaque;
> -struct vhost_dev *vhost = data->vhost;
> 
> -/*
> - * If the vhost_dev has been cleared in the meantime there is
> - * nothing left to do as some other path has completed the
> - * cleanup.
> - */
> -if (vhost->vdev) {
> -data->cb(data->dev);
> -}
> +data->cb(data->dev);
> 
> g_free(data);
> }
> -- 
> 2.41.0
> 



Re: [PATCH 2/2] vhost: Add Error parameter to vhost_scsi_common_start()

2023-08-14 Thread Raphael Norwitz
Thanks for the cleanup! A few comments.

> On Aug 4, 2023, at 1:29 AM, Li Feng  wrote:
> 
> Add a Error parameter to report the real error, like vhost-user-blk.
> 
> Signed-off-by: Li Feng 
> ---
> hw/scsi/vhost-scsi-common.c   | 17 ++---
> hw/scsi/vhost-scsi.c  |  5 +++--
> hw/scsi/vhost-user-scsi.c | 14 --
> include/hw/virtio/vhost-scsi-common.h |  2 +-
> 4 files changed, 22 insertions(+), 16 deletions(-)
> 
> diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c
> index a61cd0e907..392587dfb5 100644
> --- a/hw/scsi/vhost-scsi-common.c
> +++ b/hw/scsi/vhost-scsi-common.c
> @@ -16,6 +16,7 @@
>  */
> 
> #include "qemu/osdep.h"
> +#include "qapi/error.h"
> #include "qemu/error-report.h"
> #include "qemu/module.h"
> #include "hw/virtio/vhost.h"
> @@ -25,7 +26,7 @@
> #include "hw/virtio/virtio-access.h"
> #include "hw/fw-path-provider.h"
> 
> -int vhost_scsi_common_start(VHostSCSICommon *vsc)
> +int vhost_scsi_common_start(VHostSCSICommon *vsc, Error **errp)
> {
> int ret, i;
> VirtIODevice *vdev = VIRTIO_DEVICE(vsc);
> @@ -35,18 +36,19 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
> VirtIOSCSICommon *vs = (VirtIOSCSICommon *)vsc;
> 
> if (!k->set_guest_notifiers) {
> -error_report("binding does not support guest notifiers");
> +error_setg(errp, "binding does not support guest notifiers");
> return -ENOSYS;
> }
> 
> ret = vhost_dev_enable_notifiers(&vsc->dev, vdev);
> if (ret < 0) {
> +error_setg_errno(errp, -ret, "Error enabling host notifiers");
> return ret;
> }
> 
> ret = k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, true);
> if (ret < 0) {
> -error_report("Error binding guest notifier");
> +error_setg_errno(errp, -ret, "Error binding guest notifier");
> goto err_host_notifiers;
> }
> 
> @@ -54,7 +56,7 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
> 
> ret = vhost_dev_prepare_inflight(&vsc->dev, vdev);
> if (ret < 0) {
> -error_report("Error setting inflight format: %d", -ret);

Curious why you’re adding the error value to the string. Isn’t it redundant 
since we pass it in as the second param?
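
For illustration, error_setg_errno() already appends the human-readable
errno text, so with ret == -EINVAL:

error_setg_errno(errp, -ret, "Error setting inflight format: %d", -ret);
/* -> "Error setting inflight format: 22: Invalid argument" */

error_setg_errno(errp, -ret, "Error setting inflight format");
/* -> "Error setting inflight format: Invalid argument" */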

> +error_setg_errno(errp, -ret, "Error setting inflight format: %d", 
> -ret);
> goto err_guest_notifiers;
> }
> 
> @@ -64,21 +66,22 @@ int vhost_scsi_common_start(VHostSCSICommon *vsc)
> vs->conf.virtqueue_size,
> vsc->inflight);
> if (ret < 0) {
> -error_report("Error getting inflight: %d", -ret);

Ditto

> +error_setg_errno(errp, -ret, "Error getting inflight: %d",
> + -ret);
> goto err_guest_notifiers;
> }
> }
> 
> ret = vhost_dev_set_inflight(&vsc->dev, vsc->inflight);
> if (ret < 0) {
> -error_report("Error setting inflight: %d", -ret);
> +error_setg_errno(errp, -ret, "Error setting inflight: %d", -ret);
> goto err_guest_notifiers;
> }
> }
> 
> ret = vhost_dev_start(&vsc->dev, vdev, true);
> if (ret < 0) {
> -error_report("Error start vhost dev");

“Error starting vhost dev”?

> +error_setg_errno(errp, -ret, "Error start vhost dev");
> goto err_guest_notifiers;
> }
> 
> diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
> index 443f67daa4..01a3ab4277 100644
> --- a/hw/scsi/vhost-scsi.c
> +++ b/hw/scsi/vhost-scsi.c
> @@ -75,6 +75,7 @@ static int vhost_scsi_start(VHostSCSI *s)
> int ret, abi_version;
> VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
> const VhostOps *vhost_ops = vsc->dev.vhost_ops;
> +Error *local_err = NULL;
> 
> ret = vhost_ops->vhost_scsi_get_abi_version(&vsc->dev, &abi_version);
> if (ret < 0) {
> @@ -88,14 +89,14 @@ static int vhost_scsi_start(VHostSCSI *s)
> return -ENOSYS;
> }
> 
> -ret = vhost_scsi_common_start(vsc);
> +ret = vhost_scsi_common_start(vsc, &local_err);
> if (ret < 0) {
> return ret;
> }
> 
> ret = vhost_scsi_set_endpoint(s);
> if (ret < 0) {
> -error_report("Error setting vhost-scsi endpoint");
> +error_reportf_err(local_err, "Error setting vhost-scsi endpoint");
> vhost_scsi_common_stop(vsc);
> }
> 
> diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
> index a7fa8e8df2..d368171e28 100644
> --- a/hw/scsi/vhost-user-scsi.c
> +++ b/hw/scsi/vhost-user-scsi.c
> @@ -43,12 +43,12 @@ enum VhostUserProtocolFeature {
> VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13,
> };
> 
> -static int vhost_user_scsi_start(VHostUserSCSI *s)
> +static int vhost_user_scsi_start(VHostUserSCSI *s, Error **errp)
> {
> VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s);
> int ret;
> 
> -ret = vhost_scsi_common_start(vsc);
> +ret = vhost_scsi_common_start(vs

[PATCH] ui/dbus: implement damage regions for GL

2023-08-14 Thread Bilal Elmoussaoui
Currently, when using `-display dbus,gl=on` all updates to the client
become "full scanout" updates, meaning there is no way for the client to
limit damage regions to the display server.

Instead of using an "update count", this patch tracks the damage region
and propagates it to the client.

This was less of an issue when clients were using GtkGLArea for
rendering,
as you'd be doing full-surface redraw. To be efficient, the client needs
both a DMA-BUF and the damage region to be updated.

Co-authored-by: Christian Hergert 
Signed-off-by: Bilal Elmoussaoui 
---
 ui/dbus-listener.c | 32 
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/ui/dbus-listener.c b/ui/dbus-listener.c
index 30917271ab..d015e8d759 100644
--- a/ui/dbus-listener.c
+++ b/ui/dbus-listener.c
@@ -26,6 +26,9 @@
 #include "qapi/error.h"
 #include "sysemu/sysemu.h"
 #include "dbus.h"
+#ifdef CONFIG_OPENGL
+#include 
+#endif
 #ifdef G_OS_UNIX
 #include 
 #endif
@@ -59,12 +62,15 @@ struct _DBusDisplayListener {
 
 QemuDBusDisplay1Listener *proxy;
 
+#ifdef CONFIG_OPENGL
+/* Keep track of the damage region */
+pixman_region32_t gl_damage;
+#endif
+
 DisplayChangeListener dcl;
 DisplaySurface *ds;
 enum share_kind ds_share;
 
-int gl_updates;
-
 bool ds_mapped;
 bool can_share_map;
 
@@ -539,11 +545,16 @@ static void dbus_gl_refresh(DisplayChangeListener *dcl)
 return;
 }
 
-if (ddl->gl_updates) {
-dbus_call_update_gl(dcl, 0, 0,
-surface_width(ddl->ds), surface_height(ddl->ds));
-ddl->gl_updates = 0;
+int n_rects = pixman_region32_n_rects(&ddl->gl_damage);
+
+for (int i = 0; i < n_rects; i++) {
+pixman_box32_t *box;
+box = pixman_region32_rectangles(&ddl->gl_damage, NULL) + i;
+
+dbus_call_update_gl(dcl, box->x1, box->y1,
+box->x2 - box->x1, box->y2 - box->y1);
 }
+pixman_region32_clear(&ddl->gl_damage);
 }
 #endif /* OPENGL */
 
@@ -558,7 +569,10 @@ static void dbus_gl_gfx_update(DisplayChangeListener *dcl,
 {
 DBusDisplayListener *ddl = container_of(dcl, DBusDisplayListener, dcl);
 
-ddl->gl_updates++;
+pixman_region32_t rect_region;
+pixman_region32_init_rect(&rect_region, x, y, w, h);
+pixman_region32_union(&ddl->gl_damage, &ddl->gl_damage, &rect_region);
+pixman_region32_fini(&rect_region);
 }
 #endif
 
@@ -933,7 +947,9 @@ dbus_display_listener_new(const char *bus_name,
 g_object_unref(ddl);
 return NULL;
 }
-
+#ifdef CONFIG_OPENGL
+pixman_region32_init(&ddl->gl_damage);
+#endif
 ddl->bus_name = g_strdup(bus_name);
 ddl->conn = conn;
 ddl->console = console;
-- 
2.41.0




Re: [RFC PATCH 04/24] gdbstub: Introduce gdb_find_static_feature()

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> This function is useful to determine the number of registers exposed to
> GDB from the XML name.
>
> Signed-off-by: Akihiko Odaki 
> ---
>  include/exec/gdbstub.h |  2 ++
>  gdbstub/gdbstub.c  | 13 +
>  2 files changed, 15 insertions(+)
>
> diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h
> index 22e5add5b1..3115dc21c0 100644
> --- a/include/exec/gdbstub.h
> +++ b/include/exec/gdbstub.h
> @@ -34,6 +34,8 @@ void gdb_register_coprocessor(CPUState *cpu,
>   */
>  int gdbserver_start(const char *port_or_device);
>  
> +const GDBFeature *gdb_find_static_feature(const char *xmlname);
> +
>  void gdb_set_stop_cpu(CPUState *cpu);
>  
>  /**
> diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c
> index fad70200d8..6d9cef5b95 100644
> --- a/gdbstub/gdbstub.c
> +++ b/gdbstub/gdbstub.c
> @@ -414,6 +414,19 @@ static const char *get_feature_xml(const char *p, const 
> char **newp,
>  return name ? gdb_features[i].xml : NULL;
>  }
>  
> +const GDBFeature *gdb_find_static_feature(const char *xmlname)
> +{
> +const GDBFeature *feature;
> +
> +for (feature = gdb_features; feature->xmlname; feature++) {
> +if (!strcmp(feature->xmlname, xmlname)) {

I'd prefer g_strcmp0(feature->xmlname, xmlname) == 0 but either way:

Reviewed-by: Alex Bennée 
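
For reference, the NULL-safety difference (a tiny standalone illustration,
not part of the patch):

#include <glib.h>

int main(void)
{
    /* strcmp(NULL, ...) is undefined behaviour; g_strcmp0() instead
     * sorts NULL before any non-NULL string. */
    g_assert(g_strcmp0(NULL, "xml") < 0);
    g_assert(g_strcmp0("xml", "xml") == 0);
    return 0;
}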

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 03/24] gdbstub: Add num_regs member to GDBFeature

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> Currently the number of registers exposed to GDB is written as magic
> numbers in code. Derive the number of registers GDB actually sees from
> XML files to replace the magic numbers in code later.
>
> Signed-off-by: Akihiko Odaki 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 02/24] gdbstub: Introduce GDBFeature structure

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> Before this change, the information from an XML file was stored in an
> array that is not descriptive. Introduce a dedicated structure type to
> make it easier to understand and to extend with more fields.
>
> Signed-off-by: Akihiko Odaki 

with Philippe's suggested updates:

Reviewed-by: Alex Bennée 

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: qemu-system-m68k: ../hw/scsi/scsi-disk.c:557: scsi_write_data: Assertion `r->req.aiocb == NULL' failed.

2023-08-14 Thread Thomas Huth

On 13/08/2023 09.46, Waldemar Brodkorb wrote:

Hi,

I am using Qemu 8.0.3 and getting this error:
qemu-system-m68k: ../hw/scsi/scsi-disk.c:557: scsi_write_data: Assertion 
`r->req.aiocb == NULL' failed.

It happens e.g. when I extract the glibc source code inside Linux.
Linux 6.1.44, glibc 2.38 is used for the Linux system.
I am starting qemu this way:
qemu-system-m68k -nographic -M q800 -kernel 
/home/wbx/openadk/firmware/qemu-m68k-q800_glibc_68040/qemu-m68k-q800-archive-kernel
 qemu-m68k.img

What can I do about it? Why is this happening on "heavy" I/O usage?


Is it a regression? I.e. did it not happen on older versions of QEMU?

 Thomas





Re: [RFC PATCH 01/24] contrib/plugins: Use GRWLock in execlog

2023-08-14 Thread Alex Bennée


Akihiko Odaki  writes:

> execlog had the following comment:
>> As we could have multiple threads trying to do this we need to
>> serialise the expansion under a lock. Threads accessing already
>> created entries can continue without issue even if the ptr array
>> gets reallocated during resize.
>
> However, when the ptr array gets reallocated, the other threads may have
> a stale reference to the old buffer. This results in use-after-free.
>
> Use GRWLock to properly fix this issue.
>
> Fixes: 3d7caf145e ("contrib/plugins: add execlog to log instruction execution 
> and memory access")
> Signed-off-by: Akihiko Odaki 
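
For anyone who trips over the same pattern elsewhere, the fixed idiom looks
roughly like this (a minimal standalone sketch of the GRWLock usage; the
names are illustrative, not the plugin's actual ones):

#include <glib.h>

static GRWLock expand_lock;   /* statically allocated, no init needed */
static GPtrArray *entries;    /* created once at plugin install time */

/* Readers hold the reader lock so a concurrent expansion cannot
 * reallocate the pointer array out from under them. */
static gpointer entry_lookup(guint idx)
{
    gpointer p = NULL;
    g_rw_lock_reader_lock(&expand_lock);
    if (idx < entries->len) {
        p = g_ptr_array_index(entries, idx);
    }
    g_rw_lock_reader_unlock(&expand_lock);
    return p;
}

/* The (rare) expansion takes the writer lock exclusively. */
static void entry_expand(guint n)
{
    g_rw_lock_writer_lock(&expand_lock);
    while (entries->len < n) {
        g_ptr_array_add(entries, NULL);
    }
    g_rw_lock_writer_unlock(&expand_lock);
}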

Reviewed-by: Alex Bennée 

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [PATCH] Add api to read CPU registers in TCG plugins

2023-08-14 Thread Alex Bennée


Mikhail Tyutin  writes:

> From: Aleksandr Anenkov 
>
> This commit:
> - adds a plugin API to read the registers of the current CPU
> - introduces qemu_plugin_reg_ctx structure for faster data gathering of
>   a set of registers without memory reallocation
> - adds an example plugin showing how to work with this API
> - fixes for GDB reading a wrong 'eflags' register value
> - minor performance improvements when reading registers from GDB

You've managed to submit a patch at the same time as another independent
implementation:

  Message-ID: <20230731084354.115015-1-akihiko.od...@daynix.com>
  Date: Mon, 31 Jul 2023 17:43:27 +0900
  Subject: [RFC PATCH 00/24] plugins: Allow to read registers
  From: Akihiko Odaki 

I'm going to review his series now as it looks more complete and is
better structured. However should you want to re-submit a few notes:

  - please split patches into separate chunks to aid review, for
example:
 - core gdbstub changes
 - plugin api
 - example plugin
 - expand register set for one architecture at a time

>
> In this commit, we rely on the already written register
> reading code that QEMU for GDB has.
>
> Why is the GDB code used
> Each CPU architecture contains its own set of registers.
> At the same time, QEMU does not contain a unified code architecture
> for working with registers in various CPU architectures.
> Each implementation of the CPU architecture in QEMU locates, reads,
> and writes registers differently. In addition, each register has
> its own specifics for reading and writing. Fortunately,
> the GDB part of QEMU code already contains something unified
> and complete. So in terms of simplicity and minimal code
> changes of QEMU, we're just reusing what's already in GDB.
> It works without having to run the GDB server.

I've been trying to avoid exposing GDB register numbers to the plugin
API because they are fairly arbitrary. The long term plan was to
abstract all the various users in QEMU and have a common sub-system to
deal with this but as that work is blocked on time I'm willing to
consider short cutting this and using the gdb backend directly inside
QEMU. However I don't want the API exposed to the plugins to expose
these details so I'd rather that we have anonymous handles to registers
rather than gdb register numbers exposed to the plugins. That way if we
ever do get QEMU's internals in order we don't have to tweak the plugin
API.
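
To make that concrete, the kind of shape I have in mind is below (purely
illustrative - none of these names exist in the plugin API today, and the
final API may well differ):

#include <stddef.h>

/* Opaque handle: plugins never see gdb register numbers directly. */
typedef struct qemu_plugin_register qemu_plugin_register;

/* Resolve a register by name for the given vCPU; NULL if unknown. */
qemu_plugin_register *qemu_plugin_find_register(unsigned int vcpu_index,
                                                const char *name);

/* Copy the register contents into buf; returns bytes written or -1. */
int qemu_plugin_read_register(qemu_plugin_register *reg,
                              void *buf, size_t len);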

>
> How it works
> The existing GDB code in QEMU already knows how to read registers
> by register number, but cannot do it by register name.
> QEMU has xml files using GDB Target Description Format to describe
> targets sourceware.org/gdb/onlinedocs/gdb/Target-Description-Format.html
> from where GDB gets information about all registers. All that remained
> was to "teach" QEMU to read register names from these files the way GDB
> itself does, and to remember them so the user API can convert names
> back to numbers.
>
> Signed-off-by: Aleksandr Anenkov 
> ---
>  contrib/plugins/Makefile |   1 +
>  contrib/plugins/registers.c  | 231 +
>  gdb-xml/i386-32bit.xml   |   9 ++
>  gdb-xml/i386-64bit.xml   |  17 +++
>  gdbstub/gdbstub.c| 280 +--
>  include/exec/gdbstub.h   |  36 +
>  include/hw/core/cpu.h|   3 +
>  include/qemu/qemu-plugin.h   | 105 -
>  plugins/api.c| 223 
>  plugins/qemu-plugins.symbols |  10 ++
>  target/arm/cpu.c |   2 +
>  target/arm/gdbstub.c |   2 +-
>  target/i386/cpu.c|   6 +-
>  target/i386/gdbstub.c|  25 +++-
>  target/riscv/cpu.c   |   3 +
>  target/riscv/gdbstub.c   |   3 +-
>  16 files changed, 933 insertions(+), 23 deletions(-)
>  create mode 100644 contrib/plugins/registers.c
>
> diff --git a/contrib/plugins/Makefile b/contrib/plugins/Makefile
> index b2b9db9f51..e14c07ddcc 100644
> --- a/contrib/plugins/Makefile
> +++ b/contrib/plugins/Makefile
> @@ -21,6 +21,7 @@ NAMES += lockstep
>  NAMES += hwprofile
>  NAMES += cache
>  NAMES += drcov
> +NAMES += registers
>  
>  SONAMES := $(addsuffix .so,$(addprefix lib,$(NAMES)))
>  
> diff --git a/contrib/plugins/registers.c b/contrib/plugins/registers.c
> new file mode 100644
> index 00..ae40a27f5f
> --- /dev/null
> +++ b/contrib/plugins/registers.c
> @@ -0,0 +1,231 @@
> +/*
> + * Log register states
> + *
> + * Copyright (c) 2022 YADRO.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2 or later, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You s

Re: [RFC PATCH 2/5] HACK: target/arm/tcg: Add some more caches to cpu=max

2023-08-14 Thread Alex Bennée


Jonathan Cameron  writes:

> Used to drive the MPAM cache initialization and to exercise more
> of the PPTT cache entry generation code. Perhaps a default
> L3 cache is acceptable for max?
>
> Signed-off-by: Jonathan Cameron 
> ---
>  target/arm/tcg/cpu64.c | 12 
>  1 file changed, 12 insertions(+)
>
> diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c
> index 8019f00bc3..2af67739f6 100644
> --- a/target/arm/tcg/cpu64.c
> +++ b/target/arm/tcg/cpu64.c
> @@ -711,6 +711,17 @@ void aarch64_max_tcg_initfn(Object *obj)
>  uint64_t t;
>  uint32_t u;
>  
> +/*
> + * Expanded cache set
> + */
> +cpu->clidr = 0x8204923; /* 4 4 4 4 3 in 3 bit fields */
> +cpu->ccsidr[0] = 0x00ff001aull; /* 64KB L1 dcache */
> +cpu->ccsidr[1] = 0x00ff001aull; /* 64KB L1 icache */
> +cpu->ccsidr[2] = 0x07ff003aull; /* 1MB L2 unified cache */
> +cpu->ccsidr[4] = 0x07ff007cull; /* 2MB L3 cache 128B line */
> +cpu->ccsidr[6] = 0x7fff007cull; /* 16MB L4 cache 128B line */
> +cpu->ccsidr[8] = 0x0007007cull; /* 2048MB L5 cache 128B line */
> +

I think Peter in another thread wondered if we should have a generic
function for expanding the cache idr registers based on an abstract lane
definition. 
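
Something like the following is the shape I mean - a minimal sketch that
encodes the pre-FEAT_CCIDX 32-bit CCSIDR layout (LineSize[2:0],
Associativity[12:3], NumSets[27:13]); with FEAT_CCIDX enabled, as this
patch does, NumSets moves up to bits [55:32], so a variant would be
needed (make_ccsidr() is a hypothetical helper, not existing code):

#include <stdint.h>
#include <assert.h>

/* Encode a 32-bit CCSIDR_EL1 value from an abstract cache description.
 * size and linesize are in bytes; the LineSize field is log2(bytes) - 4. */
static uint32_t make_ccsidr(uint32_t size, uint32_t ways, uint32_t linesize)
{
    uint32_t sets = size / (ways * linesize);
    uint32_t lg_line = 31 - __builtin_clz(linesize); /* log2(linesize) */

    assert(linesize >= 16 && sets != 0);
    return ((sets - 1) << 13) | ((ways - 1) << 3) | (lg_line - 4);
}

/* e.g. make_ccsidr(64 * 1024, 4, 64) == 0x001fe01a for a 64KB,
 * 4-way cache with 64B lines. */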

>  /*
>   * Reset MIDR so the guest doesn't mistake our 'max' CPU type for a real
>   * one and try to apply errata workarounds or use impdef features we
> @@ -828,6 +839,7 @@ void aarch64_max_tcg_initfn(Object *obj)
>  t = FIELD_DP64(t, ID_AA64MMFR2, BBM, 2);  /* FEAT_BBM at level 2 */
>  t = FIELD_DP64(t, ID_AA64MMFR2, EVT, 2);  /* FEAT_EVT */
>  t = FIELD_DP64(t, ID_AA64MMFR2, E0PD, 1); /* FEAT_E0PD */
> +t = FIELD_DP64(t, ID_AA64MMFR2, CCIDX, 1);  /* FEAT_CCIDX */
>  cpu->isar.id_aa64mmfr2 = t;
>  
>  t = cpu->isar.id_aa64zfr0;


-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [PATCH v3 00/15] linux-user: Implement VDSOs

2023-08-14 Thread Alex Bennée


Richard Henderson  writes:

> It's time for another round on implementing the VDSO for linux-user.
> We are now seeing applications built that absolutely require it, 
> and have no fallback for the VDSO to be absent.

Something broke configure for me:

  ../../configure --disable-docs --disable-system


Gave:

  Dependency glib-2.0 found: YES 2.74.6 (overridden)
  Program indent found: NO

  ../../linux-user/hppa/meson.build:7:0: ERROR: File vdso.so does not exist.

  A full log can be found at 
/home/alex/lsrc/qemu.git/builds/user/meson-logs/meson-log.txt
  FAILED: build.ninja 
  /home/alex/lsrc/qemu.git/builds/user/pyvenv/bin/meson --internal regenerate 
/home/alex/lsrc/qemu.git /home/alex/lsrc/qemu.git/builds/user
  ninja: error: rebuilding 'build.ninja': subcommand failed
  make: Nothing to be done for 'all'.

Will there be linux-user targets that never support vdso?

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [RFC PATCH 1/5] hw/acpi: Add PPTT cache descriptions

2023-08-14 Thread Zhao Liu
Hi Jonathan,

On Tue, Aug 08, 2023 at 12:57:09PM +0100, Jonathan Cameron via wrote:
> Date: Tue, 8 Aug 2023 12:57:09 +0100
> From: Jonathan Cameron via 
> Subject: [RFC PATCH 1/5] hw/acpi: Add PPTT cache descriptions
> X-Mailer: git-send-email 2.39.2
> 
> Current PPTT tables generated by QEMU only provide information on CPU
> topology and neglect the description of Caches.
> 
> This patch adds flexible definition of those caches and updates the
> table version to 3 to allow for the per CPU cache instance IDs needed
> for cross references from the MPAM table.
> 
> If MPAM is not being used, then a unified description can be used,
> greatly reducing the resulting table size.
> 
> New machine parameters are used to control the cache topology.
> cache-cluster-start-level: Which caches are associated with the cluster
>   level of the topology. e.g. cache-cluster-start-level=2 results in a
>   shared l2 cache across a cluster.

So the i/d caches are at core level by default and we don't need to
configure their topology, right?

> cache-numa-start-level: Which caches are associated with the NUMA (in qemu
>   this is currently the physical package level).

I'm a bit confused about the connection between this numa option and l3.
Does the "NUMA" here refer to the socket level?

> For example
>   cache-cluster-start-level=2,cache-numa-start-level=3 gives
>   private l1, cluster shared l2 and package shared L3.

Okay, you list the topology as: l1 per core, l2 per cluster and l3 per
socket.

For this case, I think my QOM topology proposal [1] (this is the underlying
general topology implementation, compatible with symmetric and
heterogeneous topologies, and I'm working on this QOM topology as a
superset of smp) is compatible with your command.

And I understand the difference between my "x-l2-cache-topo=[core|cluster]"
for x86 and yours is that I name the l2 cache, while you take the level
number as the parameter.

What if I extend my symmetric cache topology commands for i386 as
"l2-cache=cluster,l3-cache=socket (*)"?

Compared to cache-cluster-start-level=2,cache-numa-start-level=3, are there
some specific cases that cache-xxx-start-level can solve but the (*) command
cannot?

[1]: https://mail.gnu.org/archive/html/qemu-devel/2023-02/msg05167.html
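
To make the comparison concrete, the two syntaxes side by side would look
something like this (illustrative only - neither option set is final, and
the named-cache spelling just extends today's x86 x-l2-cache-topo form):

  # level-based (this series):
  -smp 16,clusters=2,cores=4,threads=2,cache-cluster-start-level=2,cache-node-start-level=3
  # named-cache (QOM topology proposal):
  -smp 16,clusters=2,cores=4,threads=2 -cpu max,l2-cache=cluster,l3-cache=socket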

> 
> FIXME: Test updates.
> 
> Signed-off-by: Jonathan Cameron 
> ---
>  qapi/machine.json   |   8 +-
>  include/hw/acpi/aml-build.h |  19 +++-
>  include/hw/boards.h |   4 +
>  hw/acpi/aml-build.c | 189 ++--
>  hw/arm/virt-acpi-build.c| 130 -
>  hw/core/machine-smp.c   |   8 ++
>  hw/loongarch/acpi-build.c   |   2 +-
>  7 files changed, 350 insertions(+), 10 deletions(-)
> 
> diff --git a/qapi/machine.json b/qapi/machine.json
> index a08b6576ca..cc86784641 100644
> --- a/qapi/machine.json
> +++ b/qapi/machine.json
> @@ -1494,6 +1494,10 @@
>  # @maxcpus: maximum number of hotpluggable virtual CPUs in the virtual
>  # machine
>  #
> +# @cache-cluster-start-level: Level of first cache attached to cluster
> +#
> +# @cache-node-start-level: Level of first cache attached to cluster

node or numa?

Thanks,
Zhao

> +#
>  # Since: 6.1
>  ##
>  { 'struct': 'SMPConfiguration', 'data': {
> @@ -1503,7 +1507,9 @@
>   '*clusters': 'int',
>   '*cores': 'int',
>   '*threads': 'int',
> - '*maxcpus': 'int' } }
> + '*maxcpus': 'int',
> + '*cache-cluster-start-level': 'int',
> + '*cache-node-start-level': 'int'} }
>  
>  ##
>  # @x-query-irq:
> diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
> index d1fb08514b..055b74820d 100644
> --- a/include/hw/acpi/aml-build.h
> +++ b/include/hw/acpi/aml-build.h
> @@ -489,8 +489,25 @@ void build_srat_memory(GArray *table_data, uint64_t base,
>  void build_slit(GArray *table_data, BIOSLinker *linker, MachineState *ms,
>  const char *oem_id, const char *oem_table_id);
>  
> +typedef enum ACPIPPTTCacheType {
> +DATA,
> +INSTRUCTION,
> +UNIFIED,
> +} ACPIPPTTCacheType;
> +
> +typedef struct ACPIPPTTCache {
> +ACPIPPTTCacheType type;
> +int sets;
> +int size;
> +int associativity;
> +int linesize;
> +unsigned int pptt_id;
> +int level;
> +} ACPIPPTTCache;
> +
>  void build_pptt(GArray *table_data, BIOSLinker *linker, MachineState *ms,
> -const char *oem_id, const char *oem_table_id);
> +const char *oem_id, const char *oem_table_id,
> +int num_caches, ACPIPPTTCache *caches);
>  
>  void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f,
>  const char *oem_id, const char *oem_table_id);
> diff --git a/include/hw/boards.h b/include/hw/boards.h
> index ed83360198..6e8ab92684 100644
> --- a/include/hw/boards.h
> +++ b/include/hw/boards.h
> @@ -316,6 +316,8 @@ typedef struct DeviceMemoryState {
>   * @cores: the number of cores in one cluster
>   * @threads: the number of threads in one core
>   * @max_cpus: the maximum number of logic

[PATCH v2 4/4] iotests: test the zoned format feature for qcow2 file

2023-08-14 Thread Sam Li
The zoned format feature can be tested by:
$ tests/qemu-iotests/check zoned-qcow2

Signed-off-by: Sam Li 
---
 tests/qemu-iotests/tests/zoned-qcow2 | 135 ++
 tests/qemu-iotests/tests/zoned-qcow2.out | 140 +++
 2 files changed, 275 insertions(+)
 create mode 100755 tests/qemu-iotests/tests/zoned-qcow2
 create mode 100644 tests/qemu-iotests/tests/zoned-qcow2.out

diff --git a/tests/qemu-iotests/tests/zoned-qcow2 
b/tests/qemu-iotests/tests/zoned-qcow2
new file mode 100755
index 00..473b462b50
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned-qcow2
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+#
+# Test zone management operations for qcow2 file.
+#
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+status=1 # failure is the default!
+
+file_name="zbc.qcow2"
+_cleanup()
+{
+  _cleanup_test_img
+  _rm_test_img "$file_name"
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ../common.rc
+. ../common.filter
+. ../common.qemu
+
+# This test only runs on Linux hosts with qcow2 image files.
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+echo
+echo "=== Initial image setup ==="
+echo
+
+$QEMU_IMG create -f qcow2 $file_name -o size=768M -o zone_size=64M \
+-o zone_capacity=64M -o zone_nr_conv=0 -o max_append_sectors=131072 \
+-o max_open_zones=0 -o max_active_zones=0 -o zoned_profile=zbc
+
+IMG="--image-opts -n driver=qcow2,file.driver=file,file.filename=$file_name"
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+
+echo
+echo "=== Testing a qcow2 img with zoned format ==="
+echo
+echo "case 1: test if one zone operation works"
+
+echo "(1) report zones[0]:"
+$QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "report zones[0~9]:"
+$QEMU_IO $IMG -c "zrp 0 10"
+echo
+echo "report the last zone:"
+$QEMU_IO $IMG -c "zrp 0x2C00 2" # 0x2C00 / 512 = 0x16
+echo
+echo
+echo "open zones[0]:"
+$QEMU_IO $IMG -c "zo 0 0x400" # 0x400 / 512 = 0x2
+$QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "open zones[1]"
+$QEMU_IO $IMG -c "zo 0x400 0x400"
+$QEMU_IO $IMG -c "zrp 0x400 1"
+echo
+echo "open the last zone"
+$QEMU_IO $IMG -c "zo 0x2C00 0x400"
+$QEMU_IO $IMG -c "zrp 0x2C00 2"
+echo
+echo
+echo "close zones[0]"
+$QEMU_IO $IMG -c "zc 0 0x400"
+$QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "close the last zone"
+$QEMU_IO $IMG -c "zc 0x3e7000 0x400"
+$QEMU_IO $IMG -c "zrp 0x3e7000 2"
+echo
+echo
+echo "(4) finish zones[1]"
+$QEMU_IO $IMG -c "zf 0x400 0x400"
+$QEMU_IO $IMG -c "zrp 0x400 1"
+echo
+echo
+echo "(5) reset zones[1]"
+$QEMU_IO $IMG -c "zrs 0x400 0x400"
+$QEMU_IO $IMG -c "zrp 0x400 1"
+echo
+echo
+echo "(6) append write with (4k, 8k) data" # the physical block size of the 
device is 4096
+$QEMU_IO $IMG -c "zrp 0 12"
+echo "Append write zones[0] one time:"
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
+$QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "Append write zones[0] twice:"
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
+$QEMU_IO $IMG -c "zrp 0 1"
+echo
+echo "Append write zones[1] one time:"
+$QEMU_IO $IMG -c "zap -p 0x400 0x1000 0x2000"
+$QEMU_IO $IMG -c "zrp 0x400 1"
+echo
+echo "Append write zones[1] twice:"
+$QEMU_IO $IMG -c "zap -p 0x400 0x1000 0x2000"
+$QEMU_IO $IMG -c "zrp 0x400 1"
+echo
+echo "Reset all:"
+$QEMU_IO $IMG -c "zrs 0 768M"
+$QEMU_IO $IMG -c "zrp 0 12"
+echo
+echo
+echo "case 2: test a sets of ops that works or not"
+
+echo "(1) append write (4k, 4k) and then write to full"
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x1000"
+echo "wrote (4k, 4k):"
+$QEMU_IO $IMG -c "zrp 0 1"
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x3ffd000"
+echo "wrote to full:"
+$QEMU_IO $IMG -c "zrp 0 1"
+echo "Reset zones[0]:"
+$QEMU_IO $IMG -c "zrs 0 64M"
+$QEMU_IO $IMG -c "zrp 0 1"
+
+echo "(2) write in zones[0], zones[3], zones[8], and then reset all"
+$QEMU_IO $IMG -c "zap -p 0 0x1000 0x1000"
+$QEMU_IO $IMG -c "zap -p 0xc00 0x1000 0x1000"
+$QEMU_IO $IMG -c "zap -p 0x2000 0x1000 0x1000"
+echo "wrote three zones:"
+$QEMU_IO $IMG -c "zrp 0 12"
+echo "Reset all:"
+$QEMU_IO $IMG -c "zrs 0 768M"
+$QEMU_IO $IMG -c "zrp 0 12"
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/tests/zoned-qcow2.out 
b/tests/qemu-iotests/tests/zoned-qcow2.out
new file mode 100644
index 00..0a6b21cb55
--- /dev/null
+++ b/tests/qemu-iotests/tests/zoned-qcow2.out
@@ -0,0 +1,140 @@
+QA output created by zoned-qcow2
+
+=== Initial image setup ===
+
+Formatting 'zbc.qcow2', fmt=qcow2 cluster_size=65536 extended_l2=off 
compression_type=zlib zoned_profile=zbc zone_size=67108864 
zone_capacity=67108864 zone_nr_conv=0 max_append_sectors=131072 
max_active_zones=0 max_open_zones=0 size=805306368 lazy_refcounts=off 
refcount_bits=16
+
+=== Testing a qcow2 img with zoned format ===
+
+case 1: test if one zone operation works
+(1) report zones[0]:
+start: 0x0, len 0x2, cap 0x2, wptr 0x0, zcond:1, [type: 2]
+
+report zones[0~9]:
+star

[PATCH v2 0/4] Add full zoned storage emulation to qcow2 driver

2023-08-14 Thread Sam Li
This patch series adds a new extension - zoned format - to the
qcow2 driver, thereby allowing full zoned storage emulation on
the qcow2 img file. Users can attach such a qcow2 file to the
guest as a zoned device.

To create a qcow2 file with zoned format, use command like this:
$ qemu-img create -f qcow2 test.qcow2 -o size=768M -o
zone_size=64M -o zone_capacity=64M -o zone_nr_conv=0 -o
max_append_sectors=512 -o max_open_zones=0 -o max_active_zones=0
-o zoned_profile=zbc

Then add it to the QEMU command line:
-blockdev 
node-name=drive1,driver=qcow2,file.driver=file,file.filename=../qemu/test.qcow2 
\
-device virtio-blk-pci,drive=drive1 \

v1->v2:
- add more tests to qemu-io zoned commands
- make zone append change state to full when wp reaches end
- add documentation to qcow2 zoned extension header
- address review comments (Stefan):
  * fix zoned_mata allocation size
  * use bitwise or than addition
  * fix wp index overflow and locking
  * cleanups: comments, naming

Sam Li (4):
  docs/qcow2: add the zoned format feature
  qcow2: add configurations for zoned format extension
  qcow2: add zoned emulation capability
  iotests: test the zoned format feature for qcow2 file

 block/qcow2.c| 799 ++-
 block/qcow2.h|  23 +
 docs/interop/qcow2.txt   |  26 +
 docs/system/qemu-block-drivers.rst.inc   |  39 ++
 include/block/block-common.h |   5 +
 include/block/block_int-common.h |  16 +
 qapi/block-core.json |  46 +-
 tests/qemu-iotests/tests/zoned-qcow2 | 135 
 tests/qemu-iotests/tests/zoned-qcow2.out | 140 
 9 files changed, 1214 insertions(+), 15 deletions(-)
 create mode 100755 tests/qemu-iotests/tests/zoned-qcow2
 create mode 100644 tests/qemu-iotests/tests/zoned-qcow2.out

-- 
2.40.1




[PATCH v2 1/4] docs/qcow2: add the zoned format feature

2023-08-14 Thread Sam Li
Add the specs for the zoned format feature of the qcow2 driver. If
the zoned_profile is set to `zbc`, then the qcow2 file can be treated
as a zoned device and passed through by the virtio-blk device to the
guest. If it's `zns`, then it can be passed through by the virtio-blk
device or the NVMe ZNS device as a ZNS drive.

Signed-off-by: Sam Li 
---
 docs/system/qemu-block-drivers.rst.inc | 39 ++
 1 file changed, 39 insertions(+)

diff --git a/docs/system/qemu-block-drivers.rst.inc 
b/docs/system/qemu-block-drivers.rst.inc
index 105cb9679c..2c1620668f 100644
--- a/docs/system/qemu-block-drivers.rst.inc
+++ b/docs/system/qemu-block-drivers.rst.inc
@@ -172,6 +172,45 @@ This section describes each format and the options that 
are supported for it.
 filename`` to check if the NOCOW flag is set or not (Capital 'C' is
 NOCOW flag).
 
+  .. option:: zoned_profile
+
+The option configures the zoned format feature on the qcow2 driver. If
+this is set to ``zbc``, then it follows the basics of the ZBC/ZAC protocol.
+If set to ``zns``, then it follows the NVMe ZNS protocol.
+
+The virtio-blk device accepts both the ``zbc`` and ``zns`` options for
+passing through zoned devices, while the NVMe ZNS device only accepts ``zns``.
+
+  .. option:: zone_size
+
+The size of a zone of the zoned device in bytes. The device is divided
+into zones of this size with the exception of the last zone, which may
+be smaller.
+
+  .. option:: zone_capacity
+
+The initial capacity value for all zones. The capacity must be less than
+or equal to zone size. If the last zone is smaller, then its capacity is
+capped. A device following the ZBC protocol tends to have a capacity
+equal to its zone size.
+
+  .. option:: zone_nr_conv
+
+The number of conventional zones of the zoned device.
+
+  .. option:: max_open_zones
+
+The maximum number of open zones allowed.
+
+  .. option:: max_active_zones
+
+The maximum number of zones in the implicit open, explicit open or closed state.
+
+  .. option:: max_append_sectors
+
+The maximum number of 512B sectors that may be appended to zones in
+one write.
+
 .. program:: image-formats
 .. option:: qed
 
-- 
2.40.1




Re: [PATCH v5 08/11] target/loongarch: Reject la64-only instructions in la32 mode

2023-08-14 Thread gaosong

On 2023/8/11 at 11:18 PM, Richard Henderson wrote:

On 8/11/23 01:12, gaosong wrote:

+TRANS_64(sra_d, gen_rrr, EXT_NONE, EXT_NONE, EXT_NONE, gen_sra_d)
  TRANS(rotr_w, gen_rrr, EXT_ZERO, EXT_NONE, EXT_SIGN, gen_rotr_w)

TRANS_64(rotr_w, ...)

...

  TRANS(rotri_w, gen_rri_v, EXT_NONE, EXT_NONE, gen_rotr_w)

TRANS_64(rotri_w, ...)

I see the manual from https://www.loongson.cn/download/index

The cpucfg insn is also not supported on la32.



I see all 3 of these, ROTR.W, ROTRI.W and CPUCFG listed in Table 2 at

https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#overview-of-basic-integer-instructions 





I see. These are not listed in [1] (the LA32 Lite Edition Manual). It
seems that the LA32 Lite Edition drops some instructions.



[1]:
https://www.loongson.cn/uploads/images/2023041918122813624.%E9%BE%99%E8%8A%AF%E6%9E%B6%E6%9E%8432%E4%BD%8D%E7%B2%BE%E7%AE%80%E7%89%88%E5%8F%82%E8%80%83%E6%89%8B%E5%86%8C_r1p03.pdf

Thanks.
Song Gao




[PATCH v2 3/4] qcow2: add zoned emulation capability

2023-08-14 Thread Sam Li
By adding zone operations and zoned metadata, the zoned emulation
capability enables full emulation support of a zoned device using
a qcow2 file. The zoned device metadata includes the zone type,
zone state and write pointer of each zone, which are stored
in an array of unsigned integers.

Each zone of a zoned device makes state transitions following
the zone state machine. The zone state machine mainly describes
five states: IMPLICIT OPEN, EXPLICIT OPEN, FULL, EMPTY and CLOSED.
READ ONLY and OFFLINE states will generally be affected by device
internal events. Operations on zones cause the corresponding state
changes.

Zoned devices have a limit on zone resources, which puts constraints on
write operations into zones.
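
For orientation, the wp helpers added below imply this layout for each
64-bit per-zone metadata word (a reading of this patch, not a normative
spec):

  bits [63:60]  zone state (BlockZoneState)
  bit  [59]     conventional-zone flag (QCOW2_ZT_IS_CONV)
  bits [58:0]   write pointer, as a byte offset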

Signed-off-by: Sam Li 
---
 block/qcow2.c  | 676 -
 block/qcow2.h  |   2 +
 docs/interop/qcow2.txt |   2 +
 3 files changed, 678 insertions(+), 2 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index c1077c4a4a..5ccf79cbe7 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -194,6 +194,164 @@ qcow2_extract_crypto_opts(QemuOpts *opts, const char 
*fmt, Error **errp)
 return cryptoopts_qdict;
 }
 
+#define QCOW2_ZT_IS_CONV(wp)(wp & 1ULL << 59)
+
+static inline int qcow2_get_wp(uint64_t wp)
+{
+/* clear state and type information */
+return ((wp << 5) >> 5);
+}
+
+static inline int qcow2_get_zs(uint64_t wp)
+{
+return (wp >> 60);
+}
+
+static inline void qcow2_set_wp(uint64_t *wp, BlockZoneState zs)
+{
+uint64_t addr = qcow2_get_wp(*wp);
+addr |= ((uint64_t)zs << 60);
+*wp = addr;
+}
+
+/*
+ * File wp tracking: reset zone, finish zone and append zone can
+ * change the value of write pointer. All zone operations will change
+ * the state of that/those zone.
+ * */
+static inline void qcow2_wp_tracking_helper(int index, uint64_t wp) {
+/* format: operations, the wp. */
+printf("wps[%d]: 0x%x\n", index, qcow2_get_wp(wp)>>BDRV_SECTOR_BITS);
+}
+
+/*
+ * Perform a state assignment and a flush operation that writes the new wp
+ * value to the dedicated location of the disk file.
+ */
+static int qcow2_write_wp_at(BlockDriverState *bs, uint64_t *wp,
+ uint32_t index, BlockZoneState zs) {
+BDRVQcow2State *s = bs->opaque;
+int ret;
+
+qcow2_set_wp(wp, zs);
+ret = bdrv_pwrite(bs->file, s->zoned_header.zonedmeta_offset
++ sizeof(uint64_t) * index, sizeof(uint64_t), wp, 0);
+
+if (ret < 0) {
+goto exit;
+}
+qcow2_wp_tracking_helper(index, *wp);
+return ret;
+
+exit:
+error_report("Failed to write metadata with file");
+return ret;
+}
+
+static int qcow2_check_active(BlockDriverState *bs)
+{
+BDRVQcow2State *s = bs->opaque;
+
+if (!s->zoned_header.max_active_zones) {
+return 0;
+}
+
+if (s->nr_zones_exp_open + s->nr_zones_imp_open + s->nr_zones_closed
+< s->zoned_header.max_active_zones) {
+return 0;
+}
+
+return -1;
+}
+
+static int qcow2_check_open(BlockDriverState *bs)
+{
+BDRVQcow2State *s = bs->opaque;
+int ret;
+
+if (!s->zoned_header.max_open_zones) {
+return 0;
+}
+
+if (s->nr_zones_exp_open + s->nr_zones_imp_open
+< s->zoned_header.max_open_zones) {
+return 0;
+}
+
+if(s->nr_zones_imp_open) {
+ret = qcow2_check_active(bs);
+if (ret == 0) {
+/* TODO: it takes O(n) time complexity (n = nr_zones).
+ * Optimizations required. */
+/* close one implicitly open zones to make it available */
+for (int i = s->zoned_header.zone_nr_conv;
+i < bs->bl.nr_zones; ++i) {
+uint64_t *wp = &s->wps->wp[i];
+if (qcow2_get_zs(*wp) == BLK_ZS_IOPEN) {
+ret = qcow2_write_wp_at(bs, wp, i, BLK_ZS_CLOSED);
+if (ret < 0) {
+return ret;
+}
+s->wps->wp[i] = *wp;
+s->nr_zones_imp_open--;
+s->nr_zones_closed++;
+break;
+}
+}
+return 0;
+}
+return ret;
+}
+
+return -1;
+}
+
+/*
+ * The zoned device has limited zone resources of open, closed, active
+ * zones.
+ */
+static int qcow2_check_zone_resources(BlockDriverState *bs,
+  BlockZoneState zs)
+{
+int ret;
+
+switch (zs) {
+case BLK_ZS_EMPTY:
+ret = qcow2_check_active(bs);
+if (ret < 0) {
+error_report("No enough active zones");
+return ret;
+}
+return ret;
+case BLK_ZS_CLOSED:
+ret = qcow2_check_open(bs);
+if (ret < 0) {
+error_report("No enough open zones");
+return ret;
+}
+return ret;
+default:
+return -EINVAL;
+}
+
+}
+
+static inline int qcow2_refresh_zonedmeta(BlockDriverState *bs)

[PATCH v2 2/4] qcow2: add configurations for zoned format extension

2023-08-14 Thread Sam Li
Configuring the zoned format feature on the qcow2 driver requires
the following arguments: the device size, zoned profile,
zoned model, zone size, zone capacity, number of conventional
zones, limits on zone resources (max append sectors, max open
zones, and max_active_zones). The zoned profile option is set
to zns when using the qcow2 file as a ZNS drive.

To create a qcow2 file with zoned format, use command like this:
$ qemu-img create -f qcow2 test.qcow2 -o size=768M -o
zone_size=64M -o zone_capacity=64M -o zone_nr_conv=0 -o
max_append_sectors=512 -o max_open_zones=0 -o max_active_zones=0
 -o zoned_profile=zbc/zns

Signed-off-by: Sam Li 
---
 block/qcow2.c| 125 +++
 block/qcow2.h|  21 ++
 docs/interop/qcow2.txt   |  24 ++
 include/block/block-common.h |   5 ++
 include/block/block_int-common.h |  16 
 qapi/block-core.json |  46 
 6 files changed, 223 insertions(+), 14 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index c51388e99d..c1077c4a4a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -73,6 +73,7 @@ typedef struct {
 #define  QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77
 #define  QCOW2_EXT_MAGIC_BITMAPS 0x23852875
 #define  QCOW2_EXT_MAGIC_DATA_FILE 0x44415441
+#define  QCOW2_EXT_MAGIC_ZONED_FORMAT 0x7a6264
 
 static int coroutine_fn
 qcow2_co_preadv_compressed(BlockDriverState *bs,
@@ -210,6 +211,7 @@ qcow2_read_extensions(BlockDriverState *bs, uint64_t 
start_offset,
 uint64_t offset;
 int ret;
 Qcow2BitmapHeaderExt bitmaps_ext;
+Qcow2ZonedHeaderExtension zoned_ext;
 
 if (need_update_header != NULL) {
 *need_update_header = false;
@@ -431,6 +433,38 @@ qcow2_read_extensions(BlockDriverState *bs, uint64_t 
start_offset,
 break;
 }
 
+case QCOW2_EXT_MAGIC_ZONED_FORMAT:
+{
+if (ext.len != sizeof(zoned_ext)) {
+error_setg_errno(errp, -ret, "zoned_ext: "
+ "Invalid extension length");
+return -EINVAL;
+}
+ret = bdrv_pread(bs->file, offset, ext.len, &zoned_ext, 0);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "zoned_ext: "
+ "Could not read ext header");
+return ret;
+}
+
+zoned_ext.zone_size = be32_to_cpu(zoned_ext.zone_size);
+zoned_ext.zone_capacity = be32_to_cpu(zoned_ext.zone_capacity);
+zoned_ext.nr_zones = be32_to_cpu(zoned_ext.nr_zones);
+zoned_ext.zone_nr_conv = be32_to_cpu(zoned_ext.zone_nr_conv);
+zoned_ext.max_open_zones = be32_to_cpu(zoned_ext.max_open_zones);
+zoned_ext.max_active_zones =
+be32_to_cpu(zoned_ext.max_active_zones);
+zoned_ext.max_append_sectors =
+be32_to_cpu(zoned_ext.max_append_sectors);
+s->zoned_header = zoned_ext;
+
+#ifdef DEBUG_EXT
+printf("Qcow2: Got zoned format extension: "
+   "offset=%" PRIu32 "\n", offset);
+#endif
+break;
+}
+
 default:
 /* unknown magic - save it in case we need to rewrite the header */
 /* If you add a new feature, make sure to also update the fast
@@ -3089,6 +3123,31 @@ int qcow2_update_header(BlockDriverState *bs)
 buflen -= ret;
 }
 
+/* Zoned devices header extension */
+if (s->zoned_header.zoned == BLK_Z_HM) {
+Qcow2ZonedHeaderExtension zoned_header = {
+.zoned_profile  = s->zoned_header.zoned_profile,
+.zoned  = s->zoned_header.zoned,
+.nr_zones   = cpu_to_be32(s->zoned_header.nr_zones),
+.zone_size  = cpu_to_be32(s->zoned_header.zone_size),
+.zone_capacity  = cpu_to_be32(s->zoned_header.zone_capacity),
+.zone_nr_conv   = cpu_to_be32(s->zoned_header.zone_nr_conv),
+.max_open_zones = cpu_to_be32(s->zoned_header.max_open_zones),
+.max_active_zones   =
+cpu_to_be32(s->zoned_header.max_active_zones),
+.max_append_sectors =
+cpu_to_be32(s->zoned_header.max_append_sectors)
+};
+ret = header_ext_add(buf, QCOW2_EXT_MAGIC_ZONED_FORMAT,
+ &zoned_header, sizeof(zoned_header),
+ buflen);
+if (ret < 0) {
+goto fail;
+}
+buf += ret;
+buflen -= ret;
+}
+
 /* Keep unknown header extensions */
 QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
 ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
@@ -3773,6 +3832,23 @@ qcow2_co_create(BlockdevCreateOptions *create_options, 
Error **errp)
 s->image_data_file = g_strdup(data_bs->filename);
 }
 
+if (qcow2_opts->zoned_profile) {
+

Re: [PATCH 5/5] target/arm: Implement cortex-a710

2023-08-14 Thread Marcin Juszkiewicz

W dniu 10.08.2023 o 19:12, Peter Maydell pisze:

On Thu, 10 Aug 2023 at 18:05, Richard Henderson

On 8/10/23 08:49, Peter Maydell wrote:

On Thu, 10 Aug 2023 at 03:36, Richard Henderson



Will sbsa-ref want this core ?



It only has 40 PA bits, and I think sbsa-ref requires 48.



Yes, it does want 48 (we ran into that with some other core).


sbsa-ref needs PA > 40 bits as memory starts at 2^40.

Cortex A57/A72 have only 44 bits for PA space and are fine.

Among v9 cores, I look forward to the Neoverse-N2/V2 cores.

My "AArch64 cpu cores info table" [1] lists all/most of Arm designed 
cores with some basic information (arch level, 32bit, PA/VA, granules, SVE).


1. https://marcin.juszkiewicz.com.pl/download/tables/arm-cpu-cores.html




Re: [PATCH v5 1/5] ebpf: Added eBPF map update through mmap.

2023-08-14 Thread Andrew Melnichenko
Hi, all.

I've researched the issue a bit. And what can we do?
In the case of an "old" kernel 5.4, we need to load the RSS eBPF program
without BPF_F_MMAPABLE
and use the bpf syscall to update the maps. This requires additional
capabilities, and libvirtd will never grant any capabilities to Qemu.
So, the only case for the "fallback" is running Qemu manually with
capabilities (or as root) on kernel 5.4.

We can add a hack/fallback to the RSS eBPF loading routine: additional
checks, plus modifying the map flags when BPF_F_MMAPABLE is unavailable.
And we can add a fallback for mmap vs. syscall eBPF map access.

The problem is that compiling Qemu with the fallback needs kernel 5.5
headers, which define BPF_F_MMAPABLE,
or we have to move the macro into the Qemu headers.

It can be implemented something like this:
RSS eBPF open/load:
 * open the skeleton.
 * load the skeleton as is - it would fail because of the unknown BPF_F_MMAPABLE.
 * hack/modify the map_flags in the skeleton and try to reload.
RSS eBPF map update (this is straightforward):
 * check the mmap pointer; if it is NULL, use the bpf syscall (see the sketch below).

The advantage of hacks in Qemu is that we are aware of the eBPF context.
I suggest creating a separate series of patches that would implement
the hack/fallback,
if we really want to support eBPF on old kernels.
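
For reference, the update-path fallback could look roughly like this (a
minimal sketch only, not the actual patch; rss_map_update() and its
parameters are illustrative, while bpf_map_update_elem() is the
low-level fd-based helper from <bpf/bpf.h>):

#include <stdint.h>
#include <string.h>
#include <bpf/bpf.h>

/*
 * Write one array-map entry either through the mmap()ed view (maps
 * created with BPF_F_MMAPABLE need no extra capabilities for this) or,
 * if mapping was not possible at load time, through the bpf(2) syscall.
 * Assumes value_size is a multiple of 8, so elements are contiguous in
 * the mapped view.
 */
static int rss_map_update(void *mmap_base, int map_fd, uint32_t key,
                          const void *value, size_t value_size)
{
    if (mmap_base) {
        memcpy((char *)mmap_base + key * value_size, value, value_size);
        return 0;
    }
    /* Fallback: needs CAP_BPF (or CAP_SYS_ADMIN on older kernels). */
    return bpf_map_update_elem(map_fd, &key, value, BPF_ANY);
}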

On Wed, Aug 9, 2023 at 5:21 AM Jason Wang  wrote:
>
> On Wed, Aug 9, 2023 at 7:15 AM Andrew Melnichenko  wrote:
> >
> > Hi all,
> >
> > On Tue, Aug 8, 2023 at 5:39 AM Jason Wang  wrote:
> > >
> > > On Thu, Aug 3, 2023 at 5:01 AM Andrew Melnychenko  
> > > wrote:
> > > >
> > > > Changed eBPF map updates through mmaped array.
> > > > Mmaped arrays provide direct access to map data.
> > > > It should omit using bpf_map_update_elem() call,
> > > > which may require capabilities that are not present.
> > > >
> > > > Signed-off-by: Andrew Melnychenko 
> > > > ---
> > > >  ebpf/ebpf_rss.c | 117 ++--
> > > >  ebpf/ebpf_rss.h |   5 +++
> > > >  2 files changed, 99 insertions(+), 23 deletions(-)
> > > >
> > > > diff --git a/ebpf/ebpf_rss.c b/ebpf/ebpf_rss.c
> > > > index cee658c158b..247f5eee1b6 100644
> > > > --- a/ebpf/ebpf_rss.c
> > > > +++ b/ebpf/ebpf_rss.c
> > > > @@ -27,19 +27,83 @@ void ebpf_rss_init(struct EBPFRSSContext *ctx)
> > > >  {
> > > >  if (ctx != NULL) {
> > > >  ctx->obj = NULL;
> > > > +ctx->program_fd = -1;
> > > > +ctx->map_configuration = -1;
> > > > +ctx->map_toeplitz_key = -1;
> > > > +ctx->map_indirections_table = -1;
> > > > +
> > > > +ctx->mmap_configuration = NULL;
> > > > +ctx->mmap_toeplitz_key = NULL;
> > > > +ctx->mmap_indirections_table = NULL;
> > > >  }
> > > >  }
> > > >
> > > >  bool ebpf_rss_is_loaded(struct EBPFRSSContext *ctx)
> > > >  {
> > > > -return ctx != NULL && ctx->obj != NULL;
> > > > +return ctx != NULL && (ctx->obj != NULL || ctx->program_fd != -1);
> > > > +}
> > > > +
> > > > +static bool ebpf_rss_mmap(struct EBPFRSSContext *ctx)
> > > > +{
> > > > +if (!ebpf_rss_is_loaded(ctx)) {
> > > > +return false;
> > > > +}
> > > > +
> > > > +ctx->mmap_configuration = mmap(NULL, qemu_real_host_page_size(),
> > > > +   PROT_READ | PROT_WRITE, MAP_SHARED,
> > > > +   ctx->map_configuration, 0);
> > > > +if (ctx->mmap_configuration == MAP_FAILED) {
> > > > +trace_ebpf_error("eBPF RSS", "can not mmap eBPF configuration 
> > > > array");
> > > > +return false;
> > > > +}
> > > > +ctx->mmap_toeplitz_key = mmap(NULL, qemu_real_host_page_size(),
> > > > +   PROT_READ | PROT_WRITE, MAP_SHARED,
> > > > +   ctx->map_toeplitz_key, 0);
> > > > +if (ctx->mmap_toeplitz_key == MAP_FAILED) {
> > > > +trace_ebpf_error("eBPF RSS", "can not mmap eBPF toeplitz key");
> > > > +goto toeplitz_fail;
> > > > +}
> > > > +ctx->mmap_indirections_table = mmap(NULL, 
> > > > qemu_real_host_page_size(),
> > > > +   PROT_READ | PROT_WRITE, MAP_SHARED,
> > > > +   ctx->map_indirections_table, 0);
> > > > +if (ctx->mmap_indirections_table == MAP_FAILED) {
> > > > +trace_ebpf_error("eBPF RSS", "can not mmap eBPF indirection 
> > > > table");
> > > > +goto indirection_fail;
> > > > +}
> > > > +
> > > > +return true;
> > > > +
> > > > +indirection_fail:
> > > > +munmap(ctx->mmap_toeplitz_key, qemu_real_host_page_size());
> > > > +toeplitz_fail:
> > > > +munmap(ctx->mmap_configuration, qemu_real_host_page_size());
> > > > +
> > > > +ctx->mmap_configuration = NULL;
> > > > +ctx->mmap_toeplitz_key = NULL;
> > > > +ctx->mmap_indirections_table = NULL;
> > > > +return false;
> > > > +}
> > > > +
> > > > +static void ebpf_rss_munmap(struct EBPFRSSContext *ctx)
> > > > +{
> > > > +if (!ebpf_rss_is_loaded(ctx)) {
> > > > +return;
> > > > +}
> > > > +
> > > > +munm
