Re: [PATCH v2 3/3] migration/colo: Tidy up bql_unlock() around bdrv_activate_all()

2024-05-16 Thread Michael Tokarev

16.05.2024 06:45, Li Zhijian wrote:

Make the code more tight.

Suggested-by: Michael Tokarev 
Reviewed-by: Peter Xu 
Reviewed-by: Zhang Chen 
Signed-off-by: Li Zhijian 


Signed-off-by: Michael Tokarev 

/mjt


---
V2: Collected reviewed-by tags
This change/comment suggested by "Michael Tokarev " came
a bit late at that time, let's update it together in these minor set
this time.
---
  migration/colo.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/migration/colo.c b/migration/colo.c
index 991806c06a..1b6d9da1c8 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -838,12 +838,11 @@ static void *colo_process_incoming_thread(void *opaque)
  /* Make sure all file formats throw away their mutable metadata */
  bql_lock();
  bdrv_activate_all(_err);
+bql_unlock();
  if (local_err) {
-bql_unlock();
  error_report_err(local_err);
  return NULL;
  }
-bql_unlock();
  
  failover_init_state();
  





Re: [PATCH v7 26/61] target/ppc/mmu_common.c: Transform ppc_jumbo_xlate() into ppc_6xx_xlate()

2024-05-16 Thread Nicholas Piggin
On Mon May 13, 2024 at 9:27 AM AEST, BALATON Zoltan wrote:
> Now that only 6xx cases left in ppc_jumbo_xlate() we can change it
> to ppc_6xx_xlate() also removing get_physical_address_wtlb().
>

Reviewed-by: Nicholas Piggin 

> Signed-off-by: BALATON Zoltan 
> ---
>  target/ppc/internal.h   |  5 +
>  target/ppc/mmu_common.c | 38 --
>  2 files changed, 13 insertions(+), 30 deletions(-)
>
> diff --git a/target/ppc/internal.h b/target/ppc/internal.h
> index 98b41a970c..4a4f9b9ec8 100644
> --- a/target/ppc/internal.h
> +++ b/target/ppc/internal.h
> @@ -262,10 +262,7 @@ typedef struct mmu_ctx_t mmu_ctx_t;
>  bool ppc_xlate(PowerPCCPU *cpu, vaddr eaddr, MMUAccessType access_type,
>hwaddr *raddrp, int *psizep, int *protp,
>int mmu_idx, bool guest_visible);
> -int get_physical_address_wtlb(CPUPPCState *env, mmu_ctx_t *ctx,
> - target_ulong eaddr,
> - MMUAccessType access_type, int type,
> - int mmu_idx);
> +
>  /* Software driven TLB helpers */
>  int ppc6xx_tlb_getnum(CPUPPCState *env, target_ulong eaddr,
>  int way, int is_code);
> diff --git a/target/ppc/mmu_common.c b/target/ppc/mmu_common.c
> index ddb014e0aa..961062bca1 100644
> --- a/target/ppc/mmu_common.c
> +++ b/target/ppc/mmu_common.c
> @@ -1112,22 +1112,6 @@ void dump_mmu(CPUPPCState *env)
>  }
>  }
>  
> -int get_physical_address_wtlb(CPUPPCState *env, mmu_ctx_t *ctx,
> - target_ulong eaddr,
> - MMUAccessType access_type, int type,
> - int mmu_idx)
> -{
> -switch (env->mmu_model) {
> -case POWERPC_MMU_SOFT_6xx:
> -return mmu6xx_get_physical_address(env, ctx, eaddr, access_type, 
> type);
> -case POWERPC_MMU_SOFT_4xx:
> -return mmu40x_get_physical_address(env, >raddr, >prot, 
> eaddr,
> -   access_type);
> -default:
> -cpu_abort(env_cpu(env), "Unknown or invalid MMU model\n");
> -}
> -}
> -
>  static void booke206_update_mas_tlb_miss(CPUPPCState *env, target_ulong 
> address,
>   MMUAccessType access_type, int 
> mmu_idx)
>  {
> @@ -1326,12 +1310,10 @@ static bool ppc_40x_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  return false;
>  }
>  
> -/* Perform address translation */
> -/* TODO: Split this by mmu_model. */
> -static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr eaddr,
> -MMUAccessType access_type,
> -hwaddr *raddrp, int *psizep, int *protp,
> -int mmu_idx, bool guest_visible)
> +static bool ppc_6xx_xlate(PowerPCCPU *cpu, vaddr eaddr,
> +  MMUAccessType access_type,
> +  hwaddr *raddrp, int *psizep, int *protp,
> +  int mmu_idx, bool guest_visible)
>  {
>  CPUState *cs = CPU(cpu);
>  CPUPPCState *env = >env;
> @@ -1353,8 +1335,10 @@ static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  type = ACCESS_INT;
>  }
>  
> -ret = get_physical_address_wtlb(env, , eaddr, access_type,
> -type, mmu_idx);
> +ctx.prot = 0;
> +ctx.hash[0] = 0;
> +ctx.hash[1] = 0;
> +ret = mmu6xx_get_physical_address(env, , eaddr, access_type, type);
>  if (ret == 0) {
>  *raddrp = ctx.raddr;
>  *protp = ctx.prot;
> @@ -1498,14 +1482,16 @@ bool ppc_xlate(PowerPCCPU *cpu, vaddr eaddr, 
> MMUAccessType access_type,
>  case POWERPC_MMU_SOFT_4xx:
>  return ppc_40x_xlate(cpu, eaddr, access_type, raddrp,
>   psizep, protp, mmu_idx, guest_visible);
> +case POWERPC_MMU_SOFT_6xx:
> +return ppc_6xx_xlate(cpu, eaddr, access_type, raddrp,
> + psizep, protp, mmu_idx, guest_visible);
>  case POWERPC_MMU_REAL:
>  return ppc_real_mode_xlate(cpu, eaddr, access_type, raddrp, psizep,
> protp);
>  case POWERPC_MMU_MPC8xx:
>  cpu_abort(env_cpu(>env), "MPC8xx MMU model is not 
> implemented\n");
>  default:
> -return ppc_jumbo_xlate(cpu, eaddr, access_type, raddrp,
> -   psizep, protp, mmu_idx, guest_visible);
> +cpu_abort(CPU(cpu), "Unknown or invalid MMU model\n");
>  }
>  }
>  




Re: [PATCH v7 25/61] target/ppc/mmu_common.c: Split off 40x cases from ppc_jumbo_xlate()

2024-05-16 Thread Nicholas Piggin
On Mon May 13, 2024 at 9:27 AM AEST, BALATON Zoltan wrote:
> Introduce ppc_40x_xlate() to split off 40x handling leaving only 6xx
> in ppc_jumbo_xlate() now.
>

Reviewed-by: Nicholas Piggin 

> Signed-off-by: BALATON Zoltan 
> ---
>  target/ppc/mmu_common.c | 150 +---
>  1 file changed, 93 insertions(+), 57 deletions(-)
>
> diff --git a/target/ppc/mmu_common.c b/target/ppc/mmu_common.c
> index ab912da821..ddb014e0aa 100644
> --- a/target/ppc/mmu_common.c
> +++ b/target/ppc/mmu_common.c
> @@ -1258,6 +1258,74 @@ static bool ppc_real_mode_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  return false;
>  }
>  
> +static bool ppc_40x_xlate(PowerPCCPU *cpu, vaddr eaddr,
> +  MMUAccessType access_type,
> +  hwaddr *raddrp, int *psizep, int *protp,
> +  int mmu_idx, bool guest_visible)
> +{
> +CPUState *cs = CPU(cpu);
> +CPUPPCState *env = >env;
> +int ret;
> +
> +if (ppc_real_mode_xlate(cpu, eaddr, access_type, raddrp, psizep, protp)) 
> {
> +return true;
> +}
> +
> +ret = mmu40x_get_physical_address(env, raddrp, protp, eaddr, 
> access_type);
> +if (ret == 0) {
> +*psizep = TARGET_PAGE_BITS;
> +return true;
> +} else if (!guest_visible) {
> +return false;
> +}
> +
> +log_cpu_state_mask(CPU_LOG_MMU, cs, 0);
> +if (access_type == MMU_INST_FETCH) {
> +switch (ret) {
> +case -1:
> +/* No matches in page tables or TLB */
> +cs->exception_index = POWERPC_EXCP_ITLB;
> +env->error_code = 0;
> +env->spr[SPR_40x_DEAR] = eaddr;
> +env->spr[SPR_40x_ESR] = 0x;
> +break;
> +case -2:
> +/* Access rights violation */
> +cs->exception_index = POWERPC_EXCP_ISI;
> +env->error_code = 0x0800;
> +break;
> +default:
> +g_assert_not_reached();
> +}
> +} else {
> +switch (ret) {
> +case -1:
> +/* No matches in page tables or TLB */
> +cs->exception_index = POWERPC_EXCP_DTLB;
> +env->error_code = 0;
> +env->spr[SPR_40x_DEAR] = eaddr;
> +if (access_type == MMU_DATA_STORE) {
> +env->spr[SPR_40x_ESR] = 0x0080;
> +} else {
> +env->spr[SPR_40x_ESR] = 0x;
> +}
> +break;
> +case -2:
> +/* Access rights violation */
> +cs->exception_index = POWERPC_EXCP_DSI;
> +env->error_code = 0;
> +env->spr[SPR_40x_DEAR] = eaddr;
> +if (access_type == MMU_DATA_STORE) {
> +env->spr[SPR_40x_ESR] |= 0x0080;
> +}
> +break;
> +default:
> +g_assert_not_reached();
> +}
> +}
> +return false;
> +}
> +
>  /* Perform address translation */
>  /* TODO: Split this by mmu_model. */
>  static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr eaddr,
> @@ -1301,23 +1369,11 @@ static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  switch (ret) {
>  case -1:
>  /* No matches in page tables or TLB */
> -switch (env->mmu_model) {
> -case POWERPC_MMU_SOFT_6xx:
> -cs->exception_index = POWERPC_EXCP_IFTLB;
> -env->error_code = 1 << 18;
> -env->spr[SPR_IMISS] = eaddr;
> -env->spr[SPR_ICMP] = 0x8000 | ctx.ptem;
> -goto tlb_miss;
> -case POWERPC_MMU_SOFT_4xx:
> -cs->exception_index = POWERPC_EXCP_ITLB;
> -env->error_code = 0;
> -env->spr[SPR_40x_DEAR] = eaddr;
> -env->spr[SPR_40x_ESR] = 0x;
> -break;
> -default:
> -g_assert_not_reached();
> -}
> -break;
> +cs->exception_index = POWERPC_EXCP_IFTLB;
> +env->error_code = 1 << 18;
> +env->spr[SPR_IMISS] = eaddr;
> +env->spr[SPR_ICMP] = 0x8000 | ctx.ptem;
> +goto tlb_miss;
>  case -2:
>  /* Access rights violation */
>  cs->exception_index = POWERPC_EXCP_ISI;
> @@ -1339,54 +1395,31 @@ static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  switch (ret) {
>  case -1:
>  /* No matches in page tables or TLB */
> -switch (env->mmu_model) {
> -case POWERPC_MMU_SOFT_6xx:
> -if (access_type == MMU_DATA_STORE) {
> -cs->exception_index = POWERPC_EXCP_DSTLB;
> -env->error_code = 1 << 16;
> -} else {
> -cs->exception_index = POWERPC_EXCP_DLTLB;
> -env->error_code = 0;
> -}
> -env->spr[SPR_DMISS] = eaddr;
> -   

Re: [PATCH v7 28/61] target/ppc/mmu_common.c: Remove pte_update_flags()

2024-05-16 Thread Nicholas Piggin
On Mon May 13, 2024 at 9:28 AM AEST, BALATON Zoltan wrote:
> This function is used only once, its return value is ignored and one
> of its parameter is a return value from a previous call. It is better
> to inline it in the caller and remove it.

Debatable. It's definitely clunky code that could use some
love.

But without looking at details I would bet it's actually cleaner
to inline this into ppc6xx_tlb_pte_check since that is what deals
with the ptes.

Might leave this patch out for the first PR and see how things
settle.

Logic is odd too, or at least I don't really understand it or the
intricacies of the 6xx MMU. Access bit is set even for access
violation? Store rejection logic I don't quite understand. Not
that I suggest changing anything in a cleanup series, but
would be nice to untangle and comment unusual cases a bit more
at least.

Thanks,
Nick

>
> Signed-off-by: BALATON Zoltan 
> ---
>  target/ppc/mmu_common.c | 41 +
>  1 file changed, 13 insertions(+), 28 deletions(-)
>
> diff --git a/target/ppc/mmu_common.c b/target/ppc/mmu_common.c
> index 34200d9cb1..4fb93cbf40 100644
> --- a/target/ppc/mmu_common.c
> +++ b/target/ppc/mmu_common.c
> @@ -179,39 +179,14 @@ static int ppc6xx_tlb_pte_check(mmu_ctx_t *ctx, 
> target_ulong pte0,
>  return ret;
>  }
>  
> -static int pte_update_flags(mmu_ctx_t *ctx, target_ulong *pte1p,
> -int ret, MMUAccessType access_type)
> -{
> -int store = 0;
> -
> -/* Update page flags */
> -if (!(*pte1p & 0x0100)) {
> -/* Update accessed flag */
> -*pte1p |= 0x0100;
> -store = 1;
> -}
> -if (!(*pte1p & 0x0080)) {
> -if (access_type == MMU_DATA_STORE && ret == 0) {
> -/* Update changed flag */
> -*pte1p |= 0x0080;
> -store = 1;
> -} else {
> -/* Force page fault for first write access */
> -ctx->prot &= ~PAGE_WRITE;
> -}
> -}
> -
> -return store;
> -}
> -
>  /* Software driven TLB helpers */
>  
>  static int ppc6xx_tlb_check(CPUPPCState *env, mmu_ctx_t *ctx,
>  target_ulong eaddr, MMUAccessType access_type)
>  {
>  ppc6xx_tlb_t *tlb;
> -int nr, best, way;
> -int ret;
> +target_ulong *pte1p;
> +int nr, best, way, ret;
>  
>  best = -1;
>  ret = -1; /* No TLB found */
> @@ -264,7 +239,17 @@ done:
>" prot=%01x ret=%d\n",
>ctx->raddr & TARGET_PAGE_MASK, ctx->prot, ret);
>  /* Update page flags */
> -pte_update_flags(ctx, >tlb.tlb6[best].pte1, ret, access_type);
> +pte1p = >tlb.tlb6[best].pte1;
> +*pte1p |= 0x0100; /* Update accessed flag */
> +if (!(*pte1p & 0x0080)) {
> +if (access_type == MMU_DATA_STORE && ret == 0) {
> +/* Update changed flag */
> +*pte1p |= 0x0080;
> +} else {
> +/* Force page fault for first write access */
> +ctx->prot &= ~PAGE_WRITE;
> +}
> +}
>  }
>  #if defined(DUMP_PAGE_TABLES)
>  if (qemu_loglevel_mask(CPU_LOG_MMU)) {




Re: [PATCH 02/13] s390_flic: add migration-enabled property

2024-05-16 Thread Thomas Huth

On 16/05/2024 16.42, Marc Hartmayer wrote:

On Thu, May 09, 2024 at 07:00 PM +0200, Paolo Bonzini  
wrote:

Instead of mucking with css_migration_enabled(), add a property specific to
the FLIC device, similar to what is done for TYPE_S390_STATTRIB.

Signed-off-by: Paolo Bonzini 
---
  include/hw/s390x/s390_flic.h | 1 +
  hw/intc/s390_flic.c  | 6 +-
  hw/s390x/s390-virtio-ccw.c   | 1 +
  3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/hw/s390x/s390_flic.h b/include/hw/s390x/s390_flic.h
index 3907a13d076..bcb081def58 100644
--- a/include/hw/s390x/s390_flic.h
+++ b/include/hw/s390x/s390_flic.h
@@ -47,6 +47,7 @@ struct S390FLICState {
  /* to limit AdapterRoutes.num_routes for compat */
  uint32_t adapter_routes_max_batch;
  bool ais_supported;
+bool migration_enabled;
  };
  
  
diff --git a/hw/intc/s390_flic.c b/hw/intc/s390_flic.c

index f4a848460b8..7f930800877 100644
--- a/hw/intc/s390_flic.c
+++ b/hw/intc/s390_flic.c
@@ -405,6 +405,8 @@ static void qemu_s390_flic_class_init(ObjectClass *oc, void 
*data)
  static Property s390_flic_common_properties[] = {
  DEFINE_PROP_UINT32("adapter_routes_max_batch", S390FLICState,
 adapter_routes_max_batch, ADAPTER_ROUTES_MAX_GSI),
+DEFINE_PROP_BOOL("migration-enabled", S390FLICState,
+ migration_enabled, true),
  DEFINE_PROP_END_OF_LIST(),
  };
  
@@ -457,7 +459,9 @@ type_init(qemu_s390_flic_register_types)
  
  static bool adapter_info_so_needed(void *opaque)

  {
-return css_migration_enabled();
+S390FLICState *fs = S390_FLIC_COMMON(opaque);
+
+return fs->migration_enabled;
  }

...

This patch causes QEMU to crash when trying to save the domain state
(e.g. using libvirt)


Oh, drat, that vmstate belongs to a ccw device, not to a flic device, so the 
"opaque" pointer in adapter_info_so_needed points to the wrong structure.


I guess the easiest fix is:

diff --git a/hw/intc/s390_flic.c b/hw/intc/s390_flic.c
--- a/hw/intc/s390_flic.c
+++ b/hw/intc/s390_flic.c
@@ -459,7 +459,7 @@ type_init(qemu_s390_flic_register_types)

 static bool adapter_info_so_needed(void *opaque)
 {
-S390FLICState *fs = S390_FLIC_COMMON(opaque);
+S390FLICState *fs = s390_get_flic();

 return fs->migration_enabled;
 }

I'll send it as a proper patch...

 Thomas




Re: [PATCH v7 24/61] target/ppc/mmu_common.c: Split off real mode handling from get_physical_address_wtlb()

2024-05-16 Thread Nicholas Piggin
On Mon May 13, 2024 at 9:27 AM AEST, BALATON Zoltan wrote:
> Add ppc_real_mode_xlate() to handle real mode translation and allow
> removing this case from ppc_jumbo_xlate().
>

Reviewed-by: Nicholas Piggin 

> Signed-off-by: BALATON Zoltan 
> ---
>  target/ppc/mmu_common.c | 46 -
>  1 file changed, 27 insertions(+), 19 deletions(-)
>
> diff --git a/target/ppc/mmu_common.c b/target/ppc/mmu_common.c
> index 8599106f75..ab912da821 100644
> --- a/target/ppc/mmu_common.c
> +++ b/target/ppc/mmu_common.c
> @@ -1117,23 +1117,12 @@ int get_physical_address_wtlb(CPUPPCState *env, 
> mmu_ctx_t *ctx,
>   MMUAccessType access_type, int type,
>   int mmu_idx)
>  {
> -bool real_mode = (type == ACCESS_CODE) ? !FIELD_EX64(env->msr, MSR, IR)
> -   : !FIELD_EX64(env->msr, MSR, DR);
> -if (real_mode) {
> -ctx->raddr = eaddr;
> -ctx->prot = PAGE_RWX;
> -return 0;
> -}
> -
>  switch (env->mmu_model) {
>  case POWERPC_MMU_SOFT_6xx:
>  return mmu6xx_get_physical_address(env, ctx, eaddr, access_type, 
> type);
>  case POWERPC_MMU_SOFT_4xx:
>  return mmu40x_get_physical_address(env, >raddr, >prot, 
> eaddr,
> access_type);
> -case POWERPC_MMU_REAL:
> -cpu_abort(env_cpu(env),
> -  "PowerPC in real mode do not do any translation\n");
>  default:
>  cpu_abort(env_cpu(env), "Unknown or invalid MMU model\n");
>  }
> @@ -1251,6 +1240,24 @@ static bool ppc_booke_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  return false;
>  }
>  
> +static bool ppc_real_mode_xlate(PowerPCCPU *cpu, vaddr eaddr,
> +MMUAccessType access_type,
> +hwaddr *raddrp, int *psizep, int *protp)
> +{
> +CPUPPCState *env = >env;
> +
> +if (access_type == MMU_INST_FETCH ? !FIELD_EX64(env->msr, MSR, IR)
> +  : !FIELD_EX64(env->msr, MSR, DR)) {
> +*raddrp = eaddr;
> +*protp = PAGE_RWX;
> +*psizep = TARGET_PAGE_BITS;
> +return true;
> +} else if (env->mmu_model == POWERPC_MMU_REAL) {
> +cpu_abort(CPU(cpu), "PowerPC in real mode shold not do 
> translation\n");
> +}
> +return false;
> +}
> +
>  /* Perform address translation */
>  /* TODO: Split this by mmu_model. */
>  static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr eaddr,
> @@ -1264,6 +1271,10 @@ static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  int type;
>  int ret;
>  
> +if (ppc_real_mode_xlate(cpu, eaddr, access_type, raddrp, psizep, protp)) 
> {
> +return true;
> +}
> +
>  if (access_type == MMU_INST_FETCH) {
>  /* code access */
>  type = ACCESS_CODE;
> @@ -1303,11 +1314,8 @@ static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  env->spr[SPR_40x_DEAR] = eaddr;
>  env->spr[SPR_40x_ESR] = 0x;
>  break;
> -case POWERPC_MMU_REAL:
> -cpu_abort(cs, "PowerPC in real mode should never raise "
> -  "any MMU exceptions\n");
>  default:
> -cpu_abort(cs, "Unknown or invalid MMU model\n");
> +g_assert_not_reached();
>  }
>  break;
>  case -2:
> @@ -1359,11 +1367,8 @@ static bool ppc_jumbo_xlate(PowerPCCPU *cpu, vaddr 
> eaddr,
>  env->spr[SPR_40x_ESR] = 0x;
>  }
>  break;
> -case POWERPC_MMU_REAL:
> -cpu_abort(cs, "PowerPC in real mode should never raise "
> -  "any MMU exceptions\n");
>  default:
> -cpu_abort(cs, "Unknown or invalid MMU model\n");
> +g_assert_not_reached();
>  }
>  break;
>  case -2:
> @@ -1457,6 +1462,9 @@ bool ppc_xlate(PowerPCCPU *cpu, vaddr eaddr, 
> MMUAccessType access_type,
>  case POWERPC_MMU_BOOKE206:
>  return ppc_booke_xlate(cpu, eaddr, access_type, raddrp,
> psizep, protp, mmu_idx, guest_visible);
> +case POWERPC_MMU_REAL:
> +return ppc_real_mode_xlate(cpu, eaddr, access_type, raddrp, psizep,
> +   protp);
>  case POWERPC_MMU_MPC8xx:
>  cpu_abort(env_cpu(>env), "MPC8xx MMU model is not 
> implemented\n");
>  default:




Re: [PATCH v2 1/3] docs: introduce dedicated page about code provenance / sign-off

2024-05-16 Thread Thomas Huth

On 16/05/2024 19.43, Peter Maydell wrote:

On Thu, 16 May 2024 at 18:34, Michael S. Tsirkin  wrote:


On Thu, May 16, 2024 at 06:29:39PM +0100, Peter Maydell wrote:

On Thu, 16 May 2024 at 17:22, Daniel P. Berrangé  wrote:


Currently we have a short paragraph saying that patches must include
a Signed-off-by line, and merely link to the kernel documentation.
The linked kernel docs have a lot of content beyond the part about
sign-off and thus are misleading/distracting to QEMU contributors.


Thanks for this -- I've felt for ages that it was a bit awkward
that we didn't have a good place to link people to for the fuller
explanation of this.


This introduces a dedicated 'code-provenance' page in QEMU talking
about why we require sign-off, explaining the other tags we commonly
use, and what to do in some edge cases.


The version of the kernel SubmittingPatches we used to link to
includes the text "sorry, no pseudonyms or anonymous contributions".
This new documentation doesn't say anything either way about
our approach to pseudonyms. I think we should probably say
something, but I don't know if we have an in-practice consensus
there, so maybe we should approach that as a separate change on
top of this patch.



Well given we referred to the kernel previously then I guess that's
the consensus, no?


AIUI the kernel devs have changed their point of view on the
pseudonym question, so it's a question of whether we were
deliberately referring to that specific revision of the kernel's
practice because we agreed with it or just by chance...

https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=d4563201f33a022fc0353033d9dfeb1606a88330

is where the kernel changed to saying merely "no anonymous
contributions", dropping the 'pseudonyms' part.


FWIW, we had a clear statement in our document in the past:

https://gitlab.com/qemu-project/qemu/-/commit/ca127fe96ddb827f3ea153610c1e8f6e374708e2#9620a1442f724c9d8bfd5408e4611ba1839fcb8a_315_321

Quoting: "Please use your real name to sign a patch (not an alias or acronym)."

But it got lost in that rework, I assume by accident?

So IMHO we had a consensus once to not allow anonymous contributions. I'm in 
favor of adding such a sentence back here now.


 Thomas




Re: [PATCH] target/ppc: handle vcpu hotplug failure gracefully

2024-05-16 Thread Nicholas Piggin
On Thu May 16, 2024 at 2:31 PM AEST, Harsh Prateek Bora wrote:
> Hi Nick,
>
> On 5/14/24 08:39, Nicholas Piggin wrote:
> > On Tue Apr 23, 2024 at 4:30 PM AEST, Harsh Prateek Bora wrote:
> >> + qemu-devel
> >>
> >> On 4/23/24 11:40, Harsh Prateek Bora wrote:
> >>> On ppc64, the PowerVM hypervisor runs with limited memory and a VCPU
> >>> creation during hotplug may fail during kvm_ioctl for KVM_CREATE_VCPU,
> >>> leading to termination of guest since errp is set to _fatal while
> >>> calling kvm_init_vcpu. This unexpected behaviour can be avoided by
> >>> pre-creating vcpu and parking it on success or return error otherwise.
> >>> This enables graceful error delivery for any vcpu hotplug failures while
> >>> the guest can keep running.
> > 
> > So this puts in on the park list so when kvm_init_vcpu() later runs it
> > will just take it off the park list instead of issuing another
> > KVM_CREATE_VCPU ioctl.
> > 
> > And kvm_init_vcpu() runs in the vcpu thread function, which does not
> > have a good way to indicate failure to the caller.
> > 
> > I'm don't know a lot about this part of qemu but it seems like a good
> > idea to move fail-able initialisation out of the vcpu thread in that
> > case. So the general idea seems good to me.
> > 
>
> Yeh ..
>
> >>>
> >>> Based on api refactoring to create/park vcpus introduced in 1/8 of patch 
> >>> series:
> >>> https://lore.kernel.org/qemu-devel/2024031202.12992-2-salil.me...@huawei.com/
> > 
> > So from this series AFAIKS you're just using kvm_create / kvm_park
> > routines? You could easily pull that patch 1 out ahead of that larger
> > series if progress is slow on it, it's a decent cleanup by itself by
> > the looks.
> > 
>
> Yeh, patch 1 of that series is only we need but the author mentioned on 
> the list that he is about to post next version soon.
>
> >>>
> >>> Tested OK by repeatedly doing a hotplug/unplug of vcpus as below:
> >>>
> >>>#virsh setvcpus hotplug 40
> >>>#virsh setvcpus hotplug 70
> >>> error: internal error: unable to execute QEMU command 'device_add':
> >>> kvmppc_cpu_realize: vcpu hotplug failed with -12
> >>>
> >>> Reported-by: Anushree Mathur 
> >>> Suggested-by: Shivaprasad G Bhat 
> >>> Suggested-by: Vaibhav Jain 
> >>> Signed-off by: Harsh Prateek Bora 
> >>> ---
> >>> ---
> >>>target/ppc/kvm.c | 42 ++
> >>>1 file changed, 42 insertions(+)
> >>>
> >>> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> >>> index 8231feb2d4..c887f6dfa0 100644
> >>> --- a/target/ppc/kvm.c
> >>> +++ b/target/ppc/kvm.c
> >>> @@ -48,6 +48,8 @@
> >>>#include "qemu/mmap-alloc.h"
> >>>#include "elf.h"
> >>>#include "sysemu/kvm_int.h"
> >>> +#include "sysemu/kvm.h"
> >>> +#include "hw/core/accel-cpu.h"
> >>>
> >>>#define PROC_DEVTREE_CPU  "/proc/device-tree/cpus/"
> >>>
> >>> @@ -2339,6 +2341,43 @@ static void alter_insns(uint64_t *word, uint64_t 
> >>> flags, bool on)
> >>>}
> >>>}
> >>>
> >>> +static int max_cpu_index = 0;
> >>> +
> >>> +static bool kvmppc_cpu_realize(CPUState *cs, Error **errp)
> >>> +{
> >>> +int ret;
> >>> +
> >>> +cs->cpu_index = max_cpu_index++;
> >>> +
> >>> +POWERPC_CPU(cs)->vcpu_id = cs->cpu_index;
> > 
> > So you're overriding the cpu_get_free_index() allocator here.
> > And you need to because vcpu_id needs to be assigned before
> > the KVM create, I guess.
> > 
>
> Yes ..
>
> > I guess it works. I would add a comment like s390x has.
> > 
> Not sure which comment you were referring to but with exporting
> cpu_get_free_index as suggested later, not sure if we still need any
> comment.

Yeah that's true.

> >>> +
> >>> +if (cs->parent_obj.hotplugged) {
> > 
> > Can _all_ kvm cpu creation go via this path? Why just limit it to
> > hotplugged?
>
> For the initial bootup, we actually want to abort if the requested vCPUs
> can't be allocated so that the user can retry until the requested vCPUs are
> allocated. For hotplug failure, bringing down entire guest isn't fair,
> hence the fix.

But you could make the error handling depend on hotplugged, no?
Perhaps put that error handling decision in common code so policy
is the same for all targets and back ends.

[...]

> >>> +}
> >>> +
> >>>static void kvmppc_host_cpu_class_init(ObjectClass *oc, void *data)
> >>>{
> >>>PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
> >>> @@ -2963,4 +3002,7 @@ bool kvm_arch_cpu_check_are_resettable(void)
> >>>
> >>>void kvm_arch_accel_class_init(ObjectClass *oc)
> >>>{
> >>> +AccelClass *ac = ACCEL_CLASS(oc);
> >>> +ac->cpu_common_realize = kvmppc_cpu_realize;
> >>> +ac->cpu_common_unrealize = kvmppc_cpu_unrealize;
> >>>}

One other thing I noticed -- cpu_common_realize seems to be for
core code and cpu_target_realize for targets. Should we be
using the latter here? If not, a comment would be warranted and
probably also a comment in accel_cpu_common_realize().

Thanks,
Nick



Re: [PATCH v2 1/4] accel/kvm: Extract common KVM vCPU {creation, parking} code

2024-05-16 Thread Nicholas Piggin
On Thu May 16, 2024 at 11:35 PM AEST, Salil Mehta wrote:
>
> >  From: Harsh Prateek Bora 
> >  Sent: Thursday, May 16, 2024 2:07 PM
> >  
> >  Hi Salil,
> >  
> >  On 5/16/24 17:42, Salil Mehta wrote:
> >  > Hi Harsh,
> >  >
> >  >>   From: Harsh Prateek Bora 
> >  >>   Sent: Thursday, May 16, 2024 11:15 AM
> >  >>
> >  >>   Hi Salil,
> >  >>
> >  >>   Thanks for your email.
> >  >>   Your patch 1/8 is included here based on review comments on my  
> > previous
> >  >>   patch from one of the maintainers in the community and therefore I  
> > had
> >  >>   kept you in CC to be aware of the desire of having this independent 
> > patch to
> >  >>   get merged earlier even if your other patches in the series may go 
> > through
> >  >>   further reviews.
> >  >
> >  > I really don’t know which discussion are  you pointing at? Please
> >  > understand you are fixing a bug and we are pushing a feature which has 
> > got large series.
> >  > It will break the patch-set  which is about t be merged.
> >  >
> >  > There will be significant overhead of testing on us for the work we
> >  > have been carrying forward for large time. This will be disruptive. 
> > Please dont!
> >  >
> >  
> >  I was referring to the review discussion on my prev patch here:
> >  https://lore.kernel.org/qemu-devel/d191d2jfar7l.2eh4s445m4...@gmail.com/
>
>
> Sure, I'm, not sure what this means. 
>
>
> >  Although your patch was included with this series only to facilitate 
> > review of
> >  the additional patches depending on just one of your patch.
>
>
> Generally you rebase your patch-set over the other and clearly state on the 
> cover
> letter that this patch-set is dependent upon such and such patch-set. Just 
> imagine
> if everyone starts to unilaterally pick up patches from each other's 
> patch-set it will
> create a chaos not only for the feature owners but also for the maintainers.
>
>
> >  
> >  I am not sure what is appearing disruptive here. It is a common practice in
> >  the community that maintainer(s) can pick individual patches from the
> >  series if it has been vetted by a significant number of reviewers.
>
>
> Don’t you think this patch-set is asking for acceptance for a patch already 
> part of another patch-set which is about to be accepted and is a bigger 
> feature?
> Will it cause maintenance overhead at the last moment? Yes, of course!
>
>
> >  However, in this case, since you have mentioned to post next version soon,
> >  you need not worry about it as that would be the preferred version for both
> >  of the series.
>
>
> Yes, but please understand we are working for the benefit of overall 
> community.
> Please cooperate here.

There might be a misunderstanding, Harsh just said there had not been
much progress on your series for a while and he wasn't sure what the
status was. I mentioned that we *could* take your patch 1 (with your
blessing) if there was a hold up with the rest of the series. He was
going to check in with you to see how it was going.

This patch 1 was not intended to be merged as is without syncing up with
you first, but it's understandable you were concerned because that was
probably not communicated with you clearly.

I appreciate you bringing up your concerns, we'll try to do better.

Thanks,
Nick



Re: [PATCH] ppc/pnv: Implement POWER9 LPC PSI serirq outputs and auto-clear function

2024-05-16 Thread Nicholas Piggin
On Mon May 13, 2024 at 9:49 PM AEST, Cédric Le Goater wrote:
> Hello,
>
> On 5/10/24 16:30, Nicholas Piggin wrote:
> > The POWER8 LPC ISA device irqs all get combined and reported to the line
> > connected the PSI LPCHC irq. POWER9 changed this so only internal LPC
> > host controller irqs use that line, and the device irqs get routed to
> > 4 new lines connected to PSI SERIRQ0-3.
> > 
> > POWER9 also introduced a new feature that automatically clears the irq
> > status in the LPC host controller when EOI'ed, so software does not have
> > to.
> > 
> > The powernv OPAL (skiboot) firmware managed to work because the LPCHC
> > irq handler scanned all LPC irqs and handled those including clearing
> > status even on POWER9 systems. So LPC irqs worked despite OPAL thinking
> > it was running in POWER9 mode. After this change, UART interrupts show
> > up on serirq1 which is where OPAL routes them to:
> > 
> >   cat /proc/interrupts
> >   ...
> >   20:  0  XIVE-IRQ 1048563 Level opal-psi#0:lpchc
> >   ...
> >   25: 34  XIVE-IRQ 1048568 Level opal-psi#0:lpc_serirq_mux1
> > 
> > Whereas they previously turn up on lpchc.
> > 
> > Signed-off-by: Nicholas Piggin 
> > ---
> >   include/hw/ppc/pnv_lpc.h |  12 -
> >   hw/ppc/pnv.c |  38 +--
> >   hw/ppc/pnv_lpc.c | 100 +++
> >   3 files changed, 136 insertions(+), 14 deletions(-)
> > 
> > diff --git a/include/hw/ppc/pnv_lpc.h b/include/hw/ppc/pnv_lpc.h
> > index 5d22c45570..57e324b4dc 100644
> > --- a/include/hw/ppc/pnv_lpc.h
> > +++ b/include/hw/ppc/pnv_lpc.h
> > @@ -84,8 +84,18 @@ struct PnvLpcController {
> >   /* XSCOM registers */
> >   MemoryRegion xscom_regs;
> >   
> > +/*
> > + * In P8, ISA irqs are combined with internal sources to drive the
> > + * LPCHC interrupt output. P9 ISA irqs raise one of 4 lines that
> > + * drive PSI SERIRQ irqs, routing according to OPB routing registers.
> > + */
> > +bool psi_serirq;
> > +
> >   /* PSI to generate interrupts */
> > -qemu_irq psi_irq;
> > +qemu_irq psi_irq_lpchc;
> > +
> > +/* P9 introduced a serirq mode */
> > +qemu_irq psi_irq_serirq[4];
> >   };
> >   
> >   struct PnvLpcClass {
> > diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
> > index 6e3a5ccdec..3b1c05a1d8 100644
> > --- a/hw/ppc/pnv.c
> > +++ b/hw/ppc/pnv.c
> > @@ -744,18 +744,48 @@ static ISABus *pnv_chip_power8nvl_isa_create(PnvChip 
> > *chip, Error **errp)
> >   static ISABus *pnv_chip_power9_isa_create(PnvChip *chip, Error **errp)
> >   {
> >   Pnv9Chip *chip9 = PNV9_CHIP(chip);
> > -qemu_irq irq = qdev_get_gpio_in(DEVICE(>psi), PSIHB9_IRQ_LPCHC);
> >   
> > -qdev_connect_gpio_out(DEVICE(>lpc), 0, irq);
>
> The pnv_chip_power8*_isa_create() routines also need an update.

Good catch, thank you.

> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "LPCHC", 0,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPCHC));
> > +
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "SERIRQ", 0,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPC_SIRQ0));
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "SERIRQ", 1,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPC_SIRQ1));
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "SERIRQ", 2,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPC_SIRQ2));
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "SERIRQ", 3,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPC_SIRQ3));
> > +
> >   return pnv_lpc_isa_create(>lpc, false, errp);
> >   }
> >   
> >   static ISABus *pnv_chip_power10_isa_create(PnvChip *chip, Error **errp)
> >   {
> >   Pnv10Chip *chip10 = PNV10_CHIP(chip);
> > -qemu_irq irq = qdev_get_gpio_in(DEVICE(>psi), 
> > PSIHB9_IRQ_LPCHC);
> >   
> > -qdev_connect_gpio_out(DEVICE(>lpc), 0, irq);
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "LPCHC", 0,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPCHC));
> > +
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "SERIRQ", 0,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPC_SIRQ0));
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "SERIRQ", 1,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPC_SIRQ1));
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), "SERIRQ", 2,
> > +qdev_get_gpio_in(DEVICE(>psi),
> > +PSIHB9_IRQ_LPC_SIRQ2));
> > +qdev_connect_gpio_out_named(DEVICE(>lpc), 

RE: [PATCH v3] hw/virtio: Fix obtain the buffer id from the last descriptor

2024-05-16 Thread Wafer
Ping :)

> -Original Message-
> From: Wafer
> Sent: 2024/05/10  15:29
> To: epere...@redhat.com; m...@redhat.com; jasow...@redhat.com
> Cc: qemu-devel@nongnu.org; Angus Chen ;
> Wafer 
> Subject: [PATCH v3] hw/virtio: Fix obtain the buffer id from the last
> descriptor
> 
> The virtio-1.3 specification
>  writes:
> 2.8.6 Next Flag: Descriptor Chaining
>   Buffer ID is included in the last descriptor in the list.
> 
> If the feature (_F_INDIRECT_DESC) has been negotiated, install only one
> descriptor in the virtqueue.
> Therefor the buffer id should be obtained from the first descriptor.
> 
> In descriptor chaining scenarios, the buffer id should be obtained from the
> last descriptor.
> 
> Fixes: 86044b24e8 ("virtio: basic packed virtqueue support")
> 
> Signed-off-by: Wafer 
> Reviewed-by: Jason Wang 
> Reviewed-by: Eugenio Pérez 
> Acked-by: Jason Wang 
> 
> --
> Changes in v3:
>  - Add Acked-by Jason Wang
> 
> Changes in v2:
>  - Use Jason suggestion: Move the code out of the loop.
> ---
>  hw/virtio/virtio.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index
> 871674f9be..e9e8447878 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -1744,6 +1744,11 @@ static void *virtqueue_packed_pop(VirtQueue
> *vq, size_t sz)
>   _desc_cache);
>  } while (rc == VIRTQUEUE_READ_DESC_MORE);
> 
> +if (desc_cache != _desc_cache) {
> +/* Buffer ID is included in the last descriptor in the list. */
> +id = desc.id;
> +}
> +
>  /* Now copy what we have collected and mapped */
>  elem = virtqueue_alloc_element(sz, out_num, in_num);
>  for (i = 0; i < out_num; i++) {
> --
> 2.27.0



[PATCH] ui/console: Fix compiling issue

2024-05-16 Thread Bibo Mao
Local variable fence_fd is defined but not used if CONFIG_GBM is
not enabled, and there is a compile problem.

Signed-off-by: Bibo Mao 
---
 ui/gtk-egl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
index 0473f689c9..9831c10e1b 100644
--- a/ui/gtk-egl.c
+++ b/ui/gtk-egl.c
@@ -68,9 +68,9 @@ void gd_egl_draw(VirtualConsole *vc)
 GdkWindow *window;
 #ifdef CONFIG_GBM
 QemuDmaBuf *dmabuf = vc->gfx.guest_fb.dmabuf;
+int fence_fd;
 #endif
 int ww, wh, ws;
-int fence_fd;
 
 if (!vc->gfx.gls) {
 return;

base-commit: 922582ace2df59572a671f5c0c5c6c5c706995e5
-- 
2.39.3




[RFC v2] target/loongarch/kvm: Add software breakpoint support

2024-05-16 Thread Bibo Mao
With KVM virtualization, the debug exception is passed through to the
guest kernel rather than handled in host mode. Here a hypercall
instruction with a special code is used for sw breakpoint support.

Now only software breakpoints are supported, and they can be inserted
and removed. We can debug the guest kernel with gdb after the kernel is
loaded; hardware breakpoint support will be added later.

Signed-off-by: Bibo Mao 
---
v1 ... v2:
  1. Enable TARGET_KVM_HAVE_GUEST_DEBUG on loongarch64 platform
---
 configs/targets/loongarch64-softmmu.mak |  1 +
 target/loongarch/kvm/kvm.c  | 76 +
 2 files changed, 77 insertions(+)

diff --git a/configs/targets/loongarch64-softmmu.mak 
b/configs/targets/loongarch64-softmmu.mak
index 84beb19b90..65b65e0c34 100644
--- a/configs/targets/loongarch64-softmmu.mak
+++ b/configs/targets/loongarch64-softmmu.mak
@@ -1,5 +1,6 @@
 TARGET_ARCH=loongarch64
 TARGET_BASE_ARCH=loongarch
+TARGET_KVM_HAVE_GUEST_DEBUG=y
 TARGET_SUPPORTS_MTTCG=y
 TARGET_XML_FILES= gdb-xml/loongarch-base32.xml gdb-xml/loongarch-base64.xml 
gdb-xml/loongarch-fpu.xml
 # all boards require libfdt
diff --git a/target/loongarch/kvm/kvm.c b/target/loongarch/kvm/kvm.c
index bc75552d0f..0a94784276 100644
--- a/target/loongarch/kvm/kvm.c
+++ b/target/loongarch/kvm/kvm.c
@@ -28,6 +28,7 @@
 #include "trace.h"
 
 static bool cap_has_mp_state;
+static unsigned int brk_insn;
 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 KVM_CAP_LAST_INFO
 };
@@ -658,7 +659,14 @@ static void kvm_loongarch_vm_stage_change(void *opaque, 
bool running,
 
 int kvm_arch_init_vcpu(CPUState *cs)
 {
+uint64_t val;
+
 qemu_add_vm_change_state_handler(kvm_loongarch_vm_stage_change, cs);
+
+if (!kvm_get_one_reg(cs, KVM_REG_LOONGARCH_DEBUG_INST, )) {
+brk_insn = val;
+}
+
 return 0;
 }
 
@@ -733,6 +741,67 @@ bool kvm_arch_stop_on_emulation_error(CPUState *cs)
 return true;
 }
 
+void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
+{
+if (kvm_sw_breakpoints_active(cpu)) {
+dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
+}
+}
+
+int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
+{
+if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)>saved_insn, 4, 0) ||
+cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)_insn, 4, 1)) {
+error_report("%s failed", __func__);
+return -EINVAL;
+}
+return 0;
+}
+
+int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
+{
+static uint32_t brk;
+
+if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *), 4, 0) ||
+brk != brk_insn ||
+cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)>saved_insn, 4, 1)) {
+error_report("%s failed", __func__);
+return -EINVAL;
+}
+return 0;
+}
+
+int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type)
+{
+return -ENOSYS;
+}
+
+int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type)
+{
+return -ENOSYS;
+}
+
+void kvm_arch_remove_all_hw_breakpoints(void)
+{
+}
+
+static bool kvm_loongarch_handle_debug(CPUState *cs, struct kvm_run *run)
+{
+LoongArchCPU *cpu = LOONGARCH_CPU(cs);
+CPULoongArchState *env = >env;
+
+kvm_cpu_synchronize_state(cs);
+if (cs->singlestep_enabled) {
+return true;
+}
+
+if (kvm_find_sw_breakpoint(cs, env->pc)) {
+return true;
+}
+
+return false;
+}
+
 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
 {
 int ret = 0;
@@ -751,6 +820,13 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
  run->iocsr_io.len,
  run->iocsr_io.is_write);
 break;
+
+case KVM_EXIT_DEBUG:
+if (kvm_loongarch_handle_debug(cs, run)) {
+ret = EXCP_DEBUG;
+}
+break;
+
 default:
 ret = -1;
 warn_report("KVM: unknown exit reason %d", run->exit_reason);

base-commit: 922582ace2df59572a671f5c0c5c6c5c706995e5
-- 
2.39.3




Re: [PATCH v7 00/12] Enabling DCD emulation support in Qemu

2024-05-16 Thread Zhijian Li (Fujitsu)
Fan,

Many thanks, it helps a lot. Previous I forgot to create a new dax 
device(daxctl create-device region0)
Question: Why do we need to create the dax0.1? Why doesn't the dax0.0
associate with the newly added DCD region?

Ira,

Let me try to report a kernel panic.

kernel: dcd-2024-04-17
qemu: dcd-2024-04-17

QEMU command line:
164 
165 
166 
167 
168 
169 
170 
171 
172 
173 


Reproducer:
  1. guest: ./create-dc.sh
  2. host: virsh qemu-monitor-command rdma-server-cxl-persistent-dcd $(cat 
cxl-add-dcd.json)
  3. guest: daxctl create-device region0 # will create dax0.1
  4. daxctl reconfigure-device  --mode=system-ram --force  dax0.1 -u  # kernel 
panic

=
# cat ./create-dc.sh
#!/bin/bash
set -ex

region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region)
echo $region> /sys/bus/cxl/devices/decoder0.0/create_dc_region
echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity
echo 1 > /sys/bus/cxl/devices/$region/interleave_ways
echo "dc0" >/sys/bus/cxl/devices/decoder2.0/mode
echo 0x1000 >/sys/bus/cxl/devices/decoder2.0/dpa_size
echo 0x1000 > /sys/bus/cxl/devices/$region/size
echo "decoder2.0" > /sys/bus/cxl/devices/$region/target0
echo 1 > /sys/bus/cxl/devices/$region/commit
echo $region > /sys/bus/cxl/drivers/cxl_region/bind
=
# cat cxl-add-dcd.json
{ "execute": "cxl-add-dynamic-capacity",
   "arguments": {
   "path": "/machine/peripheral/pmem-dcmem",
   "hid": 0,
   "selection-policy": 2,
   "region-id": 0,
   "tag": "",
   "extents": [
   {
   "offset": 0,
   "len": 268435456
   }
   ]
   }
}



[  126.909297] Demotion targets for Node 0: preferred: 1, fallback: 1
[  126.911186] Demotion targets for Node 1: null
[  126.913808] BUG: kernel NULL pointer dereference, address: 0468
[  126.915431] #PF: supervisor read access in kernel mode
[  126.917156] #PF: error_code(0x) - not-present page
[  126.918976] PGD 86771067 P4D 86771067 PUD e777067 PMD 0
[  126.920587] Oops:  [#1] PREEMPT SMP PTI
[  126.921714] CPU: 0 PID: 1101 Comm: daxctl Kdump: loaded Not tainted 
6.9.0-rc3-lizhijian+ #489
[  126.924914] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[  126.928620] RIP: 0010:cxl_region_perf_attrs_callback+0x25/0x110 [cxl_core]
[  126.930316] Code: 90 90 90 90 90 0f 1f 44 00 00 41 56 41 55 41 54 55 53 8b 
6a 24 83 fd ff 74 20 48 83 fe 01 75 1a 48 8b 87 58 ff ff ff 48 89 fb <48> 8b b8 
68 04 00 00 e8 cf a2 f4 e0 39 c5 74 13 45 31 e4 5b 44 89
[  126.934920] RSP: 0018:c97cbc58 EFLAGS: 00010246
[  126.936994] RAX:  RBX: 888007534d60 RCX: 0020
[  126.939378] RDX: c97cbcf8 RSI: 0001 RDI: 888007534d60
[  126.942721] RBP: 0001 R08: 0001 R09: 0001
[  126.944762] R10: 88807fc31d80 R11:  R12: 
[  126.946900] R13: 0001 R14: c97cbcf8 R15: 888007534d60
[  126.948871] FS:  7fb2ab918880() GS:88807fc0() 
knlGS:
[  126.951241] CS:  0010 DS:  ES:  CR0: 80050033
[  126.952722] CR2: 0468 CR3: 0aaf0003 CR4: 001706f0
[  126.954623] DR0:  DR1:  DR2: 
[  126.956768] DR3:  DR6: fffe0ff0 DR7: 0400
[  126.958887] Call Trace:
[  126.959814]  
[  126.960569]  ? __die+0x20/0x70
[  126.961645]  ? page_fault_oops+0x15a/0x450
[  126.962930]  ? search_module_extables+0x33/0x90
[  126.964374]  ? fixup_exception+0x22/0x310
[  126.965693]  ? exc_page_fault+0x68/0x200
[  126.967371]  ? asm_exc_page_fault+0x22/0x30
[  126.968713]  ? cxl_region_perf_attrs_callback+0x25/0x110 [cxl_core]
[  126.972508]  notifier_call_chain+0x40/0x110
[  126.974380]  blocking_notifier_call_chain+0x43/0x60
[  126.975788]  online_pages+0x24c/0x2d0
[  126.977008]  memory_subsys_online+0x233/0x290
[  126.978338]  device_online+0x64/0x90
[  126.979440]  state_store+0xae/0xc0
[  126.980510]  kernfs_fop_write_iter+0x143/0x200
[  126.981734]  vfs_write+0x3a6/0x570
[  126.982851]  ksys_write+0x65/0xf0
[  126.984006]  do_syscall_64+0x6d/0x140
[  126.985309]  entry_SYSCALL_64_after_hwframe+0x71/0x79
[  126.986927] RIP: 0033:0x7fb2abc777a7
[  126.987983] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 
f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 
f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
[  126.992770] RSP: 002b:7ffebec70b98 EFLAGS: 0246 ORIG_RAX: 
0001
[  126.994874] RAX: ffda RBX: 0040e1f0 RCX: 7fb2abc777a7
[  126.996906] RDX: 000f RSI: 7fb2abdb6434 RDI: 0004
[  126.998911] RBP: 7ffebec70bd0 R08:  R09: 7ffebec70640
[  127.000879] R10: 

Re: [PATCH] target/riscv: zvbb implies zvkb

2024-05-16 Thread Frank Chang
Reviewed-by: Frank Chang 

On Thu, May 16, 2024 at 8:34 PM Jerry Zhang Jian 
wrote:

> - According to RISC-V crypto spec, Zvkb extension is a proper subset of
> the Zvbb extension.
>
> - Reference:
> https://github.com/riscv/riscv-crypto/blob/1769c2609bf4535632e0c0fd715778f212bb272e/doc/vector/riscv-crypto-vector-zvkb.adoc?plain=1#L10
>
> Signed-off-by: Jerry Zhang Jian 
> ---
>  target/riscv/tcg/tcg-cpu.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/target/riscv/tcg/tcg-cpu.c b/target/riscv/tcg/tcg-cpu.c
> index 40054a391a..f1a1306ab2 100644
> --- a/target/riscv/tcg/tcg-cpu.c
> +++ b/target/riscv/tcg/tcg-cpu.c
> @@ -658,6 +658,10 @@ void riscv_cpu_validate_set_extensions(RISCVCPU *cpu,
> Error **errp)
>  cpu_cfg_ext_auto_update(cpu, CPU_CFG_OFFSET(ext_zvbc), true);
>  }
>
> +if (cpu->cfg.ext_zvbb) {
> +cpu_cfg_ext_auto_update(cpu, CPU_CFG_OFFSET(ext_zvkb), true);
> +}
> +
>  /*
>   * In principle Zve*x would also suffice here, were they supported
>   * in qemu
> --
> 2.42.0
>
>


Re: [PULL 0/5] loongarch-to-apply queue

2024-05-16 Thread gaosong

在 2024/5/16 下午5:28, Peter Maydell 写道:

On Thu, 16 May 2024 at 10:12, Song Gao  wrote:

The following changes since commit 922582ace2df59572a671f5c0c5c6c5c706995e5:

   Merge tag 'pull-hppa-20240515' of https://gitlab.com/rth7680/qemu into 
staging (2024-05-15 11:46:58 +0200)

are available in the Git repository at:

   https://gitlab.com/gaosong/qemu.git tags/pull-loongarch-20240516

for you to fetch changes up to d55d16700a2e2b36c7e34724d4d77f4a75c5243a:

   target/loongarch/kvm: fpu save the vreg registers high 192bit (2024-05-16 
16:32:35 +0800)


pull-loongarch-20240516


Bibo Mao (3):
   hw/loongarch: Add compat machine for 9.1
   hw/loongarch: Remove minimum and default memory size
   tests: Add migration test for loongarch64

RTH: I had a comment about adding the versioned machine type, so we
should hold off on applying this until that is resolved, I think.

Agreed, we will try to resolve it. Thanks for your explanation.

Thanks.
Song Gao

thanks
-- PMM





Re: [PATCH 1/1] riscv, gdbstub.c: fix reg_width in ricsv_gen_dynamic_vector_feature()

2024-05-16 Thread LIU Zhiwei



On 2024/5/17 1:10, Daniel Henrique Barboza wrote:

Commit 33a24910ae changed 'reg_width' to use 'vlenb', i.e. vector length
in bytes, when in this context we want 'reg_width' as the length in
bits.

Fix 'reg_width' back to the value in bits like 7cb59921c05a
("target/riscv/gdbstub.c: use 'vlenb' instead of shifting 'vlen'") set
beforehand.

Cc: Akihiko Odaki 
Cc: Alex Bennée 
Reported-by: Robin Dapp 
Fixes: 33a24910ae ("target/riscv: Use GDBFeature for dynamic XML")
Signed-off-by: Daniel Henrique Barboza 


Reviewed-by: LIU Zhiwei 

Zhiwei


---
  target/riscv/gdbstub.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/riscv/gdbstub.c b/target/riscv/gdbstub.c
index d0cc5762c2..358158c42a 100644
--- a/target/riscv/gdbstub.c
+++ b/target/riscv/gdbstub.c
@@ -288,7 +288,7 @@ static GDBFeature *riscv_gen_dynamic_csr_feature(CPUState 
*cs, int base_reg)
  static GDBFeature *ricsv_gen_dynamic_vector_feature(CPUState *cs, int 
base_reg)
  {
  RISCVCPU *cpu = RISCV_CPU(cs);
-int reg_width = cpu->cfg.vlenb;
+int reg_width = cpu->cfg.vlenb << 3;
  GDBFeatureBuilder builder;
  int i;
  




Re: [PULL 04/17] virtio-net: Add support for USO features

2024-05-16 Thread Jason Wang
On Thu, May 16, 2024 at 9:51 PM Fiona Ebner  wrote:
>
> Hi,
>
> Am 08.09.23 um 08:44 schrieb Jason Wang:
> > diff --git a/hw/core/machine.c b/hw/core/machine.c
> > index da699cf..230aab8 100644
> > --- a/hw/core/machine.c
> > +++ b/hw/core/machine.c
> > @@ -38,6 +38,7 @@
> >  #include "exec/confidential-guest-support.h"
> >  #include "hw/virtio/virtio.h"
> >  #include "hw/virtio/virtio-pci.h"
> > +#include "hw/virtio/virtio-net.h"
> >
> >  GlobalProperty hw_compat_8_1[] = {};
> >  const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
> > @@ -45,6 +46,9 @@ const size_t hw_compat_8_1_len = 
> > G_N_ELEMENTS(hw_compat_8_1);
> >  GlobalProperty hw_compat_8_0[] = {
> >  { "migration", "multifd-flush-after-each-section", "on"},
> >  { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
> > +{ TYPE_VIRTIO_NET, "host_uso", "off"},
> > +{ TYPE_VIRTIO_NET, "guest_uso4", "off"},
> > +{ TYPE_VIRTIO_NET, "guest_uso6", "off"},
> >  };
> >  const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
> >
>
> unfortunately, this broke backwards migration with machine version 8.1
> from 8.2 and 9.0 binaries to a 8.1 binary:
>
> > kvm: Features 0x1c0010130afffa7 unsupported. Allowed features: 0x10179bfffe7
> > kvm: Failed to load virtio-net:virtio
> > kvm: error while loading state for instance 0x0 of device 
> > ':00:12.0/virtio-net'
> > kvm: load of migration failed: Operation not permitted
>
> Since the series here only landed in 8.2, shouldn't these flags have
> been added to hw_compat_8_1[] instead?

You are right. We need to put them into hw_compat_8_1[].

>
> Attempting to fix it by moving the flags will break migration with
> machine version 8.1 between patched 9.0 and unpatched 9.0 however :(

I'm sorry but I can't think of a way better.

>
> Is there anything that can be done or will it need to stay broken now?

Would you mind posting a patch to fix this and cc stable?

>
> CC-ing the migration maintainers.
>
> Best Regards,
> Fiona
>

Thanks




Re: [PATCH v6 6/8] xen: mapcache: Pass the ram_addr offset to xen_map_cache()

2024-05-16 Thread Stefano Stabellini
On Thu, 16 May 2024, Edgar E. Iglesias wrote:
> From: "Edgar E. Iglesias" 
> 
> Pass the ram_addr offset to xen_map_cache.
> This is in preparation for adding grant mappings that need
> to compute the address within the RAMBlock.
> 
> No functional changes.
> 
> Signed-off-by: Edgar E. Iglesias 

Reviewed-by: Stefano Stabellini 




[PATCH 5/5] contrib/plugins: add ips plugin example for cost modeling

2024-05-16 Thread Pierrick Bouvier
This plugin uses the new time control interface to make decisions
about the state of time during the emulation. The algorithm is
currently very simple. The user specifies an ips rate which applies
per core. If the core runs ahead of its allocated execution time the
plugin sleeps for a bit to let real time catch up. Either way time is
updated for the emulation as a function of total executed instructions
with some adjustments for cores that idle.

Examples


Slow down execution of /bin/true:
$ num_insn=$(./build/qemu-x86_64 -plugin ./build/tests/plugin/libinsn.so -d 
plugin /bin/true |& grep total | sed -e 's/.*: //')
$ time ./build/qemu-x86_64 -plugin 
./build/contrib/plugins/libips.so,ips=$(($num_insn/4)) /bin/true
real 4.000s

Boot a Linux kernel simulating a 250MHz cpu:
$ /build/qemu-system-x86_64 -kernel /boot/vmlinuz-6.1.0-21-amd64 -append 
"console=ttyS0" -plugin 
./build/contrib/plugins/libips.so,ips=$((250*1000*1000)) -smp 1 -m 512
check time until kernel panic on serial0

Signed-off-by: Pierrick Bouvier 
---
 contrib/plugins/ips.c| 239 +++
 contrib/plugins/Makefile |   1 +
 2 files changed, 240 insertions(+)
 create mode 100644 contrib/plugins/ips.c

diff --git a/contrib/plugins/ips.c b/contrib/plugins/ips.c
new file mode 100644
index 000..cf3159df391
--- /dev/null
+++ b/contrib/plugins/ips.c
@@ -0,0 +1,239 @@
+/*
+ * ips rate limiting plugin.
+ *
+ * This plugin can be used to restrict the execution of a system to a
+ * particular number of Instructions Per Second (ips). This controls
+ * time as seen by the guest so while wall-clock time may be longer
+ * from the guests point of view time will pass at the normal rate.
+ *
+ * This uses the new plugin API which allows the plugin to control
+ * system time.
+ *
+ * Copyright (c) 2023 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include 
+#include 
+#include 
+
+QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
+
+/* how many times do we update time per sec */
+#define NUM_TIME_UPDATE_PER_SEC 10
+#define NSEC_IN_ONE_SEC (1000 * 1000 * 1000)
+
+static GMutex global_state_lock;
+
+static uint64_t insn_per_second = 1000 * 1000; /* ips per core, per second */
+static uint64_t insn_quantum; /* trap every N instructions */
+static bool precise_execution; /* count every instruction */
+static int64_t start_time_ns; /* time (ns since epoch) first vCPU started */
+static int64_t virtual_time_ns; /* last set virtual time */
+
+static const void *time_handle;
+
+typedef enum {
+UNKNOWN = 0,
+EXECUTING,
+IDLE,
+FINISHED
+} vCPUState;
+
+typedef struct {
+uint64_t counter;
+uint64_t track_insn;
+vCPUState state;
+/* timestamp when vCPU entered state */
+int64_t last_state_time;
+} vCPUTime;
+
+struct qemu_plugin_scoreboard *vcpus;
+
+/* return epoch time in ns */
+static int64_t now_ns(void)
+{
+return g_get_real_time() * 1000;
+}
+
+static uint64_t num_insn_during(int64_t elapsed_ns)
+{
+double num_secs = elapsed_ns / (double) NSEC_IN_ONE_SEC;
+return num_secs * (double) insn_per_second;
+}
+
+static int64_t time_for_insn(uint64_t num_insn)
+{
+double num_secs = (double) num_insn / (double) insn_per_second;
+return num_secs * (double) NSEC_IN_ONE_SEC;
+}
+
+static int64_t uptime_ns(void)
+{
+int64_t now = now_ns();
+g_assert(now >= start_time_ns);
+return now - start_time_ns;
+}
+
+static void vcpu_set_state(vCPUTime *vcpu, vCPUState new_state)
+{
+vcpu->last_state_time = now_ns();
+vcpu->state = new_state;
+}
+
+static void update_system_time(vCPUTime *vcpu)
+{
+/* flush remaining instructions */
+vcpu->counter += vcpu->track_insn;
+vcpu->track_insn = 0;
+
+int64_t uptime = uptime_ns();
+uint64_t expected_insn = num_insn_during(uptime);
+
+if (vcpu->counter >= expected_insn) {
+/* this vcpu ran faster than expected, so it has to sleep */
+uint64_t insn_advance = vcpu->counter - expected_insn;
+uint64_t time_advance_ns = time_for_insn(insn_advance);
+int64_t sleep_us = time_advance_ns / 1000;
+g_usleep(sleep_us);
+}
+
+/* based on number of instructions, what should be the new time? */
+int64_t new_virtual_time = time_for_insn(vcpu->counter);
+
+g_mutex_lock(_state_lock);
+
+/* Time only moves forward. Another vcpu might have updated it already. */
+if (new_virtual_time > virtual_time_ns) {
+qemu_plugin_update_ns(time_handle, new_virtual_time);
+virtual_time_ns = new_virtual_time;
+}
+
+g_mutex_unlock(_state_lock);
+}
+
+static void set_start_time()
+{
+g_mutex_lock(_state_lock);
+if (!start_time_ns) {
+start_time_ns = now_ns();
+}
+g_mutex_unlock(_state_lock);
+}
+
+static void vcpu_init(qemu_plugin_id_t id, unsigned int cpu_index)
+{
+vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
+/* ensure start time is set first */
+

[PATCH 4/5] plugins: add time control API

2024-05-16 Thread Pierrick Bouvier
From: Alex Bennée 

Expose the ability to control time through the plugin API. Only one
plugin can control time so it has to request control when loaded.
There are probably more corner cases to catch here.

From: Alex Bennée 
Signed-off-by: Alex Bennée 
---
 include/qemu/qemu-plugin.h   | 23 +++
 plugins/api.c| 31 +++
 plugins/qemu-plugins.symbols |  2 ++
 3 files changed, 56 insertions(+)

diff --git a/include/qemu/qemu-plugin.h b/include/qemu/qemu-plugin.h
index 95703d8fec1..80b1637cede 100644
--- a/include/qemu/qemu-plugin.h
+++ b/include/qemu/qemu-plugin.h
@@ -661,6 +661,29 @@ void qemu_plugin_register_vcpu_mem_inline_per_vcpu(
 qemu_plugin_u64 entry,
 uint64_t imm);
 
+/**
+ * qemu_plugin_request_time_control() - request the ability to control time
+ *
+ * This grants the plugin the ability to control system time. Only one
+ * plugin can control time so if multiple plugins request the ability
+ * all but the first will fail.
+ *
+ * Returns an opaque handle or NULL if fails
+ */
+const void *qemu_plugin_request_time_control(void);
+
+/**
+ * qemu_plugin_update_ns() - update system emulation time
+ * @handle: opaque handle returned by qemu_plugin_request_time_control()
+ * @time: time in nanoseconds
+ *
+ * This allows an appropriately authorised plugin (i.e. holding the
+ * time control handle) to move system time forward to @time.
+ *
+ * Start time is 0.
+ */
+void qemu_plugin_update_ns(const void *handle, int64_t time);
+
 typedef void
 (*qemu_plugin_vcpu_syscall_cb_t)(qemu_plugin_id_t id, unsigned int vcpu_index,
  int64_t num, uint64_t a1, uint64_t a2,
diff --git a/plugins/api.c b/plugins/api.c
index 5a0a7f8c712..26822b69ea2 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -39,6 +39,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/plugin.h"
 #include "qemu/log.h"
+#include "qemu/timer.h"
 #include "tcg/tcg.h"
 #include "exec/exec-all.h"
 #include "exec/gdbstub.h"
@@ -583,3 +584,33 @@ uint64_t qemu_plugin_u64_sum(qemu_plugin_u64 entry)
 }
 return total;
 }
+
+/*
+ * Time control
+ */
+static bool has_control;
+
+const void *qemu_plugin_request_time_control(void)
+{
+if (!has_control) {
+has_control = true;
+return _control;
+}
+return NULL;
+}
+
+static void advance_virtual_time__async(CPUState *cpu, run_on_cpu_data data)
+{
+int64_t new_time = data.host_ulong;
+qemu_clock_advance_virtual_time(new_time);
+}
+
+void qemu_plugin_update_ns(const void *handle, int64_t new_time)
+{
+if (handle == _control) {
+/* Need to execute out of cpu_exec, so bql can be locked. */
+async_run_on_cpu(current_cpu,
+ advance_virtual_time__async,
+ RUN_ON_CPU_HOST_ULONG(new_time));
+}
+}
diff --git a/plugins/qemu-plugins.symbols b/plugins/qemu-plugins.symbols
index aa0a77a319f..ca773d8d9fe 100644
--- a/plugins/qemu-plugins.symbols
+++ b/plugins/qemu-plugins.symbols
@@ -38,6 +38,7 @@
   qemu_plugin_register_vcpu_tb_exec_cond_cb;
   qemu_plugin_register_vcpu_tb_exec_inline_per_vcpu;
   qemu_plugin_register_vcpu_tb_trans_cb;
+  qemu_plugin_request_time_control;
   qemu_plugin_reset;
   qemu_plugin_scoreboard_free;
   qemu_plugin_scoreboard_find;
@@ -51,5 +52,6 @@
   qemu_plugin_u64_set;
   qemu_plugin_u64_sum;
   qemu_plugin_uninstall;
+  qemu_plugin_update_ns;
   qemu_plugin_vcpu_for_each;
 };
-- 
2.39.2




[PATCH 1/5] sysemu: add set_virtual_time to accel ops

2024-05-16 Thread Pierrick Bouvier
From: Alex Bennée 

We are about to remove direct calls to individual accelerators for
this information and will need a central point for plugins to hook
into time changes.

From: Alex Bennée 
Signed-off-by: Alex Bennée 
Reviewed-by: Philippe Mathieu-Daudé 
---
 include/sysemu/accel-ops.h | 18 +-
 include/sysemu/cpu-timers.h|  3 ++-
 ...et-virtual-clock.c => cpus-virtual-clock.c} |  5 +
 system/cpus.c  | 11 +++
 stubs/meson.build  |  6 +-
 5 files changed, 40 insertions(+), 3 deletions(-)
 rename stubs/{cpus-get-virtual-clock.c => cpus-virtual-clock.c} (68%)

diff --git a/include/sysemu/accel-ops.h b/include/sysemu/accel-ops.h
index ef91fc28bbd..a0886722305 100644
--- a/include/sysemu/accel-ops.h
+++ b/include/sysemu/accel-ops.h
@@ -20,7 +20,12 @@
 typedef struct AccelOpsClass AccelOpsClass;
 DECLARE_CLASS_CHECKERS(AccelOpsClass, ACCEL_OPS, TYPE_ACCEL_OPS)
 
-/* cpus.c operations interface */
+/**
+ * struct AccelOpsClass - accelerator interfaces
+ *
+ * This structure is used to abstract accelerator differences from the
+ * core CPU code. Not all have to be implemented.
+ */
 struct AccelOpsClass {
 /*< private >*/
 ObjectClass parent_class;
@@ -44,7 +49,18 @@ struct AccelOpsClass {
 
 void (*handle_interrupt)(CPUState *cpu, int mask);
 
+/**
+ * @get_virtual_clock: fetch virtual clock
+ * @set_virtual_clock: set virtual clock
+ *
+ * These allow the timer subsystem to defer to the accelerator to
+ * fetch time. The set function is needed if the accelerator wants
+ * to track the changes to time as the timer is warped through
+ * various timer events.
+ */
 int64_t (*get_virtual_clock)(void);
+void (*set_virtual_clock)(int64_t time);
+
 int64_t (*get_elapsed_ticks)(void);
 
 /* gdbstub hooks */
diff --git a/include/sysemu/cpu-timers.h b/include/sysemu/cpu-timers.h
index d86738a378d..7bfa960fbd6 100644
--- a/include/sysemu/cpu-timers.h
+++ b/include/sysemu/cpu-timers.h
@@ -96,8 +96,9 @@ int64_t cpu_get_clock(void);
 
 void qemu_timer_notify_cb(void *opaque, QEMUClockType type);
 
-/* get the VIRTUAL clock and VM elapsed ticks via the cpus accel interface */
+/* get/set VIRTUAL clock and VM elapsed ticks via the cpus accel interface */
 int64_t cpus_get_virtual_clock(void);
+void cpus_set_virtual_clock(int64_t new_time);
 int64_t cpus_get_elapsed_ticks(void);
 
 #endif /* SYSEMU_CPU_TIMERS_H */
diff --git a/stubs/cpus-get-virtual-clock.c b/stubs/cpus-virtual-clock.c
similarity index 68%
rename from stubs/cpus-get-virtual-clock.c
rename to stubs/cpus-virtual-clock.c
index fd447d53f3c..af7c1a1d403 100644
--- a/stubs/cpus-get-virtual-clock.c
+++ b/stubs/cpus-virtual-clock.c
@@ -6,3 +6,8 @@ int64_t cpus_get_virtual_clock(void)
 {
 return cpu_get_clock();
 }
+
+void cpus_set_virtual_clock(int64_t new_time)
+{
+/* do nothing */
+}
diff --git a/system/cpus.c b/system/cpus.c
index 68d161d96b7..03ba026667c 100644
--- a/system/cpus.c
+++ b/system/cpus.c
@@ -229,6 +229,17 @@ int64_t cpus_get_virtual_clock(void)
 return cpu_get_clock();
 }
 
+/*
+ * Signal the new virtual time to the accelerator. This is only needed
+ * by accelerators that need to track the changes as we warp time.
+ */
+void cpus_set_virtual_clock(int64_t new_time)
+{
+if (cpus_accel && cpus_accel->set_virtual_clock) {
+cpus_accel->set_virtual_clock(new_time);
+}
+}
+
 /*
  * return the time elapsed in VM between vm_start and vm_stop.  Unless
  * icount is active, cpus_get_elapsed_ticks() uses units of the host CPU cycle
diff --git a/stubs/meson.build b/stubs/meson.build
index 3b9d42023cb..672213b7482 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -3,6 +3,11 @@
 # below, so that it is clear who needs the stubbed functionality.
 
 stub_ss.add(files('cpu-get-clock.c'))
+stub_ss.add(files('cpus-virtual-clock.c'))
+stub_ss.add(files('qemu-timer-notify-cb.c'))
+stub_ss.add(files('icount.c'))
+stub_ss.add(files('dump.c'))
+stub_ss.add(files('error-printf.c'))
 stub_ss.add(files('fdset.c'))
 stub_ss.add(files('iothread-lock.c'))
 stub_ss.add(files('is-daemonized.c'))
@@ -28,7 +33,6 @@ endif
 if have_block or have_ga
   stub_ss.add(files('replay-tools.c'))
   # stubs for hooks in util/main-loop.c, util/async.c etc.
-  stub_ss.add(files('cpus-get-virtual-clock.c'))
   stub_ss.add(files('icount.c'))
   stub_ss.add(files('graph-lock.c'))
   if linux_io_uring.found()
-- 
2.39.2




[PATCH 2/5] qtest: use cpu interface in qtest_clock_warp

2024-05-16 Thread Pierrick Bouvier
From: Alex Bennée 

This generalises the qtest_clock_warp code to use the AccelOps
handlers for updating its own sense of time. This will make the next
patch which moves the warp code closer to pure code motion.

From: Alex Bennée 
Signed-off-by: Alex Bennée 
Acked-by: Thomas Huth 
---
 include/sysemu/qtest.h | 1 +
 accel/qtest/qtest.c| 1 +
 system/qtest.c | 6 +++---
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/sysemu/qtest.h b/include/sysemu/qtest.h
index b5d5fd34637..45f3b7e1df5 100644
--- a/include/sysemu/qtest.h
+++ b/include/sysemu/qtest.h
@@ -36,6 +36,7 @@ void qtest_server_set_send_handler(void (*send)(void *, const 
char *),
 void qtest_server_inproc_recv(void *opaque, const char *buf);
 
 int64_t qtest_get_virtual_clock(void);
+void qtest_set_virtual_clock(int64_t count);
 #endif
 
 #endif
diff --git a/accel/qtest/qtest.c b/accel/qtest/qtest.c
index f6056ac8361..53182e6c2ae 100644
--- a/accel/qtest/qtest.c
+++ b/accel/qtest/qtest.c
@@ -52,6 +52,7 @@ static void qtest_accel_ops_class_init(ObjectClass *oc, void 
*data)
 
 ops->create_vcpu_thread = dummy_start_vcpu_thread;
 ops->get_virtual_clock = qtest_get_virtual_clock;
+ops->set_virtual_clock = qtest_set_virtual_clock;
 };
 
 static const TypeInfo qtest_accel_ops_type = {
diff --git a/system/qtest.c b/system/qtest.c
index 6da58b3874e..ee8b139e982 100644
--- a/system/qtest.c
+++ b/system/qtest.c
@@ -332,14 +332,14 @@ int64_t qtest_get_virtual_clock(void)
 return qatomic_read_i64(_clock_counter);
 }
 
-static void qtest_set_virtual_clock(int64_t count)
+void qtest_set_virtual_clock(int64_t count)
 {
 qatomic_set_i64(_clock_counter, count);
 }
 
 static void qtest_clock_warp(int64_t dest)
 {
-int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+int64_t clock = cpus_get_virtual_clock();
 AioContext *aio_context;
 assert(qtest_enabled());
 aio_context = qemu_get_aio_context();
@@ -348,7 +348,7 @@ static void qtest_clock_warp(int64_t dest)
   QEMU_TIMER_ATTR_ALL);
 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 
-qtest_set_virtual_clock(qtest_get_virtual_clock() + warp);
+cpus_set_virtual_clock(cpus_get_virtual_clock() + warp);
 
 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
-- 
2.39.2




[PATCH 0/5] Implement icount=auto using TCG Plugins

2024-05-16 Thread Pierrick Bouvier
The goal here is to be able to temporally scale the execution of qemu-user/system,
using a given number of instructions per second.

We define a virtual clock that can be late or in advance compared to real time.
When we are in advance, we slow execution (by sleeping) until real time
catches up.

Finally, we should be able to clean up icount=auto mode completely, and keep
icount usage for deterministic purposes only.

It is built upon new TCG Plugins inline ops (store + conditional callbacks), now
merged on master.

Example in user-mode:

- Retrieve number of instructions to execute /bin/true
$ ./build/qemu-x86_64 -plugin ./build/tests/plugin/libinsn.so -d plugin 
/bin/true
cpu 0 insns: 120546
total insns: 120546
- Slow execution to match 5 seconds
$ time ./build/qemu-x86_64 -plugin 
./build/contrib/plugins/libips,ips=$((120546/5)) /bin/true
real0m4.985s

Alex Bennée (4):
  sysemu: add set_virtual_time to accel ops
  qtest: use cpu interface in qtest_clock_warp
  sysemu: generalise qtest_warp_clock as qemu_clock_advance_virtual_time
  plugins: add time control API

Pierrick Bouvier (1):
  contrib/plugins: add ips plugin example for cost modeling

 include/qemu/qemu-plugin.h|  23 ++
 include/qemu/timer.h  |  15 ++
 include/sysemu/accel-ops.h|  18 +-
 include/sysemu/cpu-timers.h   |   3 +-
 include/sysemu/qtest.h|   1 +
 accel/qtest/qtest.c   |   1 +
 contrib/plugins/ips.c | 239 ++
 plugins/api.c |  31 +++
 ...t-virtual-clock.c => cpus-virtual-clock.c} |   5 +
 system/cpus.c |  11 +
 system/qtest.c|  27 +-
 util/qemu-timer.c |  26 ++
 contrib/plugins/Makefile  |   1 +
 plugins/qemu-plugins.symbols  |   2 +
 stubs/meson.build |   6 +-
 15 files changed, 383 insertions(+), 26 deletions(-)
 create mode 100644 contrib/plugins/ips.c
 rename stubs/{cpus-get-virtual-clock.c => cpus-virtual-clock.c} (68%)

-- 
2.39.2




[PATCH 3/5] sysemu: generalise qtest_warp_clock as qemu_clock_advance_virtual_time

2024-05-16 Thread Pierrick Bouvier
From: Alex Bennée 

Move the key functionality of moving time forward into the clock
sub-system itself. This will allow us to plumb in time control into
plugins.

From: Alex Bennée 
Signed-off-by: Alex Bennée 
---
 include/qemu/timer.h | 15 +++
 system/qtest.c   | 25 +++--
 util/qemu-timer.c| 26 ++
 3 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 9a366e551fb..910587d8293 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -245,6 +245,21 @@ bool qemu_clock_run_timers(QEMUClockType type);
  */
 bool qemu_clock_run_all_timers(void);
 
+/**
+ * qemu_clock_advance_virtual_time(): advance the virtual time tick
+ * @target: target time in nanoseconds
+ *
+ * This function is used where the control of the flow of time has
+ * been delegated to outside the clock subsystem (be it qtest, icount
+ * or some other external source). You can ask the clock system to
+ * return @early at the first expired timer.
+ *
+ * Time can only move forward, attempts to reverse time would lead to
+ * an error.
+ *
+ * Returns: new virtual time.
+ */
+int64_t qemu_clock_advance_virtual_time(int64_t dest);
 
 /*
  * QEMUTimerList
diff --git a/system/qtest.c b/system/qtest.c
index ee8b139e982..e6f6b4e62d5 100644
--- a/system/qtest.c
+++ b/system/qtest.c
@@ -337,26 +337,6 @@ void qtest_set_virtual_clock(int64_t count)
 qatomic_set_i64(_clock_counter, count);
 }
 
-static void qtest_clock_warp(int64_t dest)
-{
-int64_t clock = cpus_get_virtual_clock();
-AioContext *aio_context;
-assert(qtest_enabled());
-aio_context = qemu_get_aio_context();
-while (clock < dest) {
-int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
-  QEMU_TIMER_ATTR_ALL);
-int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
-
-cpus_set_virtual_clock(cpus_get_virtual_clock() + warp);
-
-qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
-timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
-clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
-}
-qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
-}
-
 static bool (*process_command_cb)(CharBackend *chr, gchar **words);
 
 void qtest_set_command_cb(bool (*pc_cb)(CharBackend *chr, gchar **words))
@@ -755,7 +735,8 @@ static void qtest_process_command(CharBackend *chr, gchar 
**words)
 ns = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 QEMU_TIMER_ATTR_ALL);
 }
-qtest_clock_warp(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ns);
+qemu_clock_advance_virtual_time(
+qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ns);
 qtest_send_prefix(chr);
 qtest_sendf(chr, "OK %"PRIi64"\n",
 (int64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
@@ -781,7 +762,7 @@ static void qtest_process_command(CharBackend *chr, gchar 
**words)
 g_assert(words[1]);
 ret = qemu_strtoi64(words[1], NULL, 0, );
 g_assert(ret == 0);
-qtest_clock_warp(ns);
+qemu_clock_advance_virtual_time(ns);
 qtest_send_prefix(chr);
 qtest_sendf(chr, "OK %"PRIi64"\n",
 (int64_t)qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index 6a0de33dd2b..213114be68c 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -645,6 +645,11 @@ int64_t qemu_clock_get_ns(QEMUClockType type)
 }
 }
 
+static void qemu_virtual_clock_set_ns(int64_t time)
+{
+return cpus_set_virtual_clock(time);
+}
+
 void init_clocks(QEMUTimerListNotifyCB *notify_cb)
 {
 QEMUClockType type;
@@ -675,3 +680,24 @@ bool qemu_clock_run_all_timers(void)
 
 return progress;
 }
+
+int64_t qemu_clock_advance_virtual_time(int64_t dest)
+{
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+AioContext *aio_context;
+aio_context = qemu_get_aio_context();
+while (clock < dest) {
+int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+  QEMU_TIMER_ATTR_ALL);
+int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
+
+qemu_virtual_clock_set_ns(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 
warp);
+
+qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
+timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
+clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+}
+qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+
+return clock;
+}
-- 
2.39.2




Re: [PATCH 1/9] monitor: Honor QMP request for fd removal immediately

2024-05-16 Thread Fabiano Rosas
Daniel P. Berrangé  writes:

> On Fri, Apr 26, 2024 at 11:20:34AM -0300, Fabiano Rosas wrote:
>> We're enabling using the fdset interface to pass file descriptors for
>> use in the migration code. Since migrations can happen more than once
>> during the VMs lifetime, we need a way to remove an fd from the fdset
>> at the end of migration.
>> 
>> The current code only removes an fd from the fdset if the VM is
>> running. This causes a QMP call to "remove-fd" to not actually remove
>> the fd if the VM happens to be stopped.
>> 
>> While the fd would eventually be removed when monitor_fdset_cleanup()
>> is called again, the user request should be honored and the fd
>> actually removed. Calling remove-fd + query-fdset shows a recently
>> removed fd still present.
>> 
>> The runstate_is_running() check was introduced by commit ebe52b592d
>> ("monitor: Prevent removing fd from set during init"), which by the
>> shortlog indicates that they were trying to avoid removing an
>> yet-unduplicated fd too early.
>
> IMHO that should be reverted. The justification says
>
>   "If an fd is added to an fd set via the command line, and it is not
>referenced by another command line option (ie. -drive), then clean
>it up after QEMU initialization is complete"
>
> which I think is pretty weak. Why should QEMU forceably stop an app
> from passing in an FD to be used by a QMP command issued just after
> the VM starts running ?  While it could just use QMP to pass in the
> FD set, the mgmt app might have its own reason for wanting QEMU to
> own the passed FD from the very start of the process execve().

I don't think that's what that patch does. That description is
misleading. I read it as:

   "If an fd is added to an fd set via the command line, and it is not
referenced by another command line option (ie. -drive), then clean
it up ONLY after QEMU initialization is complete"
  ^

By the subject ("monitor: Prevent removing fd from set during init") and
the fact that this function is only called when the monitor connection
closes, I believe the idea was to *save* the fds until after the VM
starts running, i.e. some fd was being lost because
monitor_fdset_cleanup() was being called before the dup().

See my reply to Peter in this same patch (PATCH 1/9).

>
> Implicitly this cleanup is attempting to "fix" a bug where the mgmt
> app passes in an FD that it never needed. If any such bug were ever
> found, then the mgmt app should just be fixed to not pass it in. I
> don't think QEMU needs to be trying to fix mgmt app bugs.
>
> IOW, this commit is imposing an arbitrary & unnecessary usage policy
> on passed in FD sets, and as your commit explains has further
> unhelpful (& undocumented) side effects on the 'remove-fd' QMP command.
>
> Just revert it IMHO.
>
>> 
>> I don't see why an fd explicitly removed with qmp_remove_fd() should
>> be under runstate_is_running(). I'm assuming this was a mistake when
>> adding the parenthesis around the expression.
>> 
>> Move the runstate_is_running() check to apply only to the
>> QLIST_EMPTY(dup_fds) side of the expression and ignore it when
>> mon_fdset_fd->removed has been explicitly set.
>> 
>> Signed-off-by: Fabiano Rosas 
>> ---
>>  monitor/fds.c | 6 +++---
>>  1 file changed, 3 insertions(+), 3 deletions(-)
>> 
>> diff --git a/monitor/fds.c b/monitor/fds.c
>> index d86c2c674c..4ec3b7eea9 100644
>> --- a/monitor/fds.c
>> +++ b/monitor/fds.c
>> @@ -173,9 +173,9 @@ static void monitor_fdset_cleanup(MonFdset *mon_fdset)
>>  MonFdsetFd *mon_fdset_fd_next;
>>  
>>  QLIST_FOREACH_SAFE(mon_fdset_fd, _fdset->fds, next, 
>> mon_fdset_fd_next) {
>> -if ((mon_fdset_fd->removed ||
>> -(QLIST_EMPTY(_fdset->dup_fds) && mon_refcount == 0)) &&
>> -runstate_is_running()) {
>> +if (mon_fdset_fd->removed ||
>> +(QLIST_EMPTY(_fdset->dup_fds) && mon_refcount == 0 &&
>> + runstate_is_running())) {
>>  close(mon_fdset_fd->fd);
>>  g_free(mon_fdset_fd->opaque);
>>  QLIST_REMOVE(mon_fdset_fd, next);
>> -- 
>> 2.35.3
>> 
>
> With regards,
> Daniel



Re: [PATCH 1/9] monitor: Honor QMP request for fd removal immediately

2024-05-16 Thread Fabiano Rosas
Hi all, sorry to have been away from this thread for a while, I was
trying to catch up on my reviews queue.

Peter Xu  writes:

> On Fri, Apr 26, 2024 at 11:20:34AM -0300, Fabiano Rosas wrote:
>> We're enabling using the fdset interface to pass file descriptors for
>> use in the migration code. Since migrations can happen more than once
>> during the VMs lifetime, we need a way to remove an fd from the fdset
>> at the end of migration.
>> 
>> The current code only removes an fd from the fdset if the VM is
>> running. This causes a QMP call to "remove-fd" to not actually remove
>> the fd if the VM happens to be stopped.
>> 
>> While the fd would eventually be removed when monitor_fdset_cleanup()
>> is called again, the user request should be honored and the fd
>> actually removed. Calling remove-fd + query-fdset shows a recently
>> removed fd still present.
>> 
>> The runstate_is_running() check was introduced by commit ebe52b592d
>> ("monitor: Prevent removing fd from set during init"), which by the
>> shortlog indicates that they were trying to avoid removing an
>> yet-unduplicated fd too early.
>> 
>> I don't see why an fd explicitly removed with qmp_remove_fd() should
>> be under runstate_is_running(). I'm assuming this was a mistake when
>> adding the parenthesis around the expression.
>> 
>> Move the runstate_is_running() check to apply only to the
>> QLIST_EMPTY(dup_fds) side of the expression and ignore it when
>> mon_fdset_fd->removed has been explicitly set.
>
> I am confused on why the fdset removal is as complicated.  I'm also
> wondering here whether it's dropped because we checked against
> "mon_refcount == 0", and maybe monitor_fdset_cleanup() is simply called
> _before_ a monitor is created?  Why do we need such check on the first
> place?
>

It seems the intention was to reuse monitor_fdset_cleanup() to do
cleanup when all monitors connections are closed:

efb87c1697 ("monitor: Clean up fd sets on monitor disconnect")
Author: Corey Bryant 
Date:   Tue Aug 14 16:43:48 2012 -0400

monitor: Clean up fd sets on monitor disconnect

Fd sets are shared by all monitor connections.  Fd sets are considered
to be in use while at least one monitor is connected.  When the last
monitor disconnects, all fds that are members of an fd set with no
outstanding dup references are closed.  This prevents any fd leakage
associated with a client disconnect prior to using a passed fd.

Signed-off-by: Corey Bryant 
Signed-off-by: Kevin Wolf 

This could have been done differently at monitor_qmp_event():

case CHR_EVENT_CLOSED:
...
mon_refcount--;
if (mon_refcount == 0) {
monitor_fdsets_cleanup();
}

But maybe there was a concern about making sure the empty fdsets (last
block in monitor_fdset_cleanup) were removed at every refcount decrement
and not only when mon_refcount == 0 for some reason.

> I'm thinking one case where the only QMP monitor got (for some reason)
> disconnected, and reconnected again during VM running.  Won't current code
> already lead to unwanted removal of mostly all fds due to mon_refcount==0?

I think that's the case that the patch in question was trying to
avoid. If the last monitor connects and disconnects while fds have not
been dup'ed yet, the mon_fdset->dup_fds list will be empty and what you
say will happen. There seems to be an assumption that after the guest
starts running all fds that need to be dup'ed will already have been
dup'ed.

So I think we cannot simply revert the patch as Daniel suggested,
because that might regress the original block layer use-case if a
monitor open->close causes the removal of all the yet undup'ed fds[1].

For the migration use-case, the dup() only happens after the migrate
command has been issued, so the runstate_is_running() check doesn't help
us. But it also doesn't hurt. However, we're still exposed to a monitor
disconnection removing the undup'ed fds. So AFAICS, we either stop
calling monitor_fdset_cleanup() at monitor close or declare that this
issue is unlikely to occur (because it is) and leave a comment in the
code.

===
1- I ran a quick test:

connect() // monitor opened: refcnt: 1

{"execute": "add-fd", "arguments": {"fdset-id": 1}}
{"return": {"fd": 9, "fdset-id": 1}}

{"execute": "add-fd", "arguments": {"fdset-id": 1}}
{"return": {"fd": 21, "fdset-id": 1}}

close()   // monitor closed: refcnt: 0

connect() // monitor opened: refcnt: 1

{"execute": "migrate", "arguments": {"uri": "file:/dev/fdset/1,offset=4096"}}
{
"error": {
"class": "GenericError",
"desc": "Outgoing migration needs two fds in the fdset, got 0"
}
}

>
> I also am confused why ->removed flags is ever needed, and why we can't
> already remove the fdsets fds if found matching.
>

Prior to commit efb87c1697 ("monitor: Clean up fd sets on monitor
disconnect") we only called monitor_fdset_cleanup() from
qmp_remove_fd(), so we effectively 

Re: [PATCH v2 08/15] hw/riscv/riscv-iommu: add Address Translation Cache (IOATC)

2024-05-16 Thread Daniel Henrique Barboza




On 5/8/24 04:26, Frank Chang wrote:

Hi Daniel,

Daniel Henrique Barboza  於 2024年3月8日 週五 上午12:05寫道:


From: Tomasz Jeznach 

The RISC-V IOMMU spec predicts that the IOMMU can use translation caches
to hold entries from the DDT. This includes implementation for all cache
commands that are marked as 'not implemented'.

There are some artifacts included in the cache that predicts s-stage and
g-stage elements, although we don't support it yet. We'll introduce them
next.

Signed-off-by: Tomasz Jeznach 
Signed-off-by: Daniel Henrique Barboza 
---
  hw/riscv/riscv-iommu.c | 190 -
  hw/riscv/riscv-iommu.h |   2 +
  2 files changed, 188 insertions(+), 4 deletions(-)

diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
index df534b99b0..0b93146327 100644
--- a/hw/riscv/riscv-iommu.c
+++ b/hw/riscv/riscv-iommu.c
@@ -63,6 +63,16 @@ struct RISCVIOMMUContext {
  uint64_t msiptp;/* MSI redirection page table pointer */
  };

+/* Address translation cache entry */
+struct RISCVIOMMUEntry {
+uint64_t iova:44;   /* IOVA Page Number */
+uint64_t pscid:20;  /* Process Soft-Context identifier */
+uint64_t phys:44;   /* Physical Page Number */
+uint64_t gscid:16;  /* Guest Soft-Context identifier */
+uint64_t perm:2;/* IOMMU_RW flags */
+uint64_t __rfu:2;
+};
+
  /* IOMMU index for transactions without PASID specified. */
  #define RISCV_IOMMU_NOPASID 0

@@ -629,14 +639,127 @@ static AddressSpace *riscv_iommu_space(RISCVIOMMUState 
*s, uint32_t devid)
  return >iova_as;
  }

+/* Translation Object cache support */
+static gboolean __iot_equal(gconstpointer v1, gconstpointer v2)
+{
+RISCVIOMMUEntry *t1 = (RISCVIOMMUEntry *) v1;
+RISCVIOMMUEntry *t2 = (RISCVIOMMUEntry *) v2;
+return t1->gscid == t2->gscid && t1->pscid == t2->pscid &&
+   t1->iova == t2->iova;
+}
+
+static guint __iot_hash(gconstpointer v)
+{
+RISCVIOMMUEntry *t = (RISCVIOMMUEntry *) v;
+return (guint)t->iova;
+}
+
+/* GV: 1 PSCV: 1 AV: 1 */
+static void __iot_inval_pscid_iova(gpointer key, gpointer value, gpointer data)
+{
+RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
+if (iot->gscid == arg->gscid &&
+iot->pscid == arg->pscid &&
+iot->iova == arg->iova) {
+iot->perm = 0;


Maybe using IOMMU_NONE would be clearer?


Agree. I changed all relevant "iot->perm = 0" instances to "iot->perm = 
IOMMU_NONE".


Thanks,


Daniel



Otherwise,
Reviewed-by: Frank Chang 


+}
+}
+
+/* GV: 1 PSCV: 1 AV: 0 */
+static void __iot_inval_pscid(gpointer key, gpointer value, gpointer data)
+{
+RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
+if (iot->gscid == arg->gscid &&
+iot->pscid == arg->pscid) {
+iot->perm = 0;
+}
+}
+
+/* GV: 1 GVMA: 1 */
+static void __iot_inval_gscid_gpa(gpointer key, gpointer value, gpointer data)
+{
+RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
+if (iot->gscid == arg->gscid) {
+/* simplified cache, no GPA matching */
+iot->perm = 0;
+}
+}
+
+/* GV: 1 GVMA: 0 */
+static void __iot_inval_gscid(gpointer key, gpointer value, gpointer data)
+{
+RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
+if (iot->gscid == arg->gscid) {
+iot->perm = 0;
+}
+}
+
+/* GV: 0 */
+static void __iot_inval_all(gpointer key, gpointer value, gpointer data)
+{
+RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
+iot->perm = 0;
+}
+
+/* caller should keep ref-count for iot_cache object */
+static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
+GHashTable *iot_cache, hwaddr iova)
+{
+RISCVIOMMUEntry key = {
+.pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
+.iova  = PPN_DOWN(iova),
+};
+return g_hash_table_lookup(iot_cache, );
+}
+
+/* caller should keep ref-count for iot_cache object */
+static void riscv_iommu_iot_update(RISCVIOMMUState *s,
+GHashTable *iot_cache, RISCVIOMMUEntry *iot)
+{
+if (!s->iot_limit) {
+return;
+}
+
+if (g_hash_table_size(s->iot_cache) >= s->iot_limit) {
+iot_cache = g_hash_table_new_full(__iot_hash, __iot_equal,
+  g_free, NULL);
+g_hash_table_unref(qatomic_xchg(>iot_cache, iot_cache));
+}
+g_hash_table_add(iot_cache, iot);
+}
+
+static void riscv_iommu_iot_inval(RISCVIOMMUState *s, GHFunc func,
+uint32_t gscid, uint32_t pscid, hwaddr iova)
+{
+GHashTable *iot_cache;
+RISCVIOMMUEntry key = {
+.gscid = gscid,
+.pscid = pscid,
+.iova  = PPN_DOWN(iova),
+};
+
+iot_cache = g_hash_table_ref(s->iot_cache);
+g_hash_table_foreach(iot_cache, func, );
+

Re: [PATCH 00/41] target/sparc: Implement VIS4

2024-05-16 Thread Mark Cave-Ayland

On 15/05/2024 16:30, Richard Henderson wrote:


On 4/29/24 23:02, Richard Henderson wrote:

On 4/29/24 13:52, Mark Cave-Ayland wrote:
No objections here about the remainder of the series, other than that I don't have 
an easy/obvious way to test the new instructions...


I was thinking about adding support to RISU, but the gcc compile farm sparc 
machines have been down for ages, so no way to generate the reference traces.


Update: I have successfully ported RISU to Sparc64, Solaris and Linux.  There is a 
limitation in that I cannot find how to extract %gsr from the signal frame, which is 
unfortunate, but I can work around that for now.


I have added descriptions of VIS1 instructions to RISU, and it turns out we have 
failures relative to a Sparc M8.  I have not yet analyzed these failures, but it 
proves the effort was not wasted.  :-)


I'll clean up these patches and post them here when I next get some downtime.
 
r~


That's great news, thanks for the update. I've had confirmation that there is work 
underway to repair the SPARC hardware hosting Linux for the gcc buildfarm, so 
hopefully it will be back in service soon.



ATB,

Mark.




Re: [PATCH v6 6/8] xen: mapcache: Pass the ram_addr offset to xen_map_cache()

2024-05-16 Thread David Hildenbrand

On 16.05.24 17:48, Edgar E. Iglesias wrote:

From: "Edgar E. Iglesias" 

Pass the ram_addr offset to xen_map_cache.
This is in preparation for adding grant mappings that need
to compute the address within the RAMBlock.

No functional changes.

Signed-off-by: Edgar E. Iglesias 
---


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [PATCH v6 5/8] softmmu: Replace check for RAMBlock offset 0 with xen_mr_is_memory

2024-05-16 Thread David Hildenbrand

On 16.05.24 17:48, Edgar E. Iglesias wrote:

From: "Edgar E. Iglesias" 

For xen, when checking for the first RAM (xen_memory), use
xen_mr_is_memory() rather than checking for a RAMBlock with
offset 0.

All Xen machines create xen_memory first so this has no
functional change for existing machines.

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
---


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [PATCH v6 4/8] softmmu: xen: Always pass offset + addr to xen_map_cache

2024-05-16 Thread David Hildenbrand

On 16.05.24 17:48, Edgar E. Iglesias wrote:

From: "Edgar E. Iglesias" 

Always pass address with offset to xen_map_cache().
This is in preparation for support for grant mappings.

Since this is within a block that checks for offset == 0,
this has no functional changes.

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
---


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb




Re: [RFC/PATCH v2 03/12] hw/arm/virt: confidential guest support

2024-05-16 Thread Srivatsa Vaddagiri
* Daniel P. Berrangé  [2024-05-16 16:04:24]:

> On Thu, May 16, 2024 at 02:33:47PM +, Srivatsa Vaddagiri wrote:
> > This adds support to launch hypervisor-assisted confidential guests,
> > where guest's memory is protected from a potentially untrusted host.
> > Hypervisor can setup host's page-tables so that it loses access to guest
> > memory.
> > 
> > Since some guest drivers may need to communicate data with their host
> > counterparts via shared memory, optionally allow setting aside some part
> > of the confidential guest's memory as "shared". The size of this shared
> > memory is specified via the optional "swiotlb-size" parameter.
> > 
> > -machine virt,confidential-guest-support=prot0 \
> > -object arm-confidential-guest,id=prot0,swiotlb-size=16777216
> > 
> > The size of this shared memory is indicated to the guest in size/reg
> > property of device-tree node "/reserved-memory/restricted_dma_reserved".
> > A memory-region property is added to device-tree node representing
> > virtio-pcie hub, so that all DMA allocations requested by guest's 
> > virtio-pcie
> > device drivers are satisfied from the shared swiotlb region.
> 
> For reference, there is another series proposing confidential guest
> support for the 'virt' machine on AArch64 with KVM
> 
>  https://lists.nongnu.org/archive/html/qemu-devel/2024-04/msg02742.html
> 
> I've no idea how closely your impl matches the KVM proposed impl. ie
> whether we need 2 distinct "ConfidentialGuest" subclasses for KVM vs
> Gunyah, or whether 1 can cope with both.  If we do need 2 distinct
> subclasses for each hypervisor, then calling this Gunyah targetted
> object 'arm-confidential-guest' is too broad of an name.

Thanks for that pointer! Let me study the proposed KVM implementation and 
see how we can consolidate support for KVM and Gunyah hypervisors.

- vatsa



Re: [PATCH v2 09/15] hw/riscv/riscv-iommu: add s-stage and g-stage support

2024-05-16 Thread Daniel Henrique Barboza




On 5/10/24 08:14, Andrew Jones wrote:

On Fri, May 10, 2024 at 06:36:51PM GMT, Frank Chang wrote:
...

  static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
-IOMMUTLBEntry *iotlb)
+IOMMUTLBEntry *iotlb, bool gpa)
  {
+dma_addr_t addr, base;
+uint64_t satp, gatp, pte;
+bool en_s, en_g;
+struct {
+unsigned char step;
+unsigned char levels;
+unsigned char ptidxbits;
+unsigned char ptesize;
+} sc[2];
+/* Translation stage phase */
+enum {
+S_STAGE = 0,
+G_STAGE = 1,
+} pass;
+
+satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
+gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
+
+en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;
+en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
+
  /* Early check for MSI address match when IOVA == GPA */
-if (iotlb->perm & IOMMU_WO &&
+if (!en_s && (iotlb->perm & IOMMU_WO) &&


I'm wondering do we need to check "en_s" for MSI writes?

IOMMU spec Section 2.3.3. Process to translate addresses of MSIs says:
"Determine if the address A is an access to a virtual interrupt file
as specified in Section 2.1.3.6."

and Section 2.1.3.6 says:

"An incoming memory access made by a device is recognized as
an access to a virtual interrupt file if the destination guest physical page
matches the supplied address pattern in all bit positions that are zeros
in the supplied address mask. In detail, a memory access to
guest physical address A is recognized as an access to a virtual
interrupt file’s
memory-mapped page if:
(A >> 12) & ~msi_addr_mask = (msi_addr_pattern & ~msi_addr_mask)"

Is checking the address pattern sufficient enough to determine
the address is an MSI to a virtual interrupt file?



I think so. In fact, I've removed that en_s check on our internal build in
order to get things working for my irqbypass work, as we can do device
assignment with VFIO with only S-stage enabled.


The following code will be fixed up here:

 static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
-IOMMUTLBEntry *iotlb, bool gpa)
+IOMMUTLBEntry *iotlb)
 {
 dma_addr_t addr, base;
 uint64_t satp, gatp, pte;
@@ -238,11 +237,11 @@ static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, 
RISCVIOMMUContext *ctx,
 satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
 gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
 
-en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE && !gpa;

+en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
 en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
 
 /* Early check for MSI address match when IOVA == GPA */

-if (!en_s && (iotlb->perm & IOMMU_WO) &&
+if ((iotlb->perm & IOMMU_WO) &&
 riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
 iotlb->target_as = >trap_as;
 iotlb->translated_addr = iotlb->iova;
@@ -1203,7 +1202,7 @@ static int riscv_iommu_translate(RISCVIOMMUState *s, 
RISCVIOMMUContext *ctx,
 }
 
 /* Translate using device directory / page table information. */

-fault = riscv_iommu_spa_fetch(s, ctx, iotlb, false);
+fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
 
 if (!fault && iotlb->target_as == >trap_as) {

 /* Do not cache trapped MSI translations */

'gpa' is eliminated since it was only being used as 'false' by the only
caller of riscv_iommu_spa_fetch(). The boolean was used only to calculate
en_s as "&& !gpa", so it's always 'true' and had no impact in en_s. My
understand here is that 'gpa' was a prototype of the first implementation
that got left behind and ended up not being used.

As for the MSI check, we won't skip translation if satp is bare (!en_s) because
we might be using just stage2 for a guest, thus en_s is removed from the
conditional. As Frank said, this change also complies with the spec since we 
don't
need to check satp to determine if the address is an MSI to a virtual interrupt
file.

And, last but not the least, this change doesn't break my KVM VFIO passthrough
test case :) I'll document more about the test case I'm using in the v3 cover
letter.


Thanks,

Daniel




Thanks,
drew




Re: [PATCH] gitlab-ci: Replace Docker with Kaniko

2024-05-16 Thread Daniel P . Berrangé
On Thu, May 16, 2024 at 05:52:43PM +0100, Camilla Conte wrote:
> Enables caching from the qemu-project repository.
> 
> Uses a dedicated "$NAME-cache" tag for caching, to address limitations.
> See issue "when using --cache=true, kaniko fail to push cache layer [...]":
> https://github.com/GoogleContainerTools/kaniko/issues/1459

After investigating, this is a result of a different design approach
for caching in kaniko.

In docker, it can leverage any existing image as a cache source,
reusing individual layers that were present. IOW, there's no
difference between a cache and a final image, they're one and the
same thing

In kaniko, the cache is a distinct object type. IIUC, it is not
populated with the individual layers, instead it has a custom
format for storing the cached content. Therefore the concept of
storing the cache at the same location as the final image, is
completely inappropriate - you can't store two completely different
kinds of content at the same place.

That is also why you can't just "git pull" the fetch the cache
image(s) beforehand, and also why it doesn't look like you can
use multiple cache sources with kaniko.

None of this is inherantly a bad thing. except when it comes
to data storage. By using Kaniko we would, at minimum, doubling
the amount of data storage we consume in the gitlab registry.

This is a potentially significant concern because GitLab does
technically have a limited storage quota, even with our free
OSS plan  subscription.

Due to technical limitations, they've never been able to
actually enforce it thus far, but one day they probably will.
At which point we're doomed, because even with our current
Docker-in-Docker setup I believe we're exceeding our quota.
Thus the idea of doubling our container storage usage is pretty
unappealing.

We can avoid that by running without cache, but that has the
cost of increasing the job running time, since all containers
would be rebuilt on every pipeline. This will burn through
our Azure compute allowance more quickly (or our GitLab CI
credits if we had to switch away from Azure).

> Does not specify a context since no Dockerfile is using COPY or ADD 
> instructions.
> 
> Does not enable reproducible builds as
> that results in builds failing with an out of memory error.
> See issue "Using --reproducible loads entire image into memory":
> https://github.com/GoogleContainerTools/kaniko/issues/862
> 
> Previous attempts, for the records:
>   - Alex Bennée: 
> https://lore.kernel.org/qemu-devel/20230330101141.30199-12-alex.ben...@linaro.org/
>   - Camilla Conte (me): 
> https://lore.kernel.org/qemu-devel/20230531150824.32349-6-cco...@redhat.com/
> 
> Signed-off-by: Camilla Conte 
> ---
>  .gitlab-ci.d/container-template.yml | 25 +++--
>  1 file changed, 11 insertions(+), 14 deletions(-)
> 
> diff --git a/.gitlab-ci.d/container-template.yml 
> b/.gitlab-ci.d/container-template.yml
> index 4eec72f383..066f253dd5 100644
> --- a/.gitlab-ci.d/container-template.yml
> +++ b/.gitlab-ci.d/container-template.yml
> @@ -1,21 +1,18 @@
>  .container_job_template:
>extends: .base_job_template
> -  image: docker:latest
>stage: containers
> -  services:
> -- docker:dind
> +  image:
> +name: gcr.io/kaniko-project/executor:debug
> +entrypoint: [""]
> +  variables:
> +DOCKERFILE: "$CI_PROJECT_DIR/tests/docker/dockerfiles/$NAME.docker"
> +CACHE_REPO: "$CI_REGISTRY/qemu-project/qemu/qemu/$NAME-cache"
>before_script:
>  - export TAG="$CI_REGISTRY_IMAGE/qemu/$NAME:$QEMU_CI_CONTAINER_TAG"
> -# Always ':latest' because we always use upstream as a common cache 
> source
> -- export COMMON_TAG="$CI_REGISTRY/qemu-project/qemu/qemu/$NAME:latest"
> -- docker login $CI_REGISTRY -u "$CI_REGISTRY_USER" -p 
> "$CI_REGISTRY_PASSWORD"
> -- until docker info; do sleep 1; done
>script:
>  - echo "TAG:$TAG"
> -- echo "COMMON_TAG:$COMMON_TAG"
> -- docker build --tag "$TAG" --cache-from "$TAG" --cache-from 
> "$COMMON_TAG"
> -  --build-arg BUILDKIT_INLINE_CACHE=1
> -  -f "tests/docker/dockerfiles/$NAME.docker" "."
> -- docker push "$TAG"
> -  after_script:
> -- docker logout
> +- /kaniko/executor
> +  --dockerfile "$DOCKERFILE"
> +  --destination "$TAG"
> +  --cache=true
> +  --cache-repo="$CACHE_REPO"

I'm surprised there is no need to set provide the user/password
login credentials for the registry. None the less  I tested this
and it succeeed.

I guess gitlab somehow has some magic authorization granted to any CI
job, that avoids the need for a manual login ? Wonder why we needed
the 'docker login' step though ? Perhaps because D-in-D results in
using an externally running docker daemon which didn't inherit
credentials from the job environment ?

Caching of course fails when I'm running jobs in my fork. IOW, if we
change container content in a fork and want to test it, it will be
doing a full build from scratch every time. This likely isn't the 

Re: [PATCH 08/20] qapi/parser: differentiate intro and outro paragraphs

2024-05-16 Thread John Snow
On Thu, May 16, 2024 at 11:06 AM John Snow  wrote:

>
>
> On Thu, May 16, 2024, 5:34 AM Markus Armbruster  wrote:
>
>> John Snow  writes:
>>
>> > Add a semantic tag to paragraphs that appear *before* tagged
>> > sections/members/features and those that appear after. This will control
>> > how they are inlined when doc sections are merged and flattened.
>>
>> This future use is not obvious to me now.  I guess the effective way to
>> help me see it is actual patches, which will come in due time.
>>
>
> Head recursion and tail recursion, respectively :)
>
> * intro
> * inherited intro
> * members [ancestor-descendent]
> * features [ancestor-descendent]
> * inherited outro
> * outro
>
> Child gets the first and final words. Inherited stuff goes in the sandwich
> fillings.
>
> It feels like a simple rule that's easy to internalize. As a bonus, you
> can explain it by analogy to Americans as a burger, which is the only
> metaphor we understand.
>
>
>> > Signed-off-by: John Snow 
>> > ---
>> >  scripts/qapi/parser.py | 22 +-
>> >  1 file changed, 17 insertions(+), 5 deletions(-)
>> >
>> > diff --git a/scripts/qapi/parser.py b/scripts/qapi/parser.py
>> > index cf4cbca1c1f..b1794f71e12 100644
>> > --- a/scripts/qapi/parser.py
>> > +++ b/scripts/qapi/parser.py
>> > @@ -503,6 +503,10 @@ def get_doc(self) -> 'QAPIDoc':
>> >  self.accept(False)
>> >  line = self.get_doc_line()
>> >  no_more_args = False
>> > +# Paragraphs before members/features/tagged are "intro"
>> paragraphs.
>> > +# Any appearing subsequently are "outro" paragraphs.
>> > +# This is only semantic metadata for the doc generator.
>>
>> Not sure about the last sentence.  Isn't it true for almost everything
>> around here?
>>
>
> I guess I was trying to say "There's no real difference between the two
> mechanically, it's purely based on where it appears in the doc block, which
> offers only a heuristic for its semantic value- introductory statements or
> additional detail."
>
> In my mind: the other "kind" values have some more mechanical difference
> to them, but intro/outro don't.
>
>
>> Also, long line.
>>
>> > +intro = True
>> >
>> >  while line is not None:
>> >  # Blank lines
>> > @@ -532,6 +536,7 @@ def get_doc(self) -> 'QAPIDoc':
>> >  raise QAPIParseError(
>> >  self, 'feature descriptions expected')
>> >  no_more_args = True
>> > +intro = False
>>
>> After feature descriptions.
>>
>> >  elif match := self._match_at_name_colon(line):
>> >  # description
>> >  if no_more_args:
>> > @@ -547,6 +552,7 @@ def get_doc(self) -> 'QAPIDoc':
>> >  doc.append_line(text)
>> >  line = self.get_doc_indented(doc)
>> >  no_more_args = True
>> > +intro = False
>>
>> Or after member descriptions.
>>
>> >  elif match := re.match(
>> >
>> r'(Returns|Errors|Since|Notes?|Examples?|TODO): *',
>> >  line):
>> > @@ -557,13 +563,14 @@ def get_doc(self) -> 'QAPIDoc':
>> >  doc.append_line(text)
>> >  line = self.get_doc_indented(doc)
>> >  no_more_args = True
>> > +intro = False
>>
>> Or after the first tagged section.
>>
>> Okay, it does what it says on the tin.
>>
>> >  elif line.startswith('='):
>> >  raise QAPIParseError(
>> >  self,
>> >  "unexpected '=' markup in definition
>> documentation")
>> >  else:
>> >  # tag-less paragraph
>> > -doc.ensure_untagged_section(self.info)
>> > +doc.ensure_untagged_section(self.info, intro)
>> >  doc.append_line(line)
>> >  line = self.get_doc_paragraph(doc)
>> >  else:
>> > @@ -617,7 +624,7 @@ def __init__(
>> >  self,
>> >  info: QAPISourceInfo,
>> >  tag: Optional[str] = None,
>> > -kind: str = 'paragraph',
>> > +kind: str = 'intro-paragraph',
>>
>> The question "why is this optional?" crossed my mind when reviewing the
>> previous patch.  I left it unasked, because I felt challenging the
>> overlap between @kind and @tag was more useful.  However, the new
>> default value 'intro-paragraph' feels more arbitrary to me than the old
>> one 'paragraph', and that makes the question pop right back into my
>> mind.
>>
>
> Just "don't break API" habit, nothing more. I can make it mandatory.
>
>
>> Unless I'm mistaken, all calls but one @tag and @kind.  Making that one
>> pass it too feels simpler to me.
>>
>> Moot if we fuse @tag and @kind, of course.
>
>
>> >  ):
>> > 

Re: CPR/liveupdate: test results using prior bug fix

2024-05-16 Thread Steven Sistare

On 5/16/2024 1:24 PM, Michael Galaxy wrote:

On 5/14/24 08:54, Michael Tokarev wrote:

On 5/14/24 16:39, Michael Galaxy wrote:

Steve,

OK, so it does not look like this bugfix you wrote was included in 8.2.4 
(which was released yesterday). Unfortunately, that means that anyone using 
CPR in that release will still (eventually) encounter the bug like I did.


8.2.4 is basically a "bugfix" release for 8.2.3 which I somewhat
screwed up (in a minor way), plus a few currently (at the time)
queued up changes.   8.2.3 was a big release though.

I would recommend that y'all consider cherry-picking, perhaps, the relevant 
commits for a possible 8.2.5 ?


Please Cc changes which are relevant for -stable to, well,
qemu-sta...@nongnu.org :)

Which changes needs to be picked up?

Steve, can you comment here, please? At a minimum, we have this one: [PULL 
20/25] migration: stop vm for cpr


But that pull came with a handful of other changes that are also not in QEMU v8, 
so I suspect I'm missing some other important changes that might be important 
for a stable release?


- Michael


Hi Michael, I sent the full list of commits to this distribution yesterday, and
I see it in my Sent email folder.  Copying verbatim:


Michael Galaxy, I'm afraid you are out of luck with respect to qemu 8.2.
It has some of the cpr reboot commits, but is missing the following:

87a2848 migration: massage cpr-reboot documentation
cbdafc1 migration: options incompatible with cpr
ce5db1c migration: update cpr-reboot description
9867d4d migration: stop vm for cpr
4af667f migration: notifier error checking
bf78a04 migration: refactor migrate_fd_connect failures
6835f5a migration: per-mode notifiers
5663dd3 migration: MigrationNotifyFunc
c763a23e migration: remove postcopy_after_devices
9d9babf migration: MigrationEvent for notifiers
3e77573 migration: convert to NotifierWithReturn
d91f33c migration: remove error from notifier data
be19d83 notify: pass error to notifier with return
b12635f migration: fix coverity migrate_mode finding
2b58a8b tests/qtest: postcopy migration with suspend
b1fdd21 tests/qtest: precopy migration with suspend
5014478 tests/qtest: option to suspend during migration
f064975 tests/qtest: migration events
49a5020 migration: preserve suspended for bg_migration
58b1057 migration: preserve suspended for snapshot
b4e9ddc migration: preserve suspended runstate
d3c86c99 migration: propagate suspended runstate
9ff5e79 cpus: vm_resume
0f1db06 cpus: check running not RUN_STATE_RUNNING
b9ae473 cpus: stop vm in suspended runstate
f06f316 cpus: vm_was_suspended

All of those landed in qemu 9.0.
---

- Steve



Re: [PATCH 00/20] qapi: new sphinx qapi domain pre-requisites

2024-05-16 Thread Stefan Hajnoczi
On Tue, May 14, 2024 at 05:57:19PM -0400, John Snow wrote:
> Howdy - this patch series is the first batch of patches meant to prepare
> the QAPI documentation for a new Sphinx module that adds
> cross-references, an index, improved inlining, elision of types unseen
> on the wire, and other goodies.
> 
> This series addresses just existing code and documentation that needs to
> be changed and doesn't introduce anything new just yet - except the rST
> conversion of Notes and Examples sections, which DOES impact the
> existing QAPI documentation generation.
> 
> If you're CC'd on this series, it's *probably* because I've adjusted
> some QAPI documentation that you're the maintainer of - In most cases,
> these changes are purely mechanical (converting QAPI sections into pure
> rST) and probably nothing too interesting. In a small handful of cases
> (patches 15-17), I've been a bit more invasive and you may want to take
> a quick peek.
> 
> Overview:
> 
> Patches 1-3: linter/typing cleanup
> Patches 4-12: QAPI generator fixes/miscellany
> Patch 13: qapidoc.py fix (to prepare for rST conversion)
> Patches 14-20: QAPI documentation modifications, rST conversion
> 
> Sorry,
> --js
> 
> John Snow (20):
>   [DO-NOT-MERGE]: Add some ad-hoc linting helpers.
>   qapi: linter fixups
>   docs/qapidoc: delint a tiny portion of the module
>   qapi/parser: preserve indentation in QAPIDoc sections
>   qapi/parser: adjust info location for doc body section
>   qapi/parser: fix comment parsing immediately following a doc block
>   qapi/parser: add semantic 'kind' parameter to QAPIDoc.Section
>   qapi/parser: differentiate intro and outro paragraphs
>   qapi/parser: add undocumented stub members to all_sections
>   qapi/schema: add __iter__ method to QAPISchemaVariants
>   qapi/schema: add doc_visible property to QAPISchemaDefinition
>   qapi/source: allow multi-line QAPISourceInfo advancing
>   docs/qapidoc: fix nested parsing under untagged sections
>   qapi: fix non-compliant JSON examples
>   qapi: remove developer factoring comments from QAPI doc blocks
>   qapi: rewrite StatsFilter comment
>   qapi: rewrite BlockExportOptions doc block
>   qapi: ensure all errors sections are uniformly typset
>   qapi: convert "Note" sections to plain rST
>   qapi: convert "Example" sections to rST
> 
>  docs/sphinx/qapidoc.py|  62 --
>  qapi/acpi.json|   6 +-
>  qapi/audio.json   |   5 +-
>  qapi/block-core.json  | 195 ++
>  qapi/block-export.json|  16 +-
>  qapi/block.json   |  62 +++---
>  qapi/char.json|  53 +++--
>  qapi/control.json |  32 +--
>  qapi/crypto.json  |  33 ++-
>  qapi/dump.json|  14 +-
>  qapi/introspect.json  |   6 +-
>  qapi/machine-target.json  |  29 +--
>  qapi/machine.json | 138 +++--
>  qapi/migration.json   | 159 +-
>  qapi/misc-target.json |  33 ++-
>  qapi/misc.json| 139 +++--
>  qapi/net.json |  49 +++--
>  qapi/pci.json |  11 +-
>  qapi/qapi-schema.json |   6 +-
>  qapi/qdev.json|  45 ++--
>  qapi/qom.json |  69 +++
>  qapi/replay.json  |  12 +-
>  qapi/rocker.json  |  30 +--
>  qapi/run-state.json   |  63 +++---
>  qapi/sockets.json |  10 +-
>  qapi/stats.json   |  30 ++-
>  qapi/tpm.json |   9 +-
>  qapi/trace.json   |   6 +-
>  qapi/transaction.json |  13 +-
>  qapi/ui.json  | 107 +-
>  qapi/virtio.json  |  50 ++---
>  qapi/yank.json|   6 +-
>  qga/qapi-schema.json  |  48 ++---
>  scripts/qapi-lint.sh  |  51 +
>  scripts/qapi/Makefile |   5 +
>  scripts/qapi/introspect.py|  12 +-
>  scripts/qapi/parser.py| 104 --
>  scripts/qapi/schema.py|  54 -
>  scripts/qapi/source.py|   4 +-
>  scripts/qapi/types.py |   4 +-
>  scripts/qapi/visit.py |   9 +-
>  tests/qapi-schema/doc-empty-section.err   |   2 +-
>  tests/qapi-schema/doc-empty-section.json  |   2 +-
>  tests/qapi-schema/doc-good.json   |  18 +-
>  

Re: [PATCH v2 1/3] docs: introduce dedicated page about code provenance / sign-off

2024-05-16 Thread Peter Maydell
On Thu, 16 May 2024 at 18:34, Michael S. Tsirkin  wrote:
>
> On Thu, May 16, 2024 at 06:29:39PM +0100, Peter Maydell wrote:
> > On Thu, 16 May 2024 at 17:22, Daniel P. Berrangé  
> > wrote:
> > >
> > > Currently we have a short paragraph saying that patches must include
> > > a Signed-off-by line, and merely link to the kernel documentation.
> > > The linked kernel docs have a lot of content beyond the part about
> > sign-off and thus are misleading/distracting to QEMU contributors.
> >
> > Thanks for this -- I've felt for ages that it was a bit awkward
> > that we didn't have a good place to link people to for the fuller
> > explanation of this.
> >
> > > This introduces a dedicated 'code-provenance' page in QEMU talking
> > > about why we require sign-off, explaining the other tags we commonly
> > > use, and what to do in some edge cases.
> >
> > The version of the kernel SubmittingPatches we used to link to
> > includes the text "sorry, no pseudonyms or anonymous contributions".
> > This new documentation doesn't say anything either way about
> > our approach to pseudonyms. I think we should probably say
> > something, but I don't know if we have an in-practice consensus
> > there, so maybe we should approach that as a separate change on
> > top of this patch.
>
>
> Well given we referred to kernel previously then I guess that's
> the consensus, no?

AIUI the kernel devs have changed their point of view on the
pseudonym question, so it's a question of whether we were
deliberately referring to that specific revision of the kernel's
practice because we agreed with it or just by chance...

https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=d4563201f33a022fc0353033d9dfeb1606a88330

is where the kernel changed to saying merely "no anonymous
contributions", dropping the 'pseudonyms' part.

-- PMM



Re: [PATCH v2 0/3] docs: define policy forbidding use of "AI" / LLM code generators

2024-05-16 Thread Michael S. Tsirkin
On Thu, May 16, 2024 at 06:34:13PM +0100, Peter Maydell wrote:
> On Thu, 16 May 2024 at 18:20, Michael S. Tsirkin  wrote:
> >
> > On Thu, May 16, 2024 at 05:22:27PM +0100, Daniel P. Berrangé wrote:
> > > AFAICT at its current state of (im)maturity the question of licensing
> > > of AI code generator output does not have a broadly accepted / settled
> > > legal position. This is an inherent bias/self-interest from the vendors
> > > promoting their usage, who tend to minimize/dismiss the legal questions.
> > > >From my POV, this puts such tools in a position of elevated legal risk.
> > >
> > > Given the fuzziness over the legal position of generated code from
> > > such tools, I don't consider it credible (today) for a contributor
> > > to assert compliance with the DCO terms (b) or (c) (which is a stated
> > > pre-requisite for QEMU accepting patches) when a patch includes (or is
> > > derived from) AI generated code.
> > >
> > > By implication, I think that QEMU must (for now) explicitly decline
> > > to (knowingly) accept AI generated code.
> > >
> > > Perhaps a few years down the line the legal uncertainty will have
> > > reduced and we can re-evaluate this policy.
> 
> > At this junction, the code generated by these tools is of such
> > quality that I really won't expect it to pass even cursory code
> > review.
> 
> I disagree, I think that in at least some cases they can
> produce code that would pass our quality bar, especially with
> human supervision and editing after the fact. If the problem
> was merely "LLMs tend to produce lousy output" then we wouldn't
> need to write anything new -- we already have a process for
> dealing with bad patches, which is to say we do code review and
> suggest changes or simply reject the patches. What we *don't* have
> any process to handle is the legal uncertainties that Dan outlines
> above.
> 
> -- PMM


Maybe I'm bad at prompting ;)

-- 
MST




Re: [PATCH v2 0/3] docs: define policy forbidding use of "AI" / LLM code generators

2024-05-16 Thread Peter Maydell
On Thu, 16 May 2024 at 18:20, Michael S. Tsirkin  wrote:
>
> On Thu, May 16, 2024 at 05:22:27PM +0100, Daniel P. Berrangé wrote:
> > AFAICT at its current state of (im)maturity the question of licensing
> > of AI code generator output does not have a broadly accepted / settled
> > legal position. This is an inherent bias/self-interest from the vendors
> > promoting their usage, who tend to minimize/dismiss the legal questions.
> > >From my POV, this puts such tools in a position of elevated legal risk.
> >
> > Given the fuzziness over the legal position of generated code from
> > such tools, I don't consider it credible (today) for a contributor
> > to assert compliance with the DCO terms (b) or (c) (which is a stated
> > pre-requisite for QEMU accepting patches) when a patch includes (or is
> > derived from) AI generated code.
> >
> > By implication, I think that QEMU must (for now) explicitly decline
> > to (knowingly) accept AI generated code.
> >
> > Perhaps a few years down the line the legal uncertainty will have
> > reduced and we can re-evaluate this policy.

> At this junction, the code generated by these tools is of such
> quality that I really won't expect it to pass even cursory code
> review.

I disagree, I think that in at least some cases they can
produce code that would pass our quality bar, especially with
human supervision and editing after the fact. If the problem
was merely "LLMs tend to produce lousy output" then we wouldn't
need to write anything new -- we already have a process for
dealing with bad patches, which is to say we do code review and
suggest changes or simply reject the patches. What we *don't* have
any process to handle is the legal uncertainties that Dan outlines
above.

-- PMM



Re: [PATCH v2 1/3] docs: introduce dedicated page about code provenance / sign-off

2024-05-16 Thread Michael S. Tsirkin
On Thu, May 16, 2024 at 06:29:39PM +0100, Peter Maydell wrote:
> On Thu, 16 May 2024 at 17:22, Daniel P. Berrangé  wrote:
> >
> > Currently we have a short paragraph saying that patches must include
> > a Signed-off-by line, and merely link to the kernel documentation.
> > The linked kernel docs have a lot of content beyond the part about
> > sign-off and thus are misleading/distracting to QEMU contributors.
> 
> Thanks for this -- I've felt for ages that it was a bit awkward
> that we didn't have a good place to link people to for the fuller
> explanation of this.
> 
> > This introduces a dedicated 'code-provenance' page in QEMU talking
> > about why we require sign-off, explaining the other tags we commonly
> > use, and what to do in some edge cases.
> 
> The version of the kernel SubmittingPatches we used to link to
> includes the text "sorry, no pseudonyms or anonymous contributions".
> This new documentation doesn't say anything either way about
> our approach to pseudonyms. I think we should probably say
> something, but I don't know if we have an in-practice consensus
> there, so maybe we should approach that as a separate change on
> top of this patch.


Well given we referred to kernel previously then I guess that's
the consensus, no?


> So for this patch:
> 
> Reviewed-by: Peter Maydell 
> 
> thanks
> -- PMM




Re: [PATCH v2 1/3] docs: introduce dedicated page about code provenance / sign-off

2024-05-16 Thread Michael S. Tsirkin
On Thu, May 16, 2024 at 05:22:28PM +0100, Daniel P. Berrangé wrote:
> Currently we have a short paragraph saying that patches must include
> a Signed-off-by line, and merely link to the kernel documentation.
> The linked kernel docs have a lot of content beyond the part about
> sign-off and thus are misleading/distracting to QEMU contributors.
> 
> This introduces a dedicated 'code-provenance' page in QEMU talking
> about why we require sign-off, explaining the other tags we commonly
> use, and what to do in some edge cases.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  docs/devel/code-provenance.rst| 212 ++
>  docs/devel/index-process.rst  |   1 +
>  docs/devel/submitting-a-patch.rst |  19 +--
>  3 files changed, 215 insertions(+), 17 deletions(-)
>  create mode 100644 docs/devel/code-provenance.rst
> 
> diff --git a/docs/devel/code-provenance.rst b/docs/devel/code-provenance.rst
> new file mode 100644
> index 00..7c42fae571
> --- /dev/null
> +++ b/docs/devel/code-provenance.rst
> @@ -0,0 +1,212 @@
> +.. _code-provenance:
> +
> +Code provenance
> +===
> +
> +Certifying patch submissions
> +
> +
> +The QEMU community **mandates** all contributors to certify provenance of
> +patch submissions they make to the project. To put it another way,
> +contributors must indicate that they are legally permitted to contribute to
> +the project.
> +
> +Certification is achieved with a low overhead by adding a single line to the
> +bottom of every git commit::
> +
> +   Signed-off-by: YOUR NAME 
> +
> +The addition of this line asserts that the author of the patch is 
> contributing
> +in accordance with the clauses specified in the
> +`Developer's Certificate of Origin `__:

Why are you linking to this one?
It's slightly different from kernel, with copyright and prohibition to change 
it.

there's also a bit more text in the kernel, e.g. the rule against
anonymous contributions.



> +.. _dco:
> +
> +::
> +  Developer's Certificate of Origin 1.1
> +  By making a contribution to this project, I certify that:
> +
> +  (a) The contribution was created in whole or in part by me and I
> +  have the right to submit it under the open source license
> +  indicated in the file; or
> +
> +  (b) The contribution is based upon previous work that, to the best
> +  of my knowledge, is covered under an appropriate open source
> +  license and I have the right under that license to submit that
> +  work with modifications, whether created in whole or in part
> +  by me, under the same open source license (unless I am
> +  permitted to submit under a different license), as indicated
> +  in the file; or
> +
> +  (c) The contribution was provided directly to me by some other
> +  person who certified (a), (b) or (c) and I have not modified
> +  it.
> +
> +  (d) I understand and agree that this project and the contribution
> +  are public and that a record of the contribution (including all
> +  personal information I submit with it, including my sign-off) is
> +  maintained indefinitely and may be redistributed consistent with
> +  this project or the open source license(s) involved.
> +
> +It is generally expected that the name and email addresses used in one of the
> +``Signed-off-by`` lines, matches that of the git commit ``Author`` field.
> +
> +If the person sending the mail is not one of the patch authors, they are none
> +the less expected to add their own ``Signed-off-by`` to comply with the DCO
> +clause (c).
> +
> +Multiple authorship
> +~~~
> +
> +It is not uncommon for a patch to have contributions from multiple authors. 
> In
> +this scenario, git commits will usually be expected to have a 
> ``Signed-off-by``
> +line for each contributor involved in creation of the patch. Some edge cases:
> +
> +  * The non-primary author's contributions were so trivial that they can be
> +considered not subject to copyright. In this case the secondary authors
> +need not include a ``Signed-off-by``.
> +
> +This case most commonly applies where QEMU reviewers give short snippets
> +of code as suggested fixes to a patch. The reviewers don't need to have
> +their own ``Signed-off-by`` added unless their code suggestion was
> +unusually large, but it is common to add ``Suggested-by`` as a credit
> +for non-trivial code.
> +
> +  * Both contributors work for the same employer and the employer requires
> +copyright assignment.
> +
> +It can be said that in this case a ``Signed-off-by`` is indicating that
> +the person has permission to contribute from their employer who is the
> +copyright holder. It is none the less still preferable to include a
> +``Signed-off-by`` for each contributor, as in some countries employees 
> are
> +not able to assign copyright to their employer, and it also covers any
> +   

Re: [PATCH 06/20] qapi/parser: fix comment parsing immediately following a doc block

2024-05-16 Thread John Snow
On Thu, May 16, 2024 at 2:01 AM Markus Armbruster  wrote:

> John Snow  writes:
>
> > If a comment immediately follows a doc block, the parser doesn't ignore
> > that token appropriately. Fix that.
>
> Reproducer?
>
> >
> > Signed-off-by: John Snow 
> > ---
> >  scripts/qapi/parser.py | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/scripts/qapi/parser.py b/scripts/qapi/parser.py
> > index 41b9319e5cb..161768b8b96 100644
> > --- a/scripts/qapi/parser.py
> > +++ b/scripts/qapi/parser.py
> > @@ -587,7 +587,7 @@ def get_doc(self) -> 'QAPIDoc':
> >  line = self.get_doc_line()
> >  first = False
> >
> > -self.accept(False)
> > +self.accept()
> >  doc.end()
> >  return doc
>
> Can't judge the fix without understanding the problem, and the problem
> will be easier to understand for me with a reproducer.
>

audio.json:

```
##
# = Audio
##

##
# @AudiodevPerDirectionOptions:
#
# General audio backend options that are used for both playback and
# recording.
#
```

Modify this excerpt to have a comment after the "= Audio" header, say for
instance if you were to take out that intro paragraph and transform it into
a comment that preceded the AudiodevPerDirectionOptions doc block.

e.g.

```
##
# = Audio
##

# Lorem Ipsum

##
# @AudiodevPerDirectionOptions:
```

the parser breaks because the line I changed that primes the next token is
still set to "not ignore comments", which gives a
rather unhelpful message:

../qapi/audio.json:13:1: junk after '##' at start of documentation comment

Encountered when converting developer commentary from documentation
paragraphs to mere QAPI comments.

--js


Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-05-16 Thread Michael Galaxy

These are very compelling results, no?

(40gbps cards, right? Are the cards active/active? or active/standby?)

- Michael

On 5/14/24 10:19, Yu Zhang wrote:

Hello Peter and all,

I did a comparison of the VM live-migration speeds between RDMA and
TCP/IP on our servers
and plotted the results to get an initial impression. Unfortunately,
the Ethernet NICs are not the
recent ones, therefore, it may not make much sense. I can do it on
servers with more recent Ethernet
NICs and keep you updated.

It seems that the benefits of RDMA becomes obviously when the VM has
large memory and is
running memory-intensive workload.

Best regards,
Yu Zhang @ IONOS Cloud

On Thu, May 9, 2024 at 4:14 PM Peter Xu  wrote:

On Thu, May 09, 2024 at 04:58:34PM +0800, Zheng Chuan via wrote:

That's a good news to see the socket abstraction for RDMA!
When I was developing the series above, the biggest pain was that the RDMA migration had 
no QIOChannel abstraction and I needed to take a 'fake channel'
for it which is awkward in code implementation.
So, as far as I know, we can do this by
i. the first thing is that we need to evaluate the rsocket is good enough to 
satisfy our QIOChannel fundamental abstraction
ii. if it works right, then we will continue to see if it can give us 
opportunity to hide the detail of rdma protocol
 into rsocket by remove most of code in rdma.c and also some hack in 
migration main process.
iii. implement the advanced features like multi-fd and multi-uri for rdma 
migration.

Since I am not familiar with rsocket, I need some times to look at it and do 
some quick verify with rdma migration based on rsocket.
But, yes, I am willing to involved in this refactor work and to see if we can 
make this migration feature more better:)

Based on what we have now, it looks like we'd better halt the deprecation
process a bit, so I think we shouldn't need to rush it at least in 9.1
then, and we'll need to see how it goes on the refactoring.

It'll be perfect if rsocket works, otherwise supporting multifd with little
overhead / exported APIs would also be a good thing in general with
whatever approach.  And obviously all based on the facts that we can get
resources from companies to support this feature first.

Note that so far nobody yet compared with rdma v.s. nic perf, so I hope if
any of us can provide some test results please do so.  Many people are
saying RDMA is better, but I yet didn't see any numbers comparing it with
modern TCP networks.  I don't want to have old impressions floating around
even if things might have changed..  When we have consolidated results, we
should share them out and also reflect that in QEMU's migration docs when a
rdma document page is ready.

Chuan, please check the whole thread discussion, it may help to understand
what we are looking for on rdma migrations [1].  Meanwhile please feel free
to sync with Jinpu's team and see how to move forward with such a project.

[1] 
https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/87frwatp7n@suse.de/__;!!GjvTz_vk!QnXDo1zSlYecz7JvJky4SOQ9I8V5MoGHbINdAQAzMJQ_yYg_8_BSUXz9kjvbSgFefhG0wi1j38KaC3g$

Thanks,

--
Peter Xu





Re: [PATCH v2 1/3] docs: introduce dedicated page about code provenance / sign-off

2024-05-16 Thread Peter Maydell
On Thu, 16 May 2024 at 17:22, Daniel P. Berrangé  wrote:
>
> Currently we have a short paragraph saying that patches must include
> a Signed-off-by line, and merely link to the kernel documentation.
> The linked kernel docs have a lot of content beyond the part about
> sign-off an thus are misleading/distracting to QEMU contributors.

Thanks for this -- I've felt for ages that it was a bit awkward
that we didn't have a good place to link people to for the fuller
explanation of this.

> This introduces a dedicated 'code-provenance' page in QEMU talking
> about why we require sign-off, explaining the other tags we commonly
> use, and what to do in some edge cases.

The version of the kernel SubmittingPatches we used to link to
includes the text "sorry, no pseudonyms or anonymous contributions".
This new documentation doesn't say anything either way about
our approach to pseudonyms. I think we should probably say
something, but I don't know if we have an in-practice consensus
there, so maybe we should approach that as a separate change on
top of this patch.

So for this patch:

Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: CPR/liveupdate: test results using prior bug fix

2024-05-16 Thread Michael Galaxy



On 5/14/24 08:54, Michael Tokarev wrote:

On 5/14/24 16:39, Michael Galaxy wrote:

Steve,

OK, so it does not look like this bugfix you wrote was included in 
8.2.4 (which was released yesterday). Unfortunately, that means that 
anyone using CPR in that release will still (eventually) encounter 
the bug like I did.


8.2.4 is basically a "bugfix" release for 8.2.3 which I somewhat
screwed up (in a minor way), plus a few currently (at the time)
queued up changes.   8.2.3 was a big release though.

I would recommend that y'all consider cherry-picking, perhaps, the 
relevant commits for a possible 8.2.5 ?


Please Cc changes which are relevant for -stable to, well,
qemu-sta...@nongnu.org :)

Which changes needs to be picked up?

Steve, can you comment here, please? At a minimum, we have this one: 
[PULL 20/25] migration: stop vm for cpr


But that pull came with a handful of other changes that are also not in 
QEMU v8, so I suspect I'm missing some other important changes that 
might be important for a stable release?


- Michael



Thanks,

/mjt




Re: [PATCH v2 0/3] docs: define policy forbidding use of "AI" / LLM code generators

2024-05-16 Thread Michael S. Tsirkin
On Thu, May 16, 2024 at 05:22:27PM +0100, Daniel P. Berrangé wrote:
> This patch kicks the hornet's nest of AI / LLM code generators.
> 
> With the increasing interest in code generators in recent times,
> it is inevitable that QEMU contributions will include AI generated
> code. Thus far we have remained silent on the matter. Given that
> everyone knows these tools exist, our current position has to be
> considered tacit acceptance of the use of AI generated code in QEMU.
> 
> The question for the project is whether that is a good position for
> QEMU to take or not ?
> 
> IANAL, but I like to think I'm reasonably proficient at understanding
> open source licensing. I am not inherantly against the use of AI tools,
> rather I am anti-risk. I also want to see OSS licenses respected and
> complied with.
> 
> AFAICT at its current state of (im)maturity the question of licensing
> of AI code generator output does not have a broadly accepted / settled
> legal position. This is an inherent bias/self-interest from the vendors
> promoting their usage, who tend to minimize/dismiss the legal questions.
> >From my POV, this puts such tools in a position of elevated legal risk.
> 
> Given the fuzziness over the legal position of generated code from
> such tools, I don't consider it credible (today) for a contributor
> to assert compliance with the DCO terms (b) or (c) (which is a stated
> pre-requisite for QEMU accepting patches) when a patch includes (or is
> derived from) AI generated code.
> 
> By implication, I think that QEMU must (for now) explicitly decline
> to (knowingly) accept AI generated code.
> 
> Perhaps a few years down the line the legal uncertainty will have
> reduced and we can re-evaluate this policy.
> 
> Discuss...

At this junction, the code generated by these tools is of such
quality that I really won't expect it to pass even cursory code
review.


So for now, I propose adding a single paragraph:

 If you wrote the patch, make sure your "From:" and "Signed-off-by:"
 lines use the same spelling. It's okay if you subscribe or contribute to
 the list via more than one address, but using multiple addresses in one
 commit just confuses things. If someone else wrote the patch, git will
 include a "From:" line in the body of the email (different from your
 envelope From:) that will give credit to the correct author; but again,
 that author's Signed-off-by: line is mandatory, with the same spelling.

+Q: I prompted ChatGPT/Copilot/Llama and it wrote
+   the patch for me. Can I submit it and how do I sign it?
+A: Your patch is likely trash or trivial. Please write your own code.






> Changes in v2:
> 
>  * Fix a huge number of typos in docs
>  * Clarify that maintainers should still add R-b where relevant, even
>if they are already adding their own S-oB.
>  * Clarify situation when contributor re-starts previously abandoned
>work from another contributor.
>  * Add info about Suggested-by tag
>  * Add new docs section dealing with the broad topic of "generated
>files" (whether code generators or compilers)
>  * Simplify the section related to prohibition of AI generated files
>and give further examples of tools considered covered
>  * Remove repeated references to "LLM" as a specific technology, just
>use the broad "AI" term, except for one use of LLM as an example.
>  * Add note that the policy may evolve if the legal clarity improves
>  * Add note that exceptions can be requested on case-by-case basis
>if contributor thinks they can demonstrate a credible copyright
>and licensing status
> 
> Daniel P. Berrangé (3):
>   docs: introduce dedicated page about code provenance / sign-off
>   docs: define policy limiting the inclusion of generated files
>   docs: define policy forbidding use of AI code generators
> 
>  docs/devel/code-provenance.rst| 315 ++
>  docs/devel/index-process.rst  |   1 +
>  docs/devel/submitting-a-patch.rst |  19 +-
>  3 files changed, 318 insertions(+), 17 deletions(-)
>  create mode 100644 docs/devel/code-provenance.rst
> 
> -- 
> 2.43.0




Re: [PATCH v7 00/12] Enabling DCD emulation support in Qemu

2024-05-16 Thread fan
On Tue, May 14, 2024 at 02:16:51AM +, Zhijian Li (Fujitsu) wrote:
> Hi Fan
> 
> 
> Do you have a newer instruction to play with the DCD. It seems that
> the instruction in RFC[0] doesn't work for current code.
> 
> [0] https://lore.kernel.org/all/20230511175609.2091136-1-fan...@samsung.com/
> 

For the testing, the only thing that has been changed for this series is
the QMP interface for add/release DC extents.

https://lore.kernel.org/linux-cxl/d708f7c8-2598-4a17-9cbb-935c6ae2a...@fujitsu.com/T/#m05066f0098e976fb1c4b05db5e7ff7ca1bf27b1e

1. Add dynamic capacity extents:

For example, the command to add two continuous extents (each 128MiB long)
to region 0 (starting at DPA offset 0) looks like below:

{ "execute": "qmp_capabilities" }

{ "execute": "cxl-add-dynamic-capacity",
  "arguments": {
  "path": "/machine/peripheral/cxl-dcd0",
  "hid": 0,
  "selection-policy": 2,
  "region-id": 0,
  "tag": "",
  "extents": [
  {
  "offset": 0,
  "len": 134217728
  },
  {
  "offset": 134217728,
  "len": 134217728
  }
  ]
  }
}

2. Release dynamic capacity extents:

For example, the command to release an extent of size 128MiB from region 0
(DPA offset 128MiB) looks like below:

{ "execute": "cxl-release-dynamic-capacity",
  "arguments": {
  "path": "/machine/peripheral/cxl-dcd0",
  "hid": 0,
  "flags": 1,
  "region-id": 0,
  "tag": "",
  "extents": [
  {
  "offset": 134217728,
  "len": 134217728
  }
  ]
  }
}

btw, I have a wiki page to explain how to test CXL DCD with a tool I
wrote.
https://github.com/moking/moking.github.io/wiki/cxl%E2%80%90test%E2%80%90tool:-A-tool-to-ease-CXL-test-with-QEMU-setup%E2%80%90%E2%80%90Using-DCD-test-as-an-example

Let me know if you need more info for testing.


Fan

> 
> 
> On 19/04/2024 07:10, nifan@gmail.com wrote:
> > A git tree of this series can be found here (with one extra commit on top
> > for printing out accepted/pending extent list):
> > https://github.com/moking/qemu/tree/dcd-v7
> > 
> > v6->v7:
> > 
> > 1. Fixed the dvsec range register issue mentioned in the the cover letter 
> > in v6.
> > Only relevant bits are set to mark the device ready (Patch 6). 
> > (Jonathan)
> > 2. Moved the if statement in cxl_setup_memory from Patch 6 to Patch 4. 
> > (Jonathan)
> > 3. Used MIN instead of if statement to get record_count in Patch 7. 
> > (Jonathan)
> > 4. Added "Reviewed-by" tag to Patch 7.
> > 5. Modified cxl_dc_extent_release_dry_run so the updated extent list can be
> > reused in cmd_dcd_release_dyn_cap to simplify the process in Patch 8. 
> > (Jørgen)
> > 6. Added comments to indicate further "TODO" items in 
> > cmd_dcd_add_dyn_cap_rsp.
> >  (Jonathan)
> > 7. Avoided irrelevant code reformat in Patch 8. (Jonathan)
> > 8. Modified QMP interfaces for adding/releasing DC extents to allow passing
> > tags, selection policy, flags in the interface. (Jonathan, Gregory)
> > 9. Redesigned the pending list so extents in the same requests are grouped
> >  together. A new data structure is introduced to represent "extent 
> > group"
> >  in pending list.  (Jonathan)
> > 10. Added support in QMP interface for "More" flag.
> > 11. Check "Forced removal" flag for release request and not let it pass 
> > through.
> > 12. Removed the dynamic capacity log type from CxlEventLog definition in 
> > cxl.json
> > to avoid the side effect it may introduce to inject error to DC event 
> > log.
> > (Jonathan)
> > 13. Hard coded the event log type to dynamic capacity event log in QMP
> >  interfaces. (Jonathan)
> > 14. Adding space in between "-1]". (Jonathan)
> > 15. Some minor comment fixes.
> > 
> > The code is tested with similar setup and has passed similar tests as listed
> > in the cover letter of v5[1] and v6[2].
> > Also, the code is tested with the latest DCD kernel patchset[3].
> > 
> > [1] Qemu DCD patchset v5: 
> > https://lore.kernel.org/linux-cxl/20240304194331.1586191-1-nifan@gmail.com/T/#t
> > [2] Qemu DCD patchset v6: 
> > https://lore.kernel.org/linux-cxl/20240325190339.696686-1-nifan@gmail.com/T/#t
> > [3] DCD kernel patches: 
> > https://lore.kernel.org/linux-cxl/20240324-dcd-type2-upstream-v1-0-b7b00d623...@intel.com/T/#m11c571e21c4fe17c7d04ec5c2c7bc7cbf2cd07e3
> > 
> > 
> > Fan Ni (12):
> >hw/cxl/cxl-mailbox-utils: Add dc_event_log_size field to output
> >  payload of identify memory device command
> >hw/cxl/cxl-mailbox-utils: Add dynamic capacity region representative
> >  and mailbox command support
> >include/hw/cxl/cxl_device: Rename mem_size as static_mem_size for
> >  type3 memory devices
> >hw/mem/cxl_type3: Add support to create DC regions to type3 memory
> >  devices
> >hw/mem/cxl-type3: Refactor ct3_build_cdat_entries_for_mr to take mr
> >  size instead of mr as argument
> >hw/mem/cxl_type3: Add host backend and address space handling for DC

Re: [PATCH v2 3/3] docs: define policy forbidding use of AI code generators

2024-05-16 Thread Michael S. Tsirkin
On Thu, May 16, 2024 at 05:22:30PM +0100, Daniel P. Berrangé wrote:
> There has been an explosion of interest in so called AI code generators
> in the past year or two. Thus far though, this has not been matched
> by a broadly accepted legal interpretation of the licensing implications
> for code generator outputs. While the vendors may claim there is no
> problem and a free choice of license is possible, they have an inherent
> conflict of interest in promoting this interpretation. More broadly
> there is, as yet, no broad consensus on the licensing implications of
> code generators trained on inputs under a wide variety of licenses
> 
> The DCO requires contributors to assert they have the right to
> contribute under the designated project license. Given the lack of
> consensus on the licensing of AI code generator output, it is not
> considered credible to assert compliance with the DCO clause (b) or (c)
> where a patch includes such generated code.
> 
> This patch thus defines a policy that the QEMU project will currently
> not accept contributions where use of AI code generators is either
> known, or suspected.
> 
> This merely reflects the current uncertainty of the field, and should
> this situation change, the policy is of course subject to future
> relaxation. Meanwhile requests for exceptions can also be considered on
> a case by case basis.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  docs/devel/code-provenance.rst | 50 +-
>  1 file changed, 49 insertions(+), 1 deletion(-)
> 
> diff --git a/docs/devel/code-provenance.rst b/docs/devel/code-provenance.rst
> index eabb3e7c08..846dda9a35 100644
> --- a/docs/devel/code-provenance.rst
> +++ b/docs/devel/code-provenance.rst
> @@ -264,4 +264,52 @@ boilerplate code template which is then filled in to 
> produce the final patch.
>  The output of such a tool would still be considered the "preferred format",
>  since it is intended to be a foundation for further human authored changes.
>  Such tools are acceptable to use, provided they follow a deterministic 
> process
> -and there is clearly defined copyright and licensing for their output.
> +and there is clearly defined copyright and licensing for their output. Note
> +in particular the caveats applying to AI code generators below.
> +
> +Use of AI code generators
> +~
> +
> +TL;DR:
> +
> +  **Current QEMU project policy is to DECLINE any contributions which are
> +  believed to include or derive from AI generated code. This includes 
> ChatGPT,
> +  CoPilot, Llama and similar tools**
> +
> +The increasing prevalence of AI code generators, most notably but not limited
> +to, `Large Language Models 
> `__
> +(LLMs) results in a number of difficult legal questions and risks for 
> software
> +projects, including QEMU.
> +
> +The QEMU community requires that contributors certify their patch submissions
> +are made in accordance with the rules of the :ref:`dco` (DCO).
> +
> +To satisfy the DCO, the patch contributor has to fully understand the
> +copyright and license status of code they are contributing to QEMU. With AI
> +code generators, the copyright and license status of the output is 
> ill-defined
> +with no generally accepted, settled legal foundation.
> +
> +Where the training material is known, it is common for it to include large
> +volumes of material under restrictive licensing/copyright terms. Even where
> +the training material is all known to be under open source licenses, it is
> +likely to be under a variety of terms, not all of which will be compatible
> +with QEMU's licensing requirements.
> +
> +With this in mind, the QEMU project does not consider it is currently 
> possible
> +for contributors to comply with DCO terms (b) or (c) for the output of 
> commonly
> +available AI code generators.
> +
> +The QEMU maintainers thus require that contributors refrain from using AI 
> code
> +generators on patches intended to be submitted to the project, and will
> +decline any contribution if use of AI is either known or suspected.
> +
> +Examples of tools impacted by this policy includes both GitHub's CoPilot,
> +OpenAI's ChatGPT, and Meta's Code Llama, amongst many others which are less
> +well known.
> +
> +This policy may evolve as the legal situation is clarified. In the meanwhile,
> +requests for exceptions to this policy will be evaluated by the QEMU project
> +on a case by case basis. To be granted an exception, a contributor will need
> +to demonstrate clarity of the license and copyright status for the tool's
> +output in relation to its training model and code, to the satisfaction of the
> +project maintainers.

I would definitely want more contributors to pass their
comments and commit logs though a grammar checker.
It's unclear to me whether the contributors would
be required to know whether the checker in question is
considered "AI" or not.




> -- 
> 2.43.0




[PATCH 0/1] riscv, gdbstub.c: fix reg_width in ricsv_gen_dynamic_vector_feature()

2024-05-16 Thread Daniel Henrique Barboza
Hi,

Commit 33a24910ae ("target/riscv: Use GDBFeature for dynamic XML")
changed 'reg_width' for vector regs, a change that I believe to be
unintended, and we're unable to print vector regs in GDB ATM.

The following is a gdb output of a simple program running with
qemu-riscv64 when trying to print the value of 'v1' after a 'vle'
insns:

(gdb) p $v1
$1 = {q = 0x0, l = 0x0, w = 0x0, s = {57920}, b = {64, 226}}
(gdb) 

After this patch:

(gdb) p $v1
$1 = {q = {9781192033638379298842687819604544}, l = {530239482618432, 
530239482618432}, w = {123456, 123456, 
123456, 123456}, s = {57920, 1, 57920, 1, 57920, 1, 57920, 1}, b = {64, 
226, 1, 0, 64, 226, 1, 0, 64, 226, 1, 
0, 64, 226, 1, 0}}
(gdb) 


Michael, this is a good pick for qemu-stable.

Daniel Henrique Barboza (1):
  riscv, gdbstub.c: fix reg_width in ricsv_gen_dynamic_vector_feature()

 target/riscv/gdbstub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.44.0




[PATCH 1/1] riscv, gdbstub.c: fix reg_width in ricsv_gen_dynamic_vector_feature()

2024-05-16 Thread Daniel Henrique Barboza
Commit 33a24910ae changed 'reg_width' to use 'vlenb', i.e. vector length
in bytes, when in this context we want 'reg_width' as the length in
bits.

Fix 'reg_width' back to the value in bits like 7cb59921c05a
("target/riscv/gdbstub.c: use 'vlenb' instead of shifting 'vlen'") set
beforehand.

Cc: Akihiko Odaki 
Cc: Alex Bennée 
Reported-by: Robin Dapp 
Fixes: 33a24910ae ("target/riscv: Use GDBFeature for dynamic XML")
Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/gdbstub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/riscv/gdbstub.c b/target/riscv/gdbstub.c
index d0cc5762c2..358158c42a 100644
--- a/target/riscv/gdbstub.c
+++ b/target/riscv/gdbstub.c
@@ -288,7 +288,7 @@ static GDBFeature *riscv_gen_dynamic_csr_feature(CPUState 
*cs, int base_reg)
 static GDBFeature *ricsv_gen_dynamic_vector_feature(CPUState *cs, int base_reg)
 {
 RISCVCPU *cpu = RISCV_CPU(cs);
-int reg_width = cpu->cfg.vlenb;
+int reg_width = cpu->cfg.vlenb << 3;
 GDBFeatureBuilder builder;
 int i;
 
-- 
2.44.0




Re: [PATCH v7 06/12] hw/mem/cxl_type3: Add host backend and address space handling for DC regions

2024-05-16 Thread fan
On Tue, May 14, 2024 at 08:28:27AM +, Zhijian Li (Fujitsu) wrote:
> 
> 
> On 19/04/2024 07:10, nifan@gmail.com wrote:
> > +uint64_t dc_size;
> > +
> > +mr = host_memory_backend_get_memory(ct3d->dc.host_dc);
> > +dc_size = memory_region_size(mr);
> > +region_len = DIV_ROUND_UP(dc_size, ct3d->dc.num_regions);
> > +
> > +if (dc_size % (ct3d->dc.num_regions * CXL_CAPACITY_MULTIPLIER) != 0) {
> > +error_setg(errp, "host backend size must be multiples of region 
> > len");
> 
> I prefer to have the %region_len% in the error message as well so that i can 
> update the
> backend file accordingly.

Will add.

Fan
> 
> 
> 
> > +return false;
> > +}



Re: [PATCH v7 04/12] hw/mem/cxl_type3: Add support to create DC regions to type3 memory devices

2024-05-16 Thread fan
On Tue, May 14, 2024 at 08:14:59AM +, Zhijian Li (Fujitsu) wrote:
> 
> 
> On 19/04/2024 07:10, nifan@gmail.com wrote:
> > From: Fan Ni 
> > 
> 
> > +}
> > +
> >   static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp)
> >   {
> >   DeviceState *ds = DEVICE(ct3d);
> > @@ -635,6 +676,13 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error 
> > **errp)
> >   g_free(p_name);
> >   }
> >   
> > +if (ct3d->dc.num_regions > 0) {
> > +if (!cxl_create_dc_regions(ct3d, errp)) {
> > +error_setg(errp, "setup DC regions failed");
> 
> This error_set() would cause an assertion if the errp was assigned inside 
> cxl_create_dc_regions();
> Try error_append_hint() instead
Thanks, Let me check and fix.

Fan
> 
> #3  0x7f1fdc4fafc6 in annobin_assert.c_end () at /lib64/libc.so.6
> #4  0x555fd3edbea8 in error_setv
>  (errp=0x7ffe6d1a3de0, src=0x555fd3fe262b "../hw/mem/cxl_type3.c", 
> line=807, func=0x555fd3fe2fe0 <__func__.21> "cxl_setup_memory", 
> err_class=ERROR_CLASS_GENERIC_ERROR, fmt=0x555fd3fe2939 "setup DC regions 
> failed", ap=0x7ffe6d1a3
> c00, suffix=0x0) at ../util/error.c:68
> #5  0x555fd3edc126 in error_setg_internal
>  (errp=0x7ffe6d1a3de0, src=0x555fd3fe262b "../hw/mem/cxl_type3.c", 
> line=807, func=0x555fd3fe2fe0 <__func__.21> "cxl_setup_memory", 
> fmt=0x555fd3fe2939 "setup DC regions failed") at ../util/error.c:105
> #6  0x555fd3819c9f in cxl_setup_memory (ct3d=0x555fd8b2f3e0, 
> errp=0x7ffe6d1a3de0) at ../hw/mem/cxl_type3.c:807
> #7  0x555fd3819d7b in ct3_realize (pci_dev=0x555fd8b2f3e0, 
> errp=0x7ffe6d1a3de0) at ../hw/mem/cxl_type3.c:833
> #8  0x555fd38b575f in pci_qdev_realize (qdev=0x555fd8b2f3e0, 
> errp=0x7ffe6d1a3e60) at ../hw/pci/pci.c:2093
> #9  0x555fd3ccca9b in device_set_realized (obj=0x555fd8b2f3e0, 
> value=true, errp=0x7ffe6d1a40d0)



Re: [PATCH v7 00/12] Enabling DCD emulation support in Qemu

2024-05-16 Thread fan
On Fri, Apr 19, 2024 at 02:24:36PM -0400, Gregory Price wrote:
> On Thu, Apr 18, 2024 at 04:10:51PM -0700, nifan@gmail.com wrote:
> > A git tree of this series can be found here (with one extra commit on top
> > for printing out accepted/pending extent list): 
> > https://github.com/moking/qemu/tree/dcd-v7
> > 
> > v6->v7:
> > 
> > 1. Fixed the dvsec range register issue mentioned in the the cover letter 
> > in v6.
> >Only relevant bits are set to mark the device ready (Patch 6). (Jonathan)
> > 2. Moved the if statement in cxl_setup_memory from Patch 6 to Patch 4. 
> > (Jonathan)
> > 3. Used MIN instead of if statement to get record_count in Patch 7. 
> > (Jonathan)
> > 4. Added "Reviewed-by" tag to Patch 7.
> > 5. Modified cxl_dc_extent_release_dry_run so the updated extent list can be
> >reused in cmd_dcd_release_dyn_cap to simplify the process in Patch 8. 
> > (Jørgen) 
> > 6. Added comments to indicate further "TODO" items in 
> > cmd_dcd_add_dyn_cap_rsp.
> > (Jonathan)
> > 7. Avoided irrelevant code reformat in Patch 8. (Jonathan)
> > 8. Modified QMP interfaces for adding/releasing DC extents to allow passing
> >tags, selection policy, flags in the interface. (Jonathan, Gregory)
> > 9. Redesigned the pending list so extents in the same requests are grouped
> > together. A new data structure is introduced to represent "extent group"
> > in pending list.  (Jonathan)
> > 10. Added support in QMP interface for "More" flag. 
> > 11. Check "Forced removal" flag for release request and not let it pass 
> > through.
> > 12. Removed the dynamic capacity log type from CxlEventLog definition in 
> > cxl.json
> >to avoid the side effect it may introduce to inject error to DC event 
> > log.
> >(Jonathan)
> > 13. Hard coded the event log type to dynamic capacity event log in QMP
> > interfaces. (Jonathan)
> > 14. Adding space in between "-1]". (Jonathan)
> > 15. Some minor comment fixes.
> > 
> > The code is tested with similar setup and has passed similar tests as listed
> > in the cover letter of v5[1] and v6[2].
> > Also, the code is tested with the latest DCD kernel patchset[3].
> > 
> > [1] Qemu DCD patchset v5: 
> > https://lore.kernel.org/linux-cxl/20240304194331.1586191-1-nifan@gmail.com/T/#t
> > [2] Qemu DCD patchset v6: 
> > https://lore.kernel.org/linux-cxl/20240325190339.696686-1-nifan@gmail.com/T/#t
> > [3] DCD kernel patches: 
> > https://lore.kernel.org/linux-cxl/20240324-dcd-type2-upstream-v1-0-b7b00d623...@intel.com/T/#m11c571e21c4fe17c7d04ec5c2c7bc7cbf2cd07e3
> >
> 
> added review to all patches, will hopefully be able to add a Tested-by
> tag early next week, along with a v1 RFC for MHD bit-tracking.
> 
> We've been testing v5/v6 for a bit, so I expect as soon as we get the
> MHD code ported over to v7 i'll ship a tested-by tag pretty quick.
> 
> The super-set release will complicate a few things but this doesn't
> look like a blocker on our end, just a change to how we track bits in a
> shared bit/bytemap.
> 

Hi Gregory,
I am planning to address all the concerns in this series and send out v8
next week. Jonathan mentioned you have few related patches built on top
of this series, can you point me to the latest version so I can look
into it? Also, would you like me to carry them over to send together
with my series in next version? It could be easier for you to avoid the
potential rebase needed for your patches?

Let me know.

Thanks,
Fan

> > 
> > Fan Ni (12):
> >   hw/cxl/cxl-mailbox-utils: Add dc_event_log_size field to output
> > payload of identify memory device command
> >   hw/cxl/cxl-mailbox-utils: Add dynamic capacity region representative
> > and mailbox command support
> >   include/hw/cxl/cxl_device: Rename mem_size as static_mem_size for
> > type3 memory devices
> >   hw/mem/cxl_type3: Add support to create DC regions to type3 memory
> > devices
> >   hw/mem/cxl-type3: Refactor ct3_build_cdat_entries_for_mr to take mr
> > size instead of mr as argument
> >   hw/mem/cxl_type3: Add host backend and address space handling for DC
> > regions
> >   hw/mem/cxl_type3: Add DC extent list representative and get DC extent
> > list mailbox support
> >   hw/cxl/cxl-mailbox-utils: Add mailbox commands to support add/release
> > dynamic capacity response
> >   hw/cxl/events: Add qmp interfaces to add/release dynamic capacity
> > extents
> >   hw/mem/cxl_type3: Add DPA range validation for accesses to DC regions
> >   hw/cxl/cxl-mailbox-utils: Add superset extent release mailbox support
> >   hw/mem/cxl_type3: Allow to release extent superset in QMP interface
> > 
> >  hw/cxl/cxl-mailbox-utils.c  | 620 ++-
> >  hw/mem/cxl_type3.c  | 633 +---
> >  hw/mem/cxl_type3_stubs.c|  20 ++
> >  include/hw/cxl/cxl_device.h |  81 -
> >  include/hw/cxl/cxl_events.h |  18 +
> >  qapi/cxl.json   |  69 
> >  6 files changed, 1396 

Re: [PATCH v2 2/3] docs: define policy limiting the inclusion of generated files

2024-05-16 Thread Michael S. Tsirkin
On Thu, May 16, 2024 at 05:22:29PM +0100, Daniel P. Berrangé wrote:
> Files contributed to QEMU are generally expected to be provided in the
> preferred format for manipulation. IOW, we generally don't expect to
> have generated / compiled code included in the tree, rather, we expect
> to run the code generator / compiler as part of the build process.
> 
> There are some obvious exceptions to this seen in our existing tree, the
> biggest one being the inclusion of many binary firmware ROMs. A more
> niche example is the inclusion of a generated eBPF program. Or the CI
> dockerfiles which are mostly auto-generated. In these cases, however,
> the preferred format source code is still required to be included,
> alongside the generated output.
> 
> Tools which perform user defined algorithmic transformations on code are
> not considered to be "code generators". ie, we permit use of coccinelle,
> spell checkers, and sed/awk/etc to manipulate code. Such use of automated
> manipulation should still be declared in the commit message.
> 
> One off generators which create a boilerplate file which the author then
> fills in, are acceptable if their output has clear copyright and license
> status. This could be where a contributor writes a throwaway python
> script to automate creation of some mundane piece of code for example.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  docs/devel/code-provenance.rst | 55 ++
>  1 file changed, 55 insertions(+)
> 
> diff --git a/docs/devel/code-provenance.rst b/docs/devel/code-provenance.rst
> index 7c42fae571..eabb3e7c08 100644
> --- a/docs/devel/code-provenance.rst
> +++ b/docs/devel/code-provenance.rst
> @@ -210,3 +210,58 @@ mailing list.
>  It is also recommended to attempt to contact the original author to let them
>  know you are interested in taking over their work, in case they still 
> intended
>  to return to the work, or had any suggestions about the best way to continue.
> +
> +Inclusion of generated files
> +
> +
> +Files in patches contributed to QEMU are generally expected to be provided
> +only in the preferred format for making modifications. The implication of
> +this is that the output of code generators or compilers is usually not
> +appropriate to contribute to QEMU.
> +
> +For reasons of practicality there are some exceptions to this rule, where
> +generated code is permitted, provided it is also accompanied by the
> +corresponding preferred source format. This is done where it is impractical
> +to expect those building QEMU to run the code generation or compilation
> +process. A non-exhaustive list of examples is:
> +
> + * Images: where a bitmap image is created from a vector file it is common
> +   to include the rendered bitmaps at desired resolution(s), since subtle
> +   changes in the rasterization process / tools may affect quality. The
> +   original vector file is expected to accompany any generated bitmaps.
> +
> + * Firmware: QEMU includes pre-compiled binary ROMs for a variety of guest
> +   firmwares. When such binary ROMs are contributed, the corresponding source
> +   must also be provided, either directly, or through a git submodule link.
> +
> + * Dockerfiles: the majority of the dockerfiles are automatically generated
> +   from a canonical list of build dependencies maintained in tree, together
> +   with the libvirt-ci git submodule link. The generated dockerfiles are
> +   included in tree because it is desirable to be able to directly build
> +   container images from a clean git checkout.
> +
> + * EBPF: QEMU includes some generated EBPF machine code, since the required
> +   eBPF compilation tools are not broadly available on all targeted OS
> +   distributions. The corresponding eBPF C code for the binary is also
> +   provided. This is a time limited exception until the eBPF toolchain is
> +   sufficiently broadly available in distros.
> +
> +In all cases above, the existence of generated files must be acknowledged
> +and justified in the commit that introduces them.
> +
> +Tools which perform changes to existing code with deterministic algorithmic
> +manipulation, driven by user specified inputs, are not generally considered
> +to be "generators".
> +
> +IOW, using coccinelle to convert code from one pattern to another pattern, or
> +fixing docs typos with a spell checker, or transforming code using sed / awk 
> /
> +etc, are not considered to be acts of code generation. Where an automated
> +manipulation is performed on code, however, this should be declared in the
> +commit message.
> +
> +At times contributors may use or create scripts/tools to generate an initial
> +boilerplate code template which is then filled in to produce the final patch.
> +The output of such a tool would still be considered the "preferred format",
> +since it is intended to be a foundation for further human authored changes.
> +Such tools are acceptable to use, provided they follow a 

[PATCH] gitlab-ci: Replace Docker with Kaniko

2024-05-16 Thread Camilla Conte
Enables caching from the qemu-project repository.

Uses a dedicated "$NAME-cache" tag for caching, to address limitations.
See issue "when using --cache=true, kaniko fail to push cache layer [...]":
https://github.com/GoogleContainerTools/kaniko/issues/1459

Does not specify a context since no Dockerfile is using COPY or ADD 
instructions.

Does not enable reproducible builds as
that results in builds failing with an out of memory error.
See issue "Using --reproducible loads entire image into memory":
https://github.com/GoogleContainerTools/kaniko/issues/862

Previous attempts, for the records:
  - Alex Bennée: 
https://lore.kernel.org/qemu-devel/20230330101141.30199-12-alex.ben...@linaro.org/
  - Camilla Conte (me): 
https://lore.kernel.org/qemu-devel/20230531150824.32349-6-cco...@redhat.com/

Signed-off-by: Camilla Conte 
---
 .gitlab-ci.d/container-template.yml | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/.gitlab-ci.d/container-template.yml 
b/.gitlab-ci.d/container-template.yml
index 4eec72f383..066f253dd5 100644
--- a/.gitlab-ci.d/container-template.yml
+++ b/.gitlab-ci.d/container-template.yml
@@ -1,21 +1,18 @@
 .container_job_template:
   extends: .base_job_template
-  image: docker:latest
   stage: containers
-  services:
-- docker:dind
+  image:
+name: gcr.io/kaniko-project/executor:debug
+entrypoint: [""]
+  variables:
+DOCKERFILE: "$CI_PROJECT_DIR/tests/docker/dockerfiles/$NAME.docker"
+CACHE_REPO: "$CI_REGISTRY/qemu-project/qemu/qemu/$NAME-cache"
   before_script:
 - export TAG="$CI_REGISTRY_IMAGE/qemu/$NAME:$QEMU_CI_CONTAINER_TAG"
-# Always ':latest' because we always use upstream as a common cache source
-- export COMMON_TAG="$CI_REGISTRY/qemu-project/qemu/qemu/$NAME:latest"
-- docker login $CI_REGISTRY -u "$CI_REGISTRY_USER" -p 
"$CI_REGISTRY_PASSWORD"
-- until docker info; do sleep 1; done
   script:
 - echo "TAG:$TAG"
-- echo "COMMON_TAG:$COMMON_TAG"
-- docker build --tag "$TAG" --cache-from "$TAG" --cache-from "$COMMON_TAG"
-  --build-arg BUILDKIT_INLINE_CACHE=1
-  -f "tests/docker/dockerfiles/$NAME.docker" "."
-- docker push "$TAG"
-  after_script:
-- docker logout
+- /kaniko/executor
+  --dockerfile "$DOCKERFILE"
+  --destination "$TAG"
+  --cache=true
+  --cache-repo="$CACHE_REPO"
-- 
2.45.0




Re: [PATCH 00/16] VFIO: misc cleanups part2

2024-05-16 Thread Cédric Le Goater

Hello Zhenzhong,

On 5/15/24 10:20, Zhenzhong Duan wrote:

Hi

This is the last round of cleanup series to change functions in hw/vfio/
to return bool when the error is passed through errp parameter.

The first round is at 
https://lists.gnu.org/archive/html/qemu-devel/2024-05/msg01147.html

I see Cédric is also working on some migration stuff cleanup,
so didn't touch migration.c, but all other files in hw/vfio/ are cleanup now.

Patch1 is a fix patch, all others are cleanup patches.

Test done on x86 platform:
vfio device hotplug/unplug with different backend
reboot

This series is rebased to https://github.com/legoater/qemu/tree/vfio-next


I queued part 1 in vfio-next with other changes. part 2 is in vfio-9.1
for now and should reach vfio-next after reviews next week.

Then, we have to work on your v5 [1] which should have all my attention
again after the next vfio PR. You, Joao and Eric have followups series
that need a resync on top of v5, possibly others [2] and [3], not sent
AFAICT. Anyhow, we will need inputs from these people and IOMMU
stakeholders/maintainers.

Thanks,

C.

[1] [PATCH v5 00/19] Add a host IOMMU device abstraction to check with vIOMMU

https://lore.kernel.org/qemu-devel/20240507092043.1172717-1-zhenzhong.d...@intel.com/

[2] [PATCH ats_vtd v2 00/25] ATS support for VT-d

https://lore.kernel.org/all/20240515071057.33990-1-clement.mathieu--d...@eviden.com/

[3] Add Tegra241 (Grace) CMDQV Support
https://lore.kernel.org/all/cover.1712978212.git.nicol...@nvidia.com/
https://github.com/nicolinc/qemu/commits/wip/iommufd_vcmdq/





Thanks
Zhenzhong

Zhenzhong Duan (16):
   vfio/display: Fix error path in call site of ramfb_setup()
   vfio/display: Make vfio_display_*() return bool
   vfio/helpers: Use g_autofree in hw/vfio/helpers.c
   vfio/helpers: Make vfio_set_irq_signaling() return bool
   vfio/helpers: Make vfio_device_get_name() return bool
   vfio/platform: Make vfio_populate_device() and vfio_base_device_init()
 return bool
   vfio/ccw: Make vfio_ccw_get_region() return a bool
   vfio/pci: Make vfio_intx_enable_kvm() return a bool
   vfio/pci: Make vfio_pci_relocate_msix() and vfio_msix_early_setup()
 return a bool
   vfio/pci: Make vfio_populate_device() return a bool
   vfio/pci: Make vfio_intx_enable() return bool
   vfio/pci: Make vfio_populate_vga() return bool
   vfio/pci: Make capability related functions return bool
   vfio/pci: Use g_autofree for vfio_region_info pointer
   vfio/pci-quirks: Make vfio_pci_igd_opregion_init() return bool
   vfio/pci-quirks: Make vfio_add_*_cap() return bool

  hw/vfio/pci.h |  12 +-
  include/hw/vfio/vfio-common.h |   6 +-
  hw/vfio/ap.c  |  10 +-
  hw/vfio/ccw.c |  25 ++--
  hw/vfio/display.c |  22 ++--
  hw/vfio/helpers.c |  33 ++---
  hw/vfio/igd.c |   5 +-
  hw/vfio/pci-quirks.c  |  50 
  hw/vfio/pci.c | 227 --
  hw/vfio/platform.c|  61 -
  10 files changed, 213 insertions(+), 238 deletions(-)






Re: [PATCH v2 00/11] VFIO: misc cleanups

2024-05-16 Thread Cédric Le Goater

On 5/7/24 08:42, Zhenzhong Duan wrote:

Hi

This is a cleanup series to change functions in hw/vfio/ to return bool
when the error is passed through errp parameter, also some cleanup
with g_autofree.

See discussion at 
https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg04782.html

This series processed below files:
hw/vfio/container.c
hw/vfio/iommufd.c
hw/vfio/cpr.c
backends/iommufd.c

So above files are clean now, there are still other files need processing
in hw/vfio.

Test done on x86 platform:
vfio device hotplug/unplug with different backend
reboot

Thanks
Zhenzhong

Changelog:
v2:
- split out g_autofree code as a patch (Cédric)
- add processing for more files

Zhenzhong Duan (11):
   vfio/pci: Use g_autofree in vfio_realize
   vfio/pci: Use g_autofree in iommufd_cdev_get_info_iova_range()
   vfio: Make VFIOIOMMUClass::attach_device() and its wrapper return bool
   vfio: Make VFIOIOMMUClass::setup() return bool
   vfio: Make VFIOIOMMUClass::add_window() and its wrapper return bool
   vfio/container: Make vfio_connect_container() return bool
   vfio/container: Make vfio_set_iommu() return bool
   vfio/container: Make vfio_get_device() return bool
   vfio/iommufd: Make iommufd_cdev_*() return bool
   vfio/cpr: Make vfio_cpr_register_container() return bool
   backends/iommufd: Make iommufd_backend_*() return bool



Applied to vfio-next.

Thanks,

C.




Re: [PATCH v3 0/4] qapi/vfio: Add VFIO migration QAPI event

2024-05-16 Thread Cédric Le Goater

On 5/15/24 15:21, Avihai Horon wrote:

Hello,

This series adds a new QAPI event for VFIO device migration state
change. This event will be emitted when a VFIO device changes its
state, for example, during migration or when stopping/starting the
guest.

This event can be used by management applications to get updates on the
current state of the VFIO device for their own purposes.

A new per VFIO device capability, "migration-events", is added so events
can be enabled only for the required devices. It is disabled by default.

Feedback/comments are appreciated,



Applied to vfio-next.

Thanks,

C.




Thanks.

Changes from v2 [2]:
* Added assert for vbasedev->ops->vfio_get_object and obj. (Cedric)
* Renamed set_state() to vfio_migration_set_device_state(). (Cedric)
* Enhanced tracing of device state change. (Cedric)
* Added Cedric's R-b.

Changes from v1 [1]:
* Added more info to patch #1 commit message. (Markus)
* Renamed VFIODeviceMigState to VfioMigrationState and
   VFIO_DEVICE_MIG_STATE_CHANGED to VFIO_MIGRATION. (Joao, Markus)
* Added qom-path and qdev id to VFIO_MIGRATION event data. (Markus)
* Handled no-op state transitions in vfio_migration_set_state().
   (Cedric)
* Added helper to set VFIO state and emit VFIO event. (Peter)

[1]
https://lore.kernel.org/qemu-devel/20240430051621.19597-1-avih...@nvidia.com/

[2]
https://lore.kernel.org/qemu-devel/20240509090954.16447-1-avih...@nvidia.com/

Avihai Horon (4):
   qapi/vfio: Add VFIO migration QAPI event
   vfio/migration: Emit VFIO migration QAPI event
   vfio/migration: Don't emit STOP_COPY VFIO migration QAPI event twice
   vfio/migration: Enhance VFIO migration state tracing

  MAINTAINERS   |  1 +
  qapi/qapi-schema.json |  1 +
  qapi/vfio.json| 67 +
  include/hw/vfio/vfio-common.h |  1 +
  hw/vfio/migration.c   | 71 ---
  hw/vfio/pci.c |  2 +
  hw/vfio/trace-events  |  3 +-
  qapi/meson.build  |  1 +
  8 files changed, 141 insertions(+), 6 deletions(-)
  create mode 100644 qapi/vfio.json






Re: [PATCH v4] vfio/pci: migration: Skip config space check for Vendor Specific Information in VSC during restore/load

2024-05-16 Thread Cédric Le Goater

On 5/3/24 16:51, Vinayak Kale wrote:

In case of migration, during restore operation, qemu checks config space of the
pci device with the config space in the migration stream captured during save
operation. In case of config space data mismatch, restore operation is failed.

config space check is done in function get_pci_config_device(). By default VSC
(vendor-specific-capability) in config space is checked.

Due to qemu's config space check for VSC, live migration is broken across NVIDIA
vGPU devices in situation where source and destination host driver is different.
In this situation, Vendor Specific Information in VSC varies on the destination
to ensure vGPU feature capabilities exposed to the guest driver are compatible
with destination host.

If a vfio-pci device is migration capable and vfio-pci vendor driver is OK with
volatile Vendor Specific Info in VSC then qemu should exempt config space check
for Vendor Specific Info. It is vendor driver's responsibility to ensure that
VSC is consistent across migration. Here consistency could mean that VSC format
should be same on source and destination, however actual Vendor Specific Info
may not be byte-to-byte identical.

This patch skips the check for Vendor Specific Information in VSC for VFIO-PCI
device by clearing pdev->cmask[] offsets. Config space check is still enforced
for 3 byte VSC header. If cmask[] is not set for an offset, then qemu skips
config space check for that offset.

VSC check is skipped for machine types >= 9.1. The check would be enforced on
older machine types (<= 9.0).

Signed-off-by: Vinayak Kale 
Cc: Alex Williamson 
Cc: Michael S. Tsirkin 
Cc: Cédric Le Goater 




Applied to vfio-next.

Thanks,

C.



---
Version History
v3->v4:
 - VSC check is skipped for machine types >= 9.1. The check would be 
enforced
   on older machine types (<= 9.0).
v2->v3:
 - Config space check skipped only for Vendor Specific Info in VSC, check is
   still enforced for 3 byte VSC header.
 - Updated commit description with live migration failure scenario.
v1->v2:
 - Limited scope of change to vfio-pci devices instead of all pci devices.

  hw/core/machine.c |  1 +
  hw/vfio/pci.c | 26 ++
  hw/vfio/pci.h |  1 +
  3 files changed, 28 insertions(+)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 4ff60911e7..fc3eb5115f 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -35,6 +35,7 @@
  
  GlobalProperty hw_compat_9_0[] = {

  {"arm-cpu", "backcompat-cntfrq", "true" },
+{"vfio-pci", "skip-vsc-check", "false" },
  };
  const size_t hw_compat_9_0_len = G_N_ELEMENTS(hw_compat_9_0);
  
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c

index 64780d1b79..2ece9407cc 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2134,6 +2134,28 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, 
uint8_t pos)
  }
  }
  
+static int vfio_add_vendor_specific_cap(VFIOPCIDevice *vdev, int pos,

+uint8_t size, Error **errp)
+{
+PCIDevice *pdev = >pdev;
+
+pos = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, size, errp);
+if (pos < 0) {
+return pos;
+}
+
+/*
+ * Exempt config space check for Vendor Specific Information during
+ * restore/load.
+ * Config space check is still enforced for 3 byte VSC header.
+ */
+if (vdev->skip_vsc_check && size > 3) {
+memset(pdev->cmask + pos + 3, 0, size - 3);
+}
+
+return pos;
+}
+
  static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
  {
  ERRP_GUARD();
@@ -2202,6 +2224,9 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t 
pos, Error **errp)
  vfio_check_af_flr(vdev, pos);
  ret = pci_add_capability(pdev, cap_id, pos, size, errp);
  break;
+case PCI_CAP_ID_VNDR:
+ret = vfio_add_vendor_specific_cap(vdev, pos, size, errp);
+break;
  default:
  ret = pci_add_capability(pdev, cap_id, pos, size, errp);
  break;
@@ -3390,6 +3415,7 @@ static Property vfio_pci_dev_properties[] = {
  DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
   TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
  #endif
+DEFINE_PROP_BOOL("skip-vsc-check", VFIOPCIDevice, skip_vsc_check, true),
  DEFINE_PROP_END_OF_LIST(),
  };
  
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h

index 6e64a2654e..92cd62d115 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -177,6 +177,7 @@ struct VFIOPCIDevice {
  OnOffAuto ramfb_migrate;
  bool defer_kvm_irq_routing;
  bool clear_parent_atomics_on_exit;
+bool skip_vsc_check;
  VFIODisplay *dpy;
  Notifier irqchip_change_notifier;
  };





Re: [PATCH v2 1/4] vfio/ap: Use g_autofree variable in vfio_ap_register_irq_notifier()

2024-05-16 Thread Cédric Le Goater



Applied series to vfio-next.

Thanks,

C.

On 4/25/24 11:02, Cédric Le Goater wrote:

Signed-off-by: Cédric Le Goater 
---
  hw/vfio/ap.c | 10 +++---
  1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 
7c4caa5938636937680fec87e999249ac84a4498..03f8ffaa5e2bf13cf8daa2f44aa4cf17809abd94
 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -77,7 +77,7 @@ static void vfio_ap_register_irq_notifier(VFIOAPDevice 
*vapdev,
  size_t argsz;
  IOHandler *fd_read;
  EventNotifier *notifier;
-struct vfio_irq_info *irq_info;
+g_autofree struct vfio_irq_info *irq_info = NULL;
  VFIODevice *vdev = >vdev;
  
  switch (irq) {

@@ -104,14 +104,14 @@ static void vfio_ap_register_irq_notifier(VFIOAPDevice 
*vapdev,
  if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO,
irq_info) < 0 || irq_info->count < 1) {
  error_setg_errno(errp, errno, "vfio: Error getting irq info");
-goto out_free_info;
+return;
  }
  
  if (event_notifier_init(notifier, 0)) {

  error_setg_errno(errp, errno,
   "vfio: Unable to init event notifier for irq (%d)",
   irq);
-goto out_free_info;
+return;
  }
  
  fd = event_notifier_get_fd(notifier);

@@ -122,10 +122,6 @@ static void vfio_ap_register_irq_notifier(VFIOAPDevice 
*vapdev,
  qemu_set_fd_handler(fd, NULL, NULL, vapdev);
  event_notifier_cleanup(notifier);
  }
-
-out_free_info:
-g_free(irq_info);
-
  }
  
  static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,





Re: [PATCH 1/1] target/ppc: Move VMX integer add/sub saturate insns to decodetree.

2024-05-16 Thread Chinmay Rath

Hi Richard,

On 5/12/24 17:08, Richard Henderson wrote:

On 5/12/24 11:38, Chinmay Rath wrote:
@@ -2934,6 +2870,184 @@ static bool do_vx_vaddsubcuw(DisasContext 
*ctx, arg_VX *a, int add)

  return true;
  }
  +static inline void do_vadd_vsub_sat
+(
+    unsigned vece, TCGv_vec t, TCGv_vec sat, TCGv_vec a, TCGv_vec b,
+    void (*norm_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec),
+    void (*sat_op)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
+{
+    TCGv_vec x = tcg_temp_new_vec_matching(t);
+    norm_op(vece, x, a, b);
+    sat_op(vece, t, a, b);
+    tcg_gen_cmp_vec(TCG_COND_NE, vece, x, x, t);
+    tcg_gen_or_vec(vece, sat, sat, x);
+}


As a separate change, before or after, the cmp_vec may be simplified 
to xor_vec.  Which means that INDEX_op_cmp_vec need not be probed in 
the vecop_lists.  See


https://lore.kernel.org/qemu-devel/20240506010403.6204-31-richard.hender...@linaro.org/ 



which is performing the same operation on AArch64.


Noted ! Will do.



+static bool do_vx_vadd_vsub_sat(DisasContext *ctx, arg_VX *a,
+    int sign, int vece, int add)
+{
+    static const TCGOpcode vecop_list_sub_u[] = {
+    INDEX_op_sub_vec, INDEX_op_ussub_vec, INDEX_op_cmp_vec, 0
+    };
+    static const TCGOpcode vecop_list_sub_s[] = {
+    INDEX_op_sub_vec, INDEX_op_sssub_vec, INDEX_op_cmp_vec, 0
+    };
+    static const TCGOpcode vecop_list_add_u[] = {
+    INDEX_op_add_vec, INDEX_op_usadd_vec, INDEX_op_cmp_vec, 0
+    };
+    static const TCGOpcode vecop_list_add_s[] = {
+    INDEX_op_add_vec, INDEX_op_ssadd_vec, INDEX_op_cmp_vec, 0
+    };
+
+    static const GVecGen4 op[2][3][2] = {
+    {
+    {
+    {
+    .fniv = gen_vsub_sat_u,
+    .fno = gen_helper_VSUBUBS,
+    .opt_opc = vecop_list_sub_u,
+    .write_aofs = true,
+    .vece = MO_8
+    },

.
.
.

+    {
+    .fniv = gen_vadd_sat_s,
+    .fno = gen_helper_VADDSWS,
+    .opt_opc = vecop_list_add_s,
+    .write_aofs = true,
+    .vece = MO_32
+    },
+    },
+    },
+    };


While this table is not wrong, I think it is clearer to have separate 
tables, one per operation, which are then passed in to a common expander.



+
+    REQUIRE_INSNS_FLAGS(ctx, ALTIVEC);
+    REQUIRE_VECTOR(ctx);
+
+    tcg_gen_gvec_4(avr_full_offset(a->vrt), offsetof(CPUPPCState, 
vscr_sat),
+   avr_full_offset(a->vra), avr_full_offset(a->vrb), 
16, 16,

+   [sign][vece][add]);
+
+    return true;
+}
+
+TRANS(VSUBUBS, do_vx_vadd_vsub_sat, 0, MO_8, 0)


I think it is clearer to use TRANS_FLAGS than to sink the ISA check 
into the helper.  In general I seem to find the helper later gets 
reused for something else with a different ISA check.


Thus

static const TCGOpcode vecop_list_vsub_sat_u[] = {
    INDEX_op_sub_vec, INDEX_op_ussub_vec, 0
};
static const GVecGen4 op_vsububs = {
    .fno = gen_helper_VSUBUBS,
    .fniv = gen_vsub_sat_u,
    .opt_opc = vecop_list_vsub_sat_u,
    .write_aofs = true,
    .vece = MO_8
};
TRANS_FLAGS(VSUBUBS, do_vx_vadd_vsub_sat, _vsububs)

static const GVecGen4 op_vsubuhs = {
    .fno = gen_helper_VSUBUHS,
    .fniv = gen_vsub_sat_u,
    .opt_opc = vecop_list_vsub_sat_u,
    .write_aofs = true,
    .vece = MO_16
};
TRANS_FLAGS(VSUBUHS, do_vx_vadd_vsub_sat, _vsubuhs)

etc.


Will add those changes in v2.

-GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE),


You are correct in your cover letter that this is not right.
We should have been testing ISA300 for vmul10uq here.


Thank you very much for the clarification !

+GEN_VXFORM(vmul10euq, 0, 9),


And thus need GEN_VXFORM_300 here.


+GEN_VXFORM(vmul10euq, 0, 9),
+GEN_VXFORM(bcdcpsgn, 0, 13),
+GEN_VXFORM(bcdadd, 0, 24),
+GEN_VXFORM(bcdsub, 0, 25),

...

+GEN_VXFORM(xpnd04_2, 0, 30),


None of these are in the base ISA, so all need a flag check.



r~


Thanks & Regards,
Chinmay



Re: [PATCH v7 0/9] vfio: Improve error reporting (part 2)

2024-05-16 Thread Cédric Le Goater

On 5/16/24 14:46, Cédric Le Goater wrote:

Hello,

The motivation behind these changes is to improve error reporting to
the upper management layer (libvirt) with a more detailed error, this
to let it decide, depending on the reported error, whether to try
migration again later. It would be useful in cases where migration
fails due to lack of HW resources on the host. For instance, some
adapters can only initiate a limited number of simultaneous dirty
tracking requests and this imposes a limit on the the number of VMs
that can be migrated simultaneously.

We are not quite ready for such a mechanism but what we can do first is
to cleanup the error reporting in the early save_setup sequence. This
is what the following changes propose, by adding an Error** argument to
various handlers and propagating it to the core migration subsystem.

The first part [1] of this series modifying the core migration
subsystem is now merged. This is the second part changing VFIO which
was already proposed in March. See [2].

Thanks,

C.

[1] [PATCH for-9.1 v5 00/14] migration: Improve error reporting
 https://lore.kernel.org/qemu-devel/20240320064911.545001-1-...@redhat.com/

[2] [PATCH v4 00/25] migration: Improve error reporting
 https://lore.kernel.org/qemu-devel/20240306133441.2351700-1-...@redhat.com/

Changes in v7:

  - Commit log improvements (Eric)
  - vfio_set_migration_error() : err -> ret rename (Eric)
  - vfio_migration_set_state() :
Introduced an error prefix to remove redundancy in error messages (Eric)
Commented error_report when setting the device in recover state fails (Eric)
  - vfio_migration_state_notifier() :
Remove useless assignment of local ret variable (Avihai)
Rephrased comment regarding MigrationNotifyFunc API (Avihai)
  - Fixed even more line wrapping of *dirty_bitmap() routines (Avihai)
  - vfio_sync_dirty_bitmap()
Fixed return when vfio_sync_ram_discard_listener_dirty_bitmap() is called 
(Avihai)


I fixed this last issue as commented in patch 8. Let's address other
issues, if minor, with followup patches.

Applied to vfio-next.

Thanks,

C.




Changes in v6:

  - Commit log improvements (Avihai)
  - Modified some titles (Avihai)
  - vfio_migration_set_state() : Dropped the error_setg_errno()
change when setting device in recover state fails  (Avihai)
  - vfio_migration_state_notifier() : report local error (Avihai)
  - vfio_save_device_config_state() : Set errp if the migration
stream is in error (Avihai)
  - vfio_save_state() : Changed error prefix  (Avihai)
  - vfio_iommu_map_dirty_notify() : Modified goto label  (Avihai)
  - Fixed memory_get_xlat_addr documentation (Avihai)
  - Fixed line wrapping (Avihai)
  - Fixed query_dirty_bitmap documentation (Avihai)
  - Dropped last patch from v5 :
vfio: Extend vfio_set_migration_error() with Error* argument

Changes in v5:

  - Rebased on 20c64c8a51a4 ("migration: migration_file_set_error")
  - Fixed typo in set_dirty_page_tracking documentation
  - Used error_setg_errno() in vfio_devices_dma_logging_start()
  - Replaced error_setg() by error_setg_errno() in vfio_migration_set_state()
  - Replaced error_setg() by error_setg_errno() in
vfio_devices_query_dirty_bitmap() and vfio_legacy_query_dirty_bitmap()
  - ':' -> '-' in vfio_iommu_map_dirty_notify()

Cédric Le Goater (9):
   vfio: Add Error** argument to .set_dirty_page_tracking() handler
   vfio: Add Error** argument to vfio_devices_dma_logging_start()
   migration: Extend migration_file_set_error() with Error* argument
   vfio/migration: Add an Error** argument to vfio_migration_set_state()
   vfio/migration: Add Error** argument to .vfio_save_config() handler
   vfio: Reverse test on vfio_get_xlat_addr()
   memory: Add Error** argument to memory_get_xlat_addr()
   vfio: Add Error** argument to .get_dirty_bitmap() handler
   vfio: Also trace event failures in vfio_save_complete_precopy()

  include/exec/memory.h |  15 +++-
  include/hw/vfio/vfio-common.h |  30 ++-
  include/hw/vfio/vfio-container-base.h |  37 +++--
  include/migration/misc.h  |   2 +-
  hw/vfio/common.c  | 113 --
  hw/vfio/container-base.c  |  10 +--
  hw/vfio/container.c   |  20 +++--
  hw/vfio/migration.c   | 109 -
  hw/vfio/pci.c |   5 +-
  hw/virtio/vhost-vdpa.c|   5 +-
  migration/migration.c |   6 +-
  system/memory.c   |  10 +--
  12 files changed, 246 insertions(+), 116 deletions(-)






[PATCH v2 2/3] docs: define policy limiting the inclusion of generated files

2024-05-16 Thread Daniel P . Berrangé
Files contributed to QEMU are generally expected to be provided in the
preferred format for manipulation. IOW, we generally don't expect to
have generated / compiled code included in the tree, rather, we expect
to run the code generator / compiler as part of the build process.

There are some obvious exceptions to this seen in our existing tree, the
biggest one being the inclusion of many binary firmware ROMs. A more
niche example is the inclusion of a generated eBPF program. Or the CI
dockerfiles which are mostly auto-generated. In these cases, however,
the preferred format source code is still required to be included,
alongside the generated output.

Tools which perform user defined algorithmic transformations on code are
not considered to be "code generators". ie, we permit use of coccinelle,
spell checkers, and sed/awk/etc to manipulate code. Such use of automated
manipulation should still be declared in the commit message.

One off generators which create a boilerplate file which the author then
fills in, are acceptable if their output has clear copyright and license
status. This could be where a contributor writes a throwaway python
script to automate creation of some mundane piece of code for example.

Signed-off-by: Daniel P. Berrangé 
---
 docs/devel/code-provenance.rst | 55 ++
 1 file changed, 55 insertions(+)

diff --git a/docs/devel/code-provenance.rst b/docs/devel/code-provenance.rst
index 7c42fae571..eabb3e7c08 100644
--- a/docs/devel/code-provenance.rst
+++ b/docs/devel/code-provenance.rst
@@ -210,3 +210,58 @@ mailing list.
 It is also recommended to attempt to contact the original author to let them
 know you are interested in taking over their work, in case they still intended
 to return to the work, or had any suggestions about the best way to continue.
+
+Inclusion of generated files
+
+
+Files in patches contributed to QEMU are generally expected to be provided
+only in the preferred format for making modifications. The implication of
+this is that the output of code generators or compilers is usually not
+appropriate to contribute to QEMU.
+
+For reasons of practicality there are some exceptions to this rule, where
+generated code is permitted, provided it is also accompanied by the
+corresponding preferred source format. This is done where it is impractical
+to expect those building QEMU to run the code generation or compilation
+process. A non-exhaustive list of examples is:
+
+ * Images: where a bitmap image is created from a vector file it is common
+   to include the rendered bitmaps at desired resolution(s), since subtle
+   changes in the rasterization process / tools may affect quality. The
+   original vector file is expected to accompany any generated bitmaps.
+
+ * Firmware: QEMU includes pre-compiled binary ROMs for a variety of guest
+   firmwares. When such binary ROMs are contributed, the corresponding source
+   must also be provided, either directly, or through a git submodule link.
+
+ * Dockerfiles: the majority of the dockerfiles are automatically generated
+   from a canonical list of build dependencies maintained in tree, together
+   with the libvirt-ci git submodule link. The generated dockerfiles are
+   included in tree because it is desirable to be able to directly build
+   container images from a clean git checkout.
+
+ * EBPF: QEMU includes some generated EBPF machine code, since the required
+   eBPF compilation tools are not broadly available on all targeted OS
+   distributions. The corresponding eBPF C code for the binary is also
+   provided. This is a time limited exception until the eBPF toolchain is
+   sufficiently broadly available in distros.
+
+In all cases above, the existence of generated files must be acknowledged
+and justified in the commit that introduces them.
+
+Tools which perform changes to existing code with deterministic algorithmic
+manipulation, driven by user specified inputs, are not generally considered
+to be "generators".
+
+IOW, using coccinelle to convert code from one pattern to another pattern, or
+fixing docs typos with a spell checker, or transforming code using sed / awk /
+etc, are not considered to be acts of code generation. Where an automated
+manipulation is performed on code, however, this should be declared in the
+commit message.
+
+At times contributors may use or create scripts/tools to generate an initial
+boilerplate code template which is then filled in to produce the final patch.
+The output of such a tool would still be considered the "preferred format",
+since it is intended to be a foundation for further human authored changes.
+Such tools are acceptable to use, provided they follow a deterministic process
+and there is clearly defined copyright and licensing for their output.
-- 
2.43.0




[PATCH v2 3/3] docs: define policy forbidding use of AI code generators

2024-05-16 Thread Daniel P . Berrangé
There has been an explosion of interest in so-called AI code generators
in the past year or two. Thus far though, this has not been matched
by a broadly accepted legal interpretation of the licensing implications
for code generator outputs. While the vendors may claim there is no
problem and a free choice of license is possible, they have an inherent
conflict of interest in promoting this interpretation. More broadly
there is, as yet, no broad consensus on the licensing implications of
code generators trained on inputs under a wide variety of licenses.

The DCO requires contributors to assert they have the right to
contribute under the designated project license. Given the lack of
consensus on the licensing of AI code generator output, it is not
considered credible to assert compliance with the DCO clause (b) or (c)
where a patch includes such generated code.

This patch thus defines a policy that the QEMU project will currently
not accept contributions where use of AI code generators is either
known, or suspected.

This merely reflects the current uncertainty of the field, and should
this situation change, the policy is of course subject to future
relaxation. Meanwhile requests for exceptions can also be considered on
a case by case basis.

Signed-off-by: Daniel P. Berrangé 
---
 docs/devel/code-provenance.rst | 50 +-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/docs/devel/code-provenance.rst b/docs/devel/code-provenance.rst
index eabb3e7c08..846dda9a35 100644
--- a/docs/devel/code-provenance.rst
+++ b/docs/devel/code-provenance.rst
@@ -264,4 +264,52 @@ boilerplate code template which is then filled in to 
produce the final patch.
 The output of such a tool would still be considered the "preferred format",
 since it is intended to be a foundation for further human authored changes.
 Such tools are acceptable to use, provided they follow a deterministic process
-and there is clearly defined copyright and licensing for their output.
+and there is clearly defined copyright and licensing for their output. Note
+in particular the caveats applying to AI code generators below.
+
+Use of AI code generators
+~
+
+TL;DR:
+
+  **Current QEMU project policy is to DECLINE any contributions which are
+  believed to include or derive from AI generated code. This includes ChatGPT,
+  CoPilot, Llama and similar tools**
+
+The increasing prevalence of AI code generators, most notably but not limited
+to, `Large Language Models 
`__
+(LLMs) results in a number of difficult legal questions and risks for software
+projects, including QEMU.
+
+The QEMU community requires that contributors certify their patch submissions
+are made in accordance with the rules of the :ref:`dco` (DCO).
+
+To satisfy the DCO, the patch contributor has to fully understand the
+copyright and license status of code they are contributing to QEMU. With AI
+code generators, the copyright and license status of the output is ill-defined
+with no generally accepted, settled legal foundation.
+
+Where the training material is known, it is common for it to include large
+volumes of material under restrictive licensing/copyright terms. Even where
+the training material is all known to be under open source licenses, it is
+likely to be under a variety of terms, not all of which will be compatible
+with QEMU's licensing requirements.
+
+With this in mind, the QEMU project does not consider it is currently possible
+for contributors to comply with DCO terms (b) or (c) for the output of commonly
+available AI code generators.
+
+The QEMU maintainers thus require that contributors refrain from using AI code
+generators on patches intended to be submitted to the project, and will
+decline any contribution if use of AI is either known or suspected.
+
+Examples of tools impacted by this policy includes both GitHub's CoPilot,
+OpenAI's ChatGPT, and Meta's Code Llama, amongst many others which are less
+well known.
+
+This policy may evolve as the legal situation is clarified. In the meanwhile,
+requests for exceptions to this policy will be evaluated by the QEMU project
+on a case by case basis. To be granted an exception, a contributor will need
+to demonstrate clarity of the license and copyright status for the tool's
+output in relation to its training model and code, to the satisfaction of the
+project maintainers.
-- 
2.43.0




[PATCH v2 0/3] docs: define policy forbidding use of "AI" / LLM code generators

2024-05-16 Thread Daniel P . Berrangé
This patch kicks the hornet's nest of AI / LLM code generators.

With the increasing interest in code generators in recent times,
it is inevitable that QEMU contributions will include AI generated
code. Thus far we have remained silent on the matter. Given that
everyone knows these tools exist, our current position has to be
considered tacit acceptance of the use of AI generated code in QEMU.

The question for the project is whether that is a good position for
QEMU to take or not ?

IANAL, but I like to think I'm reasonably proficient at understanding
open source licensing. I am not inherently against the use of AI tools,
rather I am anti-risk. I also want to see OSS licenses respected and
complied with.

AFAICT at its current state of (im)maturity the question of licensing
of AI code generator output does not have a broadly accepted / settled
legal position. This is an inherent bias/self-interest from the vendors
promoting their usage, who tend to minimize/dismiss the legal questions.
>From my POV, this puts such tools in a position of elevated legal risk.

Given the fuzziness over the legal position of generated code from
such tools, I don't consider it credible (today) for a contributor
to assert compliance with the DCO terms (b) or (c) (which is a stated
pre-requisite for QEMU accepting patches) when a patch includes (or is
derived from) AI generated code.

By implication, I think that QEMU must (for now) explicitly decline
to (knowingly) accept AI generated code.

Perhaps a few years down the line the legal uncertainty will have
reduced and we can re-evaluate this policy.

Discuss...

Changes in v2:

 * Fix a huge number of typos in docs
 * Clarify that maintainers should still add R-b where relevant, even
   if they are already adding their own S-oB.
 * Clarify situation when contributor re-starts previously abandoned
   work from another contributor.
 * Add info about Suggested-by tag
 * Add new docs section dealing with the broad topic of "generated
   files" (whether code generators or compilers)
 * Simplify the section related to prohibition of AI generated files
   and give further examples of tools considered covered
 * Remove repeated references to "LLM" as a specific technology, just
   use the broad "AI" term, except for one use of LLM as an example.
 * Add note that the policy may evolve if the legal clarity improves
 * Add note that exceptions can be requested on case-by-case basis
   if contributor thinks they can demonstrate a credible copyright
   and licensing status

Daniel P. Berrangé (3):
  docs: introduce dedicated page about code provenance / sign-off
  docs: define policy limiting the inclusion of generated files
  docs: define policy forbidding use of AI code generators

 docs/devel/code-provenance.rst| 315 ++
 docs/devel/index-process.rst  |   1 +
 docs/devel/submitting-a-patch.rst |  19 +-
 3 files changed, 318 insertions(+), 17 deletions(-)
 create mode 100644 docs/devel/code-provenance.rst

-- 
2.43.0




[PATCH v2 1/3] docs: introduce dedicated page about code provenance / sign-off

2024-05-16 Thread Daniel P . Berrangé
Currently we have a short paragraph saying that patches must include
a Signed-off-by line, and merely link to the kernel documentation.
The linked kernel docs have a lot of content beyond the part about
sign-off and thus are misleading/distracting to QEMU contributors.

This introduces a dedicated 'code-provenance' page in QEMU talking
about why we require sign-off, explaining the other tags we commonly
use, and what to do in some edge cases.

Signed-off-by: Daniel P. Berrangé 
---
 docs/devel/code-provenance.rst| 212 ++
 docs/devel/index-process.rst  |   1 +
 docs/devel/submitting-a-patch.rst |  19 +--
 3 files changed, 215 insertions(+), 17 deletions(-)
 create mode 100644 docs/devel/code-provenance.rst

diff --git a/docs/devel/code-provenance.rst b/docs/devel/code-provenance.rst
new file mode 100644
index 00..7c42fae571
--- /dev/null
+++ b/docs/devel/code-provenance.rst
@@ -0,0 +1,212 @@
+.. _code-provenance:
+
+Code provenance
+===
+
+Certifying patch submissions
+
+
+The QEMU community **mandates** all contributors to certify provenance of
+patch submissions they make to the project. To put it another way,
+contributors must indicate that they are legally permitted to contribute to
+the project.
+
+Certification is achieved with a low overhead by adding a single line to the
+bottom of every git commit::
+
+   Signed-off-by: YOUR NAME 
+
+The addition of this line asserts that the author of the patch is contributing
+in accordance with the clauses specified in the
+`Developer's Certificate of Origin `__:
+
+.. _dco:
+
+::
+  Developer's Certificate of Origin 1.1
+
+  By making a contribution to this project, I certify that:
+
+  (a) The contribution was created in whole or in part by me and I
+  have the right to submit it under the open source license
+  indicated in the file; or
+
+  (b) The contribution is based upon previous work that, to the best
+  of my knowledge, is covered under an appropriate open source
+  license and I have the right under that license to submit that
+  work with modifications, whether created in whole or in part
+  by me, under the same open source license (unless I am
+  permitted to submit under a different license), as indicated
+  in the file; or
+
+  (c) The contribution was provided directly to me by some other
+  person who certified (a), (b) or (c) and I have not modified
+  it.
+
+  (d) I understand and agree that this project and the contribution
+  are public and that a record of the contribution (including all
+  personal information I submit with it, including my sign-off) is
+  maintained indefinitely and may be redistributed consistent with
+  this project or the open source license(s) involved.
+
+It is generally expected that the name and email addresses used in one of the
+``Signed-off-by`` lines, matches that of the git commit ``Author`` field.
+
+If the person sending the mail is not one of the patch authors, they are none
+the less expected to add their own ``Signed-off-by`` to comply with the DCO
+clause (c).
+
+Multiple authorship
+~~~
+
+It is not uncommon for a patch to have contributions from multiple authors. In
+this scenario, git commits will usually be expected to have a ``Signed-off-by``
+line for each contributor involved in creation of the patch. Some edge cases:
+
+  * The non-primary author's contributions were so trivial that they can be
+considered not subject to copyright. In this case the secondary authors
+need not include a ``Signed-off-by``.
+
+This case most commonly applies where QEMU reviewers give short snippets
+of code as suggested fixes to a patch. The reviewers don't need to have
+their own ``Signed-off-by`` added unless their code suggestion was
+unusually large, but it is common to add ``Suggested-by`` as a credit
+for non-trivial code.
+
+  * Both contributors work for the same employer and the employer requires
+copyright assignment.
+
+It can be said that in this case a ``Signed-off-by`` is indicating that
+the person has permission to contribute from their employer who is the
+copyright holder. It is none the less still preferable to include a
+``Signed-off-by`` for each contributor, as in some countries employees are
+not able to assign copyright to their employer, and it also covers any
+time invested outside working hours.
+
+When multiple ``Signed-off-by`` tags are present, they should be strictly kept
+in order of authorship, from oldest to newest.
+
+Other commit tags
+~
+
+While the ``Signed-off-by`` tag is mandatory, there are a number of other tags
+that are commonly used during QEMU development:
+
+ * **``Reviewed-by``**: when a QEMU community member reviews a patch on the
+   mailing list, if they consider the patch acceptable, they should send an
+  

Re: target/ppc: Move VMX int add/sub saturate insns to decodetree.

2024-05-16 Thread Chinmay Rath

Hi Richard,

On 5/12/24 15:59, Richard Henderson wrote:

On 5/12/24 11:38, Chinmay Rath wrote:

1. vsubsbs and bcdtrunc :

In this pair, bcdtrunc has the insn flag check PPC2_ISA300 in the
vmx-impl file, within the GEN_VXFORM_DUAL macro, which does this flag
check.
However it also has this flag check in the vmx-ops file.
Hence I have retained the same in the new entry in the vmx-ops file.
This is consistent with the behaviour in done in the following commit :
https://github.com/qemu/qemu/commit/b132be53a4ba6a0a40d5643d791822f958a36e53 


So even though the flag check is removed from the vmx-impl file, it is
retained in the vmx-ops file. All good here.

2. vadduhs and vmul10euq :

In this pair, vmul10euq has the insn flag check PPC2_ISA300 in the
vmx-impl file, check done within the GEN_VXFORM_DUAL macro.
However the same flag was NOT originally present in the
vmx-ops file, so I have NOT included in its new entry in the vmx-ops
file. I have done this, following the behaviour done in the following
commit :
https://github.com/qemu/qemu/commit/c85929b2ddf6bbad737635c9b85213007ec043af 

So this flag check for vmul10euq is excluded now. Is this not a 
problem ?

I feel that this leads to the flag check being skipped now, however this
behaviour was followed in the above mentioned commit.


This second link is for VAVG* and VABSD*.

Yes you are correct that this second case was done incorrectly. 
Thankfully the mistake was fixed in the very next commit, when VABSD* 
was converted to decodetree as well.



Thank you very much for the clarification !


r~

Regards,
Chinmay



Re: [RFC PATCH v3 07/18] hw/arm/smmuv3: Translate CD and TT using stage-2 table

2024-05-16 Thread Mostafa Saleh
Hi Eric,

On Wed, May 15, 2024 at 03:15:02PM +0200, Eric Auger wrote:
> Hi Mostafa,
> 
> On 4/29/24 05:23, Mostafa Saleh wrote:
> > According to ARM SMMU architecture specification (ARM IHI 0070 F.b),
> > In "5.2 Stream Table Entry":
> >  [51:6] S1ContextPtr
> >  If Config[1] == 1 (stage 2 enabled), this pointer is an IPA translated by
> >  stage 2 and the programmed value must be within the range of the IAS.
> >
> > In "5.4.1 CD notes":
> >  The translation table walks performed from TTB0 or TTB1 are always 
> > performed
> >  in IPA space if stage 2 translations are enabled.
> >
> > This patch implements translation of the S1 context descriptor pointer and
> > TTBx base addresses through the S2 stage (IPA -> PA)
> >
> > smmuv3_do_translate() is updated to have one arg which is translation
> > class, this is useful for:
> s/for/to?
Will do.
> >  - Decide wether a translation is stage-2 only or use the STE config.
> >  - Populate the class in case of faults, WALK_EABT is lefat as it as
> left unchanged?
Yup, that's a typo.
> >it is always triggered from TT access so no need to use the input
> >class.
> >
> > In case for stage-2 only translation, which only used in nesting, the
> in case of S2 translation used in the contexted of a nested translation, ...
Will do.
> > stage and asid are saved and restored before and after calling
> > smmu_translate().
> >
> > Translating CD or TTBx can fail for the following reasons:
> > 1) Large address size: This is described in
> >(3.4.3 Address sizes of SMMU-originated accesses)
> >- For CD ptr larger than IAS, for SMMUv3.1, it can trigger either
> >  C_BAD_STE or Translation fault, we implement the latter as it
> >  requires no extra code.
> >- For TTBx, if larger than the effective stage 1 output address size, it
> >  triggers C_BAD_CD.
> >
> > 2) Faults from PTWs (7.3 Event records)
> >- F_ADDR_SIZE: large address size after first level causes stage 2 
> > Address
> >  Size fault (Also in 3.4.3 Address sizes of SMMU-originated accesses)
> >- F_PERMISSION: Same as an address translation. However, when
> >  CLASS == CD, the access is implicitly Data and a read.
> >- F_ACCESS: Same as an address translation.
> >- F_TRANSLATION: Same as an address translation.
> >- F_WALK_EABT: Same as an address translation.
> >   These are already implemented in the PTW logic, so no extra handling
> >   required.
> >
> > As, there is multiple locations where the address is calculated from
> > cached entry, a new macro is introduced CACHED_ENTRY_TO_ADDR.
> >
> > Signed-off-by: Mostafa Saleh 
> > ---
> >  hw/arm/smmuv3.c  | 76 ++--
> >  include/hw/arm/smmu-common.h |  3 ++
> >  2 files changed, 66 insertions(+), 13 deletions(-)
> >
> > diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
> > index cc61708160..cc61c82321 100644
> > --- a/hw/arm/smmuv3.c
> > +++ b/hw/arm/smmuv3.c
> > @@ -337,14 +337,33 @@ static int smmu_get_ste(SMMUv3State *s, dma_addr_t 
> > addr, STE *buf,
> >  
> >  }
> >  
> > +static SMMUTranslationStatus smmuv3_do_translate(SMMUv3State *s, hwaddr 
> > addr,
> > + SMMUTransCfg *cfg,
> > + SMMUEventInfo *event,
> > + IOMMUAccessFlags flag,
> > + SMMUTLBEntry **out_entry,
> > + SMMUTranslationClass 
> > class);
> >  /* @ssid > 0 not supported yet */
> > -static int smmu_get_cd(SMMUv3State *s, STE *ste, uint32_t ssid,
> > -   CD *buf, SMMUEventInfo *event)
> > +static int smmu_get_cd(SMMUv3State *s, STE *ste, SMMUTransCfg *cfg,
> > +   uint32_t ssid, CD *buf, SMMUEventInfo *event)
> >  {
> >  dma_addr_t addr = STE_CTXPTR(ste);
> >  int ret, i;
> > +SMMUTranslationStatus status;
> > +SMMUTLBEntry *entry;
> >  
> >  trace_smmuv3_get_cd(addr);
> > +
> > +if (cfg->stage == SMMU_NESTED) {
> > +status = smmuv3_do_translate(s, addr, cfg, event,
> > + IOMMU_RO, , SMMU_CLASS_CD);
> > +if (status != SMMU_TRANS_SUCCESS) {
> So I guess you rely on event being populated by above CD S2 translate().
> it does not need to be patched, correct?
> May be worth a comment.
Yes, only the class is different, I will add a comment.
> > +return -EINVAL;
> > +}
> > +
> > +addr = CACHED_ENTRY_TO_ADDR(entry, addr);
> > +}
> > +
> >  /* TODO: guarantee 64-bit single-copy atomicity */
> >  ret = dma_memory_read(_space_memory, addr, buf, sizeof(*buf),
> >MEMTXATTRS_UNSPECIFIED);
> > @@ -659,10 +678,13 @@ static int smmu_find_ste(SMMUv3State *s, uint32_t 
> > sid, STE *ste,
> >  return 0;
> >  }
> >  
> > -static int decode_cd(SMMUTransCfg *cfg, CD *cd, SMMUEventInfo *event)
> > 

[PATCH v6 5/8] softmmu: Replace check for RAMBlock offset 0 with xen_mr_is_memory

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

For xen, when checking for the first RAM (xen_memory), use
xen_mr_is_memory() rather than checking for a RAMBlock with
offset 0.

All Xen machines create xen_memory first so this has no
functional change for existing machines.

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
---
 system/physmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/system/physmem.c b/system/physmem.c
index 5e6257ef65..b7847db1a2 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2229,7 +2229,7 @@ static void *qemu_ram_ptr_length(RAMBlock *block, 
ram_addr_t addr,
  * because we don't want to map the entire memory in QEMU.
  * In that case just map the requested area.
  */
-if (block->offset == 0) {
+if (xen_mr_is_memory(block->mr)) {
 return xen_map_cache(block->mr, block->offset + addr,
  len, lock, lock,
  is_write);
-- 
2.40.1




[PATCH v6 7/8] xen: mapcache: Add support for grant mappings

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Add a second mapcache for grant mappings. The mapcache for
grants needs to work with XC_PAGE_SIZE granularity since
we can't map larger ranges than what has been granted to us.

Like with foreign mappings (xen_memory), machines using grants
are expected to initialize the xen_grants MR and map it
into their address-map accordingly.

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
---
 hw/xen/xen-hvm-common.c |  12 ++-
 hw/xen/xen-mapcache.c   | 163 ++--
 include/hw/xen/xen-hvm-common.h |   3 +
 include/sysemu/xen.h|   7 ++
 4 files changed, 152 insertions(+), 33 deletions(-)

diff --git a/hw/xen/xen-hvm-common.c b/hw/xen/xen-hvm-common.c
index a0a0252da0..b8ace1c368 100644
--- a/hw/xen/xen-hvm-common.c
+++ b/hw/xen/xen-hvm-common.c
@@ -10,12 +10,18 @@
 #include "hw/boards.h"
 #include "hw/xen/arch_hvm.h"
 
-MemoryRegion xen_memory;
+MemoryRegion xen_memory, xen_grants;
 
-/* Check for xen memory.  */
+/* Check for any kind of xen memory, foreign mappings or grants.  */
 bool xen_mr_is_memory(MemoryRegion *mr)
 {
-return mr == _memory;
+return mr == _memory || mr == _grants;
+}
+
+/* Check specifically for grants.  */
+bool xen_mr_is_grants(MemoryRegion *mr)
+{
+return mr == _grants;
 }
 
 void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, MemoryRegion *mr,
diff --git a/hw/xen/xen-mapcache.c b/hw/xen/xen-mapcache.c
index a07c47b0b1..1cbc2aeaa9 100644
--- a/hw/xen/xen-mapcache.c
+++ b/hw/xen/xen-mapcache.c
@@ -14,6 +14,7 @@
 
 #include 
 
+#include "hw/xen/xen-hvm-common.h"
 #include "hw/xen/xen_native.h"
 #include "qemu/bitmap.h"
 
@@ -21,6 +22,8 @@
 #include "sysemu/xen-mapcache.h"
 #include "trace.h"
 
+#include 
+#include 
 
 #if HOST_LONG_BITS == 32
 #  define MCACHE_MAX_SIZE (1UL<<31) /* 2GB Cap */
@@ -41,6 +44,7 @@ typedef struct MapCacheEntry {
 unsigned long *valid_mapping;
 uint32_t lock;
 #define XEN_MAPCACHE_ENTRY_DUMMY (1 << 0)
+#define XEN_MAPCACHE_ENTRY_GRANT (1 << 1)
 uint8_t flags;
 hwaddr size;
 struct MapCacheEntry *next;
@@ -71,6 +75,8 @@ typedef struct MapCache {
 } MapCache;
 
 static MapCache *mapcache;
+static MapCache *mapcache_grants;
+static xengnttab_handle *xen_region_gnttabdev;
 
 static inline void mapcache_lock(MapCache *mc)
 {
@@ -131,6 +137,12 @@ void xen_map_cache_init(phys_offset_to_gaddr_t f, void 
*opaque)
 unsigned long max_mcache_size;
 unsigned int bucket_shift;
 
+xen_region_gnttabdev = xengnttab_open(NULL, 0);
+if (xen_region_gnttabdev == NULL) {
+error_report("mapcache: Failed to open gnttab device");
+exit(EXIT_FAILURE);
+}
+
 if (HOST_LONG_BITS == 32) {
 bucket_shift = 16;
 } else {
@@ -159,6 +171,15 @@ void xen_map_cache_init(phys_offset_to_gaddr_t f, void 
*opaque)
 mapcache = xen_map_cache_init_single(f, opaque,
  bucket_shift,
  max_mcache_size);
+
+/*
+ * Grant mappings must use XC_PAGE_SIZE granularity since we can't
+ * map anything beyond the number of pages granted to us.
+ */
+mapcache_grants = xen_map_cache_init_single(f, opaque,
+XC_PAGE_SHIFT,
+max_mcache_size);
+
 setrlimit(RLIMIT_AS, _as);
 }
 
@@ -168,17 +189,24 @@ static void xen_remap_bucket(MapCache *mc,
  hwaddr size,
  hwaddr address_index,
  bool dummy,
+ bool grant,
+ bool is_write,
  ram_addr_t ram_offset)
 {
 uint8_t *vaddr_base;
-xen_pfn_t *pfns;
+uint32_t *refs = NULL;
+xen_pfn_t *pfns = NULL;
 int *err;
 unsigned int i;
 hwaddr nb_pfn = size >> XC_PAGE_SHIFT;
 
 trace_xen_remap_bucket(address_index);
 
-pfns = g_new0(xen_pfn_t, nb_pfn);
+if (grant) {
+refs = g_new0(uint32_t, nb_pfn);
+} else {
+pfns = g_new0(xen_pfn_t, nb_pfn);
+}
 err = g_new0(int, nb_pfn);
 
 if (entry->vaddr_base != NULL) {
@@ -207,21 +235,51 @@ static void xen_remap_bucket(MapCache *mc,
 g_free(entry->valid_mapping);
 entry->valid_mapping = NULL;
 
-for (i = 0; i < nb_pfn; i++) {
-pfns[i] = (address_index << (mc->bucket_shift - XC_PAGE_SHIFT)) + i;
+if (grant) {
+hwaddr grant_base = address_index - (ram_offset >> XC_PAGE_SHIFT);
+
+for (i = 0; i < nb_pfn; i++) {
+refs[i] = grant_base + i;
+}
+} else {
+for (i = 0; i < nb_pfn; i++) {
+pfns[i] = (address_index << (mc->bucket_shift - XC_PAGE_SHIFT)) + 
i;
+}
 }
 
-/*
- * If the caller has requested the mapping at a specific address use
- * MAP_FIXED to make sure it's honored.
- */
+entry->flags &= ~XEN_MAPCACHE_ENTRY_GRANT;
+

[PATCH v6 1/8] xen: mapcache: Make MCACHE_BUCKET_SHIFT runtime configurable

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Make MCACHE_BUCKET_SHIFT runtime configurable per cache instance.

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
---
 hw/xen/xen-mapcache.c | 54 ++-
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/hw/xen/xen-mapcache.c b/hw/xen/xen-mapcache.c
index fa6813b1ad..bc860f4373 100644
--- a/hw/xen/xen-mapcache.c
+++ b/hw/xen/xen-mapcache.c
@@ -23,13 +23,10 @@
 
 
 #if HOST_LONG_BITS == 32
-#  define MCACHE_BUCKET_SHIFT 16
 #  define MCACHE_MAX_SIZE (1UL<<31) /* 2GB Cap */
 #else
-#  define MCACHE_BUCKET_SHIFT 20
 #  define MCACHE_MAX_SIZE (1UL<<35) /* 32GB Cap */
 #endif
-#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
 
 /* This is the size of the virtual address space reserve to QEMU that will not
  * be use by MapCache.
@@ -65,7 +62,8 @@ typedef struct MapCache {
 /* For most cases (>99.9%), the page address is the same. */
 MapCacheEntry *last_entry;
 unsigned long max_mcache_size;
-unsigned int mcache_bucket_shift;
+unsigned int bucket_shift;
+unsigned long bucket_size;
 
 phys_offset_to_gaddr_t phys_offset_to_gaddr;
 QemuMutex lock;
@@ -95,11 +93,14 @@ static inline int test_bits(int nr, int size, const 
unsigned long *addr)
 
 static MapCache *xen_map_cache_init_single(phys_offset_to_gaddr_t f,
void *opaque,
+   unsigned int bucket_shift,
unsigned long max_size)
 {
 unsigned long size;
 MapCache *mc;
 
+assert(bucket_shift >= XC_PAGE_SHIFT);
+
 mc = g_new0(MapCache, 1);
 
 mc->phys_offset_to_gaddr = f;
@@ -108,12 +109,14 @@ static MapCache 
*xen_map_cache_init_single(phys_offset_to_gaddr_t f,
 
 QTAILQ_INIT(>locked_entries);
 
+mc->bucket_shift = bucket_shift;
+mc->bucket_size = 1UL << bucket_shift;
 mc->max_mcache_size = max_size;
 
 mc->nr_buckets =
 (((mc->max_mcache_size >> XC_PAGE_SHIFT) +
-  (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >>
- (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT));
+  (1UL << (bucket_shift - XC_PAGE_SHIFT)) - 1) >>
+ (bucket_shift - XC_PAGE_SHIFT));
 
 size = mc->nr_buckets * sizeof(MapCacheEntry);
 size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1);
@@ -126,6 +129,13 @@ void xen_map_cache_init(phys_offset_to_gaddr_t f, void 
*opaque)
 {
 struct rlimit rlimit_as;
 unsigned long max_mcache_size;
+unsigned int bucket_shift;
+
+if (HOST_LONG_BITS == 32) {
+bucket_shift = 16;
+} else {
+bucket_shift = 20;
+}
 
 if (geteuid() == 0) {
 rlimit_as.rlim_cur = RLIM_INFINITY;
@@ -146,7 +156,9 @@ void xen_map_cache_init(phys_offset_to_gaddr_t f, void 
*opaque)
 }
 }
 
-mapcache = xen_map_cache_init_single(f, opaque, max_mcache_size);
+mapcache = xen_map_cache_init_single(f, opaque,
+ bucket_shift,
+ max_mcache_size);
 setrlimit(RLIMIT_AS, _as);
 }
 
@@ -195,7 +207,7 @@ static void xen_remap_bucket(MapCache *mc,
 entry->valid_mapping = NULL;
 
 for (i = 0; i < nb_pfn; i++) {
-pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i;
+pfns[i] = (address_index << (mc->bucket_shift - XC_PAGE_SHIFT)) + i;
 }
 
 /*
@@ -266,8 +278,8 @@ static uint8_t *xen_map_cache_unlocked(MapCache *mc,
 bool dummy = false;
 
 tryagain:
-address_index  = phys_addr >> MCACHE_BUCKET_SHIFT;
-address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1);
+address_index  = phys_addr >> mc->bucket_shift;
+address_offset = phys_addr & (mc->bucket_size - 1);
 
 trace_xen_map_cache(phys_addr);
 
@@ -294,14 +306,14 @@ tryagain:
 return mc->last_entry->vaddr_base + address_offset;
 }
 
-/* size is always a multiple of MCACHE_BUCKET_SIZE */
+/* size is always a multiple of mc->bucket_size */
 if (size) {
 cache_size = size + address_offset;
-if (cache_size % MCACHE_BUCKET_SIZE) {
-cache_size += MCACHE_BUCKET_SIZE - (cache_size % 
MCACHE_BUCKET_SIZE);
+if (cache_size % mc->bucket_size) {
+cache_size += mc->bucket_size - (cache_size % mc->bucket_size);
 }
 } else {
-cache_size = MCACHE_BUCKET_SIZE;
+cache_size = mc->bucket_size;
 }
 
 entry = >entry[address_index % mc->nr_buckets];
@@ -422,7 +434,7 @@ static ram_addr_t 
xen_ram_addr_from_mapcache_single(MapCache *mc, void *ptr)
 trace_xen_ram_addr_from_mapcache_not_in_cache(ptr);
 raddr = RAM_ADDR_INVALID;
 } else {
-raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) +
+raddr = (reventry->paddr_index << mc->bucket_shift) +
  ((unsigned long) ptr - (unsigned long) entry->vaddr_base);
 }
 mapcache_unlock(mc);
@@ -585,8 +597,8 @@ 

[PATCH v6 0/8] xen: Support grant mappings

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Hi,

Grant mappings are a mechanism in Xen for guests to grant each other
permissions to map and share pages. These grants can be temporary
so both map and unmaps must be respected. See here for more info:
https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt

Currently, the primary use-case for grants in QEMU, is with VirtIO backends.
Grant mappings will only work with models that use the address_space_map/unmap
interfaces, any other access will fail with appropriate error messages.

In response to feedback we got on v3, later version switch approach
from adding new MemoryRegion types and map/unmap hooks to instead reusing
the existing xen_map_cache() hooks (with extensions). Almost all of the
changes are now contained to the Xen modules.

This approach also refactors the mapcache to support multiple instances
(one for existing foreign mappings and another for grant mappings).

I've only enabled grants for the ARM PVH machine since that is what
I can currently test on.

Cheers,
Edgar

ChangeLog:

v5 -> v6:
* Correct passing of ram_addr_offset in xen_replace_cache_entry_unlocked.

v4 -> v5:
* Compute grant_ref from address_index to xen_remap_bucket().
* Rename grant_is_write to is_write.
* Remove unnecessary + mc->bucket_size - 1 in
  xen_invalidate_map_cache_entry_unlocked().
* Remove use of global mapcache in refactor of
  xen_replace_cache_entry_unlocked().
* Add error checking for xengnttab_unmap().
* Add assert in xen_replace_cache_entry_unlocked() against grant mappings.
* Fix memory leak when freeing first entry in mapcache buckets.
* Assert that bucket_shift is >= XC_PAGE_SHIFT when creating mapcache.
* Add missing use of xen_mr_is_memory() in hw/xen/xen-hvm-common.c.
* Rebase with master.

v3 -> v4:
* Reuse existing xen_map_cache hooks.
* Reuse existing map-cache for both foreign and grant mappings.
* Only enable grants for the ARM PVH machine (removed i386).

v2 -> v3:
* Drop patch 1/7. This was done because device unplug is an x86-only case.
* Add missing qemu_mutex_unlock() before return.

v1 -> v2:
* Split patch 2/7 to keep phymem.c changes in a separate.
* In patch "xen: add map and unmap callbacks for grant" add check for total
  allowed grant < XEN_MAX_VIRTIO_GRANTS.
* Fix formatting issues and re-based with master latest.


Edgar E. Iglesias (8):
  xen: mapcache: Make MCACHE_BUCKET_SHIFT runtime configurable
  xen: mapcache: Unmap first entries in buckets
  xen: Add xen_mr_is_memory()
  softmmu: xen: Always pass offset + addr to xen_map_cache
  softmmu: Replace check for RAMBlock offset 0 with xen_mr_is_memory
  xen: mapcache: Pass the ram_addr offset to xen_map_cache()
  xen: mapcache: Add support for grant mappings
  hw/arm: xen: Enable use of grant mappings

 hw/arm/xen_arm.c|   5 +
 hw/xen/xen-hvm-common.c |  18 ++-
 hw/xen/xen-mapcache.c   | 232 
 include/hw/xen/xen-hvm-common.h |   3 +
 include/sysemu/xen-mapcache.h   |   2 +
 include/sysemu/xen.h|  15 +++
 system/physmem.c|  12 +-
 7 files changed, 226 insertions(+), 61 deletions(-)

-- 
2.40.1




[PATCH v6 6/8] xen: mapcache: Pass the ram_addr offset to xen_map_cache()

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Pass the ram_addr offset to xen_map_cache.
This is in preparation for adding grant mappings that need
to compute the address within the RAMBlock.

No functional changes.

Signed-off-by: Edgar E. Iglesias 
---
 hw/xen/xen-mapcache.c | 16 +++-
 include/sysemu/xen-mapcache.h |  2 ++
 system/physmem.c  |  9 +
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/hw/xen/xen-mapcache.c b/hw/xen/xen-mapcache.c
index ec95445696..a07c47b0b1 100644
--- a/hw/xen/xen-mapcache.c
+++ b/hw/xen/xen-mapcache.c
@@ -167,7 +167,8 @@ static void xen_remap_bucket(MapCache *mc,
  void *vaddr,
  hwaddr size,
  hwaddr address_index,
- bool dummy)
+ bool dummy,
+ ram_addr_t ram_offset)
 {
 uint8_t *vaddr_base;
 xen_pfn_t *pfns;
@@ -266,6 +267,7 @@ static void xen_remap_bucket(MapCache *mc,
 
 static uint8_t *xen_map_cache_unlocked(MapCache *mc,
hwaddr phys_addr, hwaddr size,
+   ram_addr_t ram_offset,
uint8_t lock, bool dma, bool is_write)
 {
 MapCacheEntry *entry, *pentry = NULL,
@@ -337,14 +339,16 @@ tryagain:
 if (!entry) {
 entry = g_new0(MapCacheEntry, 1);
 pentry->next = entry;
-xen_remap_bucket(mc, entry, NULL, cache_size, address_index, dummy);
+xen_remap_bucket(mc, entry, NULL, cache_size, address_index, dummy,
+ ram_offset);
 } else if (!entry->lock) {
 if (!entry->vaddr_base || entry->paddr_index != address_index ||
 entry->size != cache_size ||
 !test_bits(address_offset >> XC_PAGE_SHIFT,
 test_bit_size >> XC_PAGE_SHIFT,
 entry->valid_mapping)) {
-xen_remap_bucket(mc, entry, NULL, cache_size, address_index, 
dummy);
+xen_remap_bucket(mc, entry, NULL, cache_size, address_index, dummy,
+ ram_offset);
 }
 }
 
@@ -391,13 +395,15 @@ tryagain:
 
 uint8_t *xen_map_cache(MemoryRegion *mr,
hwaddr phys_addr, hwaddr size,
+   ram_addr_t ram_addr_offset,
uint8_t lock, bool dma,
bool is_write)
 {
 uint8_t *p;
 
 mapcache_lock(mapcache);
-p = xen_map_cache_unlocked(mapcache, phys_addr, size, lock, dma, is_write);
+p = xen_map_cache_unlocked(mapcache, phys_addr, size, ram_addr_offset,
+   lock, dma, is_write);
 mapcache_unlock(mapcache);
 return p;
 }
@@ -632,7 +638,7 @@ static uint8_t *xen_replace_cache_entry_unlocked(MapCache 
*mc,
 trace_xen_replace_cache_entry_dummy(old_phys_addr, new_phys_addr);
 
 xen_remap_bucket(mc, entry, entry->vaddr_base,
- cache_size, address_index, false);
+ cache_size, address_index, false, old_phys_addr);
 if (!test_bits(address_offset >> XC_PAGE_SHIFT,
 test_bit_size >> XC_PAGE_SHIFT,
 entry->valid_mapping)) {
diff --git a/include/sysemu/xen-mapcache.h b/include/sysemu/xen-mapcache.h
index 1ec9e66752..b5e3ea1bc0 100644
--- a/include/sysemu/xen-mapcache.h
+++ b/include/sysemu/xen-mapcache.h
@@ -19,6 +19,7 @@ typedef hwaddr (*phys_offset_to_gaddr_t)(hwaddr phys_offset,
 void xen_map_cache_init(phys_offset_to_gaddr_t f,
 void *opaque);
 uint8_t *xen_map_cache(MemoryRegion *mr, hwaddr phys_addr, hwaddr size,
+   ram_addr_t ram_addr_offset,
uint8_t lock, bool dma,
bool is_write);
 ram_addr_t xen_ram_addr_from_mapcache(void *ptr);
@@ -37,6 +38,7 @@ static inline void xen_map_cache_init(phys_offset_to_gaddr_t 
f,
 static inline uint8_t *xen_map_cache(MemoryRegion *mr,
  hwaddr phys_addr,
  hwaddr size,
+ ram_addr_t ram_addr_offset,
  uint8_t lock,
  bool dma,
  bool is_write)
diff --git a/system/physmem.c b/system/physmem.c
index b7847db1a2..33d09f7571 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2231,13 +2231,14 @@ static void *qemu_ram_ptr_length(RAMBlock *block, 
ram_addr_t addr,
  */
 if (xen_mr_is_memory(block->mr)) {
 return xen_map_cache(block->mr, block->offset + addr,
- len, lock, lock,
- is_write);
+ len, block->offset,
+ lock, lock, is_write);
 }
 
 block->host = xen_map_cache(block->mr, block->offset,
- 

[PATCH v6 4/8] softmmu: xen: Always pass offset + addr to xen_map_cache

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Always pass address with offset to xen_map_cache().
This is in preparation for support for grant mappings.

Since this is within a block that checks for offset == 0,
this has no functional changes.

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
---
 system/physmem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/system/physmem.c b/system/physmem.c
index 342b7a8fd4..5e6257ef65 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2230,7 +2230,8 @@ static void *qemu_ram_ptr_length(RAMBlock *block, 
ram_addr_t addr,
  * In that case just map the requested area.
  */
 if (block->offset == 0) {
-return xen_map_cache(block->mr, addr, len, lock, lock,
+return xen_map_cache(block->mr, block->offset + addr,
+ len, lock, lock,
  is_write);
 }
 
-- 
2.40.1




[PATCH v6 2/8] xen: mapcache: Unmap first entries in buckets

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

When invalidating memory ranges, if we happen to hit the first
entry in a bucket we were never unmapping it. This was harmless
for foreign mappings but now that we're looking to reuse the
mapcache for transient grant mappings, we must unmap entries
when invalidated.

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
---
 hw/xen/xen-mapcache.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/hw/xen/xen-mapcache.c b/hw/xen/xen-mapcache.c
index bc860f4373..ec95445696 100644
--- a/hw/xen/xen-mapcache.c
+++ b/hw/xen/xen-mapcache.c
@@ -491,18 +491,23 @@ static void 
xen_invalidate_map_cache_entry_unlocked(MapCache *mc,
 return;
 }
 entry->lock--;
-if (entry->lock > 0 || pentry == NULL) {
+if (entry->lock > 0) {
 return;
 }
 
-pentry->next = entry->next;
 ram_block_notify_remove(entry->vaddr_base, entry->size, entry->size);
 if (munmap(entry->vaddr_base, entry->size) != 0) {
 perror("unmap fails");
 exit(-1);
 }
+
 g_free(entry->valid_mapping);
-g_free(entry);
+if (pentry) {
+pentry->next = entry->next;
+g_free(entry);
+} else {
+memset(entry, 0, sizeof *entry);
+}
 }
 
 typedef struct XenMapCacheData {
-- 
2.40.1




[PATCH v6 8/8] hw/arm: xen: Enable use of grant mappings

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
---
 hw/arm/xen_arm.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/hw/arm/xen_arm.c b/hw/arm/xen_arm.c
index 15fa7dfa84..6fad829ede 100644
--- a/hw/arm/xen_arm.c
+++ b/hw/arm/xen_arm.c
@@ -125,6 +125,11 @@ static void xen_init_ram(MachineState *machine)
  GUEST_RAM1_BASE, ram_size[1]);
 memory_region_add_subregion(sysmem, GUEST_RAM1_BASE, _hi);
 }
+
+/* Setup support for grants.  */
+memory_region_init_ram(_grants, NULL, "xen.grants", block_len,
+   _fatal);
+memory_region_add_subregion(sysmem, XEN_GRANT_ADDR_OFF, _grants);
 }
 
 void arch_handle_ioreq(XenIOState *state, ioreq_t *req)
-- 
2.40.1




[PATCH v6 3/8] xen: Add xen_mr_is_memory()

2024-05-16 Thread Edgar E. Iglesias
From: "Edgar E. Iglesias" 

Add xen_mr_is_memory() to abstract away tests for the
xen_memory MR.

No functional changes.

Signed-off-by: Edgar E. Iglesias 
Reviewed-by: Stefano Stabellini 
Acked-by: David Hildenbrand 
---
 hw/xen/xen-hvm-common.c | 10 --
 include/sysemu/xen.h|  8 
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/hw/xen/xen-hvm-common.c b/hw/xen/xen-hvm-common.c
index 2d1b032121..a0a0252da0 100644
--- a/hw/xen/xen-hvm-common.c
+++ b/hw/xen/xen-hvm-common.c
@@ -12,6 +12,12 @@
 
 MemoryRegion xen_memory;
 
+/* Check for xen memory.  */
+bool xen_mr_is_memory(MemoryRegion *mr)
+{
+return mr == _memory;
+}
+
 void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, MemoryRegion *mr,
Error **errp)
 {
@@ -28,7 +34,7 @@ void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size, 
MemoryRegion *mr,
 return;
 }
 
-if (mr == _memory) {
+if (xen_mr_is_memory(mr)) {
 return;
 }
 
@@ -55,7 +61,7 @@ static void xen_set_memory(struct MemoryListener *listener,
 {
 XenIOState *state = container_of(listener, XenIOState, memory_listener);
 
-if (section->mr == _memory) {
+if (xen_mr_is_memory(section->mr)) {
 return;
 } else {
 if (add) {
diff --git a/include/sysemu/xen.h b/include/sysemu/xen.h
index 754ec2e6cb..dc72f83bcb 100644
--- a/include/sysemu/xen.h
+++ b/include/sysemu/xen.h
@@ -34,6 +34,8 @@ void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t 
length);
 void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size,
struct MemoryRegion *mr, Error **errp);
 
+bool xen_mr_is_memory(MemoryRegion *mr);
+
 #else /* !CONFIG_XEN_IS_POSSIBLE */
 
 #define xen_enabled() 0
@@ -47,6 +49,12 @@ static inline void xen_ram_alloc(ram_addr_t ram_addr, 
ram_addr_t size,
 g_assert_not_reached();
 }
 
+static inline bool xen_mr_is_memory(MemoryRegion *mr)
+{
+g_assert_not_reached();
+return false;
+}
+
 #endif /* CONFIG_XEN_IS_POSSIBLE */
 
 #endif
-- 
2.40.1




Re: [RFC PATCH v3 09/18] hw/arm/smmu-common: Rework TLB lookup for nesting

2024-05-16 Thread Mostafa Saleh
Hi Eric,

On Wed, May 15, 2024 at 03:54:36PM +0200, Eric Auger wrote:
> 
> 
> On 4/29/24 05:23, Mostafa Saleh wrote:
> > In the previous patch, comine_tlb() was added which combines 2 TLB
> combine
Will do.

> > entries into one, which chooses the granule and level from the
> > smallest entry.
> >
> > This means that a nested translation, an entry can be cached with the
> that with nested translation
Will do.

> > granule of stage-2 and not stage-1.
> >
> > However, the lookup for an IOVA in nested configuration is done with
> > stage-1 granule, this patch reworks lookup in that case, so it falls
> > back to stage-2 granule if no entry is found using stage-1 granule.
> I should have read that before commenting previous patch ;-)
> Anyway this shows that something is missing in previous patch, at least
> the above explanation ;-)

Yup, I can add a comment in the previous patch or reorder them, let me
know what you prefer.

Thanks,
Mostafa

> 
> Eric
> >
> > Signed-off-by: Mostafa Saleh 
> > ---
> >  hw/arm/smmu-common.c | 24 ++--
> >  1 file changed, 22 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
> > index 0d6945fa54..c67af3bc6d 100644
> > --- a/hw/arm/smmu-common.c
> > +++ b/hw/arm/smmu-common.c
> > @@ -66,8 +66,10 @@ SMMUIOTLBKey smmu_get_iotlb_key(int asid, int vmid, 
> > uint64_t iova,
> >  return key;
> >  }
> >  
> > -SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg,
> > -SMMUTransTableInfo *tt, hwaddr iova)
> > +static SMMUTLBEntry *smmu_iotlb_lookup_all_levels(SMMUState *bs,
> > +  SMMUTransCfg *cfg,
> > +  SMMUTransTableInfo *tt,
> > +  hwaddr iova)
> >  {
> >  uint8_t tg = (tt->granule_sz - 10) / 2;
> >  uint8_t inputsize = 64 - tt->tsz;
> > @@ -88,6 +90,24 @@ SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, 
> > SMMUTransCfg *cfg,
> >  }
> >  level++;
> >  }
> > +return entry;
> > +}
> > +
> > +SMMUTLBEntry *smmu_iotlb_lookup(SMMUState *bs, SMMUTransCfg *cfg,
> > +SMMUTransTableInfo *tt, hwaddr iova)
> > +{
> > +SMMUTLBEntry *entry = NULL;
> > +
> > +entry = smmu_iotlb_lookup_all_levels(bs, cfg, tt, iova);
> > +/*
> > + * For nested translation also try the s2 granule, as the TLB will 
> > insert
> > + * it if the size of s2 tlb entry was smaller.
> > + */
> > +if (!entry && (cfg->stage == SMMU_NESTED) &&
> > +(cfg->s2cfg.granule_sz != tt->granule_sz)) {
> > +tt->granule_sz = cfg->s2cfg.granule_sz;
> > +entry = smmu_iotlb_lookup_all_levels(bs, cfg, tt, iova);
> > +}
> >  
> >  if (entry) {
> >  cfg->iotlb_hits++;
> 



Re: [PATCH v4 0/3] Fix "virtio-gpu: fix scanout migration post-load"

2024-05-16 Thread Peter Xu
Looks good here, thanks.

On Thu, May 16, 2024, 2:40 a.m.  wrote:

> From: Marc-André Lureau 
>
> Hi,
>
> The aforementioned patch breaks virtio-gpu device migrations for versions
> pre-9.0/9.0, both forwards and backwards. Versioning of `VMS_STRUCT` is
> more
> complex than it may initially appear, as evidenced in the problematic
> commit
> dfcf74fa68c ("virtio-gpu: fix scanout migration post-load").
>
> v2:
>  - use a manual version field test (instead of the more complex struct
> variant)
>
> v3:
>  - introduce machine_check_version()
>  - drop the VMSD version, and use machine version field test
>
> v4:
>  - drop machine_check_version() approach
>  - property renamed to x-scanout-vmstate-version
>
> Marc-André Lureau (3):
>   migration: add "exists" info to load-state-field trace
>   migration: fix a typo
>   virtio-gpu: fix v2 migration
>
>  include/hw/virtio/virtio-gpu.h |  1 +
>  hw/core/machine.c  |  1 +
>  hw/display/virtio-gpu.c| 24 
>  migration/vmstate.c|  7 ---
>  migration/trace-events |  2 +-
>  5 files changed, 23 insertions(+), 12 deletions(-)
>
> --
> 2.41.0.28.gd7d8841f67
>
>


Re: [RFC PATCH v3 08/18] hw/arm/smmu-common: Add support for nested TLB

2024-05-16 Thread Mostafa Saleh
Hi Eric,

On Wed, May 15, 2024 at 03:48:05PM +0200, Eric Auger wrote:
> Hi Mostafa,
> 
> On 4/29/24 05:23, Mostafa Saleh wrote:
> > This patch adds support for nested(combined) TLB entries.
> space between nested and (.
Will do.
> > The main function combine_tlb() is not used here but in the next
> > patches, but to simplify the patches it is introduced first.
> >
> > Main changes:
> > 1) New entry added in the TLB, parent_perm, for nested TLB, holds the
> s/entry/field, s/TLB/SMMUTLBEntry struct
Will do.
> >stage-2 permission, this can be used to know the origin of a
> >permission fault from a cached entry as caching the “and” of the
> >permissions loses this information.
> >
> >SMMUPTWEventInfo is used to hold information about PTW faults so
> >the event can be populated, the value of stage (which maps to S2
> >in the event) used to be set based on the current stage for TLB
> I don't understand "(which maps to S2 in the event)". What do you mean?
> This could be S1 or S2 depending on the active stage, no?

Not really, if the IPA size is larger than S2 input size, this is
considered stage-1 fault.

For TLB permission fault, yes, that is how it is decided.
However, with nesting, a permission fault from a cached entry can be
from a stage-1 or stage-2, that’s why we now cache both and not just
the combined permission, and the logic to set fault stage is modified
accordingly.

> >permission faults, however with the parent_perm, it is now set
> >based on which perm has the missing permission
> >
> >When nesting is not enabled it has the same value as perm which
> >doesn't change the logic.
> >
> > 2) As combined TLB implementation is used, the combination logic
> >chooses:
> >- tg and level from the entry which has the smallest addr_mask.
> tbh I am scared bout swapping s1/s2 tg and level. In smmu_iotlb_lookup()
> I see tt->granule_sz being used which is s1 data. I mean it is not
> obvious to me this is correct. Could you maybe give more explanations
> detailing why/how this is guaranted to work.

As you mentioned the next patch reworks the lookup logic, I can reorder
the 2 patches if that is better, please let me know what you think?

> 
> Can you give additional details about what s1+s2 combinations were tested?

I tested with S1 and S2 4K pages
S1 level = 3 and S2 level = 3
S1 level = 2 and S2 level = 3
S1 level = 3 and S2 level = 2
S1 level = 1 and S2 level = 2

And also tested with with S1 64K granule and S2 4K.

> >- Based on that the iova that would be cached is recalculated.
> >- Translated_addr is chosen from stage-2.
> >
> > Signed-off-by: Mostafa Saleh 
> > ---
> >  hw/arm/smmu-common.c | 32 
> >  include/hw/arm/smmu-common.h |  1 +
> >  2 files changed, 29 insertions(+), 4 deletions(-)
> >
> > diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
> > index 21982621c0..0d6945fa54 100644
> > --- a/hw/arm/smmu-common.c
> > +++ b/hw/arm/smmu-common.c
> > @@ -394,7 +394,7 @@ static int smmu_ptw_64_s1(SMMUTransCfg *cfg,
> >  tlbe->entry.translated_addr = gpa;
> >  tlbe->entry.iova = iova & ~mask;
> >  tlbe->entry.addr_mask = mask;
> > -tlbe->entry.perm = PTE_AP_TO_PERM(ap);
> > +tlbe->parent_perm = tlbe->entry.perm = PTE_AP_TO_PERM(ap);
> nit: I would prefer on separate lines.
Will do.

> >  tlbe->level = level;
> >  tlbe->granule = granule_sz;
> >  return 0;
> > @@ -515,7 +515,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
> >  tlbe->entry.translated_addr = gpa;
> >  tlbe->entry.iova = ipa & ~mask;
> >  tlbe->entry.addr_mask = mask;
> > -tlbe->entry.perm = s2ap;
> > +tlbe->parent_perm = tlbe->entry.perm = s2ap;
> >  tlbe->level = level;
> >  tlbe->granule = granule_sz;
> >  return 0;
> > @@ -530,6 +530,27 @@ error:
> >  return -EINVAL;
> >  }
> >  
> > +/* combine 2 TLB entries and return in tlbe in nested config. */
> suggestion: combine S1 and S2 TLB entries into a single entry. As a
> result the S1 entry is overriden with combined data.
Will do.

> > +static void __attribute__((unused)) combine_tlb(SMMUTLBEntry *tlbe,
> > +SMMUTLBEntry *tlbe_s2,
> > +dma_addr_t iova,
> > +SMMUTransCfg *cfg)
> > +{
> > +if (tlbe_s2->entry.addr_mask < tlbe->entry.addr_mask) {
> > +tlbe->entry.addr_mask = tlbe_s2->entry.addr_mask;
> > +tlbe->granule = tlbe_s2->granule;
> > +tlbe->level = tlbe_s2->level;
> > +}
> > +
> > +tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2,
> > +tlbe->entry.translated_addr);
> > +
> > +tlbe->entry.iova = iova & ~tlbe->entry.addr_mask;
> > +/* parent_perm has s2 perm while perm has s1 perm. */
> 
>  suggestion: while perm keeps s1 

Re: [PATCH 08/20] qapi/parser: differentiate intro and outro paragraphs

2024-05-16 Thread John Snow
On Thu, May 16, 2024, 5:34 AM Markus Armbruster  wrote:

> John Snow  writes:
>
> > Add a semantic tag to paragraphs that appear *before* tagged
> > sections/members/features and those that appear after. This will control
> > how they are inlined when doc sections are merged and flattened.
>
> This future use is not obvious to me now.  I guess the effective way to
> help me see it is actual patches, which will come in due time.
>

Head recursion and tail recursion, respectively :)

* intro
* inherited intro
* members [ancestor-descendent]
* features [ancestor-descendent]
* inherited outro
* outro

Child gets the first and final words. Inherited stuff goes in the sandwich
fillings.

It feels like a simple rule that's easy to internalize. As a bonus, you can
explain it by analogy to Americans as a burger, which is the only metaphor
we understand.


> > Signed-off-by: John Snow 
> > ---
> >  scripts/qapi/parser.py | 22 +-
> >  1 file changed, 17 insertions(+), 5 deletions(-)
> >
> > diff --git a/scripts/qapi/parser.py b/scripts/qapi/parser.py
> > index cf4cbca1c1f..b1794f71e12 100644
> > --- a/scripts/qapi/parser.py
> > +++ b/scripts/qapi/parser.py
> > @@ -503,6 +503,10 @@ def get_doc(self) -> 'QAPIDoc':
> >  self.accept(False)
> >  line = self.get_doc_line()
> >  no_more_args = False
> > +# Paragraphs before members/features/tagged are "intro"
> paragraphs.
> > +# Any appearing subsequently are "outro" paragraphs.
> > +# This is only semantic metadata for the doc generator.
>
> Not sure about the last sentence.  Isn't it true for almost everything
> around here?
>

I guess I was trying to say "There's no real difference between the two
mechanically, it's purely based on where it appears in the doc block, which
offers only a heuristic for its semantic value- introductory statements or
additional detail."

In my mind: the other "kind" values have some more mechanical difference to
them, but intro/outro don't.


> Also, long line.
>
> > +intro = True
> >
> >  while line is not None:
> >  # Blank lines
> > @@ -532,6 +536,7 @@ def get_doc(self) -> 'QAPIDoc':
> >  raise QAPIParseError(
> >  self, 'feature descriptions expected')
> >  no_more_args = True
> > +intro = False
>
> After feature descriptions.
>
> >  elif match := self._match_at_name_colon(line):
> >  # description
> >  if no_more_args:
> > @@ -547,6 +552,7 @@ def get_doc(self) -> 'QAPIDoc':
> >  doc.append_line(text)
> >  line = self.get_doc_indented(doc)
> >  no_more_args = True
> > +intro = False
>
> Or after member descriptions.
>
> >  elif match := re.match(
> >  r'(Returns|Errors|Since|Notes?|Examples?|TODO):
> *',
> >  line):
> > @@ -557,13 +563,14 @@ def get_doc(self) -> 'QAPIDoc':
> >  doc.append_line(text)
> >  line = self.get_doc_indented(doc)
> >  no_more_args = True
> > +intro = False
>
> Or after the first tagged section.
>
> Okay, it does what it says on the tin.
>
> >  elif line.startswith('='):
> >  raise QAPIParseError(
> >  self,
> >  "unexpected '=' markup in definition
> documentation")
> >  else:
> >  # tag-less paragraph
> > -doc.ensure_untagged_section(self.info)
> > +doc.ensure_untagged_section(self.info, intro)
> >  doc.append_line(line)
> >  line = self.get_doc_paragraph(doc)
> >  else:
> > @@ -617,7 +624,7 @@ def __init__(
> >  self,
> >  info: QAPISourceInfo,
> >  tag: Optional[str] = None,
> > -kind: str = 'paragraph',
> > +kind: str = 'intro-paragraph',
>
> The question "why is this optional?" crossed my mind when reviewing the
> previous patch.  I left it unasked, because I felt challenging the
> overlap between @kind and @tag was more useful.  However, the new
> default value 'intro-paragraph' feels more arbitrary to me than the old
> one 'paragraph', and that makes the question pop right back into my
> mind.
>

Just "don't break API" habit, nothing more. I can make it mandatory.


> Unless I'm mistaken, all calls but one @tag and @kind.  Making that one
> pass it too feels simpler to me.
>
> Moot if we fuse @tag and @kind, of course.


> >  ):
> >  # section source info, i.e. where it begins
> >  self.info = info
> > @@ -625,7 +632,7 @@ def __init__(
> >  self.tag = tag
> >  # section text 

Re: [RFC/PATCH v2 03/12] hw/arm/virt: confidential guest support

2024-05-16 Thread Daniel P . Berrangé
On Thu, May 16, 2024 at 02:33:47PM +, Srivatsa Vaddagiri wrote:
> This adds support to launch hypervisor-assisted confidential guests,
> where guest's memory is protected from a potentially untrusted host.
> Hypervisor can setup host's page-tables so that it loses access to guest
> memory.
> 
> Since some guest drivers may need to communicate data with their host
> counterparts via shared memory, optionally allow setting aside some part
> of the confidential guest's memory as "shared". The size of this shared
> memory is specified via the optional "swiotlb-size" parameter.
> 
> -machine virt,confidential-guest-support=prot0 \
>   -object arm-confidential-guest,id=prot0,swiotlb-size=16777216
> 
> The size of this shared memory is indicated to the guest in size/reg
> property of device-tree node "/reserved-memory/restricted_dma_reserved".
> A memory-region property is added to device-tree node representing
> virtio-pcie hub, so that all DMA allocations requested by guest's virtio-pcie
> device drivers are satisfied from the shared swiotlb region.

For reference, there is another series proposing confidential guest
support for the 'virt' machine on AArch64 with KVM

 https://lists.nongnu.org/archive/html/qemu-devel/2024-04/msg02742.html

I've no idea how closely your impl matches the KVM proposed impl. ie
whether we need 2 distinct "ConfidentialGuest" subclasses for KVM vs
Gunyah, or whether 1 can cope with both.  If we do need 2 distinct
subclasses for each hypervisor, then calling this Gunyah targetted
object 'arm-confidential-guest' is too broad of an name.

> 
> Signed-off-by: Srivatsa Vaddagiri 
> ---
>  qapi/qom.json |  14 +
>  include/hw/arm/virt.h |   1 +
>  hw/arm/virt.c | 141 +-
>  3 files changed, 155 insertions(+), 1 deletion(-)
> 
> diff --git a/qapi/qom.json b/qapi/qom.json
> index 38dde6d785..9b3cd7ce22 100644
> --- a/qapi/qom.json
> +++ b/qapi/qom.json
> @@ -874,6 +874,18 @@
>'base': 'RngProperties',
>'data': { '*filename': 'str' } }
>  
> +##
> +# @ArmConfidentialGuestProperties:
> +#
> +# Properties for arm-confidential-guest objects.
> +#
> +# @swiotlb-size: swiotlb size
> +#
> +# Since: 2.12
> +##
> +{ 'struct': 'ArmConfidentialGuestProperties',
> +  'data': { 'swiotlb-size' : 'uint64' } }
> +
>  ##
>  # @SevGuestProperties:
>  #
> @@ -997,6 +1009,7 @@
>  { 'name': 'secret_keyring',
>'if': 'CONFIG_SECRET_KEYRING' },
>  'sev-guest',
> +'arm-confidential-guest',
>  'thread-context',
>  's390-pv-guest',
>  'throttle-group',
> @@ -1067,6 +1080,7 @@
>'secret_keyring': { 'type': 'SecretKeyringProperties',
>'if': 'CONFIG_SECRET_KEYRING' },
>'sev-guest':  'SevGuestProperties',
> +  'arm-confidential-guest': 'ArmConfidentialGuestProperties',
>'thread-context': 'ThreadContextProperties',
>'throttle-group': 'ThrottleGroupProperties',
>'tls-creds-anon': 'TlsCredsAnonProperties',
> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> index bb486d36b1..1e23f20972 100644
> --- a/include/hw/arm/virt.h
> +++ b/include/hw/arm/virt.h
> @@ -165,6 +165,7 @@ struct VirtMachineState {
>  uint32_t clock_phandle;
>  uint32_t gic_phandle;
>  uint32_t msi_phandle;
> +uint32_t restricted_dma_phandle;
>  uint32_t iommu_phandle;
>  int psci_conduit;
>  hwaddr highest_gpa;
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 3c93c0c0a6..2a3eb4075d 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -84,6 +84,9 @@
>  #include "hw/virtio/virtio-iommu.h"
>  #include "hw/char/pl011.h"
>  #include "qemu/guest-random.h"
> +#include "sysemu/cpus.h"
> +#include "exec/confidential-guest-support.h"
> +#include "qom/object_interfaces.h"
>  
>  static GlobalProperty arm_virt_compat[] = {
>  { TYPE_VIRTIO_IOMMU_PCI, "aw-bits", "48" },
> @@ -1545,6 +1548,11 @@ static void create_pcie(VirtMachineState *vms)
> nr_pcie_buses - 1);
>  qemu_fdt_setprop(ms->fdt, nodename, "dma-coherent", NULL, 0);
>  
> +if (vms->restricted_dma_phandle) {
> +qemu_fdt_setprop_cell(ms->fdt, nodename, "memory-region",
> +vms->restricted_dma_phandle);
> +}
> +
>  if (vms->msi_phandle) {
>  qemu_fdt_setprop_cells(ms->fdt, nodename, "msi-map",
> 0, vms->msi_phandle, 0, 0x1);
> @@ -2065,6 +2073,129 @@ static void virt_cpu_post_init(VirtMachineState *vms, 
> MemoryRegion *sysmem)
>  }
>  }
>  
> +#define TYPE_ARM_CONFIDENTIAL_GUEST "arm-confidential-guest"
> +OBJECT_DECLARE_SIMPLE_TYPE(ArmConfidentialGuestState, ARM_CONFIDENTIAL_GUEST)
> +
> +struct ArmConfidentialGuestState {
> +ConfidentialGuestSupport parent_obj;
> +
> +hwaddr swiotlb_size;
> +};
> +
> +static ArmConfidentialGuestState *acg;
> +
> +static void
> 

Re: [PATCH v7 8/9] vfio: Add Error** argument to .get_dirty_bitmap() handler

2024-05-16 Thread Cédric Le Goater

On 5/16/24 14:46, Cédric Le Goater wrote:

Let the callers do the error reporting. Add documentation while at it.

Reviewed-by: Eric Auger 
Reviewed-by: Avihai Horon 
Signed-off-by: Cédric Le Goater 
---

  Changes in v7:
  
  - Fixed even more line wrapping of *dirty_bitmap() routines (Avihai)

  - vfio_sync_dirty_bitmap()
Fixed return when vfio_sync_ram_discard_listener_dirty_bitmap() is called 
(Avihai)

  include/hw/vfio/vfio-common.h |  5 +--

  include/hw/vfio/vfio-container-base.h | 19 +++--
  hw/vfio/common.c  | 60 +--
  hw/vfio/container-base.c  |  6 +--
  hw/vfio/container.c   | 14 ---
  5 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 
3ff633ad3b395e953a55683f5f0308bca50af3dd..b6ac24953667bc5f72f28480a6bf0f4722069cb9
 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -273,10 +273,9 @@ vfio_devices_all_running_and_mig_active(const 
VFIOContainerBase *bcontainer);
  bool
  vfio_devices_all_device_dirty_tracking(const VFIOContainerBase *bcontainer);
  int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
-VFIOBitmap *vbmap, hwaddr iova,
-hwaddr size);
+VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp);
  int vfio_get_dirty_bitmap(const VFIOContainerBase *bcontainer, uint64_t iova,
-  uint64_t size, ram_addr_t ram_addr);
+  uint64_t size, ram_addr_t ram_addr, Error **errp);
  
  /* Returns 0 on success, or a negative errno. */

  int vfio_device_get_name(VFIODevice *vbasedev, Error **errp);
diff --git a/include/hw/vfio/vfio-container-base.h 
b/include/hw/vfio/vfio-container-base.h
index 
326ceea52a2030eec9dad289a9845866c4a8c090..b04057ad1aff73d974ecec718d0fe45f7a930b59
 100644
--- a/include/hw/vfio/vfio-container-base.h
+++ b/include/hw/vfio/vfio-container-base.h
@@ -84,8 +84,7 @@ void vfio_container_del_section_window(VFIOContainerBase 
*bcontainer,
  int vfio_container_set_dirty_page_tracking(VFIOContainerBase *bcontainer,
 bool start, Error **errp);
  int vfio_container_query_dirty_bitmap(const VFIOContainerBase *bcontainer,
-  VFIOBitmap *vbmap,
-  hwaddr iova, hwaddr size);
+   VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp);
  
  void vfio_container_init(VFIOContainerBase *bcontainer,

   VFIOAddressSpace *space,
@@ -138,9 +137,21 @@ struct VFIOIOMMUClass {
   */
  int (*set_dirty_page_tracking)(const VFIOContainerBase *bcontainer,
 bool start, Error **errp);
+/**
+ * @query_dirty_bitmap
+ *
+ * Get bitmap of dirty pages from container
+ *
+ * @bcontainer: #VFIOContainerBase from which to get dirty pages
+ * @vbmap: #VFIOBitmap internal bitmap structure
+ * @iova: iova base address
+ * @size: size of iova range
+ * @errp: pointer to Error*, to store an error if it happens.
+ *
+ * Returns zero to indicate success and negative for error
+ */
  int (*query_dirty_bitmap)(const VFIOContainerBase *bcontainer,
-  VFIOBitmap *vbmap,
-  hwaddr iova, hwaddr size);
+VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp);
  /* PCI specific */
  int (*pci_hot_reset)(VFIODevice *vbasedev, bool single);
  
diff --git a/hw/vfio/common.c b/hw/vfio/common.c

index 
7313043f1d161ed0326b5ba3fa1085608eaf6740..21910802c0c58a0efdb07d31c5a709660e89e328
 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1140,8 +1140,7 @@ static int vfio_device_dma_logging_report(VFIODevice 
*vbasedev, hwaddr iova,
  }
  
  int vfio_devices_query_dirty_bitmap(const VFIOContainerBase *bcontainer,

-VFIOBitmap *vbmap, hwaddr iova,
-hwaddr size)
+ VFIOBitmap *vbmap, hwaddr iova, hwaddr size, Error **errp)
  {
  VFIODevice *vbasedev;
  int ret;
@@ -1150,10 +1149,10 @@ int vfio_devices_query_dirty_bitmap(const 
VFIOContainerBase *bcontainer,
  ret = vfio_device_dma_logging_report(vbasedev, iova, size,
   vbmap->bitmap);
  if (ret) {
-error_report("%s: Failed to get DMA logging report, iova: "
- "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
- ", err: %d (%s)",
- vbasedev->name, iova, size, ret, strerror(-ret));
+error_setg_errno(errp, -ret,
+ "%s: Failed to get DMA logging report, iova: "
+ "0x%" HWADDR_PRIx ", size: 

Re: [PATCH v2 1/4] accel/kvm: Extract common KVM vCPU {creation, parking} code

2024-05-16 Thread Harsh Prateek Bora

Hi Salil,

On 5/16/24 19:05, Salil Mehta wrote:



  From: Harsh Prateek Bora 
  Sent: Thursday, May 16, 2024 2:07 PM
  
  Hi Salil,
  
  On 5/16/24 17:42, Salil Mehta wrote:

  > Hi Harsh,
  >
  >>   From: Harsh Prateek Bora 
  >>   Sent: Thursday, May 16, 2024 11:15 AM
  >>
  >>   Hi Salil,
  >>
  >>   Thanks for your email.
  >>   Your patch 1/8 is included here based on review comments on my  previous
  >>   patch from one of the maintainers in the community and therefore I  had
  >>   kept you in CC to be aware of the desire of having this independent 
patch to
  >>   get merged earlier even if your other patches in the series may go 
through
  >>   further reviews.
  >
  > I really don’t know which discussion are  you pointing at? Please
  > understand you are fixing a bug and we are pushing a feature which has got 
large series.
  > It will break the patch-set  which is about t be merged.
  >
  > There will be significant overhead of testing on us for the work we
  > have been carrying forward for large time. This will be disruptive. Please 
dont!
  >
  
  I was referring to the review discussion on my prev patch here:

  https://lore.kernel.org/qemu-devel/d191d2jfar7l.2eh4s445m4...@gmail.com/



Sure, I'm, not sure what this means.



No worries. If you had followed the conversation on the review
link I shared, I had made it clear that we are expecting a patch update
from you and it is included here just to facilitate review of additional
patches on the top.





  Although your patch was included with this series only to facilitate review of
  the additional patches depending on just one of your patch.



Generally you rebase your patch-set over the other and clearly state on the 
cover
letter that this patch-set is dependent upon such and such patch-set. Just 
imagine
if everyone starts to unilaterally pick up patches from each other's patch-set 
it will
create a chaos not only for the feature owners but also for the maintainers.



Please go through the review discussion on the link I shared above. It
was included on the suggestion of one of the maintainers. However, if
you are going to send v9 soon, everyone would be happy to wait.



  
  I am not sure what is appearing disruptive here. It is a common practice in

  the community that maintainer(s) can pick individual patches from the
  series if it has been vetted by a significant number of reviewers.



Don’t you think this patch-set is asking for acceptance for a patch already
part of another patch-set which is about to be accepted and is a bigger feature?
Will it cause maintenance overhead at the last moment? Yes, of course!


No, I dont think so.





  However, in this case, since you have mentioned to post next version soon,
  you need not worry about it as that would be the preferred version for both
  of the series.



Yes, but please understand we are working for the benefit of overall community.
Please cooperate here.



Hope I cleared your confusion. We are waiting to see your v9 soon.

  
  >

  >>
  >>   I am hoping to see your v9 soon and thereafter maintainer(s) may
  choose to
  >>   pick the latest independent patch if needs to be merged earlier.
  >
  >
  > I don’t think you are understanding what problem it is causing. For
  > your small bug fix you are causing significant delays at our end.
  >
  
  I hope I clarfied above that including your patch here doesnt delay anything.

  Hoping to see your v9 soon!
  
  Thanks

  Harsh
  >
  > Thanks
  > Salil.
  >>
  >>   Thanks for your work and let's be hopeful it gets merged soon.
  >>
  >>   regards,
  >>   Harsh
  >>
  >>   On 5/16/24 14:00, Salil Mehta wrote:
  >>   > Hi Harsh,
  >>   >
  >>   > Thanks for your interest in the patch-set but taking away patches like
  >>   > this from other series without any discussion can disrupt others work
  >>   > and its acceptance on time. This is because we will have to put lot of
  >>   > effort in rebasing bigger series and then testing overhead comes
  along
  >>   > with it.
  >>   >
  >>   > The patch-set (from where this  patch has been taken) is part of even
  >>   > bigger series and there have been many people and companies toiling
  to
  >>   > fix the bugs collectively in that series and for years.
  >>   >
  >>   > I'm about float the V9 version of the Arch agnostic series which this
  >>   > patch is part of and you can rebase your patch-set from there. I'm
  >>   > hopeful that it will get accepted in this cycle.
  >>   >
  >>   >
  >>   > Many thanks
  >>   > Salil.
  >>   >
  >>   >>   From: Harsh Prateek Bora 
  >>   >>   Sent: Thursday, May 16, 2024 6:32 AM
  >>   >>
  >>   >>   From: Salil Mehta 
  >>   >>
  >>   >>   KVM vCPU creation is done once during the vCPU realization when
  >>   Qemu
  >>   >>   vCPU thread is spawned. This is common to all the architectures as
  of
  >>   now.
  >>   >>
  >>   >>   Hot-unplug of vCPU results in destruction of the vCPU object in
  QOM
  >>   but
  >>   >>   the 

Re: [RFC PATCH v3 03/18] hw/arm/smmuv3: Fix encoding of CLASS in events

2024-05-16 Thread Mostafa Saleh
Hi Eric,

On Wed, May 15, 2024 at 02:27:45PM +0200, Eric Auger wrote:
> Hi Mostafa,
> On 4/29/24 05:23, Mostafa Saleh wrote:
> > The SMMUv3 spec (ARM IHI 0070 F.b - 7.3 Event records) defines the
> > class of events faults as:
> >
> > CLASS: The class of the operation that caused the fault:
> > - 0b00: CD, CD fetch.
> > - 0b01: TTD, Stage 1 translation table fetch.
> > - 0b10: IN, Input address
> >
> > However, this value was not set and left as 0 which means CD and not
> > IN (0b10).
> > While at it, add an enum for class as it would be used for nesting.
> If this fixes somethings please add a Fixes: tag.
> 
> Also you may add that until nested gets implemented, CLASS values are
> the same for stage 1 and stage2. This will change later on.
Will do.

Thanks,
Mostafa

> 
> Besides
> Reviewed-by: Eric Auger 
> 
> Eric
> 
> >
> > Signed-off-by: Mostafa Saleh 
> > ---
> >  hw/arm/smmuv3-internal.h | 6 ++
> >  hw/arm/smmuv3.c  | 6 +-
> >  2 files changed, 11 insertions(+), 1 deletion(-)
> >
> > diff --git a/hw/arm/smmuv3-internal.h b/hw/arm/smmuv3-internal.h
> > index e4dd11e1e6..0f3ecec804 100644
> > --- a/hw/arm/smmuv3-internal.h
> > +++ b/hw/arm/smmuv3-internal.h
> > @@ -32,6 +32,12 @@ typedef enum SMMUTranslationStatus {
> >  SMMU_TRANS_SUCCESS,
> >  } SMMUTranslationStatus;
> >  
> > +typedef enum SMMUTranslationClass {
> > +SMMU_CLASS_CD,
> > +SMMU_CLASS_TT,
> > +SMMU_CLASS_IN,
> > +} SMMUTranslationClass;
> > +
> >  /* MMIO Registers */
> >  
> >  REG32(IDR0,0x0)
> > diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
> > index 9dd3ea48e4..1eb5b160d2 100644
> > --- a/hw/arm/smmuv3.c
> > +++ b/hw/arm/smmuv3.c
> > @@ -942,7 +942,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  event.type = SMMU_EVT_F_WALK_EABT;
> >  event.u.f_walk_eabt.addr = addr;
> >  event.u.f_walk_eabt.rnw = flag & 0x1;
> > -event.u.f_walk_eabt.class = 0x1;
> > +event.u.f_walk_eabt.class = SMMU_CLASS_TT;
> >  event.u.f_walk_eabt.addr2 = ptw_info.addr;
> >  break;
> >  case SMMU_PTW_ERR_TRANSLATION:
> > @@ -950,6 +950,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  event.type = SMMU_EVT_F_TRANSLATION;
> >  event.u.f_translation.addr = addr;
> >  event.u.f_translation.addr2 = ptw_info.addr;
> > +event.u.f_translation.class = SMMU_CLASS_IN;
> >  event.u.f_translation.rnw = flag & 0x1;
> >  }
> >  break;
> > @@ -958,6 +959,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  event.type = SMMU_EVT_F_ADDR_SIZE;
> >  event.u.f_addr_size.addr = addr;
> >  event.u.f_addr_size.addr2 = ptw_info.addr;
> > +event.u.f_translation.class = SMMU_CLASS_IN;
> >  event.u.f_addr_size.rnw = flag & 0x1;
> >  }
> >  break;
> > @@ -966,6 +968,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  event.type = SMMU_EVT_F_ACCESS;
> >  event.u.f_access.addr = addr;
> >  event.u.f_access.addr2 = ptw_info.addr;
> > +event.u.f_translation.class = SMMU_CLASS_IN;
> >  event.u.f_access.rnw = flag & 0x1;
> >  }
> >  break;
> > @@ -974,6 +977,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  event.type = SMMU_EVT_F_PERMISSION;
> >  event.u.f_permission.addr = addr;
> >  event.u.f_permission.addr2 = ptw_info.addr;
> > +event.u.f_translation.class = SMMU_CLASS_IN;
> >  event.u.f_permission.rnw = flag & 0x1;
> >  }
> >  break;
> 



Re: [PATCH 07/20] qapi/parser: add semantic 'kind' parameter to QAPIDoc.Section

2024-05-16 Thread John Snow
On Thu, May 16, 2024, 2:18 AM Markus Armbruster  wrote:

> John Snow  writes:
>
> > When iterating all_sections, this is helpful to be able to distinguish
> > "members" from "features"; the only other way to do so is to
> > cross-reference these sections against QAPIDoc.args or QAPIDoc.features,
> > but if the desired end goal for QAPIDoc is to remove everything except
> > all_sections, we need *something* accessible to distinguish them.
> >
> > To keep types simple, add this semantic parameter to the base Section
> > and not just ArgSection; we can use this to filter out paragraphs and
> > tagged sections, too.
> >
> > Signed-off-by: John Snow 
> > ---
> >  scripts/qapi/parser.py | 25 -
> >  1 file changed, 16 insertions(+), 9 deletions(-)
> >
> > diff --git a/scripts/qapi/parser.py b/scripts/qapi/parser.py
> > index 161768b8b96..cf4cbca1c1f 100644
> > --- a/scripts/qapi/parser.py
> > +++ b/scripts/qapi/parser.py
> > @@ -613,21 +613,27 @@ class QAPIDoc:
> >
> >  class Section:
> >  # pylint: disable=too-few-public-methods
> > -def __init__(self, info: QAPISourceInfo,
> > - tag: Optional[str] = None):
> > +def __init__(
> > +self,
> > +info: QAPISourceInfo,
> > +tag: Optional[str] = None,
> > +kind: str = 'paragraph',
> > +):
> >  # section source info, i.e. where it begins
> >  self.info = info
> >  # section tag, if any ('Returns', '@name', ...)
> >  self.tag = tag
> >  # section text without tag
> >  self.text = ''
> > +# section type - {paragraph, feature, member, tagged}
> > +self.kind = kind
>
> Hmm.  .kind is almost redundant with .tag.
>

Almost, yes. But the crucial bit is members/features as you notice. That's
the real necessity here that saves a lot of code when relying on *only*
all_sections.

(If you want to remove the other fields leaving only all_sections behind,
this is strictly necessary.)


> Untagged section:.kind is 'paragraph', .tag is None
>
> Member description:  .kind is 'member', .tag matches @NAME
>
> Feature description: .kind is 'feature', .tag matches @NAME


> Tagged section:  .kind is 'tagged', .tag matches
>   r'Returns|Errors|Since|Notes?|Examples?|TODO'
>
> .kind can directly be derived from .tag except for member and feature
> descriptions.  And you want to tell these two apart in a straightforward
> manner in later patches, as you explain in your commit message.
>
> If .kind is 'member' or 'feature', then self must be an ArgSection.
> Worth a comment?  An assertion?
>

No real need. The classes don't differ much in practice so there's not much
benefit, and asserting it won't help the static typer out anyway because it
can't remember the inference from string->type anyway.

If you wanted to be FANCY, we could use string literal typing on the field
and restrict valid values per-class, but that's so needless not even I'm
tempted by it.


> Some time back, I considered changing .tag for member and feature
> descriptions to suitable strings, like your 'member' and 'feature', and
> move the member / feature name into ArgSection.  I didn't, because the
> benefit wasn't worth the churn at the time.  Perhaps it's worth it now.
> Would it result in simpler code than your solution?
>

Not considerably, I think. Would just be shuffling around which field names
I touch and where/when.

It might actually just add some lines where I have to assert isinstance to
do type narrowing in the generator.


> Terminology nit: the section you call 'paragraph' isn't actually a
> paragraph: it could be several paragraphs.  Best to call it 'untagged',
> as in .ensure_untagged_section().
>

Oh, I hate when you make a good point. I was avoiding the term because I'm
removing Notes and Examples, and we have plans to eliminate Since ... the
tagged sections are almost going away entirely, leaving just TODO, which we
ignore.

Uhm, can I name it paragraphs? :) or open to other suggestions, incl.
untagged if that's still your preference.


> >
> >  def append_line(self, line: str) -> None:
> >  self.text += line + '\n'
> >
> >  class ArgSection(Section):
> > -def __init__(self, info: QAPISourceInfo, tag: str):
> > -super().__init__(info, tag)
> > +def __init__(self, info: QAPISourceInfo, tag: str, kind: str):
> > +super().__init__(info, tag, kind)
> >  self.member: Optional['QAPISchemaMember'] = None
> >
> >  def connect(self, member: 'QAPISchemaMember') -> None:
>
> [...]
>
>


Re: [RFC PATCH v3 02/18] hw/arm/smmu: Fix IPA for stage-2 events

2024-05-16 Thread Mostafa Saleh
Hi Eric,

On Mon, May 13, 2024 at 01:47:44PM +0200, Eric Auger wrote:
> Hi Mostafa,
> 
> On 4/29/24 05:23, Mostafa Saleh wrote:
> > For the following events (ARM IHI 0070 F.b - 7.3 Event records):
> > - F_TRANSLATION
> > - F_ACCESS
> > - F_PERMISSION
> > - F_ADDR_SIZE
> >
> > If fault occurs at stage 2, S2 == 1 and:
> >   - If translating an IPA for a transaction (whether by input to
> > stage 2-only configuration, or after successful stage 1 translation),
> > CLASS == IN, and IPA is provided.
> CLASS == IN sounds a bit confusing here since the class value depends on
> what is being translated and class is not handled in that patch.
At this point only CLASS IN is used as nesting is not supported,
I will clarify that in the commit message.

> >
> > However, this was not implemented correctly, as for stage 2, we Qemu
> s/we QEMU/ the code
Will do.

> > only sets the  S2 bit but not the IPA.
> If this is a fix, please add the "Fixes:" tag and fixed commit sha1.
Will do.

> >
> > This field has the same bits as FetchAddr in F_WALK_EABT which is
> > populated correctly, so we don’t change that.
> > The population of this field should be done from the walker as the IPA 
> > address
> s/population/setting? I am not a native english speaker though
Me neither :), I will change it.

Thanks,
Mostafa
> > wouldn't be known in case of nesting.
> >
> > For stage 1, the spec says:
> >   If fault occurs at stage 1, S2 == 0 and:
> >   CLASS == IN, IPA is UNKNOWN.
> >
> > So, no need to set it to for stage 1, as ptw_info is initialised by zero in
> > smmuv3_translate().
> >
> > Signed-off-by: Mostafa Saleh 
> > ---
> >  hw/arm/smmu-common.c | 10 ++
> >  hw/arm/smmuv3.c  |  4 
> >  2 files changed, 10 insertions(+), 4 deletions(-)
> >
> > diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
> > index eb2356bc35..8a8c718e6b 100644
> > --- a/hw/arm/smmu-common.c
> > +++ b/hw/arm/smmu-common.c
> > @@ -448,7 +448,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
> >   */
> >  if (ipa >= (1ULL << inputsize)) {
> >  info->type = SMMU_PTW_ERR_TRANSLATION;
> > -goto error;
> > +goto error_ipa;
> >  }
> >  
> >  while (level < VMSA_LEVELS) {
> > @@ -494,13 +494,13 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
> >   */
> >  if (!PTE_AF(pte) && !cfg->s2cfg.affd) {
> >  info->type = SMMU_PTW_ERR_ACCESS;
> > -goto error;
> > +goto error_ipa;
> >  }
> >  
> >  s2ap = PTE_AP(pte);
> >  if (is_permission_fault_s2(s2ap, perm)) {
> >  info->type = SMMU_PTW_ERR_PERMISSION;
> > -goto error;
> > +goto error_ipa;
> >  }
> >  
> >  /*
> > @@ -509,7 +509,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
> >   */
> >  if (gpa >= (1ULL << cfg->s2cfg.eff_ps)) {
> >  info->type = SMMU_PTW_ERR_ADDR_SIZE;
> > -goto error;
> > +goto error_ipa;
> >  }
> >  
> >  tlbe->entry.translated_addr = gpa;
> > @@ -522,6 +522,8 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg,
> >  }
> >  info->type = SMMU_PTW_ERR_TRANSLATION;
> >  
> > +error_ipa:
> > +info->addr = ipa;
> >  error:
> >  info->stage = 2;
> >  tlbe->entry.perm = IOMMU_NONE;
> > diff --git a/hw/arm/smmuv3.c b/hw/arm/smmuv3.c
> > index 2d1e0d55ec..9dd3ea48e4 100644
> > --- a/hw/arm/smmuv3.c
> > +++ b/hw/arm/smmuv3.c
> > @@ -949,6 +949,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  if (PTW_RECORD_FAULT(cfg)) {
> >  event.type = SMMU_EVT_F_TRANSLATION;
> >  event.u.f_translation.addr = addr;
> > +event.u.f_translation.addr2 = ptw_info.addr;
> >  event.u.f_translation.rnw = flag & 0x1;
> >  }
> >  break;
> > @@ -956,6 +957,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  if (PTW_RECORD_FAULT(cfg)) {
> >  event.type = SMMU_EVT_F_ADDR_SIZE;
> >  event.u.f_addr_size.addr = addr;
> > +event.u.f_addr_size.addr2 = ptw_info.addr;
> >  event.u.f_addr_size.rnw = flag & 0x1;
> >  }
> >  break;
> > @@ -963,6 +965,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  if (PTW_RECORD_FAULT(cfg)) {
> >  event.type = SMMU_EVT_F_ACCESS;
> >  event.u.f_access.addr = addr;
> > +event.u.f_access.addr2 = ptw_info.addr;
> >  event.u.f_access.rnw = flag & 0x1;
> >  }
> >  break;
> > @@ -970,6 +973,7 @@ static IOMMUTLBEntry smmuv3_translate(IOMMUMemoryRegion 
> > *mr, hwaddr addr,
> >  if (PTW_RECORD_FAULT(cfg)) {
> >  event.type = SMMU_EVT_F_PERMISSION;
> >  event.u.f_permission.addr = addr;
> > +  

Re: [PATCH 02/13] s390_flic: add migration-enabled property

2024-05-16 Thread Marc Hartmayer
On Thu, May 09, 2024 at 07:00 PM +0200, Paolo Bonzini  
wrote:
> Instead of mucking with css_migration_enabled(), add a property specific to
> the FLIC device, similar to what is done for TYPE_S390_STATTRIB.
>
> Signed-off-by: Paolo Bonzini 
> ---
>  include/hw/s390x/s390_flic.h | 1 +
>  hw/intc/s390_flic.c  | 6 +-
>  hw/s390x/s390-virtio-ccw.c   | 1 +
>  3 files changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/include/hw/s390x/s390_flic.h b/include/hw/s390x/s390_flic.h
> index 3907a13d076..bcb081def58 100644
> --- a/include/hw/s390x/s390_flic.h
> +++ b/include/hw/s390x/s390_flic.h
> @@ -47,6 +47,7 @@ struct S390FLICState {
>  /* to limit AdapterRoutes.num_routes for compat */
>  uint32_t adapter_routes_max_batch;
>  bool ais_supported;
> +bool migration_enabled;
>  };
>  
>  
> diff --git a/hw/intc/s390_flic.c b/hw/intc/s390_flic.c
> index f4a848460b8..7f930800877 100644
> --- a/hw/intc/s390_flic.c
> +++ b/hw/intc/s390_flic.c
> @@ -405,6 +405,8 @@ static void qemu_s390_flic_class_init(ObjectClass *oc, 
> void *data)
>  static Property s390_flic_common_properties[] = {
>  DEFINE_PROP_UINT32("adapter_routes_max_batch", S390FLICState,
> adapter_routes_max_batch, ADAPTER_ROUTES_MAX_GSI),
> +DEFINE_PROP_BOOL("migration-enabled", S390FLICState,
> + migration_enabled, true),
>  DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -457,7 +459,9 @@ type_init(qemu_s390_flic_register_types)
>  
>  static bool adapter_info_so_needed(void *opaque)
>  {
> -return css_migration_enabled();
> +S390FLICState *fs = S390_FLIC_COMMON(opaque);
> +
> +return fs->migration_enabled;
>  }
>  
>  const VMStateDescription vmstate_adapter_info_so = {
> diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
> index feabc173eb3..1383e47eeb5 100644
> --- a/hw/s390x/s390-virtio-ccw.c
> +++ b/hw/s390x/s390-virtio-ccw.c
> @@ -1174,6 +1174,7 @@ static void ccw_machine_2_9_class_options(MachineClass 
> *mc)
>  S390CcwMachineClass *s390mc = S390_CCW_MACHINE_CLASS(mc);
>  static GlobalProperty compat[] = {
>  { TYPE_S390_STATTRIB, "migration-enabled", "off", },
> +{ TYPE_S390_FLIC_COMMON, "migration-enabled", "off", },
>  };
>  
>  ccw_machine_2_10_class_options(mc);
> -- 
> 2.45.0
>
>

This patch causes QEMU to crash when trying to save the domain state
(e.g. using libvirt)

$ cat dom.xml

  bug
  1048576
  2
  
hvm
/var/lib/libvirt/images/hades/vmlinux-s390x
/var/lib/libvirt/images/hades/ramdisk-s390x
  
  
/usr/bin/qemu-system-s390x

  

  

$ virsh create dom.xml
Domain 'bug' created from dom.xml
$ virsh save bug data
error: Failed to save domain 'bug' to data
error: operation failed: domain is not running

$ coredumpctl gdb
(gdb) bt
#0  object_dynamic_cast_assert (obj=0x2aa364aedd0, 
typename=typename@entry=0x2aa3491bd56 "s390-flic", 
file=file@entry=0x2aa34920c7a "/root/git/qemu/include/hw/s390x/s390_flic.h", 
line=line@entry=42, func=func@entry=0x2aa34a4b964 <__func__.14> 
"S390_FLIC_COMMON")
at ../qom/object.c:923
#1  0x02aa3459b518 in S390_FLIC_COMMON (obj=) at 
/root/git/qemu/include/hw/s390x/s390_flic.h:42
#2  adapter_info_so_needed (opaque=) at 
../hw/intc/s390_flic.c:462
#3  0x02aa348b24dc in vmstate_section_needed (vmsd=0x2aa34c80fe0 
, opaque=) at ../migration/vmstate.c:330
#4  vmstate_subsection_save (f=0x2aa36602bd0, vmsd=0x2aa34c80f78 
, opaque=0x2aa364aedd0, vmdesc=0x0) at 
../migration/vmstate.c:528
#5  vmstate_save_state_v (f=f@entry=0x2aa36602bd0, vmsd=0x2aa34c80f78 
, opaque=opaque@entry=0x2aa364aedd0, 
vmdesc=vmdesc@entry=0x0, version_id=version_id@entry=1, errp=0x0) at 
../migration/vmstate.c:443
#6  0x02aa348b2886 in vmstate_save_state (f=0x2aa36602bd0, vmsd=, opaque=0x2aa364aedd0, vmdesc_id=0x0) at ../migration/vmstate.c:341
#7  vmstate_save_state_v (f=f@entry=0x2aa36602bd0, vmsd=0x2aa34c80e50 
, opaque=opaque@entry=0x2aa364aedd0, 
vmdesc=vmdesc@entry=0x0, version_id=version_id@entry=1, errp=0x0) at 
../migration/vmstate.c:401
#8  0x02aa348b2886 in vmstate_save_state (f=0x2aa36602bd0, vmsd=, opaque=0x2aa364aedd0, vmdesc_id=0x0) at ../migration/vmstate.c:341
#9  vmstate_save_state_v (f=0x2aa36602bd0, vmsd=0x2aa34c6cdf0 
, opaque=, vmdesc=0x0, 
version_id=version_id@entry=1, errp=0x0) at ../migration/vmstate.c:401
#10 0x02aa348b2d7e in vmstate_save_state (f=, 
vmsd=, opaque=, vmdesc_id=) at 
../migration/vmstate.c:341
#11 0x02aa345c9726 in virtio_save (vdev=0x2aa364afe20, f=0x2aa36602bd0) at 
../hw/virtio/virtio.c:2808
#12 0x02aa348b23de in vmstate_save_state_v (f=f@entry=0x2aa36602bd0, 
vmsd=0x2aa34c80cd0 , opaque=, 
vmdesc=vmdesc@entry=0x2aa36602280, version_id=version_id@entry=3, 
errp=0x3ff73efb438) at ../migration/vmstate.c:408
#13 0x02aa348b2dbe in vmstate_save_state_with_err (f=f@entry=0x2aa36602bd0, 
vmsd=, opaque=, 
vmdesc_id=vmdesc_id@entry=0x2aa36602280, errp=errp@entry=0x3ff73efb438) at 

[RFC/PATCH v2 01/12] gunyah: UAPI header (NOT FOR MERGE)

2024-05-16 Thread Srivatsa Vaddagiri
Gunyah UAPI header file is provided for ease of use, until Gunyah kernel
driver is merged upstream. scripts/update-linux-headers.sh will be
updated via a separate patch after that for Qemu to pick up gunyah.h
UAPI header from Linux kernel sources.

This header file is based on the Gunyah driver present in Android-14.

https://android.googlesource.com/kernel/common
Branch: android14-6.1

Signed-off-by: Srivatsa Vaddagiri 
---
 linux-headers/linux/gunyah.h | 311 +++
 1 file changed, 311 insertions(+)
 create mode 100644 linux-headers/linux/gunyah.h

diff --git a/linux-headers/linux/gunyah.h b/linux-headers/linux/gunyah.h
new file mode 100644
index 00..c8d250a834
--- /dev/null
+++ b/linux-headers/linux/gunyah.h
@@ -0,0 +1,311 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights 
reserved.
+ */
+
+#ifndef _LINUX_GUNYAH_H
+#define _LINUX_GUNYAH_H
+
+/*
+ * Userspace interface for /dev/gunyah - gunyah based virtual machine
+ */
+
+#include 
+#include 
+
+#define GH_IOCTL_TYPE  'G'
+
+/*
+ * ioctls for /dev/gunyah fds:
+ */
+#define GH_CREATE_VM   _IO(GH_IOCTL_TYPE, 0x0) /* Returns a 
Gunyah VM fd */
+
+/*
+ * ioctls for VM fds
+ */
+
+/**
+ * enum gh_mem_flags - Possible flags on  gh_userspace_memory_region
+ * @GH_MEM_ALLOW_READ: Allow guest to read the memory
+ * @GH_MEM_ALLOW_WRITE: Allow guest to write to the memory
+ * @GH_MEM_ALLOW_EXEC: Allow guest to execute instructions in the memory
+ */
+enum gh_mem_flags {
+   GH_MEM_ALLOW_READ   = 1UL << 0,
+   GH_MEM_ALLOW_WRITE  = 1UL << 1,
+   GH_MEM_ALLOW_EXEC   = 1UL << 2,
+};
+
+/**
+ * struct gh_userspace_memory_region - Userspace memory description for 
GH_VM_SET_USER_MEM_REGION
+ * @label: Identifier to the region which is unique to the VM.
+ * @flags: Flags for memory parcel behavior. See  gh_mem_flags.
+ * @guest_phys_addr: Location of the memory region in guest's memory space 
(page-aligned)
+ * @memory_size: Size of the region (page-aligned)
+ * @userspace_addr: Location of the memory region in caller (userspace)'s 
memory
+ *
+ * See Documentation/virt/gunyah/vm-manager.rst for further details.
+ */
+struct gh_userspace_memory_region {
+   __u32 label;
+   __u32 flags;
+   __u64 guest_phys_addr;
+   __u64 memory_size;
+   __u64 userspace_addr;
+};
+
+#define GH_VM_SET_USER_MEM_REGION  _IOW(GH_IOCTL_TYPE, 0x1, \
+   struct 
gh_userspace_memory_region)
+
+/**
+ * struct gh_vm_dtb_config - Set the location of the VM's devicetree blob
+ * @guest_phys_addr: Address of the VM's devicetree in guest memory.
+ * @size: Maximum size of the devicetree including space for overlays.
+ *Resource manager applies an overlay to the DTB and dtb_size should
+ *include room for the overlay. A page of memory is typically plenty.
+ */
+struct gh_vm_dtb_config {
+   __u64 guest_phys_addr;
+   __u64 size;
+};
+#define GH_VM_SET_DTB_CONFIG   _IOW(GH_IOCTL_TYPE, 0x2, struct 
gh_vm_dtb_config)
+
+#define GH_VM_START_IO(GH_IOCTL_TYPE, 0x3)
+
+/**
+ * enum gh_fn_type - Valid types of Gunyah VM functions
+ * @GH_FN_VCPU: create a vCPU instance to control a vCPU
+ *   gh_fn_desc.arg is a pointer to  gh_fn_vcpu_arg
+ *  Return: file descriptor to manipulate the vcpu.
+ * @GH_FN_IRQFD: register eventfd to assert a Gunyah doorbell
+ *gh_fn_desc.arg is a pointer to  gh_fn_irqfd_arg
+ * @GH_FN_IOEVENTFD: register ioeventfd to trigger when VM faults on parameter
+ *gh_fn_desc.arg is a pointer to  
gh_fn_ioeventfd_arg
+ */
+enum gh_fn_type {
+   GH_FN_VCPU = 1,
+   GH_FN_IRQFD,
+   GH_FN_IOEVENTFD,
+};
+
+#define GH_FN_MAX_ARG_SIZE 256
+
+/**
+ * struct gh_fn_vcpu_arg - Arguments to create a vCPU.
+ * @id: vcpu id
+ *
+ * Create this function with _VM_ADD_FUNCTION using type _FN_VCPU.
+ *
+ * The vcpu type will register with the VM Manager to expect to control
+ * vCPU number `vcpu_id`. It returns a file descriptor allowing interaction 
with
+ * the vCPU. See the Gunyah vCPU API description sections for interacting with
+ * the Gunyah vCPU file descriptors.
+ */
+struct gh_fn_vcpu_arg {
+   __u32 id;
+};
+
+/**
+ * enum gh_irqfd_flags - flags for use in gh_fn_irqfd_arg
+ * @GH_IRQFD_FLAGS_LEVEL: make the interrupt operate like a level triggered
+ *interrupt on guest side. Triggering IRQFD before
+ *guest handles the interrupt causes interrupt to
+ *stay asserted.
+ */
+enum gh_irqfd_flags {
+   GH_IRQFD_FLAGS_LEVEL= 1UL << 0,
+};
+
+/**
+ * struct gh_fn_irqfd_arg - Arguments to create an irqfd function.
+ *
+ * Create this function with _VM_ADD_FUNCTION using type _FN_IRQFD.
+ *
+ * Allows setting an eventfd to directly trigger a 

[RFC/PATCH v2 06/12] gunyah: Add IRQFD and IOEVENTFD functions

2024-05-16 Thread Srivatsa Vaddagiri
IRQFD function allows registering of an @eventfd and @irq. @irq will be
injected inside guest when @eventfd is written into.

IOEVENTFD function allows registering an @eventfd and a guest physical
address, @addr, along with optional data. A poll() on @eventfd  will be
woken up when guest attempts to access @addr.

Signed-off-by: Srivatsa Vaddagiri 
---
 include/sysemu/gunyah_int.h |  1 +
 accel/gunyah/gunyah-all.c   | 94 +
 2 files changed, 95 insertions(+)

diff --git a/include/sysemu/gunyah_int.h b/include/sysemu/gunyah_int.h
index 0967b2d7d7..8c0b479f62 100644
--- a/include/sysemu/gunyah_int.h
+++ b/include/sysemu/gunyah_int.h
@@ -49,5 +49,6 @@ struct GUNYAHState {
 int gunyah_create_vm(void);
 int gunyah_vm_ioctl(int type, ...);
 void *gunyah_cpu_thread_fn(void *arg);
+int gunyah_add_irqfd(int irqfd, int label, Error **errp);
 
 #endif/* GUNYAH_INT_H */
diff --git a/accel/gunyah/gunyah-all.c b/accel/gunyah/gunyah-all.c
index 19f96225a0..948ccfbeee 100644
--- a/accel/gunyah/gunyah-all.c
+++ b/accel/gunyah/gunyah-all.c
@@ -24,11 +24,21 @@
 #include "qemu/error-report.h"
 #include "exec/address-spaces.h"
 #include "hw/boards.h"
+#include "qapi/error.h"
+#include "qemu/event_notifier.h"
 
 static void gunyah_region_add(MemoryListener *listener,
MemoryRegionSection *section);
 static void gunyah_region_del(MemoryListener *listener,
MemoryRegionSection *section);
+static void gunyah_mem_ioeventfd_add(MemoryListener *listener,
+  MemoryRegionSection *section,
+  bool match_data, uint64_t data,
+  EventNotifier *e);
+static void gunyah_mem_ioeventfd_del(MemoryListener *listener,
+  MemoryRegionSection *section,
+  bool match_data, uint64_t data,
+  EventNotifier *e);
 
 static int gunyah_ioctl(int type, ...)
 {
@@ -65,6 +75,8 @@ static MemoryListener gunyah_memory_listener = {
 .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 .region_add = gunyah_region_add,
 .region_del = gunyah_region_del,
+.eventfd_add = gunyah_mem_ioeventfd_add,
+.eventfd_del = gunyah_mem_ioeventfd_del,
 };
 
 int gunyah_create_vm(void)
@@ -319,6 +331,88 @@ void gunyah_set_swiotlb_size(uint64_t size)
 s->swiotlb_size = size;
 }
 
+int gunyah_add_irqfd(int irqfd, int label, Error **errp)
+{
+int ret;
+struct gh_fn_desc fdesc;
+struct gh_fn_irqfd_arg ghirqfd;
+
+fdesc.type = GH_FN_IRQFD;
+fdesc.arg_size = sizeof(struct gh_fn_irqfd_arg);
+fdesc.arg = (__u64)(&ghirqfd);
+
+ghirqfd.fd = irqfd;
+ghirqfd.label = label;
+ghirqfd.flags = GH_IRQFD_FLAGS_LEVEL;
+
+ret = gunyah_vm_ioctl(GH_VM_ADD_FUNCTION, &fdesc);
+if (ret) {
+error_setg_errno(errp, errno, "GH_FN_IRQFD failed");
+}
+
+return ret;
+}
+
+static int gunyah_set_ioeventfd_mmio(int fd, hwaddr addr,
+uint32_t size, uint32_t data, bool datamatch, bool assign)
+{
+int ret;
+struct gh_fn_ioeventfd_arg io;
+struct gh_fn_desc fdesc;
+
+io.fd = fd;
+io.datamatch = datamatch ? data : 0;
+io.len = size;
+io.addr = addr;
+io.flags = datamatch ? GH_IOEVENTFD_FLAGS_DATAMATCH : 0;
+
+fdesc.type = GH_FN_IOEVENTFD;
+fdesc.arg_size = sizeof(struct gh_fn_ioeventfd_arg);
+fdesc.arg = (__u64)(&io);
+
+if (assign) {
+ret = gunyah_vm_ioctl(GH_VM_ADD_FUNCTION, &fdesc);
+} else {
+ret = gunyah_vm_ioctl(GH_VM_REMOVE_FUNCTION, &fdesc);
+}
+
+return ret;
+}
+
+static void gunyah_mem_ioeventfd_add(MemoryListener *listener,
+  MemoryRegionSection *section,
+  bool match_data, uint64_t data,
+  EventNotifier *e)
+{
+int fd = event_notifier_get_fd(e);
+int r;
+
+r = gunyah_set_ioeventfd_mmio(fd, section->offset_within_address_space,
+   int128_get64(section->size), data, match_data,
+   true);
+if (r < 0) {
+error_report("error adding ioeventfd: %s", strerror(errno));
+exit(1);
+}
+}
+
+static void gunyah_mem_ioeventfd_del(MemoryListener *listener,
+  MemoryRegionSection *section,
+  bool match_data, uint64_t data,
+  EventNotifier *e)
+{
+int fd = event_notifier_get_fd(e);
+int r;
+
+r = gunyah_set_ioeventfd_mmio(fd, section->offset_within_address_space,
+   int128_get64(section->size), data, match_data,
+   false);
+if (r < 0) {
+error_report("error deleting ioeventfd: %s", strerror(errno));
+exit(1);
+}
+}
+
 void *gunyah_cpu_thread_fn(void *arg)
 {
 CPUState *cpu = arg;
-- 
2.25.1




[RFC/PATCH v2 12/12] gunyah: Documentation

2024-05-16 Thread Srivatsa Vaddagiri
Add gunyah.rst that provides some information on how to build and test
'gunyah' accelerator with open-source Gunyah hypervisor.

Signed-off-by: Srivatsa Vaddagiri 
---
 MAINTAINERS|   1 +
 docs/system/arm/gunyah.rst | 326 +
 2 files changed, 327 insertions(+)
 create mode 100644 docs/system/arm/gunyah.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index c42fdc2afd..2f69114814 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -532,6 +532,7 @@ F: include/sysemu/gunyah.h
 F: include/sysemu/gunyah_int.h
 F: target/arm/arm_gicv3_gunyah.c
 F: hw/intc/arm_gicv3_gunyah.c
+F: docs/system/arm/gunyah.rst
 
 WHPX CPUs
 M: Sunil Muthuswamy 
diff --git a/docs/system/arm/gunyah.rst b/docs/system/arm/gunyah.rst
new file mode 100644
index 00..971bfd30c1
--- /dev/null
+++ b/docs/system/arm/gunyah.rst
@@ -0,0 +1,326 @@
+'gunyah' accelerator (``gunyah``)
+=
+
+Gunyah is a high performance, scalable and flexible hypervisor built for
+demanding battery powered, real-time, safety and security use cases.
+
+The Gunyah Hypervisor open source project provides a reference Type-1 
hypervisor
+configuration suitable for general purpose hosting of multiple trusted and
+dependent VMs. Further information on open-source version of Gunyah Hypervisor
+can be obtained from:
+
+https://github.com/quic/gunyah-hypervisor
+
+To get started with open-source version of Gunyah Hypervisor, refer to the
+instructions available at:
+
+https://github.com/quic/gunyah-support-scripts
+
+Build and testing
+-
+
+Configure and build Qemu
+
+
+Apply the proposed patches for 'gunyah' accelerator support in Qemu and build
+it.
+
+.. code-block:: bash
+
+$ ./configure --target-list=aarch64-softmmu --enable-debug 
--enable-gunyah --static
+$ make -j4
+$ mv build/qemu-system-aarch64 build/qemu-gunyah
+
+Clone gunyah-support scripts
+
+
+.. code-block:: bash
+
+$ git clone https://github.com/quic/gunyah-support-scripts
+
+Instructions in this document to build and test Gunyah hypervisor was validated
+with the latest commit in gunyah-support-scripts being:
+
+6a959c8 tools: Fix permission and version related
+
+Patch gunyah-support scripts
+
+Apply below patch to gunyah-support scripts. This is required **temporarily** 
until
+the scripts can be updated to support Qemu as VMM (in addition to CrosVM) and
+also to fix some issues.
+
+.. code-block:: bash
+
+diff --git a/scripts/build-docker-img.sh b/scripts/build-docker-img.sh
+index 98e7881..a6aa774 100755
+--- a/scripts/build-docker-img.sh
++++ b/scripts/build-docker-img.sh
+@@ -38,7 +38,7 @@ DOCKER_OPTIONS=" build . "
+ #DOCKER_OPTIONS+=" --progress=plain "
+
+ #  no-cache alleviates some install errors for not finding some packages
+-#DOCKER_OPTIONS+=" --no-cache "
++DOCKER_OPTIONS+=" --no-cache "
+
+ # user environment related so the permissions will same as the host 
machine
+ DOCKER_OPTIONS+=" --build-arg UID=$(id -u) "
+diff --git a/scripts/core-utils/clone-linux.sh 
b/scripts/core-utils/clone-linux.sh
+index 714162e..2b79bc7 100755
+--- a/scripts/core-utils/clone-linux.sh
++++ b/scripts/core-utils/clone-linux.sh
+@@ -26,8 +26,7 @@ cd ${LINUX_DIR}/src
+ LINUX_VER="v6.5"
+ echo -e "\nCloning Linux ${LINUX_VER}:"
+ git clone \
+---depth=1 --progress -c advice.detachedHead=false \
+--b ${LINUX_VER}  \
++--progress -c advice.detachedHead=false \
+ https://github.com/torvalds/linux.git   || {
+echo "Unable to clone Linux"
+return
+@@ -58,7 +57,11 @@ echo "Installed b4 to ${LINUX_DIR}/tools/b4"
+
+ cd ${LINUX_DIR}/src/linux
+
+-${LINUX_DIR}/tools/b4/b4.sh shazam 
https://lore.kernel.org/all/20230613172054.3959700-1-quic_eber...@quicinc.com/
++
++${LINUX_DIR}/tools/b4/b4.sh am 
https://lore.kernel.org/all/20230613172054.3959700-1-quic_eber...@quicinc.com/
++git checkout -b v14_20230613_quic_eberman_quicinc_com 
858fd168a95c5b9669aac8db6c14a9aeab446375
++git am ./v14_20230613_quic_eberman_drivers_for_gunyah_hypervisor.mbx
++
+ echo "Applied gunyah drivers patch successfully"
+
+ echo "Generate gunyah.config"
+diff --git a/scripts/install-wsp-imgs.sh b/scripts/install-wsp-imgs.sh
+index 12150f3..32107e0 100755
+--- a/scripts/install-wsp-imgs.sh
++++ b/scripts/install-wsp-imgs.sh
+@@ -100,15 +100,23 @@ if [[ ! -f ${WORKSPACE}/run-qemu.sh ]]; then
+ cp ${BASE_DIR}/utils/run-qemu.sh ${WORKSPACE}/run-qemu.sh
+ fi
+
+-if [[ ! -f ${WORKSPACE}/crosvm/crosvm ]]; then
+-mkdir -p ${WORKSPACE}/crosvm
+-cd ${WORKSPACE}/crosvm
+-. clone-crosvm.sh
+-. build-crosvm.sh
+-
+-echo -e 'export CROSVM_FILE_PATH=${WORKSPACE}/crosvm/crosvm' >> 
${WORKSPACE}/.wsp-env
+-. 

[RFC/PATCH v2 07/12] gunyah: Add gicv3 interrupt controller

2024-05-16 Thread Srivatsa Vaddagiri
Gunyah hypervisor supports emulation of a GICv3 compatible interrupt
controller. Emulation is handled by hypervisor itself, with Qemu being
allowed to specify some of the properties such as IO address at which
GICv3 should be mapped in guest address space. These properties are
conveyed to hypervisor via the device-tree, which is parsed by
hypervisor (or more specifically Resource Manager VM, which is the
trusted agent of hypervisor), before VM begins execution.

Injection of interrupts inside guest is supported by doorbell API of
Gunyah hypervisor. Each doorbell is associated with a specific
interrupt. An eventfd is created and associated with each doorbell/irq.
Injection of a specific irq is accomplished by writing to the eventfd
associated with that irq.

Signed-off-by: Srivatsa Vaddagiri 
---
 MAINTAINERS|   2 +
 include/sysemu/gunyah_int.h|   3 +
 accel/gunyah/gunyah-all.c  |   5 ++
 hw/arm/virt.c  |   5 ++
 hw/intc/arm_gicv3_common.c |   3 +
 hw/intc/arm_gicv3_gunyah.c | 106 +
 hw/intc/arm_gicv3_its_common.c |   3 +
 hw/intc/meson.build|   1 +
 8 files changed, 128 insertions(+)
 create mode 100644 hw/intc/arm_gicv3_gunyah.c

diff --git a/MAINTAINERS b/MAINTAINERS
index d8d63b1c3a..d0289ded2f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -529,6 +529,8 @@ S: Maintained
 F: accel/gunyah
 F: include/sysemu/gunyah.h
 F: include/sysemu/gunyah_int.h
+F: target/arm/arm_gicv3_gunyah.c
+F: hw/intc/arm_gicv3_gunyah.c
 
 WHPX CPUs
 M: Sunil Muthuswamy 
diff --git a/include/sysemu/gunyah_int.h b/include/sysemu/gunyah_int.h
index 8c0b479f62..e19872dae2 100644
--- a/include/sysemu/gunyah_int.h
+++ b/include/sysemu/gunyah_int.h
@@ -44,11 +44,14 @@ struct GUNYAHState {
 int vmfd;
 uint64_t swiotlb_size;
 bool preshmem_reserved;
+uint32_t preshmem_size;
+uint32_t nr_irqs;
 };
 
 int gunyah_create_vm(void);
 int gunyah_vm_ioctl(int type, ...);
 void *gunyah_cpu_thread_fn(void *arg);
 int gunyah_add_irqfd(int irqfd, int label, Error **errp);
+GUNYAHState *get_gunyah_state(void);
 
 #endif/* GUNYAH_INT_H */
diff --git a/accel/gunyah/gunyah-all.c b/accel/gunyah/gunyah-all.c
index 948ccfbeee..d8c3758c59 100644
--- a/accel/gunyah/gunyah-all.c
+++ b/accel/gunyah/gunyah-all.c
@@ -413,6 +413,11 @@ static void gunyah_mem_ioeventfd_del(MemoryListener 
*listener,
 }
 }
 
+GUNYAHState *get_gunyah_state(void)
+{
+return GUNYAH_STATE(current_accel());
+}
+
 void *gunyah_cpu_thread_fn(void *arg)
 {
 CPUState *cpu = arg;
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 3b0fcf812f..bfb7f3d92e 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -694,6 +694,9 @@ static void create_its(VirtMachineState *vms)
 if (!vms->tcg_its) {
 itsclass = NULL;
 }
+} else if (!strcmp(itsclass, "arm-its-gunyah")) {
+/* ITS is not yet supported */
+itsclass = NULL;
 }
 
 if (!itsclass) {
@@ -1996,6 +1999,8 @@ static void finalize_gic_version(VirtMachineState *vms)
 gics_supported |= VIRT_GIC_VERSION_4_MASK;
 }
 }
+} else if (gunyah_enabled()) {
+gics_supported |= VIRT_GIC_VERSION_3_MASK;
 } else {
 error_report("Unsupported accelerator, can not determine GIC support");
 exit(1);
diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c
index bd50a1b079..ec05d31e1b 100644
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@@ -32,6 +32,7 @@
 #include "gicv3_internal.h"
 #include "hw/arm/linux-boot-if.h"
 #include "sysemu/kvm.h"
+#include "sysemu/gunyah.h"
 
 
 static void gicv3_gicd_no_migration_shift_bug_post_load(GICv3State *cs)
@@ -662,6 +663,8 @@ const char *gicv3_class_name(void)
 {
 if (kvm_irqchip_in_kernel()) {
 return "kvm-arm-gicv3";
+} else if (gunyah_enabled()) {
+return "gunyah-arm-gicv3";
 } else {
 if (kvm_enabled()) {
 error_report("Userspace GICv3 is not supported with KVM");
diff --git a/hw/intc/arm_gicv3_gunyah.c b/hw/intc/arm_gicv3_gunyah.c
new file mode 100644
index 00..f52e82bf9a
--- /dev/null
+++ b/hw/intc/arm_gicv3_gunyah.c
@@ -0,0 +1,106 @@
+/*
+ * QEMU Gunyah hypervisor support
+ *
+ * Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "cpu.h"
+#include "hw/intc/arm_gicv3_common.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "sysemu/gunyah.h"
+#include "sysemu/gunyah_int.h"
+#include "sysemu/runstate.h"
+#include "gicv3_internal.h"
+#include "vgic_common.h"
+#include "migration/blocker.h"
+#include "qom/object.h"
+#include "target/arm/cpregs.h"
+#include "qemu/event_notifier.h"
+
+struct GUNYAHARMGICv3Class {
+ARMGICv3CommonClass parent_class;
+DeviceRealize parent_realize;
+ResettablePhases parent_phases;
+};
+
+#define 

[RFC/PATCH v2 11/12] gunyah: Workarounds (NOT FOR MERGE)

2024-05-16 Thread Srivatsa Vaddagiri
These are some work-arounds required temporarily until some limitations
with Gunyah hypervisor are addressed.

Signed-off-by: Srivatsa Vaddagiri 
---
 include/sysemu/gunyah_int.h |  1 +
 accel/gunyah/gunyah-all.c   | 18 ++
 hw/arm/boot.c   | 17 -
 hw/arm/virt.c   |  3 ++-
 4 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/include/sysemu/gunyah_int.h b/include/sysemu/gunyah_int.h
index ef82c6edd8..bdda430dbd 100644
--- a/include/sysemu/gunyah_int.h
+++ b/include/sysemu/gunyah_int.h
@@ -62,5 +62,6 @@ int gunyah_add_irqfd(int irqfd, int label, Error **errp);
 GUNYAHState *get_gunyah_state(void);
 int gunyah_arch_put_registers(CPUState *cs, int level);
 void gunyah_cpu_synchronize_post_reset(CPUState *cpu);
+gunyah_slot *gunyah_find_slot_by_addr(uint64_t addr);
 
 #endif/* GUNYAH_INT_H */
diff --git a/accel/gunyah/gunyah-all.c b/accel/gunyah/gunyah-all.c
index 4c56dd8c73..bc106fbad6 100644
--- a/accel/gunyah/gunyah-all.c
+++ b/accel/gunyah/gunyah-all.c
@@ -158,6 +158,24 @@ static gunyah_slot *gunyah_find_overlap_slot(GUNYAHState 
*s,
 return NULL;
 }
 
+gunyah_slot *gunyah_find_slot_by_addr(uint64_t addr)
+{
+GUNYAHState *s = GUNYAH_STATE(current_accel());
+int i;
+gunyah_slot *slot = NULL;
+
+gunyah_slots_lock(s);
+for (i = 0; i < s->nr_slots; ++i) {
+slot = >slots[i];
+if (slot->size &&
+(addr >= slot->start && addr <= slot->start + slot->size))
+break;
+}
+gunyah_slots_unlock(s);
+
+return slot;
+}
+
 /* Called with s->slots_lock held */
 static gunyah_slot *gunyah_get_free_slot(GUNYAHState *s)
 {
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 84ea6a807a..a29b2cb0f9 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -413,7 +413,8 @@ static int fdt_add_memory_node(void *fdt, uint32_t acells, 
hwaddr mem_base,
 char *nodename;
 int ret;
 
-nodename = g_strdup_printf("/memory@%" PRIx64, mem_base);
+/* Workaround until RM can parse memory nodes of type memory@XYZ. */
+nodename = g_strdup_printf("/memory");
 qemu_fdt_add_subnode(fdt, nodename);
 qemu_fdt_setprop_string(fdt, nodename, "device_type", "memory");
 ret = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg", acells, mem_base,
@@ -661,6 +662,20 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info 
*binfo,
 binfo->modify_dtb(binfo, fdt);
 }
 
+/*
+ * Gunyah RM inspects and modifies device-tree (to provide additional
+ * information that VM may need). It depends on knowing total size reserved
+ * for device-tree (i.e FDT_MAX_SIZE) and current size (via @totalsize). At
+ * this point however, @totalsize = FDT_MAX_SIZE, making RM think that 
there
+ * is no room for modification and fail to start VM.
+ *
+ * RM should ideally pack device-tree so that @totalsize reflects the 
actual
+ * size before it attempts modification. Until RM is fixed, pack
+ * device-tree so that @toalsize reflects the actual size.
+ */
+
+fdt_pack(fdt);
+
 qemu_fdt_dumpdtb(fdt, size);
 
 /* Put the DTB into the memory map as a ROM image: this will ensure
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index b0132beddd..5f3075e748 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -188,7 +188,8 @@ static const MemMapEntry base_memmap[] = {
 [VIRT_PCIE_PIO] =   { 0x3eff, 0x0001 },
 [VIRT_PCIE_ECAM] =  { 0x3f00, 0x0100 },
 /* Actual RAM size depends on initial RAM and device memory settings */
-[VIRT_MEM] ={ GiB, LEGACY_RAMLIMIT_BYTES },
+/* Workaround until Gunyah can accept mapping that starts from GiB */
+[VIRT_MEM] ={ 2 * GiB, LEGACY_RAMLIMIT_BYTES },
 };
 
 /*
-- 
2.25.1




Re: [RFC/PATCH v2 00/12] Gunyah hypervisor support

2024-05-16 Thread Srivatsa Vaddagiri
* Srivatsa Vaddagiri  [2024-05-16 14:33:08]:

> Appreciate any quick comments you have. This is v2 that I intend to publish on
> qemu lists. Main changes since v1 is adding support for protected VM. 

Pls ignore this !




[RFC/PATCH v2 10/12] gunyah: CPU execution loop

2024-05-16 Thread Srivatsa Vaddagiri
Complete the cpu execution loop. At this time, we recognize exits
associated with only MMIO access. Future patches will add support for
recognizing other exit reasons, such as PSCI calls made by guest.

Signed-off-by: Srivatsa Vaddagiri 
---
 include/sysemu/gunyah_int.h |   9 ++
 accel/gunyah/gunyah-accel-ops.c |   7 +
 accel/gunyah/gunyah-all.c   | 252 +++-
 target/arm/gunyah.c |  18 +++
 4 files changed, 285 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/gunyah_int.h b/include/sysemu/gunyah_int.h
index e19872dae2..ef82c6edd8 100644
--- a/include/sysemu/gunyah_int.h
+++ b/include/sysemu/gunyah_int.h
@@ -46,12 +46,21 @@ struct GUNYAHState {
 bool preshmem_reserved;
 uint32_t preshmem_size;
 uint32_t nr_irqs;
+uint32_t vm_started;
+};
+
+struct AccelCPUState {
+int fd;
+struct gh_vcpu_run *run;
 };
 
 int gunyah_create_vm(void);
+void gunyah_start_vm(void);
 int gunyah_vm_ioctl(int type, ...);
 void *gunyah_cpu_thread_fn(void *arg);
 int gunyah_add_irqfd(int irqfd, int label, Error **errp);
 GUNYAHState *get_gunyah_state(void);
+int gunyah_arch_put_registers(CPUState *cs, int level);
+void gunyah_cpu_synchronize_post_reset(CPUState *cpu);
 
 #endif/* GUNYAH_INT_H */
diff --git a/accel/gunyah/gunyah-accel-ops.c b/accel/gunyah/gunyah-accel-ops.c
index af8a4ad606..f6a0d8d782 100644
--- a/accel/gunyah/gunyah-accel-ops.c
+++ b/accel/gunyah/gunyah-accel-ops.c
@@ -37,6 +37,11 @@ static void gunyah_accel_instance_init(Object *obj)
 s->vmfd = -1;
 }
 
+static void gunyah_setup_post(MachineState *ms, AccelState *accel)
+{
+gunyah_start_vm();
+}
+
 static void gunyah_accel_class_init(ObjectClass *oc, void *data)
 {
 AccelClass *ac = ACCEL_CLASS(oc);
@@ -44,6 +49,7 @@ static void gunyah_accel_class_init(ObjectClass *oc, void 
*data)
 ac->name = "GUNYAH";
 ac->init_machine = gunyah_init;
 ac->allowed = _allowed;
+ac->setup_post = gunyah_setup_post;
 }
 
 static const TypeInfo gunyah_accel_type = {
@@ -104,6 +110,7 @@ static void gunyah_accel_ops_class_init(ObjectClass *oc, 
void *data)
 ops->kick_vcpu_thread = gunyah_kick_vcpu_thread;
 ops->cpu_thread_is_idle = gunyah_vcpu_thread_is_idle;
 ops->check_capability = gunyah_check_capability;
+ops->synchronize_post_reset = gunyah_cpu_synchronize_post_reset;
 };
 
 static const TypeInfo gunyah_accel_ops_type = {
diff --git a/accel/gunyah/gunyah-all.c b/accel/gunyah/gunyah-all.c
index d8c3758c59..4c56dd8c73 100644
--- a/accel/gunyah/gunyah-all.c
+++ b/accel/gunyah/gunyah-all.c
@@ -26,6 +26,9 @@
 #include "hw/boards.h"
 #include "qapi/error.h"
 #include "qemu/event_notifier.h"
+#include "qemu/main-loop.h"
+#include "sysemu/runstate.h"
+#include "qemu/guest-random.h"
 
 static void gunyah_region_add(MemoryListener *listener,
MemoryRegionSection *section);
@@ -40,6 +43,18 @@ static void gunyah_mem_ioeventfd_del(MemoryListener 
*listener,
   bool match_data, uint64_t data,
   EventNotifier *e);
 
+/* Keep this here until Linux kernel UAPI header file (gunyah.h) is updated */
+enum gh_vm_exit_type {
+GH_RM_EXIT_TYPE_VM_EXIT = 0,
+GH_RM_EXIT_TYPE_PSCI_POWER_OFF = 1,
+GH_RM_EXIT_TYPE_PSCI_SYSTEM_RESET = 2,
+GH_RM_EXIT_TYPE_PSCI_SYSTEM_RESET2 = 3,
+GH_RM_EXIT_TYPE_WDT_BITE = 4,
+GH_RM_EXIT_TYPE_HYP_ERROR = 5,
+GH_RM_EXIT_TYPE_ASYNC_EXT_ABORT = 6,
+GH_RM_EXIT_TYPE_VM_FORCE_STOPPED = 7,
+};
+
 static int gunyah_ioctl(int type, ...)
 {
 void *arg;
@@ -70,6 +85,18 @@ int gunyah_vm_ioctl(int type, ...)
 return ioctl(s->vmfd, type, arg);
 }
 
+static int gunyah_vcpu_ioctl(CPUState *cpu, int type, ...)
+{
+void *arg;
+va_list ap;
+
+va_start(ap, type);
+arg = va_arg(ap, void *);
+va_end(ap);
+
+return ioctl(cpu->accel->fd, type, arg);
+}
+
 static MemoryListener gunyah_memory_listener = {
 .name = "gunyah",
 .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
@@ -282,6 +309,11 @@ static void gunyah_set_phys_mem(GUNYAHState *s,
 error_report("Overlapping slot registration not supported!");
 exit(1);
 }
+
+if (qatomic_read(>vm_started)) {
+error_report("Memory map changes after VM start not supported!");
+exit(1);
+}
 }
 
 if (area->readonly ||
@@ -418,13 +450,231 @@ GUNYAHState *get_gunyah_state(void)
 return GUNYAH_STATE(current_accel());
 }
 
+static void gunyah_ipi_signal(int sig)
+{
+if (current_cpu) {
+qatomic_set(_cpu->accel->run->immediate_exit, 1);
+}
+}
+
+static void gunyah_cpu_kick_self(void)
+{
+qatomic_set(_cpu->accel->run->immediate_exit, 1);
+}
+
+static int gunyah_init_vcpu(CPUState *cpu, Error **errp)
+{
+int ret;
+struct gh_fn_desc fdesc;
+struct gh_fn_vcpu_arg vcpu;
+struct sigaction sigact;
+sigset_t set;
+
+cpu->accel = g_new0(AccelCPUState, 1);
+
+/* init cpu 

[RFC/PATCH v2 08/12] gunyah: Specific device-tree location

2024-05-16 Thread Srivatsa Vaddagiri
Specify the location of device-tree and its size, as Gunyah requires the
device-tree to be parsed before VM can begin its execution.

Signed-off-by: Srivatsa Vaddagiri 
---
 MAINTAINERS   |  1 +
 include/sysemu/gunyah.h   |  2 ++
 accel/stubs/gunyah-stub.c |  5 +
 hw/arm/virt.c |  6 ++
 target/arm/gunyah.c   | 45 +++
 target/arm/meson.build|  3 +++
 6 files changed, 62 insertions(+)
 create mode 100644 target/arm/gunyah.c

diff --git a/MAINTAINERS b/MAINTAINERS
index d0289ded2f..c42fdc2afd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -527,6 +527,7 @@ GUNYAH
 M: Srivatsa Vaddagiri 
 S: Maintained
 F: accel/gunyah
+F: target/arm/gunyah.c
 F: include/sysemu/gunyah.h
 F: include/sysemu/gunyah_int.h
 F: target/arm/arm_gicv3_gunyah.c
diff --git a/include/sysemu/gunyah.h b/include/sysemu/gunyah.h
index 78cb80f01e..ba4862a1a6 100644
--- a/include/sysemu/gunyah.h
+++ b/include/sysemu/gunyah.h
@@ -29,4 +29,6 @@ typedef struct GUNYAHState GUNYAHState;
 DECLARE_INSTANCE_CHECKER(GUNYAHState, GUNYAH_STATE,
  TYPE_GUNYAH_ACCEL)
 
+int gunyah_arm_set_dtb(uint64_t dtb_start, uint64_t dtb_size);
+
 #endif  /* QEMU_GUNYAH_H */
diff --git a/accel/stubs/gunyah-stub.c b/accel/stubs/gunyah-stub.c
index 2028fa04c7..8f6e952938 100644
--- a/accel/stubs/gunyah-stub.c
+++ b/accel/stubs/gunyah-stub.c
@@ -16,3 +16,8 @@ void gunyah_set_swiotlb_size(uint64_t size)
 {
 return;
 }
+
+int gunyah_arm_set_dtb(__u64 dtb_start, __u64 dtb_size)
+{
+return -1;
+}
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index bfb7f3d92e..a485388d3c 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1738,6 +1738,12 @@ void virt_machine_done(Notifier *notifier, void *data)
 exit(1);
 }
 
+if (gunyah_enabled()) {
+if (gunyah_arm_set_dtb(info->dtb_start, vms->fdt_size)) {
+exit(1);
+}
+}
+
 fw_cfg_add_extra_pci_roots(vms->bus, vms->fw_cfg);
 
 virt_acpi_setup(vms);
diff --git a/target/arm/gunyah.c b/target/arm/gunyah.c
new file mode 100644
index 00..d655cd9a79
--- /dev/null
+++ b/target/arm/gunyah.c
@@ -0,0 +1,45 @@
+/*
+ * QEMU Gunyah hypervisor support
+ *
+ * Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "sysemu/gunyah.h"
+#include "sysemu/gunyah_int.h"
+#include "linux-headers/linux/gunyah.h"
+
+/*
+ * Specify location of device-tree in guest address space.
+ *
+ * @dtb_start - Guest physical address where VM's device-tree is found
+ * @dtb_size - Size of device-tree (and any free space after it).
+ *
+ * RM or Resource Manager VM is a trusted and privileged VM that works in
+ * collaboration with Gunyah hypevisor to setup resources for a VM before it 
can
+ * begin execution. One of its functions includes inspection/modification of a
+ * VM's device-tree before VM begins its execution. Modification can
+ * include specification of runtime resources allocated by hypervisor,
+ * details of which needs to be visible to VM.  VM's device-tree is modified
+ * "inline" making use of "free" space that could exist at the end of device
+ * tree.
+ */
+int gunyah_arm_set_dtb(uint64_t dtb_start, uint64_t dtb_size)
+{
+int ret;
+struct gh_vm_dtb_config dtb;
+
+dtb.guest_phys_addr = dtb_start;
+dtb.size = dtb_size;
+
+ret = gunyah_vm_ioctl(GH_VM_SET_DTB_CONFIG, );
+if (ret != 0) {
+error_report("GH_VM_SET_DTB_CONFIG failed: %s", strerror(errno));
+exit(1);
+}
+
+return 0;
+}
diff --git a/target/arm/meson.build b/target/arm/meson.build
index 2e10464dbb..951226b0a2 100644
--- a/target/arm/meson.build
+++ b/target/arm/meson.build
@@ -25,6 +25,9 @@ arm_system_ss.add(files(
   'machine.c',
   'ptw.c',
 ))
+arm_system_ss.add(when: 'CONFIG_GUNYAH', if_true: files(
+  'gunyah.c',
+))
 
 arm_user_ss = ss.source_set()
 
-- 
2.25.1




[RFC/PATCH v2 09/12] gunyah: Customize device-tree

2024-05-16 Thread Srivatsa Vaddagiri
Customize the device-tree with Gunyah-specific properties. Some of these
properties include the specification of doorbells that need to be created
and associated with various interrupts.

Signed-off-by: Srivatsa Vaddagiri 
---
 include/sysemu/gunyah.h   |  2 +
 accel/stubs/gunyah-stub.c |  5 +++
 hw/arm/virt.c | 11 ++
 target/arm/gunyah.c   | 79 +++
 4 files changed, 97 insertions(+)

diff --git a/include/sysemu/gunyah.h b/include/sysemu/gunyah.h
index ba4862a1a6..58d0379b72 100644
--- a/include/sysemu/gunyah.h
+++ b/include/sysemu/gunyah.h
@@ -30,5 +30,7 @@ DECLARE_INSTANCE_CHECKER(GUNYAHState, GUNYAH_STATE,
  TYPE_GUNYAH_ACCEL)
 
 int gunyah_arm_set_dtb(uint64_t dtb_start, uint64_t dtb_size);
+void gunyah_arm_fdt_customize(void *fdt, uint64_t mem_base,
+uint32_t gic_phandle);
 
 #endif  /* QEMU_GUNYAH_H */
diff --git a/accel/stubs/gunyah-stub.c b/accel/stubs/gunyah-stub.c
index 8f6e952938..19649ea40b 100644
--- a/accel/stubs/gunyah-stub.c
+++ b/accel/stubs/gunyah-stub.c
@@ -21,3 +21,8 @@ int gunyah_arm_set_dtb(__u64 dtb_start, __u64 dtb_size)
 {
 return -1;
 }
+
+void gunyah_arm_fdt_customize(void *fdt, uint64_t mem_base,
+uint32_t gic_phandle) {
+return;
+}
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index a485388d3c..b0132beddd 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2214,6 +2214,14 @@ static void fdt_add_reserved_memory(VirtMachineState 
*vms)
 g_free(nodename);
 }
 
+static void virt_modify_dtb(const struct arm_boot_info *binfo, void *fdt)
+{
+const VirtMachineState *vms = container_of(binfo, VirtMachineState,
+ bootinfo);
+
+gunyah_arm_fdt_customize(fdt, vms->memmap[VIRT_MEM].base, 
vms->gic_phandle);
+}
+
 static void machvirt_init(MachineState *machine)
 {
 VirtMachineState *vms = VIRT_MACHINE(machine);
@@ -2533,6 +2541,9 @@ static void machvirt_init(MachineState *machine)
 vms->bootinfo.skip_dtb_autoload = true;
 vms->bootinfo.firmware_loaded = firmware_loaded;
 vms->bootinfo.psci_conduit = vms->psci_conduit;
+if (gunyah_enabled()) {
+vms->bootinfo.modify_dtb = virt_modify_dtb;
+}
 arm_load_kernel(ARM_CPU(first_cpu), machine, >bootinfo);
 
 vms->machine_done.notify = virt_machine_done;
diff --git a/target/arm/gunyah.c b/target/arm/gunyah.c
index d655cd9a79..c33a0c0615 100644
--- a/target/arm/gunyah.c
+++ b/target/arm/gunyah.c
@@ -11,6 +11,9 @@
 #include "sysemu/gunyah.h"
 #include "sysemu/gunyah_int.h"
 #include "linux-headers/linux/gunyah.h"
+#include "exec/memory.h"
+#include "sysemu/device_tree.h"
+#include "hw/arm/fdt.h"
 
 /*
  * Specify location of device-tree in guest address space.
@@ -43,3 +46,79 @@ int gunyah_arm_set_dtb(uint64_t dtb_start, uint64_t dtb_size)
 
 return 0;
 }
+
+void gunyah_arm_fdt_customize(void *fdt, uint64_t mem_base,
+uint32_t gic_phandle)
+{
+char *nodename;
+int i;
+GUNYAHState *state = get_gunyah_state();
+
+qemu_fdt_add_subnode(fdt, "/gunyah-vm-config");
+qemu_fdt_setprop_string(fdt, "/gunyah-vm-config",
+"image-name", "qemu-vm");
+qemu_fdt_setprop_string(fdt, "/gunyah-vm-config", "os-type", "linux");
+
+nodename = g_strdup_printf("/gunyah-vm-config/memory");
+qemu_fdt_add_subnode(fdt, nodename);
+qemu_fdt_setprop_cell(fdt, nodename, "#address-cells", 2);
+qemu_fdt_setprop_cell(fdt, nodename, "#size-cells", 2);
+qemu_fdt_setprop_u64(fdt, nodename, "base-address", mem_base);
+
+g_free(nodename);
+
+nodename = g_strdup_printf("/gunyah-vm-config/interrupts");
+qemu_fdt_add_subnode(fdt, nodename);
+qemu_fdt_setprop_cell(fdt, nodename, "config", gic_phandle);
+g_free(nodename);
+
+nodename = g_strdup_printf("/gunyah-vm-config/vcpus");
+qemu_fdt_add_subnode(fdt, nodename);
+qemu_fdt_setprop_string(fdt, nodename, "affinity", "proxy");
+g_free(nodename);
+
+nodename = g_strdup_printf("/gunyah-vm-config/vdevices");
+qemu_fdt_add_subnode(fdt, nodename);
+qemu_fdt_setprop_string(fdt, nodename, "generate", "/hypervisor");
+g_free(nodename);
+
+for (i = 0; i < state->nr_slots; ++i) {
+if (!state->slots[i].start || state->slots[i].lend ||
+state->slots[i].start == mem_base) {
+continue;
+}
+
+nodename = g_strdup_printf("/gunyah-vm-config/vdevices/shm-%x", i);
+qemu_fdt_add_subnode(fdt, nodename);
+qemu_fdt_setprop_string(fdt, nodename, "vdevice-type", "shm");
+qemu_fdt_setprop_string(fdt, nodename, "push-compatible", "dma");
+qemu_fdt_setprop(fdt, nodename, "peer-default", NULL, 0);
+qemu_fdt_setprop_u64(fdt, nodename, "dma_base", 0);
+g_free(nodename);
+
+nodename = g_strdup_printf("/gunyah-vm-config/vdevices/shm-%x/memory",
+i);
+ 

[RFC/PATCH v2 04/12] gunyah: Basic support

2024-05-16 Thread Srivatsa Vaddagiri
Add a new accelerator, gunyah, with basic functionality of creating a
VM. Subsequent patches will add support for other functions required to
run a VM.

Signed-off-by: Srivatsa Vaddagiri 
---
 MAINTAINERS |   7 ++
 docs/about/build-platforms.rst  |   2 +-
 meson.build |  12 +++-
 include/sysemu/gunyah.h |  30 
 include/sysemu/gunyah_int.h |  27 +++
 accel/gunyah/gunyah-accel-ops.c | 121 
 accel/gunyah/gunyah-all.c   |  70 ++
 accel/stubs/gunyah-stub.c   |  13 
 hw/arm/virt.c   |   3 +
 target/arm/cpu.c|   3 +-
 target/arm/cpu64.c  |   5 +-
 accel/Kconfig   |   3 +
 accel/gunyah/meson.build|   7 ++
 accel/meson.build   |   1 +
 accel/stubs/meson.build |   1 +
 meson_options.txt   |   2 +
 scripts/meson-buildoptions.sh   |   3 +
 17 files changed, 305 insertions(+), 5 deletions(-)
 create mode 100644 include/sysemu/gunyah.h
 create mode 100644 include/sysemu/gunyah_int.h
 create mode 100644 accel/gunyah/gunyah-accel-ops.c
 create mode 100644 accel/gunyah/gunyah-all.c
 create mode 100644 accel/stubs/gunyah-stub.c
 create mode 100644 accel/gunyah/meson.build

diff --git a/MAINTAINERS b/MAINTAINERS
index 84391777db..d8d63b1c3a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -523,6 +523,13 @@ F: accel/hvf/
 F: include/sysemu/hvf.h
 F: include/sysemu/hvf_int.h
 
+GUNYAH
+M: Srivatsa Vaddagiri 
+S: Maintained
+F: accel/gunyah
+F: include/sysemu/gunyah.h
+F: include/sysemu/gunyah_int.h
+
 WHPX CPUs
 M: Sunil Muthuswamy 
 S: Supported
diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst
index 8fd7da140a..47e314d562 100644
--- a/docs/about/build-platforms.rst
+++ b/docs/about/build-platforms.rst
@@ -40,7 +40,7 @@ Those hosts are officially supported, with various 
accelerators:
* - CPU Architecture
  - Accelerators
* - Arm
- - kvm (64 bit only), tcg, xen
+ - kvm (64 bit only), tcg, xen, gunyah (64 bit only)
* - MIPS (little endian only)
  - kvm, tcg
* - PPC
diff --git a/meson.build b/meson.build
index 43da492372..25ce20b1e0 100644
--- a/meson.build
+++ b/meson.build
@@ -248,7 +248,8 @@ accelerator_targets += { 'CONFIG_XEN': xen_targets }
 
 if cpu in ['aarch64']
   accelerator_targets += {
-'CONFIG_HVF': ['aarch64-softmmu']
+'CONFIG_HVF': ['aarch64-softmmu'],
+'CONFIG_GUNYAH': ['aarch64-softmmu']
   }
 endif
 
@@ -736,6 +737,11 @@ if get_option('hvf').allowed()
   endif
 endif
 
+gunyah = not_found
+if get_option('gunyah').allowed() and host_os == 'linux'
+accelerators += 'CONFIG_GUNYAH'
+endif
+
 nvmm = not_found
 if host_os == 'netbsd'
   nvmm = cc.find_library('nvmm', required: get_option('nvmm'))
@@ -882,6 +888,9 @@ elif get_option('plugins')
 else
   gmodule = not_found
 endif
+if 'CONFIG_GUNYAH' not in accelerators and get_option('gunyah').enabled()
+  error('Gunyah not available on this platform')
+endif
 
 # This workaround is required due to a bug in pkg-config file for glib as it
 # doesn't define GLIB_STATIC_COMPILATION for pkg-config --static
@@ -4270,6 +4279,7 @@ if have_system
 summary_info += {'xen ctrl version':  xen.version()}
   endif
   summary_info += {'Xen emulation': 
config_all_devices.has_key('CONFIG_XEN_EMU')}
+  summary_info += {'Gunyah support':
config_all_accel.has_key('CONFIG_GUNYAH')}
 endif
 summary_info += {'TCG support':   config_all_accel.has_key('CONFIG_TCG')}
 if config_all_accel.has_key('CONFIG_TCG')
diff --git a/include/sysemu/gunyah.h b/include/sysemu/gunyah.h
new file mode 100644
index 00..4f26938521
--- /dev/null
+++ b/include/sysemu/gunyah.h
@@ -0,0 +1,30 @@
+/*
+ * QEMU Gunyah hypervisor support
+ *
+ * Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+/* header to be included in non-Gunyah-specific code */
+
+#ifndef QEMU_GUNYAH_H
+#define QEMU_GUNYAH_H
+
+#include "qemu/accel.h"
+#include "qom/object.h"
+
+#ifdef NEED_CPU_H
+#include "cpu.h"
+#endif
+
+extern bool gunyah_allowed;
+
+#define gunyah_enabled() (gunyah_allowed)
+
+#define TYPE_GUNYAH_ACCEL ACCEL_CLASS_NAME("gunyah")
+typedef struct GUNYAHState GUNYAHState;
+DECLARE_INSTANCE_CHECKER(GUNYAHState, GUNYAH_STATE,
+ TYPE_GUNYAH_ACCEL)
+
+#endif  /* QEMU_GUNYAH_H */
diff --git a/include/sysemu/gunyah_int.h b/include/sysemu/gunyah_int.h
new file mode 100644
index 00..37de628b37
--- /dev/null
+++ b/include/sysemu/gunyah_int.h
@@ -0,0 +1,27 @@
+/*
+ * QEMU Gunyah hypervisor support
+ *
+ * Copyright(c) 2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+/* header to be included in Gunyah-specific code */
+
+#ifndef GUNYAH_INT_H
+#define GUNYAH_INT_H
+
+#include "qemu/accel.h"
+#include "qemu/typedefs.h"
+
+struct GUNYAHState {
+

  1   2   >