date:20230628

Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-28 Thread Akihiko Odaki


On 2023/06/29 13:07, Ani Sinha wrote:

PCI Express ports only have one slot, so PCI Express devices can only be
plugged into slot 0 on a PCIE port. Enforce it.

The change has been tested to not break ARI by instantiating seven vfs on an
emulated igb device (the maximum number of vfs the linux igb driver supports).
The vfs are seen to have non-zero device/slot numbers in the conventional
PCI BDF representation.

CC: jus...@redhat.com
CC: imamm...@redhat.com
CC: akihiko.od...@daynix.com

Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
Signed-off-by: Ani Sinha 
Reviewed-by: Julia Suvorova 
---
  hw/pci/pci.c | 15 +++
  1 file changed, 15 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e2eb4c3b4a..0320ac2bb3 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -65,6 +65,7 @@ bool pci_available = true;
  static char *pcibus_get_dev_path(DeviceState *dev);
  static char *pcibus_get_fw_dev_path(DeviceState *dev);
  static void pcibus_reset(BusState *qbus);
+static bool pcie_has_upstream_port(PCIDevice *dev);
  
  static Property pci_props[] = {

  DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
@@ -1190,6 +1191,20 @@ static PCIDevice *do_pci_register_device(PCIDevice 
*pci_dev,
 name);
  
 return NULL;

+} /*
+   * With SRIOV and ARI, vfs can have non-zero slot in the conventional
+   * PCI interpretation as all five bits reserved for slot addresses are
+   * also used for function bits for the various vfs. Ignore that case.
+   * It is too early here to check for ARI capabilities in the PCI config
+   * space. Hence, we check for a vf device instead.
+   */


Why don't just perform this check after the capabilities are set?

Regards,
Akihiko Odaki


+else if (!pci_is_vf(pci_dev) &&
+ pcie_has_upstream_port(pci_dev) &&
+ PCI_SLOT(devfn)) {
+error_setg(errp, "PCI: slot %d is not valid for %s,"
+   " parent device only allows plugging into slot 0.",
+   PCI_SLOT(devfn), name);
+return NULL;
  }
  
  pci_dev->devfn = devfn;

Re: [PATCH 2/6] tests/qtest: migration: Expose migrate_set_capability

2023-06-28 Thread Thomas Huth


On 28/06/2023 18.55, Fabiano Rosas wrote:

The following patch will make use of this function from within
migrate-helpers.c, so move it there.

Signed-off-by: Fabiano Rosas 
Reviewed-by: Juan Quintela 
---
  tests/qtest/migration-helpers.c | 11 +++
  tests/qtest/migration-helpers.h |  3 +++
  tests/qtest/migration-test.c| 11 ---
  3 files changed, 14 insertions(+), 11 deletions(-)


Reviewed-by: Thomas Huth

Re: [PATCH v3 6/6] target/ppc: Remove pointless checks of CONFIG_USER_ONLY in 'kvm_ppc.h'

2023-06-28 Thread Greg Kurz

On Tue, 27 Jun 2023 13:51:24 +0200
Philippe Mathieu-Daudé  wrote:

> Signed-off-by: Philippe Mathieu-Daudé 
> ---

Reviewed-by: Greg Kurz 

>  target/ppc/kvm_ppc.h | 5 -
>  1 file changed, 5 deletions(-)
> 
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index 901e188c9a..6a4dd9c560 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -42,7 +42,6 @@ int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu);
>  target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu,
>   bool radix, bool gtse,
>   uint64_t proc_tbl);
> -#ifndef CONFIG_USER_ONLY
>  bool kvmppc_spapr_use_multitce(void);
>  int kvmppc_spapr_enable_inkernel_multitce(void);
>  void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift,
> @@ -52,7 +51,6 @@ int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t 
> window_size);
>  int kvmppc_reset_htab(int shift_hint);
>  uint64_t kvmppc_vrma_limit(unsigned int hash_shift);
>  bool kvmppc_has_cap_spapr_vfio(void);
> -#endif /* !CONFIG_USER_ONLY */
>  bool kvmppc_has_cap_epr(void);
>  int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function);
>  int kvmppc_get_htab_fd(bool write, uint64_t index, Error **errp);
> @@ -262,7 +260,6 @@ static inline void kvmppc_set_reg_tb_offset(PowerPCCPU 
> *cpu, int64_t tb_offset)
>  {
>  }
>  
> -#ifndef CONFIG_USER_ONLY
>  static inline bool kvmppc_spapr_use_multitce(void)
>  {
>  return false;
> @@ -322,8 +319,6 @@ static inline void kvmppc_write_hpte(hwaddr ptex, 
> uint64_t pte0, uint64_t pte1)
>  abort();
>  }
>  
> -#endif /* !CONFIG_USER_ONLY */
> -
>  static inline bool kvmppc_has_cap_epr(void)
>  {
>  return false;



-- 
Greg

Re: [PATCH v3 4/6] target/ppc: Define TYPE_HOST_POWERPC_CPU in cpu-qom.h

2023-06-28 Thread Greg Kurz

On Tue, 27 Jun 2023 13:51:22 +0200
Philippe Mathieu-Daudé  wrote:

> TYPE_HOST_POWERPC_CPU is used in various places of cpu_init.c,
> in order to restrict "kvm_ppc.h" to sysemu, move this QOM-related
> definition to cpu-qom.h.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---

Reviewed-by: Greg Kurz 

>  target/ppc/cpu-qom.h | 2 ++
>  target/ppc/kvm_ppc.h | 2 --
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
> index c2bff349cc..4e4061068e 100644
> --- a/target/ppc/cpu-qom.h
> +++ b/target/ppc/cpu-qom.h
> @@ -36,6 +36,8 @@ OBJECT_DECLARE_CPU_TYPE(PowerPCCPU, PowerPCCPUClass, 
> POWERPC_CPU)
>  #define CPU_RESOLVING_TYPE TYPE_POWERPC_CPU
>  #define cpu_list ppc_cpu_list
>  
> +#define TYPE_HOST_POWERPC_CPU POWERPC_CPU_TYPE_NAME("host")
> +
>  ObjectClass *ppc_cpu_class_by_name(const char *name);
>  
>  typedef struct CPUArchState CPUPPCState;
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index 49954a300b..901e188c9a 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -13,8 +13,6 @@
>  #include "exec/hwaddr.h"
>  #include "cpu.h"
>  
> -#define TYPE_HOST_POWERPC_CPU POWERPC_CPU_TYPE_NAME("host")
> -
>  #ifdef CONFIG_KVM
>  
>  uint32_t kvmppc_get_tbfreq(void);



-- 
Greg

Re: [PATCH v3 5/6] target/ppc: Restrict 'kvm_ppc.h' to sysemu in cpu_init.c

2023-06-28 Thread Greg Kurz

On Tue, 27 Jun 2023 13:51:23 +0200
Philippe Mathieu-Daudé  wrote:

> User emulation shouldn't need any of the KVM prototypes
> declared in "kvm_ppc.h".
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---

Reviewed-by: Greg Kurz 

>  target/ppc/cpu_init.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
> index aeff71d063..f2afb539eb 100644
> --- a/target/ppc/cpu_init.c
> +++ b/target/ppc/cpu_init.c
> @@ -21,7 +21,6 @@
>  #include "qemu/osdep.h"
>  #include "disas/dis-asm.h"
>  #include "gdbstub/helpers.h"
> -#include "kvm_ppc.h"
>  #include "sysemu/cpus.h"
>  #include "sysemu/hw_accel.h"
>  #include "sysemu/tcg.h"
> @@ -49,6 +48,7 @@
>  #ifndef CONFIG_USER_ONLY
>  #include "hw/boards.h"
>  #include "hw/intc/intc.h"
> +#include "kvm_ppc.h"
>  #endif
>  
>  /* #define PPC_DEBUG_SPR */



-- 
Greg

Re: [PATCH] MAINTAINERS: Promote Cédric to VFIO co-maintainer

2023-06-28 Thread Cédric Le Goater


On 6/28/23 19:29, Alex Williamson wrote:

Cédric has stepped up involvement in vfio, reviewing and managing
patches, as well as pull requests.  This work deserves gratitude and
punishment with a promotion to co-maintainer ;)


:)


Signed-off-by: Alex Williamson 


Acked-by: Cédric Le Goater 



---

Cédric, I'd also support if you wanted to add a tree entry here.


OK. I had some habits on github. For VFIO, I might change for gitlab
where the CI runs. It can come later.

Thanks,

C.




  MAINTAINERS | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index e07746ac7d45..37e48c72b16a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2051,7 +2051,7 @@ F: hw/usb/dev-serial.c
  
  VFIO

  M: Alex Williamson 
-R: Cédric Le Goater 
+M: Cédric Le Goater 
  S: Supported
  F: hw/vfio/*
  F: include/hw/vfio/

Re: [RFC PATCH 0/3] ppc/pnv: SMT support for powernv

2023-06-28 Thread Cédric Le Goater


On 6/29/23 04:16, Nicholas Piggin wrote:

These patches implement enough to boot a SMT powernv machine to Linux
and boot a SMP KVM guest inside that.

There are a few more SPRs that need to be done, and per-LPAR SPRs are
mostly not annotated yet so it can't run in 1LPAR mode. But it is
enough to run skiboot/Linux with SMT so I'll just post the minimal
patches for RFC because the concept isn't really different to add
more SPRs and things.


This is good material for 8.1. Please send a v1 and Cc: all pnv reviewers.

Thanks,

C.

Re: [PULL 00/23] Block layer patches

2023-06-28 Thread Richard Henderson


On 6/28/23 16:15, Kevin Wolf wrote:

The following changes since commit 52ed34cbddde1cb89b2ac263e758e349a77f21e1:

   Merge tag 'pull-request-2023-06-26' ofhttps://gitlab.com/thuth/qemu  into 
staging (2023-06-26 10:38:41 +0200)

are available in the Git repository at:

   https://repo.or.cz/qemu/kevin.git  tags/for-upstream

for you to fetch changes up to 17362398ee1a7f04e8006a46333145d8b707fd35:

   block: use bdrv_co_debug_event in coroutine context (2023-06-28 09:46:34 
+0200)


Block layer patches

- Re-enable the graph lock
- More fixes to coroutine_fn marking


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/8.1 as 
appropriate.


r~

Re: [PATCH qemu] aspeed add montblanc bmc reference from fuji

2023-06-28 Thread Cédric Le Goater


On 6/29/23 04:57, Sittisak Sinprem wrote:

Hi Cédric,

I had fixed the function name to support in current branch,
but facing about below error while starting

./build/qemu-system-arm -machine montblanc-bmc  -drive 
file=~/flash-montblanc,format=raw,if=mtd -nographic -netdev 
tap,id=netdev0,script=no,downscript=no,ifname=tap0 -net 
nic,netdev=netdev0,model=ftgmac100

qemu-system-arm: device requires 134217728 bytes, block backend provides 
27726336 bytes


File ~/flash-montblanc is too big for the mx66l1g45g flash device and
128MB is expected. Use truncate maybe.

Thanks,

C.

Re: [RFC PATCH 3/3] ppc/pnv: SMT support for powernv

2023-06-28 Thread Cédric Le Goater


On 6/29/23 04:16, Nicholas Piggin wrote:

Set the TIR default value with the SMT thread index, and place some
standard limits on SMT configurations. Now powernv is able to boot
skiboot and Linux with a SMT topology, including booting a KVM guest.

There are several other per-core SPRs, but they are not so important to
run OPAL/Linux. Some important per-LPAR ones to convert before powernv
could run in 1LPAR mode. Broadcast msgsnd is not yet implemented either,
but skiboot/Linux does not use that. KVM uses an implementation-specific
detail of POWER9/10 TLBs where TLBIEL invalidates translations of all
threads on a core, but that is not required here because KVM does not
cache translations across PID or LPID switch. Most of these I have or
aren't too hard to implement, but I start with a small bare bones for
comments.

Not-yet-Signed-off-by: Nicholas Piggin 


I am glad this becoming possible. You can model the missing parts
later on.

Reviewed-by: Cédric Le Goater 

Thanks,

C.


---
  hw/ppc/pnv.c  | 12 
  hw/ppc/pnv_core.c | 13 +
  2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index fc083173f3..f599ccad1d 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -887,6 +887,18 @@ static void pnv_init(MachineState *machine)
  
  pnv->num_chips =

  machine->smp.max_cpus / (machine->smp.cores * machine->smp.threads);
+
+if (machine->smp.threads > 8) {
+error_report("Cannot support more than 8 threads/core "
+ "on a powernv machine");
+exit(1);
+}
+if (!is_power_of_2(machine->smp.threads)) {
+error_report("Cannot support %d threads/core on a powernv"
+ "machine because it must be a power of 2",
+ machine->smp.threads);
+exit(1);
+}
  /*
   * TODO: should we decide on how many chips we can create based
   * on #cores and Venice vs. Murano vs. Naples chip type etc...,
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 0bc3ad41c8..acd83caee8 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -167,12 +167,13 @@ static const MemoryRegionOps pnv_core_power9_xscom_ops = {
  .endianness = DEVICE_BIG_ENDIAN,
  };
  
-static void pnv_core_cpu_realize(PnvCore *pc, PowerPCCPU *cpu, Error **errp)

+static void pnv_core_cpu_realize(PnvCore *pc, PowerPCCPU *cpu, Error **errp,
+ int thread_index)
  {
  CPUPPCState *env = &cpu->env;
  int core_pir;
-int thread_index = 0; /* TODO: TCG supports only one thread */
  ppc_spr_t *pir = &env->spr_cb[SPR_PIR];
+ppc_spr_t *tir = &env->spr_cb[SPR_TIR];
  Error *local_err = NULL;
  PnvChipClass *pcc = PNV_CHIP_GET_CLASS(pc->chip);
  
@@ -188,11 +189,7 @@ static void pnv_core_cpu_realize(PnvCore *pc, PowerPCCPU *cpu, Error **errp)
  
  core_pir = object_property_get_uint(OBJECT(pc), "pir", &error_abort);
  
-/*

- * The PIR of a thread is the core PIR + the thread index. We will
- * need to find a way to get the thread index when TCG supports
- * more than 1. We could use the object name ?
- */
+tir->default_value = thread_index;
  pir->default_value = core_pir + thread_index;
  
  /* Set time-base frequency to 512 MHz */

@@ -241,7 +238,7 @@ static void pnv_core_realize(DeviceState *dev, Error **errp)
  }
  
  for (j = 0; j < cc->nr_threads; j++) {

-pnv_core_cpu_realize(pc, pc->threads[j], &local_err);
+pnv_core_cpu_realize(pc, pc->threads[j], &local_err, j);
  if (local_err) {
  goto err;
  }

Re: [PATCH] hw/ppc: Simplify clock update arithmetic

2023-06-28 Thread Cédric Le Goater


On 6/25/23 14:20, Nicholas Piggin wrote:

The clock update logic reads the clock twice to compute the new clock
value, with a value derived from the later time subtracted from a value
derived from the earlier time. This can lead to an underflow in
subtractions in bits that are intended to cancel exactly. This might
not cause any real problem, but it is more complicated than necessary
to reason about.

Simplify this by reading the clock once.

Signed-off-by: Nicholas Piggin 


This patch has ben superseded by

 
https://patchwork.ozlabs.org/project/qemu-ppc/patch/20230629020713.327745-1-npig...@gmail.com/

It is nice to add a v2 prefix, even if you change the change the subject.

Thanks,

C.



---
  hw/ppc/ppc.c | 33 +
  1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index f4fe1767d6..5d0a09eb5e 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -536,23 +536,24 @@ static inline void cpu_ppc_store_tb(ppc_tb_t *tb_env, 
uint64_t vmclk,
  void cpu_ppc_store_tbl (CPUPPCState *env, uint32_t value)
  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->tb_offset);

+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
  tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, tb | (uint64_t)value);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset, tb | (uint64_t)value);
  }
  
  static inline void _cpu_ppc_store_tbu(CPUPPCState *env, uint32_t value)

  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->tb_offset);

+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
  tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, ((uint64_t)value << 32) | tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset,
+ ((uint64_t)value << 32) | tb);
  }
  
  void cpu_ppc_store_tbu (CPUPPCState *env, uint32_t value)

@@ -585,23 +586,24 @@ uint32_t cpu_ppc_load_atbu (CPUPPCState *env)
  void cpu_ppc_store_atbl (CPUPPCState *env, uint32_t value)
  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->atb_offset);

+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->atb_offset);
  tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->atb_offset, tb | (uint64_t)value);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->atb_offset, tb | (uint64_t)value);
  }
  
  void cpu_ppc_store_atbu (CPUPPCState *env, uint32_t value)

  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->atb_offset);

+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->atb_offset);
  tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->atb_offset, ((uint64_t)value << 32) | tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->atb_offset,
+ ((uint64_t)value << 32) | tb);
  }
  
  uint64_t cpu_ppc_load_vtb(CPUPPCState *env)

@@ -623,14 +625,13 @@ void cpu_ppc_store_vtb(CPUPPCState *env, uint64_t value)
  void cpu_ppc_store_tbu40(CPUPPCState *env, uint64_t value)
  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),

-tb_env->tb_offset);
+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
  tb &= 0xFFUL;
  tb |= (value & ~0xFFUL);
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset, tb);
  }
  
  static void cpu_ppc_tb_stop (CPUPPCState *env)

Re: [PATCH] net: add initial support for AF_XDP network backend

2023-06-28 Thread Jason Wang

On Wed, Jun 28, 2023 at 4:25 PM Stefan Hajnoczi  wrote:
>
> On Wed, 28 Jun 2023 at 10:19, Jason Wang  wrote:
> >
> > On Wed, Jun 28, 2023 at 4:15 PM Stefan Hajnoczi  wrote:
> > >
> > > On Wed, 28 Jun 2023 at 09:59, Jason Wang  wrote:
> > > >
> > > > On Wed, Jun 28, 2023 at 3:46 PM Stefan Hajnoczi  
> > > > wrote:
> > > > >
> > > > > On Wed, 28 Jun 2023 at 05:28, Jason Wang  wrote:
> > > > > >
> > > > > > On Wed, Jun 28, 2023 at 6:45 AM Ilya Maximets  
> > > > > > wrote:
> > > > > > >
> > > > > > > On 6/27/23 04:54, Jason Wang wrote:
> > > > > > > > On Mon, Jun 26, 2023 at 9:17 PM Ilya Maximets 
> > > > > > > >  wrote:
> > > > > > > >>
> > > > > > > >> On 6/26/23 08:32, Jason Wang wrote:
> > > > > > > >>> On Sun, Jun 25, 2023 at 3:06 PM Jason Wang 
> > > > > > > >>>  wrote:
> > > > > > > 
> > > > > > >  On Fri, Jun 23, 2023 at 5:58 AM Ilya Maximets 
> > > > > > >   wrote:
> > > > > > > >> It is noticeably more performant than a tap with vhost=on in 
> > > > > > > >> terms of PPS.
> > > > > > > >> So, that might be one case.  Taking into account that just rcu 
> > > > > > > >> lock and
> > > > > > > >> unlock in virtio-net code takes more time than a packet copy, 
> > > > > > > >> some batching
> > > > > > > >> on QEMU side should improve performance significantly.  And it 
> > > > > > > >> shouldn't be
> > > > > > > >> too hard to implement.
> > > > > > > >>
> > > > > > > >> Performance over virtual interfaces may potentially be 
> > > > > > > >> improved by creating
> > > > > > > >> a kernel thread for async Tx.  Similarly to what io_uring 
> > > > > > > >> allows.  Currently
> > > > > > > >> Tx on non-zero-copy interfaces is synchronous, and that 
> > > > > > > >> doesn't allow to
> > > > > > > >> scale well.
> > > > > > > >
> > > > > > > > Interestingly, actually, there are a lot of "duplication" 
> > > > > > > > between
> > > > > > > > io_uring and AF_XDP:
> > > > > > > >
> > > > > > > > 1) both have similar memory model (user register)
> > > > > > > > 2) both use ring for communication
> > > > > > > >
> > > > > > > > I wonder if we can let io_uring talks directly to AF_XDP.
> > > > > > >
> > > > > > > Well, if we submit poll() in QEMU main loop via io_uring, then we 
> > > > > > > can
> > > > > > > avoid cost of the synchronous Tx for non-zero-copy modes, i.e. for
> > > > > > > virtual interfaces.  io_uring thread in the kernel will be able to
> > > > > > > perform transmission for us.
> > > > > >
> > > > > > It would be nice if we can use iothread/vhost other than the main 
> > > > > > loop
> > > > > > even if io_uring can use kthreads. We can avoid the memory 
> > > > > > translation
> > > > > > cost.
> > > > >
> > > > > The QEMU event loop (AioContext) has io_uring code
> > > > > (utils/fdmon-io_uring.c) but it's disabled at the moment. I'm working
> > > > > on patches to re-enable it and will probably send them in July. The
> > > > > patches also add an API to submit arbitrary io_uring operations so
> > > > > that you can do stuff besides file descriptor monitoring. Both the
> > > > > main loop and IOThreads will be able to use io_uring on Linux hosts.
> > > >
> > > > Just to make sure I understand. If we still need a copy from guest to
> > > > io_uring buffer, we still need to go via memory API for GPA which
> > > > seems expensive.
> > > >
> > > > Vhost seems to be a shortcut for this.
> > >
> > > I'm not sure how exactly you're thinking of using io_uring.
> > >
> > > Simply using io_uring for the event loop (file descriptor monitoring)
> > > doesn't involve an extra buffer, but the packet payload still needs to
> > > reside in AF_XDP umem, so there is a copy between guest memory and
> > > umem.
> >
> > So there would be a translation from GPA to HVA (unless io_uring
> > support 2 stages) which needs to go via qemu memory core. And this
> > part seems to be very expensive according to my test in the past.
>
> Yes, but in the current approach where AF_XDP is implemented as a QEMU
> netdev, there is already QEMU device emulation (e.g. virtio-net)
> happening. So the GPA to HVA translation will happen anyway in device
> emulation.

Just to make sure we're on the same page.

I meant, AF_XDP can do more than e.g 10Mpps. So if we still use the
QEMU netdev, it would be very hard to achieve that if we stick to
using the Qemu memory core translations which need to take care about
too much extra stuff. That's why I suggest using vhost in io threads
which only cares about ram so the translation could be very fast.

>
> Are you thinking about AF_XDP passthrough where the guest directly
> interacts with AF_XDP?

This could be another way to solve, since it won't use Qemu's memory
core to do the translation.

>
> > > If umem encompasses guest memory,
> >
> > It requires you to pin the whole guest memory and a GPA to HVA
> > translation is still required.
>
> Ilya mentioned that umem uses relative offsets instead of absolute
> memory addresses. In the AF_XDP passthrough case this means no address
>

Re: [PATCH] hw/ppc: Fix clock update drift

2023-06-28 Thread Cédric Le Goater


On 6/29/23 04:07, Nicholas Piggin wrote:

The clock update logic reads the clock twice to compute the new clock
value, with a value derived from the later time subtracted from a value
derived from the earlier time. The delta causes time to be lost.

This can ultimately result in time becoming unsynchronized between CPUs
and that can cause OS lockups, timeouts, watchdogs, etc. This can be
seen running a KVM guest (that causes lots of TB updates) on a powernv
SMP machine.

Fix this by reading the clock once.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Nicholas Piggin 


Fixes: dbdd25065e90 ("Implement time-base start/stop helpers. Implement PowerPC 6xx 
time-base enable input pin.")

Reviewed-by: Cédric Le Goater 

Thanks,

C.


---
I also made a test case that can trigger this with kvm-unit-tests, but
it's been taking me a while to get that upstreamed.

Thanks,
Nick

  hw/ppc/ppc.c | 33 +
  1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 82e4408c5c..6233f43c01 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -535,23 +535,24 @@ static inline void cpu_ppc_store_tb(ppc_tb_t *tb_env, 
uint64_t vmclk,
  void cpu_ppc_store_tbl (CPUPPCState *env, uint32_t value)
  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->tb_offset);

+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
  tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, tb | (uint64_t)value);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset, tb | (uint64_t)value);
  }
  
  static inline void _cpu_ppc_store_tbu(CPUPPCState *env, uint32_t value)

  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->tb_offset);

+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
  tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, ((uint64_t)value << 32) | tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset,
+ ((uint64_t)value << 32) | tb);
  }
  
  void cpu_ppc_store_tbu (CPUPPCState *env, uint32_t value)

@@ -584,23 +585,24 @@ uint32_t cpu_ppc_load_atbu (CPUPPCState *env)
  void cpu_ppc_store_atbl (CPUPPCState *env, uint32_t value)
  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->atb_offset);

+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->atb_offset);
  tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->atb_offset, tb | (uint64_t)value);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->atb_offset, tb | (uint64_t)value);
  }
  
  void cpu_ppc_store_atbu (CPUPPCState *env, uint32_t value)

  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), tb_env->atb_offset);

+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->atb_offset);
  tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->atb_offset, ((uint64_t)value << 32) | tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->atb_offset,
+ ((uint64_t)value << 32) | tb);
  }
  
  uint64_t cpu_ppc_load_vtb(CPUPPCState *env)

@@ -622,14 +624,13 @@ void cpu_ppc_store_vtb(CPUPPCState *env, uint64_t value)
  void cpu_ppc_store_tbu40(CPUPPCState *env, uint64_t value)
  {
  ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
  uint64_t tb;
  
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),

-tb_env->tb_offset);
+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
  tb &= 0xFFUL;
  tb |= (value & ~0xFFUL);
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset, tb);
  }
  
  static void cpu_ppc_tb_stop (CPUPPCState *env)

Re: [PATCH v3 6/6] target/ppc: Remove pointless checks of CONFIG_USER_ONLY in 'kvm_ppc.h'

2023-06-28 Thread Cédric Le Goater


On 6/27/23 13:51, Philippe Mathieu-Daudé wrote:

Signed-off-by: Philippe Mathieu-Daudé 



I guess the MemoryRegion code has sufficiently changed since commit
98efaf75282a ("ppc: Fix up usermode only builds") ?

Reviewed-by: Cédric Le Goater 

Thanks,

C.





---
  target/ppc/kvm_ppc.h | 5 -
  1 file changed, 5 deletions(-)

diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index 901e188c9a..6a4dd9c560 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -42,7 +42,6 @@ int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu);
  target_ulong kvmppc_configure_v3_mmu(PowerPCCPU *cpu,
   bool radix, bool gtse,
   uint64_t proc_tbl);
-#ifndef CONFIG_USER_ONLY
  bool kvmppc_spapr_use_multitce(void);
  int kvmppc_spapr_enable_inkernel_multitce(void);
  void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t page_shift,
@@ -52,7 +51,6 @@ int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t 
window_size);
  int kvmppc_reset_htab(int shift_hint);
  uint64_t kvmppc_vrma_limit(unsigned int hash_shift);
  bool kvmppc_has_cap_spapr_vfio(void);
-#endif /* !CONFIG_USER_ONLY */
  bool kvmppc_has_cap_epr(void);
  int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function);
  int kvmppc_get_htab_fd(bool write, uint64_t index, Error **errp);
@@ -262,7 +260,6 @@ static inline void kvmppc_set_reg_tb_offset(PowerPCCPU 
*cpu, int64_t tb_offset)
  {
  }
  
-#ifndef CONFIG_USER_ONLY

  static inline bool kvmppc_spapr_use_multitce(void)
  {
  return false;
@@ -322,8 +319,6 @@ static inline void kvmppc_write_hpte(hwaddr ptex, uint64_t 
pte0, uint64_t pte1)
  abort();
  }
  
-#endif /* !CONFIG_USER_ONLY */

-
  static inline bool kvmppc_has_cap_epr(void)
  {
  return false;

Re: [PATCH v3 5/6] target/ppc: Restrict 'kvm_ppc.h' to sysemu in cpu_init.c

2023-06-28 Thread Cédric Le Goater


On 6/27/23 13:51, Philippe Mathieu-Daudé wrote:

User emulation shouldn't need any of the KVM prototypes
declared in "kvm_ppc.h".

Signed-off-by: Philippe Mathieu-Daudé 




Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  target/ppc/cpu_init.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index aeff71d063..f2afb539eb 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -21,7 +21,6 @@
  #include "qemu/osdep.h"
  #include "disas/dis-asm.h"
  #include "gdbstub/helpers.h"
-#include "kvm_ppc.h"
  #include "sysemu/cpus.h"
  #include "sysemu/hw_accel.h"
  #include "sysemu/tcg.h"
@@ -49,6 +48,7 @@
  #ifndef CONFIG_USER_ONLY
  #include "hw/boards.h"
  #include "hw/intc/intc.h"
+#include "kvm_ppc.h"
  #endif
  
  /* #define PPC_DEBUG_SPR */

Re: [PATCH v3 4/6] target/ppc: Define TYPE_HOST_POWERPC_CPU in cpu-qom.h

2023-06-28 Thread Cédric Le Goater


On 6/27/23 13:51, Philippe Mathieu-Daudé wrote:

TYPE_HOST_POWERPC_CPU is used in various places of cpu_init.c,
in order to restrict "kvm_ppc.h" to sysemu, move this QOM-related
definition to cpu-qom.h.

Signed-off-by: Philippe Mathieu-Daudé 





Reviewed-by: Cédric Le Goater 

Thanks,

C.




---
  target/ppc/cpu-qom.h | 2 ++
  target/ppc/kvm_ppc.h | 2 --
  2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
index c2bff349cc..4e4061068e 100644
--- a/target/ppc/cpu-qom.h
+++ b/target/ppc/cpu-qom.h
@@ -36,6 +36,8 @@ OBJECT_DECLARE_CPU_TYPE(PowerPCCPU, PowerPCCPUClass, 
POWERPC_CPU)
  #define CPU_RESOLVING_TYPE TYPE_POWERPC_CPU
  #define cpu_list ppc_cpu_list
  
+#define TYPE_HOST_POWERPC_CPU POWERPC_CPU_TYPE_NAME("host")

+
  ObjectClass *ppc_cpu_class_by_name(const char *name);
  
  typedef struct CPUArchState CPUPPCState;

diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index 49954a300b..901e188c9a 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -13,8 +13,6 @@
  #include "exec/hwaddr.h"
  #include "cpu.h"
  
-#define TYPE_HOST_POWERPC_CPU POWERPC_CPU_TYPE_NAME("host")

-
  #ifdef CONFIG_KVM
  
  uint32_t kvmppc_get_tbfreq(void);

Re: [PATCH v3 3/6] target/ppc: Move CPU QOM definitions to cpu-qom.h

2023-06-28 Thread Cédric Le Goater


On 6/27/23 13:51, Philippe Mathieu-Daudé wrote:

Signed-off-by: Philippe Mathieu-Daudé 





Reviewed-by: Cédric Le Goater 

Thanks,

C.




---
  target/ppc/cpu-qom.h | 5 +
  target/ppc/cpu.h | 6 --
  2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
index 9666f54f65..c2bff349cc 100644
--- a/target/ppc/cpu-qom.h
+++ b/target/ppc/cpu-qom.h
@@ -31,6 +31,11 @@
  
  OBJECT_DECLARE_CPU_TYPE(PowerPCCPU, PowerPCCPUClass, POWERPC_CPU)
  
+#define POWERPC_CPU_TYPE_SUFFIX "-" TYPE_POWERPC_CPU

+#define POWERPC_CPU_TYPE_NAME(model) model POWERPC_CPU_TYPE_SUFFIX
+#define CPU_RESOLVING_TYPE TYPE_POWERPC_CPU
+#define cpu_list ppc_cpu_list
+
  ObjectClass *ppc_cpu_class_by_name(const char *name);
  
  typedef struct CPUArchState CPUPPCState;

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index af12c93ebc..e91e1774e5 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1468,12 +1468,6 @@ static inline uint64_t ppc_dump_gpr(CPUPPCState *env, 
int gprn)
  int ppc_dcr_read(ppc_dcr_t *dcr_env, int dcrn, uint32_t *valp);
  int ppc_dcr_write(ppc_dcr_t *dcr_env, int dcrn, uint32_t val);
  
-#define POWERPC_CPU_TYPE_SUFFIX "-" TYPE_POWERPC_CPU

-#define POWERPC_CPU_TYPE_NAME(model) model POWERPC_CPU_TYPE_SUFFIX
-#define CPU_RESOLVING_TYPE TYPE_POWERPC_CPU
-
-#define cpu_list ppc_cpu_list
-
  /* MMU modes definitions */
  #define MMU_USER_IDX 0
  static inline int cpu_mmu_index(CPUPPCState *env, bool ifetch)

Re: [PATCH v3 1/6] target/ppc: Have 'kvm_ppc.h' include 'sysemu/kvm.h'

2023-06-28 Thread Cédric Le Goater


On 6/27/23 13:51, Philippe Mathieu-Daudé wrote:

"kvm_ppc.h" declares:

   int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run);

'struct kvm_run' is declared in "sysemu/kvm.h", include it.

Signed-off-by: Philippe Mathieu-Daudé 





Reviewed-by: Cédric Le Goater 

Thanks,

C.




---
  target/ppc/kvm_ppc.h | 1 +
  1 file changed, 1 insertion(+)

diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index 611debc3ce..2e395416f0 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -9,6 +9,7 @@
  #ifndef KVM_PPC_H
  #define KVM_PPC_H
  
+#include "sysemu/kvm.h"

  #include "exec/hwaddr.h"
  #include "cpu.h"

Re: [PATCH v3 2/6] target/ppc: Reorder #ifdef'ry in kvm_ppc.h

2023-06-28 Thread Cédric Le Goater


On 6/27/23 13:51, Philippe Mathieu-Daudé wrote:

Keep a single if/else/endif block checking CONFIG_KVM.

Signed-off-by: Philippe Mathieu-Daudé 






Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  target/ppc/kvm_ppc.h | 62 
  1 file changed, 28 insertions(+), 34 deletions(-)

diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index 2e395416f0..49954a300b 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -93,7 +93,34 @@ void kvmppc_set_reg_tb_offset(PowerPCCPU *cpu, int64_t 
tb_offset);
  
  int kvm_handle_nmi(PowerPCCPU *cpu, struct kvm_run *run);
  
-#else

+#define kvmppc_eieio() \
+do {  \
+if (kvm_enabled()) {  \
+asm volatile("eieio" : : : "memory"); \
+} \
+} while (0)
+
+/* Store data cache blocks back to memory */
+static inline void kvmppc_dcbst_range(PowerPCCPU *cpu, uint8_t *addr, int len)
+{
+uint8_t *p;
+
+for (p = addr; p < addr + len; p += cpu->env.dcache_line_size) {
+asm volatile("dcbst 0,%0" : : "r"(p) : "memory");
+}
+}
+
+/* Invalidate instruction cache blocks */
+static inline void kvmppc_icbi_range(PowerPCCPU *cpu, uint8_t *addr, int len)
+{
+uint8_t *p;
+
+for (p = addr; p < addr + len; p += cpu->env.icache_line_size) {
+asm volatile("icbi 0,%0" : : "r"(p));
+}
+}
+
+#else /* !CONFIG_KVM */
  
  static inline uint32_t kvmppc_get_tbfreq(void)

  {
@@ -440,10 +467,6 @@ static inline bool 
kvmppc_pvr_workaround_required(PowerPCCPU *cpu)
  return false;
  }
  
-#endif

-
-#ifndef CONFIG_KVM
-
  #define kvmppc_eieio() do { } while (0)
  
  static inline void kvmppc_dcbst_range(PowerPCCPU *cpu, uint8_t *addr, int len)

@@ -454,35 +477,6 @@ static inline void kvmppc_icbi_range(PowerPCCPU *cpu, 
uint8_t *addr, int len)
  {
  }
  
-#else   /* CONFIG_KVM */

-
-#define kvmppc_eieio() \
-do {  \
-if (kvm_enabled()) {  \
-asm volatile("eieio" : : : "memory"); \
-} \
-} while (0)
-
-/* Store data cache blocks back to memory */
-static inline void kvmppc_dcbst_range(PowerPCCPU *cpu, uint8_t *addr, int len)
-{
-uint8_t *p;
-
-for (p = addr; p < addr + len; p += cpu->env.dcache_line_size) {
-asm volatile("dcbst 0,%0" : : "r"(p) : "memory");
-}
-}
-
-/* Invalidate instruction cache blocks */
-static inline void kvmppc_icbi_range(PowerPCCPU *cpu, uint8_t *addr, int len)
-{
-uint8_t *p;
-
-for (p = addr; p < addr + len; p += cpu->env.icache_line_size) {
-asm volatile("icbi 0,%0" : : "r"(p));
-}
-}
-
  #endif  /* CONFIG_KVM */
  
  #endif /* KVM_PPC_H */

Re: [PATCH 0/2] target/ppc: Easy parts of the POWER chiptod series

2023-06-28 Thread Cédric Le Goater


On 6/25/23 14:03, Nicholas Piggin wrote:

Cedric kindly reviewed these already so I think they should be
good to go now. This is just a rebase and slight rewording the
changelog. Still haven't completed the main chiptod device yet.

Thanks,
Nick

Nicholas Piggin (2):
   target/ppc: Tidy POWER book4 SPR registration
   target/ppc: Add TFMR SPR implementation with read and write helpers

  target/ppc/cpu_init.c| 82 
  target/ppc/helper.h  |  2 +
  target/ppc/spr_common.h  |  2 +
  target/ppc/timebase_helper.c | 13 ++
  target/ppc/translate.c   | 10 +
  5 files changed, 82 insertions(+), 27 deletions(-)




Daniel,

When you start building the next PPC PR, I think you can also take
this patch :

  [4/4] target/ppc: Implement core timebase state machine and TFMR
  
https://patchwork.ozlabs.org/project/qemu-ppc/patch/20230603233612.125879-5-npig...@gmail.com/

It belongs to the same series.

Thanks,

C.

[PATCH v6 4/5] tests/qtest/hd-geo-test: fix incorrect pcie-root-port usage and simplify test

2023-06-28 Thread Ani Sinha

The test attaches a SCSI controller to a non-zero slot and a pcie-to-pci bridge
on slot 0 on the same pcie-root-port. Since a downstream device can be attached
to a pcie-root-port only on slot 0, the above test configuration is not allowed.
Additionally using pcie.0 as id for pcie-to-pci bridge is incorrect as that id
is reserved only for the root bus.

In the test scenario, there is no need to attach a pcie-root-port to the
root complex. A SCSI controller can be attached to a pcie-to-pci bridge
which can then be directly attached to the root bus (pcie.0).

Fix the test and simplify it.

CC: m...@redhat.com
CC: imamm...@redhat.com
CC: Michael Labiuk 

Signed-off-by: Ani Sinha 
---
 tests/qtest/hd-geo-test.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/tests/qtest/hd-geo-test.c b/tests/qtest/hd-geo-test.c
index 5aa258a2b3..d08bffad91 100644
--- a/tests/qtest/hd-geo-test.c
+++ b/tests/qtest/hd-geo-test.c
@@ -784,14 +784,12 @@ static void test_override_scsi(void)
 test_override(args, "pc", expected);
 }
 
-static void setup_pci_bridge(TestArgs *args, const char *id, const char 
*rootid)
+static void setup_pci_bridge(TestArgs *args, const char *id)
 {
 
-char *root, *br;
-root = g_strdup_printf("-device pcie-root-port,id=%s", rootid);
-br = g_strdup_printf("-device pcie-pci-bridge,bus=%s,id=%s", rootid, id);
+char *br;
+br = g_strdup_printf("-device pcie-pci-bridge,bus=pcie.0,id=%s", id);
 
-args->argc = append_arg(args->argc, args->argv, ARGV_SIZE, root);
 args->argc = append_arg(args->argc, args->argv, ARGV_SIZE, br);
 }
 
@@ -811,8 +809,8 @@ static void test_override_scsi_q35(void)
 add_drive_with_mbr(args, empty_mbr, 1);
 add_drive_with_mbr(args, empty_mbr, 1);
 add_drive_with_mbr(args, empty_mbr, 1);
-setup_pci_bridge(args, "pcie.0", "br");
-add_scsi_controller(args, "lsi53c895a", "br", 3);
+setup_pci_bridge(args, "pcie-pci-br");
+add_scsi_controller(args, "lsi53c895a", "pcie-pci-br", 3);
 add_scsi_disk(args, 0, 0, 0, 0, 0, 1, 120, 30);
 add_scsi_disk(args, 1, 0, 0, 1, 0, 9000, 120, 30);
 add_scsi_disk(args, 2, 0, 0, 2, 0, 1, 0, 0);
@@ -868,9 +866,9 @@ static void test_override_virtio_blk_q35(void)
 };
 add_drive_with_mbr(args, empty_mbr, 1);
 add_drive_with_mbr(args, empty_mbr, 1);
-setup_pci_bridge(args, "pcie.0", "br");
-add_virtio_disk(args, 0, "br", 3, 1, 120, 30);
-add_virtio_disk(args, 1, "br", 4, 9000, 120, 30);
+setup_pci_bridge(args, "pcie-pci-br");
+add_virtio_disk(args, 0, "pcie-pci-br", 3, 1, 120, 30);
+add_virtio_disk(args, 1, "pcie-pci-br", 4, 9000, 120, 30);
 test_override(args, "q35", expected);
 }
 
-- 
2.39.1

[PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-06-28 Thread Ani Sinha

PCI Express ports only have one slot, so PCI Express devices can only be
plugged into slot 0 on a PCIE port. Enforce it.

The change has been tested to not break ARI by instantiating seven vfs on an
emulated igb device (the maximum number of vfs the linux igb driver supports).
The vfs are seen to have non-zero device/slot numbers in the conventional
PCI BDF representation.

CC: jus...@redhat.com
CC: imamm...@redhat.com
CC: akihiko.od...@daynix.com

Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
Signed-off-by: Ani Sinha 
Reviewed-by: Julia Suvorova 
---
 hw/pci/pci.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e2eb4c3b4a..0320ac2bb3 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -65,6 +65,7 @@ bool pci_available = true;
 static char *pcibus_get_dev_path(DeviceState *dev);
 static char *pcibus_get_fw_dev_path(DeviceState *dev);
 static void pcibus_reset(BusState *qbus);
+static bool pcie_has_upstream_port(PCIDevice *dev);
 
 static Property pci_props[] = {
 DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
@@ -1190,6 +1191,20 @@ static PCIDevice *do_pci_register_device(PCIDevice 
*pci_dev,
name);
 
return NULL;
+} /*
+   * With SRIOV and ARI, vfs can have non-zero slot in the conventional
+   * PCI interpretation as all five bits reserved for slot addresses are
+   * also used for function bits for the various vfs. Ignore that case.
+   * It is too early here to check for ARI capabilities in the PCI config
+   * space. Hence, we check for a vf device instead.
+   */
+else if (!pci_is_vf(pci_dev) &&
+ pcie_has_upstream_port(pci_dev) &&
+ PCI_SLOT(devfn)) {
+error_setg(errp, "PCI: slot %d is not valid for %s,"
+   " parent device only allows plugging into slot 0.",
+   PCI_SLOT(devfn), name);
+return NULL;
 }
 
 pci_dev->devfn = devfn;
-- 
2.39.1

[PATCH v6 0/5] test and QEMU fixes to ensure proper PCIE device usage

2023-06-28 Thread Ani Sinha

Patches 1-4:
Fix tests so that devices do not use non-zero slots on the pcie root
ports. PCIE ports only have one slot, so PCIE devices can only be
plugged into slot 0 on a PCIE port.

Patch 5:
Enforce only one slot on PCIE port.

The test fixes must be applied before the QEMU change that checks for use
of a single slot in PCIE port.

CC: m...@redhat.com
CC: imamm...@redhat.com
CC: jus...@redhat.com
CC: th...@redhat.com
CC: lviv...@redhat.com
CC: michael.lab...@virtuozzo.com

Changelog:
===
v6: make patch 5 ARI compliant. fix commit message 
(s/pcie-root-port/pcie-to-pci/)
in patch 4. Rebase patchset to latest master.

v5: no code changes - correct a mistake in the commit log message.

v4: reword commit log for patch 4.

v3: tags added. reword the error description in patch 5. Reword commit log in 
patch 4. 

v2: add hd-geo-test fix as well as the actual QEMU code fix to the patchset.

The patches are added in the right order.


Ani Sinha (5):
  tests/acpi: allow changes in DSDT.noacpihp table blob
  tests/acpi/bios-tables-test: use the correct slot on the
pcie-root-port
  tests/acpi/bios-tables-test: update acpi blob q35/DSDT.noacpihp
  tests/qtest/hd-geo-test: fix incorrect pcie-root-port usage and
simplify test
  hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

 hw/pci/pci.c  |  15 +++
 tests/data/acpi/q35/DSDT.noacpihp | Bin 8248 -> 8241 bytes
 tests/qtest/bios-tables-test.c|   4 ++--
 tests/qtest/hd-geo-test.c |  18 --
 4 files changed, 25 insertions(+), 12 deletions(-)

-- 
2.39.1

[PATCH v6 3/5] tests/acpi/bios-tables-test: update acpi blob q35/DSDT.noacpihp

2023-06-28 Thread Ani Sinha

Some fixes were committed in bios-tables-test in the previous commit. Update
the acpi blob and clear bios-tables-test-allowed-diff.h so that the test
continues to pass with the changes in the bios-tables-test.

Following is the asl diff between the old and the newly updated blob:

@@ -1,30 +1,30 @@
 /*
  * Intel ACPI Component Architecture
  * AML/ASL+ Disassembler version 20210604 (64-bit version)
  * Copyright (c) 2000 - 2021 Intel Corporation
  *
  * Disassembling to symbolic ASL+ operators
  *
- * Disassembly of tests/data/acpi/q35/DSDT.noacpihp, Wed Jun 21 18:26:52 2023
+ * Disassembly of /tmp/aml-O8SU61, Wed Jun 21 18:26:52 2023
  *
  * Original Table Header:
  * Signature"DSDT"
- * Length   0x2038 (8248)
+ * Length   0x2031 (8241)
  * Revision 0x01  32-bit table (V1), no 64-bit math support
- * Checksum 0x4A
+ * Checksum 0x89
  * OEM ID   "BOCHS "
  * OEM Table ID "BXPC"
  * OEM Revision 0x0001 (1)
  * Compiler ID  "BXPC"
  * Compiler Version 0x0001 (1)
  */
 DefinitionBlock ("", "DSDT", 1, "BOCHS ", "BXPC", 0x0001)
 {
 Scope (\)
 {
 OperationRegion (DBG, SystemIO, 0x0402, One)
 Field (DBG, ByteAcc, NoLock, Preserve)
 {
 DBGB,   8
 }

@@ -3148,48 +3148,48 @@
 {
 Name (_ADR, Zero)  // _ADR: Address
 Method (_DSM, 4, Serialized)  // _DSM: Device-Specific 
Method
 {
 Local0 = Package (0x01)
 {
 0x01F5
 }
 Return (EDSM (Arg0, Arg1, Arg2, Arg3, Local0))
 }
 }
 }

 Device (S40)
 {
 Name (_ADR, 0x0008)  // _ADR: Address
-Device (S41)
+Device (S01)
 {
-Name (_ADR, 0x00080001)  // _ADR: Address
+Name (_ADR, One)  // _ADR: Address
 Method (_DSM, 4, Serialized)  // _DSM: Device-Specific 
Method
 {
 Local0 = Package (0x01)
 {
 0x0259
 }
 Return (EDSM (Arg0, Arg1, Arg2, Arg3, Local0))
 }
 }

-Device (S48)
+Device (S02)
 {
-Name (_ADR, 0x0009)  // _ADR: Address
+Name (_ADR, 0x02)  // _ADR: Address
 Device (S00)
 {
 Name (_ADR, Zero)  // _ADR: Address
 }
 }
 }

 Device (SF8)
 {
 Name (_ADR, 0x001F)  // _ADR: Address
 OperationRegion (PIRQ, PCI_Config, 0x60, 0x0C)
 Scope (\_SB)
 {
 Field (PCI0.SF8.PIRQ, ByteAcc, NoLock, Preserve)
 {
 PRQA,   8,

Signed-off-by: Ani Sinha 
Acked-by: Igor Mammedov 
---
 tests/data/acpi/q35/DSDT.noacpihp   | Bin 8248 -> 8241 bytes
 tests/qtest/bios-tables-test-allowed-diff.h |   1 -
 2 files changed, 1 deletion(-)

diff --git a/tests/data/acpi/q35/DSDT.noacpihp 
b/tests/data/acpi/q35/DSDT.noacpihp
index 
6ab1f0e52543fcb7f84a7fd1327fe5aa42010565..8cab2f8eb9ae94e0165f3f17857ec7d080fb0e13
 100644
GIT binary patch
delta 109
zcmdntu+f3bCDi)r&-xoSoL
DyqFtK

delta 94
zcmdn!u)~4NCD

[PATCH v6 2/5] tests/acpi/bios-tables-test: use the correct slot on the pcie-root-port

2023-06-28 Thread Ani Sinha

PCIE ports only have one slot, slot 0. Hence, non-zero slots are not available
for PCIE devices on PCIE root ports. Fix test_acpi_q35_tcg_no_acpi_hotplug()
so that the test does not use them.

Signed-off-by: Ani Sinha 
Reviewed-by: Igor Mammedov 
---
 tests/qtest/bios-tables-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c
index ed1c69cf01..47ba20b957 100644
--- a/tests/qtest/bios-tables-test.c
+++ b/tests/qtest/bios-tables-test.c
@@ -1020,9 +1020,9 @@ static void test_acpi_q35_tcg_no_acpi_hotplug(void)
 " -device pci-testdev,bus=nohprp,acpi-index=501"
 " -device pcie-root-port,id=nohprpint,port=0x0,chassis=3,hotplug=off,"
  "multifunction=on,addr=8.0"
-" -device pci-testdev,bus=nohprpint,acpi-index=601,addr=8.1"
+" -device pci-testdev,bus=nohprpint,acpi-index=601,addr=0.1"
 " -device pcie-root-port,id=hprp2,port=0x0,chassis=4,bus=nohprpint,"
- "addr=9.0"
+ "addr=0.2"
 " -device pci-testdev,bus=hprp2,acpi-index=602"
 , &data);
 free_test_data(&data);
-- 
2.39.1

[PATCH v6 1/5] tests/acpi: allow changes in DSDT.noacpihp table blob

2023-06-28 Thread Ani Sinha

We are going to fix bio-tables-test in the next patch and hence need to
make sure the acpi tests continue to pass.

Signed-off-by: Ani Sinha 
Acked-by: Igor Mammedov 
---
 tests/qtest/bios-tables-test-allowed-diff.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h 
b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523c8b..31df9c6187 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,2 @@
 /* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/q35/DSDT.noacpihp",
-- 
2.39.1

Re: [PATCH qemu] aspeed add montblanc bmc reference from fuji

2023-06-28 Thread Sittisak Sinprem

Hi Cédric,

I had fixed the function name to support in current branch,
but facing about below error while starting

./build/qemu-system-arm -machine montblanc-bmc  -drive
> file=~/flash-montblanc,format=raw,if=mtd -nographic -netdev
> tap,id=netdev0,script=no,downscript=no,ifname=tap0 -net
> nic,netdev=netdev0,model=ftgmac100
>


qemu-system-arm: device requires 134217728 bytes, block backend provides
> 27726336 bytes




On Wed, Jun 28, 2023 at 11:34 PM Cédric Le Goater  wrote:

> On 6/28/23 12:07, Sittisak Sinprem wrote:
> > Got it Cedric, I just know for it,
> >
> > I am fixing, and will re-send the patch as V2.
>
> Could you please use the patch below and send in your series ?
>
> Thanks,
>
> C.
>
>
>  From cfbc865ffe8a4dffe4ac764eb10416aa906a7170 Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= 
> Date: Wed, 28 Jun 2023 18:32:20 +0200
> Subject: [PATCH] aspeed: Introduce ASPEED_RAM_SIZE helper for 32-bit hosts
>   limitation
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
>
> On 32-bit hosts, RAM has a 2047 MB limit. Use a macro to define the
> default ram size of machines (AST2600 SoC) that can have 2 GB.
>
> Signed-off-by: Cédric Le Goater 
> ---
>   hw/arm/aspeed.c | 21 +
>   1 file changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
> index eefd2e275015..0ae252232597 100644
> --- a/hw/arm/aspeed.c
> +++ b/hw/arm/aspeed.c
> @@ -49,6 +49,13 @@ struct AspeedMachineState {
>   uint32_t hw_strap1;
>   };
>
> +/* On 32-bit hosts, lower RAM to 1G because of the 2047 MB limit */
> +#if HOST_LONG_BITS == 32
> +#define ASPEED_RAM_SIZE(sz) MIN((sz), 1 * GiB)
> +#else
> +#define ASPEED_RAM_SIZE(sz) (sz)
> +#endif
> +
>   /* Palmetto hardware value: 0x120CE416 */
>   #define PALMETTO_BMC_HW_STRAP1 (\
>   SCU_AST2400_HW_STRAP_DRAM_SIZE(DRAM_SIZE_256MB) |   \
> @@ -1504,12 +1511,7 @@ static void
> aspeed_machine_rainier_class_init(ObjectClass *oc, void *data)
>   aspeed_machine_ast2600_class_init(oc, data);
>   };
>
> -/* On 32-bit hosts, lower RAM to 1G because of the 2047 MB limit */
> -#if HOST_LONG_BITS == 32
> -#define FUJI_BMC_RAM_SIZE (1 * GiB)
> -#else
> -#define FUJI_BMC_RAM_SIZE (2 * GiB)
> -#endif
> +#define FUJI_BMC_RAM_SIZE ASPEED_RAM_SIZE(2 * GiB)
>
>   static void aspeed_machine_fuji_class_init(ObjectClass *oc, void *data)
>   {
> @@ -1533,12 +1535,7 @@ static void
> aspeed_machine_fuji_class_init(ObjectClass *oc, void *data)
>   aspeed_machine_ast2600_class_init(oc, data);
>   };
>
> -/* On 32-bit hosts, lower RAM to 1G because of the 2047 MB limit */
> -#if HOST_LONG_BITS == 32
> -#define BLETCHLEY_BMC_RAM_SIZE (1 * GiB)
> -#else
> -#define BLETCHLEY_BMC_RAM_SIZE (2 * GiB)
> -#endif
> +#define BLETCHLEY_BMC_RAM_SIZE ASPEED_RAM_SIZE(2 * GiB)
>
>   static void aspeed_machine_bletchley_class_init(ObjectClass *oc, void
> *data)
>   {
> --
> 2.41.0
>
>
>

Re: ARI and igb emulation

2023-06-28 Thread Ani Sinha




> On 28-Jun-2023, at 6:19 PM, Ani Sinha  wrote:
> 
> 
> 
>> On 28-Jun-2023, at 6:15 PM, Akihiko Odaki  wrote:
>> 
>> Adding CC.
>> 
>> On 2023/06/28 21:24, Akihiko Odaki wrote:
>>> On 2023/06/27 23:32, Ani Sinha wrote:
 Hi :
 I am proposing a patch in QEMU [1] which may or may not break ARI but I 
 wanted to give my best shot in making sure I am not breaking anything with 
 ARI enabled. I see that your igb emulation code enables ARI with its SRIOV 
 emulation. I ran the qtest and avocado tests that are mentioned in [2] and 
 they both pass. Is there anything else/any tweaks that I should be doing 
 to make sure I am not breaking ARI with igb?
 
 Thanks for information,
 Ani
 
 1. https://lists.gnu.org/archive/html/qemu-devel/2023-06/msg05478.html
 2. https://www.qemu.org/docs/master/system/devices/igb.html
 
 
>>> This indeed resulted in the following error message with igb:
>>> qemu-system-aarch64: PCI: slot 16 is not valid for igbvf, parent device 
>>> only allows plugging into slot 0.
>>> To reproduce the issue, add the following to QEMU command line:
>>> -device pcie-root-port,id=p -device igb,bus=p
>>> And enable 7 virtual functions on the guest. For Linux, see:
>>> https://docs.kernel.org/PCI/pci-iov-howto.html
>>> You may only enforce the slot number restriction only for devices without 
>>> ARI.

>>> The slot number is assumed as 0 when ARI is enabled if I understand 
>>> correctly.

Yes, in linux there is something like this:

device = pci_ari_enabled(dev->bus) ? 0 : PCI_SLOT(dev->devfn);

We will have to make similar changes in QEMU to fix all PCI_SLOT references and 
adjust code accordingly.

[RFC PATCH 3/3] ppc/pnv: SMT support for powernv

2023-06-28 Thread Nicholas Piggin

Set the TIR default value with the SMT thread index, and place some
standard limits on SMT configurations. Now powernv is able to boot
skiboot and Linux with a SMT topology, including booting a KVM guest.

There are several other per-core SPRs, but they are not so important to
run OPAL/Linux. Some important per-LPAR ones to convert before powernv
could run in 1LPAR mode. Broadcast msgsnd is not yet implemented either,
but skiboot/Linux does not use that. KVM uses an implementation-specific
detail of POWER9/10 TLBs where TLBIEL invalidates translations of all
threads on a core, but that is not required here because KVM does not
cache translations across PID or LPID switch. Most of these I have or
aren't too hard to implement, but I start with a small bare bones for
comments.

Not-yet-Signed-off-by: Nicholas Piggin 
---
 hw/ppc/pnv.c  | 12 
 hw/ppc/pnv_core.c | 13 +
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index fc083173f3..f599ccad1d 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -887,6 +887,18 @@ static void pnv_init(MachineState *machine)
 
 pnv->num_chips =
 machine->smp.max_cpus / (machine->smp.cores * machine->smp.threads);
+
+if (machine->smp.threads > 8) {
+error_report("Cannot support more than 8 threads/core "
+ "on a powernv machine");
+exit(1);
+}
+if (!is_power_of_2(machine->smp.threads)) {
+error_report("Cannot support %d threads/core on a powernv"
+ "machine because it must be a power of 2",
+ machine->smp.threads);
+exit(1);
+}
 /*
  * TODO: should we decide on how many chips we can create based
  * on #cores and Venice vs. Murano vs. Naples chip type etc...,
diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index 0bc3ad41c8..acd83caee8 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -167,12 +167,13 @@ static const MemoryRegionOps pnv_core_power9_xscom_ops = {
 .endianness = DEVICE_BIG_ENDIAN,
 };
 
-static void pnv_core_cpu_realize(PnvCore *pc, PowerPCCPU *cpu, Error **errp)
+static void pnv_core_cpu_realize(PnvCore *pc, PowerPCCPU *cpu, Error **errp,
+ int thread_index)
 {
 CPUPPCState *env = &cpu->env;
 int core_pir;
-int thread_index = 0; /* TODO: TCG supports only one thread */
 ppc_spr_t *pir = &env->spr_cb[SPR_PIR];
+ppc_spr_t *tir = &env->spr_cb[SPR_TIR];
 Error *local_err = NULL;
 PnvChipClass *pcc = PNV_CHIP_GET_CLASS(pc->chip);
 
@@ -188,11 +189,7 @@ static void pnv_core_cpu_realize(PnvCore *pc, PowerPCCPU 
*cpu, Error **errp)
 
 core_pir = object_property_get_uint(OBJECT(pc), "pir", &error_abort);
 
-/*
- * The PIR of a thread is the core PIR + the thread index. We will
- * need to find a way to get the thread index when TCG supports
- * more than 1. We could use the object name ?
- */
+tir->default_value = thread_index;
 pir->default_value = core_pir + thread_index;
 
 /* Set time-base frequency to 512 MHz */
@@ -241,7 +238,7 @@ static void pnv_core_realize(DeviceState *dev, Error **errp)
 }
 
 for (j = 0; j < cc->nr_threads; j++) {
-pnv_core_cpu_realize(pc, pc->threads[j], &local_err);
+pnv_core_cpu_realize(pc, pc->threads[j], &local_err, j);
 if (local_err) {
 goto err;
 }
-- 
2.40.1

[RFC PATCH 2/3] target/ppc: SMT support for the HID SPR

2023-06-28 Thread Nicholas Piggin

HID is a per-core shared register, skiboot sets this (e.g., setting
HILE) on one thread and that must affect all threads of the core.

Signed-off-by: Nicholas Piggin 
---
 target/ppc/cpu_init.c|  2 +-
 target/ppc/helper.h  |  1 +
 target/ppc/misc_helper.c | 21 +
 target/ppc/spr_common.h  |  1 +
 target/ppc/translate.c   | 16 
 5 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index dc3a65a575..a1b01dc479 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -5373,7 +5373,7 @@ static void register_book3s_ids_sprs(CPUPPCState *env)
 spr_register_hv(env, SPR_HID0, "HID0",
  SPR_NOACCESS, SPR_NOACCESS,
  SPR_NOACCESS, SPR_NOACCESS,
- &spr_read_generic, &spr_write_generic,
+ &spr_read_generic, &spr_core_write_generic,
  0x);
 spr_register_hv(env, SPR_TSCR, "TSCR",
  SPR_NOACCESS, SPR_NOACCESS,
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index fda40b8a60..719752b38a 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -704,6 +704,7 @@ DEF_HELPER_3(store_dcr, void, env, tl, tl)
 
 DEF_HELPER_2(load_dump_spr, void, env, i32)
 DEF_HELPER_2(store_dump_spr, void, env, i32)
+DEF_HELPER_3(spr_core_write_generic, void, env, i32, tl)
 DEF_HELPER_3(spr_write_CTRL, void, env, i32, tl)
 
 DEF_HELPER_4(fscr_facility_check, void, env, i32, i32, i32)
diff --git a/target/ppc/misc_helper.c b/target/ppc/misc_helper.c
index 1f1af21f33..0da335472e 100644
--- a/target/ppc/misc_helper.c
+++ b/target/ppc/misc_helper.c
@@ -43,6 +43,27 @@ void helper_store_dump_spr(CPUPPCState *env, uint32_t sprn)
  env->spr[sprn]);
 }
 
+void helper_spr_core_write_generic(CPUPPCState *env, uint32_t sprn,
+   target_ulong val)
+{
+CPUState *cs = env_cpu(env);
+CPUState *ccs;
+uint32_t nr_threads = cs->nr_threads;
+uint32_t core_id = env->spr[SPR_PIR] & ~(nr_threads - 1);
+
+assert(core_id == env->spr[SPR_PIR] - env->spr[SPR_TIR]);
+
+if (nr_threads == 1) {
+env->spr[sprn] = val;
+return;
+}
+
+THREAD_SIBLING_FOREACH(cs, ccs) {
+CPUPPCState *cenv = &POWERPC_CPU(ccs)->env;
+cenv->spr[sprn] = val;
+}
+}
+
 void helper_spr_write_CTRL(CPUPPCState *env, uint32_t sprn,
target_ulong val)
 {
diff --git a/target/ppc/spr_common.h b/target/ppc/spr_common.h
index 4c0f2bed77..aae66ee966 100644
--- a/target/ppc/spr_common.h
+++ b/target/ppc/spr_common.h
@@ -82,6 +82,7 @@ void spr_noaccess(DisasContext *ctx, int gprn, int sprn);
 void spr_read_generic(DisasContext *ctx, int gprn, int sprn);
 void spr_write_generic(DisasContext *ctx, int sprn, int gprn);
 void spr_write_generic32(DisasContext *ctx, int sprn, int gprn);
+void spr_core_write_generic(DisasContext *ctx, int sprn, int gprn);
 void spr_write_MMCR0(DisasContext *ctx, int sprn, int gprn);
 void spr_write_MMCR1(DisasContext *ctx, int sprn, int gprn);
 void spr_write_PMC(DisasContext *ctx, int sprn, int gprn);
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index ef186396b4..f7a5ecc0c1 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -448,6 +448,22 @@ void spr_write_generic32(DisasContext *ctx, int sprn, int 
gprn)
 #endif
 }
 
+void spr_core_write_generic(DisasContext *ctx, int sprn, int gprn)
+{
+if (!(ctx->flags & POWERPC_FLAG_SMT)) {
+spr_write_generic(ctx, sprn, gprn);
+return;
+}
+
+if (!gen_serialize(ctx)) {
+return;
+}
+
+gen_helper_spr_core_write_generic(cpu_env, tcg_constant_i32(sprn),
+  cpu_gpr[gprn]);
+spr_store_dump_spr(sprn);
+}
+
 static void spr_write_CTRL_ST(DisasContext *ctx, int sprn, int gprn)
 {
 /* This does not implement >1 thread */
-- 
2.40.1

[RFC PATCH 1/3] target/ppc: Add LPAR-per-core vs per-thread mode flag

2023-06-28 Thread Nicholas Piggin

The Power ISA has the concept of sub-processors:

  Hardware is allowed to sub-divide a multi-threaded processor into
  "sub-processors" that appear to privileged programs as multi-threaded
  processors with fewer threads.

POWER9 and POWER10 have two modes, either every thread is a
sub-processor or all threads appear as one multi-threaded processor.
In the user manuals these are known as "LPAR-per-thread" and "LPAR
per core" (or "1LPAR"), respectively.

The practical difference is in LPAR-per-thread mode, non-hypervisor SPRs
are not shared between threads and msgsndp can not be used to message
siblings. In 1LPAR mode some SPRs are shared and msgsndp is usable. LPPT
allows multiple partitions to run concurrently on the same core,
and is a requirement for KVM to run on POWER9/10.

Traditionally, SMT in PAPR environments including PowerVM and the
pseries machine with KVM acceleration beahves as in 1LPAR mode. In
OPAL systems, LPAR-per-thread is used. When adding SMT to the powernv
machine, it is preferable to emulate OPAL LPAR-per-thread, so to
account for this difference a flag is added and SPRs may become either
per-thread, per-core shared, or per-LPAR shared. Per-LPAR registers
become either per-thread or per-core shared depending on the mode.

Signed-off-by: Nicholas Piggin 
---
 hw/ppc/spapr_cpu_core.c |  2 ++
 target/ppc/cpu.h|  3 +++
 target/ppc/cpu_init.c   | 12 
 target/ppc/translate.c  | 16 +---
 4 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index a4e3c2fadd..b482d9754a 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -270,6 +270,8 @@ static bool spapr_realize_vcpu(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 env->spr_cb[SPR_PIR].default_value = cs->cpu_index;
 env->spr_cb[SPR_TIR].default_value = thread_index;
 
+cpu_ppc_set_1lpar(cpu);
+
 /* Set time-base frequency to 512 MHz. vhyp must be set first. */
 cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ);
 
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 94497aa115..beddc5db5b 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -674,6 +674,8 @@ enum {
 POWERPC_FLAG_SCV  = 0x0020,
 /* Has >1 thread per core*/
 POWERPC_FLAG_SMT  = 0x0040,
+/* Using "LPAR per core" mode  (as opposed to per-thread)*/
+POWERPC_FLAG_1LPAR= 0x0080,
 };
 
 /*
@@ -1435,6 +1437,7 @@ void store_booke_tsr(CPUPPCState *env, target_ulong val);
 void ppc_tlb_invalidate_all(CPUPPCState *env);
 void ppc_tlb_invalidate_one(CPUPPCState *env, target_ulong addr);
 void cpu_ppc_set_vhyp(PowerPCCPU *cpu, PPCVirtualHypervisor *vhyp);
+void cpu_ppc_set_1lpar(PowerPCCPU *cpu);
 int ppcmas_tlb_check(CPUPPCState *env, ppcmas_tlb_t *tlb, hwaddr *raddrp,
  target_ulong address, uint32_t pid);
 int ppcemb_tlb_search(CPUPPCState *env, target_ulong address, uint32_t pid);
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index aeff71d063..dc3a65a575 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6601,6 +6601,18 @@ void cpu_ppc_set_vhyp(PowerPCCPU *cpu, 
PPCVirtualHypervisor *vhyp)
 env->msr_mask &= ~MSR_HVB;
 }
 
+void cpu_ppc_set_1lpar(PowerPCCPU *cpu)
+{
+CPUPPCState *env = &cpu->env;
+
+/*
+ * pseries SMT means "LPAR per core" mode, e.g., msgsndp is usable
+ * between threads.
+ */
+if (env->flags & POWERPC_FLAG_SMT) {
+env->flags |= POWERPC_FLAG_1LPAR;
+}
+}
 #endif /* !defined(CONFIG_USER_ONLY) */
 
 #endif /* defined(TARGET_PPC64) */
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 372ee600b2..ef186396b4 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -256,6 +256,16 @@ static inline bool gen_serialize_core(DisasContext *ctx)
 }
 #endif
 
+static inline bool gen_serialize_core_lpar(DisasContext *ctx)
+{
+/* 1LPAR implies SMT */
+if (ctx->flags & POWERPC_FLAG_1LPAR) {
+return gen_serialize(ctx);
+}
+
+return true;
+}
+
 /* SPR load/store helpers */
 static inline void gen_load_spr(TCGv t, int reg)
 {
@@ -451,7 +461,7 @@ static void spr_write_CTRL_ST(DisasContext *ctx, int sprn, 
int gprn)
 
 void spr_write_CTRL(DisasContext *ctx, int sprn, int gprn)
 {
-if (!(ctx->flags & POWERPC_FLAG_SMT)) {
+if (!(ctx->flags & POWERPC_FLAG_1LPAR)) {
 spr_write_CTRL_ST(ctx, sprn, gprn);
 goto out;
 }
@@ -815,7 +825,7 @@ void spr_write_pcr(DisasContext *ctx, int sprn, int gprn)
 /* DPDES */
 void spr_read_dpdes(DisasContext *ctx, int gprn, int sprn)
 {
-if (!gen_serialize_core(ctx)) {
+if (!gen_serialize_core_lpar(ctx)) {
 return;
 }
 
@@ -824,7 +834,7 @@ void spr_read_dpdes(DisasContext *ctx, int gprn, int sprn)
 
 void spr_write_dpdes(DisasContext *ctx, int sprn, int gprn)
 {
-if (!gen_serialize_core(ctx)) {
+if (!gen_serialize_core_lpar(ctx)) {
 re

[RFC PATCH 0/3] ppc/pnv: SMT support for powernv

2023-06-28 Thread Nicholas Piggin

These patches implement enough to boot a SMT powernv machine to Linux
and boot a SMP KVM guest inside that.

There are a few more SPRs that need to be done, and per-LPAR SPRs are
mostly not annotated yet so it can't run in 1LPAR mode. But it is
enough to run skiboot/Linux with SMT so I'll just post the minimal
patches for RFC because the concept isn't really different to add
more SPRs and things.

Thanks,
Nick

Nicholas Piggin (3):
  target/ppc: Add LPAR-per-core vs per-thread mode flag
  target/ppc: SMT support for the HID SPR
  ppc/pnv: SMT support for powernv

 hw/ppc/pnv.c | 12 
 hw/ppc/pnv_core.c| 13 +
 hw/ppc/spapr_cpu_core.c  |  2 ++
 target/ppc/cpu.h |  3 +++
 target/ppc/cpu_init.c| 14 +-
 target/ppc/helper.h  |  1 +
 target/ppc/misc_helper.c | 21 +
 target/ppc/spr_common.h  |  1 +
 target/ppc/translate.c   | 32 +---
 9 files changed, 87 insertions(+), 12 deletions(-)

-- 
2.40.1

[PATCH] hw/ppc: Fix clock update drift

2023-06-28 Thread Nicholas Piggin

The clock update logic reads the clock twice to compute the new clock
value, with a value derived from the later time subtracted from a value
derived from the earlier time. The delta causes time to be lost.

This can ultimately result in time becoming unsynchronized between CPUs
and that can cause OS lockups, timeouts, watchdogs, etc. This can be
seen running a KVM guest (that causes lots of TB updates) on a powernv
SMP machine.

Fix this by reading the clock once.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Nicholas Piggin 
---
I also made a test case that can trigger this with kvm-unit-tests, but
it's been taking me a while to get that upstreamed.

Thanks,
Nick

 hw/ppc/ppc.c | 33 +
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 82e4408c5c..6233f43c01 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -535,23 +535,24 @@ static inline void cpu_ppc_store_tb(ppc_tb_t *tb_env, 
uint64_t vmclk,
 void cpu_ppc_store_tbl (CPUPPCState *env, uint32_t value)
 {
 ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 uint64_t tb;
 
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 
tb_env->tb_offset);
+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
 tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, tb | (uint64_t)value);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset, tb | (uint64_t)value);
 }
 
 static inline void _cpu_ppc_store_tbu(CPUPPCState *env, uint32_t value)
 {
 ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 uint64_t tb;
 
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 
tb_env->tb_offset);
+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
 tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, ((uint64_t)value << 32) | tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset,
+ ((uint64_t)value << 32) | tb);
 }
 
 void cpu_ppc_store_tbu (CPUPPCState *env, uint32_t value)
@@ -584,23 +585,24 @@ uint32_t cpu_ppc_load_atbu (CPUPPCState *env)
 void cpu_ppc_store_atbl (CPUPPCState *env, uint32_t value)
 {
 ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 uint64_t tb;
 
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 
tb_env->atb_offset);
+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->atb_offset);
 tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->atb_offset, tb | (uint64_t)value);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->atb_offset, tb | (uint64_t)value);
 }
 
 void cpu_ppc_store_atbu (CPUPPCState *env, uint32_t value)
 {
 ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 uint64_t tb;
 
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL), 
tb_env->atb_offset);
+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->atb_offset);
 tb &= 0xULL;
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->atb_offset, ((uint64_t)value << 32) | tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->atb_offset,
+ ((uint64_t)value << 32) | tb);
 }
 
 uint64_t cpu_ppc_load_vtb(CPUPPCState *env)
@@ -622,14 +624,13 @@ void cpu_ppc_store_vtb(CPUPPCState *env, uint64_t value)
 void cpu_ppc_store_tbu40(CPUPPCState *env, uint64_t value)
 {
 ppc_tb_t *tb_env = env->tb_env;
+int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 uint64_t tb;
 
-tb = cpu_ppc_get_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
-tb_env->tb_offset);
+tb = cpu_ppc_get_tb(tb_env, clock, tb_env->tb_offset);
 tb &= 0xFFUL;
 tb |= (value & ~0xFFUL);
-cpu_ppc_store_tb(tb_env, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
- &tb_env->tb_offset, tb);
+cpu_ppc_store_tb(tb_env, clock, &tb_env->tb_offset, tb);
 }
 
 static void cpu_ppc_tb_stop (CPUPPCState *env)
-- 
2.40.1

RE: [PATCH v2 2/2] vfio/migration: Make VFIO migration non-experimental

2023-06-28 Thread Duan, Zhenzhong

>-Original Message-
>From: Cédric Le Goater 
>Sent: Thursday, June 29, 2023 12:04 AM
>To: Martins, Joao ; Avihai Horon
>
>Cc: Alex Williamson ; Juan Quintela
>; Peter Xu ; Leonardo Bras
>; Duan, Zhenzhong ;
>Yishai Hadas ; Jason Gunthorpe ;
>Maor Gottlieb ; Kirti Wankhede
>; Tarun Gupta ; qemu-
>de...@nongnu.org
>Subject: Re: [PATCH v2 2/2] vfio/migration: Make VFIO migration non-
>experimental
>
>On 6/28/23 16:51, Joao Martins wrote:
>> On 28/06/2023 13:54, Cédric Le Goater wrote:
>>> On 6/28/23 09:31, Avihai Horon wrote:
 The major parts of VFIO migration are supported today in QEMU. This
 includes basic VFIO migration, device dirty page tracking and
 precopy support.

 Thus, at this point in time, it seems appropriate to make VFIO
 migration
 non-experimental: remove the x prefix from enable_migration
 property, change it to ON_OFF_AUTO and let the default value be AUTO.

 In addition, make the following adjustments:
 1. When enable_migration is ON and migration is not supported, fail
 VFIO
      device realization.
 2. When enable_migration is AUTO (i.e., not explicitly enabled),
 require
      device dirty tracking support. This is because device dirty
 tracking
      is currently the only method to do dirty page tracking, which
 is
      essential for migrating in a reasonable downtime. Setting
      enable_migration to ON will not require device dirty tracking.
 3. Make migration error and blocker messages more elaborate.
 4. Remove error prints in vfio_migration_query_flags().
 5. Rename trace_vfio_migration_probe() to
      trace_vfio_migration_realize().

 Signed-off-by: Avihai Horon 
>>>
>>>
>>> We should rework the return value of most of the routines called by
>>> vfio_migration_realize() and simply use a bool. I think Zhenzhong is
>>> working it.
>>>
>>> Zhenzhong,
>>>
>>> When you resend v4 of the "VFIO migration related refactor and bug fix"
>>> series, please rebase on this patch since it should be merged.
>>>
>>
>> This, and his switchover-ack series from Avihai that preceeds it.
>>
>> Perhaps it might be easier to point to your tree:branch where you are
>> queueing all the patches?
>>
>
>Sure.
>
>I track QEMU patches for various subsystems under :
>
>  https://github.com/legoater/qemu
>
>
>VFIO candidates are under :
>
>   https://github.com/legoater/qemu/tree/vfio-8.1
>
>This is a wip tree, patches come and go. It contains the VFIO patches of the
>day/week, good for testing new ideas and checking CI.
>
>
>The vfio-next branch contains what I am 90% sure to send upstream :
>
>  https://github.com/legoater/qemu/tree/vfio-next

Great, I'll rebase on this branch, thanks for sharing.

Regards
Zhenzhong

Re: [PATCH 5/7] migration: Display error in query-migrate irrelevant of status

2023-06-28 Thread Fabiano Rosas

Peter Xu  writes:

> Display it as long as being set, irrelevant of FAILED status.  E.g., it may
> also be applicable to PAUSED stage of postcopy, to provide hint on what has
> gone wrong.

This might have made the documentation slightly inaccurate:

# @error-desc: the human readable error description string, when
# @status is 'failed'. Clients should not attempt to parse the
# error strings.  (Since 2.7)

But it's not wrong, so:

Reviewed-by: Fabiano Rosas

Re: [PATCH 4/7] migration: Deliver return path file error to migrate state too

2023-06-28 Thread Fabiano Rosas

Peter Xu  writes:

> We've already did this for most of the return path thread errors, but not
> yet for the IO errors happened on the return path qemufile.  Do that too.
>
> Remember to reset "err" always, because the ownership is not us anymore,
> otherwise we're prone to use-after-free later after recovered.
>
> Signed-off-by: Peter Xu 

Reviewed-by: Fabiano Rosas

Re: [PATCH 3/7] migration: Refactor error handling in source return path

2023-06-28 Thread Fabiano Rosas

Peter Xu  writes:

> rp_state.error was a boolean used to show error happened in return path
> thread.  That's not only duplicating error reporting (migrate_set_error),
> but also not good enough in that we only do error_report() and set it to
> true, we never can keep a history of the exact error and show it in
> query-migrate.
>
> To make this better, a few things done:
>
>   - Use error_setg() rather than error_report() across the whole lifecycle
> of return path thread, keeping the error in an Error*.
>
>   - Use migrate_set_error() to apply that captured error to the global
> migration object when error occured in this thread.
>
>   - With above, no need to have mark_source_rp_bad(), remove it, alongside
> with rp_state.error itself.
>
> Signed-off-by: Peter Xu 

Reviewed-by: Fabiano Rosas

Re: [PATCH] target/arm: gdbstub: Guard M-profile code with CONFIG_TCG

2023-06-28 Thread Fabiano Rosas

Fabiano Rosas  writes:

> Philippe Mathieu-Daudé  writes:
>
>> On 28/6/23 18:48, Fabiano Rosas wrote:
>>> This code is only relevant when TCG is present in the build. Building
>>> with --disable-tcg --enable-xen on an x86 host we get:
>>> 
>>> $ ../configure --target-list=x86_64-softmmu,aarch64-softmmu --disable-tcg 
>>> --enable-xen
>>> $ make -j$(nproc)
>>> ...
>>> libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
>>> `m_sysreg_ptr':
>>>   ../target/arm/gdbstub.c:358: undefined reference to `arm_v7m_get_sp_ptr'
>>>   ../target/arm/gdbstub.c:361: undefined reference to `arm_v7m_get_sp_ptr'
>>> 
>>> libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
>>> `arm_gdb_get_m_systemreg':
>>> ../target/arm/gdbstub.c:405: undefined reference to `arm_v7m_mrs_control'
>>
>> I'm a bit confused, isn't this covered by the cross-arm64-xen-only
>> job?
>
> It should be. Perhaps the CI is using different optimization flags. I'll
> try to figure it out.

Yep. The CI has -O2 while I am using --enable-debug.

Re: [PATCH v3] multifd: Add colo support

2023-06-28 Thread Peter Xu

copy hailiang.

On Thu, Jun 22, 2023 at 01:18:46PM +0200, Lukas Straub wrote:
> Like in the normal ram_load() path, put the received pages into the
> colo cache and mark the pages in the bitmap so that they will be
> flushed to the guest later.
> 
> Signed-off-by: Juan Quintela 
> Signed-off-by: Lukas Straub 
> ---
>  migration/meson.build|  1 +
>  migration/multifd-colo.c | 53 
>  migration/multifd-colo.h | 24 ++
>  migration/multifd.c  |  5 
>  4 files changed, 83 insertions(+)
>  create mode 100644 migration/multifd-colo.c
>  create mode 100644 migration/multifd-colo.h
> 
> diff --git a/migration/meson.build b/migration/meson.build
> index 1ae28523a1..063e0e0a8c 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -21,6 +21,7 @@ system_ss.add(files(
>'migration.c',
>'multifd.c',
>'multifd-zlib.c',
> +  'multifd-colo.c',
>'ram-compress.c',
>'options.c',
>'postcopy-ram.c',
> diff --git a/migration/multifd-colo.c b/migration/multifd-colo.c
> new file mode 100644
> index 00..4872dc6d01
> --- /dev/null
> +++ b/migration/multifd-colo.c
> @@ -0,0 +1,53 @@
> +/*
> + * multifd colo implementation
> + *
> + * Copyright (c) Lukas Straub 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "exec/target_page.h"
> +#include "exec/ramblock.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
> +#include "ram.h"
> +#include "multifd.h"
> +#include "options.h"
> +#include "io/channel-socket.h"
> +#include "migration/colo.h"
> +#include "multifd-colo.h"
> +
> +void multifd_colo_prepare_recv_pages(MultiFDRecvParams *p)
> +{
> +if (!migrate_colo())
> +return;
> +
> +assert(p->block->colo_cache);
> +
> +/*
> + * While we're still in precopy state (not yet in colo state), we copy
> + * received pages to both guest and cache. No need to set dirty bits,
> + * since guest and cache memory are in sync.
> + */
> +if (migration_incoming_in_colo_state()) {
> +colo_record_bitmap(p->block, p->normal, p->normal_num);
> +}
> +p->host = p->block->colo_cache;
> +}
> +
> +void multifd_colo_process_recv_pages(MultiFDRecvParams *p)
> +{
> +if (!migrate_colo())
> +return;
> +
> +if (!migration_incoming_in_colo_state()) {
> +for (int i = 0; i < p->normal_num; i++) {
> +void *guest = p->block->host + p->normal[i];
> +void *cache = p->host + p->normal[i];
> +memcpy(guest, cache, p->page_size);
> +}
> +}
> +p->host = p->block->host;
> +}
> diff --git a/migration/multifd-colo.h b/migration/multifd-colo.h
> new file mode 100644
> index 00..58920a0583
> --- /dev/null
> +++ b/migration/multifd-colo.h
> @@ -0,0 +1,24 @@
> +/*
> + * multifd colo header
> + *
> + * Copyright (c) Lukas Straub 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef QEMU_MIGRATION_MULTIFD_COLO_H
> +#define QEMU_MIGRATION_MULTIFD_COLO_H
> +
> +#ifdef CONFIG_REPLICATION
> +
> +void multifd_colo_prepare_recv_pages(MultiFDRecvParams *p);
> +void multifd_colo_process_recv_pages(MultiFDRecvParams *p);
> +
> +#else
> +
> +static inline void multifd_colo_prepare_recv_pages(MultiFDRecvParams *p) {}
> +static inline void multifd_colo_process_recv_pages(MultiFDRecvParams *p) {}
> +
> +#endif
> +#endif
> diff --git a/migration/multifd.c b/migration/multifd.c
> index 3387d8277f..6b031c1fd2 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -25,6 +25,7 @@
>  #include "qemu-file.h"
>  #include "trace.h"
>  #include "multifd.h"
> +#include "multifd-colo.h"
>  #include "threadinfo.h"
>  #include "options.h"
>  #include "qemu/yank.h"
> @@ -1134,10 +1135,14 @@ static void *multifd_recv_thread(void *opaque)
>  qemu_mutex_unlock(&p->mutex);
>  
>  if (p->normal_num) {
> +multifd_colo_prepare_recv_pages(p);
> +
>  ret = multifd_recv_state->ops->recv_pages(p, &local_err);
>  if (ret != 0) {
>  break;
>  }
> +
> +multifd_colo_process_recv_pages(p);
>  }
>  
>  if (flags & MULTIFD_FLAG_SYNC) {
> -- 
> 2.39.2



-- 
Peter Xu

Re: [PATCH] target/arm: gdbstub: Guard M-profile code with CONFIG_TCG

2023-06-28 Thread Fabiano Rosas

Philippe Mathieu-Daudé  writes:

> On 28/6/23 18:48, Fabiano Rosas wrote:
>> This code is only relevant when TCG is present in the build. Building
>> with --disable-tcg --enable-xen on an x86 host we get:
>> 
>> $ ../configure --target-list=x86_64-softmmu,aarch64-softmmu --disable-tcg 
>> --enable-xen
>> $ make -j$(nproc)
>> ...
>> libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
>> `m_sysreg_ptr':
>>   ../target/arm/gdbstub.c:358: undefined reference to `arm_v7m_get_sp_ptr'
>>   ../target/arm/gdbstub.c:361: undefined reference to `arm_v7m_get_sp_ptr'
>> 
>> libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
>> `arm_gdb_get_m_systemreg':
>> ../target/arm/gdbstub.c:405: undefined reference to `arm_v7m_mrs_control'
>
> I'm a bit confused, isn't this covered by the cross-arm64-xen-only
> job?

It should be. Perhaps the CI is using different optimization flags. I'll
try to figure it out.

[PATCH 7/7] migration: Provide explicit error message for file shutdowns

2023-06-28 Thread Peter Xu

Provide an explicit reason for qemu_file_shutdown()s, which can be
displayed in query-migrate when used.

This will make e.g. migrate-pause to display explicit error descriptions,
from:

"error-desc": "Channel error: Input/output error"

To:

"error-desc": "Channel is explicitly shutdown by the user"

in query-migrate.

Signed-off-by: Peter Xu 
---
 migration/qemu-file.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 419b4092e7..ff605027de 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -87,7 +87,10 @@ int qemu_file_shutdown(QEMUFile *f)
  *  --> guest crash!
  */
 if (!f->last_error) {
-qemu_file_set_error(f, -EIO);
+Error *err = NULL;
+
+error_setg(&err, "Channel is explicitly shutdown by the user");
+qemu_file_set_error_obj(f, -EIO, err);
 }
 
 if (!qio_channel_has_feature(f->ioc,
-- 
2.41.0

[PATCH 0/7] migration: Better error handling in return path thread

2023-06-28 Thread Peter Xu

This is a small series that reworks error handling of postcopy return path
threads.

We used to contain a bunch of error_report(), converting them into
error_setg() properly and deliver any of those errors to migration generic
error reports (via migrate_set_error()).  Then these errors can also be
observed in query-migrate after postcopy is paused.

Dropped the return-path specific error reporting: mark_source_rp_bad(),
because it's a duplication if we can always use migrate_set_error().

Please have a look, thanks.

Peter Xu (7):
  migration: Let migrate_set_error() take ownership
  migration: Introduce migrate_has_error()
  migration: Refactor error handling in source return path
  migration: Deliver return path file error to migrate state too
  migration: Display error in query-migrate irrelevant of status
  qemufile: Always return a verbose error
  migration: Provide explicit error message for file shutdowns

 migration/migration.h|   8 +-
 migration/ram.h  |   5 +-
 migration/channel.c  |   1 -
 migration/migration.c| 168 +++
 migration/multifd.c  |  10 +--
 migration/postcopy-ram.c |   1 -
 migration/qemu-file.c|  20 -
 migration/ram.c  |  42 +-
 migration/trace-events   |   2 +-
 9 files changed, 147 insertions(+), 110 deletions(-)

-- 
2.41.0

[PATCH 3/7] migration: Refactor error handling in source return path

2023-06-28 Thread Peter Xu

rp_state.error was a boolean used to show error happened in return path
thread.  That's not only duplicating error reporting (migrate_set_error),
but also not good enough in that we only do error_report() and set it to
true, we never can keep a history of the exact error and show it in
query-migrate.

To make this better, a few things done:

  - Use error_setg() rather than error_report() across the whole lifecycle
of return path thread, keeping the error in an Error*.

  - Use migrate_set_error() to apply that captured error to the global
migration object when error occured in this thread.

  - With above, no need to have mark_source_rp_bad(), remove it, alongside
with rp_state.error itself.

Signed-off-by: Peter Xu 
---
 migration/migration.h  |   1 -
 migration/ram.h|   5 +-
 migration/migration.c  | 118 -
 migration/ram.c|  41 +++---
 migration/trace-events |   2 +-
 5 files changed, 82 insertions(+), 85 deletions(-)

diff --git a/migration/migration.h b/migration/migration.h
index e48317ca97..d9c4b753b5 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -290,7 +290,6 @@ struct MigrationState {
 /* Protected by qemu_file_lock */
 QEMUFile *from_dst_file;
 QemuThreadrp_thread;
-bool  error;
 /*
  * We can also check non-zero of rp_thread, but there's no "official"
  * way to do this, so this bool makes it slightly more elegant.
diff --git a/migration/ram.h b/migration/ram.h
index ea1f3c25b5..02a4af2db6 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -52,7 +52,8 @@ uint64_t ram_bytes_total(void);
 void mig_throttle_counter_reset(void);
 
 uint64_t ram_pagesize_summary(void);
-int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len);
+int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len,
+ Error **errp);
 void ram_postcopy_migrated_memory_release(MigrationState *ms);
 /* For outgoing discard bitmap */
 void ram_postcopy_send_discard_bitmap(MigrationState *ms);
@@ -72,7 +73,7 @@ void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr);
 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr);
 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
   const char *block_name);
-int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb);
+int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp);
 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start);
 void postcopy_preempt_shutdown_file(MigrationState *s);
 void *postcopy_preempt_thread(void *opaque);
diff --git a/migration/migration.c b/migration/migration.c
index 39674a3981..f8c41c4d98 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1409,7 +1409,6 @@ void migrate_init(MigrationState *s)
 s->to_dst_file = NULL;
 s->state = MIGRATION_STATUS_NONE;
 s->rp_state.from_dst_file = NULL;
-s->rp_state.error = false;
 s->mbps = 0.0;
 s->pages_per_second = 0.0;
 s->downtime = 0;
@@ -1724,16 +1723,6 @@ void qmp_migrate_continue(MigrationStatus state, Error 
**errp)
 qemu_sem_post(&s->pause_sem);
 }
 
-/* migration thread support */
-/*
- * Something bad happened to the RP stream, mark an error
- * The caller shall print or trace something to indicate why
- */
-static void mark_source_rp_bad(MigrationState *s)
-{
-s->rp_state.error = true;
-}
-
 static struct rp_cmd_args {
 ssize_t len; /* -1 = variable */
 const char *name;
@@ -1754,7 +1743,7 @@ static struct rp_cmd_args {
  * and we don't need to send pages that have already been sent.
  */
 static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
-   ram_addr_t start, size_t len)
+ram_addr_t start, size_t len, Error 
**errp)
 {
 long our_host_ps = qemu_real_host_page_size();
 
@@ -1766,15 +1755,12 @@ static void migrate_handle_rp_req_pages(MigrationState 
*ms, const char* rbname,
  */
 if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
 !QEMU_IS_ALIGNED(len, our_host_ps)) {
-error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
- " len: %zd", __func__, start, len);
-mark_source_rp_bad(ms);
+error_setg(errp, "MIG_RP_MSG_REQ_PAGES: Misaligned page request, 
start:"
+   RAM_ADDR_FMT " len: %zd", start, len);
 return;
 }
 
-if (ram_save_queue_pages(rbname, start, len)) {
-mark_source_rp_bad(ms);
-}
+ram_save_queue_pages(rbname, start, len, errp);
 }
 
 /* Return true to retry, false to quit */
@@ -1789,26 +1775,28 @@ static bool 
postcopy_pause_return_path_thread(MigrationState *s)
 return true;
 }
 
-static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
+static int migrate_handle_rp_recv_bitmap(Migra

[PATCH 6/7] qemufile: Always return a verbose error

2023-06-28 Thread Peter Xu

There're a lot of cases where we only have an errno set in last_error but
without a detailed error description.  When this happens, try to generate
an error contains the errno as a descriptive error.

This will be helpful in cases where one relies on the Error*.  E.g.,
migration state only caches Error* in MigrationState.error.  With this,
we'll display correct error messages in e.g. query-migrate when the error
was only set by qemu_file_set_error().

Signed-off-by: Peter Xu 
---
 migration/qemu-file.c | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index acc282654a..419b4092e7 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -156,15 +156,24 @@ void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks 
*hooks)
  *
  * Return negative error value if there has been an error on previous
  * operations, return 0 if no error happened.
- * Optional, it returns Error* in errp, but it may be NULL even if return value
- * is not 0.
  *
+ * If errp is specified, a verbose error message will be copied over.
  */
 int qemu_file_get_error_obj(QEMUFile *f, Error **errp)
 {
+if (!f->last_error) {
+return 0;
+}
+
+/* There is an error */
 if (errp) {
-*errp = f->last_error_obj ? error_copy(f->last_error_obj) : NULL;
+if (f->last_error_obj) {
+*errp = error_copy(f->last_error_obj);
+} else {
+error_setg_errno(errp, -f->last_error, "Channel error");
+}
 }
+
 return f->last_error;
 }
 
-- 
2.41.0

[PATCH 5/7] migration: Display error in query-migrate irrelevant of status

2023-06-28 Thread Peter Xu

Display it as long as being set, irrelevant of FAILED status.  E.g., it may
also be applicable to PAUSED stage of postcopy, to provide hint on what has
gone wrong.

The error_mutex seems to be overlooked when referencing the error, add it
to be very safe.

Signed-off-by: Peter Xu 
---
 migration/migration.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 234dd3601d..7455353918 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1033,9 +1033,6 @@ static void fill_source_migration_info(MigrationInfo 
*info)
 break;
 case MIGRATION_STATUS_FAILED:
 info->has_status = true;
-if (s->error) {
-info->error_desc = g_strdup(error_get_pretty(s->error));
-}
 break;
 case MIGRATION_STATUS_CANCELLED:
 info->has_status = true;
@@ -1045,6 +1042,11 @@ static void fill_source_migration_info(MigrationInfo 
*info)
 break;
 }
 info->status = state;
+
+QEMU_LOCK_GUARD(&s->error_mutex);
+if (s->error) {
+info->error_desc = g_strdup(error_get_pretty(s->error));
+}
 }
 
 static void fill_destination_migration_info(MigrationInfo *info)
-- 
2.41.0

[PATCH 1/7] migration: Let migrate_set_error() take ownership

2023-06-28 Thread Peter Xu

migrate_set_error() used one error_copy() so it always copy an error.
However that's not the major use case - the major use case is one would
like to pass the error to migrate_set_error() without further touching the
error.

It can be proved if we see most of the callers are freeing the error
explicitly right afterwards.  There're a few outliers (only if when the
caller) where we can use error_copy() explicitly there.

Signed-off-by: Peter Xu 
---
 migration/migration.h|  6 +++---
 migration/channel.c  |  1 -
 migration/migration.c| 20 ++--
 migration/multifd.c  | 10 --
 migration/postcopy-ram.c |  1 -
 migration/ram.c  |  1 -
 6 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/migration/migration.h b/migration/migration.h
index 721b1c9473..32f87e3834 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -468,8 +468,8 @@ bool  migration_has_all_channels(void);
 
 uint64_t migrate_max_downtime(void);
 
-void migrate_set_error(MigrationState *s, const Error *error);
-void migrate_fd_error(MigrationState *s, const Error *error);
+void migrate_set_error(MigrationState *s, Error *error);
+void migrate_fd_error(MigrationState *s, Error *error);
 
 void migrate_fd_connect(MigrationState *s, Error *error_in);
 
@@ -513,7 +513,7 @@ int foreach_not_ignored_block(RAMBlockIterFunc func, void 
*opaque);
 void migration_make_urgent_request(void);
 void migration_consume_urgent_request(void);
 bool migration_rate_limit(void);
-void migration_cancel(const Error *error);
+void migration_cancel(Error *error);
 
 void populate_vfio_info(MigrationInfo *info);
 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
diff --git a/migration/channel.c b/migration/channel.c
index ca3319a309..48b3f6abd6 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -90,7 +90,6 @@ void migration_channel_connect(MigrationState *s,
 }
 }
 migrate_fd_connect(s, error);
-error_free(error);
 }
 
 
diff --git a/migration/migration.c b/migration/migration.c
index 203d40d4c9..13dccc4c12 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -160,7 +160,7 @@ void migration_object_init(void)
 dirty_bitmap_mig_init();
 }
 
-void migration_cancel(const Error *error)
+void migration_cancel(Error *error)
 {
 if (error) {
 migrate_set_error(current_migration, error);
@@ -1197,11 +1197,20 @@ static void migrate_fd_cleanup_bh(void *opaque)
 object_unref(OBJECT(s));
 }
 
-void migrate_set_error(MigrationState *s, const Error *error)
+/*
+ * Set error for current migration state.  The `error' ownership will be
+ * moved from the caller to MigrationState, so the caller doesn't need to
+ * free the error.
+ *
+ * If the caller still needs to reference the `error' passed in, one should
+ * use error_copy() explicitly.
+ */
+void migrate_set_error(MigrationState *s, Error *error)
 {
 QEMU_LOCK_GUARD(&s->error_mutex);
 if (!s->error) {
-s->error = error_copy(error);
+/* Record the first error triggered */
+s->error = error;
 }
 }
 
@@ -1214,7 +1223,7 @@ static void migrate_error_free(MigrationState *s)
 }
 }
 
-void migrate_fd_error(MigrationState *s, const Error *error)
+void migrate_fd_error(MigrationState *s, Error *error)
 {
 trace_migrate_fd_error(error_get_pretty(error));
 assert(s->to_dst_file == NULL);
@@ -1678,7 +1687,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 if (!(has_resume && resume)) {
 yank_unregister_instance(MIGRATION_YANK_INSTANCE);
 }
-migrate_fd_error(s, local_err);
+migrate_fd_error(s, error_copy(local_err));
 error_propagate(errp, local_err);
 return;
 }
@@ -2595,7 +2604,6 @@ static MigThrError migration_detect_error(MigrationState 
*s)
 
 if (local_error) {
 migrate_set_error(s, local_error);
-error_free(local_error);
 }
 
 if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
diff --git a/migration/multifd.c b/migration/multifd.c
index 3387d8277f..62bc2dbf49 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -551,7 +551,6 @@ void multifd_save_cleanup(void)
 multifd_send_state->ops->send_cleanup(p, &local_err);
 if (local_err) {
 migrate_set_error(migrate_get_current(), local_err);
-error_free(local_err);
 }
 }
 qemu_sem_destroy(&multifd_send_state->channels_ready);
@@ -750,7 +749,6 @@ out:
 if (local_err) {
 trace_multifd_send_error(p->id);
 multifd_send_terminate_threads(local_err);
-error_free(local_err);
 }
 
 /*
@@ -883,7 +881,6 @@ static void 
multifd_new_send_channel_cleanup(MultiFDSendParams *p,
   */
  p->quit = true;
  object_unref(OBJECT(ioc));
- error_free(err);
 }
 
 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
@@ -1148,7 +1145,6 @@ static void *multifd_recv_thread(void *opaque)
 
 if (l

[PATCH 2/7] migration: Introduce migrate_has_error()

2023-06-28 Thread Peter Xu

Introduce a helper to detect whether MigrationState.error is set for
whatever reason.  It is intended to not taking the error_mutex here because
neither do we reference the pointer, nor do we modify the pointer.  State
why it's safe to do so.

This is preparation work for any thread (e.g. source return path thread) to
setup errors in an unified way to MigrationState, rather than relying on
its own way to set errors (mark_source_rp_bad()).

Signed-off-by: Peter Xu 
---
 migration/migration.h |  1 +
 migration/migration.c | 15 +++
 2 files changed, 16 insertions(+)

diff --git a/migration/migration.h b/migration/migration.h
index 32f87e3834..e48317ca97 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -469,6 +469,7 @@ bool  migration_has_all_channels(void);
 uint64_t migrate_max_downtime(void);
 
 void migrate_set_error(MigrationState *s, Error *error);
+bool migrate_has_error(MigrationState *s);
 void migrate_fd_error(MigrationState *s, Error *error);
 
 void migrate_fd_connect(MigrationState *s, Error *error_in);
diff --git a/migration/migration.c b/migration/migration.c
index 13dccc4c12..39674a3981 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1214,6 +1214,21 @@ void migrate_set_error(MigrationState *s, Error *error)
 }
 }
 
+/*
+ * Whether the migration state has error set?
+ *
+ * Note this function explicitly didn't use error_mutex, because it only
+ * reads the error pointer for a boolean status.
+ *
+ * As long as the Error* is set, it shouldn't be freed before migration
+ * cleanup, so any thread can use this helper to safely detect whether
+ * there's anything wrong happened already.
+ */
+bool migrate_has_error(MigrationState *s)
+{
+return qatomic_read(&s->error);
+}
+
 static void migrate_error_free(MigrationState *s)
 {
 QEMU_LOCK_GUARD(&s->error_mutex);
-- 
2.41.0

[PATCH 4/7] migration: Deliver return path file error to migrate state too

2023-06-28 Thread Peter Xu

We've already did this for most of the return path thread errors, but not
yet for the IO errors happened on the return path qemufile.  Do that too.

Remember to reset "err" always, because the ownership is not us anymore,
otherwise we're prone to use-after-free later after recovered.

Signed-off-by: Peter Xu 
---
 migration/migration.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index f8c41c4d98..234dd3601d 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1992,6 +1992,13 @@ out:
 
 res = qemu_file_get_error(rp);
 if (res) {
+/* We have forwarded any error in "err" already, reuse "error" */
+assert(err == NULL);
+/* Try to deliver this file error to migration state */
+qemu_file_get_error_obj(rp, &err);
+migrate_set_error(ms, err);
+err = NULL;
+
 if (res && migration_in_postcopy()) {
 /*
  * Maybe there is something we can do: it looks like a
-- 
2.41.0

[PATCH v6 20/20] target/riscv/kvm.c: read/write (cbom|cboz)_blocksize in KVM

2023-06-28 Thread Daniel Henrique Barboza

If we don't set a proper cbom_blocksize|cboz_blocksize in the FDT the
Linux Kernel will fail to detect the availability of the CBOM/CBOZ
extensions, regardless of the contents of the 'riscv,isa' DT prop.

The FDT is being written using the cpu->cfg.cbom|z_blocksize attributes,
so let's expose them as user properties like it is already done with
TCG.

This will also require us to determine proper blocksize values during
init() time since the FDT is already created during realize(). We'll
take a ride in kvm_riscv_init_multiext_cfg() to do it. Note that we
don't need to fetch both cbom and cboz blocksizes every time: check for
their parent extensions (icbom and icboz) and only read the blocksizes
if needed.

In contrast with cbom/z_blocksize properties from TCG, the user is not
able to set any value that is different from the 'host' value when
running KVM. KVM can be particularly harsh dealing with it: a ENOTSUPP
can be thrown for the mere attempt of executing kvm_set_one_reg() for
these 2 regs.

Hopefully, we don't need to call kvm_set_one_reg() for these regs.
We'll check if the user input matches the host value in
kvm_cpu_set_cbomz_blksize(), the set() accessor for both blocksize
properties. We'll fail fast since it's already known to not be
supported.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
---
 target/riscv/kvm.c | 70 ++
 1 file changed, 70 insertions(+)

diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index d1bf518893..e535884830 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -276,6 +276,42 @@ static void kvm_cpu_set_multi_ext_cfg(Object *obj, Visitor 
*v,
 kvm_cpu_cfg_set(cpu, multi_ext_cfg, value);
 }
 
+static KVMCPUConfig kvm_cbom_blocksize = {
+.name = "cbom_blocksize",
+.offset = CPUCFG(cbom_blocksize),
+.kvm_reg_id = KVM_REG_RISCV_CONFIG_REG(zicbom_block_size)
+};
+
+static KVMCPUConfig kvm_cboz_blocksize = {
+.name = "cboz_blocksize",
+.offset = CPUCFG(cboz_blocksize),
+.kvm_reg_id = KVM_REG_RISCV_CONFIG_REG(zicboz_block_size)
+};
+
+static void kvm_cpu_set_cbomz_blksize(Object *obj, Visitor *v,
+  const char *name,
+  void *opaque, Error **errp)
+{
+KVMCPUConfig *cbomz_cfg = opaque;
+RISCVCPU *cpu = RISCV_CPU(obj);
+uint16_t value, *host_val;
+
+if (!visit_type_uint16(v, name, &value, errp)) {
+return;
+}
+
+host_val = kvmconfig_get_cfg_addr(cpu, cbomz_cfg);
+
+if (value != *host_val) {
+error_report("Unable to set %s to a different value than "
+ "the host (%u)",
+ cbomz_cfg->name, *host_val);
+exit(EXIT_FAILURE);
+}
+
+cbomz_cfg->user_set = true;
+}
+
 static void kvm_riscv_update_cpu_cfg_isa_ext(RISCVCPU *cpu, CPUState *cs)
 {
 CPURISCVState *env = &cpu->env;
@@ -329,6 +365,14 @@ static void kvm_riscv_add_cpu_user_properties(Object 
*cpu_obj)
 kvm_cpu_set_multi_ext_cfg,
 NULL, multi_cfg);
 }
+
+object_property_add(cpu_obj, "cbom_blocksize", "uint16",
+NULL, kvm_cpu_set_cbomz_blksize,
+NULL, &kvm_cbom_blocksize);
+
+object_property_add(cpu_obj, "cboz_blocksize", "uint16",
+NULL, kvm_cpu_set_cbomz_blksize,
+NULL, &kvm_cboz_blocksize);
 }
 
 static int kvm_riscv_get_regs_core(CPUState *cs)
@@ -644,6 +688,24 @@ static void kvm_riscv_init_misa_ext_mask(RISCVCPU *cpu,
 env->misa_ext = env->misa_ext_mask;
 }
 
+static void kvm_riscv_read_cbomz_blksize(RISCVCPU *cpu, KVMScratchCPU *kvmcpu,
+ KVMCPUConfig *cbomz_cfg)
+{
+CPURISCVState *env = &cpu->env;
+struct kvm_one_reg reg;
+int ret;
+
+reg.id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
+  cbomz_cfg->kvm_reg_id);
+reg.addr = (uint64_t)kvmconfig_get_cfg_addr(cpu, cbomz_cfg);
+ret = ioctl(kvmcpu->cpufd, KVM_GET_ONE_REG, ®);
+if (ret != 0) {
+error_report("Unable to read KVM reg %s, error %d",
+ cbomz_cfg->name, ret);
+exit(EXIT_FAILURE);
+}
+}
+
 static void kvm_riscv_init_multiext_cfg(RISCVCPU *cpu, KVMScratchCPU *kvmcpu)
 {
 CPURISCVState *env = &cpu->env;
@@ -675,6 +737,14 @@ static void kvm_riscv_init_multiext_cfg(RISCVCPU *cpu, 
KVMScratchCPU *kvmcpu)
 
 kvm_cpu_cfg_set(cpu, multi_ext_cfg, val);
 }
+
+if (cpu->cfg.ext_icbom) {
+kvm_riscv_read_cbomz_blksize(cpu, kvmcpu, &kvm_cbom_blocksize);
+}
+
+if (cpu->cfg.ext_icboz) {
+kvm_riscv_read_cbomz_blksize(cpu, kvmcpu, &kvm_cboz_blocksize);
+}
 }
 
 void kvm_riscv_init_user_properties(Object *cpu_obj)
-- 
2.41.0

[PATCH v6 09/20] linux-headers: Update to v6.4-rc1

2023-06-28 Thread Daniel Henrique Barboza

Update to commit ac9a78681b92 ("Linux 6.4-rc1").

Signed-off-by: Daniel Henrique Barboza 
Acked-by: Alistair Francis 
---
 include/standard-headers/linux/const.h|  2 +-
 include/standard-headers/linux/virtio_blk.h   | 18 +++
 .../standard-headers/linux/virtio_config.h|  6 +++
 include/standard-headers/linux/virtio_net.h   |  1 +
 linux-headers/asm-arm64/kvm.h | 33 
 linux-headers/asm-riscv/kvm.h | 53 ++-
 linux-headers/asm-riscv/unistd.h  |  9 
 linux-headers/asm-s390/unistd_32.h|  1 +
 linux-headers/asm-s390/unistd_64.h|  1 +
 linux-headers/asm-x86/kvm.h   |  3 ++
 linux-headers/linux/const.h   |  2 +-
 linux-headers/linux/kvm.h | 12 +++--
 linux-headers/linux/psp-sev.h |  7 +++
 linux-headers/linux/userfaultfd.h | 17 +-
 14 files changed, 149 insertions(+), 16 deletions(-)

diff --git a/include/standard-headers/linux/const.h 
b/include/standard-headers/linux/const.h
index 5e48987251..1eb84b5087 100644
--- a/include/standard-headers/linux/const.h
+++ b/include/standard-headers/linux/const.h
@@ -28,7 +28,7 @@
 #define _BITUL(x)  (_UL(1) << (x))
 #define _BITULL(x) (_ULL(1) << (x))
 
-#define __ALIGN_KERNEL(x, a)   __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 
1)
+#define __ALIGN_KERNEL(x, a)   __ALIGN_KERNEL_MASK(x, 
(__typeof__(x))(a) - 1)
 #define __ALIGN_KERNEL_MASK(x, mask)   (((x) + (mask)) & ~(mask))
 
 #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
diff --git a/include/standard-headers/linux/virtio_blk.h 
b/include/standard-headers/linux/virtio_blk.h
index 7155b1a470..d7be3cf5e4 100644
--- a/include/standard-headers/linux/virtio_blk.h
+++ b/include/standard-headers/linux/virtio_blk.h
@@ -138,11 +138,11 @@ struct virtio_blk_config {
 
/* Zoned block device characteristics (if VIRTIO_BLK_F_ZONED) */
struct virtio_blk_zoned_characteristics {
-   uint32_t zone_sectors;
-   uint32_t max_open_zones;
-   uint32_t max_active_zones;
-   uint32_t max_append_sectors;
-   uint32_t write_granularity;
+   __virtio32 zone_sectors;
+   __virtio32 max_open_zones;
+   __virtio32 max_active_zones;
+   __virtio32 max_append_sectors;
+   __virtio32 write_granularity;
uint8_t model;
uint8_t unused2[3];
} zoned;
@@ -239,11 +239,11 @@ struct virtio_blk_outhdr {
  */
 struct virtio_blk_zone_descriptor {
/* Zone capacity */
-   uint64_t z_cap;
+   __virtio64 z_cap;
/* The starting sector of the zone */
-   uint64_t z_start;
+   __virtio64 z_start;
/* Zone write pointer position in sectors */
-   uint64_t z_wp;
+   __virtio64 z_wp;
/* Zone type */
uint8_t z_type;
/* Zone state */
@@ -252,7 +252,7 @@ struct virtio_blk_zone_descriptor {
 };
 
 struct virtio_blk_zone_report {
-   uint64_t nr_zones;
+   __virtio64 nr_zones;
uint8_t reserved[56];
struct virtio_blk_zone_descriptor zones[];
 };
diff --git a/include/standard-headers/linux/virtio_config.h 
b/include/standard-headers/linux/virtio_config.h
index 965ee6ae23..8a7d0dc8b0 100644
--- a/include/standard-headers/linux/virtio_config.h
+++ b/include/standard-headers/linux/virtio_config.h
@@ -97,6 +97,12 @@
  */
 #define VIRTIO_F_SR_IOV37
 
+/*
+ * This feature indicates that the driver passes extra data (besides
+ * identifying the virtqueue) in its device notifications.
+ */
+#define VIRTIO_F_NOTIFICATION_DATA 38
+
 /*
  * This feature indicates that the driver can reset a queue individually.
  */
diff --git a/include/standard-headers/linux/virtio_net.h 
b/include/standard-headers/linux/virtio_net.h
index c0e797067a..2325485f2c 100644
--- a/include/standard-headers/linux/virtio_net.h
+++ b/include/standard-headers/linux/virtio_net.h
@@ -61,6 +61,7 @@
 #define VIRTIO_NET_F_GUEST_USO655  /* Guest can handle USOv6 in. */
 #define VIRTIO_NET_F_HOST_USO  56  /* Host can handle USO in. */
 #define VIRTIO_NET_F_HASH_REPORT  57   /* Supports hash report */
+#define VIRTIO_NET_F_GUEST_HDRLEN  59  /* Guest provides the exact hdr_len 
value. */
 #define VIRTIO_NET_F_RSS 60/* Supports RSS RX steering */
 #define VIRTIO_NET_F_RSC_EXT 61/* extended coalescing info */
 #define VIRTIO_NET_F_STANDBY 62/* Act as standby for another device
diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h
index d7e7bb885e..38e5957526 100644
--- a/linux-headers/asm-arm64/kvm.h
+++ b/linux-headers/asm-arm64/kvm.h
@@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags {
__u64 reserved[2];
 };
 
+/*
+ * Counter/Timer offset structure. Describe the virtual/physical offset.
+ * To be used with KVM_ARM_SET_COUNTER_OFFSET.
+ */

[PATCH v6 16/20] target/riscv/cpu.c: remove priv_ver check from riscv_isa_string_ext()

2023-06-28 Thread Daniel Henrique Barboza

riscv_isa_string_ext() is being used by riscv_isa_string(), which is
then used by boards to retrieve the 'riscv,isa' string to be written in
the FDT. All this happens after riscv_cpu_realize(), meaning that we're
already past riscv_cpu_validate_set_extensions() and, more important,
riscv_cpu_disable_priv_spec_isa_exts().

This means that all extensions that needed to be disabled due to
priv_spec mismatch are already disabled. Checking this again during
riscv_isa_string_ext() is unneeded. Remove it.

As a bonus, riscv_isa_string_ext() can now be used with the 'host'
KVM-only CPU type since it doesn't have a env->priv_ver assigned and it
would fail this check for no good reason.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
---
 target/riscv/cpu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index b4a6fd8bab..79c8ffe6b7 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -2015,8 +2015,7 @@ static void riscv_isa_string_ext(RISCVCPU *cpu, char 
**isa_str,
 int i;
 
 for (i = 0; i < ARRAY_SIZE(isa_edata_arr); i++) {
-if (cpu->env.priv_ver >= isa_edata_arr[i].min_version &&
-isa_ext_is_enabled(cpu, &isa_edata_arr[i])) {
+if (isa_ext_is_enabled(cpu, &isa_edata_arr[i])) {
 new = g_strconcat(old, "_", isa_edata_arr[i].name, NULL);
 g_free(old);
 old = new;
-- 
2.41.0

[PATCH v6 19/20] target/riscv/kvm.c: add kvmconfig_get_cfg_addr() helper

2023-06-28 Thread Daniel Henrique Barboza

There are 2 places in which we need to get a pointer to a certain
property of the cpu->cfg struct based on property offset. Next patch
will add a couple more.

Create a helper to avoid repeating this code over and over.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
---
 target/riscv/kvm.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index aa4edc8a56..d1bf518893 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -215,11 +215,15 @@ static KVMCPUConfig kvm_multi_ext_cfgs[] = {
 KVM_EXT_CFG("svpbmt", ext_svpbmt, KVM_RISCV_ISA_EXT_SVPBMT),
 };
 
+static void *kvmconfig_get_cfg_addr(RISCVCPU *cpu, KVMCPUConfig *kvmcfg)
+{
+return (void *)&cpu->cfg + kvmcfg->offset;
+}
+
 static void kvm_cpu_cfg_set(RISCVCPU *cpu, KVMCPUConfig *multi_ext,
 uint32_t val)
 {
-int cpu_cfg_offset = multi_ext->offset;
-bool *ext_enabled = (void *)&cpu->cfg + cpu_cfg_offset;
+bool *ext_enabled = kvmconfig_get_cfg_addr(cpu, multi_ext);
 
 *ext_enabled = val;
 }
@@ -227,8 +231,7 @@ static void kvm_cpu_cfg_set(RISCVCPU *cpu, KVMCPUConfig 
*multi_ext,
 static uint32_t kvm_cpu_cfg_get(RISCVCPU *cpu,
 KVMCPUConfig *multi_ext)
 {
-int cpu_cfg_offset = multi_ext->offset;
-bool *ext_enabled = (void *)&cpu->cfg + cpu_cfg_offset;
+bool *ext_enabled = kvmconfig_get_cfg_addr(cpu, multi_ext);
 
 return *ext_enabled;
 }
-- 
2.41.0

[PATCH v6 02/20] hw/riscv/virt.c: skip 'mmu-type' FDT if satp mode not set

2023-06-28 Thread Daniel Henrique Barboza

The absence of a satp mode in riscv_host_cpu_init() is causing the
following error:

$ sudo ./qemu/build/qemu-system-riscv64  -machine virt,accel=kvm \
-m 2G -smp 1  -nographic -snapshot \
-kernel ./guest_imgs/Image \
-initrd ./guest_imgs/rootfs_kvm_riscv64.img \
-append "earlycon=sbi root=/dev/ram rw" \
-cpu host
**
ERROR:../target/riscv/cpu.c:320:satp_mode_str: code should not be
reached
Bail out! ERROR:../target/riscv/cpu.c:320:satp_mode_str: code should
not be reached
Aborted

The error is triggered from create_fdt_socket_cpus() in hw/riscv/virt.c.
It's trying to get satp_mode_str for a NULL cpu->cfg.satp_mode.map.

For this KVM cpu we would need to inherit the satp supported modes
from the RISC-V host. At this moment this is not possible because the
KVM driver does not support it. And even when it does we can't just let
this broken for every other older kernel.

Since mmu-type is not a required node, according to [1], skip the
'mmu-type' FDT node if there's no satp_mode set. We'll revisit this
logic when we can get satp information from KVM.

[1] 
https://github.com/devicetree-org/dt-schema/blob/main/dtschema/schemas/cpu.yaml

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
Reviewed-by: Alistair Francis 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/riscv/virt.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
index 95708d890e..f025a0fcaf 100644
--- a/hw/riscv/virt.c
+++ b/hw/riscv/virt.c
@@ -243,13 +243,13 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int 
socket,
 s->soc[socket].hartid_base + cpu);
 qemu_fdt_add_subnode(ms->fdt, cpu_name);
 
-satp_mode_max = satp_mode_max_from_map(
-s->soc[socket].harts[cpu].cfg.satp_mode.map);
-sv_name = g_strdup_printf("riscv,%s",
-  satp_mode_str(satp_mode_max, is_32_bit));
-qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type", sv_name);
-g_free(sv_name);
-
+if (cpu_ptr->cfg.satp_mode.supported != 0) {
+satp_mode_max = satp_mode_max_from_map(cpu_ptr->cfg.satp_mode.map);
+sv_name = g_strdup_printf("riscv,%s",
+  satp_mode_str(satp_mode_max, is_32_bit));
+qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type", sv_name);
+g_free(sv_name);
+}
 
 name = riscv_isa_string(cpu_ptr);
 qemu_fdt_setprop_string(ms->fdt, cpu_name, "riscv,isa", name);
-- 
2.41.0

[PATCH v6 14/20] target/riscv/kvm.c: add multi-letter extension KVM properties

2023-06-28 Thread Daniel Henrique Barboza

Let's add KVM user properties for the multi-letter extensions that KVM
currently supports: zicbom, zicboz, zihintpause, zbb, ssaia, sstc,
svinval and svpbmt.

As with MISA extensions, we're using the KVMCPUConfig type to hold
information about the state of each extension. However, multi-letter
extensions have more cases to cover than MISA extensions, so we're
adding an extra 'supported' flag as well. This flag will reflect if a
given extension is supported by KVM, i.e. KVM knows how to handle it.
This is determined during KVM extension discovery in
kvm_riscv_init_multiext_cfg(), where we test for EINVAL errors. Any
other error different from EINVAL will cause an abort.

The use of the 'user_set' is similar to what we already do with MISA
extensions: the flag set only if the user is changing the extension
state.

The 'supported' flag will be used later on to make an exception for
users that are disabling multi-letter extensions that are unknown to
KVM.

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c |   8 +++
 target/riscv/kvm.c | 119 +
 2 files changed, 127 insertions(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index f4b1868466..5428402cfa 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1751,6 +1751,14 @@ static void riscv_cpu_add_user_properties(Object *obj)
 riscv_cpu_add_misa_properties(obj);
 
 for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {
+#ifndef CONFIG_USER_ONLY
+if (kvm_enabled()) {
+/* Check if KVM created the property already */
+if (object_property_find(obj, prop->name)) {
+continue;
+}
+}
+#endif
 qdev_property_add_static(dev, prop);
 }
 
diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index ba97b0cbed..68f6538b04 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -113,6 +113,7 @@ typedef struct KVMCPUConfig {
 target_ulong offset;
 int kvm_reg_id;
 bool user_set;
+bool supported;
 } KVMCPUConfig;
 
 #define KVM_MISA_CFG(_bit, _reg_id) \
@@ -197,6 +198,81 @@ static void kvm_riscv_update_cpu_misa_ext(RISCVCPU *cpu, 
CPUState *cs)
 }
 }
 
+#define CPUCFG(_prop) offsetof(struct RISCVCPUConfig, _prop)
+
+#define KVM_EXT_CFG(_name, _prop, _reg_id) \
+{.name = _name, .offset = CPUCFG(_prop), \
+ .kvm_reg_id = _reg_id}
+
+static KVMCPUConfig kvm_multi_ext_cfgs[] = {
+KVM_EXT_CFG("zicbom", ext_icbom, KVM_RISCV_ISA_EXT_ZICBOM),
+KVM_EXT_CFG("zicboz", ext_icboz, KVM_RISCV_ISA_EXT_ZICBOZ),
+KVM_EXT_CFG("zihintpause", ext_zihintpause, KVM_RISCV_ISA_EXT_ZIHINTPAUSE),
+KVM_EXT_CFG("zbb", ext_zbb, KVM_RISCV_ISA_EXT_ZBB),
+KVM_EXT_CFG("ssaia", ext_ssaia, KVM_RISCV_ISA_EXT_SSAIA),
+KVM_EXT_CFG("sstc", ext_sstc, KVM_RISCV_ISA_EXT_SSTC),
+KVM_EXT_CFG("svinval", ext_svinval, KVM_RISCV_ISA_EXT_SVINVAL),
+KVM_EXT_CFG("svpbmt", ext_svpbmt, KVM_RISCV_ISA_EXT_SVPBMT),
+};
+
+static void kvm_cpu_cfg_set(RISCVCPU *cpu, KVMCPUConfig *multi_ext,
+uint32_t val)
+{
+int cpu_cfg_offset = multi_ext->offset;
+bool *ext_enabled = (void *)&cpu->cfg + cpu_cfg_offset;
+
+*ext_enabled = val;
+}
+
+static uint32_t kvm_cpu_cfg_get(RISCVCPU *cpu,
+KVMCPUConfig *multi_ext)
+{
+int cpu_cfg_offset = multi_ext->offset;
+bool *ext_enabled = (void *)&cpu->cfg + cpu_cfg_offset;
+
+return *ext_enabled;
+}
+
+static void kvm_cpu_set_multi_ext_cfg(Object *obj, Visitor *v,
+  const char *name,
+  void *opaque, Error **errp)
+{
+KVMCPUConfig *multi_ext_cfg = opaque;
+RISCVCPU *cpu = RISCV_CPU(obj);
+bool value, host_val;
+
+if (!visit_type_bool(v, name, &value, errp)) {
+return;
+}
+
+host_val = kvm_cpu_cfg_get(cpu, multi_ext_cfg);
+
+/*
+ * Ignore if the user is setting the same value
+ * as the host.
+ */
+if (value == host_val) {
+return;
+}
+
+if (!multi_ext_cfg->supported) {
+/*
+ * Error out if the user is trying to enable an
+ * extension that KVM doesn't support. Ignore
+ * option otherwise.
+ */
+if (value) {
+error_setg(errp, "KVM does not support disabling extension %s",
+   multi_ext_cfg->name);
+}
+
+return;
+}
+
+multi_ext_cfg->user_set = true;
+kvm_cpu_cfg_set(cpu, multi_ext_cfg, value);
+}
+
 static void kvm_riscv_add_cpu_user_properties(Object *cpu_obj)
 {
 int i;
@@ -215,6 +291,15 @@ static void kvm_riscv_add_cpu_user_properties(Object 
*cpu_obj)
 object_property_set_description(cpu_obj, misa_cfg->name,
 misa_cfg->description);
 }
+
+for (i = 0; i < ARRAY_SIZE(kvm_multi_ext_cfgs); i++) {
+KVMCPUConfig *multi_cfg = &kvm_multi_ext_cfgs[i];
+
+object_property_add(cpu_obj

[PATCH v6 18/20] target/riscv: update multi-letter extension KVM properties

2023-06-28 Thread Daniel Henrique Barboza

We're now ready to update the multi-letter extensions status for KVM.

kvm_riscv_update_cpu_cfg_isa_ext() is called called during vcpu creation
time to verify which user options changes host defaults (via the 'user_set'
flag) and tries to write them back to KVM.

Failure to commit a change to KVM is only ignored in case KVM doesn't
know about the extension (-EINVAL error code) and the user wanted to
disable the given extension. Otherwise we're going to abort the boot
process.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
---
 target/riscv/kvm.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index 68f6538b04..aa4edc8a56 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -273,6 +273,32 @@ static void kvm_cpu_set_multi_ext_cfg(Object *obj, Visitor 
*v,
 kvm_cpu_cfg_set(cpu, multi_ext_cfg, value);
 }
 
+static void kvm_riscv_update_cpu_cfg_isa_ext(RISCVCPU *cpu, CPUState *cs)
+{
+CPURISCVState *env = &cpu->env;
+uint64_t id, reg;
+int i, ret;
+
+for (i = 0; i < ARRAY_SIZE(kvm_multi_ext_cfgs); i++) {
+KVMCPUConfig *multi_ext_cfg = &kvm_multi_ext_cfgs[i];
+
+if (!multi_ext_cfg->user_set) {
+continue;
+}
+
+id = kvm_riscv_reg_id(env, KVM_REG_RISCV_ISA_EXT,
+  multi_ext_cfg->kvm_reg_id);
+reg = kvm_cpu_cfg_get(cpu, multi_ext_cfg);
+ret = kvm_set_one_reg(cs, id, ®);
+if (ret != 0) {
+error_report("Unable to %s extension %s in KVM, error %d",
+ reg ? "enable" : "disable",
+ multi_ext_cfg->name, ret);
+exit(EXIT_FAILURE);
+}
+}
+}
+
 static void kvm_riscv_add_cpu_user_properties(Object *cpu_obj)
 {
 int i;
@@ -792,6 +818,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 }
 
 kvm_riscv_update_cpu_misa_ext(cpu, cs);
+kvm_riscv_update_cpu_cfg_isa_ext(cpu, cs);
 
 return ret;
 }
-- 
2.41.0

[PATCH v6 17/20] target/riscv/cpu.c: create KVM mock properties

2023-06-28 Thread Daniel Henrique Barboza

KVM-specific properties are being created inside target/riscv/kvm.c. But
at this moment we're gathering all the remaining properties from TCG and
adding them as is when running KVM. This creates a situation where
non-KVM properties are setting flags to 'true' due to its default
settings (e.g.  Zawrs). Users can also freely enable them via command
line.

This doesn't impact runtime per se because KVM doesn't care about these
flags, but code such as riscv_isa_string_ext() take those flags into
account. The result is that, for a KVM guest, setting non-KVM properties
will make them appear in the riscv,isa DT.

We want to keep the same API for both TCG and KVM and at the same time,
when running KVM, forbid non-KVM extensions to be enabled internally. We
accomplish both by changing riscv_cpu_add_user_properties() to add a
mock boolean property for every non-KVM extension in
riscv_cpu_extensions[]. Then, when running KVM, users are still free to
set extensions at will, but we'll error out if a non-KVM extension is
enabled. Setting such extension to 'false' will be ignored.

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 79c8ffe6b7..6d7a0bc4ae 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1731,6 +1731,26 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
+
+#ifndef CONFIG_USER_ONLY
+static void cpu_set_cfg_unavailable(Object *obj, Visitor *v,
+const char *name,
+void *opaque, Error **errp)
+{
+const char *propname = opaque;
+bool value;
+
+if (!visit_type_bool(v, name, &value, errp)) {
+return;
+}
+
+if (value) {
+error_setg(errp, "extension %s is not available with KVM",
+   propname);
+}
+}
+#endif
+
 /*
  * Add CPU properties with user-facing flags.
  *
@@ -1759,6 +1779,22 @@ static void riscv_cpu_add_user_properties(Object *obj)
 if (object_property_find(obj, prop->name)) {
 continue;
 }
+
+/*
+ * Set the default to disabled for every extension
+ * unknown to KVM and error out if the user attempts
+ * to enable any of them.
+ *
+ * We're giving a pass for non-bool properties since they're
+ * not related to the availability of extensions and can be
+ * safely ignored as is.
+ */
+if (prop->info == &qdev_prop_bool) {
+object_property_add(obj, prop->name, "bool",
+NULL, cpu_set_cfg_unavailable,
+NULL, (void *)prop->name);
+continue;
+}
 }
 #endif
 qdev_property_add_static(dev, prop);
-- 
2.41.0

[PATCH v6 03/20] target/riscv/cpu.c: restrict 'mvendorid' value

2023-06-28 Thread Daniel Henrique Barboza

We're going to change the handling of mvendorid/marchid/mimpid by the
KVM driver. Since these are always present in all CPUs let's put the
same validation for everyone.

It doesn't make sense to allow 'mvendorid' to be different than it
is already set in named (vendor) CPUs. Generic (dynamic) CPUs can have
any 'mvendorid' they want.

Change 'mvendorid' to be a class property created via
'object_class_property_add', instead of using the DEFINE_PROP_UINT32()
macro. This allow us to define a custom setter for it that will verify,
for named CPUs, if mvendorid is different than it is already set by the
CPU. This is the error thrown for the 'veyron-v1' CPU if 'mvendorid' is
set to an invalid value:

$ qemu-system-riscv64 -M virt -nographic -cpu veyron-v1,mvendorid=2
qemu-system-riscv64: can't apply global veyron-v1-riscv-cpu.mvendorid=2:
Unable to change veyron-v1-riscv-cpu mvendorid (0x61f)

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 38 +-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index bbb201a2b3..652988db25 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1730,7 +1730,6 @@ static void riscv_cpu_add_user_properties(Object *obj)
 static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("debug", RISCVCPU, cfg.debug, true),
 
-DEFINE_PROP_UINT32("mvendorid", RISCVCPU, cfg.mvendorid, 0),
 DEFINE_PROP_UINT64("marchid", RISCVCPU, cfg.marchid, RISCV_CPU_MARCHID),
 DEFINE_PROP_UINT64("mimpid", RISCVCPU, cfg.mimpid, RISCV_CPU_MIMPID),
 
@@ -1817,6 +1816,40 @@ static const struct TCGCPUOps riscv_tcg_ops = {
 #endif /* !CONFIG_USER_ONLY */
 };
 
+static bool riscv_cpu_is_dynamic(Object *cpu_obj)
+{
+return object_dynamic_cast(cpu_obj, TYPE_RISCV_DYNAMIC_CPU) != NULL;
+}
+
+static void cpu_set_mvendorid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+bool dynamic_cpu = riscv_cpu_is_dynamic(obj);
+RISCVCPU *cpu = RISCV_CPU(obj);
+uint32_t prev_val = cpu->cfg.mvendorid;
+uint32_t value;
+
+if (!visit_type_uint32(v, name, &value, errp)) {
+return;
+}
+
+if (!dynamic_cpu && prev_val != value) {
+error_setg(errp, "Unable to change %s mvendorid (0x%x)",
+   object_get_typename(obj), prev_val);
+return;
+}
+
+cpu->cfg.mvendorid = value;
+}
+
+static void cpu_get_mvendorid(Object *obj, Visitor *v, const char *name,
+  void *opaque, Error **errp)
+{
+bool value = RISCV_CPU(obj)->cfg.mvendorid;
+
+visit_type_bool(v, name, &value, errp);
+}
+
 static void riscv_cpu_class_init(ObjectClass *c, void *data)
 {
 RISCVCPUClass *mcc = RISCV_CPU_CLASS(c);
@@ -1848,6 +1881,9 @@ static void riscv_cpu_class_init(ObjectClass *c, void 
*data)
 cc->gdb_get_dynamic_xml = riscv_gdb_get_dynamic_xml;
 cc->tcg_ops = &riscv_tcg_ops;
 
+object_class_property_add(c, "mvendorid", "uint32", cpu_get_mvendorid,
+  cpu_set_mvendorid, NULL, NULL);
+
 device_class_set_props(dc, riscv_cpu_properties);
 }
 
-- 
2.41.0

[PATCH v6 07/20] target/riscv: read marchid/mimpid in kvm_riscv_init_machine_ids()

2023-06-28 Thread Daniel Henrique Barboza

Allow 'marchid' and 'mimpid' to also be initialized in
kvm_riscv_init_machine_ids().

After this change, the handling of mvendorid/marchid/mimpid for the
'host' CPU type will be equal to what we already have for TCG named
CPUs, i.e. the user is not able to set these values to a different val
than the one that is already preset.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
Acked-by: Alistair Francis 
---
 target/riscv/kvm.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index 37f0f70794..cd2974c663 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -378,6 +378,22 @@ static void kvm_riscv_init_machine_ids(RISCVCPU *cpu, 
KVMScratchCPU *kvmcpu)
 if (ret != 0) {
 error_report("Unable to retrieve mvendorid from host, error %d", ret);
 }
+
+reg.id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
+  KVM_REG_RISCV_CONFIG_REG(marchid));
+reg.addr = (uint64_t)&cpu->cfg.marchid;
+ret = ioctl(kvmcpu->cpufd, KVM_GET_ONE_REG, ®);
+if (ret != 0) {
+error_report("Unable to retrieve marchid from host, error %d", ret);
+}
+
+reg.id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
+  KVM_REG_RISCV_CONFIG_REG(mimpid));
+reg.addr = (uint64_t)&cpu->cfg.mimpid;
+ret = ioctl(kvmcpu->cpufd, KVM_GET_ONE_REG, ®);
+if (ret != 0) {
+error_report("Unable to retrieve mimpid from host, error %d", ret);
+}
 }
 
 void kvm_riscv_init_user_properties(Object *cpu_obj)
-- 
2.41.0

[PATCH v6 13/20] target/riscv/kvm.c: update KVM MISA bits

2023-06-28 Thread Daniel Henrique Barboza

Our design philosophy with KVM properties can be resumed in two main
decisions based on KVM interface availability and what the user wants to
do:

- if the user disables an extension that the host KVM module doesn't
know about (i.e. it doesn't implement the kvm_get_one_reg() interface),
keep booting the CPU. This will avoid users having to deal with issues
with older KVM versions while disabling features they don't care;

- for any other case we're going to error out immediately. If the user
wants to enable a feature that KVM doesn't know about this a problem that
is worth aborting - the user must know that the feature wasn't enabled
in the hart. Likewise, if KVM knows about the extension, the user wants
to enable/disable it, and we fail to do it so, that's also a problem we
can't shrug it off.

In the case of MISA bits we won't even try enabling bits that aren't
already available in the host. The ioctl() is so likely to fail that
it's not worth trying. This check is already done in the previous patch,
in kvm_cpu_set_misa_ext_cfg(), thus we don't need to worry about it now.

In kvm_riscv_update_cpu_misa_ext() we'll go through every potential user
option and do as follows:

- if the user didn't set the property or set to the same value of the
host, do nothing;

- Disable the given extension in KVM. Error out if anything goes wrong.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
---
 target/riscv/kvm.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index 0fb63cced3..ba97b0cbed 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -162,6 +162,41 @@ static void kvm_cpu_set_misa_ext_cfg(Object *obj, Visitor 
*v,
"enabled in the host", misa_ext_cfg->name);
 }
 
+static void kvm_riscv_update_cpu_misa_ext(RISCVCPU *cpu, CPUState *cs)
+{
+CPURISCVState *env = &cpu->env;
+uint64_t id, reg;
+int i, ret;
+
+for (i = 0; i < ARRAY_SIZE(kvm_misa_ext_cfgs); i++) {
+KVMCPUConfig *misa_cfg = &kvm_misa_ext_cfgs[i];
+target_ulong misa_bit = misa_cfg->offset;
+
+if (!misa_cfg->user_set) {
+continue;
+}
+
+/* If we're here we're going to disable the MISA bit */
+reg = 0;
+id = kvm_riscv_reg_id(env, KVM_REG_RISCV_ISA_EXT,
+  misa_cfg->kvm_reg_id);
+ret = kvm_set_one_reg(cs, id, ®);
+if (ret != 0) {
+/*
+ * We're not checking for -EINVAL because if the bit is about
+ * to be disabled, it means that it was already enabled by
+ * KVM. We determined that by fetching the 'isa' register
+ * during init() time. Any error at this point is worth
+ * aborting.
+ */
+error_report("Unable to set KVM reg %s, error %d",
+ misa_cfg->name, ret);
+exit(EXIT_FAILURE);
+}
+env->misa_ext &= ~misa_bit;
+}
+}
+
 static void kvm_riscv_add_cpu_user_properties(Object *cpu_obj)
 {
 int i;
@@ -632,8 +667,13 @@ int kvm_arch_init_vcpu(CPUState *cs)
 
 if (!object_dynamic_cast(OBJECT(cpu), TYPE_RISCV_CPU_HOST)) {
 ret = kvm_vcpu_set_machine_ids(cpu, cs);
+if (ret != 0) {
+return ret;
+}
 }
 
+kvm_riscv_update_cpu_misa_ext(cpu, cs);
+
 return ret;
 }
 
-- 
2.41.0

[PATCH v6 10/20] target/riscv/kvm.c: init 'misa_ext_mask' with scratch CPU

2023-06-28 Thread Daniel Henrique Barboza

At this moment we're retrieving env->misa_ext during
kvm_arch_init_cpu(), leaving env->misa_ext_mask behind.

We want to set env->misa_ext_mask, and we want to set it as early as
possible. The reason is that we're going to use it in the validation
process of the KVM MISA properties we're going to add next. Setting it
during arch_init_cpu() is too late for user validation.

Move the code to a new helper that is going to be called during init()
time, via kvm_riscv_init_user_properties(), like we're already doing for
the machine ID properties. Set both misa_ext and misa_ext_mask to the
same value retrieved by the 'isa' config reg.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
Acked-by: Alistair Francis 
---
 target/riscv/kvm.c | 34 +++---
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index 602727cdfd..4d0808cb9a 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -396,6 +396,28 @@ static void kvm_riscv_init_machine_ids(RISCVCPU *cpu, 
KVMScratchCPU *kvmcpu)
 }
 }
 
+static void kvm_riscv_init_misa_ext_mask(RISCVCPU *cpu,
+ KVMScratchCPU *kvmcpu)
+{
+CPURISCVState *env = &cpu->env;
+struct kvm_one_reg reg;
+int ret;
+
+reg.id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
+  KVM_REG_RISCV_CONFIG_REG(isa));
+reg.addr = (uint64_t)&env->misa_ext_mask;
+ret = ioctl(kvmcpu->cpufd, KVM_GET_ONE_REG, ®);
+
+if (ret) {
+error_report("Unable to fetch ISA register from KVM, "
+ "error %d", ret);
+kvm_riscv_destroy_scratch_vcpu(kvmcpu);
+exit(EXIT_FAILURE);
+}
+
+env->misa_ext = env->misa_ext_mask;
+}
+
 void kvm_riscv_init_user_properties(Object *cpu_obj)
 {
 RISCVCPU *cpu = RISCV_CPU(cpu_obj);
@@ -406,6 +428,7 @@ void kvm_riscv_init_user_properties(Object *cpu_obj)
 }
 
 kvm_riscv_init_machine_ids(cpu, &kvmcpu);
+kvm_riscv_init_misa_ext_mask(cpu, &kvmcpu);
 
 kvm_riscv_destroy_scratch_vcpu(&kvmcpu);
 }
@@ -525,21 +548,10 @@ static int kvm_vcpu_set_machine_ids(RISCVCPU *cpu, 
CPUState *cs)
 int kvm_arch_init_vcpu(CPUState *cs)
 {
 int ret = 0;
-target_ulong isa;
 RISCVCPU *cpu = RISCV_CPU(cs);
-CPURISCVState *env = &cpu->env;
-uint64_t id;
 
 qemu_add_vm_change_state_handler(kvm_riscv_vm_state_change, cs);
 
-id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
-  KVM_REG_RISCV_CONFIG_REG(isa));
-ret = kvm_get_one_reg(cs, id, &isa);
-if (ret) {
-return ret;
-}
-env->misa_ext = isa;
-
 if (!object_dynamic_cast(OBJECT(cpu), TYPE_RISCV_CPU_HOST)) {
 ret = kvm_vcpu_set_machine_ids(cpu, cs);
 }
-- 
2.41.0

[PATCH v6 00/20] target/riscv, KVM: fixes and enhancements

2023-06-28 Thread Daniel Henrique Barboza

Hi,

This new version has changes suggested by Phil in v5. It also has some
changes I made on the fly. The short summary is:

- use "#ifndef CONFIG_USER_MODE kvm_enabled() #endif" instead of a helper
  that wraps them all up. This was suggested by Phil to be more clear
  about the code that depends on KVM or not. A new patch (15) was added
  to keep the amount of #ifndefs in control in
  riscv_cpu_add_user_properties() due to this design change;

- misa_ext_info_arr[] is now indexed from 0 to 25 instead of 0 from
  potentially 1 << 25. Getters for 'name' and 'description' were added.

It's worth mentioning a change I decided to make that wasn't mentioned
in the v5 review. In patch 12, where we added KVM MISA properties, we
were trying to find the property before proceeding with
object_property_add() to add the remaining properties. I remembered that
we can also use object_property_try_add(). In fact,
object_property_add() is a simple call for object_property_try_add()
that uses &error_fatal. And it turns out that and the only error being
reported by try_add() happens when object_property_find() finds a
property with the same name. This means that we can use
object_property_try_add() with an error pointer and, if an error is
reported, it means that we already have the property. This change makes
us do a single object_property_find() instead of 2. 

Patches 6 and 17 (former 16) had trivial changes, all of them related to
the extinct riscv_running_kvm() helper, and I decided to keep their r-bs
for reviewer convenience.

Series was rebased on top of riscv-to-apply.next.

Patches missing reviews: 1, 11, 12, 14, 15.

Changes from v5:
- trivial changes (R-bs kept):
  - patch 6: use kvm_enabled() instead of riscv_running_kvm()
  - patch 17 (former 16): wrap cpu_set_cfg_unavailable() inside '#ifndef
CONFIG_USER_MODE'
- patch 1:
  - riscv_running_kvm() removed
  - rename riscv_cpu_realize_functions() to riscv_cpu_realize_tcg()
  - use 'tcg_enabled()' before riscv_cpu_realize_tcg()
  - reword commit msg to indicate that we're checking positive for TCG
instead of negative for KVM
- patch 11:
  - misa_ext_info_arr[] is now indexed from 0 (A-A) to 25 (Z-A)
  - getter functions for 'name' and 'description' were added
  - misa_ext_info_arr[] is no longer exported in target/riscv/cpu.h
  - 'name' and 'description' of misa_ext_cfg[] must now be populated
after initialization because misa_ext_info_arr[] is no longer a
constant array
- patch 12:
  - use object_property_try_add() in riscv_cpu_add_misa_properties().
It'll set an Error pointer if the property already exists. Use that
to skip an already created property instead of doing a redundant
object_property_find() beforehand
  - moved the object_property_find() code that was checking for
multi-letter properties to patch 14
  - use the new getters() from patch 11 instead of reading the array
directly
- patch 14:
  - added the code from patch 12 that checks for an existing
multi-letter property. Add a kvm_enabled() check around it in
preparation for the mock KVM properties patch later on
- patch 15 (new):
  - create satp_mode properties earlier to avoid an extra #ifndef
CONFIG_USER_MODE block
- v5 link: https://lists.gnu.org/archive/html/qemu-devel/2023-06/msg05984.html

Daniel Henrique Barboza (20):
  target/riscv: skip features setup for KVM CPUs
  hw/riscv/virt.c: skip 'mmu-type' FDT if satp mode not set
  target/riscv/cpu.c: restrict 'mvendorid' value
  target/riscv/cpu.c: restrict 'mimpid' value
  target/riscv/cpu.c: restrict 'marchid' value
  target/riscv: use KVM scratch CPUs to init KVM properties
  target/riscv: read marchid/mimpid in kvm_riscv_init_machine_ids()
  target/riscv: handle mvendorid/marchid/mimpid for KVM CPUs
  linux-headers: Update to v6.4-rc1
  target/riscv/kvm.c: init 'misa_ext_mask' with scratch CPU
  target/riscv/cpu: add misa_ext_info_arr[]
  target/riscv: add KVM specific MISA properties
  target/riscv/kvm.c: update KVM MISA bits
  target/riscv/kvm.c: add multi-letter extension KVM properties
  target/riscv/cpu.c: add satp_mode properties earlier
  target/riscv/cpu.c: remove priv_ver check from riscv_isa_string_ext()
  target/riscv/cpu.c: create KVM mock properties
  target/riscv: update multi-letter extension KVM properties
  target/riscv/kvm.c: add kvmconfig_get_cfg_addr() helper
  target/riscv/kvm.c: read/write (cbom|cboz)_blocksize in KVM

 hw/riscv/virt.c   |  14 +-
 include/standard-headers/linux/const.h|   2 +-
 include/standard-headers/linux/virtio_blk.h   |  18 +-
 .../standard-headers/linux/virtio_config.h|   6 +
 include/standard-headers/linux/virtio_net.h   |   1 +
 linux-headers/asm-arm64/kvm.h |  33 ++
 linux-headers/asm-riscv/kvm.h |  53 +-
 linux-headers/asm-riscv/unistd.h  |   9 +
 linux-headers/asm-s390/unistd_32.h|   1 +
 linux-headers/asm-s390/unistd_64.h|   1 +
 linux-hea

[PATCH v6 08/20] target/riscv: handle mvendorid/marchid/mimpid for KVM CPUs

2023-06-28 Thread Daniel Henrique Barboza

After changing user validation for mvendorid/marchid/mimpid to guarantee
that the value is validated on user input time, coupled with the work in
fetching KVM default values for them by using a scratch CPU, we're
certain that the values in cpu->cfg.(mvendorid|marchid|mimpid) are
already good to be written back to KVM.

There's no need to write the values back for 'host' type CPUs since the
values can't be changed, so let's do that just for generic CPUs.

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
Acked-by: Alistair Francis 
---
 target/riscv/kvm.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index cd2974c663..602727cdfd 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -495,6 +495,33 @@ void kvm_arch_init_irq_routing(KVMState *s)
 {
 }
 
+static int kvm_vcpu_set_machine_ids(RISCVCPU *cpu, CPUState *cs)
+{
+CPURISCVState *env = &cpu->env;
+uint64_t id;
+int ret;
+
+id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
+  KVM_REG_RISCV_CONFIG_REG(mvendorid));
+ret = kvm_set_one_reg(cs, id, &cpu->cfg.mvendorid);
+if (ret != 0) {
+return ret;
+}
+
+id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
+  KVM_REG_RISCV_CONFIG_REG(marchid));
+ret = kvm_set_one_reg(cs, id, &cpu->cfg.marchid);
+if (ret != 0) {
+return ret;
+}
+
+id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
+  KVM_REG_RISCV_CONFIG_REG(mimpid));
+ret = kvm_set_one_reg(cs, id, &cpu->cfg.mimpid);
+
+return ret;
+}
+
 int kvm_arch_init_vcpu(CPUState *cs)
 {
 int ret = 0;
@@ -513,6 +540,10 @@ int kvm_arch_init_vcpu(CPUState *cs)
 }
 env->misa_ext = isa;
 
+if (!object_dynamic_cast(OBJECT(cpu), TYPE_RISCV_CPU_HOST)) {
+ret = kvm_vcpu_set_machine_ids(cpu, cs);
+}
+
 return ret;
 }
 
-- 
2.41.0

[PATCH v6 15/20] target/riscv/cpu.c: add satp_mode properties earlier

2023-06-28 Thread Daniel Henrique Barboza

riscv_cpu_add_user_properties() ended up with an excess of "#ifndef
CONFIG_USER_ONLY" blocks after changes that added KVM properties
handling.

KVM specific properties are required to be created earlier than their
TCG counterparts, but the remaining props can be created at any order.
Move riscv_add_satp_mode_properties() to the start of the function,
inside the !CONFIG_USER_ONLY block already present there, to remove the
last ifndef block.

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 5428402cfa..b4a6fd8bab 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1743,6 +1743,8 @@ static void riscv_cpu_add_user_properties(Object *obj)
 DeviceState *dev = DEVICE(obj);
 
 #ifndef CONFIG_USER_ONLY
+riscv_add_satp_mode_properties(obj);
+
 if (kvm_enabled()) {
 kvm_riscv_init_user_properties(obj);
 }
@@ -1761,10 +1763,6 @@ static void riscv_cpu_add_user_properties(Object *obj)
 #endif
 qdev_property_add_static(dev, prop);
 }
-
-#ifndef CONFIG_USER_ONLY
-riscv_add_satp_mode_properties(obj);
-#endif
 }
 
 static Property riscv_cpu_properties[] = {
-- 
2.41.0

[PATCH v6 11/20] target/riscv/cpu: add misa_ext_info_arr[]

2023-06-28 Thread Daniel Henrique Barboza

Next patch will add KVM specific user properties for both MISA and
multi-letter extensions. For MISA extensions we want to make use of what
is already available in misa_ext_cfgs[] to avoid code repetition.

misa_ext_info_arr[] array will hold name and description for each MISA
extension that misa_ext_cfgs[] is declaring. We'll then use this new
array in KVM code to avoid duplicating strings.

There's nothing holding us back from doing the same with multi-letter
extensions. For now doing just with MISA extensions is enough.

Suggested-by: Andrew Jones 
Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 83 ++
 target/riscv/cpu.h |  7 +++-
 2 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 2485e820f8..90dd2078ae 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1558,33 +1558,57 @@ static void cpu_get_misa_ext_cfg(Object *obj, Visitor 
*v, const char *name,
 visit_type_bool(v, name, &value, errp);
 }
 
-static const RISCVCPUMisaExtConfig misa_ext_cfgs[] = {
-{.name = "a", .description = "Atomic instructions",
- .misa_bit = RVA, .enabled = true},
-{.name = "c", .description = "Compressed instructions",
- .misa_bit = RVC, .enabled = true},
-{.name = "d", .description = "Double-precision float point",
- .misa_bit = RVD, .enabled = true},
-{.name = "f", .description = "Single-precision float point",
- .misa_bit = RVF, .enabled = true},
-{.name = "i", .description = "Base integer instruction set",
- .misa_bit = RVI, .enabled = true},
-{.name = "e", .description = "Base integer instruction set (embedded)",
- .misa_bit = RVE, .enabled = false},
-{.name = "m", .description = "Integer multiplication and division",
- .misa_bit = RVM, .enabled = true},
-{.name = "s", .description = "Supervisor-level instructions",
- .misa_bit = RVS, .enabled = true},
-{.name = "u", .description = "User-level instructions",
- .misa_bit = RVU, .enabled = true},
-{.name = "h", .description = "Hypervisor",
- .misa_bit = RVH, .enabled = true},
-{.name = "x-j", .description = "Dynamic translated languages",
- .misa_bit = RVJ, .enabled = false},
-{.name = "v", .description = "Vector operations",
- .misa_bit = RVV, .enabled = false},
-{.name = "g", .description = "General purpose (IMAFD_Zicsr_Zifencei)",
- .misa_bit = RVG, .enabled = false},
+typedef struct misa_ext_info {
+const char *name;
+const char *description;
+} MISAExtInfo;
+
+#define MISA_EXT_INFO(_idx, _propname, _descr) \
+[(_idx - 'A')] = {.name = _propname, .description = _descr}
+
+static const MISAExtInfo misa_ext_info_arr[] = {
+MISA_EXT_INFO('A', "a", "Atomic instructions"),
+MISA_EXT_INFO('C', "c", "Compressed instructions"),
+MISA_EXT_INFO('D', "d", "Double-precision float point"),
+MISA_EXT_INFO('F', "f", "Single-precision float point"),
+MISA_EXT_INFO('I', "i", "Base integer instruction set"),
+MISA_EXT_INFO('E', "e", "Base integer instruction set (embedded)"),
+MISA_EXT_INFO('M', "m", "Integer multiplication and division"),
+MISA_EXT_INFO('S', "s", "Supervisor-level instructions"),
+MISA_EXT_INFO('U', "u", "User-level instructions"),
+MISA_EXT_INFO('H', "h", "Hypervisor"),
+MISA_EXT_INFO('J', "x-j", "Dynamic translated languages"),
+MISA_EXT_INFO('V', "v", "Vector operations"),
+MISA_EXT_INFO('G', "g", "General purpose (IMAFD_Zicsr_Zifencei)"),
+};
+
+const char *riscv_get_misa_ext_name(uint32_t bit)
+{
+return misa_ext_info_arr[ctz32(bit)].name;
+}
+
+const char *riscv_get_misa_ext_descr(uint32_t bit)
+{
+return misa_ext_info_arr[ctz32(bit)].description;
+}
+
+#define MISA_CFG(_bit, _enabled) \
+{.misa_bit = _bit, .enabled = _enabled}
+
+static RISCVCPUMisaExtConfig misa_ext_cfgs[] = {
+MISA_CFG(RVA, true),
+MISA_CFG(RVC, true),
+MISA_CFG(RVD, true),
+MISA_CFG(RVF, true),
+MISA_CFG(RVI, true),
+MISA_CFG(RVE, false),
+MISA_CFG(RVM, true),
+MISA_CFG(RVS, true),
+MISA_CFG(RVU, true),
+MISA_CFG(RVH, true),
+MISA_CFG(RVJ, false),
+MISA_CFG(RVV, false),
+MISA_CFG(RVG, false),
 };
 
 static void riscv_cpu_add_misa_properties(Object *cpu_obj)
@@ -1592,7 +1616,10 @@ static void riscv_cpu_add_misa_properties(Object 
*cpu_obj)
 int i;
 
 for (i = 0; i < ARRAY_SIZE(misa_ext_cfgs); i++) {
-const RISCVCPUMisaExtConfig *misa_cfg = &misa_ext_cfgs[i];
+RISCVCPUMisaExtConfig *misa_cfg = &misa_ext_cfgs[i];
+
+misa_cfg->name = riscv_get_misa_ext_name(misa_cfg->misa_bit);
+misa_cfg->description = riscv_get_misa_ext_descr(misa_cfg->misa_bit);
 
 object_property_add(cpu_obj, misa_cfg->name, "bool",
 cpu_get_misa_ext_cfg,
diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index cc20ee25a7..acae118b9b 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@

[PATCH v6 04/20] target/riscv/cpu.c: restrict 'mimpid' value

2023-06-28 Thread Daniel Henrique Barboza

Following the same logic used with 'mvendorid' let's also restrict
'mimpid' for named CPUs. Generic CPUs keep setting the value freely.

Note that we're getting rid of the default RISCV_CPU_MARCHID value. The
reason is that this is not a good default since it's dynamic, changing
with with every QEMU version, regardless of whether the actual
implementation of the CPU changed from one QEMU version to the other.
Named CPU should set it to a meaningful value instead and generic CPUs
can set whatever they want.

This is the error thrown for an invalid 'mimpid' value for the veyron-v1
CPU:

$ ./qemu-system-riscv64 -M virt -nographic -cpu veyron-v1,mimpid=2
qemu-system-riscv64: can't apply global veyron-v1-riscv-cpu.mimpid=2:
Unable to change veyron-v1-riscv-cpu mimpid (0x111)

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 34 --
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 652988db25..7ca7b7bbfa 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -43,7 +43,6 @@
 #define RISCV_CPU_MARCHID   ((QEMU_VERSION_MAJOR << 16) | \
  (QEMU_VERSION_MINOR << 8)  | \
  (QEMU_VERSION_MICRO))
-#define RISCV_CPU_MIMPIDRISCV_CPU_MARCHID
 
 static const char riscv_single_letter_exts[] = "IEMAFDQCPVH";
 
@@ -1731,7 +1730,6 @@ static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("debug", RISCVCPU, cfg.debug, true),
 
 DEFINE_PROP_UINT64("marchid", RISCVCPU, cfg.marchid, RISCV_CPU_MARCHID),
-DEFINE_PROP_UINT64("mimpid", RISCVCPU, cfg.mimpid, RISCV_CPU_MIMPID),
 
 #ifndef CONFIG_USER_ONLY
 DEFINE_PROP_UINT64("resetvec", RISCVCPU, env.resetvec, DEFAULT_RSTVEC),
@@ -1850,6 +1848,35 @@ static void cpu_get_mvendorid(Object *obj, Visitor *v, 
const char *name,
 visit_type_bool(v, name, &value, errp);
 }
 
+static void cpu_set_mimpid(Object *obj, Visitor *v, const char *name,
+   void *opaque, Error **errp)
+{
+bool dynamic_cpu = riscv_cpu_is_dynamic(obj);
+RISCVCPU *cpu = RISCV_CPU(obj);
+uint64_t prev_val = cpu->cfg.mimpid;
+uint64_t value;
+
+if (!visit_type_uint64(v, name, &value, errp)) {
+return;
+}
+
+if (!dynamic_cpu && prev_val != value) {
+error_setg(errp, "Unable to change %s mimpid (0x%lx)",
+   object_get_typename(obj), prev_val);
+return;
+}
+
+cpu->cfg.mimpid = value;
+}
+
+static void cpu_get_mimpid(Object *obj, Visitor *v, const char *name,
+   void *opaque, Error **errp)
+{
+bool value = RISCV_CPU(obj)->cfg.mimpid;
+
+visit_type_bool(v, name, &value, errp);
+}
+
 static void riscv_cpu_class_init(ObjectClass *c, void *data)
 {
 RISCVCPUClass *mcc = RISCV_CPU_CLASS(c);
@@ -1884,6 +1911,9 @@ static void riscv_cpu_class_init(ObjectClass *c, void 
*data)
 object_class_property_add(c, "mvendorid", "uint32", cpu_get_mvendorid,
   cpu_set_mvendorid, NULL, NULL);
 
+object_class_property_add(c, "mimpid", "uint64", cpu_get_mimpid,
+  cpu_set_mimpid, NULL, NULL);
+
 device_class_set_props(dc, riscv_cpu_properties);
 }
 
-- 
2.41.0

[PATCH v6 05/20] target/riscv/cpu.c: restrict 'marchid' value

2023-06-28 Thread Daniel Henrique Barboza

'marchid' shouldn't be set to a different value as previously set for
named CPUs.

For all other CPUs it shouldn't be freely set either - the spec requires
that 'marchid' can't have the MSB (most significant bit) set and every
other bit set to zero, i.e. 0x8000 is an invalid 'marchid' value for
32 bit CPUs.

As with 'mimpid', setting a default value based on the current QEMU
version is not a good idea because it implies that the CPU
implementation changes from one QEMU version to the other. Named CPUs
should set 'marchid' to a meaningful value instead, and generic CPUs can
set to any valid value.

For the 'veyron-v1' CPU this is the error thrown if 'marchid' is set to
a different val:

$ ./build/qemu-system-riscv64 -M virt -nographic -cpu 
veyron-v1,marchid=0x8000
qemu-system-riscv64: can't apply global veyron-v1-riscv-cpu.marchid=0x8000:
Unable to change veyron-v1-riscv-cpu marchid (0x8001)

And, for generics CPUs, this is the error when trying to set to an
invalid val:

$ ./build/qemu-system-riscv64 -M virt -nographic -cpu 
rv64,marchid=0x8000
qemu-system-riscv64: can't apply global 
rv64-riscv-cpu.marchid=0x8000:
Unable to set marchid with MSB (64) bit set and the remaining bits zero

Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 60 --
 1 file changed, 53 insertions(+), 7 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 7ca7b7bbfa..51f055b8be 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -39,11 +39,6 @@
 #include "tcg/tcg.h"
 
 /* RISC-V CPU definitions */
-
-#define RISCV_CPU_MARCHID   ((QEMU_VERSION_MAJOR << 16) | \
- (QEMU_VERSION_MINOR << 8)  | \
- (QEMU_VERSION_MICRO))
-
 static const char riscv_single_letter_exts[] = "IEMAFDQCPVH";
 
 struct isa_ext_data {
@@ -1729,8 +1724,6 @@ static void riscv_cpu_add_user_properties(Object *obj)
 static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("debug", RISCVCPU, cfg.debug, true),
 
-DEFINE_PROP_UINT64("marchid", RISCVCPU, cfg.marchid, RISCV_CPU_MARCHID),
-
 #ifndef CONFIG_USER_ONLY
 DEFINE_PROP_UINT64("resetvec", RISCVCPU, env.resetvec, DEFAULT_RSTVEC),
 #endif
@@ -1877,6 +1870,56 @@ static void cpu_get_mimpid(Object *obj, Visitor *v, 
const char *name,
 visit_type_bool(v, name, &value, errp);
 }
 
+static void cpu_set_marchid(Object *obj, Visitor *v, const char *name,
+void *opaque, Error **errp)
+{
+bool dynamic_cpu = riscv_cpu_is_dynamic(obj);
+RISCVCPU *cpu = RISCV_CPU(obj);
+uint64_t prev_val = cpu->cfg.marchid;
+uint64_t value, invalid_val;
+uint32_t mxlen = 0;
+
+if (!visit_type_uint64(v, name, &value, errp)) {
+return;
+}
+
+if (!dynamic_cpu && prev_val != value) {
+error_setg(errp, "Unable to change %s marchid (0x%lx)",
+   object_get_typename(obj), prev_val);
+return;
+}
+
+switch (riscv_cpu_mxl(&cpu->env)) {
+case MXL_RV32:
+mxlen = 32;
+break;
+case MXL_RV64:
+case MXL_RV128:
+mxlen = 64;
+break;
+default:
+g_assert_not_reached();
+}
+
+invalid_val = 1LL << (mxlen - 1);
+
+if (value == invalid_val) {
+error_setg(errp, "Unable to set marchid with MSB (%u) bit set "
+ "and the remaining bits zero", mxlen);
+return;
+}
+
+cpu->cfg.marchid = value;
+}
+
+static void cpu_get_marchid(Object *obj, Visitor *v, const char *name,
+   void *opaque, Error **errp)
+{
+bool value = RISCV_CPU(obj)->cfg.marchid;
+
+visit_type_bool(v, name, &value, errp);
+}
+
 static void riscv_cpu_class_init(ObjectClass *c, void *data)
 {
 RISCVCPUClass *mcc = RISCV_CPU_CLASS(c);
@@ -1914,6 +1957,9 @@ static void riscv_cpu_class_init(ObjectClass *c, void 
*data)
 object_class_property_add(c, "mimpid", "uint64", cpu_get_mimpid,
   cpu_set_mimpid, NULL, NULL);
 
+object_class_property_add(c, "marchid", "uint64", cpu_get_marchid,
+  cpu_set_marchid, NULL, NULL);
+
 device_class_set_props(dc, riscv_cpu_properties);
 }
 
-- 
2.41.0

[PATCH v6 06/20] target/riscv: use KVM scratch CPUs to init KVM properties

2023-06-28 Thread Daniel Henrique Barboza

Certain validations, such as the validations done for the machine IDs
(mvendorid/marchid/mimpid), are done before starting the CPU.
Non-dynamic (named) CPUs tries to match user input with a preset
default. As it is today we can't prefetch a KVM default for these cases
because we're only able to read/write KVM regs after the vcpu is
spinning.

Our target/arm friends use a concept called "scratch CPU", which
consists of creating a vcpu for doing queries and validations and so on,
which is discarded shortly after use [1]. This is a suitable solution
for what we need so let's implement it in target/riscv as well.

kvm_riscv_init_machine_ids() will be used to do any pre-launch setup for
KVM CPUs, via riscv_cpu_add_user_properties(). The function will create
a KVM scratch CPU, fetch KVM regs that work as default values for user
properties, and then discard the scratch CPU afterwards.

We're starting by initializing 'mvendorid'. This concept will be used to
init other KVM specific properties in the next patches as well.

[1] target/arm/kvm.c, kvm_arm_create_scratch_host_vcpu()

Suggested-by: Andrew Jones 
Signed-off-by: Daniel Henrique Barboza 
Reviewed-by: Andrew Jones 
Acked-by: Alistair Francis 
---
 target/riscv/cpu.c   |  6 +++
 target/riscv/kvm.c   | 85 
 target/riscv/kvm_riscv.h |  1 +
 3 files changed, 92 insertions(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 51f055b8be..2485e820f8 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1710,6 +1710,12 @@ static void riscv_cpu_add_user_properties(Object *obj)
 Property *prop;
 DeviceState *dev = DEVICE(obj);
 
+#ifndef CONFIG_USER_ONLY
+if (kvm_enabled()) {
+kvm_riscv_init_user_properties(obj);
+}
+#endif
+
 riscv_cpu_add_misa_properties(obj);
 
 for (prop = riscv_cpu_extensions; prop && prop->name; prop++) {
diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index 0f932a5b96..37f0f70794 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -309,6 +309,91 @@ static void kvm_riscv_put_regs_timer(CPUState *cs)
 env->kvm_timer_dirty = false;
 }
 
+typedef struct KVMScratchCPU {
+int kvmfd;
+int vmfd;
+int cpufd;
+} KVMScratchCPU;
+
+/*
+ * Heavily inspired by kvm_arm_create_scratch_host_vcpu()
+ * from target/arm/kvm.c.
+ */
+static bool kvm_riscv_create_scratch_vcpu(KVMScratchCPU *scratch)
+{
+int kvmfd = -1, vmfd = -1, cpufd = -1;
+
+kvmfd = qemu_open_old("/dev/kvm", O_RDWR);
+if (kvmfd < 0) {
+goto err;
+}
+do {
+vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
+} while (vmfd == -1 && errno == EINTR);
+if (vmfd < 0) {
+goto err;
+}
+cpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
+if (cpufd < 0) {
+goto err;
+}
+
+scratch->kvmfd =  kvmfd;
+scratch->vmfd = vmfd;
+scratch->cpufd = cpufd;
+
+return true;
+
+ err:
+if (cpufd >= 0) {
+close(cpufd);
+}
+if (vmfd >= 0) {
+close(vmfd);
+}
+if (kvmfd >= 0) {
+close(kvmfd);
+}
+
+return false;
+}
+
+static void kvm_riscv_destroy_scratch_vcpu(KVMScratchCPU *scratch)
+{
+close(scratch->cpufd);
+close(scratch->vmfd);
+close(scratch->kvmfd);
+}
+
+static void kvm_riscv_init_machine_ids(RISCVCPU *cpu, KVMScratchCPU *kvmcpu)
+{
+CPURISCVState *env = &cpu->env;
+struct kvm_one_reg reg;
+int ret;
+
+reg.id = kvm_riscv_reg_id(env, KVM_REG_RISCV_CONFIG,
+  KVM_REG_RISCV_CONFIG_REG(mvendorid));
+reg.addr = (uint64_t)&cpu->cfg.mvendorid;
+ret = ioctl(kvmcpu->cpufd, KVM_GET_ONE_REG, ®);
+if (ret != 0) {
+error_report("Unable to retrieve mvendorid from host, error %d", ret);
+}
+}
+
+void kvm_riscv_init_user_properties(Object *cpu_obj)
+{
+RISCVCPU *cpu = RISCV_CPU(cpu_obj);
+KVMScratchCPU kvmcpu;
+
+if (!kvm_riscv_create_scratch_vcpu(&kvmcpu)) {
+return;
+}
+
+kvm_riscv_init_machine_ids(cpu, &kvmcpu);
+
+kvm_riscv_destroy_scratch_vcpu(&kvmcpu);
+}
+
 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 KVM_CAP_LAST_INFO
 };
diff --git a/target/riscv/kvm_riscv.h b/target/riscv/kvm_riscv.h
index ed281bdce0..e3ba935808 100644
--- a/target/riscv/kvm_riscv.h
+++ b/target/riscv/kvm_riscv.h
@@ -19,6 +19,7 @@
 #ifndef QEMU_KVM_RISCV_H
 #define QEMU_KVM_RISCV_H
 
+void kvm_riscv_init_user_properties(Object *cpu_obj);
 void kvm_riscv_reset_vcpu(RISCVCPU *cpu);
 void kvm_riscv_set_irq(RISCVCPU *cpu, int irq, int level);
 
-- 
2.41.0

[PATCH v6 12/20] target/riscv: add KVM specific MISA properties

2023-06-28 Thread Daniel Henrique Barboza

Using all TCG user properties in KVM is tricky. First because KVM
supports only a small subset of what TCG provides, so most of the
cpu->cfg flags do nothing for KVM.

Second, and more important, we don't have a way of telling if any given
value is an user input or not. For TCG this has a small impact since we
just validating everything and error out if needed. But for KVM it would
be good to know if a given value was set by the user or if it's a value
already provided by KVM. Otherwise we don't know how to handle failed
kvm_set_one_regs() when writing the configurations back.

These characteristics make it overly complicated to use the same user
facing flags for both KVM and TCG. A simpler approach is to create KVM
specific properties that have specialized logic, forking KVM and TCG use
cases for those cases only. Fully separating KVM/TCG properties is
unneeded at this point - in fact we want the user experience to be as
equal as possible, regardless of the acceleration chosen.

We'll start this fork with the MISA properties, adding the MISA bits
that the KVM driver currently supports. A new KVMCPUConfig type is
introduced. It'll hold general information about an extension. For MISA
extensions we're going to use the newly created getters of
misa_ext_infos[] to populate their name and description. 'offset' holds
the MISA bit (RVA, RVC, ...). We're calling it 'offset' instead of
'misa_bit' because this same KVMCPUConfig struct will be used to
multi-letter extensions later on.

This new type also holds a 'user_set' flag. This flag will be set when
the user set an option that's different than what is already configured
in the host, requiring KVM intervention to write the regs back during
kvm_arch_init_vcpu(). Similar mechanics will be implemented for
multi-letter extensions as well.

There is no need to duplicate more code than necessary, so we're going
to use the existing kvm_riscv_init_user_properties() to add the KVM
specific properties. Any code that is adding a TCG user prop is then
changed slightly to verify first if there's a KVM prop with the same
name already added.

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 13 +---
 target/riscv/kvm.c | 78 ++
 2 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 90dd2078ae..f4b1868466 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -1617,14 +1617,19 @@ static void riscv_cpu_add_misa_properties(Object 
*cpu_obj)
 
 for (i = 0; i < ARRAY_SIZE(misa_ext_cfgs); i++) {
 RISCVCPUMisaExtConfig *misa_cfg = &misa_ext_cfgs[i];
+Error *local_err = NULL;
 
 misa_cfg->name = riscv_get_misa_ext_name(misa_cfg->misa_bit);
 misa_cfg->description = riscv_get_misa_ext_descr(misa_cfg->misa_bit);
 
-object_property_add(cpu_obj, misa_cfg->name, "bool",
-cpu_get_misa_ext_cfg,
-cpu_set_misa_ext_cfg,
-NULL, (void *)misa_cfg);
+object_property_try_add(cpu_obj, misa_cfg->name, "bool",
+cpu_get_misa_ext_cfg, cpu_set_misa_ext_cfg,
+NULL, (void *)misa_cfg, &local_err);
+if (local_err) {
+/* Someone (KVM) already created the property */
+continue;
+}
+
 object_property_set_description(cpu_obj, misa_cfg->name,
 misa_cfg->description);
 object_property_set_bool(cpu_obj, misa_cfg->name,
diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
index 4d0808cb9a..0fb63cced3 100644
--- a/target/riscv/kvm.c
+++ b/target/riscv/kvm.c
@@ -22,8 +22,10 @@
 #include 
 
 #include "qemu/timer.h"
+#include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
+#include "qapi/visitor.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_int.h"
@@ -105,6 +107,81 @@ static uint64_t kvm_riscv_reg_id(CPURISCVState *env, 
uint64_t type,
 } \
 } while (0)
 
+typedef struct KVMCPUConfig {
+const char *name;
+const char *description;
+target_ulong offset;
+int kvm_reg_id;
+bool user_set;
+} KVMCPUConfig;
+
+#define KVM_MISA_CFG(_bit, _reg_id) \
+{.offset = _bit, .kvm_reg_id = _reg_id}
+
+/* KVM ISA extensions */
+static KVMCPUConfig kvm_misa_ext_cfgs[] = {
+KVM_MISA_CFG(RVA, KVM_RISCV_ISA_EXT_A),
+KVM_MISA_CFG(RVC, KVM_RISCV_ISA_EXT_C),
+KVM_MISA_CFG(RVD, KVM_RISCV_ISA_EXT_D),
+KVM_MISA_CFG(RVF, KVM_RISCV_ISA_EXT_F),
+KVM_MISA_CFG(RVH, KVM_RISCV_ISA_EXT_H),
+KVM_MISA_CFG(RVI, KVM_RISCV_ISA_EXT_I),
+KVM_MISA_CFG(RVM, KVM_RISCV_ISA_EXT_M),
+};
+
+static void kvm_cpu_set_misa_ext_cfg(Object *obj, Visitor *v,
+ const char *name,
+ void *opaque, Error **errp)
+{
+KVMCPUConfig *misa_ext_cfg = opaque;
+target_ulong mis

[PATCH v6 01/20] target/riscv: skip features setup for KVM CPUs

2023-06-28 Thread Daniel Henrique Barboza

As it is today it's not possible to use '-cpu host' if the RISC-V host
has RVH enabled. This is the resulting error:

$ sudo ./qemu/build/qemu-system-riscv64 \
-machine virt,accel=kvm -m 2G -smp 1 \
-nographic -snapshot -kernel ./guest_imgs/Image  \
-initrd ./guest_imgs/rootfs_kvm_riscv64.img \
-append "earlycon=sbi root=/dev/ram rw" \
-cpu host
qemu-system-riscv64: H extension requires priv spec 1.12.0

This happens because we're checking for priv spec for all CPUs, and
since we're not setting  env->priv_ver for the 'host' CPU, it's being
default to zero (i.e. PRIV_SPEC_1_10_0).

In reality env->priv_ver does not make sense when running with the KVM
'host' CPU. It's used to gate certain CSRs/extensions during translation
to make them unavailable if the hart declares an older spec version. It
doesn't have any other use. E.g. OpenSBI version 1.2 retrieves the spec
checking if the CSR_MCOUNTEREN, CSR_MCOUNTINHIBIT and CSR_MENVCFG CSRs
are available [1].

'priv_ver' is just one example. We're doing a lot of feature validation
and setup during riscv_cpu_realize() that it doesn't apply to KVM CPUs.
Validating the feature set for those CPUs is a KVM problem that should
be handled in KVM specific code.

The new riscv_cpu_realize_tcg() helper contains all validation logic that
are applicable to TCG CPUs only. riscv_cpu_realize() verifies if we're
running TCG and, if it's the case, proceed with the usual TCG realize()
logic.

[1] lib/sbi/sbi_hart.c, hart_detect_features()

Signed-off-by: Daniel Henrique Barboza 
---
 target/riscv/cpu.c | 35 +--
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index fb8458bf74..bbb201a2b3 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -34,6 +34,7 @@
 #include "migration/vmstate.h"
 #include "fpu/softfloat-helpers.h"
 #include "sysemu/kvm.h"
+#include "sysemu/tcg.h"
 #include "kvm_riscv.h"
 #include "tcg/tcg.h"
 
@@ -1308,20 +1309,12 @@ static void riscv_cpu_validate_misa_priv(CPURISCVState 
*env, Error **errp)
 }
 }
 
-static void riscv_cpu_realize(DeviceState *dev, Error **errp)
+static void riscv_cpu_realize_tcg(DeviceState *dev, Error **errp)
 {
-CPUState *cs = CPU(dev);
 RISCVCPU *cpu = RISCV_CPU(dev);
 CPURISCVState *env = &cpu->env;
-RISCVCPUClass *mcc = RISCV_CPU_GET_CLASS(dev);
 Error *local_err = NULL;
 
-cpu_exec_realizefn(cs, &local_err);
-if (local_err != NULL) {
-error_propagate(errp, local_err);
-return;
-}
-
 riscv_cpu_validate_misa_mxl(cpu, &local_err);
 if (local_err != NULL) {
 error_propagate(errp, local_err);
@@ -1356,7 +1349,7 @@ static void riscv_cpu_realize(DeviceState *dev, Error 
**errp)
 }
 
 #ifndef CONFIG_USER_ONLY
-cs->tcg_cflags |= CF_PCREL;
+CPU(dev)->tcg_cflags |= CF_PCREL;
 
 if (cpu->cfg.ext_sstc) {
 riscv_timer_init(cpu);
@@ -1369,6 +1362,28 @@ static void riscv_cpu_realize(DeviceState *dev, Error 
**errp)
 }
  }
 #endif
+}
+
+static void riscv_cpu_realize(DeviceState *dev, Error **errp)
+{
+CPUState *cs = CPU(dev);
+RISCVCPU *cpu = RISCV_CPU(dev);
+RISCVCPUClass *mcc = RISCV_CPU_GET_CLASS(dev);
+Error *local_err = NULL;
+
+cpu_exec_realizefn(cs, &local_err);
+if (local_err != NULL) {
+error_propagate(errp, local_err);
+return;
+}
+
+if (tcg_enabled()) {
+riscv_cpu_realize_tcg(dev, &local_err);
+if (local_err != NULL) {
+error_propagate(errp, local_err);
+return;
+}
+}
 
 riscv_cpu_finalize_features(cpu, &local_err);
 if (local_err != NULL) {
-- 
2.41.0

Re: [PATCH v2 0/2] vfio/migration: Make VFIO migration non-experimental

2023-06-28 Thread Alex Williamson

On Wed, 28 Jun 2023 10:31:10 +0300
Avihai Horon  wrote:

> Hello,
> 
> The major parts of VFIO migration are supported today in QEMU. This
> includes basic VFIO migration, device dirty page tracking and precopy
> support. Thus, at this point in time, it seems appropriate to make VFIO
> migration non-experimental.
> 
> This short series (which is based on the precopy series [1]) does that
> and also adds a few improvements:
> - Patch #1 resets bytes_transferred counter properly.
> - Patch #2 cleans up the VFIO migration realize flow and makes VFIO
>   migration non-experimental.
> 
> Note that Zhenzhong's series [2] fixes additional bugs and further
> cleans the VFIO migration realize flow.
> 
> Changes from v1 [3]:
> * Dropped patch #1 as it's an optimization. (Cedric)
> * Added Fixes tag to patch #2. (Cedric)
> * Made VFIO device realization fail if migration is not supported and
>   enable_migration is ON. (Cedric)
> * Kept the error message of errno == ENOTTY case as it was in
>   vfio_migration_query_flags(). (Cedric)
> * Added a warn when enable_migration is ON and device dirty tracking is
>   not supported. (Alex)
> * Renamed trace_vfio_migration_probe() to
> * trace_vfio_migration_realize().
> 
> 
> Thanks.
> 
> [1]
> https://lore.kernel.org/qemu-devel/2023062201.29729-1-avih...@nvidia.com/
> 
> [2]
> https://lore.kernel.org/qemu-devel/20230621080204.420723-1-zhenzhong.d...@intel.com/
> 
> [3]
> https://lore.kernel.org/qemu-devel/20230626082353.18535-1-avih...@nvidia.com/
> 
> Avihai Horon (2):
>   vfio/migration: Reset bytes_transferred properly
>   vfio/migration: Make VFIO migration non-experimental
> 
>  include/hw/vfio/vfio-common.h |  7 +--
>  migration/migration.h |  1 +
>  hw/vfio/common.c  | 16 ++-
>  hw/vfio/migration.c   | 85 ---
>  hw/vfio/pci.c |  4 +-
>  migration/migration.c |  1 +
>  migration/savevm.c|  1 +
>  migration/target.c| 17 ++-
>  hw/vfio/trace-events  |  2 +-
>  9 files changed, 97 insertions(+), 37 deletions(-)
> 

Reviewed-by: Alex Williamson

[QEMU][PATCH v1] tests/qtest: xlnx-canfd-test: Fix code coverity issues

2023-06-28 Thread Vikram Garhwal

Following are done to fix the coverity issues:
1. Change read_data to fix the CID 1512899: Out-of-bounds access (OVERRUN)
2. Fix match_rx_tx_data to fix CID 1512900: Logically dead code (DEADCODE)
3. Replace rand() in generate_random_data() with g_rand_int()

Signed-off-by: Vikram Garhwal 
---
 tests/qtest/xlnx-canfd-test.c | 33 +++--
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/tests/qtest/xlnx-canfd-test.c b/tests/qtest/xlnx-canfd-test.c
index 76ee106d4f..78ec9ef2a7 100644
--- a/tests/qtest/xlnx-canfd-test.c
+++ b/tests/qtest/xlnx-canfd-test.c
@@ -170,23 +170,23 @@ static void generate_random_data(uint32_t *buf_tx, bool 
is_canfd_frame)
 /* Generate random TX data for CANFD frame. */
 if (is_canfd_frame) {
 for (int i = 0; i < CANFD_FRAME_SIZE - 2; i++) {
-buf_tx[2 + i] = rand();
+buf_tx[2 + i] = g_random_int();
 }
 } else {
 /* Generate random TX data for CAN frame. */
 for (int i = 0; i < CAN_FRAME_SIZE - 2; i++) {
-buf_tx[2 + i] = rand();
+buf_tx[2 + i] = g_random_int();
 }
 }
 }
 
-static void read_data(QTestState *qts, uint64_t can_base_addr, uint32_t 
*buf_rx)
+static void read_data(QTestState *qts, uint64_t can_base_addr, uint32_t 
*buf_rx,
+  uint32_t frame_size)
 {
 uint32_t int_status;
 uint32_t fifo_status_reg_value;
 /* At which RX FIFO the received data is stored. */
 uint8_t store_ind = 0;
-bool is_canfd_frame = false;
 
 /* Read the interrupt on CANFD rx. */
 int_status = qtest_readl(qts, can_base_addr + R_ISR_OFFSET) & ISR_RXOK;
@@ -207,16 +207,9 @@ static void read_data(QTestState *qts, uint64_t 
can_base_addr, uint32_t *buf_rx)
 buf_rx[0] = qtest_readl(qts, can_base_addr + R_RX0_ID_OFFSET);
 buf_rx[1] = qtest_readl(qts, can_base_addr + R_RX0_DLC_OFFSET);
 
-is_canfd_frame = (buf_rx[1] >> DLC_FD_BIT_SHIFT) & 1;
-
-if (is_canfd_frame) {
-for (int i = 0; i < CANFD_FRAME_SIZE - 2; i++) {
-buf_rx[i + 2] = qtest_readl(qts,
-can_base_addr + R_RX0_DATA1_OFFSET + 4 * 
i);
-}
-} else {
-buf_rx[2] = qtest_readl(qts, can_base_addr + R_RX0_DATA1_OFFSET);
-buf_rx[3] = qtest_readl(qts, can_base_addr + R_RX0_DATA2_OFFSET);
+for (int i = 0; i < frame_size - 2; i++) {
+buf_rx[i + 2] = qtest_readl(qts,
+can_base_addr + R_RX0_DATA1_OFFSET + 4 * i);
 }
 
 /* Clear the RX interrupt. */
@@ -272,10 +265,6 @@ static void match_rx_tx_data(const uint32_t *buf_tx, const 
uint32_t *buf_rx,
 g_assert_cmpint((buf_rx[size] & DLC_FD_BIT_MASK), ==,
 (buf_tx[size] & DLC_FD_BIT_MASK));
 } else {
-if (!is_canfd_frame && size == 4) {
-break;
-}
-
 g_assert_cmpint(buf_rx[size], ==, buf_tx[size]);
 }
 
@@ -318,7 +307,7 @@ static void test_can_data_transfer(void)
 write_data(qts, CANFD0_BASE_ADDR, buf_tx, false);
 
 send_data(qts, CANFD0_BASE_ADDR);
-read_data(qts, CANFD1_BASE_ADDR, buf_rx);
+read_data(qts, CANFD1_BASE_ADDR, buf_rx, CAN_FRAME_SIZE);
 match_rx_tx_data(buf_tx, buf_rx, false);
 
 qtest_quit(qts);
@@ -358,7 +347,7 @@ static void test_canfd_data_transfer(void)
 write_data(qts, CANFD0_BASE_ADDR, buf_tx, true);
 
 send_data(qts, CANFD0_BASE_ADDR);
-read_data(qts, CANFD1_BASE_ADDR, buf_rx);
+read_data(qts, CANFD1_BASE_ADDR, buf_rx, CANFD_FRAME_SIZE);
 match_rx_tx_data(buf_tx, buf_rx, true);
 
 qtest_quit(qts);
@@ -397,7 +386,7 @@ static void test_can_loopback(void)
 write_data(qts, CANFD0_BASE_ADDR, buf_tx, true);
 
 send_data(qts, CANFD0_BASE_ADDR);
-read_data(qts, CANFD0_BASE_ADDR, buf_rx);
+read_data(qts, CANFD0_BASE_ADDR, buf_rx, CANFD_FRAME_SIZE);
 match_rx_tx_data(buf_tx, buf_rx, true);
 
 generate_random_data(buf_tx, true);
@@ -405,7 +394,7 @@ static void test_can_loopback(void)
 write_data(qts, CANFD1_BASE_ADDR, buf_tx, true);
 
 send_data(qts, CANFD1_BASE_ADDR);
-read_data(qts, CANFD1_BASE_ADDR, buf_rx);
+read_data(qts, CANFD1_BASE_ADDR, buf_rx, CANFD_FRAME_SIZE);
 match_rx_tx_data(buf_tx, buf_rx, true);
 
 qtest_quit(qts);
-- 
2.30.2

Re: [PATCH] target/arm: gdbstub: Guard M-profile code with CONFIG_TCG

2023-06-28 Thread Philippe Mathieu-Daudé


On 28/6/23 18:48, Fabiano Rosas wrote:

This code is only relevant when TCG is present in the build. Building
with --disable-tcg --enable-xen on an x86 host we get:

$ ../configure --target-list=x86_64-softmmu,aarch64-softmmu --disable-tcg 
--enable-xen
$ make -j$(nproc)
...
libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function `m_sysreg_ptr':
  ../target/arm/gdbstub.c:358: undefined reference to `arm_v7m_get_sp_ptr'
  ../target/arm/gdbstub.c:361: undefined reference to `arm_v7m_get_sp_ptr'

libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
`arm_gdb_get_m_systemreg':
../target/arm/gdbstub.c:405: undefined reference to `arm_v7m_mrs_control'


I'm a bit confused, isn't this covered by the cross-arm64-xen-only job?


Signed-off-by: Fabiano Rosas 
---
This is a respin of:
https://lore.kernel.org/r/20230313151058.19645-5-faro...@suse.de
---
  target/arm/gdbstub.c | 4 
  1 file changed, 4 insertions(+)

Re: [PATCH] MAINTAINERS: Promote Cédric to VFIO co-maintainer

2023-06-28 Thread Philippe Mathieu-Daudé


On 28/6/23 19:29, Alex Williamson wrote:

Cédric has stepped up involvement in vfio, reviewing and managing
patches, as well as pull requests.  This work deserves gratitude and
punishment with a promotion to co-maintainer ;)

Signed-off-by: Alex Williamson 
---

Cédric, I'd also support if you wanted to add a tree entry here.

  MAINTAINERS | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé

[PATCH v2 10/16] hw/pci-host/i440fx: Make MemoryRegion pointers accessible as properties

2023-06-28 Thread Bernhard Beschow

The goal is to eliminate i440fx_init() which is a legacy init function. This
neccessitates the memory regions to be properties, like in Q35, which will be
assigned in board code.

Since i440fx needs different PCI devices in Xen mode, and since i440fx shall
be self-contained, the PCI device will be created during realization of the
host. Thus the pointers need to be moved to the host structure to be usable as
properties.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Philippe Mathieu-Daudé 
---
 include/hw/pci-host/i440fx.h |  3 ---
 hw/pci-host/i440fx.c | 42 +---
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/include/hw/pci-host/i440fx.h b/include/hw/pci-host/i440fx.h
index bf57216c78..e3a550021e 100644
--- a/include/hw/pci-host/i440fx.h
+++ b/include/hw/pci-host/i440fx.h
@@ -25,9 +25,6 @@ struct PCII440FXState {
 PCIDevice parent_obj;
 /*< public >*/
 
-MemoryRegion *system_memory;
-MemoryRegion *pci_address_space;
-MemoryRegion *ram_memory;
 PAMMemoryRegion pam_regions[PAM_REGIONS_COUNT];
 MemoryRegion smram_region;
 MemoryRegion smram, low_smram;
diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index 9df4688b2e..efc940ba12 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -47,6 +47,10 @@ OBJECT_DECLARE_SIMPLE_TYPE(I440FXState, 
I440FX_PCI_HOST_BRIDGE)
 
 struct I440FXState {
 PCIHostState parent_obj;
+
+MemoryRegion *system_memory;
+MemoryRegion *pci_address_space;
+MemoryRegion *ram_memory;
 Range pci_hole;
 uint64_t pci_hole64_size;
 bool pci_hole64_fix;
@@ -214,12 +218,25 @@ static void i440fx_pcihost_get_pci_hole64_end(Object 
*obj, Visitor *v,
 
 static void i440fx_pcihost_initfn(Object *obj)
 {
+I440FXState *s = I440FX_PCI_HOST_BRIDGE(obj);
 PCIHostState *phb = PCI_HOST_BRIDGE(obj);
 
 memory_region_init_io(&phb->conf_mem, obj, &pci_host_conf_le_ops, phb,
   "pci-conf-idx", 4);
 memory_region_init_io(&phb->data_mem, obj, &pci_host_data_le_ops, phb,
   "pci-conf-data", 4);
+
+object_property_add_link(obj, PCI_HOST_PROP_RAM_MEM, TYPE_MEMORY_REGION,
+ (Object **) &s->ram_memory,
+ qdev_prop_allow_set_link_before_realize, 0);
+
+object_property_add_link(obj, PCI_HOST_PROP_PCI_MEM, TYPE_MEMORY_REGION,
+ (Object **) &s->pci_address_space,
+ qdev_prop_allow_set_link_before_realize, 0);
+
+object_property_add_link(obj, PCI_HOST_PROP_SYSTEM_MEM, TYPE_MEMORY_REGION,
+ (Object **) &s->system_memory,
+ qdev_prop_allow_set_link_before_realize, 0);
 }
 
 static void i440fx_pcihost_realize(DeviceState *dev, Error **errp)
@@ -255,7 +272,11 @@ PCIBus *i440fx_init(const char *pci_type,
 PCII440FXState *f;
 unsigned i;
 
-b = pci_root_bus_new(dev, NULL, pci_address_space,
+s->system_memory = address_space_mem;
+s->pci_address_space = pci_address_space;
+s->ram_memory = ram_memory;
+
+b = pci_root_bus_new(dev, NULL, s->pci_address_space,
  address_space_io, 0, TYPE_PCI_BUS);
 phb->bus = b;
 object_property_add_child(qdev_get_machine(), "i440fx", OBJECT(dev));
@@ -263,20 +284,17 @@ PCIBus *i440fx_init(const char *pci_type,
 
 d = pci_create_simple(b, 0, pci_type);
 f = I440FX_PCI_DEVICE(d);
-f->system_memory = address_space_mem;
-f->pci_address_space = pci_address_space;
-f->ram_memory = ram_memory;
 
 range_set_bounds(&s->pci_hole, below_4g_mem_size,
  IO_APIC_DEFAULT_ADDRESS - 1);
 
 /* setup pci memory mapping */
-pc_pci_as_mapping_init(f->system_memory, f->pci_address_space);
+pc_pci_as_mapping_init(s->system_memory, s->pci_address_space);
 
 /* if *disabled* show SMRAM to all CPUs */
 memory_region_init_alias(&f->smram_region, OBJECT(d), "smram-region",
- f->pci_address_space, SMRAM_C_BASE, SMRAM_C_SIZE);
-memory_region_add_subregion_overlap(f->system_memory, SMRAM_C_BASE,
+ s->pci_address_space, SMRAM_C_BASE, SMRAM_C_SIZE);
+memory_region_add_subregion_overlap(s->system_memory, SMRAM_C_BASE,
 &f->smram_region, 1);
 memory_region_set_enabled(&f->smram_region, true);
 
@@ -284,17 +302,17 @@ PCIBus *i440fx_init(const char *pci_type,
 memory_region_init(&f->smram, OBJECT(d), "smram", 4 * GiB);
 memory_region_set_enabled(&f->smram, true);
 memory_region_init_alias(&f->low_smram, OBJECT(d), "smram-low",
- f->ram_memory, SMRAM_C_BASE, SMRAM_C_SIZE);
+ s->ram_memory, SMRAM_C_BASE, SMRAM_C_SIZE);
 memory_region_set_enabled(&f->low_smram, true);
 memory_region_add_subregion(&f->smram, SMRAM_C_BASE, &f->low_smram);
 object_property_add_con

[PATCH v2 05/16] hw/pci-host/q35: Initialize PCI_HOST_BYPASS_IOMMU property from board code

2023-06-28 Thread Bernhard Beschow

The Q35 PCI host already has a PCI_HOST_BYPASS_IOMMU property. However, the
host initializes this property itself by accessing global machine state,
thereby assuming it to be a PC machine. Avoid this by having board code
set this property.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Igor Mammedov 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/i386/pc_q35.c  | 2 ++
 hw/pci-host/q35.c | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 4edc0b35f4..852250e8cb 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -230,6 +230,8 @@ static void pc_q35_init(MachineState *machine)
 x86ms->below_4g_mem_size, NULL);
 object_property_set_int(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
 x86ms->above_4g_mem_size, NULL);
+object_property_set_bool(phb, PCI_HOST_BYPASS_IOMMU,
+ pcms->default_bus_bypass_iommu, NULL);
 sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
 
 /* pci */
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 0604464074..d2830cee34 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -66,8 +66,7 @@ static void q35_host_realize(DeviceState *dev, Error **errp)
 s->mch.pci_address_space,
 s->mch.address_space_io,
 0, TYPE_PCIE_BUS);
-pci->bypass_iommu =
-PC_MACHINE(qdev_get_machine())->default_bus_bypass_iommu;
+
 qdev_realize(DEVICE(&s->mch), BUS(pci->bus), &error_fatal);
 }
 
-- 
2.41.0

[PATCH v2 11/16] hw/pci-host/i440fx: Add PCI_HOST_PROP_IO_MEM property

2023-06-28 Thread Bernhard Beschow

Introduce the property in anticipation of QOM'ification; Q35 has the same
property.

Signed-off-by: Bernhard Beschow 
---
 hw/pci-host/i440fx.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index efc940ba12..a740d1762b 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -27,7 +27,6 @@
 #include "qemu/range.h"
 #include "hw/i386/pc.h"
 #include "hw/pci/pci.h"
-#include "hw/pci/pci_bus.h"
 #include "hw/pci/pci_host.h"
 #include "hw/pci-host/i440fx.h"
 #include "hw/qdev-properties.h"
@@ -49,6 +48,7 @@ struct I440FXState {
 PCIHostState parent_obj;
 
 MemoryRegion *system_memory;
+MemoryRegion *io_memory;
 MemoryRegion *pci_address_space;
 MemoryRegion *ram_memory;
 Range pci_hole;
@@ -237,17 +237,22 @@ static void i440fx_pcihost_initfn(Object *obj)
 object_property_add_link(obj, PCI_HOST_PROP_SYSTEM_MEM, TYPE_MEMORY_REGION,
  (Object **) &s->system_memory,
  qdev_prop_allow_set_link_before_realize, 0);
+
+object_property_add_link(obj, PCI_HOST_PROP_IO_MEM, TYPE_MEMORY_REGION,
+ (Object **) &s->io_memory,
+ qdev_prop_allow_set_link_before_realize, 0);
 }
 
 static void i440fx_pcihost_realize(DeviceState *dev, Error **errp)
 {
+I440FXState *s = I440FX_PCI_HOST_BRIDGE(dev);
 PCIHostState *phb = PCI_HOST_BRIDGE(dev);
 SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 
-memory_region_add_subregion(phb->bus->address_space_io, 0xcf8, 
&phb->conf_mem);
+memory_region_add_subregion(s->io_memory, 0xcf8, &phb->conf_mem);
 sysbus_init_ioports(sbd, 0xcf8, 4);
 
-memory_region_add_subregion(phb->bus->address_space_io, 0xcfc, 
&phb->data_mem);
+memory_region_add_subregion(s->io_memory, 0xcfc, &phb->data_mem);
 sysbus_init_ioports(sbd, 0xcfc, 4);
 
 /* register i440fx 0xcf8 port as coalesced pio */
@@ -273,11 +278,12 @@ PCIBus *i440fx_init(const char *pci_type,
 unsigned i;
 
 s->system_memory = address_space_mem;
+s->io_memory = address_space_io;
 s->pci_address_space = pci_address_space;
 s->ram_memory = ram_memory;
 
 b = pci_root_bus_new(dev, NULL, s->pci_address_space,
- address_space_io, 0, TYPE_PCI_BUS);
+ s->io_memory, 0, TYPE_PCI_BUS);
 phb->bus = b;
 object_property_add_child(qdev_get_machine(), "i440fx", OBJECT(dev));
 sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
-- 
2.41.0

[PATCH v2 09/16] hw/pci-host/i440fx: Move i440fx_realize() into PCII440FXState section

2023-06-28 Thread Bernhard Beschow

i440fx_realize() realizes the PCI device inside the host bridge
(PCII440FXState), but is implemented between i440fx_pcihost_realize() and
i440fx_init() which deal with the host bridge itself (I440FXState). Since we
want to append i440fx_init() to i440fx_pcihost_realize() later let's move
i440fx_realize() out of the way.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/pci-host/i440fx.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index 88beaf99c4..9df4688b2e 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -65,6 +65,15 @@ struct I440FXState {
  */
 #define I440FX_COREBOOT_RAM_SIZE 0x57
 
+static void i440fx_realize(PCIDevice *dev, Error **errp)
+{
+dev->config[I440FX_SMRAM] = 0x02;
+
+if (object_property_get_bool(qdev_get_machine(), "iommu", NULL)) {
+warn_report("i440fx doesn't support emulated iommu");
+}
+}
+
 static void i440fx_update_memory_mappings(PCII440FXState *d)
 {
 int i;
@@ -229,15 +238,6 @@ static void i440fx_pcihost_realize(DeviceState *dev, Error 
**errp)
 memory_region_add_coalescing(&phb->conf_mem, 0, 4);
 }
 
-static void i440fx_realize(PCIDevice *dev, Error **errp)
-{
-dev->config[I440FX_SMRAM] = 0x02;
-
-if (object_property_get_bool(qdev_get_machine(), "iommu", NULL)) {
-warn_report("i440fx doesn't support emulated iommu");
-}
-}
-
 PCIBus *i440fx_init(const char *pci_type,
 DeviceState *dev,
 MemoryRegion *address_space_mem,
-- 
2.41.0

[PATCH v2 15/16] hw/i386/pc_piix: Turn some local variables into initializers

2023-06-28 Thread Bernhard Beschow

Eliminates an else branch.

Suggested-by: Igor Mammedov 
Signed-off-by: Bernhard Beschow 
---
 hw/i386/pc_piix.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 1df309b8e2..d07218a8c9 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -122,11 +122,11 @@ static void pc_init1(MachineState *machine,
 BusState *idebus[MAX_IDE_BUS];
 ISADevice *rtc_state;
 MemoryRegion *ram_memory;
-MemoryRegion *pci_memory;
-MemoryRegion *rom_memory;
+MemoryRegion *pci_memory = NULL;
+MemoryRegion *rom_memory = system_memory;
 ram_addr_t lowmem;
-uint64_t hole64_size;
-Object *i440fx_host;
+uint64_t hole64_size = 0;
+Object *i440fx_host = NULL;
 
 /*
  * Calculate ram split, for memory below and above 4G.  It's a bit
@@ -205,11 +205,6 @@ static void pc_init1(MachineState *machine,
 hole64_size = object_property_get_uint(i440fx_host,
PCI_HOST_PROP_PCI_HOLE64_SIZE,
&error_abort);
-} else {
-pci_memory = NULL;
-rom_memory = system_memory;
-i440fx_host = NULL;
-hole64_size = 0;
 }
 
 pc_guest_info_init(pcms);
-- 
2.41.0

[PATCH v2 13/16] hw/pci-host/i440fx: Add I440FX_HOST_PROP_PCI_TYPE property

2023-06-28 Thread Bernhard Beschow

I440FX needs a different PCI device model if the "igd-passthru" property is
enabled. The type name is currently passed as a parameter to i440fx_init(). This
parameter will be replaced by a property assignment once i440fx_init() gets
resolved.

Signed-off-by: Bernhard Beschow 
---
 include/hw/pci-host/i440fx.h | 2 ++
 hw/pci-host/i440fx.c | 6 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/hw/pci-host/i440fx.h b/include/hw/pci-host/i440fx.h
index 7e38456ebb..2d7bae5a45 100644
--- a/include/hw/pci-host/i440fx.h
+++ b/include/hw/pci-host/i440fx.h
@@ -15,6 +15,8 @@
 #include "hw/pci-host/pam.h"
 #include "qom/object.h"
 
+#define I440FX_HOST_PROP_PCI_TYPE "pci-type"
+
 #define TYPE_I440FX_PCI_HOST_BRIDGE "i440FX-pcihost"
 #define TYPE_I440FX_PCI_DEVICE "i440FX"
 
diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index 4dd70e68fa..e8e66afc11 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -57,6 +57,8 @@ struct I440FXState {
 uint64_t pci_hole64_size;
 bool pci_hole64_fix;
 uint32_t short_root_bus;
+
+char *pci_type;
 };
 
 #define I440FX_PAM  0x59
@@ -284,6 +286,7 @@ PCIBus *i440fx_init(const char *pci_type,
 s->ram_memory = ram_memory;
 s->below_4g_mem_size = below_4g_mem_size;
 s->above_4g_mem_size = above_4g_mem_size;
+s->pci_type = (char *)pci_type;
 
 b = pci_root_bus_new(dev, NULL, s->pci_address_space,
  s->io_memory, 0, TYPE_PCI_BUS);
@@ -291,7 +294,7 @@ PCIBus *i440fx_init(const char *pci_type,
 object_property_add_child(qdev_get_machine(), "i440fx", OBJECT(dev));
 sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
 
-d = pci_create_simple(b, 0, pci_type);
+d = pci_create_simple(b, 0, s->pci_type);
 f = I440FX_PCI_DEVICE(d);
 
 range_set_bounds(&s->pci_hole, s->below_4g_mem_size,
@@ -390,6 +393,7 @@ static Property i440fx_props[] = {
 DEFINE_PROP_SIZE(PCI_HOST_ABOVE_4G_MEM_SIZE, I440FXState,
  above_4g_mem_size, 0),
 DEFINE_PROP_BOOL("x-pci-hole64-fix", I440FXState, pci_hole64_fix, true),
+DEFINE_PROP_STRING(I440FX_HOST_PROP_PCI_TYPE, I440FXState, pci_type),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.41.0

[PATCH v2 08/16] hw/pci-host/i440fx: Have common names for some local variables

2023-06-28 Thread Bernhard Beschow

`PCIHostState` is often referred to as `phb`, own device state usually as `s`.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/pci-host/i440fx.c | 26 --
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index daa4d11104..88beaf99c4 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -205,28 +205,28 @@ static void i440fx_pcihost_get_pci_hole64_end(Object 
*obj, Visitor *v,
 
 static void i440fx_pcihost_initfn(Object *obj)
 {
-PCIHostState *s = PCI_HOST_BRIDGE(obj);
+PCIHostState *phb = PCI_HOST_BRIDGE(obj);
 
-memory_region_init_io(&s->conf_mem, obj, &pci_host_conf_le_ops, s,
+memory_region_init_io(&phb->conf_mem, obj, &pci_host_conf_le_ops, phb,
   "pci-conf-idx", 4);
-memory_region_init_io(&s->data_mem, obj, &pci_host_data_le_ops, s,
+memory_region_init_io(&phb->data_mem, obj, &pci_host_data_le_ops, phb,
   "pci-conf-data", 4);
 }
 
 static void i440fx_pcihost_realize(DeviceState *dev, Error **errp)
 {
-PCIHostState *s = PCI_HOST_BRIDGE(dev);
+PCIHostState *phb = PCI_HOST_BRIDGE(dev);
 SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 
-memory_region_add_subregion(s->bus->address_space_io, 0xcf8, &s->conf_mem);
+memory_region_add_subregion(phb->bus->address_space_io, 0xcf8, 
&phb->conf_mem);
 sysbus_init_ioports(sbd, 0xcf8, 4);
 
-memory_region_add_subregion(s->bus->address_space_io, 0xcfc, &s->data_mem);
+memory_region_add_subregion(phb->bus->address_space_io, 0xcfc, 
&phb->data_mem);
 sysbus_init_ioports(sbd, 0xcfc, 4);
 
 /* register i440fx 0xcf8 port as coalesced pio */
-memory_region_set_flush_coalesced(&s->data_mem);
-memory_region_add_coalescing(&s->conf_mem, 0, 4);
+memory_region_set_flush_coalesced(&phb->data_mem);
+memory_region_add_coalescing(&phb->conf_mem, 0, 4);
 }
 
 static void i440fx_realize(PCIDevice *dev, Error **errp)
@@ -248,17 +248,16 @@ PCIBus *i440fx_init(const char *pci_type,
 MemoryRegion *pci_address_space,
 MemoryRegion *ram_memory)
 {
+I440FXState *s = I440FX_PCI_HOST_BRIDGE(dev);
+PCIHostState *phb = PCI_HOST_BRIDGE(dev);
 PCIBus *b;
 PCIDevice *d;
-PCIHostState *s;
 PCII440FXState *f;
 unsigned i;
-I440FXState *i440fx;
 
-s = PCI_HOST_BRIDGE(dev);
 b = pci_root_bus_new(dev, NULL, pci_address_space,
  address_space_io, 0, TYPE_PCI_BUS);
-s->bus = b;
+phb->bus = b;
 object_property_add_child(qdev_get_machine(), "i440fx", OBJECT(dev));
 sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
 
@@ -268,8 +267,7 @@ PCIBus *i440fx_init(const char *pci_type,
 f->pci_address_space = pci_address_space;
 f->ram_memory = ram_memory;
 
-i440fx = I440FX_PCI_HOST_BRIDGE(dev);
-range_set_bounds(&i440fx->pci_hole, below_4g_mem_size,
+range_set_bounds(&s->pci_hole, below_4g_mem_size,
  IO_APIC_DEFAULT_ADDRESS - 1);
 
 /* setup pci memory mapping */
-- 
2.41.0

[PATCH v2 14/16] hw/pci-host/i440fx: Resolve i440fx_init()

2023-06-28 Thread Bernhard Beschow

i440fx_init() is a legacy init function. The previous patches worked towards
TYPE_I440FX_PCI_HOST_BRIDGE to be instantiated the QOM way. Do this now by
transforming the parameters passed to i440fx_init() into property assignments.

Signed-off-by: Bernhard Beschow 
---
 include/hw/pci-host/i440fx.h | 10 --
 hw/i386/pc_piix.c| 30 +-
 hw/pci-host/i440fx.c | 34 +-
 3 files changed, 26 insertions(+), 48 deletions(-)

diff --git a/include/hw/pci-host/i440fx.h b/include/hw/pci-host/i440fx.h
index 2d7bae5a45..c988f70890 100644
--- a/include/hw/pci-host/i440fx.h
+++ b/include/hw/pci-host/i440fx.h
@@ -34,14 +34,4 @@ struct PCII440FXState {
 
 #define TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE "igd-passthrough-i440FX"
 
-PCIBus *i440fx_init(const char *pci_type,
-DeviceState *dev,
-MemoryRegion *address_space_mem,
-MemoryRegion *address_space_io,
-ram_addr_t below_4g_mem_size,
-ram_addr_t above_4g_mem_size,
-MemoryRegion *pci_memory,
-MemoryRegion *ram_memory);
-
-
 #endif
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 87bee368fc..1df309b8e2 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -126,7 +126,7 @@ static void pc_init1(MachineState *machine,
 MemoryRegion *rom_memory;
 ram_addr_t lowmem;
 uint64_t hole64_size;
-DeviceState *i440fx_host;
+Object *i440fx_host;
 
 /*
  * Calculate ram split, for memory below and above 4G.  It's a bit
@@ -201,8 +201,8 @@ static void pc_init1(MachineState *machine,
 pci_memory = g_new(MemoryRegion, 1);
 memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
 rom_memory = pci_memory;
-i440fx_host = qdev_new(host_type);
-hole64_size = object_property_get_uint(OBJECT(i440fx_host),
+i440fx_host = OBJECT(qdev_new(host_type));
+hole64_size = object_property_get_uint(i440fx_host,
PCI_HOST_PROP_PCI_HOLE64_SIZE,
&error_abort);
 } else {
@@ -243,12 +243,24 @@ static void pc_init1(MachineState *machine,
 PIIX3State *piix3;
 PCIDevice *pci_dev;
 
-pci_bus = i440fx_init(pci_type,
-  i440fx_host,
-  system_memory, system_io,
-  x86ms->below_4g_mem_size,
-  x86ms->above_4g_mem_size,
-  pci_memory, ram_memory);
+object_property_add_child(OBJECT(machine), "i440fx", i440fx_host);
+object_property_set_link(i440fx_host, PCI_HOST_PROP_RAM_MEM,
+ OBJECT(ram_memory), &error_fatal);
+object_property_set_link(i440fx_host, PCI_HOST_PROP_PCI_MEM,
+ OBJECT(pci_memory), &error_fatal);
+object_property_set_link(i440fx_host, PCI_HOST_PROP_SYSTEM_MEM,
+ OBJECT(system_memory), &error_fatal);
+object_property_set_link(i440fx_host, PCI_HOST_PROP_IO_MEM,
+ OBJECT(system_io), &error_fatal);
+object_property_set_uint(i440fx_host, PCI_HOST_BELOW_4G_MEM_SIZE,
+ x86ms->below_4g_mem_size, &error_fatal);
+object_property_set_uint(i440fx_host, PCI_HOST_ABOVE_4G_MEM_SIZE,
+ x86ms->above_4g_mem_size, &error_fatal);
+object_property_set_str(i440fx_host, I440FX_HOST_PROP_PCI_TYPE,
+pci_type, &error_fatal);
+sysbus_realize_and_unref(SYS_BUS_DEVICE(i440fx_host), &error_fatal);
+
+pci_bus = PCI_BUS(qdev_get_child_bus(DEVICE(i440fx_host), "pci.0"));
 pci_bus_map_irqs(pci_bus,
  xen_enabled() ? xen_pci_slot_get_pirq
: pc_pci_slot_get_pirq);
diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index e8e66afc11..62d6287681 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -249,9 +249,14 @@ static void i440fx_pcihost_initfn(Object *obj)
 
 static void i440fx_pcihost_realize(DeviceState *dev, Error **errp)
 {
+ERRP_GUARD();
 I440FXState *s = I440FX_PCI_HOST_BRIDGE(dev);
 PCIHostState *phb = PCI_HOST_BRIDGE(dev);
 SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
+PCIBus *b;
+PCIDevice *d;
+PCII440FXState *f;
+unsigned i;
 
 memory_region_add_subregion(s->io_memory, 0xcf8, &phb->conf_mem);
 sysbus_init_ioports(sbd, 0xcf8, 4);
@@ -262,37 +267,10 @@ static void i440fx_pcihost_realize(DeviceState *dev, 
Error **errp)
 /* register i440fx 0xcf8 port as coalesced pio */
 memory_region_set_flush_coalesced(&phb->data_mem);
 memory_region_add_coalescing(&phb->conf_mem, 0, 4);
-}
-
-PCIBus *i440fx_init(const char *pci_type,
-

[PATCH v2 04/16] hw/pci/pci_host: Introduce PCI_HOST_BYPASS_IOMMU macro

2023-06-28 Thread Bernhard Beschow

Introduce a macro to avoid copy and pasting strings which can easily
cause typos.

Suggested-by: Michael S. Tsirkin 
Signed-off-by: Bernhard Beschow 
Reviewed-by: Igor Mammedov 
---
 include/hw/pci/pci_host.h | 2 ++
 hw/pci/pci_host.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/hw/pci/pci_host.h b/include/hw/pci/pci_host.h
index c6f4eb4585..e52d8ec2cd 100644
--- a/include/hw/pci/pci_host.h
+++ b/include/hw/pci/pci_host.h
@@ -31,6 +31,8 @@
 #include "hw/sysbus.h"
 #include "qom/object.h"
 
+#define PCI_HOST_BYPASS_IOMMU "bypass-iommu"
+
 #define TYPE_PCI_HOST_BRIDGE "pci-host-bridge"
 OBJECT_DECLARE_TYPE(PCIHostState, PCIHostBridgeClass, PCI_HOST_BRIDGE)
 
diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c
index dfd185bbb4..7af8afdcbe 100644
--- a/hw/pci/pci_host.c
+++ b/hw/pci/pci_host.c
@@ -232,7 +232,7 @@ const VMStateDescription vmstate_pcihost = {
 static Property pci_host_properties_common[] = {
 DEFINE_PROP_BOOL("x-config-reg-migration-enabled", PCIHostState,
  mig_enabled, true),
-DEFINE_PROP_BOOL("bypass-iommu", PCIHostState, bypass_iommu, false),
+DEFINE_PROP_BOOL(PCI_HOST_BYPASS_IOMMU, PCIHostState, bypass_iommu, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.41.0

[PATCH v2 07/16] hw/pci-host/i440fx: Replace magic values by existing constants

2023-06-28 Thread Bernhard Beschow

Signed-off-by: Bernhard Beschow 
Reviewed-by: Igor Mammedov 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/pci-host/i440fx.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index 61e7b97ff4..daa4d11104 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -277,8 +277,8 @@ PCIBus *i440fx_init(const char *pci_type,
 
 /* if *disabled* show SMRAM to all CPUs */
 memory_region_init_alias(&f->smram_region, OBJECT(d), "smram-region",
- f->pci_address_space, 0xa, 0x2);
-memory_region_add_subregion_overlap(f->system_memory, 0xa,
+ f->pci_address_space, SMRAM_C_BASE, SMRAM_C_SIZE);
+memory_region_add_subregion_overlap(f->system_memory, SMRAM_C_BASE,
 &f->smram_region, 1);
 memory_region_set_enabled(&f->smram_region, true);
 
@@ -286,9 +286,9 @@ PCIBus *i440fx_init(const char *pci_type,
 memory_region_init(&f->smram, OBJECT(d), "smram", 4 * GiB);
 memory_region_set_enabled(&f->smram, true);
 memory_region_init_alias(&f->low_smram, OBJECT(d), "smram-low",
- f->ram_memory, 0xa, 0x2);
+ f->ram_memory, SMRAM_C_BASE, SMRAM_C_SIZE);
 memory_region_set_enabled(&f->low_smram, true);
-memory_region_add_subregion(&f->smram, 0xa, &f->low_smram);
+memory_region_add_subregion(&f->smram, SMRAM_C_BASE, &f->low_smram);
 object_property_add_const_link(qdev_get_machine(), "smram",
OBJECT(&f->smram));
 
-- 
2.41.0

[PATCH v2 02/16] hw/pci-host/q35: Fix double, contradicting .endianness assignment

2023-06-28 Thread Bernhard Beschow

Fixes the following clangd warning (-Winitializer-overrides):

  q35.c:297:19: Initializer overrides prior initialization of this subobject
  q35.c:292:19: previous initialization is here

Settle on little endian which is consistent with using pci_host_conf_le_ops.

Fixes: bafc90bdc594 ("q35: implement TSEG")
Signed-off-by: Bernhard Beschow 
---
 hw/pci-host/q35.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index fd18920e7f..84137b9ad9 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -285,7 +285,6 @@ static void blackhole_write(void *opaque, hwaddr addr, 
uint64_t val,
 static const MemoryRegionOps blackhole_ops = {
 .read = blackhole_read,
 .write = blackhole_write,
-.endianness = DEVICE_NATIVE_ENDIAN,
 .valid.min_access_size = 1,
 .valid.max_access_size = 4,
 .impl.min_access_size = 4,
-- 
2.41.0

[PATCH v2 16/16] hw/i386/pc_piix: Move i440fx' realize near its qdev_new()

2023-06-28 Thread Bernhard Beschow

I440FX realization is currently mixed with PIIX3 creation. Furthermore, it is
common practice to only set properties between a device's qdev_new() and
qdev_realize(). Clean up to resolve both issues.

Since I440FX spawns a PCI bus let's also move the pci_bus initialization there.

Note that when running `qemu-system-x86_64 -M pc -S` before and after this
patch, `info mtree` in the QEMU console doesn't show any differences except that
the ordering is different.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/i386/pc_piix.c | 57 ---
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index d07218a8c9..b18443d3df 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -114,7 +114,7 @@ static void pc_init1(MachineState *machine,
 X86MachineState *x86ms = X86_MACHINE(machine);
 MemoryRegion *system_memory = get_system_memory();
 MemoryRegion *system_io = get_system_io();
-PCIBus *pci_bus;
+PCIBus *pci_bus = NULL;
 ISABus *isa_bus;
 int piix3_devfn = -1;
 qemu_irq smi_irq;
@@ -126,7 +126,6 @@ static void pc_init1(MachineState *machine,
 MemoryRegion *rom_memory = system_memory;
 ram_addr_t lowmem;
 uint64_t hole64_size = 0;
-Object *i440fx_host = NULL;
 
 /*
  * Calculate ram split, for memory below and above 4G.  It's a bit
@@ -198,11 +197,37 @@ static void pc_init1(MachineState *machine,
 }
 
 if (pcmc->pci_enabled) {
+Object *phb;
+
 pci_memory = g_new(MemoryRegion, 1);
 memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
 rom_memory = pci_memory;
-i440fx_host = OBJECT(qdev_new(host_type));
-hole64_size = object_property_get_uint(i440fx_host,
+
+phb = OBJECT(qdev_new(host_type));
+object_property_add_child(OBJECT(machine), "i440fx", phb);
+object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM,
+ OBJECT(ram_memory), &error_fatal);
+object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM,
+ OBJECT(pci_memory), &error_fatal);
+object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM,
+ OBJECT(system_memory), &error_fatal);
+object_property_set_link(phb, PCI_HOST_PROP_IO_MEM,
+ OBJECT(system_io), &error_fatal);
+object_property_set_uint(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
+ x86ms->below_4g_mem_size, &error_fatal);
+object_property_set_uint(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
+ x86ms->above_4g_mem_size, &error_fatal);
+object_property_set_str(phb, I440FX_HOST_PROP_PCI_TYPE, pci_type,
+&error_fatal);
+sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
+
+pci_bus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pci.0"));
+pci_bus_map_irqs(pci_bus,
+ xen_enabled() ? xen_pci_slot_get_pirq
+   : pc_pci_slot_get_pirq);
+pcms->bus = pci_bus;
+
+hole64_size = object_property_get_uint(phb,
PCI_HOST_PROP_PCI_HOLE64_SIZE,
&error_abort);
 }
@@ -238,29 +263,6 @@ static void pc_init1(MachineState *machine,
 PIIX3State *piix3;
 PCIDevice *pci_dev;
 
-object_property_add_child(OBJECT(machine), "i440fx", i440fx_host);
-object_property_set_link(i440fx_host, PCI_HOST_PROP_RAM_MEM,
- OBJECT(ram_memory), &error_fatal);
-object_property_set_link(i440fx_host, PCI_HOST_PROP_PCI_MEM,
- OBJECT(pci_memory), &error_fatal);
-object_property_set_link(i440fx_host, PCI_HOST_PROP_SYSTEM_MEM,
- OBJECT(system_memory), &error_fatal);
-object_property_set_link(i440fx_host, PCI_HOST_PROP_IO_MEM,
- OBJECT(system_io), &error_fatal);
-object_property_set_uint(i440fx_host, PCI_HOST_BELOW_4G_MEM_SIZE,
- x86ms->below_4g_mem_size, &error_fatal);
-object_property_set_uint(i440fx_host, PCI_HOST_ABOVE_4G_MEM_SIZE,
- x86ms->above_4g_mem_size, &error_fatal);
-object_property_set_str(i440fx_host, I440FX_HOST_PROP_PCI_TYPE,
-pci_type, &error_fatal);
-sysbus_realize_and_unref(SYS_BUS_DEVICE(i440fx_host), &error_fatal);
-
-pci_bus = PCI_BUS(qdev_get_child_bus(DEVICE(i440fx_host), "pci.0"));
-pci_bus_map_irqs(pci_bus,
- xen_enabled() ? xen_pci_slot_get_pirq
-   : pc_pci_slot_get_pirq);
-pcms->bus = pci_bus;
-
 pci_dev = pci_create_simple_multifunction(pci

[PATCH v2 00/16] Q35 and I440FX host bridge QOM cleanup

2023-06-28 Thread Bernhard Beschow

This series resolves the legacy i440fx_init() function and instantiates the
I440FX host bridge the QOM way. As a preparation the Q35 host bridge receives
some cleanup as well.

Most of the Q35 patches have been submitted under [1] before. This series
incorporates only the changes making the two device models consistent with
each other.

The original plan of [1] was to clean up Q35 first and then submit a separate
follow-up I440FX QOM'ification series. This series takes a more direct approach
by cutting down on the changes in both device models while still allowing both
device models to be instantiated the same way. The remaining patches in [1]
would still be doable.

The series is structured as follows: The first 5 patches clean up Q35, the next
patch massages Q35 to share its property names with I440FX. The rest of the
series resolves i440fx_init().

Tesging done:
* `make check`
* `make check-avocado`
* Run `xl create` under Xen with the following config:
name = "Manjaro"
type = 'hvm'
memory = 1536
apic = 1
usb = 1
disk = [ "file:manjaro-kde-21.2.6-220416-linux515.iso,hdc:cdrom,r" ]
device_model_override = "/usr/bin/qemu-system-x86_64"
vga = "stdvga"
sdl = 1

v2:
* Rename `address_space_io` to `io_memory` (Phil)
* Eliminate one else branch in pc_piix (Igor)
* Make Q35's blackhole_ops DEVICE_LITTLE_ENDIAN again (Igor)
* Possibly ongoing discussion regarding bringing together i440fx new and realize

[1] https://patchew.org/QEMU/20230304152648.103749-1-shen...@gmail.com/

Bernhard Beschow (16):
  hw/i386/pc_q35: Resolve redundant q35_host variable
  hw/pci-host/q35: Fix double, contradicting .endianness assignment
  hw/pci-host/q35: Initialize PCMachineState::bus in board code
  hw/pci/pci_host: Introduce PCI_HOST_BYPASS_IOMMU macro
  hw/pci-host/q35: Initialize PCI_HOST_BYPASS_IOMMU property from board
code
  hw/pci-host/q35: Make some property name macros reusable by i440fx
  hw/pci-host/i440fx: Replace magic values by existing constants
  hw/pci-host/i440fx: Have common names for some local variables
  hw/pci-host/i440fx: Move i440fx_realize() into PCII440FXState section
  hw/pci-host/i440fx: Make MemoryRegion pointers accessible as
properties
  hw/pci-host/i440fx: Add PCI_HOST_PROP_IO_MEM property
  hw/pci-host/i440fx: Add PCI_HOST_{ABOVE, BELOW}_4G_MEM_SIZE properties
  hw/pci-host/i440fx: Add I440FX_HOST_PROP_PCI_TYPE property
  hw/pci-host/i440fx: Resolve i440fx_init()
  hw/i386/pc_piix: Turn some local variables into initializers
  hw/i386/pc_piix: Move i440fx' realize near its qdev_new()

 include/hw/i386/pc.h |   4 ++
 include/hw/pci-host/i440fx.h |  16 +
 include/hw/pci-host/q35.h|   5 --
 include/hw/pci/pci_host.h|   2 +
 hw/i386/pc_piix.c|  59 +---
 hw/i386/pc_q35.c |  31 +
 hw/pci-host/i440fx.c | 128 +++
 hw/pci-host/q35.c|  13 ++--
 hw/pci/pci_host.c|   2 +-
 9 files changed, 135 insertions(+), 125 deletions(-)

-- 
2.41.0

[PATCH v2 12/16] hw/pci-host/i440fx: Add PCI_HOST_{ABOVE, BELOW}_4G_MEM_SIZE properties

2023-06-28 Thread Bernhard Beschow

Introduce the properties in anticipation of QOM'ification; Q35 has the same
properties.

Note that we want to avoid a "ram size" property in the QOM interface since it
seems redundant to both properties introduced in this change. Thus the removal
of the ram_size parameter. We assume the invariant of both properties to sum up
to "ram size" which is already asserted in pc_memory_init(). Under Xen the
invariant seems to hold as well, so we now also check it there.

Signed-off-by: Bernhard Beschow 
---
 include/hw/pci-host/i440fx.h |  1 -
 hw/i386/pc_piix.c|  5 -
 hw/pci-host/i440fx.c | 12 ++--
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/include/hw/pci-host/i440fx.h b/include/hw/pci-host/i440fx.h
index e3a550021e..7e38456ebb 100644
--- a/include/hw/pci-host/i440fx.h
+++ b/include/hw/pci-host/i440fx.h
@@ -36,7 +36,6 @@ PCIBus *i440fx_init(const char *pci_type,
 DeviceState *dev,
 MemoryRegion *address_space_mem,
 MemoryRegion *address_space_io,
-ram_addr_t ram_size,
 ram_addr_t below_4g_mem_size,
 ram_addr_t above_4g_mem_size,
 MemoryRegion *pci_memory,
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index f9947fbc10..87bee368fc 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -227,6 +227,9 @@ static void pc_init1(MachineState *machine,
 if (!xen_enabled()) {
 pc_memory_init(pcms, system_memory, rom_memory, hole64_size);
 } else {
+assert(machine->ram_size == x86ms->below_4g_mem_size +
+x86ms->above_4g_mem_size);
+
 pc_system_flash_cleanup_unused(pcms);
 if (machine->kernel_filename != NULL) {
 /* For xen HVM direct kernel boot, load linux here */
@@ -242,7 +245,7 @@ static void pc_init1(MachineState *machine,
 
 pci_bus = i440fx_init(pci_type,
   i440fx_host,
-  system_memory, system_io, machine->ram_size,
+  system_memory, system_io,
   x86ms->below_4g_mem_size,
   x86ms->above_4g_mem_size,
   pci_memory, ram_memory);
diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index a740d1762b..4dd70e68fa 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -52,6 +52,8 @@ struct I440FXState {
 MemoryRegion *pci_address_space;
 MemoryRegion *ram_memory;
 Range pci_hole;
+uint64_t below_4g_mem_size;
+uint64_t above_4g_mem_size;
 uint64_t pci_hole64_size;
 bool pci_hole64_fix;
 uint32_t short_root_bus;
@@ -264,7 +266,6 @@ PCIBus *i440fx_init(const char *pci_type,
 DeviceState *dev,
 MemoryRegion *address_space_mem,
 MemoryRegion *address_space_io,
-ram_addr_t ram_size,
 ram_addr_t below_4g_mem_size,
 ram_addr_t above_4g_mem_size,
 MemoryRegion *pci_address_space,
@@ -281,6 +282,8 @@ PCIBus *i440fx_init(const char *pci_type,
 s->io_memory = address_space_io;
 s->pci_address_space = pci_address_space;
 s->ram_memory = ram_memory;
+s->below_4g_mem_size = below_4g_mem_size;
+s->above_4g_mem_size = above_4g_mem_size;
 
 b = pci_root_bus_new(dev, NULL, s->pci_address_space,
  s->io_memory, 0, TYPE_PCI_BUS);
@@ -291,7 +294,7 @@ PCIBus *i440fx_init(const char *pci_type,
 d = pci_create_simple(b, 0, pci_type);
 f = I440FX_PCI_DEVICE(d);
 
-range_set_bounds(&s->pci_hole, below_4g_mem_size,
+range_set_bounds(&s->pci_hole, s->below_4g_mem_size,
  IO_APIC_DEFAULT_ADDRESS - 1);
 
 /* setup pci memory mapping */
@@ -322,6 +325,7 @@ PCIBus *i440fx_init(const char *pci_type,
  PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE);
 }
 
+ram_addr_t ram_size = s->below_4g_mem_size + s->above_4g_mem_size;
 ram_size = ram_size / 8 / 1024 / 1024;
 if (ram_size > 255) {
 ram_size = 255;
@@ -381,6 +385,10 @@ static Property i440fx_props[] = {
 DEFINE_PROP_SIZE(PCI_HOST_PROP_PCI_HOLE64_SIZE, I440FXState,
  pci_hole64_size, I440FX_PCI_HOST_HOLE64_SIZE_DEFAULT),
 DEFINE_PROP_UINT32("short_root_bus", I440FXState, short_root_bus, 0),
+DEFINE_PROP_SIZE(PCI_HOST_BELOW_4G_MEM_SIZE, I440FXState,
+ below_4g_mem_size, 0),
+DEFINE_PROP_SIZE(PCI_HOST_ABOVE_4G_MEM_SIZE, I440FXState,
+ above_4g_mem_size, 0),
 DEFINE_PROP_BOOL("x-pci-hole64-fix", I440FXState, pci_hole64_fix, true),
 DEFINE_PROP_END_OF_LIST(),
 };
-- 
2.41.0

[PATCH v2 03/16] hw/pci-host/q35: Initialize PCMachineState::bus in board code

2023-06-28 Thread Bernhard Beschow

The Q35 PCI host currently sets the PC machine's PCI bus attribute
through global state, thereby assuming the machine to be a PC machine.
The Q35 machine code already holds on to Q35's pci bus attribute, so can
easily set its own property while preserving encapsulation.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Igor Mammedov 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/i386/pc_q35.c  | 4 +++-
 hw/pci-host/q35.c | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index d9f3764184..4edc0b35f4 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -230,10 +230,12 @@ static void pc_q35_init(MachineState *machine)
 x86ms->below_4g_mem_size, NULL);
 object_property_set_int(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
 x86ms->above_4g_mem_size, NULL);
+sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
 
 /* pci */
-sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
 host_bus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pcie.0"));
+pcms->bus = host_bus;
+
 /* create ISA bus */
 lpc = pci_new_multifunction(PCI_DEVFN(ICH9_LPC_DEV, ICH9_LPC_FUNC), true,
 TYPE_ICH9_LPC_DEVICE);
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 84137b9ad9..0604464074 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -66,7 +66,6 @@ static void q35_host_realize(DeviceState *dev, Error **errp)
 s->mch.pci_address_space,
 s->mch.address_space_io,
 0, TYPE_PCIE_BUS);
-PC_MACHINE(qdev_get_machine())->bus = pci->bus;
 pci->bypass_iommu =
 PC_MACHINE(qdev_get_machine())->default_bus_bypass_iommu;
 qdev_realize(DEVICE(&s->mch), BUS(pci->bus), &error_fatal);
-- 
2.41.0

[PATCH v2 01/16] hw/i386/pc_q35: Resolve redundant q35_host variable

2023-06-28 Thread Bernhard Beschow

The variable is redundant to "phb" and is never used by its real type.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Thomas Huth 
---
 hw/i386/pc_q35.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 11a7084ea1..d9f3764184 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -120,8 +120,7 @@ static void pc_q35_init(MachineState *machine)
 PCMachineState *pcms = PC_MACHINE(machine);
 PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
 X86MachineState *x86ms = X86_MACHINE(machine);
-Q35PCIHost *q35_host;
-PCIHostState *phb;
+Object *phb;
 PCIBus *host_bus;
 PCIDevice *lpc;
 DeviceState *lpc_dev;
@@ -207,10 +206,10 @@ static void pc_q35_init(MachineState *machine)
 }
 
 /* create pci host bus */
-q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE));
+phb = OBJECT(qdev_new(TYPE_Q35_HOST_DEVICE));
 
 if (pcmc->pci_enabled) {
-pci_hole64_size = object_property_get_uint(OBJECT(q35_host),
+pci_hole64_size = object_property_get_uint(phb,

PCI_HOST_PROP_PCI_HOLE64_SIZE,
&error_abort);
 }
@@ -218,23 +217,23 @@ static void pc_q35_init(MachineState *machine)
 /* allocate ram and load rom/bios */
 pc_memory_init(pcms, system_memory, rom_memory, pci_hole64_size);
 
-object_property_add_child(OBJECT(machine), "q35", OBJECT(q35_host));
-object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM,
+object_property_add_child(OBJECT(machine), "q35", phb);
+object_property_set_link(phb, MCH_HOST_PROP_RAM_MEM,
  OBJECT(machine->ram), NULL);
-object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_PCI_MEM,
+object_property_set_link(phb, MCH_HOST_PROP_PCI_MEM,
  OBJECT(pci_memory), NULL);
-object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_SYSTEM_MEM,
+object_property_set_link(phb, MCH_HOST_PROP_SYSTEM_MEM,
  OBJECT(system_memory), NULL);
-object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_IO_MEM,
+object_property_set_link(phb, MCH_HOST_PROP_IO_MEM,
  OBJECT(system_io), NULL);
-object_property_set_int(OBJECT(q35_host), PCI_HOST_BELOW_4G_MEM_SIZE,
+object_property_set_int(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
 x86ms->below_4g_mem_size, NULL);
-object_property_set_int(OBJECT(q35_host), PCI_HOST_ABOVE_4G_MEM_SIZE,
+object_property_set_int(phb, PCI_HOST_ABOVE_4G_MEM_SIZE,
 x86ms->above_4g_mem_size, NULL);
+
 /* pci */
-sysbus_realize_and_unref(SYS_BUS_DEVICE(q35_host), &error_fatal);
-phb = PCI_HOST_BRIDGE(q35_host);
-host_bus = phb->bus;
+sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal);
+host_bus = PCI_BUS(qdev_get_child_bus(DEVICE(phb), "pcie.0"));
 /* create ISA bus */
 lpc = pci_new_multifunction(PCI_DEVFN(ICH9_LPC_DEV, ICH9_LPC_FUNC), true,
 TYPE_ICH9_LPC_DEVICE);
-- 
2.41.0

[PATCH v2 06/16] hw/pci-host/q35: Make some property name macros reusable by i440fx

2023-06-28 Thread Bernhard Beschow

Signed-off-by: Bernhard Beschow 
---
 include/hw/i386/pc.h  | 4 
 include/hw/pci-host/q35.h | 5 -
 hw/i386/pc_q35.c  | 8 
 hw/pci-host/q35.c | 8 
 4 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 6eec0fc51d..c34c698cdd 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -146,6 +146,10 @@ void pc_acpi_smi_interrupt(void *opaque, int irq, int 
level);
 
 void pc_guest_info_init(PCMachineState *pcms);
 
+#define PCI_HOST_PROP_RAM_MEM  "ram-mem"
+#define PCI_HOST_PROP_PCI_MEM  "pci-mem"
+#define PCI_HOST_PROP_SYSTEM_MEM   "system-mem"
+#define PCI_HOST_PROP_IO_MEM   "io-mem"
 #define PCI_HOST_PROP_PCI_HOLE_START   "pci-hole-start"
 #define PCI_HOST_PROP_PCI_HOLE_END "pci-hole-end"
 #define PCI_HOST_PROP_PCI_HOLE64_START "pci-hole64-start"
diff --git a/include/hw/pci-host/q35.h b/include/hw/pci-host/q35.h
index e89329c51e..1d98bbfe0d 100644
--- a/include/hw/pci-host/q35.h
+++ b/include/hw/pci-host/q35.h
@@ -74,11 +74,6 @@ struct Q35PCIHost {
  * gmch part
  */
 
-#define MCH_HOST_PROP_RAM_MEM "ram-mem"
-#define MCH_HOST_PROP_PCI_MEM "pci-mem"
-#define MCH_HOST_PROP_SYSTEM_MEM "system-mem"
-#define MCH_HOST_PROP_IO_MEM "io-mem"
-
 /* PCI configuration */
 #define MCH_HOST_BRIDGE"MCH"
 
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 852250e8cb..02dd274276 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -218,13 +218,13 @@ static void pc_q35_init(MachineState *machine)
 pc_memory_init(pcms, system_memory, rom_memory, pci_hole64_size);
 
 object_property_add_child(OBJECT(machine), "q35", phb);
-object_property_set_link(phb, MCH_HOST_PROP_RAM_MEM,
+object_property_set_link(phb, PCI_HOST_PROP_RAM_MEM,
  OBJECT(machine->ram), NULL);
-object_property_set_link(phb, MCH_HOST_PROP_PCI_MEM,
+object_property_set_link(phb, PCI_HOST_PROP_PCI_MEM,
  OBJECT(pci_memory), NULL);
-object_property_set_link(phb, MCH_HOST_PROP_SYSTEM_MEM,
+object_property_set_link(phb, PCI_HOST_PROP_SYSTEM_MEM,
  OBJECT(system_memory), NULL);
-object_property_set_link(phb, MCH_HOST_PROP_IO_MEM,
+object_property_set_link(phb, PCI_HOST_PROP_IO_MEM,
  OBJECT(system_io), NULL);
 object_property_set_int(phb, PCI_HOST_BELOW_4G_MEM_SIZE,
 x86ms->below_4g_mem_size, NULL);
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index d2830cee34..91c46df9ae 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -240,19 +240,19 @@ static void q35_host_initfn(Object *obj)
 object_property_add_uint64_ptr(obj, PCIE_HOST_MCFG_SIZE,
&pehb->size, OBJ_PROP_FLAG_READ);
 
-object_property_add_link(obj, MCH_HOST_PROP_RAM_MEM, TYPE_MEMORY_REGION,
+object_property_add_link(obj, PCI_HOST_PROP_RAM_MEM, TYPE_MEMORY_REGION,
  (Object **) &s->mch.ram_memory,
  qdev_prop_allow_set_link_before_realize, 0);
 
-object_property_add_link(obj, MCH_HOST_PROP_PCI_MEM, TYPE_MEMORY_REGION,
+object_property_add_link(obj, PCI_HOST_PROP_PCI_MEM, TYPE_MEMORY_REGION,
  (Object **) &s->mch.pci_address_space,
  qdev_prop_allow_set_link_before_realize, 0);
 
-object_property_add_link(obj, MCH_HOST_PROP_SYSTEM_MEM, TYPE_MEMORY_REGION,
+object_property_add_link(obj, PCI_HOST_PROP_SYSTEM_MEM, TYPE_MEMORY_REGION,
  (Object **) &s->mch.system_memory,
  qdev_prop_allow_set_link_before_realize, 0);
 
-object_property_add_link(obj, MCH_HOST_PROP_IO_MEM, TYPE_MEMORY_REGION,
+object_property_add_link(obj, PCI_HOST_PROP_IO_MEM, TYPE_MEMORY_REGION,
  (Object **) &s->mch.address_space_io,
  qdev_prop_allow_set_link_before_realize, 0);
 }
-- 
2.41.0

[PATCH] ui/gtk: Make sure the right EGL context is currently bound

2023-06-28 Thread Dongwon Kim

Observed a wrong context is bound when changing the scanout mode.
To prevent problem, it is needed to make sure to bind the right
context in gtk_egl_set_scanout_mode/gtk_gl_area_set_scanout_mode
as well as unbind one in the end of gd_egl_update/gd_gl_area_update.

Cc: Gerd Hoffmann 
Cc: Marc-André Lureau 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 ui/gtk-egl.c | 4 
 ui/gtk-gl-area.c | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
index 19130041bc..79f9f334f2 100644
--- a/ui/gtk-egl.c
+++ b/ui/gtk-egl.c
@@ -31,6 +31,8 @@ static void gtk_egl_set_scanout_mode(VirtualConsole *vc, bool 
scanout)
 
 vc->gfx.scanout_mode = scanout;
 if (!vc->gfx.scanout_mode) {
+eglMakeCurrent(qemu_egl_display, vc->gfx.esurface,
+   vc->gfx.esurface, vc->gfx.ectx);
 egl_fb_destroy(&vc->gfx.guest_fb);
 if (vc->gfx.surface) {
 surface_gl_destroy_texture(vc->gfx.gls, vc->gfx.ds);
@@ -134,6 +136,8 @@ void gd_egl_update(DisplayChangeListener *dcl,
vc->gfx.esurface, vc->gfx.ectx);
 surface_gl_update_texture(vc->gfx.gls, vc->gfx.ds, x, y, w, h);
 vc->gfx.glupdates++;
+eglMakeCurrent(qemu_egl_display, EGL_NO_SURFACE,
+   EGL_NO_SURFACE, EGL_NO_CONTEXT);
 }
 
 void gd_egl_refresh(DisplayChangeListener *dcl)
diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c
index c384a1516b..285e661071 100644
--- a/ui/gtk-gl-area.c
+++ b/ui/gtk-gl-area.c
@@ -26,6 +26,7 @@ static void gtk_gl_area_set_scanout_mode(VirtualConsole *vc, 
bool scanout)
 
 vc->gfx.scanout_mode = scanout;
 if (!vc->gfx.scanout_mode) {
+gtk_gl_area_make_current(GTK_GL_AREA(vc->gfx.drawing_area));
 egl_fb_destroy(&vc->gfx.guest_fb);
 if (vc->gfx.surface) {
 surface_gl_destroy_texture(vc->gfx.gls, vc->gfx.ds);
@@ -115,6 +116,7 @@ void gd_gl_area_update(DisplayChangeListener *dcl,
 gtk_gl_area_make_current(GTK_GL_AREA(vc->gfx.drawing_area));
 surface_gl_update_texture(vc->gfx.gls, vc->gfx.ds, x, y, w, h);
 vc->gfx.glupdates++;
+gdk_gl_context_clear_current();
 }
 
 void gd_gl_area_refresh(DisplayChangeListener *dcl)
-- 
2.34.1

[PATCH] ui/gtk: skip refresh if new dmabuf is submitted

2023-06-28 Thread Dongwon Kim

Skip refresh if a new dmabuf (guest scanout frame) is submitted
and ready to be drawn because the scanout will be refreshed with
new frame anyway.

Also, setting scanout mode is better to be done right before
a draw event is scheduled because the mode can be reset anytime
after it is set in dpy_gl_scanout_texture by any asynchronouse
dpy_refresh call, which eventually cancels the drawing of the
guest scanout texture.

Cc: Gerd Hoffmann 
Cc: Marc-André Lureau 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 ui/gtk-egl.c | 6 +-
 ui/gtk-gl-area.c | 6 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
index 19130041bc..62ff6812b6 100644
--- a/ui/gtk-egl.c
+++ b/ui/gtk-egl.c
@@ -143,6 +143,10 @@ void gd_egl_refresh(DisplayChangeListener *dcl)
 gd_update_monitor_refresh_rate(
 vc, vc->window ? vc->window : vc->gfx.drawing_area);
 
+if (vc->gfx.guest_fb.dmabuf && vc->gfx.guest_fb.dmabuf->draw_submitted) {
+return;
+}
+
 if (!vc->gfx.esurface) {
 gd_egl_init(vc);
 if (!vc->gfx.esurface) {
@@ -236,7 +240,6 @@ void gd_egl_scanout_texture(DisplayChangeListener *dcl,
 eglMakeCurrent(qemu_egl_display, vc->gfx.esurface,
vc->gfx.esurface, vc->gfx.ectx);
 
-gtk_egl_set_scanout_mode(vc, true);
 egl_fb_setup_for_tex(&vc->gfx.guest_fb, backing_width, backing_height,
  backing_id, false);
 }
@@ -344,6 +347,7 @@ void gd_egl_flush(DisplayChangeListener *dcl,
 if (vc->gfx.guest_fb.dmabuf && !vc->gfx.guest_fb.dmabuf->draw_submitted) {
 graphic_hw_gl_block(vc->gfx.dcl.con, true);
 vc->gfx.guest_fb.dmabuf->draw_submitted = true;
+gtk_egl_set_scanout_mode(vc, true);
 gtk_widget_queue_draw_area(area, x, y, w, h);
 return;
 }
diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c
index c384a1516b..68eff3435b 100644
--- a/ui/gtk-gl-area.c
+++ b/ui/gtk-gl-area.c
@@ -123,6 +123,10 @@ void gd_gl_area_refresh(DisplayChangeListener *dcl)
 
 gd_update_monitor_refresh_rate(vc, vc->window ? vc->window : 
vc->gfx.drawing_area);
 
+if (vc->gfx.guest_fb.dmabuf && vc->gfx.guest_fb.dmabuf->draw_submitted) {
+return;
+}
+
 if (!vc->gfx.gls) {
 if (!gtk_widget_get_realized(vc->gfx.drawing_area)) {
 return;
@@ -261,7 +265,6 @@ void gd_gl_area_scanout_texture(DisplayChangeListener *dcl,
 return;
 }
 
-gtk_gl_area_set_scanout_mode(vc, true);
 egl_fb_setup_for_tex(&vc->gfx.guest_fb, backing_width, backing_height,
  backing_id, false);
 }
@@ -281,6 +284,7 @@ void gd_gl_area_scanout_flush(DisplayChangeListener *dcl,
 if (vc->gfx.guest_fb.dmabuf && !vc->gfx.guest_fb.dmabuf->draw_submitted) {
 graphic_hw_gl_block(vc->gfx.dcl.con, true);
 vc->gfx.guest_fb.dmabuf->draw_submitted = true;
+gtk_gl_area_set_scanout_mode(vc, true);
 }
 gtk_gl_area_queue_render(GTK_GL_AREA(vc->gfx.drawing_area));
 }
-- 
2.34.1

Re: [PATCH] Add support for RAPL MSRs in KVM/Qemu

2023-06-28 Thread Anthony Harivel

Marcelo Tosatti, Jun 28, 2023 at 19:26:

Hi Marcelo,

> On Fri, Jun 16, 2023 at 04:08:30PM +0200, Anthony Harivel wrote:
> This feature is activated with -accel kvm,rapl=true.
>
>> I suppose this should be a CPU flag instead? -cpu xxx,rapl=on.

It's possible yes then I might need to make sure that '-accel=kvm' is
selected because this rapl feature depends on the KVM msr filtering
mecanism. 

Like '-cpu host' is checking that KVM or HVF is selected.

Thanks
Anthony

[PATCH] MAINTAINERS: Promote Cédric to VFIO co-maintainer

2023-06-28 Thread Alex Williamson

Cédric has stepped up involvement in vfio, reviewing and managing
patches, as well as pull requests.  This work deserves gratitude and
punishment with a promotion to co-maintainer ;)

Signed-off-by: Alex Williamson 
---

Cédric, I'd also support if you wanted to add a tree entry here.

 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index e07746ac7d45..37e48c72b16a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2051,7 +2051,7 @@ F: hw/usb/dev-serial.c
 
 VFIO
 M: Alex Williamson 
-R: Cédric Le Goater 
+M: Cédric Le Goater 
 S: Supported
 F: hw/vfio/*
 F: include/hw/vfio/
-- 
2.40.1

Re: [PATCH] Add support for RAPL MSRs in KVM/Qemu

2023-06-28 Thread Marcelo Tosatti

On Fri, Jun 16, 2023 at 04:08:30PM +0200, Anthony Harivel wrote:
> Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> interface (Running Average Power Limit) for advertising the accumulated
> energy consumption of various power domains (e.g. CPU packages, DRAM,
> etc.).
> 
> The consumption is reported via MSRs (model specific registers) like
> MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> 64 bits registers that represent the accumulated energy consumption in
> micro Joules. They are updated by microcode every ~1ms.
> 
> For now, KVM always returns 0 when the guest requests the value of
> these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle
> these MSRs dynamically in userspace.
> 
> To limit the amount of system calls for every MSR call, create a new
> thread in QEMU that updates the "virtual" MSR values asynchronously.
> 
> Each vCPU has its own vMSR to reflect the independence of vCPUs. The
> thread updates the vMSR values with the ratio of energy consumed of
> the whole physical CPU package the vCPU thread runs on and the
> thread's utime and stime values.
> 
> All other non-vCPU threads are also taken into account. Their energy
> consumption is evenly distributed among all vCPUs threads running on
> the same physical CPU package.
> 
> This feature is activated with -accel kvm,rapl=true.

I suppose this should be a CPU flag instead? -cpu xxx,rapl=on.

Re: [PATCH V3 2/2] migration: file URI offset

2023-06-28 Thread Peter Xu

On Thu, Jun 22, 2023 at 01:37:31PM -0700, Steve Sistare wrote:
> Allow an offset option to be specified as part of the file URI, in
> the form "file:filename,offset=offset", where offset accepts the common
> size suffixes, or the 0x prefix, but not both.  Migration data is written
> to and read from the file starting at offset.  If unspecified, it defaults
> to 0.
> 
> This is needed by libvirt to store its own data at the head of the file.
> 
> Suggested-by: Daniel P. Berrange 
> Signed-off-by: Steve Sistare 

Ideally I think we should make "," the separator from the start, but I
suppose no one will try to apply previous patch only, and then start using
a file name contains ",offset=".. so maybe not that a huge deal.

Reviewed-by: Peter Xu 

-- 
Peter Xu

Re: [PATCH V3 1/2] migration: file URI

2023-06-28 Thread Peter Xu

On Wed, Jun 28, 2023 at 01:07:24PM -0400, Peter Xu wrote:
> On Thu, Jun 22, 2023 at 01:37:30PM -0700, Steve Sistare wrote:
> > Extend the migration URI to support file:.  This can be used for
> > any migration scenario that does not require a reverse path.  It can be
> > used as an alternative to 'exec:cat > file' in minimized containers that
> > do not contain /bin/sh, and it is easier to use than the fd: URI.
> > It can be used in HMP commands, and as a qemu command-line parameter.
> > 
> > For best performance, guest ram should be shared and x-ignore-shared
> > should be true, so guest pages are not written to the file, in which case
> > the guest may remain running.  If ram is not so configured, then the user
> > is advised to stop the guest first.  Otherwise, a busy guest may re-dirty
> > the same page, causing it to be appended to the file multiple times,
> > and the file may grow unboundedly.  That issue is being addressed in the
> > "fixed-ram" patch series.
> > 
> > Signed-off-by: Steve Sistare 
> > Reviewed-by: Fabiano Rosas 
> > ---
> >  migration/file.c   | 62 
> > ++
> >  migration/file.h   | 14 
> >  migration/meson.build  |  1 +
> >  migration/migration.c  |  5 
> >  migration/trace-events |  4 
> >  qemu-options.hx|  6 -
> >  6 files changed, 91 insertions(+), 1 deletion(-)
> >  create mode 100644 migration/file.c
> >  create mode 100644 migration/file.h
> > 
> > diff --git a/migration/file.c b/migration/file.c
> > new file mode 100644
> > index 000..8e35827
> > --- /dev/null
> > +++ b/migration/file.c
> > @@ -0,0 +1,62 @@
> > +/*
> > + * Copyright (c) 2021-2023 Oracle and/or its affiliates.
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "channel.h"
> > +#include "file.h"
> > +#include "migration.h"
> > +#include "io/channel-file.h"
> > +#include "io/channel-util.h"
> > +#include "trace.h"
> > +
> > +void file_start_outgoing_migration(MigrationState *s, const char *filename,
> > +   Error **errp)
> > +{
> > +g_autoptr(QIOChannelFile) fioc = NULL;
> > +QIOChannel *ioc;
> > +
> > +trace_migration_file_outgoing(filename);
> > +
> > +fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | 
> > O_TRUNC,
> > + 0600, errp);
> > +if (!fioc) {
> > +return;
> > +}
> > +
> > +ioc = QIO_CHANNEL(fioc);
> > +qio_channel_set_name(ioc, "migration-file-outgoing");
> > +migration_channel_connect(s, ioc, NULL, NULL);
> 
> Miss one object_unref(ioc)?

Never mind, I overlooked the g_autoptr.. all fine:

Reviewed-by: Peter Xu 

-- 
Peter Xu

Re: [PATCH v3 0/6] net/tap: Fix QEMU frozen issue when the maximum number of file descriptors is very large

2023-06-28 Thread Michael Tokarev


17.06.2023 08:36, Bin Meng wrote:


Current codes using a brute-force traversal of all file descriptors
do not scale on a system where the maximum number of file descriptors
is set to a very large value (e.g.: in a Docker container of Manjaro
distribution it is set to 1073741816). QEMU just looks frozen during
start-up.


What's the reason to close all these file descriptors in the first place?

No other software I know does this.

For some situations, such closing is actually bad, -- think, eg,

  flock lockfile qemu-system-foo ...  --

this one opens a file, locks it using fcntl/flock, and executes the
command, keeping the file descriptor open across exec, so the file
stays locked until the process terminates. This works and works well.
Qemu with its let's-close-everything approach breaks this.

Why? :)

/mjt

Re: [RFC 7/7] hw/mem/cxl_type3: add read/write support to dynamic capacity

2023-06-28 Thread ni...@outlook.com

The 05/15/2023 16:22, Jonathan Cameron wrote:
> On Thu, 11 May 2023 17:56:40 +
> Fan Ni  wrote:
> 
> > From: Fan Ni 
> > 
> > Before the change, read from or write to dynamic capacity of the memory
> > device is not support as 1) no host backed file/memory is provided for
> > it; 2) no address space is created for the dynamic capacity.
> 
> Ah nice. I should have read ahead.  Probably makes sense to reorder things
> so that when we present DCD region it will work.

We can back dynamic capacity with host memory/file and create address
space for dc regions, but until extents can be added we should not expect
any read/write can happen to the dynamic capacity, right?

Fan
> 
> > 
> > With the change, add code to support following:
> > 1. add a new property to type3 device "dc-memdev" to point to host
> >memory backend for dynamic capacity;
> > 2. add a bitmap for each region to track whether a block is host backed,
> > which will be used for address check when read/write dynamic capacity;
> > 3. add namespace for dynamic capacity for read/write support;
> > 4. create cdat entries for each dynamic capacity region;
> > 
> > Signed-off-by: Fan Ni 
> > ---
> >  hw/cxl/cxl-mailbox-utils.c  |  21 ++-
> >  hw/mem/cxl_type3.c  | 336 +---
> >  include/hw/cxl/cxl_device.h |   8 +-
> >  3 files changed, 298 insertions(+), 67 deletions(-)
> > 
> > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> > index 7212934627..efe61e67fb 100644
> > --- a/hw/cxl/cxl-mailbox-utils.c
> > +++ b/hw/cxl/cxl-mailbox-utils.c
> > @@ -391,9 +391,11 @@ static CXLRetCode cmd_firmware_update_get_info(struct 
> > cxl_cmd *cmd,
> >  char fw_rev4[0x10];
> >  } QEMU_PACKED *fw_info;
> >  QEMU_BUILD_BUG_ON(sizeof(*fw_info) != 0x50);
> > +   CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> >  
> >  if ((cxl_dstate->vmem_size < CXL_CAPACITY_MULTIPLIER) ||
> > -(cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER)) {
> > +   (cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER) ||
> 
> Keep old alignment
> 
> > +   (ct3d->dc.total_dynamic_capicity < CXL_CAPACITY_MULTIPLIER)) {
> 
> We should think about the separation between what goes in cxl_dstate and 
> directly
> in ct3d. That boundary has been blurring for a while and getting some review
> comments.
> 
> >  return CXL_MBOX_INTERNAL_ERROR;
> >  }
> >  
> > @@ -534,7 +536,9 @@ static CXLRetCode cmd_identify_memory_device(struct 
> > cxl_cmd *cmd,
> >  CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d);
> >  
> >  if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) 
> > ||
> > -(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, 
> > CXL_CAPACITY_MULTIPLIER))) {
> > +   (!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, 
> > CXL_CAPACITY_MULTIPLIER)) ||
> > +   (!QEMU_IS_ALIGNED(ct3d->dc.total_dynamic_capicity,
> > +   CXL_CAPACITY_MULTIPLIER))) {
> >  return CXL_MBOX_INTERNAL_ERROR;
> >  }
> >  
> > @@ -543,7 +547,8 @@ static CXLRetCode cmd_identify_memory_device(struct 
> > cxl_cmd *cmd,
> >  
> >  snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0);
> >  
> > -stq_le_p(&id->total_capacity, cxl_dstate->mem_size / 
> > CXL_CAPACITY_MULTIPLIER);
> > +   stq_le_p(&id->total_capacity,
> > +   cxl_dstate->static_mem_size / CXL_CAPACITY_MULTIPLIER);
> 
> Pull the rename out as a precursor patch.
> 
> >  stq_le_p(&id->persistent_capacity, cxl_dstate->pmem_size / 
> > CXL_CAPACITY_MULTIPLIER);
> >  stq_le_p(&id->volatile_capacity, cxl_dstate->vmem_size / 
> > CXL_CAPACITY_MULTIPLIER);
> >  stl_le_p(&id->lsa_size, cvc->get_lsa_size(ct3d));
> > @@ -568,9 +573,12 @@ static CXLRetCode cmd_ccls_get_partition_info(struct 
> > cxl_cmd *cmd,
> >  uint64_t next_pmem;
> >  } QEMU_PACKED *part_info = (void *)cmd->payload;
> >  QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20);
> > +   CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate);
> >  
> >  if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) 
> > ||
> > -(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, 
> > CXL_CAPACITY_MULTIPLIER))) {
> > +   (!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, 
> > CXL_CAPACITY_MULTIPLIER)) ||
> > +   (!QEMU_IS_ALIGNED(ct3d->dc.total_dynamic_capicity,
> > +   CXL_CAPACITY_MULTIPLIER))) {
> >  return CXL_MBOX_INTERNAL_ERROR;
> >  }
> >  
> > @@ -881,9 +889,8 @@ static CXLRetCode cmd_media_clear_poison(struct cxl_cmd 
> > *cmd,
> >  struct clear_poison_pl *in = (void *)cmd->payload;
> >  
> >  dpa = ldq_le_p(&in->dpa);
> > -if (dpa + 64 > cxl_dstate->mem_size) {
> > -return CXL_MBOX_INVALID_PA;
> > -}
> > +   if (dpa + 64 > cxl_dstate->static_mem_size && ct3d->dc.num_regions == 0)
> 
> This test will need expanding to include DPAs in D

Re: [PATCH V3 1/2] migration: file URI

2023-06-28 Thread Peter Xu

On Thu, Jun 22, 2023 at 01:37:30PM -0700, Steve Sistare wrote:
> Extend the migration URI to support file:.  This can be used for
> any migration scenario that does not require a reverse path.  It can be
> used as an alternative to 'exec:cat > file' in minimized containers that
> do not contain /bin/sh, and it is easier to use than the fd: URI.
> It can be used in HMP commands, and as a qemu command-line parameter.
> 
> For best performance, guest ram should be shared and x-ignore-shared
> should be true, so guest pages are not written to the file, in which case
> the guest may remain running.  If ram is not so configured, then the user
> is advised to stop the guest first.  Otherwise, a busy guest may re-dirty
> the same page, causing it to be appended to the file multiple times,
> and the file may grow unboundedly.  That issue is being addressed in the
> "fixed-ram" patch series.
> 
> Signed-off-by: Steve Sistare 
> Reviewed-by: Fabiano Rosas 
> ---
>  migration/file.c   | 62 
> ++
>  migration/file.h   | 14 
>  migration/meson.build  |  1 +
>  migration/migration.c  |  5 
>  migration/trace-events |  4 
>  qemu-options.hx|  6 -
>  6 files changed, 91 insertions(+), 1 deletion(-)
>  create mode 100644 migration/file.c
>  create mode 100644 migration/file.h
> 
> diff --git a/migration/file.c b/migration/file.c
> new file mode 100644
> index 000..8e35827
> --- /dev/null
> +++ b/migration/file.c
> @@ -0,0 +1,62 @@
> +/*
> + * Copyright (c) 2021-2023 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "channel.h"
> +#include "file.h"
> +#include "migration.h"
> +#include "io/channel-file.h"
> +#include "io/channel-util.h"
> +#include "trace.h"
> +
> +void file_start_outgoing_migration(MigrationState *s, const char *filename,
> +   Error **errp)
> +{
> +g_autoptr(QIOChannelFile) fioc = NULL;
> +QIOChannel *ioc;
> +
> +trace_migration_file_outgoing(filename);
> +
> +fioc = qio_channel_file_new_path(filename, O_CREAT | O_WRONLY | O_TRUNC,
> + 0600, errp);
> +if (!fioc) {
> +return;
> +}
> +
> +ioc = QIO_CHANNEL(fioc);
> +qio_channel_set_name(ioc, "migration-file-outgoing");
> +migration_channel_connect(s, ioc, NULL, NULL);

Miss one object_unref(ioc)?

> +}
> +
> +static gboolean file_accept_incoming_migration(QIOChannel *ioc,
> +   GIOCondition condition,
> +   gpointer opaque)
> +{
> +migration_channel_process_incoming(ioc);
> +object_unref(OBJECT(ioc));
> +return G_SOURCE_REMOVE;
> +}
> +
> +void file_start_incoming_migration(const char *filename, Error **errp)
> +{
> +QIOChannelFile *fioc = NULL;
> +QIOChannel *ioc;
> +
> +trace_migration_file_incoming(filename);
> +
> +fioc = qio_channel_file_new_path(filename, O_RDONLY, 0, errp);
> +if (!fioc) {
> +return;
> +}
> +
> +ioc = QIO_CHANNEL(fioc);
> +qio_channel_set_name(QIO_CHANNEL(ioc), "migration-file-incoming");
> +qio_channel_add_watch_full(ioc, G_IO_IN,
> +   file_accept_incoming_migration,
> +   NULL, NULL,
> +   g_main_context_get_thread_default());
> +}
> diff --git a/migration/file.h b/migration/file.h
> new file mode 100644
> index 000..841b94a
> --- /dev/null
> +++ b/migration/file.h
> @@ -0,0 +1,14 @@
> +/*
> + * Copyright (c) 2021-2023 Oracle and/or its affiliates.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef QEMU_MIGRATION_FILE_H
> +#define QEMU_MIGRATION_FILE_H
> +void file_start_incoming_migration(const char *filename, Error **errp);
> +
> +void file_start_outgoing_migration(MigrationState *s, const char *filename,
> +   Error **errp);
> +#endif
> diff --git a/migration/meson.build b/migration/meson.build
> index 8ba6e42..3af817e 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -16,6 +16,7 @@ softmmu_ss.add(files(
>'dirtyrate.c',
>'exec.c',
>'fd.c',
> +  'file.c',
>'global_state.c',
>'migration-hmp-cmds.c',
>'migration.c',
> diff --git a/migration/migration.c b/migration/migration.c
> index dc05c6f..cfbde86 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -20,6 +20,7 @@
>  #include "migration/blocker.h"
>  #include "exec.h"
>  #include "fd.h"
> +#include "file.h"
>  #include "socket.h"
>  #include "sysemu/runstate.h"
>  #include "sysemu/sysemu.h"
> @@ -442,6 +443,8 @@ static void qemu_start_incoming_migration(const char 
> *uri, Error **errp)
>

1 2 3 4 >

1 - 100 of 326 matches

Mail list logo