[PATCH v3 2/2] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Ricky Zhou
Many instructions which load/store 128-bit values are supposed to
raise #GP when the memory operand isn't 16-byte aligned. This includes:
 - Instructions explicitly requiring memory alignment (Exceptions Type 1
   in the "AVX and SSE Instruction Exception Specification" section of
   the SDM)
 - Legacy SSE instructions that load/store 128-bit values (Exceptions
   Types 2 and 4).

This change sets MO_ALIGN_16 on 128-bit memory accesses that require
16-byte alignment. It adds cpu_record_sigbus and cpu_do_unaligned_access
hooks that simulate a #GP exception in qemu-user and qemu-system,
respectively.
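
As an illustration of the translator side, the alignment requirement can be
attached to the existing 128-bit load helper; the following is only a sketch
against the current gen_ldo_env_A0(), not the literal hunk. Only the first of
the two 8-byte loads needs MO_ALIGN_16, since the second is at a known +8
offset:

    static inline void gen_ldo_env_A0(DisasContext *s, int offset, bool align)
    {
        int mem_index = s->mem_index;
        MemOp mop = MO_LEUQ | (align ? MO_ALIGN_16 : 0);

        /* Only the first 8-byte half carries the 16-byte alignment check. */
        tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, mop);
        tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(0)));
        /* The second half sits at A0 + 8 and needs no extra check. */
        tcg_gen_addi_tl(s->tmp0, s->A0, 8);
        tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
        tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1)));
    }

A misaligned access then reaches the new hooks and ends up in
handle_unaligned_access() below.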

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/217
Reviewed-by: Richard Henderson 
Signed-off-by: Ricky Zhou 
---
 target/i386/tcg/excp_helper.c| 13 
 target/i386/tcg/helper-tcg.h | 28 ++---
 target/i386/tcg/sysemu/excp_helper.c |  8 +
 target/i386/tcg/tcg-cpu.c|  2 ++
 target/i386/tcg/translate.c  | 45 +---
 target/i386/tcg/user/excp_helper.c   |  7 +
 6 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/target/i386/tcg/excp_helper.c b/target/i386/tcg/excp_helper.c
index c1ffa1c0ef..7c3c8dc7fe 100644
--- a/target/i386/tcg/excp_helper.c
+++ b/target/i386/tcg/excp_helper.c
@@ -140,3 +140,16 @@ G_NORETURN void raise_exception_ra(CPUX86State *env, int exception_index,
 {
 raise_interrupt2(env, exception_index, 0, 0, 0, retaddr);
 }
+
+G_NORETURN void handle_unaligned_access(CPUX86State *env, vaddr vaddr,
+MMUAccessType access_type,
+uintptr_t retaddr)
+{
+/*
+ * Unaligned accesses are currently only triggered by SSE/AVX
+ * instructions that impose alignment requirements on memory
+ * operands. These instructions raise #GP(0) upon accessing an
+ * unaligned address.
+ */
+raise_exception_ra(env, EXCP0D_GPF, retaddr);
+}
diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
index 34167e2e29..cd1723389a 100644
--- a/target/i386/tcg/helper-tcg.h
+++ b/target/i386/tcg/helper-tcg.h
@@ -42,17 +42,6 @@ void x86_cpu_do_interrupt(CPUState *cpu);
 bool x86_cpu_exec_interrupt(CPUState *cpu, int int_req);
 #endif
 
-/* helper.c */
-#ifdef CONFIG_USER_ONLY
-void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr,
-MMUAccessType access_type,
-bool maperr, uintptr_t ra);
-#else
-bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
-  MMUAccessType access_type, int mmu_idx,
-  bool probe, uintptr_t retaddr);
-#endif
-
 void breakpoint_handler(CPUState *cs);
 
 /* n must be a constant to be efficient */
@@ -78,6 +67,23 @@ G_NORETURN void raise_exception_err_ra(CPUX86State *env, int exception_index,
int error_code, uintptr_t retaddr);
 G_NORETURN void raise_interrupt(CPUX86State *nenv, int intno, int is_int,
 int error_code, int next_eip_addend);
+G_NORETURN void handle_unaligned_access(CPUX86State *env, vaddr vaddr,
+MMUAccessType access_type,
+uintptr_t retaddr);
+#ifdef CONFIG_USER_ONLY
+void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr,
+MMUAccessType access_type,
+bool maperr, uintptr_t ra);
+void x86_cpu_record_sigbus(CPUState *cs, vaddr addr,
+   MMUAccessType access_type, uintptr_t ra);
+#else
+bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
+  MMUAccessType access_type, int mmu_idx,
+  bool probe, uintptr_t retaddr);
+G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
+MMUAccessType access_type,
+int mmu_idx, uintptr_t retaddr);
+#endif
 
 /* cc_helper.c */
 extern const uint8_t parity_table[256];
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index 48feba7e75..796dc2a1f3 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -439,3 +439,11 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
 }
 return true;
 }
+
+G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
+MMUAccessType access_type,
+int mmu_idx, uintptr_t retaddr)
+{
+X86CPU *cpu = X86_CPU(cs);
+handle_unaligned_access(&cpu->env, vaddr, access_type, retaddr);
+}
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index 6fdfdf9598..d3c2b8fb49 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -75,10 +75,12 @@ static const struct TCGCPUOps x86_tcg_ops = {
 #ifdef CONFIG_USER_ONLY
 

[PATCH v3 1/2] target/i386: Read 8 bytes from cvttps2pi/cvtps2pi memory operands

2022-08-29 Thread Ricky Zhou
Before this change, emulation of cvttps2pi and cvtps2pi instructions
would read 16 bytes of memory instead of 8. The SDM states that
cvttps2pi takes a 64-bit memory location. The documentation for cvtps2pi
claims that it takes a 128-bit memory location, but as with cvttps2pi,
the operand is written as xmm/m64. I double-checked on real hardware
that both of these instructions only read 8 bytes.

Reviewed-by: Richard Henderson 
Signed-off-by: Ricky Zhou 
---
 target/i386/tcg/translate.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index b7972f0ff5..3ba5f76156 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3621,7 +3621,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
 op2_offset = offsetof(CPUX86State,xmm_t0);
-gen_ldo_env_A0(s, op2_offset);
+if (b1) {
+gen_ldo_env_A0(s, op2_offset);
+} else {
+gen_ldq_env_A0(s, op2_offset);
+}
 } else {
 rm = (modrm & 7) | REX_B(s);
 op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
-- 
2.37.2




[PATCH] hcd-ohci: Fix inconsistency when resetting ohci root hubs

2022-08-29 Thread Qiang Liu
I found an assertion failure in usb_cancel_packet() and posted my analysis in
https://gitlab.com/qemu-project/qemu/-/issues/1180. I think this issue is
caused by an inconsistency in how OHCI root hubs are reset.

There are two ways to reset OHCI root hubs: 1) through HcRhPortStatus, 2)
through HcControl. However, when the packet's status is USB_PACKET_ASYNC,
resetting through HcRhPortStatus completes the packet, and a subsequent reset
through HcControl then fails. In my opinion, resetting through HcRhPortStatus
should first detach the port and then invoke usb_device_reset(), just like a
reset through HcControl does. Therefore, I change usb_device_reset() to
usb_port_reset(), which invokes usb_detach() and usb_device_reset() in
sequence.
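
For reference, usb_port_reset() in hw/usb/core.c is roughly the following
(sketch, not part of this patch), so the HcRhPortStatus path now performs the
same detach-then-reset sequence as the HcControl path:

    /* hw/usb/core.c (roughly): detach the port, re-attach it, then reset
     * the device. */
    void usb_port_reset(USBPort *port)
    {
        USBDevice *dev = port->dev;

        assert(dev != NULL);
        usb_detach(port);
        usb_attach(port);
        usb_device_reset(dev);
    }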

Fixes: d28f4e2d8631 ("usb: kill USB_MSG_RESET")
Reported-by: Qiang Liu 
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1180
Signed-off-by: Qiang Liu 
---
 hw/usb/hcd-ohci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c
index 895b29fb86..72df917834 100644
--- a/hw/usb/hcd-ohci.c
+++ b/hw/usb/hcd-ohci.c
@@ -1426,7 +1426,7 @@ static void ohci_port_set_status(OHCIState *ohci, int portnum, uint32_t val)
 
 if (ohci_port_set_if_connected(ohci, portnum, val & OHCI_PORT_PRS)) {
 trace_usb_ohci_port_reset(portnum);
-usb_device_reset(port->port.dev);
+usb_port_reset(&port->port);
 port->ctrl &= ~OHCI_PORT_PRS;
 /* ??? Should this also set OHCI_PORT_PESC.  */
 port->ctrl |= OHCI_PORT_PES | OHCI_PORT_PRSC;
-- 
2.25.1




Re: [PATCH v2 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Richard Henderson

On 8/29/22 19:11, Ricky Zhou wrote:

Many instructions which load/store 128-bit values are supposed to
raise #GP when the memory operand isn't 16-byte aligned. This includes:
  - Instructions explicitly requiring memory alignment (Exceptions Type 1
in the "AVX and SSE Instruction Exception Specification" section of
the SDM)
  - Legacy SSE instructions that load/store 128-bit values (Exceptions
Types 2 and 4).

This change sets MO_ALIGN_16 on 128-bit memory accesses that require
16-byte alignment. It adds cpu_record_sigbus and cpu_do_unaligned_access
handlers that simulate a #GP exception in qemu-user and qemu-system,
respectively.





One minor behavior change apart from what is described above: Prior to
this change, emulation of cvttps2pi and cvtps2pi instructions would
incorrectly read 16 bytes of memory instead of 8. I double-checked on
real hardware that these instructions only read 8 bytes (and do not have
any address alignment requirements).


This should really be split out to a separate patch.



@@ -3621,7 +3629,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
  if (mod != 3) {
  gen_lea_modrm(env, s, modrm);
  op2_offset = offsetof(CPUX86State,xmm_t0);
-gen_ldo_env_A0(s, op2_offset);
+if ((b >> 8) & 1) {


Aka b1.

Otherwise,
Reviewed-by: Richard Henderson 


r~



[PATCH v2 0/1] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Ricky Zhou
Thanks Richard for the detailed comments/code pointers! I've switched to
using MO_ALIGN_16 and implemented record_sigbus and do_unaligned_access
hooks to simulate #GP(0) as suggested. Given what was said about the low
likelihood of implementing #AC anytime soon, I have hardcoded #GP(0) in
these hooks for now rather than plumbing through an extra bit in MemOp.
Let me know if that seems reasonable, thanks!

Ricky Zhou (1):
  target/i386: Raise #GP on unaligned m128 accesses when required.

 target/i386/tcg/excp_helper.c| 13 
 target/i386/tcg/helper-tcg.h | 28 +---
 target/i386/tcg/sysemu/excp_helper.c |  8 +
 target/i386/tcg/tcg-cpu.c|  1 +
 target/i386/tcg/translate.c  | 49 ++--
 target/i386/tcg/user/excp_helper.c   |  7 
 6 files changed, 77 insertions(+), 29 deletions(-)

-- 
2.37.2




Re: [PATCH for-7.2 v4 15/21] qmp/hmp, device_tree.c: introduce 'info fdt' command

2022-08-29 Thread David Gibson
On Mon, Aug 29, 2022 at 07:00:55PM -0300, Daniel Henrique Barboza wrote:
> 
> 
> On 8/29/22 00:34, David Gibson wrote:
> > On Fri, Aug 26, 2022 at 11:11:44AM -0300, Daniel Henrique Barboza wrote:
> > > Reading the FDT requires that the user saves the fdt_blob and then use
> > > 'dtc' to read the contents. Saving the file and using 'dtc' is a strong
> > > use case when we need to compare two FDTs, but it's a lot of steps if
> > > you want to do quick check on a certain node or property.
> > > 
> > > 'info fdt' retrieves FDT nodes (and properties, later on) and print it
> > > to the user. This can be used to check the FDT on running machines
> > > without having to save the blob and use 'dtc'.
> > > 
> > > The implementation is based on the premise that the machine thas a FDT
> > > created using libfdt and pointed by 'machine->fdt'. As long as this
> > > pre-requisite is met the machine should be able to support it.
> > > 
> > > For now we're going to add the required QMP/HMP boilerplate and the
> > > capability of printing the name of the properties of a given node. Next
> > > patches will extend 'info fdt' to be able to print nodes recursively,
> > > and then individual properties.
> > > 
> > > This command will always be executed in-band (i.e. holding BQL),
> > > avoiding potential race conditions with machines that might change the
> > > FDT during runtime (e.g. PowerPC 'pseries' machine).
> > > 
> > > 'info fdt' is not something that we expect to be used aside from 
> > > debugging,
> > > so we're implementing it in QMP as 'x-query-fdt'.
> > > 
> > > This is an example of 'info fdt' fetching the '/chosen' node of the
> > > pSeries machine:
> > > 
> > > (qemu) info fdt /chosen
> > > chosen {
> > >  ibm,architecture-vec-5;
> > >  rng-seed;
> > >  ibm,arch-vec-5-platform-support;
> > >  linux,pci-probe-only;
> > >  stdout-path;
> > >  linux,stdout-path;
> > >  qemu,graphic-depth;
> > >  qemu,graphic-height;
> > >  qemu,graphic-width;
> > > };
> > > 
> > > And the same node for the aarch64 'virt' machine:
> > > 
> > > (qemu) info fdt /chosen
> > > chosen {
> > >  stdout-path;
> > >  rng-seed;
> > >  kaslr-seed;
> > > };
> > 
> > So, I'm reasonably convinced allowing dumping the whole dtb from
> > qmp/hmp is useful.  I'm less convined that info fdt is worth the
> > additional complexity it incurs.  Note that as well as being able to
> > decompile a whole dtb using dtc, you can also extract and list
> > specific properties from a dtb blob using the 'fdtget' tool which is
> > part of the dtc tree.
> 
> What's your opinion on patch 21/21, where 'dumpdtb' can write a formatted
> FDT in a file with an extra option? That was possible because of the
> format helpers introduced for 'info fdt'. The idea is that since we're
> able to format a FDT in DTS format, we can also write the FDT in text
> format without relying on DTC to decode it.

Since it's mostly the same code, I think it's reasonable to throw in
if the info fdt stuff is there, but I don't think it's worth including
without that.  As a whole, I remain dubious that (info fdt + dumpdts)
is worth the complexity cost.

People with more practical experience debugging the embedded ARM
platforms might have a different opinion if they think info fdt would
be really useful, though.

> If we think that this 'dumpdtb' capability is worth having, I can respin
> the patches without 'info fdt' but adding these helpers to enable this
> 'dumpdtb' support. If not, then we can just remove patches 15-21 and
> be done with it.
> 
> 
> Thanks,
> 
> 
> Daniel
> 
> > 
> > > 
> > > Cc: Dr. David Alan Gilbert 
> > > Acked-by: Dr. David Alan Gilbert 
> > > Signed-off-by: Daniel Henrique Barboza 
> > > ---
> > >   hmp-commands-info.hx | 13 ++
> > >   include/monitor/hmp.h|  1 +
> > >   include/sysemu/device_tree.h |  4 +++
> > >   monitor/hmp-cmds.c   | 13 ++
> > >   monitor/qmp-cmds.c   | 12 +
> > >   qapi/machine.json| 19 +++
> > >   softmmu/device_tree.c| 47 
> > >   7 files changed, 109 insertions(+)
> > > 
> > > diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
> > > index 188d9ece3b..743b48865d 100644
> > > --- a/hmp-commands-info.hx
> > > +++ b/hmp-commands-info.hx
> > > @@ -921,3 +921,16 @@ SRST
> > > ``stats``
> > >   Show runtime-collected statistics
> > >   ERST
> > > +
> > > +{
> > > +.name   = "fdt",
> > > +.args_type  = "nodepath:s",
> > > +.params = "nodepath",
> > > +.help   = "show firmware device tree node given its path",
> > > +.cmd= hmp_info_fdt,
> > > +},
> > > +
> > > +SRST
> > > +  ``info fdt``
> > > +Show a firmware device tree node given its path. Requires libfdt.
> > > +ERST
> > > diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
> > > index d7f324da59..c0883dd1e3 100644
> > > --- 

[PATCH v2 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Ricky Zhou
Many instructions which load/store 128-bit values are supposed to
raise #GP when the memory operand isn't 16-byte aligned. This includes:
 - Instructions explicitly requiring memory alignment (Exceptions Type 1
   in the "AVX and SSE Instruction Exception Specification" section of
   the SDM)
 - Legacy SSE instructions that load/store 128-bit values (Exceptions
   Types 2 and 4).

This change sets MO_ALIGN_16 on 128-bit memory accesses that require
16-byte alignment. It adds cpu_record_sigbus and cpu_do_unaligned_access
handlers that simulate a #GP exception in qemu-user and qemu-system,
respectively.

One minor behavior change apart from what is described above: Prior to
this change, emulation of cvttps2pi and cvtps2pi instructions would
incorrectly read 16 bytes of memory instead of 8. I double-checked on
real hardware that these instructions only read 8 bytes (and do not have
any address alignment requirements).

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/217
Signed-off-by: Ricky Zhou 
---
 target/i386/tcg/excp_helper.c| 13 
 target/i386/tcg/helper-tcg.h | 28 +---
 target/i386/tcg/sysemu/excp_helper.c |  8 +
 target/i386/tcg/tcg-cpu.c|  2 ++
 target/i386/tcg/translate.c  | 49 ++--
 target/i386/tcg/user/excp_helper.c   |  7 
 6 files changed, 78 insertions(+), 29 deletions(-)

diff --git a/target/i386/tcg/excp_helper.c b/target/i386/tcg/excp_helper.c
index c1ffa1c0ef..7c3c8dc7fe 100644
--- a/target/i386/tcg/excp_helper.c
+++ b/target/i386/tcg/excp_helper.c
@@ -140,3 +140,16 @@ G_NORETURN void raise_exception_ra(CPUX86State *env, int exception_index,
 {
 raise_interrupt2(env, exception_index, 0, 0, 0, retaddr);
 }
+
+G_NORETURN void handle_unaligned_access(CPUX86State *env, vaddr vaddr,
+MMUAccessType access_type,
+uintptr_t retaddr)
+{
+/*
+ * Unaligned accesses are currently only triggered by SSE/AVX
+ * instructions that impose alignment requirements on memory
+ * operands. These instructions raise #GP(0) upon accessing an
+ * unaligned address.
+ */
+raise_exception_ra(env, EXCP0D_GPF, retaddr);
+}
diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
index 34167e2e29..cd1723389a 100644
--- a/target/i386/tcg/helper-tcg.h
+++ b/target/i386/tcg/helper-tcg.h
@@ -42,17 +42,6 @@ void x86_cpu_do_interrupt(CPUState *cpu);
 bool x86_cpu_exec_interrupt(CPUState *cpu, int int_req);
 #endif
 
-/* helper.c */
-#ifdef CONFIG_USER_ONLY
-void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr,
-MMUAccessType access_type,
-bool maperr, uintptr_t ra);
-#else
-bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
-  MMUAccessType access_type, int mmu_idx,
-  bool probe, uintptr_t retaddr);
-#endif
-
 void breakpoint_handler(CPUState *cs);
 
 /* n must be a constant to be efficient */
@@ -78,6 +67,23 @@ G_NORETURN void raise_exception_err_ra(CPUX86State *env, int exception_index,
int error_code, uintptr_t retaddr);
 G_NORETURN void raise_interrupt(CPUX86State *nenv, int intno, int is_int,
 int error_code, int next_eip_addend);
+G_NORETURN void handle_unaligned_access(CPUX86State *env, vaddr vaddr,
+MMUAccessType access_type,
+uintptr_t retaddr);
+#ifdef CONFIG_USER_ONLY
+void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr,
+MMUAccessType access_type,
+bool maperr, uintptr_t ra);
+void x86_cpu_record_sigbus(CPUState *cs, vaddr addr,
+   MMUAccessType access_type, uintptr_t ra);
+#else
+bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
+  MMUAccessType access_type, int mmu_idx,
+  bool probe, uintptr_t retaddr);
+G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
+MMUAccessType access_type,
+int mmu_idx, uintptr_t retaddr);
+#endif
 
 /* cc_helper.c */
 extern const uint8_t parity_table[256];
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index 48feba7e75..796dc2a1f3 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -439,3 +439,11 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
 }
 return true;
 }
+
+G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
+MMUAccessType access_type,
+int mmu_idx, uintptr_t retaddr)
+{
+X86CPU *cpu = X86_CPU(cs);
+handle_unaligned_access(&cpu->env, vaddr, access_type, retaddr);

Re: [PATCH v1 15/25] Deprecate 32 bit big-endian MIPS

2022-08-29 Thread Huacai Chen
Reviewed-by: Huacai Chen 

On Tue, Aug 30, 2022 at 7:39 AM Philippe Mathieu-Daudé  wrote:
>
> Hi Alex,
>
> (+Aleksandar/Huacai)
>
> On 26/8/22 19:21, Alex Bennée wrote:
> > It's becoming harder to maintain a cross-compiler to test this host
> > architecture as the old stable Debian 10 ("Buster") moved into LTS
> > which supports fewer architectures. For now:
> >
> >- mark its deprecation in the docs
> >- downgrade the containers to build TCG tests only
> >- drop the cross builds from our CI
> >
> > Users with an appropriate toolchain and user-space can still take
> > their chances building it.
> >
> > Signed-off-by: Alex Bennée 
> > ---
> >   docs/about/build-platforms.rst|  2 +-
> >   docs/about/deprecated.rst | 13 ++
> >   .gitlab-ci.d/container-cross.yml  |  1 -
> >   .gitlab-ci.d/crossbuilds.yml  | 14 ---
> >   tests/docker/Makefile.include |  5 +--
> >   .../dockerfiles/debian-mips-cross.docker  | 40 +--
> >   6 files changed, 27 insertions(+), 48 deletions(-)
> >
> > diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst
> > index 26028756d0..1ca9144a7d 100644
> > --- a/docs/about/build-platforms.rst
> > +++ b/docs/about/build-platforms.rst
> > @@ -41,7 +41,7 @@ Those hosts are officially supported, with various 
> > accelerators:
> >- Accelerators
> >  * - Arm
> >- kvm (64 bit only), tcg, xen
> > -   * - MIPS
> > +   * - MIPS (LE only)
> >- kvm, tcg
> >  * - PPC
> >- kvm, tcg
> > diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
> > index 91b03115ee..22c2f4f4de 100644
> > --- a/docs/about/deprecated.rst
> > +++ b/docs/about/deprecated.rst
> > @@ -213,6 +213,19 @@ MIPS ``Trap-and-Emul`` KVM support (since 6.0)
> >   The MIPS ``Trap-and-Emul`` KVM host and guest support has been removed
> >   from Linux upstream kernel, declare it deprecated.
> >
> > +Host Architectures
> > +--
> > +
> > +BE MIPS (since 7.2)
> > +'''
> > +
> > +A Debian 10 ("Buster") moved into LTS the big endian 32 bit version of
> > +MIPS moved out of support making it hard to maintain our
> > +cross-compilation CI tests of the architecture. As we no longer have
> > +CI coverage support may bitrot away before the deprecation process
> > +completes. The little endian variants of MIPS (both 32 and 64 bit) are
> > +still a supported host architecture.
>
> For completeness we should update meson.build to consider
> host_machine.endian() and adapt this section:
>
>
>   if not supported_cpus.contains(cpu)
>     message()
>     warning('SUPPORT FOR THIS HOST CPU WILL GO AWAY IN FUTURE RELEASES!')
>     message()
>     message('CPU host architecture ' + cpu + ' support is not currently maintained.')
>   ...
>
> This can be done later, and I might be able to do so in a few weeks,
> so meanwhile (with Thomas' comment addressed):
> Reviewed-by: Philippe Mathieu-Daudé 



Re: [PATCH v8 0/7] Add support for zoned device

2022-08-29 Thread Sam Li
Stefan Hajnoczi wrote on Tue, 30 Aug 2022 at 03:44:
>
> On Fri, Aug 26, 2022 at 11:15:29PM +0800, Sam Li wrote:
> > Zoned Block Devices (ZBDs) divide the LBA space into block regions called
> > zones that are larger than the LBA size. Zones can only be written
> > sequentially, which reduces write amplification in SSDs, leading to higher
> > throughput and increased capacity. More details about ZBDs can be found at:
> >
> > https://zonedstorage.io/docs/introduction/zoned-storage
> >
> > The zoned device support aims to let guests (virtual machines) access zoned
> > storage devices on the host (hypervisor) through a virtio-blk device. This
> > involves extending QEMU's block layer and virtio-blk emulation code. In its
> > current status, the virtio-blk device is not aware of ZBDs, but the guest sees
> > host-managed drives as regular drives that run correctly under the most
> > common write workloads.
> >
> > This patch series extend the block layer APIs with the minimum set of zoned
> > commands that are necessary to support zoned devices. The commands are - 
> > Report
> > Zones, four zone operations and Zone Append (developing).
> >
> > It can be tested on a null_blk device using qemu-io or qemu-iotests. For
> > example, the command line for zone report using qemu-io is:
> > $ path/to/qemu-io --image-opts -n driver=zoned_host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
> >
> > v8:
> > - address review comments
> >   * solve patch conflicts and merge sysfs helper funcations into one patch
> >   * add cache.direct=on check in config
>
> Hi Sam,
> I have left a few comments.

That's great! Thanks for reviewing. I'll send a revision soon.

Sam



Re: [PATCH 1/5] Update version for v7.1.0-rc4 release

2022-08-29 Thread Dmitry Osipenko
On 8/29/22 18:40, Antonio Caggiano wrote:
> From: Richard Henderson 
> 
> Signed-off-by: Richard Henderson 
> ---
>  VERSION | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/VERSION b/VERSION
> index 1c944b9863..b8d5f3ebb6 100644
> --- a/VERSION
> +++ b/VERSION
> @@ -1 +1 @@
> -7.0.93
> +7.0.94

This patch shouldn't be here.

-- 
Best regards,
Dmitry



Re: [PATCH v1 13/25] gitlab-ci/custom-runners: Disable -static-pie for ubuntu-20.04-aarch64

2022-08-29 Thread Richard Henderson

On 8/29/22 16:16, Philippe Mathieu-Daudé wrote:
Shouldn't "--extra-cflags='-fno-pie -no-pie'" be handled by the configure script while 
processing the --disable-pie option?


I think configure just passes b_pie=off to meson, but yes, this could be improved -- 
there's definitely a disconnect somewhere.



r~



Re: [PATCH v1 15/25] Deprecate 32 bit big-endian MIPS

2022-08-29 Thread Philippe Mathieu-Daudé via

Hi Alex,

(+Aleksandar/Huacai)

On 26/8/22 19:21, Alex Bennée wrote:

It's becoming harder to maintain a cross-compiler to test this host
architecture as the old stable Debian 10 ("Buster") moved into LTS
which supports fewer architectures. For now:

   - mark its deprecation in the docs
   - downgrade the containers to build TCG tests only
   - drop the cross builds from our CI

Users with an appropriate toolchain and user-space can still take
their chances building it.

Signed-off-by: Alex Bennée 
---
  docs/about/build-platforms.rst|  2 +-
  docs/about/deprecated.rst | 13 ++
  .gitlab-ci.d/container-cross.yml  |  1 -
  .gitlab-ci.d/crossbuilds.yml  | 14 ---
  tests/docker/Makefile.include |  5 +--
  .../dockerfiles/debian-mips-cross.docker  | 40 +--
  6 files changed, 27 insertions(+), 48 deletions(-)

diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst
index 26028756d0..1ca9144a7d 100644
--- a/docs/about/build-platforms.rst
+++ b/docs/about/build-platforms.rst
@@ -41,7 +41,7 @@ Those hosts are officially supported, with various 
accelerators:
   - Accelerators
 * - Arm
   - kvm (64 bit only), tcg, xen
-   * - MIPS
+   * - MIPS (LE only)
   - kvm, tcg
 * - PPC
   - kvm, tcg
diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 91b03115ee..22c2f4f4de 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -213,6 +213,19 @@ MIPS ``Trap-and-Emul`` KVM support (since 6.0)
  The MIPS ``Trap-and-Emul`` KVM host and guest support has been removed
  from Linux upstream kernel, declare it deprecated.
  
+Host Architectures

+--
+
+BE MIPS (since 7.2)
+'''
+
+A Debian 10 ("Buster") moved into LTS the big endian 32 bit version of
+MIPS moved out of support making it hard to maintain our
+cross-compilation CI tests of the architecture. As we no longer have
+CI coverage support may bitrot away before the deprecation process
+completes. The little endian variants of MIPS (both 32 and 64 bit) are
+still a supported host architecture.


For completeness we should update meson.build to consider 
host_machine.endian() and adapt this section:



  if not supported_cpus.contains(cpu)
    message()
    warning('SUPPORT FOR THIS HOST CPU WILL GO AWAY IN FUTURE RELEASES!')
    message()
    message('CPU host architecture ' + cpu + ' support is not currently maintained.')
  ...

This can be done later, and I might be able to do so in a few weeks,
so meanwhile (with Thomas' comment addressed):
Reviewed-by: Philippe Mathieu-Daudé 



Re: [PATCH v1 13/25] gitlab-ci/custom-runners: Disable -static-pie for ubuntu-20.04-aarch64

2022-08-29 Thread Philippe Mathieu-Daudé via

On 26/8/22 19:21, Alex Bennée wrote:

From: Richard Henderson 

The project has reached the magic size at which we see

/usr/aarch64-linux-gnu/lib/libc.a(init-first.o): in function 
`__libc_init_first':
(.text+0x10): relocation truncated to fit: R_AARCH64_LD64_GOTPAGE_LO15 against \
symbol `__environ' defined in .bss section in 
/usr/aarch64-linux-gnu/lib/libc.a(environ.o)
/usr/bin/ld: (.text+0x10): warning: too many GOT entries for -fpic, please 
recompile with -fPIC

The bug has been reported upstream, but in the meantime there is
nothing we can do except build a non-pie executable.

Signed-off-by: Richard Henderson 
Signed-off-by: Alex Bennée 
Message-Id: <20220823210329.1969895-1-richard.hender...@linaro.org>
---
  .gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml
index 3d878914e7..85a234801a 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml
@@ -16,7 +16,9 @@ ubuntu-20.04-aarch64-all-linux-static:
   # --disable-glusterfs is needed because there's no static version of those 
libs in distro supplied packages
   - mkdir build
   - cd build
- - ../configure --enable-debug --static --disable-system --disable-glusterfs 
--disable-libssh
+ # Disable -static-pie due to build error with system libc:
+ # https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1987438
+ - ../configure --enable-debug --static --disable-system --disable-glusterfs 
--disable-libssh --disable-pie --extra-cflags='-fno-pie -no-pie'


Shouldn't "--extra-cflags='-fno-pie -no-pie'" be handled by the 
configure script while processing the --disable-pie option?




Re: [PATCH v1 12/25] tests/vm: Remove obsolete Fedora VM test

2022-08-29 Thread Philippe Mathieu-Daudé via

On 26/8/22 19:21, Alex Bennée wrote:

From: Thomas Huth 

It's still based on Fedora 30, which has not been supported by QEMU
for years. It seems nobody is using (and refreshing) this, and it's
easier to test this via a container anyway, so let's remove it now.

Signed-off-by: Thomas Huth 
Message-Id: <20220822175317.190551-1-th...@redhat.com>
Signed-off-by: Alex Bennée 
---
  tests/vm/Makefile.include |   3 +-
  tests/vm/fedora   | 190 --
  2 files changed, 1 insertion(+), 192 deletions(-)
  delete mode 100755 tests/vm/fedora


Reviewed-by: Philippe Mathieu-Daudé 



Re: [PATCH] MAINTAINERS: Update Akihiko Odaki's email address

2022-08-29 Thread Philippe Mathieu-Daudé via

On 29/8/22 10:31, Akihiko Odaki wrote:

I am now employed by Daynix. Although my role as a reviewer of
macOS-related changes is not very relevant to the employment, I decided
to use the company email address to avoid confusion from using different
addresses.

Signed-off-by: Akihiko Odaki 
---
  MAINTAINERS | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 





Re: [PATCH] tests/avocado/migration: Get find_free_port() from the ports

2022-08-29 Thread Philippe Mathieu-Daudé via

On 29/8/22 14:19, Thomas Huth wrote:

In upstream Avocado, the find_free_port() function is not available
from "network" anymore, but must be used via "ports", see:

  https://github.com/avocado-framework/avocado/commit/22fc98c6ff76cc55c48

To be able to update to a newer Avocado version later, let's use
the new way for accessing the find_free_port() function here.

Signed-off-by: Thomas Huth 
---
  tests/avocado/migration.py | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 



Re: [PATCH 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Richard Henderson

On 8/29/22 13:46, Ricky Zhou wrote:

Thanks for taking a look at this - did you see the bit in the cover
letter where I discuss doing this via alignment requirements on the
memory operation? My logic was that the memop alignment checks seem to
be more oriented towards triggering #AC exceptions (even though this is
not currently implemented),


I missed that in the cover.  However... implementing #AC is pretty hypothetical.  It's not 
something that I've ever seen used, and not something that anyone has asked for.



One slightly more involved way to use alignment on the MemOp could be to
arrange to pass the problematic MemOp to do_unaligned_access and
helper_unaligned_{ld,st}. Then we could allow CPUs to handle
misalignment of different MemOps differently (e.g. raise #GP/SIGSEGV for
certain ops and #AC/SIGBUS for others). For this change to x86, we could
maybe get away with making MO_ALIGN_16 and above trigger #GP/SIGSEGV and
everything else trigger #AC/SIGBUS. If that's a little hacky, we could
instead add some dedicated bits to MemOp that distinguish different
types of unaligned accesses.


There's another related problem that actually has gotten a bug report in the past: when 
the form of the address should raise #SS instead of #GP in system mode.


My initial thought was to record information about "the" memory access in the per-insn 
unwind info, until I realized that there are insns with  multiple memory operations 
requiring different treatment.  E.g. "push (%rax)", where the read might raise #GP and the 
write might raise #SS.  So I think we'd need to encode #GP vs #SS into the mmu_idx used 
(e.g. in the lsb).


However, I don't think there are any similar situations of multiple memory types affecting 
SSE, so #AC vs #GP could in fact be encoded into the per-insn unwind info.


As for SIGBUS vs SIGSEGV for SSE and user-only, you only need implement the 
x86_cpu_ops.record_sigbus hook.  C.f. the s390x version which raises PGM_SPECIFICATION -> 
SIGILL for unaligned atomic operations.
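
For reference, the user-only hook is tiny; judging from the prototype added to
helper-tcg.h in the posted patch, it amounts to something along these lines
(sketch):

    void x86_cpu_record_sigbus(CPUState *cs, vaddr addr,
                               MMUAccessType access_type, uintptr_t ra)
    {
        X86CPU *cpu = X86_CPU(cs);

        /* Misaligned SSE/AVX accesses raise #GP(0) rather than SIGBUS. */
        handle_unaligned_access(&cpu->env, addr, access_type, ra);
    }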



r~



Re: [PATCH for-7.2 v4 15/21] qmp/hmp, device_tree.c: introduce 'info fdt' command

2022-08-29 Thread Daniel Henrique Barboza




On 8/29/22 00:34, David Gibson wrote:

On Fri, Aug 26, 2022 at 11:11:44AM -0300, Daniel Henrique Barboza wrote:

Reading the FDT requires that the user saves the fdt_blob and then uses
'dtc' to read the contents. Saving the file and using 'dtc' is a strong
use case when we need to compare two FDTs, but it's a lot of steps if
you want to do a quick check on a certain node or property.

'info fdt' retrieves FDT nodes (and properties, later on) and prints them
to the user. This can be used to check the FDT on running machines
without having to save the blob and use 'dtc'.

The implementation is based on the premise that the machine has an FDT
created using libfdt and pointed to by 'machine->fdt'. As long as this
pre-requisite is met, the machine should be able to support it.

For now we're going to add the required QMP/HMP boilerplate and the
capability of printing the name of the properties of a given node. Next
patches will extend 'info fdt' to be able to print nodes recursively,
and then individual properties.

This command will always be executed in-band (i.e. holding BQL),
avoiding potential race conditions with machines that might change the
FDT during runtime (e.g. PowerPC 'pseries' machine).

'info fdt' is not something that we expect to be used aside from debugging,
so we're implementing it in QMP as 'x-query-fdt'.

This is an example of 'info fdt' fetching the '/chosen' node of the
pSeries machine:

(qemu) info fdt /chosen
chosen {
 ibm,architecture-vec-5;
 rng-seed;
 ibm,arch-vec-5-platform-support;
 linux,pci-probe-only;
 stdout-path;
 linux,stdout-path;
 qemu,graphic-depth;
 qemu,graphic-height;
 qemu,graphic-width;
};

And the same node for the aarch64 'virt' machine:

(qemu) info fdt /chosen
chosen {
 stdout-path;
 rng-seed;
 kaslr-seed;
};
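
For readers unfamiliar with libfdt, listing the property names of a node as in
the examples above can be done with a walk roughly like the sketch below
(illustrative only, not the patch's actual helper; assumes <libfdt.h> and
<stdio.h>, with 'fdt' being machine->fdt):

    static void fdt_print_prop_names(const void *fdt, const char *path)
    {
        int node = fdt_path_offset(fdt, path);
        int prop;
        const char *name;

        /* Iterate over the properties of the node and print their names. */
        fdt_for_each_property_offset(prop, fdt, node) {
            fdt_getprop_by_offset(fdt, prop, &name, NULL);
            printf("    %s;\n", name);
        }
    }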


So, I'm reasonably convinced allowing dumping the whole dtb from
qmp/hmp is useful.  I'm less convinced that info fdt is worth the
additional complexity it incurs.  Note that as well as being able to
decompile a whole dtb using dtc, you can also extract and list
specific properties from a dtb blob using the 'fdtget' tool which is
part of the dtc tree.


What's your opinion on patch 21/21, where 'dumpdtb' can write a formatted
FDT in a file with an extra option? That was possible because of the
format helpers introduced for 'info fdt'. The idea is that since we're
able to format a FDT in DTS format, we can also write the FDT in text
format without relying on DTC to decode it.

If we think that this 'dumpdtb' capability is worth having, I can respin
the patches without 'info fdt' but adding these helpers to enable this
'dumpdtb' support. If not, then we can just remove patches 15-21 and
be done with it.


Thanks,


Daniel





Cc: Dr. David Alan Gilbert 
Acked-by: Dr. David Alan Gilbert 
Signed-off-by: Daniel Henrique Barboza 
---
  hmp-commands-info.hx | 13 ++
  include/monitor/hmp.h|  1 +
  include/sysemu/device_tree.h |  4 +++
  monitor/hmp-cmds.c   | 13 ++
  monitor/qmp-cmds.c   | 12 +
  qapi/machine.json| 19 +++
  softmmu/device_tree.c| 47 
  7 files changed, 109 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 188d9ece3b..743b48865d 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -921,3 +921,16 @@ SRST
``stats``
  Show runtime-collected statistics
  ERST
+
+{
+.name   = "fdt",
+.args_type  = "nodepath:s",
+.params = "nodepath",
+.help   = "show firmware device tree node given its path",
+.cmd= hmp_info_fdt,
+},
+
+SRST
+  ``info fdt``
+Show a firmware device tree node given its path. Requires libfdt.
+ERST
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index d7f324da59..c0883dd1e3 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -135,6 +135,7 @@ void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict 
*qdict);
  void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
  void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
  void hmp_dumpdtb(Monitor *mon, const QDict *qdict);
+void hmp_info_fdt(Monitor *mon, const QDict *qdict);
  void hmp_human_readable_text_helper(Monitor *mon,
  HumanReadableText *(*qmp_handler)(Error 
**));
  void hmp_info_stats(Monitor *mon, const QDict *qdict);
diff --git a/include/sysemu/device_tree.h b/include/sysemu/device_tree.h
index bf7684e4ed..057d13e397 100644
--- a/include/sysemu/device_tree.h
+++ b/include/sysemu/device_tree.h
@@ -14,6 +14,8 @@
  #ifndef DEVICE_TREE_H
  #define DEVICE_TREE_H
  
+#include "qapi/qapi-types-common.h"

+
  void *create_device_tree(int *sizep);
  void *load_device_tree(const char *filename_path, int *sizep);
  #ifdef CONFIG_LINUX
@@ -137,6 +139,8 @@ int qemu_fdt_add_path(void *fdt, const char *path);
  
  void 

Re: [PATCH] ui/console: Get tab completion working again in the SDL monitor vc

2022-08-29 Thread Cal Peake
Hi Gerd,

Can you take a look at this and let me know what you think?

Thanks,
-Cal


On Thu, 11 Aug 2022, Cal Peake wrote:

> Define a QEMU special key constant for the tab key and add an entry for
> it in the qcode_to_keysym table. This allows tab completion to work again
> in the SDL monitor virtual console, which has been broken ever since the
> migration from SDL1 to SDL2.
> 
> Signed-off-by: Cal Peake 
> ---
>  include/ui/console.h | 1 +
>  ui/console.c | 1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/include/ui/console.h b/include/ui/console.h
> index c0520c694c..e400ee9fa7 100644
> --- a/include/ui/console.h
> +++ b/include/ui/console.h
> @@ -70,6 +70,7 @@ void hmp_mouse_set(Monitor *mon, const QDict *qdict);
>  /* keysym is a unicode code except for special keys (see QEMU_KEY_xxx
> constants) */
>  #define QEMU_KEY_ESC1(c) ((c) | 0xe100)
> +#define QEMU_KEY_TAB    0x0009
>  #define QEMU_KEY_BACKSPACE  0x007f
>  #define QEMU_KEY_UP QEMU_KEY_ESC1('A')
>  #define QEMU_KEY_DOWN   QEMU_KEY_ESC1('B')
> diff --git a/ui/console.c b/ui/console.c
> index e139f7115e..addaafba28 100644
> --- a/ui/console.c
> +++ b/ui/console.c
> @@ -1368,6 +1368,7 @@ static const int qcode_to_keysym[Q_KEY_CODE__MAX] = {
>  [Q_KEY_CODE_PGUP]   = QEMU_KEY_PAGEUP,
>  [Q_KEY_CODE_PGDN]   = QEMU_KEY_PAGEDOWN,
>  [Q_KEY_CODE_DELETE] = QEMU_KEY_DELETE,
> +[Q_KEY_CODE_TAB]= QEMU_KEY_TAB,
>  [Q_KEY_CODE_BACKSPACE] = QEMU_KEY_BACKSPACE,
>  };
>  
> -- 
> 2.35.3
> 
> 



Re: [PATCH 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Ricky Zhou
On Mon, Aug 29, 2022 at 9:45 AM Richard Henderson wrote:
>
> On 8/29/22 07:23, Ricky Zhou wrote:
> This trap should be raised via the memory operation:
> ...
> Only the first of the two loads/stores must be aligned, as the other is known 
> to be +8.
> You then must fill in the x86_tcg_ops.do_unaligned_access hook to raise #GP.
Thanks for taking a look at this - did you see the bit in the cover
letter where I discuss doing this via alignment requirements on the
memory operation? My logic was that the memop alignment checks seem to
be more oriented towards triggering #AC exceptions (even though this is
not currently implemented), since qemu-user's unaligned access handlers
(helper_unaligned_{ld,st}) already trigger SIGBUS as opposed to SIGSEGV.
I was concerned that implementing this via MO_ALIGN_16 would get in the
way of a hypothetical future implementation of the AC flag, since
do_unaligned_access would need to raise #AC instead of #GP for that.

One slightly more involved way to use alignment on the MemOp could be to
arrange to pass the problematic MemOp to do_unaligned_access and
helper_unaligned_{ld,st}. Then we could allow CPUs to handle
misalignment of different MemOps differently (e.g. raise #GP/SIGSEGV for
certain ops and #AC/SIGBUS for others). For this change to x86, we could
maybe get away with making MO_ALIGN_16 and above trigger #GP/SIGSEGV and
everything else trigger #AC/SIGBUS. If that's a little hacky, we could
instead add some dedicated bits to MemOp that distinguish different
types of unaligned accesses.
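
To make that concrete, a purely hypothetical helper (no such function or MemOp
plumbing exists in QEMU today) could derive the exception from the alignment
bits already carried in the MemOp:

    /*
     * Hypothetical sketch only: pick the x86 exception for an unaligned
     * access from the MemOp's alignment requirement. 16-byte-or-stricter
     * alignment only comes from SSE/AVX operands, so treat that as #GP(0);
     * anything weaker would be an #AC candidate if CR0.AM/EFLAGS.AC were
     * ever honoured.
     */
    static int unaligned_excp_for_memop(MemOp mop)
    {
        unsigned a_bits = get_alignment_bits(mop);

        return a_bits >= 4 ? EXCP0D_GPF : EXCP11_ALGN;
    }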

What do you think? Happy to implement whichever approach is preferred!

Thanks,
Ricky



Re: [PATCH v5 09/18] dump: Use a buffer for ELF section data and headers

2022-08-29 Thread Janis Schoetterl-Glausch
On Thu, 2022-08-11 at 12:11 +, Janosch Frank wrote:
> Currently we're writing the NULL section header if we overflow the
> physical header number in the ELF header. But in the future we'll add
> custom section headers AND section data.
> 
> To facilitate this we need to rearange section handling a bit. As with
> the other ELF headers we split the code into a prepare and a write
> step.
> 
> Signed-off-by: Janosch Frank 
> ---
>  dump/dump.c   | 83 +--
>  include/sysemu/dump.h |  2 ++
>  2 files changed, 58 insertions(+), 27 deletions(-)
> 
> diff --git a/dump/dump.c b/dump/dump.c
> index a905316fe5..0051c71d08 100644
> --- a/dump/dump.c
> +++ b/dump/dump.c
> @@ -380,30 +380,57 @@ static void write_elf_phdr_note(DumpState *s, Error **errp)
>  }
>  }
>  
> -static void write_elf_section(DumpState *s, int type, Error **errp)
> +static void prepare_elf_section_hdr_zero(DumpState *s)
>  {
> -Elf32_Shdr shdr32;
> -Elf64_Shdr shdr64;
> -int shdr_size;
> -void *shdr;
> +if (dump_is_64bit(s)) {
> +Elf64_Shdr *shdr64 = s->elf_section_hdrs;
> +
> +shdr64->sh_info = cpu_to_dump32(s, s->phdr_num);
> +} else {
> +Elf32_Shdr *shdr32 = s->elf_section_hdrs;
> +
> +shdr32->sh_info = cpu_to_dump32(s, s->phdr_num);
> +}
> +}
> +
> +static void prepare_elf_section_hdrs(DumpState *s)
> +{
> +size_t len, sizeof_shdr;
> +
> +/*
> + * Section ordering:
> + * - HDR zero
> + */
> +sizeof_shdr = dump_is_64bit(s) ? sizeof(Elf64_Shdr) : sizeof(Elf32_Shdr);
> +len = sizeof_shdr * s->shdr_num;
> +s->elf_section_hdrs = g_malloc0(len);

I'm not seeing this being freed.
> +
> +/*
> + * The first section header is ALWAYS a special initial section
> + * header.
> + *
> + * The header should be 0 with one exception being that if
> + * phdr_num is PN_XNUM then the sh_info field contains the real
> + * number of segment entries.
> + *
> + * As we zero allocate the buffer we will only need to modify
> + * sh_info for the PN_XNUM case.
> + */
> +if (s->phdr_num >= PN_XNUM) {
> +prepare_elf_section_hdr_zero(s);
> +}
> +}
> +
> +static void write_elf_section_headers(DumpState *s, Error **errp)

[...]

> @@ -579,6 +606,12 @@ static void dump_begin(DumpState *s, Error **errp)
>  return;
>  }
>  
> +/* write section headers to vmcore */
> +write_elf_section_headers(s, errp);
> +if (*errp) {
> +return;
> +}
> +
>  /* write PT_NOTE to vmcore */
>  write_elf_phdr_note(s, errp);
>  if (*errp) {
> @@ -591,14 +624,6 @@ static void dump_begin(DumpState *s, Error **errp)
>  return;
>  }
>  
> -/* write section to vmcore */
> -if (s->shdr_num) {
> -write_elf_section(s, 1, errp);
> -if (*errp) {
> -return;
> -}
> -}
> -

Here you change the order of the headers, but the elf header is only
fixed in patch 11.
I agree that this should be a separate patch, with an explanation on
why it's necessary. 
So basically squashed into patch 11, except I think the comment change
in that one should go into another patch.

>  /* write notes to vmcore */
>  write_elf_notes(s, errp);
>  }
> @@ -674,7 +699,11 @@ static void create_vmcore(DumpState *s, Error **errp)
>  return;
>  }
>  
> +/* Iterate over memory and dump it to file */
>  dump_iterate(s, errp);
> +if (*errp) {
> +return;
> +}
>  }
>  
>  static int write_start_flat_header(int fd)
> diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h
> index b62513d87d..9995f65dc8 100644
> --- a/include/sysemu/dump.h
> +++ b/include/sysemu/dump.h
> @@ -177,6 +177,8 @@ typedef struct DumpState {
>  int64_t filter_area_begin;  /* Start address of partial guest memory 
> area */
>  int64_t filter_area_length; /* Length of partial guest memory area */
>  
> +void *elf_section_hdrs; /* Pointer to section header buffer */
> +
>  uint8_t *note_buf;  /* buffer for notes */
>  size_t note_buf_offset; /* the writing place in note_buf */
>  uint32_t nr_cpus;   /* number of guest's cpu */




Re: [PATCH v5 04/18] dump: Rework get_start_block

2022-08-29 Thread Janis Schoetterl-Glausch
On Thu, 2022-08-11 at 12:10 +, Janosch Frank wrote:
> get_start_block() returns the start address of the first memory block
> or -1.
> 
> With the GuestPhysBlock iterator conversion we don't need to set the
> start address and can therefore remove that code and the "start"
> DumpState struct member. The only functionality left is the validation
> of the start block so it only makes sense to re-name the function to
> validate_start_block()

Nit, since you don't return an address anymore, I find retaining the -1/0 return value instead of true/false weird.
> 
> Signed-off-by: Janosch Frank 
> Reviewed-by: Marc-André Lureau 
> Reviewed-by: Janis Schoetterl-Glausch 
> ---
>  dump/dump.c   | 20 ++--
>  include/sysemu/dump.h |  2 --
>  2 files changed, 6 insertions(+), 16 deletions(-)
> 
> diff --git a/dump/dump.c b/dump/dump.c
> index 340de5a1e7..e204912a89 100644
> --- a/dump/dump.c
> +++ b/dump/dump.c
> @@ -1500,30 +1500,22 @@ static void create_kdump_vmcore(DumpState *s, Error **errp)
>  }
>  }
>  
> -static ram_addr_t get_start_block(DumpState *s)
> +static int validate_start_block(DumpState *s)
>  {
>  GuestPhysBlock *block;
>  
>  if (!s->has_filter) {
> -s->next_block = QTAILQ_FIRST(&s->guest_phys_blocks.head);
>  return 0;
>  }
>  
>  QTAILQ_FOREACH(block, &s->guest_phys_blocks.head, next) {
> +/* This block is out of the range */
>  if (block->target_start >= s->begin + s->length ||
>  block->target_end <= s->begin) {
> -/* This block is out of the range */
>  continue;
>  }
> -
> -s->next_block = block;
> -if (s->begin > block->target_start) {
> -s->start = s->begin - block->target_start;
> -} else {
> -s->start = 0;
> -}
> -return s->start;
> -}
> +return 0;
> +   }
>  
>  return -1;
>  }
> @@ -1670,8 +1662,8 @@ static void dump_init(DumpState *s, int fd, bool has_format,
>  goto cleanup;
>  }
>  
> -s->start = get_start_block(s);
> -if (s->start == -1) {
> +/* Is the filter filtering everything? */
> +if (validate_start_block(s) == -1) {
>  error_setg(errp, QERR_INVALID_PARAMETER, "begin");
>  goto cleanup;
>  }
> diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h
> index ffc2ea1072..7fce1d4af6 100644
> --- a/include/sysemu/dump.h
> +++ b/include/sysemu/dump.h
> @@ -166,8 +166,6 @@ typedef struct DumpState {
>  hwaddr memory_offset;
>  int fd;
>  
> -GuestPhysBlock *next_block;
> -ram_addr_t start;
>  bool has_filter;
>  int64_t begin;
>  int64_t length;




Re: [PATCH v8 0/7] Add support for zoned device

2022-08-29 Thread Stefan Hajnoczi
On Fri, Aug 26, 2022 at 11:15:29PM +0800, Sam Li wrote:
> Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
> that are larger than the LBA size. Zones can only be written sequentially, which
> reduces write amplification in SSDs, leading to higher throughput and increased
> capacity. More details about ZBDs can be found at:
> 
> https://zonedstorage.io/docs/introduction/zoned-storage
> 
> The zoned device support aims to let guests (virtual machines) access zoned
> storage devices on the host (hypervisor) through a virtio-blk device. This
> involves extending QEMU's block layer and virtio-blk emulation code.  In its
> current status, the virtio-blk device is not aware of ZBDs, but the guest sees
> host-managed drives as regular drives that run correctly under the most
> common write workloads.
> 
> This patch series extend the block layer APIs with the minimum set of zoned
> commands that are necessary to support zoned devices. The commands are - 
> Report
> Zones, four zone operations and Zone Append (developing).
> 
> It can be tested on a null_blk device using qemu-io or qemu-iotests. For
> example, the command line for zone report using qemu-io is:
> $ path/to/qemu-io --image-opts -n driver=zoned_host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
> 
> v8:
> - address review comments
>   * solve patch conflicts and merge sysfs helper funcations into one patch
>   * add cache.direct=on check in config

Hi Sam,
I have left a few comments.

Stefan




Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-08-29 Thread Stefan Hajnoczi
On Sat, Aug 27, 2022 at 12:17:04AM +0800, Sam Li wrote:
> +/*
> + * Send a zone_management command.
> + * op is the zone operation.
> + * offset is the starting zone specified as a sector offset.

Does "sector offset" mean "byte offset from the start of the device" or
does it mean in 512B sector units? For consistency this should be in
bytes.

> + * len is the maximum number of sectors the command should operate on. It
> + * should be aligned with the zone sector size.

Please use bytes for consistency with QEMU's block layer APIs.

> @@ -3022,6 +3183,118 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
>  }
>  }
>  
> +/*
> + * zone report - Get a zone block device's information in the form
> + * of an array of zone descriptors.
> + *
> + * @param bs: passing zone block device file descriptor
> + * @param zones: an array of zone descriptors to hold zone
> + * information on reply
> + * @param offset: offset can be any byte within the zone size.

This isn't an offset within a zone, it's an offset within the entire
device, so I think "zone size" is confusing here.

> + * @param len: (not sure yet.

Please remove this and document nr_zones instead.

> + * @return 0 on success, -1 on failure
> + */
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +   unsigned int *nr_zones,
> +   BlockZoneDescriptor *zones) {
> +#if defined(CONFIG_BLKZONED)
> +BDRVRawState *s = bs->opaque;
> +RawPosixAIOData acb;
> +
> +acb = (RawPosixAIOData) {
> +.bs = bs,
> +.aio_fildes = s->fd,
> +.aio_type   = QEMU_AIO_ZONE_REPORT,
> +.aio_offset = offset,
> +.zone_report= {
> +.nr_zones   = nr_zones,
> +.zones  = zones,
> +},
> +};
> +
> +return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> +#else
> +return -ENOTSUP;
> +#endif
> +}
> +
> +/*
> + * zone management operations - Execute an operation on a zone
> + */
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> +int64_t offset, int64_t len) {
> +#if defined(CONFIG_BLKZONED)
> +BDRVRawState *s = bs->opaque;
> +RawPosixAIOData acb;
> +int64_t zone_sector, zone_sector_mask;
> +const char *ioctl_name;
> +unsigned long zone_op;
> +int ret;
> +
> +struct stat st;
> +if (fstat(s->fd, &st) < 0) {
> +ret = -errno;
> +return ret;
> +}

st is not used and can be removed.

> +zone_sector = bs->bl.zone_sectors;
> +zone_sector_mask = zone_sector - 1;
> +if (offset & zone_sector_mask) {
> +error_report("sector offset %" PRId64 " is not aligned to zone size "
> + "%" PRId64 "", offset, zone_sector);
> +return -EINVAL;
> +}
> +
> +if (len & zone_sector_mask) {
> +error_report("number of sectors %" PRId64 " is not aligned to zone 
> size"
> +  " %" PRId64 "", len, zone_sector);
> +return -EINVAL;
> +}
> +
> +switch (op) {
> +case BLK_ZO_OPEN:
> +ioctl_name = "BLKOPENZONE";
> +zone_op = BLKOPENZONE;
> +break;
> +case BLK_ZO_CLOSE:
> +ioctl_name = "BLKCLOSEZONE";
> +zone_op = BLKCLOSEZONE;
> +break;
> +case BLK_ZO_FINISH:
> +ioctl_name = "BLKFINISHZONE";
> +zone_op = BLKFINISHZONE;
> +break;
> +case BLK_ZO_RESET:
> +ioctl_name = "BLKRESETZONE";
> +zone_op = BLKRESETZONE;
> +break;
> +default:
> +error_report("Invalid zone operation 0x%x", op);
> +return -EINVAL;
> +}
> +
> +acb = (RawPosixAIOData) {
> +.bs = bs,
> +.aio_fildes = s->fd,
> +.aio_type   = QEMU_AIO_ZONE_MGMT,
> +.aio_offset = offset,
> +.aio_nbytes = len,
> +.zone_mgmt  = {
> +.zone_op = zone_op,
> +},
> +};
> +
> +ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> +if (ret != 0) {
> +error_report("ioctl %s failed %d", ioctl_name, errno);
> +return -errno;

ret contains a negative errno value. The errno variable is not used by
raw_thread_pool_submit().

I suggest simplifying it to:

  return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);

That's what most of the other raw_thread_pool_submit() callers do.




Re: [PATCH v8 2/7] file-posix: introduce helper funcations for sysfs attributes

2022-08-29 Thread Stefan Hajnoczi
On Sat, Aug 27, 2022 at 12:11:21AM +0800, Sam Li wrote:

If you send another revision please fix the "funcations" typo in the
commit message.




Re: [PATCH 4/9] hw/isa/vt82c686: QOM'ify via-ide creation

2022-08-29 Thread BB



On 29 August 2022 19:04:06 CEST, BALATON Zoltan wrote:
>On Mon, 29 Aug 2022, BB wrote:
>> On 25 August 2022 01:18:56 CEST, BALATON Zoltan wrote:
>>> On Thu, 25 Aug 2022, Bernhard Beschow wrote:
 On Wed, Aug 24, 2022 at 3:54 PM BALATON Zoltan  wrote:
> On Tue, 23 Aug 2022, Bernhard Beschow wrote:
>> The IDE function is closely tied to the ISA function (e.g. the IDE
>> interrupt routing happens there), so it makes sense that the IDE
>> function is instantiated within the southbridge itself. As a side effect,
>> duplicated code in the boards is resolved.
>> 
>> Signed-off-by: Bernhard Beschow 
>> ---
>> configs/devices/mips64el-softmmu/default.mak |  1 -
>> hw/isa/Kconfig   |  1 +
>> hw/isa/vt82c686.c| 18 ++
>> hw/mips/fuloong2e.c  |  3 ---
>> hw/ppc/Kconfig   |  1 -
>> hw/ppc/pegasos2.c|  4 
>> 6 files changed, 19 insertions(+), 9 deletions(-)
>> 
>> diff --git a/configs/devices/mips64el-softmmu/default.mak
> b/configs/devices/mips64el-softmmu/default.mak
>> index c610749ac1..d5188f7ea5 100644
>> --- a/configs/devices/mips64el-softmmu/default.mak
>> +++ b/configs/devices/mips64el-softmmu/default.mak
>> @@ -1,7 +1,6 @@
>> # Default configuration for mips64el-softmmu
>> 
>> include ../mips-softmmu/common.mak
>> -CONFIG_IDE_VIA=y
>> CONFIG_FULOONG=y
>> CONFIG_LOONGSON3V=y
>> CONFIG_ATI_VGA=y
>> diff --git a/hw/isa/Kconfig b/hw/isa/Kconfig
>> index d42143a991..20de7e9294 100644
>> --- a/hw/isa/Kconfig
>> +++ b/hw/isa/Kconfig
>> @@ -53,6 +53,7 @@ config VT82C686
>> select I8254
>> select I8257
>> select I8259
>> +select IDE_VIA
>> select MC146818RTC
>> select PARALLEL
>> 
>> diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
>> index 5582c0b179..37d9ed635d 100644
>> --- a/hw/isa/vt82c686.c
>> +++ b/hw/isa/vt82c686.c
>> @@ -17,6 +17,7 @@
>> #include "hw/isa/vt82c686.h"
>> #include "hw/pci/pci.h"
>> #include "hw/qdev-properties.h"
>> +#include "hw/ide/pci.h"
>> #include "hw/isa/isa.h"
>> #include "hw/isa/superio.h"
>> #include "hw/intc/i8259.h"
>> @@ -544,6 +545,7 @@ struct ViaISAState {
>> qemu_irq cpu_intr;
>> qemu_irq *isa_irqs;
>> ViaSuperIOState via_sio;
>> +PCIIDEState ide;
>> };
>> 
>> static const VMStateDescription vmstate_via = {
>> @@ -556,10 +558,18 @@ static const VMStateDescription vmstate_via = {
>> }
>> };
>> 
>> +static void via_isa_init(Object *obj)
>> +{
>> +ViaISAState *s = VIA_ISA(obj);
>> +
>> +object_initialize_child(obj, "ide", >ide, "via-ide");
>> +}
>> +
>> static const TypeInfo via_isa_info = {
>> .name  = TYPE_VIA_ISA,
>> .parent= TYPE_PCI_DEVICE,
>> .instance_size = sizeof(ViaISAState),
>> +.instance_init = via_isa_init,
>> .abstract  = true,
>> .interfaces= (InterfaceInfo[]) {
>> { INTERFACE_CONVENTIONAL_PCI_DEVICE },
>> @@ -583,6 +593,7 @@ static void via_isa_realize(PCIDevice *d, Error
> **errp)
>> {
>> ViaISAState *s = VIA_ISA(d);
>> DeviceState *dev = DEVICE(d);
>> +PCIBus *pci_bus = pci_get_bus(d);
>> qemu_irq *isa_irq;
>> ISABus *isa_bus;
>> int i;
>> @@ -607,6 +618,13 @@ static void via_isa_realize(PCIDevice *d, Error
> **errp)
>> if (!qdev_realize(DEVICE(&s->via_sio), BUS(isa_bus), errp)) {
>> return;
>> }
>> +
>> +/* Function 1: IDE */
>> +qdev_prop_set_int32(DEVICE(&s->ide), "addr", d->devfn + 1);
>> +if (!qdev_realize(DEVICE(&s->ide), BUS(pci_bus), errp)) {
>> +return;
>> +}
>> +pci_ide_create_devs(PCI_DEVICE(&s->ide));
> 
> I'm not sure about moving pci_ide_create_devs() here. This is usually
> called from board code and only piix4 seems to do this. Maybe that's wrong,
> because if all IDE devices did this then one machine could not have more
> than one different IDE device (like having an on-board IDE and adding a
> PCI IDE controller with -device), so this probably belongs to the board
> code, to add devices to its default IDE controller only, as this is machine
> specific. Unless I'm wrong, in which case somebody will correct me.
> 
 
 Grepping the code it can be seen that it's always called right after
 creating the IDE controllers. The only notable exception is the "sii3112"
 device in the sam460ex board which is not emulated yet. Since the IDE
>>> 
>>> The problem with sii3112 is that it only has 2 channels because I did not 
>>> bother to implement more, so pci_ide_create_devs() probably would not 

Re: [PATCH 8/9] hw/isa/vt82c686: QOM'ify RTC creation

2022-08-29 Thread BB



>On 29 August 2022 19:50:10 CEST, BALATON Zoltan wrote:
>On Mon, 29 Aug 2022, BB wrote:
>> On 24 August 2022 01:23:14 CEST, BALATON Zoltan wrote:
>>> On Tue, 23 Aug 2022, Bernhard Beschow wrote:
 On Tue, Aug 23, 2022 at 2:20 AM BALATON Zoltan  wrote:
> On Tue, 23 Aug 2022, Bernhard Beschow wrote:
>> Signed-off-by: Bernhard Beschow 
>> ---
>> hw/isa/vt82c686.c | 12 +++-
>> 1 file changed, 11 insertions(+), 1 deletion(-)
>> 
>> diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
>> index 47f2fd2669..ee745d5d49 100644
>> --- a/hw/isa/vt82c686.c
>> +++ b/hw/isa/vt82c686.c
>> @@ -546,6 +546,7 @@ struct ViaISAState {
>> qemu_irq cpu_intr;
>> qemu_irq *isa_irqs;
>> ViaSuperIOState via_sio;
>> +RTCState rtc;
>> PCIIDEState ide;
>> UHCIState uhci[2];
>> ViaPMState pm;
>> @@ -567,6 +568,7 @@ static void via_isa_init(Object *obj)
>> {
>> ViaISAState *s = VIA_ISA(obj);
>> 
>> +object_initialize_child(obj, "rtc", &s->rtc, TYPE_MC146818_RTC);
>> object_initialize_child(obj, "ide", &s->ide, "via-ide");
>> object_initialize_child(obj, "uhci1", &s->uhci[0],
> "vt82c686b-usb-uhci");
>> object_initialize_child(obj, "uhci2", &s->uhci[1],
> "vt82c686b-usb-uhci");
>> @@ -615,7 +617,15 @@ static void via_isa_realize(PCIDevice *d, Error
> **errp)
>> isa_bus_irqs(isa_bus, s->isa_irqs);
>> i8254_pit_init(isa_bus, 0x40, 0, NULL);
>> i8257_dma_init(isa_bus, 0);
>> -mc146818_rtc_init(isa_bus, 2000, NULL);
>> +
>> +/* RTC */
>> +qdev_prop_set_int32(DEVICE(&s->rtc), "base_year", 2000);
>> +if (!qdev_realize(DEVICE(&s->rtc), BUS(isa_bus), errp)) {
>> +return;
>> +}
>> +object_property_add_alias(qdev_get_machine(), "rtc-time",
> OBJECT(&s->rtc),
>> +  "date");
>> +isa_connect_gpio_out(ISA_DEVICE(&s->rtc), 0, s->rtc.isairq);
>> 
>> for (i = 0; i < PCI_CONFIG_HEADER_SIZE; i++) {
>> if (i < PCI_COMMAND || i >= PCI_REVISION_ID) {
>> 
> 
> This actually introduces code duplication as all other places except piix4
> seem to still use the init function (probably to ensure that the rtc-time
> alias on the machine is properly set) so I'd keep this the same as
> everything else and drop this patch until this init function is removed
> from all other places as well.
> 
 
 Hi Zoltan,
 
 Thanks for the fast reply! Regarding code homogeneity and duplication I've
 made a similar argument for mc146818_rtc_init() in the past [1] and I've
 learnt that my patch went backwards. Incidentally, Peter mentioned vt686c
 as a candidate for the embed-the-device-struct style which - again
 incidentally - I've now done.
>>> 
>>> I've seen patches embedding devices recently but in this case it looked not 
>>> that simple because of the rtc-time alias.
>>> 
 The rtc-time alias is actually only used by a couple of PPC machines where
 Pegasos II is one of them. So the alias actually needs to be created only
 for these machines, and identifying the cases where it has to be preserved
 requires a lot of careful investigation. In the Pegasos II case this seems
 especially complicated since one needs to look through several layers of
 devices. During my work on the VT82xx south bridges I've gained some
 knowledge such that I'd like to make this simplifying contribution.
>>> 
>>> I've used it to implement the get-time-of-day rtas call with VOF in 
>>> pegasos2 because otherwise it would need to access internals of the RTC 
>>> model and/or duplicate some code. Here's the message discussing this:
>>> 
>>> https://lists.nongnu.org/archive/html/qemu-ppc/2021-10/msg00170.html
>>> 
>>> so this alias still seems to be the simplest way.
>>> 
>>> I think the primary function of this alias is not these ppc machines but 
>>> some QMP/HMP command or to make the guest time available from the monitor 
>>> or something like that so it's probably also used from there and therefore 
>>> all rtc probably should have it but I'm not sure about it.
>> 
>> Indeed, the alias seems to be a convenience for some QMP/HMP commands. 
>> AFAICS only the mc146818 sets the alias while it is probably not the only 
>> RTC modelled in QEMU. So I wonder why boards using another RTC don't need it 
>> and whether removing the alias constitutes a compatibility break.
>> 
 Our discussion makes me realize that the creation of the alias could now
 actually be moved to the Pegasos II board. This way, the Pegasos II board
 would both create and consume that alias, which seems to remove quite some
 cognitive load. Do you agree? Would moving the alias to the board work for
 you?
>>> 
>>> Yes I think that would be better. This way the vt82xx and piix4 would be 
>>> similar and the alias would also 

Re: [PATCH 8/9] hw/isa/vt82c686: QOM'ify RTC creation

2022-08-29 Thread BALATON Zoltan

On Mon, 29 Aug 2022, BB wrote:

On 24 August 2022 01:23:14 CEST, BALATON Zoltan wrote:

On Tue, 23 Aug 2022, Bernhard Beschow wrote:

On Tue, Aug 23, 2022 at 2:20 AM BALATON Zoltan  wrote:

On Tue, 23 Aug 2022, Bernhard Beschow wrote:

Signed-off-by: Bernhard Beschow 
---
hw/isa/vt82c686.c | 12 +++-
1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
index 47f2fd2669..ee745d5d49 100644
--- a/hw/isa/vt82c686.c
+++ b/hw/isa/vt82c686.c
@@ -546,6 +546,7 @@ struct ViaISAState {
qemu_irq cpu_intr;
qemu_irq *isa_irqs;
ViaSuperIOState via_sio;
+RTCState rtc;
PCIIDEState ide;
UHCIState uhci[2];
ViaPMState pm;
@@ -567,6 +568,7 @@ static void via_isa_init(Object *obj)
{
ViaISAState *s = VIA_ISA(obj);

+object_initialize_child(obj, "rtc", &s->rtc, TYPE_MC146818_RTC);
object_initialize_child(obj, "ide", &s->ide, "via-ide");
object_initialize_child(obj, "uhci1", &s->uhci[0],

"vt82c686b-usb-uhci");

object_initialize_child(obj, "uhci2", &s->uhci[1],

"vt82c686b-usb-uhci");

@@ -615,7 +617,15 @@ static void via_isa_realize(PCIDevice *d, Error

**errp)

isa_bus_irqs(isa_bus, s->isa_irqs);
i8254_pit_init(isa_bus, 0x40, 0, NULL);
i8257_dma_init(isa_bus, 0);
-mc146818_rtc_init(isa_bus, 2000, NULL);
+
+/* RTC */
+qdev_prop_set_int32(DEVICE(&s->rtc), "base_year", 2000);
+if (!qdev_realize(DEVICE(&s->rtc), BUS(isa_bus), errp)) {
+return;
+}
+object_property_add_alias(qdev_get_machine(), "rtc-time",

OBJECT(&s->rtc),

+  "date");
+isa_connect_gpio_out(ISA_DEVICE(&s->rtc), 0, s->rtc.isairq);

for (i = 0; i < PCI_CONFIG_HEADER_SIZE; i++) {
if (i < PCI_COMMAND || i >= PCI_REVISION_ID) {



This actually introduces code duplication as all other places except piix4
seem to still use the init function (probably to ensure that the rtc-time
alias on the machine is properly set) so I'd keep this the same as
everything else and drop this patch until this init function is removed
from all other places as well.



Hi Zoltan,

Thanks for the fast reply! Regarding code homogeneity and duplication I've
made a similar argument for mc146818_rtc_init() in the past [1] and I've
learnt that my patch went backwards. Incidentally, Peter mentioned vt686c
as a candidate for the embed-the-device-struct style which - again
incidentally - I've now done.


I've seen patches embedding devices recently but in this case it looked not 
that simple because of the rtc-time alias.


The rtc-time alias is actually only used by a couple of PPC machines where
Pegasos II is one of them. So the alias actually needs to be created only
for these machines, and identifying the cases where it has to be preserved
requires a lot of careful investigation. In the Pegasos II case this seems
especially complicated since one needs to look through several layers of
devices. During my work on the VT82xx south bridges I've gained some
knowledge such that I'd like to make this simplifying contribution.


I've used it to implement the get-time-of-day rtas call with VOF in 
pegasos2 because otherwise it would need to access internals of the RTC 
model and/or duplicate some code. Here's the message discussing this:


https://lists.nongnu.org/archive/html/qemu-ppc/2021-10/msg00170.html

so this alias still seems to be the simplest way.

I think the primary function of this alias is not these ppc machines 
but some QMP/HMP command or to make the guest time available from the 
monitor or something like that so it's probably also used from there 
and therefore all rtc probably should have it but I'm not sure about 
it.


Indeed, the alias seems to be a convenience for some QMP/HMP commands. 
AFAICS only the mc146818 sets the alias while it is probably not the 
only RTC modelled in QEMU. So I wonder why boards using another RTC 
don't need it and whether removing the alias constitutes a compatibility 
break.



Our discussion makes me realize that the creation of the alias could now
actually be moved to the Pegasos II board. This way, the Pegasos II board
would both create and consume that alias, which seems to remove quite some
cognitive load. Do you agree? Would moving the alias to the board work for
you?


Yes I think that would be better. This way the vt82xx and piix4 would 
be similar and the alias would also be clear within the pegasos2 code 
and it also has the machine directly at that point so it's clearer that 
way.
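
For reference, the change being discussed boils down to a single alias call
that could live in the board code instead. A minimal sketch, assuming the
pegasos2 board keeps a pointer "via" to the south bridge device and reuses
the "rtc" child name from the patch above:

  /* hypothetical placement in hw/ppc/pegasos2.c, after realizing the south bridge */
  Object *rtc = object_resolve_path_component(OBJECT(via), "rtc");
  object_property_add_alias(OBJECT(machine), "rtc-time", rtc, "date");

The board would then both create the alias and consume it (e.g. for the VOF
get-time-of-day rtas call mentioned above).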


All in all I wonder if we need to preserve the alias for the fuloong2e board?


I don't know. A quick investigation shows that it seems to be added by 
commit 654a36d857ff94 which suggests something may use it (or was intended 
to use it back then, but not sure if things have changed in the meantime). 
I don't think any management app cares about fuloong2e but if this should 
be a generic thing then all machines may need it. Then it's also mentioned 
in commit 29551fdcf4d99 that suggests one ought to 

Re: [PATCH RFC 00/13] migration: Postcopy Preempt-Full

2022-08-29 Thread Peter Xu
On Mon, Aug 29, 2022 at 12:56:46PM -0400, Peter Xu wrote:
> This is a RFC series.  Tree is here:
> 
>   https://github.com/xzpeter/qemu/tree/preempt-full
> 
> It's not complete because there are still some things we need to do, which
> will be attached to the end of this cover letter; however, this series can
> already safely pass qtest and any of my tests.
> 
> Compared to the recently merged preempt mode, I called it "preempt-full"
> because it threadifies the postcopy channels so now urgent pages can be
> fully handled separately outside of the ram save loop.  Sorry to have the
> same name as the PREEMPT_FULL in the Linux RT world, it's just that we
> needed a name for the capability and it was named as preempt already
> anyway..
> 
> The existing preempt code, which has already landed, reduced random page
> request latency over a 10Gbps network from ~12ms to ~500us.
> 
> This preempt-full series can further reduce that ~500us to ~230us per my
> initial test.  More to share below.
> 
> Note that no new capability is needed; IOW, it's fully compatible with the
> existing preempt mode.  So the naming is actually not important, it just
> identifies the difference between the binaries.  That is because this series
> only reworks the sender-side code and does not change the migration
> protocol; it just runs faster.
> 
> IOW, old "preempt" QEMU can also migrate to "preempt-full" QEMU, and vice versa.
> 
>   - When old "preempt" mode QEMU migrates to "preempt-full" QEMU, it'll be
> the same as running both old "preempt" QEMUs.
> 
>   - When "preempt-full" QEMU migrates to old "preempt" QEMU, it'll be the
> same as running both "preempt-full".
> 
> The logic of the series is quite simple too: simply move the existing
> preempt channel page sends to the rp-return thread.  It can slow down the
> rp-return thread on receiving pages, but I don't really see a major issue
> with it so far.
> 
> This latency number is getting close to the extreme 4K page request
> latency of a TCP roundtrip on the 10Gbps NIC I have.  The 'extreme
> number' is something I get from the mig_mon tool, which has a mode [1] to
> emulate the extreme TCP roundtrips of page requests.
> 
> Performance
> ===========
> 
> Page request latencies have the distributions below, with a VM of 20G memory,
> 20 cores, a 10Gbps NIC, and 18G of fully random writes:
> 
> Postcopy Vanilla
> ----------------
> 
> Average: 12093 (us)
> @delay_us:
> [1]                1
> [2, 4)             0
> [4, 8)             0
> [8, 16)            0
> [16, 32)           1
> [32, 64)           8
> [64, 128)         11
> [128, 256)        14
> [256, 512)        19
> [512, 1K)         14
> [1K, 2K)          35
> [2K, 4K)          18
> [4K, 8K)          87
> [8K, 16K)       2397
> [16K, 32K)         7
> [32K, 64K)         2
> [64K, 128K)       20
> [128K, 256K)       6
> 
> Postcopy Preempt
> ----------------
> 
> Average: 496 (us)
> 
> @delay_us:
> [32, 64)           2
> [64, 128)       2306
> [128, 256)     25422
> [256, 512)      8238
> [512, 1K)       1066
> [1K, 2K)        2167
> [2K, 4K)        3329
> [4K, 8K)         109
> [8K, 16K)         48
> 
> Postcopy Preempt-Full
> ---------------------
> 
> Average: 229 (us)
> 
> @delay_us:
> [8, 16)            1
> [16, 32)           3

Re: [PATCH 1/1] monitor/hmp: print trace as option in help for log command

2022-08-29 Thread Dongli Zhang
Sorry, the format for "none" should be changed as well.

I have sent a v2:

https://lists.gnu.org/archive/html/qemu-devel/2022-08/msg04445.html

Thank you very much!

Dongli Zhang

On 8/29/22 3:04 AM, Dongli Zhang wrote:
> The following is printed in the qemu-system-x86_64 command-line help
> when CONFIG_TRACE_LOG is enabled:
> 
> $ qemu-system-x86_64 -d help
> ... ...
> trace:PATTERN   enable trace events
> 
> Use "-d trace:help" to get a list of trace events.
> 
> However, they are not printed by the hmp "help log" command.
> 
> Cc: Joe Jin 
> Signed-off-by: Dongli Zhang 
> ---
>  monitor/hmp.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/monitor/hmp.c b/monitor/hmp.c
> index 15ca047..9f48b70 100644
> --- a/monitor/hmp.c
> +++ b/monitor/hmp.c
> @@ -287,8 +287,13 @@ void help_cmd(Monitor *mon, const char *name)
>  monitor_printf(mon, "Log items (comma separated):\n");
>  monitor_printf(mon, "%-10s %s\n", "none", "remove all logs");
>  for (item = qemu_log_items; item->mask != 0; item++) {
> -monitor_printf(mon, "%-10s %s\n", item->name, item->help);
> +monitor_printf(mon, "%-15s %s\n", item->name, item->help);
>  }
> +#ifdef CONFIG_TRACE_LOG
> +monitor_printf(mon, "trace:PATTERN   enable trace events\n");
> +monitor_printf(mon, "\nUse \"info trace-events\" to get a list of "
> +"trace events.\n\n");
> +#endif
>  return;
>  }
>  
> 



Re: [PATCH 8/9] hw/isa/vt82c686: QOM'ify RTC creation

2022-08-29 Thread BB



On 24 August 2022 01:23:14 CEST, BALATON Zoltan wrote:
>On Tue, 23 Aug 2022, Bernhard Beschow wrote:
>> On Tue, Aug 23, 2022 at 2:20 AM BALATON Zoltan  wrote:
>>> On Tue, 23 Aug 2022, Bernhard Beschow wrote:
 Signed-off-by: Bernhard Beschow 
 ---
 hw/isa/vt82c686.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)
 
 diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
 index 47f2fd2669..ee745d5d49 100644
 --- a/hw/isa/vt82c686.c
 +++ b/hw/isa/vt82c686.c
 @@ -546,6 +546,7 @@ struct ViaISAState {
 qemu_irq cpu_intr;
 qemu_irq *isa_irqs;
 ViaSuperIOState via_sio;
 +RTCState rtc;
 PCIIDEState ide;
 UHCIState uhci[2];
 ViaPMState pm;
 @@ -567,6 +568,7 @@ static void via_isa_init(Object *obj)
 {
 ViaISAState *s = VIA_ISA(obj);
 
 +object_initialize_child(obj, "rtc", &s->rtc, TYPE_MC146818_RTC);
 object_initialize_child(obj, "ide", &s->ide, "via-ide");
 object_initialize_child(obj, "uhci1", &s->uhci[0],
>>> "vt82c686b-usb-uhci");
 object_initialize_child(obj, "uhci2", &s->uhci[1],
>>> "vt82c686b-usb-uhci");
 @@ -615,7 +617,15 @@ static void via_isa_realize(PCIDevice *d, Error
>>> **errp)
 isa_bus_irqs(isa_bus, s->isa_irqs);
 i8254_pit_init(isa_bus, 0x40, 0, NULL);
 i8257_dma_init(isa_bus, 0);
 -mc146818_rtc_init(isa_bus, 2000, NULL);
 +
 +/* RTC */
 +qdev_prop_set_int32(DEVICE(&s->rtc), "base_year", 2000);
 +if (!qdev_realize(DEVICE(&s->rtc), BUS(isa_bus), errp)) {
 +return;
 +}
 +object_property_add_alias(qdev_get_machine(), "rtc-time",
>>> OBJECT(&s->rtc),
 +  "date");
 +isa_connect_gpio_out(ISA_DEVICE(&s->rtc), 0, s->rtc.isairq);
 
 for (i = 0; i < PCI_CONFIG_HEADER_SIZE; i++) {
 if (i < PCI_COMMAND || i >= PCI_REVISION_ID) {
 
>>> 
>>> This actually introduces code duplication as all other places except piix4
>>> seem to still use the init function (probably to ensure that the rtc-time
>>> alias on the machine is properly set) so I'd keep this the same as
>>> everything else and drop this patch until this init function is removed
>>> from all other places as well.
>>> 
>> 
>> Hi Zoltan,
>> 
>> Thanks for the fast reply! Regarding code homogeneity and duplication I've
>> made a similar argument for mc146818_rtc_init() in the past [1] and I've
>> learnt that my patch went backwards. Incidentally, Peter mentioned vt686c
>> as a candidate for the embed-the-device-struct style which - again
>> incidentally - I've now done.
>
>I've seen patches embedding devices recently but in this case it looked not 
>that simple because of the rtc-time alias.
>
>> The rtc-time alias is actually only used by a couple of PPC machines where
>> Pegasos II is one of them. So the alias actually needs to be created only
>> for these machines, and identifying the cases where it has to be preserved
>> requires a lot of careful investigation. In the Pegasos II case this seems
>> especially complicated since one needs to look through several layers of
>> devices. During my work on the VT82xx south bridges I've gained some
>> knowledge such that I'd like to make this simplifying contribution.
>
>I've used it to implement the get-time-of-day rtas call with VOF in pegasos2 
>because otherwise it would need to access internals of the RTC model and/or 
>duplicate some code. Here's the message discussing this:
>
>https://lists.nongnu.org/archive/html/qemu-ppc/2021-10/msg00170.html
>
>so this alias still seems to be the simplest way.
>
>I think the primary function of this alias is not these ppc machines but some 
>QMP/HMP command or to make the guest time available from the monitor or 
>something like that so it's probably also used from there and therefore all 
>rtc probably should have it but I'm not sure about it.

Indeed, the alias seems to be a convenience for some QMP/HMP commands. AFAICS 
only the mc146818 sets the alias while it is probably not the only RTC modelled 
in QEMU. So I wonder why boards using another RTC don't need it and whether 
removing the alias constitutes a compatibility break. 

>> Our discussion makes me realize that the creation of the alias could now
>> actually be moved to the Pegasos II board. This way, the Pegasos II board
>> would both create and consume that alias, which seems to remove quite some
>> cognitive load. Do you agree? Would moving the alias to the board work for
>> you?
>
>Yes I think that would be better. This way the vt82xx and piix4 would be 
>similar and the alias would also be clear within the pegasos2 code and it also 
>has the machine directly at that point so it's clearer that way.

All in all I wonder if we need to preserve the alias for the fuloong2e board?

Best regards,
Bernhard
>
>Regards,
>BALATON Zoltan



[PATCH v2 1/1] monitor/hmp: print trace as option in help for log command

2022-08-29 Thread Dongli Zhang
The following is printed in the qemu-system-x86_64 command-line help
when CONFIG_TRACE_LOG is enabled:

$ qemu-system-x86_64 -d help
... ...
trace:PATTERN   enable trace events

Use "-d trace:help" to get a list of trace events.

However, they are not printed by the hmp "help log" command.

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
Changed since v1:
- change format for "none" as well.

 monitor/hmp.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/monitor/hmp.c b/monitor/hmp.c
index 15ca047..467fc84 100644
--- a/monitor/hmp.c
+++ b/monitor/hmp.c
@@ -285,10 +285,15 @@ void help_cmd(Monitor *mon, const char *name)
 if (!strcmp(name, "log")) {
 const QEMULogItem *item;
 monitor_printf(mon, "Log items (comma separated):\n");
-monitor_printf(mon, "%-10s %s\n", "none", "remove all logs");
+monitor_printf(mon, "%-15s %s\n", "none", "remove all logs");
 for (item = qemu_log_items; item->mask != 0; item++) {
-monitor_printf(mon, "%-10s %s\n", item->name, item->help);
+monitor_printf(mon, "%-15s %s\n", item->name, item->help);
 }
+#ifdef CONFIG_TRACE_LOG
+monitor_printf(mon, "trace:PATTERN   enable trace events\n");
+monitor_printf(mon, "\nUse \"info trace-events\" to get a list of "
+"trace events.\n\n");
+#endif
 return;
 }
 
-- 
1.8.3.1
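
For context, the two help paths touched by this patch can be exercised as
follows (a sketch; "PATTERN" is the placeholder used in the help text itself):

  $ qemu-system-x86_64 -d trace:help     # CLI: list available trace events
  (qemu) help log                        # HMP: now also documents trace:PATTERN
  (qemu) log trace:PATTERN               # HMP: enable matching trace events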




[PATCH v2 2/4] tests/x86: Add 'q35' machine type to ivshmem-test

2022-08-29 Thread Michael Labiuk via
Configure a PCI bridge in order to test ivshmem on the 'q35' machine type.
Signed-off-by: Michael Labiuk 
---
 tests/qtest/ivshmem-test.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/tests/qtest/ivshmem-test.c b/tests/qtest/ivshmem-test.c
index e23a97fa8e..c4ca7efc62 100644
--- a/tests/qtest/ivshmem-test.c
+++ b/tests/qtest/ivshmem-test.c
@@ -378,6 +378,32 @@ static void test_ivshmem_server(void)
 close(thread.pipe[0]);
 }
 
+static void device_del(QTestState *qtest, const char *id)
+{
+QDict *resp;
+
+resp = qtest_qmp(qtest,
+ "{'execute': 'device_del',"
+ " 'arguments': { 'id': %s } }", id);
+
+g_assert(qdict_haskey(resp, "return"));
+qobject_unref(resp);
+}
+
+static void test_ivshmem_hotplug_q35(void)
+{
+QTestState *qts = qtest_init("-object memory-backend-ram,size=1M,id=mb1 "
+ "-device pcie-root-port,id=p1 "
+ "-device pcie-pci-bridge,bus=p1,id=b1 "
+ "-machine q35");
+
+qtest_qmp_device_add(qts, "ivshmem-plain", "iv1",
+ "{'memdev': 'mb1', 'bus': 'b1'}");
+device_del(qts, "iv1");
+
+qtest_quit(qts);
+}
+
 #define PCI_SLOT_HP 0x06
 
 static void test_ivshmem_hotplug(void)
@@ -469,6 +495,7 @@ int main(int argc, char **argv)
 {
 int ret, fd;
 gchar dir[] = "/tmp/ivshmem-test.XX";
+const char *arch = qtest_get_arch();
 
 g_test_init(&argc, &argv, NULL);
 
@@ -494,6 +521,9 @@ int main(int argc, char **argv)
 qtest_add_func("/ivshmem/pair", test_ivshmem_pair);
 qtest_add_func("/ivshmem/server", test_ivshmem_server);
 }
+if (!strcmp(arch, "x86_64")) {
+qtest_add_func("/ivshmem/hotplug-q35", test_ivshmem_hotplug_q35);
+}
 
 out:
 ret = g_test_run();
-- 
2.34.1
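
If it helps review, the new subtest can also be run on its own. A sketch,
assuming an in-tree x86_64 build (the /x86_64 prefix is added by
qtest_add_func()):

  $ make check-qtest-x86_64
  $ QTEST_QEMU_BINARY=./qemu-system-x86_64 \
      tests/qtest/ivshmem-test -p /x86_64/ivshmem/hotplug-q35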




[PATCH v2 3/4] tests/x86: Add 'q35' machine type to hd-geo-test

2022-08-29 Thread Michael Labiuk via
Add a PCI bridge to test hotplug.
Duplicate the tests for plugging SCSI and virtio devices on the q35 machine type.
Signed-off-by: Michael Labiuk 
---
 tests/qtest/hd-geo-test.c | 148 ++
 1 file changed, 148 insertions(+)

diff --git a/tests/qtest/hd-geo-test.c b/tests/qtest/hd-geo-test.c
index 413cf964c0..256450729f 100644
--- a/tests/qtest/hd-geo-test.c
+++ b/tests/qtest/hd-geo-test.c
@@ -874,6 +874,78 @@ static void test_override_scsi_hot_unplug(void)
 g_free(args);
 }
 
+static void test_override_scsi_hot_unplug_q35(void)
+{
+QTestState *qts;
+char *joined_args;
+QFWCFG *fw_cfg;
+QDict *response;
+int i;
+TestArgs *args = create_args();
+CHSResult expected[] = {
+{
+"/pci@i0cf8/pci-bridge@1/pci-bridge@0/scsi@2/channel@0/disk@0,0",
+{1, 120, 30}
+},
+{
+"/pci@i0cf8/pci-bridge@1/pci-bridge@0/scsi@2/channel@0/disk@1,0",
+{20, 20, 20}
+},
+{NULL, {0, 0, 0} }
+};
+CHSResult expected2[] = {
+{
+"/pci@i0cf8/pci-bridge@1/pci-bridge@0/scsi@2/channel@0/disk@1,0",
+{20, 20, 20}
+},
+{NULL, {0, 0, 0} }
+};
+add_drive_with_mbr(args, empty_mbr, 1);
+add_drive_with_mbr(args, empty_mbr, 1);
+add_scsi_controller(args, "virtio-scsi-pci", "b1", 2);
+add_scsi_disk(args, 0, 0, 0, 0, 0, 1, 120, 30);
+add_scsi_disk(args, 1, 0, 0, 1, 0, 20, 20, 20);
+
+joined_args = g_strjoinv(" ", args->argv);
+
+qts = qtest_initf("-device pcie-root-port,id=p0 "
+  "-device pcie-pci-bridge,bus=p0,id=b1 "
+  "-machine q35 %s", joined_args);
+fw_cfg = pc_fw_cfg_init(qts);
+
+read_bootdevices(fw_cfg, expected);
+
+/* unplug device and restart */
+response = qtest_qmp(qts,
+ "{ 'execute': 'device_del',"
+ "  'arguments': {'id': 'scsi-disk0' }}");
+g_assert(response);
+g_assert(!qdict_haskey(response, "error"));
+qobject_unref(response);
+response = qtest_qmp(qts,
+ "{ 'execute': 'system_reset', 'arguments': { }}");
+g_assert(response);
+g_assert(!qdict_haskey(response, "error"));
+qobject_unref(response);
+
+qtest_qmp_eventwait(qts, "RESET");
+
+read_bootdevices(fw_cfg, expected2);
+
+g_free(joined_args);
+qtest_quit(qts);
+
+g_free(fw_cfg);
+
+for (i = 0; i < args->n_drives; i++) {
+unlink(args->drives[i]);
+free(args->drives[i]);
+}
+g_free(args->drives);
+g_strfreev(args->argv);
+g_free(args);
+}
+
 static void test_override_virtio_hot_unplug(void)
 {
 QTestState *qts;
@@ -934,6 +1006,77 @@ static void test_override_virtio_hot_unplug(void)
 g_free(args);
 }
 
+static void test_override_virtio_hot_unplug_q35(void)
+{
+QTestState *qts;
+char *joined_args;
+QFWCFG *fw_cfg;
+QDict *response;
+int i;
+TestArgs *args = create_args();
+CHSResult expected[] = {
+{
+"/pci@i0cf8/pci-bridge@2/pci-bridge@0/scsi@2/disk@0,0",
+{1, 120, 30}
+},
+{
+"/pci@i0cf8/pci-bridge@2/pci-bridge@0/scsi@3/disk@0,0",
+{20, 20, 20}
+},
+{NULL, {0, 0, 0} }
+};
+CHSResult expected2[] = {
+{
+"/pci@i0cf8/pci-bridge@2/pci-bridge@0/scsi@3/disk@0,0",
+{20, 20, 20}
+},
+{NULL, {0, 0, 0} }
+};
+add_drive_with_mbr(args, empty_mbr, 1);
+add_drive_with_mbr(args, empty_mbr, 1);
+add_virtio_disk(args, 0, "b1", 2, 1, 120, 30);
+add_virtio_disk(args, 1, "b1", 3, 20, 20, 20);
+
+joined_args = g_strjoinv(" ", args->argv);
+
+qts = qtest_initf("-device pcie-root-port,id=p0 "
+  "-device pcie-pci-bridge,bus=p0,id=b1 "
+  "-machine q35 %s", joined_args);
+fw_cfg = pc_fw_cfg_init(qts);
+
+read_bootdevices(fw_cfg, expected);
+
+/* unplug device and restart */
+response = qtest_qmp(qts,
+ "{ 'execute': 'device_del',"
+ "  'arguments': {'id': 'virtio-disk0' }}");
+g_assert(response);
+g_assert(!qdict_haskey(response, "error"));
+qobject_unref(response);
+response = qtest_qmp(qts,
+ "{ 'execute': 'system_reset', 'arguments': { }}");
+g_assert(response);
+g_assert(!qdict_haskey(response, "error"));
+qobject_unref(response);
+
+qtest_qmp_eventwait(qts, "RESET");
+
+read_bootdevices(fw_cfg, expected2);
+
+g_free(joined_args);
+qtest_quit(qts);
+
+g_free(fw_cfg);
+
+for (i = 0; i < args->n_drives; i++) {
+unlink(args->drives[i]);
+free(args->drives[i]);
+}
+g_free(args->drives);
+g_strfreev(args->argv);
+g_free(args);
+}
+
 int main(int argc, char **argv)
 {
 Backend i;
@@ -974,8 +1117,13 @@ int main(int argc, char **argv)
  

Re: [PATCH 4/9] hw/isa/vt82c686: QOM'ify via-ide creation

2022-08-29 Thread BALATON Zoltan

On Mon, 29 Aug 2022, BB wrote:

On 25 August 2022 01:18:56 CEST, BALATON Zoltan wrote:

On Thu, 25 Aug 2022, Bernhard Beschow wrote:

On Wed, Aug 24, 2022 at 3:54 PM BALATON Zoltan  wrote:

On Tue, 23 Aug 2022, Bernhard Beschow wrote:

The IDE function is closely tied to the ISA function (e.g. the IDE
interrupt routing happens there), so it makes sense that the IDE
function is instantiated within the southbridge itself. As a side effect,
duplicated code in the boards is resolved.

Signed-off-by: Bernhard Beschow 
---
configs/devices/mips64el-softmmu/default.mak |  1 -
hw/isa/Kconfig   |  1 +
hw/isa/vt82c686.c| 18 ++
hw/mips/fuloong2e.c  |  3 ---
hw/ppc/Kconfig   |  1 -
hw/ppc/pegasos2.c|  4 
6 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/configs/devices/mips64el-softmmu/default.mak

b/configs/devices/mips64el-softmmu/default.mak

index c610749ac1..d5188f7ea5 100644
--- a/configs/devices/mips64el-softmmu/default.mak
+++ b/configs/devices/mips64el-softmmu/default.mak
@@ -1,7 +1,6 @@
# Default configuration for mips64el-softmmu

include ../mips-softmmu/common.mak
-CONFIG_IDE_VIA=y
CONFIG_FULOONG=y
CONFIG_LOONGSON3V=y
CONFIG_ATI_VGA=y
diff --git a/hw/isa/Kconfig b/hw/isa/Kconfig
index d42143a991..20de7e9294 100644
--- a/hw/isa/Kconfig
+++ b/hw/isa/Kconfig
@@ -53,6 +53,7 @@ config VT82C686
select I8254
select I8257
select I8259
+select IDE_VIA
select MC146818RTC
select PARALLEL

diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
index 5582c0b179..37d9ed635d 100644
--- a/hw/isa/vt82c686.c
+++ b/hw/isa/vt82c686.c
@@ -17,6 +17,7 @@
#include "hw/isa/vt82c686.h"
#include "hw/pci/pci.h"
#include "hw/qdev-properties.h"
+#include "hw/ide/pci.h"
#include "hw/isa/isa.h"
#include "hw/isa/superio.h"
#include "hw/intc/i8259.h"
@@ -544,6 +545,7 @@ struct ViaISAState {
qemu_irq cpu_intr;
qemu_irq *isa_irqs;
ViaSuperIOState via_sio;
+PCIIDEState ide;
};

static const VMStateDescription vmstate_via = {
@@ -556,10 +558,18 @@ static const VMStateDescription vmstate_via = {
}
};

+static void via_isa_init(Object *obj)
+{
+ViaISAState *s = VIA_ISA(obj);
+
+object_initialize_child(obj, "ide", &s->ide, "via-ide");
+}
+
static const TypeInfo via_isa_info = {
.name  = TYPE_VIA_ISA,
.parent= TYPE_PCI_DEVICE,
.instance_size = sizeof(ViaISAState),
+.instance_init = via_isa_init,
.abstract  = true,
.interfaces= (InterfaceInfo[]) {
{ INTERFACE_CONVENTIONAL_PCI_DEVICE },
@@ -583,6 +593,7 @@ static void via_isa_realize(PCIDevice *d, Error

**errp)

{
ViaISAState *s = VIA_ISA(d);
DeviceState *dev = DEVICE(d);
+PCIBus *pci_bus = pci_get_bus(d);
qemu_irq *isa_irq;
ISABus *isa_bus;
int i;
@@ -607,6 +618,13 @@ static void via_isa_realize(PCIDevice *d, Error

**errp)

if (!qdev_realize(DEVICE(&s->via_sio), BUS(isa_bus), errp)) {
return;
}
+
+/* Function 1: IDE */
+qdev_prop_set_int32(DEVICE(&s->ide), "addr", d->devfn + 1);
+if (!qdev_realize(DEVICE(&s->ide), BUS(pci_bus), errp)) {
+return;
+}
+pci_ide_create_devs(PCI_DEVICE(&s->ide));


I'm not sure about moving pci_ide_create_devs() here. This is usually
called from board code and only piix4 seems to do this. Maybe that's wrong,
because if all IDE devices did this then one machine could not have more
than one different IDE device (like having an on-board IDE and adding a
PCI IDE controller with -device), so this probably belongs to the board
code, to add devices to its default IDE controller only, as this is machine
specific. Unless I'm wrong, in which case somebody will correct me.



Grepping the code it can be seen that it's always called right after
creating the IDE controllers. The only notable exception is the "sii3112"
device in the sam460ex board which is not emulated yet. Since the IDE


The problem with sii3112 is that it only has 2 channels because I did not 
bother to implement more, so pci_ide_create_devs() probably would not work as it 
assumes 4 channels. AFAIK this means that the short -hda, -cdrom, etc. 
convenience options don't work with sam460ex but you have to use the long way 
of creating ide-hd and ide-cd devices on the command line. I think there's a 
version of this controller with 4 channels, maybe called sii3114 or similar and 
it would be easy to enhance the current model for that but I haven't done that. 
What's not emulated on sam460ex is the on-board SATA ports of the SoC because 
it's too complex and all guest OSes have sii31xx drivers so it was simpler to 
implement that instead. The network port is the same as we already have working 
PCI network cards so I did not try to implement the 460EX network ports.


controllers are often created in board code, this means
pci_ide_create_devs() is called there as well.

Grouping these calls 

[PATCH v2 1/4] tests/x86: Add subtest with 'q35' machine type to device-plug-test

2022-08-29 Thread Michael Labiuk via
Configure a PCI bridge to plug and unplug a PCI device.
Signed-off-by: Michael Labiuk 
---
 tests/qtest/device-plug-test.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/tests/qtest/device-plug-test.c b/tests/qtest/device-plug-test.c
index 2e3137843e..2f07b37ba1 100644
--- a/tests/qtest/device-plug-test.c
+++ b/tests/qtest/device-plug-test.c
@@ -165,6 +165,26 @@ static void test_spapr_phb_unplug_request(void)
 qtest_quit(qtest);
 }
 
+static void test_q35_pci_unplug_request(void)
+{
+
+QTestState *qtest = qtest_initf("-machine q35 "
+"-device pcie-root-port,id=p1 "
+"-device pcie-pci-bridge,bus=p1,id=b1 "
+"-device virtio-mouse-pci,bus=b1,id=dev0");
+
+/*
+ * Request device removal. As the guest is not running, the request won't
+ * be processed. However during system reset, the removal will be
+ * handled, removing the device.
+ */
+device_del(qtest, "dev0");
+system_reset(qtest);
+wait_device_deleted_event(qtest, "dev0");
+
+qtest_quit(qtest);
+}
+
 int main(int argc, char **argv)
 {
 const char *arch = qtest_get_arch();
@@ -195,5 +215,11 @@ int main(int argc, char **argv)
test_spapr_phb_unplug_request);
 }
 
+if (!strcmp(arch, "x86_64")) {
+qtest_add_func("/device-plug/q35-pci-unplug-request",
+   test_q35_pci_unplug_request);
+
+}
+
 return g_test_run();
 }
-- 
2.34.1
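
The flow this subtest automates looks roughly like the following over QMP
(a sketch; return values and event payload abbreviated):

  -> { "execute": "device_del", "arguments": { "id": "dev0" } }
  <- { "return": {} }
  -> { "execute": "system_reset" }
  <- { "return": {} }
  <- { "event": "DEVICE_DELETED", "data": { "device": "dev0", "path": "/machine/peripheral/dev0" } }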




[PATCH v2 4/4] tests/x86: Add 'q35' machine type to drive_del-test

2022-08-29 Thread Michael Labiuk via
Configure a PCI bridge and also run the tests on the 'q35' machine type.
Signed-off-by: Michael Labiuk 
---
 tests/qtest/drive_del-test.c | 111 +++
 1 file changed, 111 insertions(+)

diff --git a/tests/qtest/drive_del-test.c b/tests/qtest/drive_del-test.c
index 5e6d58b4dd..3a2ddecf22 100644
--- a/tests/qtest/drive_del-test.c
+++ b/tests/qtest/drive_del-test.c
@@ -258,6 +258,27 @@ static void test_cli_device_del(void)
 qtest_quit(qts);
 }
 
+static void test_cli_device_del_q35(void)
+{
+QTestState *qts;
+
+/*
+ * -drive/-device and device_del.  Start with a drive used by a
+ * device that unplugs after reset.
+ */
+qts = qtest_initf("-drive if=none,id=drive0,file=null-co://,"
+  "file.read-zeroes=on,format=raw "
+  "-machine q35 -device pcie-root-port,id=p1 "
+  "-device pcie-pci-bridge,bus=p1,id=b1 "
+  "-device virtio-blk-%s,drive=drive0,bus=b1,id=dev0",
+  qvirtio_get_dev_type());
+
+device_del(qts, true);
+g_assert(!has_drive(qts));
+
+qtest_quit(qts);
+}
+
 static void test_empty_device_del(void)
 {
 QTestState *qts;
@@ -294,6 +315,45 @@ static void test_device_add_and_del(void)
 qtest_quit(qts);
 }
 
+static void device_add_q35(QTestState *qts)
+{
+QDict *response;
+char driver[32];
+snprintf(driver, sizeof(driver), "virtio-blk-%s",
+ qvirtio_get_dev_type());
+
+response = qtest_qmp(qts, "{'execute': 'device_add',"
+  " 'arguments': {"
+  "   'driver': %s,"
+  "   'drive': 'drive0',"
+  "   'id': 'dev0',"
+  "   'bus': 'b1'"
+  "}}", driver);
+g_assert(response);
+g_assert(qdict_haskey(response, "return"));
+qobject_unref(response);
+}
+
+static void test_device_add_and_del_q35(void)
+{
+QTestState *qts;
+
+/*
+ * -drive/device_add and device_del.  Start with a drive used by a
+ * device that unplugs after reset.
+ */
+qts = qtest_initf("-machine q35 -device pcie-root-port,id=p1 "
+ "-device pcie-pci-bridge,bus=p1,id=b1 "
+ "-drive if=none,id=drive0,file=null-co://,"
+ "file.read-zeroes=on,format=raw");
+
+device_add_q35(qts);
+device_del(qts, true);
+g_assert(!has_drive(qts));
+
+qtest_quit(qts);
+}
+
 static void test_drive_add_device_add_and_del(void)
 {
 QTestState *qts;
@@ -318,6 +378,25 @@ static void test_drive_add_device_add_and_del(void)
 qtest_quit(qts);
 }
 
+static void test_drive_add_device_add_and_del_q35(void)
+{
+QTestState *qts;
+
+qts = qtest_init("-machine q35 -device pcie-root-port,id=p1 "
+ "-device pcie-pci-bridge,bus=p1,id=b1");
+
+/*
+ * drive_add/device_add and device_del.  The drive is used by a
+ * device that unplugs after reset.
+ */
+drive_add_with_media(qts);
+device_add_q35(qts);
+device_del(qts, true);
+g_assert(!has_drive(qts));
+
+qtest_quit(qts);
+}
+
 static void test_blockdev_add_device_add_and_del(void)
 {
 QTestState *qts;
@@ -342,8 +421,29 @@ static void test_blockdev_add_device_add_and_del(void)
 qtest_quit(qts);
 }
 
+static void test_blockdev_add_device_add_and_del_q35(void)
+{
+QTestState *qts;
+
+qts = qtest_init("-machine q35 -device pcie-root-port,id=p1 "
+ "-device pcie-pci-bridge,bus=p1,id=b1");
+
+/*
+ * blockdev_add/device_add and device_del.  The drive is used by a
+ * device that unplugs after reset, but it doesn't go away.
+ */
+blockdev_add_with_media(qts);
+device_add_q35(qts);
+device_del(qts, true);
+g_assert(has_blockdev(qts));
+
+qtest_quit(qts);
+}
+
 int main(int argc, char **argv)
 {
+const char *arch = qtest_get_arch();
+
 g_test_init(&argc, &argv, NULL);
 
 qtest_add_func("/drive_del/without-dev", test_drive_without_dev);
@@ -363,6 +463,17 @@ int main(int argc, char **argv)
test_empty_device_del);
 qtest_add_func("/device_del/blockdev",
test_blockdev_add_device_add_and_del);
+
+if (!strcmp(arch, "x86_64")) {
+qtest_add_func("/device_del/drive/cli_device_q35",
+   test_cli_device_del_q35);
+qtest_add_func("/device_del/drive/device_add_q35",
+   test_device_add_and_del_q35);
+qtest_add_func("/device_del/drive/drive_add_device_add_q35",
+   test_drive_add_device_add_and_del_q35);
+qtest_add_func("/device_del/blockdev_q35",
+   test_blockdev_add_device_add_and_del_q35);
+}
 }
 
 return g_test_run();
-- 
2.34.1




[PATCH v2 0/4] Add 'q35' machine type to hotplug tests

2022-08-29 Thread Michael Labiuk via
Add a PCI bridge so the hotplug tests can run on the q35 machine type.
The hotplug tests were bound to the 'pc' machine type by commit 7b172333f1b.

Michael Labiuk (4):
  tests/x86: Add subtest with 'q35' machine type to device-plug-test
  tests/x86: Add 'q35' machine type to ivshmem-test
  tests/x86: Add 'q35' machine type to hd-geo-test
  tests/x86: Add 'q35' machine type to drive_del-test

 tests/qtest/device-plug-test.c |  26 ++
 tests/qtest/drive_del-test.c   | 111 +
 tests/qtest/hd-geo-test.c  | 148 +
 tests/qtest/ivshmem-test.c |  30 +++
 4 files changed, 315 insertions(+)

-- 
2.34.1




[PATCH RFC 13/13] migration: Send requested page directly in rp-return thread

2022-08-29 Thread Peter Xu
With all the facilities ready, send the requested page directly in the
rp-return thread rather than queuing it in the request queue, if and only
if postcopy preempt is enabled.  It can achieve so because it uses separate
channel for sending urgent pages.  The only shared data is bitmap and it's
protected by the bitmap_mutex.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 108 
 1 file changed, 108 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index ef89812c69..e731a70255 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -539,6 +539,8 @@ static QemuThread *decompress_threads;
 static QemuMutex decomp_done_lock;
 static QemuCond decomp_done_cond;
 
+static int ram_save_host_page_urgent(PageSearchStatus *pss);
+
 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock 
*block,
  ram_addr_t offset, uint8_t *source_buf);
 
@@ -553,6 +555,16 @@ static void pss_init(PageSearchStatus *pss, RAMBlock *rb, 
ram_addr_t page)
 pss->complete_round = false;
 }
 
+/*
+ * Check whether two PSSs are actively sending the same page.  Return true
+ * if it is, false otherwise.
+ */
+static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
+{
+return pss1->host_page_sending && pss2->host_page_sending &&
+(pss1->host_page_start == pss2->host_page_start);
+}
+
 static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
@@ -2250,6 +2262,53 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t 
start, ram_addr_t len)
 return -1;
 }
 
+/*
+ * When with postcopy preempt, we send back the page directly in the
+ * rp-return thread.
+ */
+if (postcopy_preempt_active()) {
+ram_addr_t page_start = start >> TARGET_PAGE_BITS;
+size_t page_size = qemu_ram_pagesize(ramblock);
+PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
+int ret = 0;
+
+qemu_mutex_lock(&rs->bitmap_mutex);
+
+pss_init(pss, ramblock, page_start);
+/* Always use the preempt channel, and make sure it's there */
+pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
+pss->postcopy_requested = true;
+assert(pss->pss_channel);
+
+/*
+ * It must be either one or multiple of host page size.  Just
+ * assert; if something wrong we're mostly split brain anyway.
+ */
+assert(len % page_size == 0);
+while (len) {
+if (ram_save_host_page_urgent(pss)) {
+error_report("%s: ram_save_host_page_urgent() failed: "
+ "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
+ __func__, ramblock->idstr, start);
+ret = -1;
+break;
+}
+/*
+ * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
+ * will automatically be moved and point to the next host page
+ * we're going to send, so no need to update here.
+ *
+ * Normally QEMU never sends >1 host page in requests, so
+ * logically we don't even need that as the loop should only
+ * run once, but just to be consistent.
+ */
+len -= page_size;
+};
+qemu_mutex_unlock(&rs->bitmap_mutex);
+
+return ret;
+}
+
 struct RAMSrcPageRequest *new_entry =
 g_new0(struct RAMSrcPageRequest, 1);
 new_entry->rb = ramblock;
@@ -2528,6 +2587,55 @@ static void pss_host_page_finish(PageSearchStatus *pss)
 pss->host_page_end = 0;
 }
 
+/*
+ * Send an urgent host page specified by `pss'.  Need to be called with
+ * bitmap_mutex held.
+ *
+ * Returns 0 if saving the host page succeeded, -1 otherwise.
+ */
+static int ram_save_host_page_urgent(PageSearchStatus *pss)
+{
+bool page_dirty, sent = false;
+RAMState *rs = ram_state;
+int ret = 0;
+
+trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
+pss_host_page_prepare(pss);
+
+/*
+ * If precopy is sending the same page, let it be done in precopy, or
+ * we could send the same page in two channels and none of them will
+ * receive the whole page.
+ */
+if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
+trace_postcopy_preempt_hit(pss->block->idstr,
+   pss->page << TARGET_PAGE_BITS);
+return 0;
+}
+
+do {
+page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
+
+if (page_dirty) {
+/* Be strict to return code; it must be 1, or what else? */
+if (ram_save_target_page(rs, pss) != 1) {
+error_report_once("%s: ram_save_target_page failed", __func__);
+ret = -1;
+goto out;
+}
+sent = true;
+}
+pss_find_next_dirty(pss);
+} while 

[PATCH RFC 08/13] migration: Teach PSS about host page

2022-08-29 Thread Peter Xu
Migration code has a lot to do with host pages.  Teaching the PSS core about
the idea of a host page helps a lot and makes the code cleaner.  Meanwhile,
this prepares for future changes that can leverage the new PSS helpers
that this patch introduces to send a host page in another thread.

Three more fields are introduced for this:

  (1) host_page_sending: this is set to true when QEMU is sending a host
  page, false otherwise.

  (2) host_page_{start|end}: this points to the end of host page, and it's
  only valid when host_page_sending==true.

For example, when we look up the next dirty page on the ramblock, with
host_page_sending==true, we'll not even try to look for anything beyond the
current host page.  This can be more efficient than the current code
because currently we'll set pss->page to the next dirty bit (which can be
past the current host page) and reset it to the host page boundary if we
find an overflow.  The latter is not efficient as we don't need to scan
past the host page boundary.

Meanwhile, with the above we can easily make migration_bitmap_find_dirty()
self-contained by updating pss->page properly.  The rs parameter is removed
because it's not even used in the old code.

When sending a host page, we should use the pss helpers like this:

  - pss_host_page_prepare(pss): called before sending host page
  - pss_within_range(pss): whether we're still working on the current host page?
  - pss_host_page_finish(pss): called after sending a host page

If there'll be another function to send host page (e.g. in return path
thread) in the future, it should follow the same style.
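
To make the calling convention concrete, a minimal sketch of a sender loop
using these helpers (illustrative only; locking, error handling and tracing
omitted, names taken from this series):

  pss_host_page_prepare(pss);                   /* set host_page_start/end     */
  do {
      if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
          ram_save_target_page(rs, pss);        /* send one target page        */
      }
      pss_find_next_dirty(pss);                 /* stays within this host page */
  } while (pss_within_range(pss));              /* still inside the host page? */
  pss_host_page_finish(pss);                    /* clear host page state       */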

Signed-off-by: Peter Xu 
---
 migration/ram.c | 91 +++--
 1 file changed, 73 insertions(+), 18 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 2f37520be4..e2b922ad59 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -474,6 +474,11 @@ struct PageSearchStatus {
  * postcopy pages via postcopy preempt channel.
  */
 bool postcopy_target_channel;
+/* Whether we're sending a host page */
+bool  host_page_sending;
+/* The start/end of current host page.  Invalid if 
host_page_sending==false */
+unsigned long host_page_start;
+unsigned long host_page_end;
 };
 typedef struct PageSearchStatus PageSearchStatus;
 
@@ -851,26 +856,36 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 }
 
 /**
- * migration_bitmap_find_dirty: find the next dirty page from start
+ * pss_find_next_dirty: find the next dirty page of current ramblock
  *
- * Returns the page offset within memory region of the start of a dirty page
+ * This function updates pss->page to point to the next dirty page index
+ * within the ramblock, or the end of ramblock when nothing found.
  *
  * @rs: current RAM state
- * @rb: RAMBlock where to search for dirty pages
- * @start: page where we start the search
+ * @pss: the current page search status
  */
-static inline
-unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
-  unsigned long start)
+static void pss_find_next_dirty(PageSearchStatus *pss)
 {
+RAMBlock *rb = pss->block;
 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 unsigned long *bitmap = rb->bmap;
 
 if (ramblock_is_ignored(rb)) {
-return size;
+/* Points directly to the end, so we know no dirty page */
+pss->page = size;
+return;
 }
 
-return find_next_bit(bitmap, size, start);
+/*
+ * If during sending a host page, only look for dirty pages within the
+ * current host page being send.
+ */
+if (pss->host_page_sending) {
+assert(pss->host_page_end);
+size = MIN(size, pss->host_page_end);
+}
+
+pss->page = find_next_bit(bitmap, size, pss->page);
 }
 
 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
@@ -1555,7 +1570,9 @@ static bool find_dirty_block(RAMState *rs, 
PageSearchStatus *pss, bool *again)
 pss->postcopy_requested = false;
 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
 
-pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
+/* Update pss->page for the next dirty bit in ramblock */
+pss_find_next_dirty(pss);
+
 if (pss->complete_round && pss->block == rs->last_seen_block &&
 pss->page >= rs->last_page) {
 /*
@@ -2445,6 +2462,44 @@ static void postcopy_preempt_reset_channel(RAMState *rs)
 }
 }
 
+/* Should be called before sending a host page */
+static void pss_host_page_prepare(PageSearchStatus *pss)
+{
+/* How many guest pages are there in one host page? */
+size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
+
+pss->host_page_sending = true;
+pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
+pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
+}
+
+/*
+ * Whether the page pointed by PSS is within the host page being sent.
+ * Must be called 

[PATCH RFC 12/13] migration: Move last_sent_block into PageSearchStatus

2022-08-29 Thread Peter Xu
Since we use PageSearchStatus to represent a channel, it makes perfect
sense to make last_sent_block (aka, the RAM_SAVE_FLAG_CONTINUE optimization)
per-channel rather than global, because each channel can be sending
different pages of different ramblocks.

Hence move it from RAMState into PageSearchStatus.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 71 -
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 2be9b91ffc..ef89812c69 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -89,6 +89,8 @@ XBZRLECacheStats xbzrle_counters;
 struct PageSearchStatus {
 /* The migration channel used for a specific host page */
 QEMUFile*pss_channel;
+/* Last block from where we have sent data */
+RAMBlock *last_sent_block;
 /* Current block being searched */
 RAMBlock*block;
 /* Current page to search from */
@@ -368,8 +370,6 @@ struct RAMState {
 int uffdio_fd;
 /* Last block that we have visited searching for dirty pages */
 RAMBlock *last_seen_block;
-/* Last block from where we have sent data */
-RAMBlock *last_sent_block;
 /* Last dirty target page we have sent */
 ram_addr_t last_page;
 /* last ram version we have seen */
@@ -677,16 +677,17 @@ exit:
  *
  * Returns the number of bytes written
  *
- * @f: QEMUFile where to send the data
+ * @pss: current PSS channel status
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  *  in the lower bits, it contains flags
  */
-static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
+static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
ram_addr_t offset)
 {
 size_t size, len;
-bool same_block = (block == rs->last_sent_block);
+bool same_block = (block == pss->last_sent_block);
+QEMUFile *f = pss->pss_channel;
 
 if (same_block) {
 offset |= RAM_SAVE_FLAG_CONTINUE;
@@ -699,7 +700,7 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f,  
RAMBlock *block,
 qemu_put_byte(f, len);
 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 size += 1 + len;
-rs->last_sent_block = block;
+pss->last_sent_block = block;
 }
 return size;
 }
@@ -783,17 +784,19 @@ static void xbzrle_cache_zero_page(RAMState *rs, 
ram_addr_t current_addr)
  *  -1 means that xbzrle would be longer than normal
  *
  * @rs: current RAM state
+ * @pss: current PSS channel
  * @current_data: pointer to the address of the page contents
  * @current_addr: addr of the page
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
+static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 uint8_t **current_data, ram_addr_t current_addr,
 RAMBlock *block, ram_addr_t offset)
 {
 int encoded_len = 0, bytes_xbzrle;
 uint8_t *prev_cached_page;
+QEMUFile *file = pss->pss_channel;
 
 if (!cache_is_cached(XBZRLE.cache, current_addr,
  ram_counters.dirty_sync_count)) {
@@ -858,7 +861,7 @@ static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
 }
 
 /* Send XBZRLE based compressed page */
-bytes_xbzrle = save_page_header(rs, file, block,
+bytes_xbzrle = save_page_header(pss, block,
 offset | RAM_SAVE_FLAG_XBZRLE);
 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 qemu_put_be16(file, encoded_len);
@@ -1286,19 +1289,19 @@ static void ram_release_page(const char *rbname, 
uint64_t offset)
  * Returns the size of data written to the file, 0 means the page is not
  * a zero page
  *
- * @rs: current RAM state
- * @file: the file where the data is saved
+ * @pss: current PSS channel
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
+static int save_zero_page_to_file(PageSearchStatus *pss,
   RAMBlock *block, ram_addr_t offset)
 {
 uint8_t *p = block->host + offset;
+QEMUFile *file = pss->pss_channel;
 int len = 0;
 
 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
-len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
+len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
 qemu_put_byte(file, 0);
 len += 1;
 ram_release_page(block->idstr, offset);
@@ -1311,14 +1314,14 @@ static int save_zero_page_to_file(RAMState *rs, 
QEMUFile *file,
  *
  * Returns the number of pages written.
  *
- * @rs: current RAM state
+ * @pss: current PSS channel
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */

[PATCH RFC 11/13] migration: Make PageSearchStatus part of RAMState

2022-08-29 Thread Peter Xu
We used to allocate the PSS structure on the stack for precopy when sending
pages.  Make it static, so that it describes per-channel RAM migration status.

Here we declared RAM_CHANNEL_MAX instances, preparing for postcopy to use
it, even though this patch has not yet started using the 2nd instance.

This should not have any functional change per se, but it already starts to
export PSS information via the RAMState, so that e.g. one PSS channel can
start to reference the other PSS channel.

Always protect PSS access using the same RAMState.bitmap_mutex.  We already
do so, so no code change is needed, just a comment update.  Maybe we should
consider renaming bitmap_mutex some day, as it's going to be a more commonly
used and bigger mutex for RAM states, but leave that for later.
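
In other words, the locking rule after this patch is simply (a sketch):

  qemu_mutex_lock(&rs->bitmap_mutex);
  PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
  /* ... look up / update the dirty bitmap and per-channel PSS state ... */
  qemu_mutex_unlock(&rs->bitmap_mutex);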

Signed-off-by: Peter Xu 
---
 migration/ram.c | 116 ++--
 1 file changed, 63 insertions(+), 53 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index bdfcc6171a..2be9b91ffc 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -85,6 +85,46 @@
 
 XBZRLECacheStats xbzrle_counters;
 
+/* used by the search for pages to send */
+struct PageSearchStatus {
+/* The migration channel used for a specific host page */
+QEMUFile*pss_channel;
+/* Current block being searched */
+RAMBlock*block;
+/* Current page to search from */
+unsigned long page;
+/* Set once we wrap around */
+bool complete_round;
+/*
+ * [POSTCOPY-ONLY] Whether current page is explicitly requested by
+ * postcopy.  When set, the request is "urgent" because the dest QEMU
+ * threads are waiting for us.
+ */
+bool postcopy_requested;
+/*
+ * [POSTCOPY-ONLY] The target channel to use to send current page.
+ *
+ * Note: This may _not_ match with the value in postcopy_requested
+ * above. Let's imagine the case where the postcopy request is exactly
+ * the page that we're sending in progress during precopy. In this case
+ * we'll have postcopy_requested set to true but the target channel
+ * will be the precopy channel (so that we don't split brain on that
+ * specific page since the precopy channel already contains partial of
+ * that page data).
+ *
+ * Besides that specific use case, postcopy_target_channel should
+ * always be equal to postcopy_requested, because by default we send
+ * postcopy pages via postcopy preempt channel.
+ */
+bool postcopy_target_channel;
+/* Whether we're sending a host page */
+bool  host_page_sending;
+/* The start/end of current host page.  Invalid if 
host_page_sending==false */
+unsigned long host_page_start;
+unsigned long host_page_end;
+};
+typedef struct PageSearchStatus PageSearchStatus;
+
 /* struct contains XBZRLE cache and a static page
used by the compression */
 static struct {
@@ -319,6 +359,11 @@ typedef struct {
 struct RAMState {
 /* QEMUFile used for this migration */
 QEMUFile *f;
+/*
+ * PageSearchStatus structures for the channels when send pages.
+ * Protected by the bitmap_mutex.
+ */
+PageSearchStatus pss[RAM_CHANNEL_MAX];
 /* UFFD file descriptor, used in 'write-tracking' migration */
 int uffdio_fd;
 /* Last block that we have visited searching for dirty pages */
@@ -362,7 +407,12 @@ struct RAMState {
 uint64_t target_page_count;
 /* number of dirty bits in the bitmap */
 uint64_t migration_dirty_pages;
-/* Protects modification of the bitmap and migration dirty pages */
+/*
+ * Protects:
+ * - dirty/clear bitmap
+ * - migration_dirty_pages
+ * - pss structures
+ */
 QemuMutex bitmap_mutex;
 /* The RAMBlock used in the last src_page_requests */
 RAMBlock *last_req_rb;
@@ -444,46 +494,6 @@ void dirty_sync_missed_zero_copy(void)
 ram_counters.dirty_sync_missed_zero_copy++;
 }
 
-/* used by the search for pages to send */
-struct PageSearchStatus {
-/* The migration channel used for a specific host page */
-QEMUFile*pss_channel;
-/* Current block being searched */
-RAMBlock*block;
-/* Current page to search from */
-unsigned long page;
-/* Set once we wrap around */
-bool complete_round;
-/*
- * [POSTCOPY-ONLY] Whether current page is explicitly requested by
- * postcopy.  When set, the request is "urgent" because the dest QEMU
- * threads are waiting for us.
- */
-bool postcopy_requested;
-/*
- * [POSTCOPY-ONLY] The target channel to use to send current page.
- *
- * Note: This may _not_ match with the value in postcopy_requested
- * above. Let's imagine the case where the postcopy request is exactly
- * the page that we're sending in progress during precopy. In this case
- * we'll have postcopy_requested set to true but the target channel
- * will be the precopy channel (so that we 

[PATCH RFC 09/13] migration: Introduce pss_channel

2022-08-29 Thread Peter Xu
Introduce pss_channel for PageSearchStatus, define it as "the migration
channel to be used to transfer this host page".

We used to have rs->f, which is a mirror to MigrationState.to_dst_file.

After postcopy preempt initial version, rs->f can be dynamically changed
depending on which channel we want to use.

But that later work still doesn't grant full concurrency of sending pages
in e.g. different threads, because rs->f can either be the PRECOPY channel
or POSTCOPY channel.  This needs to be per-thread too.

PageSearchStatus is actually a good piece of struct which we can leverage
if we want to have multiple threads sending pages.  Sending a single guest
page may not make sense, so we make the granule the "host page", and in
the PSS structure we allow specifying a QEMUFile* used to migrate a specific host
page.  Then we open the possibility to specify different channels in
different threads with different PSS structures.

The PSS prefix can be slightly misleading here because e.g. for the
upcoming usage of postcopy channel/thread it's not "searching" (or,
scanning) at all but sending the explicit page that was requested.  However,
since PSS has existed for some years, keep it as-is until someone complains.

This patch mostly (simply) replaces rs->f with pss->pss_channel. No
functional change is intended yet, but it does prepare for finally dropping
rs->f and making ram_save_guest_page() thread safe.
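
To illustrate where this leads (a sketch only, not code from this patch; the
pss[] array and the dedicated preempt channel come from other patches in this
area):

    PageSearchStatus *p1 = &rs->pss[RAM_CHANNEL_PRECOPY];
    PageSearchStatus *p2 = &rs->pss[RAM_CHANNEL_POSTCOPY];

    /* Each PSS carries its own channel... */
    p1->pss_channel = migrate_get_current()->to_dst_file;
    p2->pss_channel = migrate_get_current()->postcopy_qemufile_src;

    /* ...so two threads could push host pages concurrently. */
    ram_save_host_page(rs, p1);    /* background precopy stream */
    ram_save_host_page(rs, p2);    /* urgent requested page     */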

Signed-off-by: Peter Xu 
---
 migration/ram.c | 70 +++--
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index e2b922ad59..adcc57c584 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -446,6 +446,8 @@ void dirty_sync_missed_zero_copy(void)
 
 /* used by the search for pages to send */
 struct PageSearchStatus {
+/* The migration channel used for a specific host page */
+QEMUFile*pss_channel;
 /* Current block being searched */
 RAMBlock*block;
 /* Current page to search from */
@@ -768,9 +770,9 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t 
current_addr)
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
-ram_addr_t current_addr, RAMBlock *block,
-ram_addr_t offset)
+static int save_xbzrle_page(RAMState *rs, QEMUFile *file,
+uint8_t **current_data, ram_addr_t current_addr,
+RAMBlock *block, ram_addr_t offset)
 {
 int encoded_len = 0, bytes_xbzrle;
 uint8_t *prev_cached_page;
@@ -838,11 +840,11 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 }
 
 /* Send XBZRLE based compressed page */
-bytes_xbzrle = save_page_header(rs, rs->f, block,
+bytes_xbzrle = save_page_header(rs, file, block,
 offset | RAM_SAVE_FLAG_XBZRLE);
-qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
-qemu_put_be16(rs->f, encoded_len);
-qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
+qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
+qemu_put_be16(file, encoded_len);
+qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 bytes_xbzrle += encoded_len + 1 + 2;
 /*
  * Like compressed_size (please see update_compress_thread_counts),
@@ -1295,9 +1297,10 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile 
*file,
  * @block: block that contains the page we want to send
  * @offset: offset inside the block for the page
  */
-static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
+static int save_zero_page(RAMState *rs, QEMUFile *file, RAMBlock *block,
+  ram_addr_t offset)
 {
-int len = save_zero_page_to_file(rs, rs->f, block, offset);
+int len = save_zero_page_to_file(rs, file, block, offset);
 
 if (len) {
 ram_counters.duplicate++;
@@ -1314,15 +1317,15 @@ static int save_zero_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset)
  *
  * Return true if the pages has been saved, otherwise false is returned.
  */
-static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
-  int *pages)
+static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
+  ram_addr_t offset, int *pages)
 {
 uint64_t bytes_xmit = 0;
 int ret;
 
 *pages = -1;
-ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
-&bytes_xmit);
+ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
+TARGET_PAGE_SIZE, &bytes_xmit);
 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
 return false;
 }
@@ -1356,17 +1359,17 @@ static bool control_save_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset,
  * @buf: the page to be 

[PATCH RFC 07/13] migration: Remove RAMState.f references in compression code

2022-08-29 Thread Peter Xu
Remove references to RAMState.f in compress_page_with_multi_thread() and
flush_compressed_data().

The compression code isn't compatible with having more than one channel by
default (it currently wouldn't know which channel to flush the compressed
data to), so to keep it simple we always flush on the default to_dst_file
port until someone wants to add multi-channel support, since rs->f can now
really change (after postcopy preempt was introduced).

There should be no functional change at all after the patch is applied:
wherever rs->f was referenced in the compression code, it must have been
to_dst_file.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 43893d0a40..2f37520be4 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1461,6 +1461,7 @@ static bool save_page_use_compression(RAMState *rs);
 
 static void flush_compressed_data(RAMState *rs)
 {
+MigrationState *ms = migrate_get_current();
 int idx, len, thread_count;
 
 if (!save_page_use_compression(rs)) {
@@ -1479,7 +1480,7 @@ static void flush_compressed_data(RAMState *rs)
 for (idx = 0; idx < thread_count; idx++) {
 qemu_mutex_lock(&comp_param[idx].mutex);
 if (!comp_param[idx].quit) {
-len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
+len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
 /*
  * it's safe to fetch zero_page without holding comp_done_lock
  * as there is no further request submitted to the thread,
@@ -1498,11 +1499,11 @@ static inline void set_compress_params(CompressParam 
*param, RAMBlock *block,
 param->offset = offset;
 }
 
-static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
-   ram_addr_t offset)
+static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
 {
 int idx, thread_count, bytes_xmit = -1, pages = -1;
 bool wait = migrate_compress_wait_thread();
+MigrationState *ms = migrate_get_current();
 
 thread_count = migrate_compress_threads();
 qemu_mutex_lock(&comp_done_lock);
@@ -1510,7 +1511,8 @@ retry:
 for (idx = 0; idx < thread_count; idx++) {
 if (comp_param[idx].done) {
 comp_param[idx].done = false;
-bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
+bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
+comp_param[idx].file);
 qemu_mutex_lock(&comp_param[idx].mutex);
 set_compress_params(&comp_param[idx], block, offset);
 qemu_cond_signal(&comp_param[idx].cond);
@@ -2263,7 +2265,7 @@ static bool save_compress_page(RAMState *rs, RAMBlock 
*block, ram_addr_t offset)
 return false;
 }
 
-if (compress_page_with_multi_thread(rs, block, offset) > 0) {
+if (compress_page_with_multi_thread(block, offset) > 0) {
 return true;
 }
 
-- 
2.32.0




[PATCH RFC 06/13] migration: Trivial cleanup save_page_header() on same block check

2022-08-29 Thread Peter Xu
The 2nd check on RAM_SAVE_FLAG_CONTINUE is a bit redundant.  Use a boolean
to be clearer.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 612c7dd708..43893d0a40 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -661,14 +661,15 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f, 
 RAMBlock *block,
ram_addr_t offset)
 {
 size_t size, len;
+bool same_block = (block == rs->last_sent_block);
 
-if (block == rs->last_sent_block) {
+if (same_block) {
 offset |= RAM_SAVE_FLAG_CONTINUE;
 }
 qemu_put_be64(f, offset);
 size = 8;
 
-if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
+if (!same_block) {
 len = strlen(block->idstr);
 qemu_put_byte(f, len);
 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
-- 
2.32.0




[PATCH RFC 04/13] migration: Cleanup xbzrle zero page cache update logic

2022-08-29 Thread Peter Xu
The major change is to replace "!save_page_use_compression()" with
"xbzrle_enabled" to make it clear.

Reasoning:

(1) When compression enabled, "!save_page_use_compression()" is exactly the
same as checking "xbzrle_enabled".

(2) When compression disabled, "!save_page_use_compression()" always returns
true.  We used to try calling the xbzrle code, but after this change we
won't, and we shouldn't need to.

While at it, drop the xbzrle_enabled check in xbzrle_cache_zero_page()
because with this change it's not needed anymore.
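
For reference, the equivalence relies on the rough shape of the helper
(sketched here from the current code, not part of this patch):

    static bool save_page_use_compression(RAMState *rs)
    {
        if (!migrate_use_compression()) {
            return false;
        }
        /* Compression gives way to xbzrle once xbzrle kicks in. */
        if (rs->xbzrle_enabled) {
            return false;
        }
        return true;
    }

With compression enabled, "!save_page_use_compression(rs)" therefore equals
"rs->xbzrle_enabled"; with compression disabled it is always true, matching
(1) and (2) above.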

Signed-off-by: Peter Xu 
---
 migration/ram.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 9e96a46323..612c7dd708 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -741,10 +741,6 @@ void mig_throttle_counter_reset(void)
  */
 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 {
-if (!rs->xbzrle_enabled) {
-return;
-}
-
 /* We don't care if this fails to allocate a new cache page
  * as long as it updated an old one */
 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
@@ -2301,7 +2297,7 @@ static int ram_save_target_page(RAMState *rs, 
PageSearchStatus *pss)
 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
  * page would be stale
  */
-if (!save_page_use_compression(rs)) {
+if (rs->xbzrle_enabled) {
 XBZRLE_cache_lock();
 xbzrle_cache_zero_page(rs, block->offset + offset);
 XBZRLE_cache_unlock();
-- 
2.32.0




[PATCH RFC 10/13] migration: Add pss_init()

2022-08-29 Thread Peter Xu
Helper to init PSS structures.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index adcc57c584..bdfcc6171a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -535,6 +535,14 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream 
*stream, RAMBlock *block,
 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
  bool postcopy_requested);
 
+/* NOTE: page is the PFN not real ram_addr_t. */
+static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
+{
+pss->block = rb;
+pss->page = page;
+pss->complete_round = false;
+}
+
 static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
@@ -2625,9 +2633,7 @@ static int ram_find_and_save_block(RAMState *rs)
 return pages;
 }
 
-pss.block = rs->last_seen_block;
-pss.page = rs->last_page;
-pss.complete_round = false;
+pss_init(&pss, rs->last_seen_block, rs->last_page);
 
 if (!pss.block) {
 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
-- 
2.32.0




[PATCH RFC 05/13] migration: Disallow postcopy preempt to be used with compress

2022-08-29 Thread Peter Xu
The preempt mode requires the capability to assign a channel to each page,
while the compression logic currently assigns pages to different compress
threads/local channels, so the two are potentially incompatible.

Signed-off-by: Peter Xu 
---
 migration/migration.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index bb8bbddfe4..844bca1ff6 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1336,6 +1336,17 @@ static bool migrate_caps_check(bool *cap_list,
 error_setg(errp, "Postcopy preempt requires postcopy-ram");
 return false;
 }
+
+/*
+ * Preempt mode requires urgent pages to be sent in separate
+ * channel, OTOH compression logic will disorder all pages into
+ * different compression channels, which is not compatible with the
+ * preempt assumptions on channel assignments.
+ */
+if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
+error_setg(errp, "Postcopy preempt not compatible with compress");
+return false;
+}
 }
 
 return true;
-- 
2.32.0




[PATCH RFC 02/13] migration: Add postcopy_preempt_active()

2022-08-29 Thread Peter Xu
Add a helper to check that postcopy preempt is enabled and currently active.

Signed-off-by: Peter Xu 
---
 migration/ram.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..8c5d5332e8 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -162,6 +162,11 @@ out:
 return ret;
 }
 
+static bool postcopy_preempt_active(void)
+{
+return migrate_postcopy_preempt() && migration_in_postcopy();
+}
+
 bool ramblock_is_ignored(RAMBlock *block)
 {
 return !qemu_ram_is_migratable(block) ||
@@ -2434,7 +2439,7 @@ static void postcopy_preempt_choose_channel(RAMState *rs, 
PageSearchStatus *pss)
 /* We need to make sure rs->f always points to the default channel elsewhere */
 static void postcopy_preempt_reset_channel(RAMState *rs)
 {
-if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+if (postcopy_preempt_active()) {
 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
 rs->f = migrate_get_current()->to_dst_file;
 trace_postcopy_preempt_reset_channel();
@@ -2472,7 +2477,7 @@ static int ram_save_host_page(RAMState *rs, 
PageSearchStatus *pss)
 return 0;
 }
 
-if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+if (postcopy_preempt_active()) {
 postcopy_preempt_choose_channel(rs, pss);
 }
 
-- 
2.32.0




[PATCH RFC 01/13] migration: Use non-atomic ops for clear log bitmap

2022-08-29 Thread Peter Xu
Since we already have bitmap_mutex to protect either the dirty bitmap or
the clear log bitmap, we don't need atomic operations to set/clear/test on
the clear log bitmap.  Switch all ops from atomic to non-atomic
versions, and touch up the comments to show which lock is in charge.

Introduce a non-atomic version of bitmap_test_and_clear_atomic(), mostly the
same as the atomic version but simplified in a few places, e.g. dropping the
"old_bits" variable and the explicit memory barriers.

Signed-off-by: Peter Xu 
---
 include/exec/ram_addr.h | 11 +-
 include/qemu/bitmap.h   |  1 +
 util/bitmap.c   | 45 +
 3 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index f3e0c78161..5092a2e0ff 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -42,7 +42,8 @@ static inline long clear_bmap_size(uint64_t pages, uint8_t 
shift)
 }
 
 /**
- * clear_bmap_set: set clear bitmap for the page range
+ * clear_bmap_set: set clear bitmap for the page range.  Must be with
+ * bitmap_mutex held.
  *
  * @rb: the ramblock to operate on
  * @start: the start page number
@@ -55,12 +56,12 @@ static inline void clear_bmap_set(RAMBlock *rb, uint64_t 
start,
 {
 uint8_t shift = rb->clear_bmap_shift;
 
-bitmap_set_atomic(rb->clear_bmap, start >> shift,
-  clear_bmap_size(npages, shift));
+bitmap_set(rb->clear_bmap, start >> shift, clear_bmap_size(npages, shift));
 }
 
 /**
- * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set
+ * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set.
+ * Must be with bitmap_mutex held.
  *
  * @rb: the ramblock to operate on
  * @page: the page number to check
@@ -71,7 +72,7 @@ static inline bool clear_bmap_test_and_clear(RAMBlock *rb, 
uint64_t page)
 {
 uint8_t shift = rb->clear_bmap_shift;
 
-return bitmap_test_and_clear_atomic(rb->clear_bmap, page >> shift, 1);
+return bitmap_test_and_clear(rb->clear_bmap, page >> shift, 1);
 }
 
 static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h
index 82a1d2f41f..3ccb00865f 100644
--- a/include/qemu/bitmap.h
+++ b/include/qemu/bitmap.h
@@ -253,6 +253,7 @@ void bitmap_set(unsigned long *map, long i, long len);
 void bitmap_set_atomic(unsigned long *map, long i, long len);
 void bitmap_clear(unsigned long *map, long start, long nr);
 bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr);
+bool bitmap_test_and_clear(unsigned long *map, long start, long nr);
 void bitmap_copy_and_clear_atomic(unsigned long *dst, unsigned long *src,
   long nr);
 unsigned long bitmap_find_next_zero_area(unsigned long *map,
diff --git a/util/bitmap.c b/util/bitmap.c
index f81d8057a7..8d12e90a5a 100644
--- a/util/bitmap.c
+++ b/util/bitmap.c
@@ -240,6 +240,51 @@ void bitmap_clear(unsigned long *map, long start, long nr)
 }
 }
 
+bool bitmap_test_and_clear(unsigned long *map, long start, long nr)
+{
+unsigned long *p = map + BIT_WORD(start);
+const long size = start + nr;
+int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+bool dirty = false;
+
+assert(start >= 0 && nr >= 0);
+
+/* First word */
+if (nr - bits_to_clear > 0) {
+if ((*p) & mask_to_clear) {
+dirty = true;
+}
+*p &= ~mask_to_clear;
+nr -= bits_to_clear;
+bits_to_clear = BITS_PER_LONG;
+p++;
+}
+
+/* Full words */
+if (bits_to_clear == BITS_PER_LONG) {
+while (nr >= BITS_PER_LONG) {
+if (*p) {
+dirty = true;
+*p = 0;
+}
+nr -= BITS_PER_LONG;
+p++;
+}
+}
+
+/* Last word */
+if (nr) {
+mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+if ((*p) & mask_to_clear) {
+dirty = true;
+}
+*p &= ~mask_to_clear;
+}
+
+return dirty;
+}
+
 bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr)
 {
 unsigned long *p = map + BIT_WORD(start);
-- 
2.32.0




[PATCH RFC 03/13] migration: Yield bitmap_mutex properly when sending/sleeping

2022-08-29 Thread Peter Xu
Don't take the bitmap mutex when sending pages, or when being throttled by
migration_rate_limit() (which is a bit tricky to call here in the ram code,
but still seems helpful).

This prepares for the possibility of concurrently sending pages from more
than one thread via ram_save_host_page(): all threads may need the
bitmap_mutex to operate on the bitmaps, so a sendmsg() or any kind of
qemu_sem_wait() blocking one thread will no longer block the others from
progressing.
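
Schematically the pattern is (a sketch of the idea, not the diff below;
send_page() and pages_left are placeholders):

    qemu_mutex_lock(&rs->bitmap_mutex);
    while (pages_left) {
        dirty = migration_bitmap_clear_dirty(rs, block, page);
        qemu_mutex_unlock(&rs->bitmap_mutex);  /* yield while we...        */
        if (dirty) {
            send_page(block, page);            /* ...block in sendmsg()    */
            migration_rate_limit();            /* ...or sleep if throttled */
        }
        qemu_mutex_lock(&rs->bitmap_mutex);    /* re-take to keep scanning */
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);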

Signed-off-by: Peter Xu 
---
 migration/ram.c | 42 +++---
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 8c5d5332e8..9e96a46323 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2470,6 +2470,7 @@ static int ram_save_host_page(RAMState *rs, 
PageSearchStatus *pss)
 unsigned long hostpage_boundary =
 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
 unsigned long start_page = pss->page;
+bool page_dirty;
 int res;
 
 if (ramblock_is_ignored(pss->block)) {
@@ -2487,22 +2488,41 @@ static int ram_save_host_page(RAMState *rs, 
PageSearchStatus *pss)
 break;
 }
 
+page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
+/*
+ * Properly yield the lock only in postcopy preempt mode because
+ * both migration thread and rp-return thread can operate on the
+ * bitmaps.
+ */
+if (postcopy_preempt_active()) {
+qemu_mutex_unlock(&rs->bitmap_mutex);
+}
+
 /* Check the pages is dirty and if it is send it */
-if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
+if (page_dirty) {
 tmppages = ram_save_target_page(rs, pss);
-if (tmppages < 0) {
-return tmppages;
+if (tmppages >= 0) {
+pages += tmppages;
+/*
+ * Allow rate limiting to happen in the middle of huge pages if
+ * something is sent in the current iteration.
+ */
+if (pagesize_bits > 1 && tmppages > 0) {
+migration_rate_limit();
+}
 }
+} else {
+tmppages = 0;
+}
 
-pages += tmppages;
-/*
- * Allow rate limiting to happen in the middle of huge pages if
- * something is sent in the current iteration.
- */
-if (pagesize_bits > 1 && tmppages > 0) {
-migration_rate_limit();
-}
+if (postcopy_preempt_active()) {
+qemu_mutex_lock(&rs->bitmap_mutex);
 }
+
+if (tmppages < 0) {
+return tmppages;
+}
+
 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
 } while ((pss->page < hostpage_boundary) &&
  offset_in_ramblock(pss->block,
-- 
2.32.0




[PATCH RFC 00/13] migration: Postcopy Preempt-Full

2022-08-29 Thread Peter Xu
This is an RFC series.  The tree is here:

  https://github.com/xzpeter/qemu/tree/preempt-full

It's not complete because there are still some things left to do, which will
be attached to the end of this cover letter; however, this series can
already safely pass qtest and all of my tests.

Compared to the recently merged preempt mode, I called it "preempt-full"
because it threadifies the postcopy channels so now urgent pages can be
fully handled separately outside of the ram save loop.  Sorry to have the
same name as the PREEMPT_FULL in the Linux RT world, it's just that we
needed a name for the capability and it was already named preempt
anyway.

The existing preempt code, which has already landed, reduced random page
request latency over a 10Gbps network from ~12ms to ~500us.

This preempt-full series further reduces that ~500us to ~230us per my
initial tests.  More to share below.

Note that no new capability is needed, IOW it's fully compatible with the
existing preempt mode.  So the naming is not really important; it only
identifies the difference between the binaries.  That is because this series
only reworks the sender-side code and does not change the migration
protocol; it just runs faster.

IOW, old "preempt" QEMU can also migrate to "preempt-full" QEMU, vice versa.

  - When old "preempt" mode QEMU migrates to "preempt-full" QEMU, it'll be
the same as running both old "preempt" QEMUs.

  - When "preempt-full" QEMU migrates to old "preempt" QEMU, it'll be the
same as running both "preempt-full".

The logic of the series is quite simple too: simply move the existing
preempt-channel page sends into the rp-return thread.  It can slow down the
rp-return thread on receiving pages, but I don't really see a major issue
with it so far.
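
Conceptually it looks like this (a sketch only, assuming the rp-return thread
reuses the per-channel PSS from the earlier patches; names are borrowed from
the rest of the series, not quoted from it):

    /* rp-return thread, for each urgent page request received: */
    qemu_mutex_lock(&rs->bitmap_mutex);
    pss_init(pss, block, page);
    pss->pss_channel = ms->postcopy_qemufile_src;  /* preempt channel  */
    ram_save_host_page(rs, pss);                   /* send it directly */
    qemu_mutex_unlock(&rs->bitmap_mutex);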

This latency number is getting close to the extreme of 4K page request
latency of any TCP roundtrip of the 10Gbps nic I have.  The 'extreme
number' is something I get from the mig_mon tool, which has a mode [1] to
emulate the extreme TCP roundtrips of page requests.

Performance
===

Page request latencies are distributed as below, with a VM of 20G mem, 20
cores, a 10Gbps nic, and 18G of fully random writes:

Postcopy Vanilla


Average: 12093 (us)
@delay_us:
[1]1 ||
[2, 4) 0 ||
[4, 8) 0 ||
[8, 16)0 ||
[16, 32)   1 ||
[32, 64)   8 ||
[64, 128) 11 ||
[128, 256)14 ||
[256, 512)19 ||
[512, 1K) 14 ||
[1K, 2K)  35 ||
[2K, 4K)  18 ||
[4K, 8K)  87 |@   |
[8K, 16K)   2397 ||
[16K, 32K) 7 ||
[32K, 64K) 2 ||
[64K, 128K)   20 ||
[128K, 256K)   6 ||

Postcopy Preempt


Average: 496 (us)

@delay_us:
[32, 64)   2 ||
[64, 128)   2306 ||
[128, 256) 25422 ||
[256, 512)  8238 ||
[512, 1K)   1066 |@@  |
[1K, 2K)2167 ||
[2K, 4K)3329 |@@  |
[4K, 8K) 109 ||
[8K, 16K) 48 ||

Postcopy Preempt-Full
-

Average: 229 (us)

@delay_us:
[8, 16)1 ||
[16, 32)   3 ||
[32, 64)   2 ||
[64, 128)  11956 |@@  |
[128, 256) 60403 ||
[256, 512) 15047 | 

Re: [PATCH] linux-user: fix readlinkat handling with magic exe symlink

2022-08-29 Thread Jameson Nash
bump? This helps fix one of the libuv tests when run under qemu
https://github.com/libuv/libuv/pull/2941#issuecomment-1207145306

On Mon, Aug 8, 2022 at 3:07 PM Jameson Nash  wrote:

> Exactly the same as what f17f4989fa193fa8279474c5462289a3cfe69aea did before
> for readlink. I suppose this was simply missed at the time.
>
> Signed-off-by: Jameson Nash 
> ---
>  linux-user/syscall.c | 15 +--
>  1 file changed, 13 insertions(+), 2 deletions(-)
>
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index ef53feb5ab..6ef4e42b21 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -9894,11 +9894,22 @@ static abi_long do_syscall1(CPUArchState *cpu_env,
> int num, abi_long arg1,
>  p2 = lock_user(VERIFY_WRITE, arg3, arg4, 0);
>  if (!p || !p2) {
>  ret = -TARGET_EFAULT;
> +} else if (!arg4) {
> +/* Short circuit this for the magic exe check. */
> +ret = -TARGET_EINVAL;
>  } else if (is_proc_myself((const char *)p, "exe")) {
>  char real[PATH_MAX], *temp;
>  temp = realpath(exec_path, real);
> -ret = temp == NULL ? get_errno(-1) : strlen(real) ;
> -snprintf((char *)p2, arg4, "%s", real);
> +/* Return value is # of bytes that we wrote to the
> buffer. */
> +if (temp == NULL) {
> +ret = get_errno(-1);
> +} else {
> +/* Don't worry about sign mismatch as earlier mapping
> + * logic would have thrown a bad address error. */
> +ret = MIN(strlen(real), arg4);
> +/* We cannot NUL terminate the string. */
> +memcpy(p2, real, ret);
> +}
>  } else {
>  ret = get_errno(readlinkat(arg1, path(p), p2, arg4));
>  }
> --
> 2.25.1
>
>


Re: [PATCH 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Richard Henderson

On 8/29/22 07:23, Ricky Zhou wrote:

Many instructions which load/store 128-bit values are supposed to
raise #GP when the memory operand isn't 16-byte aligned. This includes:
  - Instructions explicitly requiring memory alignment (Exceptions Type 1
in the "AVX and SSE Instruction Exception Specification" section of
the SDM)
  - Legacy SSE instructions that load/store 128-bit values (Exceptions
Types 2 and 4).

This change adds a raise_gp_if_unaligned helper which raises #GP if an
address is not properly aligned. This helper is called before 128-bit
loads/stores where appropriate.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/217
Signed-off-by: Ricky Zhou
---
  target/i386/helper.h |  1 +
  target/i386/tcg/mem_helper.c |  8 
  target/i386/tcg/translate.c  | 38 +---
  3 files changed, 44 insertions(+), 3 deletions(-)



This trap should be raised via the memory operation:

- static inline void gen_ldo_env_A0(DisasContext *s, int offset)

+ static inline void gen_ldo_env_A0(DisasContext *s, int offset, bool aligned)
  {

  int mem_index = s->mem_index;

- tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ);

+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index,
+ MO_LEUQ | (aligned ? MO_ALIGN_16 : 0));
  tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(0)));

  tcg_gen_addi_tl(s->tmp0, s->A0, 8);

  tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);

  tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1)));

  }


Only the first of the two loads/stores must be aligned, as the other is known to be +8. 
You then must fill in the x86_tcg_ops.do_unaligned_access hook to raise #GP.
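
For reference, a minimal sketch of such a hook (names and shape assumed, not a
finished patch):

    static G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
                                                       MMUAccessType access_type,
                                                       int mmu_idx, uintptr_t ra)
    {
        /* SSE/AVX alignment faults are reported as #GP(0), not #AC. */
        raise_exception_ra(&X86_CPU(cs)->env, EXCP0D_GPF, ra);
    }

plus pointing x86_tcg_ops.do_unaligned_access at it.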



r~



Re: [PATCH 4/9] hw/isa/vt82c686: QOM'ify via-ide creation

2022-08-29 Thread BB



Am 25. August 2022 01:18:56 MESZ schrieb BALATON Zoltan :
>On Thu, 25 Aug 2022, Bernhard Beschow wrote:
>> On Wed, Aug 24, 2022 at 3:54 PM BALATON Zoltan  wrote:
>>> On Tue, 23 Aug 2022, Bernhard Beschow wrote:
 The IDE function is closely tied to the ISA function (e.g. the IDE
 interrupt routing happens there), so it makes sense that the IDE
 function is instantiated within the southbridge itself. As a side effect,
 duplicated code in the boards is resolved.
 
 Signed-off-by: Bernhard Beschow 
 ---
 configs/devices/mips64el-softmmu/default.mak |  1 -
 hw/isa/Kconfig   |  1 +
 hw/isa/vt82c686.c| 18 ++
 hw/mips/fuloong2e.c  |  3 ---
 hw/ppc/Kconfig   |  1 -
 hw/ppc/pegasos2.c|  4 
 6 files changed, 19 insertions(+), 9 deletions(-)
 
 diff --git a/configs/devices/mips64el-softmmu/default.mak
>>> b/configs/devices/mips64el-softmmu/default.mak
 index c610749ac1..d5188f7ea5 100644
 --- a/configs/devices/mips64el-softmmu/default.mak
 +++ b/configs/devices/mips64el-softmmu/default.mak
 @@ -1,7 +1,6 @@
 # Default configuration for mips64el-softmmu
 
 include ../mips-softmmu/common.mak
 -CONFIG_IDE_VIA=y
 CONFIG_FULOONG=y
 CONFIG_LOONGSON3V=y
 CONFIG_ATI_VGA=y
 diff --git a/hw/isa/Kconfig b/hw/isa/Kconfig
 index d42143a991..20de7e9294 100644
 --- a/hw/isa/Kconfig
 +++ b/hw/isa/Kconfig
 @@ -53,6 +53,7 @@ config VT82C686
 select I8254
 select I8257
 select I8259
 +select IDE_VIA
 select MC146818RTC
 select PARALLEL
 
 diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
 index 5582c0b179..37d9ed635d 100644
 --- a/hw/isa/vt82c686.c
 +++ b/hw/isa/vt82c686.c
 @@ -17,6 +17,7 @@
 #include "hw/isa/vt82c686.h"
 #include "hw/pci/pci.h"
 #include "hw/qdev-properties.h"
 +#include "hw/ide/pci.h"
 #include "hw/isa/isa.h"
 #include "hw/isa/superio.h"
 #include "hw/intc/i8259.h"
 @@ -544,6 +545,7 @@ struct ViaISAState {
 qemu_irq cpu_intr;
 qemu_irq *isa_irqs;
 ViaSuperIOState via_sio;
 +PCIIDEState ide;
 };
 
 static const VMStateDescription vmstate_via = {
 @@ -556,10 +558,18 @@ static const VMStateDescription vmstate_via = {
 }
 };
 
 +static void via_isa_init(Object *obj)
 +{
 +ViaISAState *s = VIA_ISA(obj);
 +
 +object_initialize_child(obj, "ide", >ide, "via-ide");
 +}
 +
 static const TypeInfo via_isa_info = {
 .name  = TYPE_VIA_ISA,
 .parent= TYPE_PCI_DEVICE,
 .instance_size = sizeof(ViaISAState),
 +.instance_init = via_isa_init,
 .abstract  = true,
 .interfaces= (InterfaceInfo[]) {
 { INTERFACE_CONVENTIONAL_PCI_DEVICE },
 @@ -583,6 +593,7 @@ static void via_isa_realize(PCIDevice *d, Error
>>> **errp)
 {
 ViaISAState *s = VIA_ISA(d);
 DeviceState *dev = DEVICE(d);
 +PCIBus *pci_bus = pci_get_bus(d);
 qemu_irq *isa_irq;
 ISABus *isa_bus;
 int i;
 @@ -607,6 +618,13 @@ static void via_isa_realize(PCIDevice *d, Error
>>> **errp)
 if (!qdev_realize(DEVICE(&s->via_sio), BUS(isa_bus), errp)) {
 return;
 }
 +
 +/* Function 1: IDE */
+qdev_prop_set_int32(DEVICE(&s->ide), "addr", d->devfn + 1);
+if (!qdev_realize(DEVICE(&s->ide), BUS(pci_bus), errp)) {
 +return;
 +}
+pci_ide_create_devs(PCI_DEVICE(&s->ide));
>>> 
>>> I'm not sure about moving pci_ide_create_devs() here. This is usally
>>> called from board code and only piix4 seems to do this. Maybe that's wrong
>>> because if all IDE devices did this then one machine could not have more
>>> than one different ide devices (like having an on-board ide and adding a
>>> pci ide controoler with -device) so this probably belongs to the board
>>> code to add devices to its default ide controller only as this is machine
>>> specific. Unless I'm wrong in which case somebody will correct me.
>>> 
>> 
>> Grepping the code it can be seen that it's always called right after
>> creating the IDE controllers. The only notable exception is the "sii3112"
>> device in the sam460ex board which is not emulated yet. Since the IDE
>
>The problem with sii3112 is that it only has 2 channels because I did not 
>bother to implement more so pci_ide_create_devs() probably would not work as 
>it assumes 4 channels. AFAIK this means that the short -hda, -cdrom, etc. 
>convenience options don't work with sam460ex but you have to use the long way 
>of creating ide-hd and ide-cd devices on the command line. I think there's a 
>version of this controller with 4 channels, maybe called sii3114 or similar 
>and it 

Re: [PATCH v7 2/2] target/s390x: support PRNO_TRNG instruction

2022-08-29 Thread Jason A. Donenfeld
On Fri, Aug 26, 2022 at 01:28:11PM +0200, Thomas Huth wrote:
> > +qemu_guest_getrandom_nofail(tmp, block);
> > +for (size_t i = 0; i < block; ++i) {
> > +cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra);
> > +*buf_reg = deposit64(*buf_reg, 0, message_reg_len, *buf_reg + 
> > 1);
> > +--*len_reg;
> 
> I know it's annoying, but technically, you must not touch the upper bits of 
> the len_reg if running in 31- or 24-bit addressing mode. The Principles of 
> Operations say:
> 
> "In either the 24- or 31-bit addressing mode, bits 32-63 of the odd-numbered 
> register are decremented by the number
> of bytes processed for the respective operand, and
> bits 0-31 of the register remain unchanged."
> 

This is what I was trying to do with the use of deposit64, following
David's guidance. Did I mess something up?
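
For what it's worth, one way to honour the quoted wording for the length
register would be something like the sketch below (illustrative only,
mirroring the deposit64 approach already used for the address register):

    if (!(env->psw.mask & PSW_MASK_64)) {
        /* 24/31-bit mode: touch only PoP bits 32-63, i.e. the low 32 bits. */
        *len_reg = deposit64(*len_reg, 0, 32, (uint32_t)(*len_reg - 1));
    } else {
        --*len_reg;
    }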

> > +}
> > +len -= block;
> > +}
> > +}
> > +
> >   uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, 
> > uint32_t r3,
> >uint32_t type)
> >   {
> 
> Don't you also need to modify the "query" part to signal the availability of 
> the function? Doesn't Linux in the guest check the availability first before 
> using it?

I think this is already handled at the upper layers. Linux detects it
fine.

> 
> > @@ -209,6 +235,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, 
> > uint32_t r2, uint32_t r3,
> >   return klmd_sha512(env, ra, env->regs[1], &env->regs[r2],
> > &env->regs[r2 + 1]);
> >   }
> >   break;
> > +case 114: /* CPACF_PRNO_TRNG */
> > +fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
> > +fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
> > +break;
> >   default:
> >   /* we don't implement any other subfunction yet */
> >   g_assert_not_reached();
> 
> Maybe one more thing to check (according the "Special Conditions" section in 
> the Principles of Operation):
> 
> "A specification exception is recognized and no other
> action is taken if any of the following conditions exist:
> 
> ...
> 
> 2. The R1 or R2 fields designate an odd-numbered
> register or general register 0. This exception is
> recognized regardless of the function code.
> "

This is taken care of already by the function that calls into this
function.

Jason



Re: [PATCH v7 1/2] target/s390x: support SHA-512 extensions

2022-08-29 Thread Jason A. Donenfeld
On Fri, Aug 26, 2022 at 12:21:36PM +0200, Thomas Huth wrote:
> > + *  Copyright (C) 2022 Jason A. Donenfeld . All Rights 
> > Reserved.
> 
> Please drop the "All rights reserved" ... it does not have any legal meaning 

No.

> > +{
> > +enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep 
> > interactivity. */
> > +uint64_t z[8], b[8], a[8], w[16], t;
> > +uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, 
> > processed = 0;
> 
> The line is very long, could you please declare message and len on separate 
> lines?

Will do.

> 
> > +int i, j, message_reg_len = 64, blocks = 0, cc = 0;
> > +
> > +if (!(env->psw.mask & PSW_MASK_64)) {
> > +len = (uint32_t)len;
> > +message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
> > +}
> > +
> > +for (i = 0; i < 8; ++i) {
> > +z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, 
> > parameter_block + 8 * i), ra);
> 
> Quite a long line again, maybe split it like this:
> 
>abi_ptr addr = wrap_address(env, parameter_block + 8 * i);
>z[i] = a[i] = cpu_ldq_be_data_ra(env, addr, ra);

Sure.

> 
> > +}
> > +
> > +while (len >= 128) {
> > +if (++blocks > MAX_BLOCKS_PER_RUN) {
> > +cc = 3;
> > +break;
> > +}
> > +
> > +for (i = 0; i < 16; ++i) {
> > +if (message) {
> > +w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 
> > 8 * i), ra);
> 
> Long line again, please split.

Okay.

> >   cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
> 
> So for KIMD and KLMD, I think we now have to set the bit that corresponds to 
> SHA-512 in the query status information, too? Otherwise the guest might not 
> use the function if it thinks that it is not available?

That's already taken care of generically I think. This works fine from
Linux's autodetection.

Jason



Re: [PATCH v7 11/14] KVM: Register/unregister the guest private memory regions

2022-08-29 Thread Chao Peng
On Fri, Aug 26, 2022 at 04:19:43PM +0100, Fuad Tabba wrote:
> > +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm)
> > +{
> > +   return false;
> > +}
> > +
> >  static int check_memory_region_flags(const struct kvm_user_mem_region *mem)
> >  {
> > u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
> > @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp,
> > r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
> > break;
> > }
> > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
> > +   case KVM_MEMORY_ENCRYPT_REG_REGION:
> > +   case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
> > +   struct kvm_enc_region region;
> > +
> > +   if (!kvm_arch_private_mem_supported(kvm))
> > +   goto arch_vm_ioctl;
> > +
> > +   r = -EFAULT;
> > +   if (copy_from_user(&region, argp, sizeof(region)))
> > +   goto out;
> > +
> > +   r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, &region);
> > +   break;
> > +   }
> > +#endif
> > case KVM_GET_DIRTY_LOG: {
> > struct kvm_dirty_log log;
> >
> > @@ -4842,6 +4898,7 @@ static long kvm_vm_ioctl(struct file *filp,
> > r = kvm_vm_ioctl_get_stats_fd(kvm);
> > break;
> > default:
> > +arch_vm_ioctl:
> 
> It might be good to make this label conditional on
> CONFIG_HAVE_KVM_PRIVATE_MEM, otherwise you get a warning if
> CONFIG_HAVE_KVM_PRIVATE_MEM isn't defined.
> 
> +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM
>  arch_vm_ioctl:
> +#endif

Right, as the bot already complains.

Chao
> 
> Cheers,
> /fuad
> 
> 
> 
> 
> 
> > r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > }
> >  out:
> > --
> > 2.25.1
> >



Re: [PATCH] target/sh4: Fix TB_FLAG_UNALIGN

2022-08-29 Thread Richard Henderson

On 8/29/22 02:05, BALATON Zoltan wrote:

On Sun, 28 Aug 2022, Richard Henderson wrote:

The value previously chosen overlaps GUSA_MASK.

Cc: qemu-sta...@nongnu.org
Fixes: 4da06fb3062 ("target/sh4: Implement prctl_unalign_sigbus")
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/856
Signed-off-by: Richard Henderson 
---
target/sh4/cpu.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index 9f15ef913c..e79cbc59e2 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -84,7 +84,7 @@
#define DELAY_SLOT_RTE (1 << 2)

#define TB_FLAG_PENDING_MOVCA  (1 << 3)
-#define TB_FLAG_UNALIGN    (1 << 4)
+#define TB_FLAG_UNALIGN    (1 << 13)


Is it worth a comment to note why that value was chosen, to avoid the same problem if
another flag is added in the future?


Hmm, or perhaps move it down below, so that we see bit 3 used, then bits 4-12, 
then bit 13.


r~



Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

2022-08-29 Thread Chao Peng
On Fri, Aug 26, 2022 at 04:19:25PM +0100, Fuad Tabba wrote:
> Hi,
> 
> On Wed, Jul 6, 2022 at 9:24 AM Chao Peng  wrote:
> >
> > This is the v7 of this series which tries to implement the fd-based KVM
> > guest private memory. The patches are based on latest kvm/queue branch
> > commit:
> >
> >   b9b71f43683a (kvm/queue) KVM: x86/mmu: Buffer nested MMU
> > split_desc_cache only by default capacity
> >
> > Introduction
> > 
> > In general this patch series introduce fd-based memslot which provides
> > guest memory through memory file descriptor fd[offset,size] instead of
> > hva/size. The fd can be created from a supported memory filesystem
> > like tmpfs/hugetlbfs etc., which we refer to as the memory backing store.
> > KVM and the memory backing store exchange callbacks when such a memslot
> > gets created. At runtime KVM will call into callbacks provided by the
> > backing store to get the pfn with the fd+offset. Memory backing store
> > will also call into KVM callbacks when userspace punches a hole in the fd
> > to notify KVM to unmap secondary MMU page table entries.
> >
> > Compared to the existing hva-based memslot, this new type of memslot allows
> > guest memory to be unmapped from host userspace like QEMU and even from the
> > kernel itself, therefore reducing the attack surface and preventing bugs.
> >
> > Based on this fd-based memslot, we can build guest private memory that
> > is going to be used in confidential computing environments such as Intel
> > TDX and AMD SEV. When supported, the memory backing store can provide
> > more enforcement on the fd and KVM can use a single memslot to hold both
> > the private and shared part of the guest memory.
> >
> > mm extension
> > -
> > Introduce a new MFD_INACCESSIBLE flag for memfd_create(); a file
> > created with this flag cannot be read(), written or mmap()-ed etc. via normal
> > MMU operations. The file content can only be used with the newly
> > introduced memfile_notifier extension.
> >
> > The memfile_notifier extension provides two sets of callbacks for KVM to
> > interact with the memory backing store:
> >   - memfile_notifier_ops: callbacks for memory backing store to notify
> > KVM when memory gets invalidated.
> >   - backing store callbacks: callbacks for KVM to call into memory
> > backing store to request memory pages for guest private memory.
> >
> > The memfile_notifier extension also provides APIs for memory backing
> > store to register/unregister itself and to trigger the notifier when the
> > bookmarked memory gets invalidated.
> >
> > The patchset also introduces a new memfd seal F_SEAL_AUTO_ALLOCATE to
> > prevent double allocation caused by unintentional guest when we only
> > have a single side of the shared/private memfds effective.
> >
> > memslot extension
> > -
> > Add the private fd and the fd offset to existing 'shared' memslot so
> > that both private/shared guest memory can live in one single memslot.
> > A page in the memslot is either private or shared. Whether a guest page
> > is private or shared is maintained through reusing existing SEV ioctls
> > KVM_MEMORY_ENCRYPT_{UN,}REG_REGION.
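
For illustration, the extension described here roughly pairs the existing hva
fields with a file descriptor, along the lines of the sketch below (field
names taken from the description above; the exact uAPI layout is assumed):

    struct kvm_userspace_memory_region_ext {
        struct kvm_userspace_memory_region region; /* existing hva-based part */
        __u64 private_offset;  /* offset into the private fd */
        __u32 private_fd;      /* fd-backed private memory (e.g. MFD_INACCESSIBLE) */
        __u32 pad1;
        __u64 pad2[14];        /* reserved */
    };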
> >
> 
> I'm on the Android pKVM team at Google, and we've been looking into
> how this approach fits with what we've been doing with pkvm/arm64.
> I've had a go at porting your patches, along with some fixes and
> additions so it would go on top of our latest pkvm patch series [1] to
> see how well this proposal fits with what we’re doing. You can find
> the ported code at this link [2].
> 
> In general, an fd-based approach fits very well with pKVM for the
> reasons you mention. It means that we don't necessarily need to map
> the guest memory, and with the new extensions it allows the host
> kernel to control whether to restrict migration and swapping.

Good to hear that.

> 
> For pKVM, we would also need the guest private memory not to be
> GUP’able by the kernel so that userspace can’t trick the kernel into
> accessing guest private memory in a context where it isn’t prepared to
> handle the fault injected by the hypervisor. We’re looking at whether
> we could use memfd_secret to achieve this, or maybe whether extending
> your work might solve the problem.

This is interesting and can be a valuable addition to this series.

> 
> However, during the porting effort, the main issue we've encountered
> is that many of the details of this approach seem to be targeted at
> TDX/SEV and don’t readily align with the design of pKVM. My knowledge
> on TDX is very rudimentary, so please bear with me if I get things
> wrong.

No doubt this series is initially designed for confidential computing
usages, but pKVM can definitely extend it if it finds useful.

> 
> The idea of the memslot having two references to the backing memory,
> the (new) private_fd (a file descriptor) as well as the userspace_addr
> (a memory address), with the meaning changing depending on whether the
> memory is private or shared. Both can 

[PATCH 5/5] virtio-gpu: Don't require udmabuf when blob support is enabled

2022-08-29 Thread Antonio Caggiano
From: Dmitry Osipenko 

Host blobs don't need udmabuf; it's only needed by guest blobs. The host
blobs are utilized by the Mesa virgl driver when persistent memory mapping
is needed by a GL buffer, otherwise virgl driver doesn't use blobs.
Persistent mapping support bumps the GL version from 4.3 to 4.5 in the guest.
Relax the udmabuf requirement.

Signed-off-by: Dmitry Osipenko 
Reviewed-by: Antonio Caggiano 
---
 hw/display/virtio-gpu.c | 20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 527c0aeede..4c2a9b7ea7 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -367,7 +367,9 @@ static void virtio_gpu_resource_create_blob(VirtIOGPU *g,
 return;
 }
 
-virtio_gpu_init_udmabuf(res);
+if (cblob.blob_mem == VIRTIO_GPU_BLOB_MEM_GUEST) {
+virtio_gpu_init_udmabuf(res);
+}
+QTAILQ_INSERT_HEAD(&g->reslist, res, next);
 }
 
@@ -1315,19 +1317,13 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error 
**errp)
 VirtIODevice *vdev = VIRTIO_DEVICE(qdev);
 VirtIOGPU *g = VIRTIO_GPU(qdev);
 
-if (virtio_gpu_blob_enabled(g->parent_obj.conf)) {
-if (!virtio_gpu_have_udmabuf()) {
-error_setg(errp, "cannot enable blob resources without udmabuf");
-return;
-}
-
 #ifndef HAVE_VIRGL_RESOURCE_BLOB
-if (virtio_gpu_virgl_enabled(g->parent_obj.conf)) {
-error_setg(errp, "Linked virglrenderer does not support blob 
resources");
-return;
-}
-#endif
+if (virtio_gpu_blob_enabled(g->parent_obj.conf) &&
+virtio_gpu_virgl_enabled(g->parent_obj.conf)) {
+error_setg(errp, "Linked virglrenderer does not support blob 
resources");
+return;
 }
+#endif
 
 if (!virtio_gpu_base_device_realize(qdev,
 virtio_gpu_handle_ctrl_cb,
-- 
2.34.1




[PATCH 0/5] virtio-gpu: Blob resources

2022-08-29 Thread Antonio Caggiano
Add shared memory and support blob resource creation, mapping and
unmapping through virglrenderer new stable APIs[0] when available.

[0] https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/891

Antonio Caggiano (1):
  virtio-gpu: Handle resource blob commands

Dmitry Osipenko (1):
  virtio-gpu: Don't require udmabuf when blob support is enabled

Dr. David Alan Gilbert (1):
  virtio: Add shared memory capability

Gerd Hoffmann (1):
  virtio-gpu: hostmem

Richard Henderson (1):
  Update version for v7.1.0-rc4 release

 VERSION  |   2 +-
 hw/display/virtio-gpu-pci.c  |  15 +++
 hw/display/virtio-gpu-virgl.c| 169 +++
 hw/display/virtio-gpu.c  |  25 ++--
 hw/display/virtio-vga.c  |  33 --
 hw/virtio/virtio-pci.c   |  18 +++
 include/hw/virtio/virtio-gpu-bswap.h |  18 +++
 include/hw/virtio/virtio-gpu.h   |  11 ++
 include/hw/virtio/virtio-pci.h   |   4 +
 meson.build  |   5 +
 10 files changed, 276 insertions(+), 24 deletions(-)

-- 
2.34.1




[PATCH 4/5] virtio-gpu: Handle resource blob commands

2022-08-29 Thread Antonio Caggiano
Support BLOB resources creation by calling virgl_renderer_resource_create_blob.

Signed-off-by: Antonio Caggiano 
Signed-off-by: Dmitry Osipenko 
---
 hw/display/virtio-gpu-virgl.c| 169 +++
 hw/display/virtio-gpu.c  |   8 +-
 include/hw/virtio/virtio-gpu-bswap.h |  18 +++
 include/hw/virtio/virtio-gpu.h   |   6 +
 meson.build  |   5 +
 5 files changed, 202 insertions(+), 4 deletions(-)

diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c
index 73cb92c8d5..c4c2c31d76 100644
--- a/hw/display/virtio-gpu-virgl.c
+++ b/hw/display/virtio-gpu-virgl.c
@@ -16,6 +16,8 @@
 #include "trace.h"
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/virtio-gpu.h"
+#include "hw/virtio/virtio-gpu-bswap.h"
+#include "hw/virtio/virtio-iommu.h"
 
 #include 
 
@@ -398,6 +400,162 @@ static void virgl_cmd_get_capset(VirtIOGPU *g,
 g_free(resp);
 }
 
+#ifdef HAVE_VIRGL_RESOURCE_BLOB
+
+static void virgl_cmd_resource_create_blob(VirtIOGPU *g,
+   struct virtio_gpu_ctrl_command *cmd)
+{
+struct virtio_gpu_simple_resource *res;
+struct virtio_gpu_resource_create_blob cblob;
+int ret;
+
+VIRTIO_GPU_FILL_CMD(cblob);
+virtio_gpu_create_blob_bswap(&cblob);
+trace_virtio_gpu_cmd_res_create_blob(cblob.resource_id, cblob.size);
+
+if (cblob.resource_id == 0) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: resource id 0 is not allowed\n",
+  __func__);
+cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID;
+return;
+}
+
+res = virtio_gpu_find_resource(g, cblob.resource_id);
+if (res) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: resource already exists %d\n",
+  __func__, cblob.resource_id);
+cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID;
+return;
+}
+
+res = g_new0(struct virtio_gpu_simple_resource, 1);
+res->resource_id = cblob.resource_id;
+res->blob_size = cblob.size;
+
+if (res->iov) {
+cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC;
+return;
+}
+
+if (cblob.blob_mem != VIRTIO_GPU_BLOB_MEM_HOST3D) {
+ret = virtio_gpu_create_mapping_iov(g, cblob.nr_entries, sizeof(cblob),
+cmd, &res->addrs, &res->iov,
+&res->iov_cnt);
+if (ret != 0) {
+cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC;
+return;
+}
+}
+
+if (cblob.blob_mem == VIRTIO_GPU_BLOB_MEM_GUEST) {
+virtio_gpu_init_udmabuf(res);
+}
+QTAILQ_INSERT_HEAD(&g->reslist, res, next);
+
+const struct virgl_renderer_resource_create_blob_args virgl_args = {
+.res_handle = cblob.resource_id,
+.ctx_id = cblob.hdr.ctx_id,
+.blob_mem = cblob.blob_mem,
+.blob_id = cblob.blob_id,
+.blob_flags = cblob.blob_flags,
+.size = cblob.size,
+.iovecs = res->iov,
+.num_iovs = res->iov_cnt,
+};
+ret = virgl_renderer_resource_create_blob(&virgl_args);
+if (ret) {
+g_print("Virgl blob create error: %s\n", strerror(-ret));
+}
+}
+
+static void virgl_cmd_resource_map_blob(VirtIOGPU *g,
+struct virtio_gpu_ctrl_command *cmd)
+{
+struct virtio_gpu_simple_resource *res;
+struct virtio_gpu_resource_map_blob mblob;
+int ret;
+void *data;
+uint64_t size;
+struct virtio_gpu_resp_map_info resp;
+
+VIRTIO_GPU_FILL_CMD(mblob);
+virtio_gpu_map_blob_bswap(&mblob);
+
+if (mblob.resource_id == 0) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: resource id 0 is not allowed\n",
+  __func__);
+cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID;
+return;
+}
+
+res = virtio_gpu_find_resource(g, mblob.resource_id);
+if (!res) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: resource does not exist %d\n",
+  __func__, mblob.resource_id);
+cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID;
+return;
+}
+
+ret = virgl_renderer_resource_map(res->resource_id, &data, &size);
+if (ret) {
+g_print("Virgl blob resource map error: %s\n", strerror(-ret));
+cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID;
+return;
+}
+
+memory_region_init_ram_device_ptr(&res->region, OBJECT(g), NULL, size, 
data);
+memory_region_add_subregion(&g->parent_obj.hostmem, mblob.offset, 
&res->region);
+memory_region_set_enabled(&res->region, true);
+
+memset(&resp, 0, sizeof(resp));
+resp.hdr.type = VIRTIO_GPU_RESP_OK_MAP_INFO;
+virgl_renderer_resource_get_map_info(mblob.resource_id, &resp.map_info);
+virtio_gpu_ctrl_response(g, cmd, &resp.hdr, sizeof(resp));
+
+res->mapped = true;
+}
+
+static void virgl_cmd_resource_unmap_blob(VirtIOGPU *g,
+struct virtio_gpu_ctrl_command *cmd)
+{
+struct virtio_gpu_simple_resource *res;
+struct 

[PATCH 3/5] virtio-gpu: hostmem

2022-08-29 Thread Antonio Caggiano
From: Gerd Hoffmann 

Use VIRTIO_GPU_SHM_ID_HOST_VISIBLE as id for virtio-gpu.

v2: Formatting fixes

Signed-off-by: Antonio Caggiano 
Acked-by: Michael S. Tsirkin 
---
 hw/display/virtio-gpu-pci.c| 15 +++
 hw/display/virtio-gpu.c|  1 +
 hw/display/virtio-vga.c| 33 -
 include/hw/virtio/virtio-gpu.h |  5 +
 4 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c
index 93f214ff58..2cbbacd7fe 100644
--- a/hw/display/virtio-gpu-pci.c
+++ b/hw/display/virtio-gpu-pci.c
@@ -33,6 +33,21 @@ static void virtio_gpu_pci_base_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 DeviceState *vdev = DEVICE(g);
 int i;
 
+if (virtio_gpu_hostmem_enabled(g->conf)) {
+vpci_dev->msix_bar_idx = 1;
+vpci_dev->modern_mem_bar_idx = 2;
+memory_region_init(&g->hostmem, OBJECT(g), "virtio-gpu-hostmem",
+   g->conf.hostmem);
+pci_register_bar(&vpci_dev->pci_dev, 4,
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_PREFETCH |
+ PCI_BASE_ADDRESS_MEM_TYPE_64,
+ &g->hostmem);
+virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem,
+   VIRTIO_GPU_SHM_ID_HOST_VISIBLE);
+}
+
+qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus), errp);
 virtio_pci_force_virtio_1(vpci_dev);
 if (!qdev_realize(vdev, BUS(&vpci_dev->bus), errp)) {
 return;
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 20cc703dcc..506b3b8eef 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -1424,6 +1424,7 @@ static Property virtio_gpu_properties[] = {
  256 * MiB),
 DEFINE_PROP_BIT("blob", VirtIOGPU, parent_obj.conf.flags,
 VIRTIO_GPU_FLAG_BLOB_ENABLED, false),
+DEFINE_PROP_SIZE("hostmem", VirtIOGPU, parent_obj.conf.hostmem, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c
index 4dcb34c4a7..aa8d1ab993 100644
--- a/hw/display/virtio-vga.c
+++ b/hw/display/virtio-vga.c
@@ -115,17 +115,32 @@ static void virtio_vga_base_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 pci_register_bar(&vpci_dev->pci_dev, 0,
  PCI_BASE_ADDRESS_MEM_PREFETCH, &vga->vram);
 
-/*
- * Configure virtio bar and regions
- *
- * We use bar #2 for the mmio regions, to be compatible with stdvga.
- * virtio regions are moved to the end of bar #2, to make room for
- * the stdvga mmio registers at the start of bar #2.
- */
-vpci_dev->modern_mem_bar_idx = 2;
-vpci_dev->msix_bar_idx = 4;
 vpci_dev->modern_io_bar_idx = 5;
 
+if (!virtio_gpu_hostmem_enabled(g->conf)) {
+/*
+ * Configure virtio bar and regions
+ *
+ * We use bar #2 for the mmio regions, to be compatible with stdvga.
+ * virtio regions are moved to the end of bar #2, to make room for
+ * the stdvga mmio registers at the start of bar #2.
+ */
+vpci_dev->modern_mem_bar_idx = 2;
+vpci_dev->msix_bar_idx = 4;
+} else {
+vpci_dev->msix_bar_idx = 1;
+vpci_dev->modern_mem_bar_idx = 2;
+memory_region_init(&g->hostmem, OBJECT(g), "virtio-gpu-hostmem",
+   g->conf.hostmem);
+pci_register_bar(&vpci_dev->pci_dev, 4,
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_PREFETCH |
+ PCI_BASE_ADDRESS_MEM_TYPE_64,
+ &g->hostmem);
+virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem,
+   VIRTIO_GPU_SHM_ID_HOST_VISIBLE);
+}
+
 if (!(vpci_dev->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)) {
 /*
  * with page-per-vq=off there is no padding space we can use
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index 2e28507efe..eafce75b04 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -102,12 +102,15 @@ enum virtio_gpu_base_conf_flags {
 (_cfg.flags & (1 << VIRTIO_GPU_FLAG_DMABUF_ENABLED))
 #define virtio_gpu_blob_enabled(_cfg) \
 (_cfg.flags & (1 << VIRTIO_GPU_FLAG_BLOB_ENABLED))
+#define virtio_gpu_hostmem_enabled(_cfg) \
+(_cfg.hostmem > 0)
 
 struct virtio_gpu_base_conf {
 uint32_t max_outputs;
 uint32_t flags;
 uint32_t xres;
 uint32_t yres;
+uint64_t hostmem;
 };
 
 struct virtio_gpu_ctrl_command {
@@ -131,6 +134,8 @@ struct VirtIOGPUBase {
 int renderer_blocked;
 int enable;
 
+MemoryRegion hostmem;
+
 struct virtio_gpu_scanout scanout[VIRTIO_GPU_MAX_SCANOUTS];
 
 int enabled_output_bitmask;
-- 
2.34.1




[PATCH 1/5] Update version for v7.1.0-rc4 release

2022-08-29 Thread Antonio Caggiano
From: Richard Henderson 

Signed-off-by: Richard Henderson 
---
 VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION b/VERSION
index 1c944b9863..b8d5f3ebb6 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-7.0.93
+7.0.94
-- 
2.34.1




[PATCH 2/5] virtio: Add shared memory capability

2022-08-29 Thread Antonio Caggiano
From: "Dr. David Alan Gilbert" 

Define a new capability type 'VIRTIO_PCI_CAP_SHARED_MEMORY_CFG'
and the data structure 'virtio_pci_shm_cap' to go with it.
They allow defining shared memory regions with sizes and offsets
of 2^32 and more.
Multiple instances of the capability are allowed and distinguished
by a device-specific 'id'.

v2: Remove virtio_pci_shm_cap as virtio_pci_cap64 is used instead.
v3: No need for mask32 as cpu_to_le32 truncates the value.

Signed-off-by: Dr. David Alan Gilbert 
Signed-off-by: Antonio Caggiano 
---
 hw/virtio/virtio-pci.c | 18 ++
 include/hw/virtio/virtio-pci.h |  4 
 2 files changed, 22 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index a50c5a57d7..377bb06fec 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1169,6 +1169,24 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
 return offset;
 }
 
+int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy,
+   uint8_t bar, uint64_t offset, uint64_t length,
+   uint8_t id)
+{
+struct virtio_pci_cap64 cap = {
+.cap.cap_len = sizeof cap,
+.cap.cfg_type = VIRTIO_PCI_CAP_SHARED_MEMORY_CFG,
+};
+
+cap.cap.bar = bar;
+cap.cap.length = cpu_to_le32(length);
+cap.length_hi = cpu_to_le32(length >> 32);
+cap.cap.offset = cpu_to_le32(offset);
+cap.offset_hi = cpu_to_le32(offset >> 32);
+cap.cap.id = id;
+return virtio_pci_add_mem_cap(proxy, &cap.cap);
+}
+
 static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr,
unsigned size)
 {
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 2446dcd9ae..5e5c4a4c6d 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -252,4 +252,8 @@ void virtio_pci_types_register(const 
VirtioPCIDeviceTypeInfo *t);
  */
 unsigned virtio_pci_optimal_num_queues(unsigned fixed_queues);
 
+int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy,
+   uint8_t bar, uint64_t offset, uint64_t length,
+   uint8_t id);
+
 #endif
-- 
2.34.1




Re: [PATCH v7 01/14] mm: Add F_SEAL_AUTO_ALLOCATE seal to memfd

2022-08-29 Thread Chao Peng
On Fri, Aug 26, 2022 at 04:19:32PM +0100, Fuad Tabba wrote:
> Hi Chao,
> 
> On Wed, Jul 6, 2022 at 9:25 AM Chao Peng  wrote:
> >
> > Normally, a write to unallocated space of a file or to the hole of a sparse
> > file automatically causes space allocation; for memfd, this equals memory
> > allocation. This new seal prevents such automatic allocation, whether it
> > comes from a direct write() or from a write to a previously mmap-ed area.
> > The seal does not prevent fallocate(), so an explicit fallocate() can still
> > cause allocation and can be used to reserve memory.
> >
> > This is used to prevent unintentional allocation from userspace on a
> > stray or careless write; any intentional allocation should use an
> > explicit fallocate(). One of the main use cases is to avoid double memory
> > allocation for confidential computing usage where we use two memfds to
> > back guest memory and at a single point only one memfd is alive and we
> > want to prevent memory allocation for the other memfd which may have
> > been mmap-ed previously. More discussion can be found at:
> >
> >   https://lkml.org/lkml/2022/6/14/1255
> >
> > Suggested-by: Sean Christopherson 
> > Signed-off-by: Chao Peng 
> > ---
> >  include/uapi/linux/fcntl.h |  1 +
> >  mm/memfd.c |  3 ++-
> >  mm/shmem.c | 16 ++--
> >  3 files changed, 17 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> > index 2f86b2ad6d7e..98bdabc8e309 100644
> > --- a/include/uapi/linux/fcntl.h
> > +++ b/include/uapi/linux/fcntl.h
> > @@ -43,6 +43,7 @@
> >  #define F_SEAL_GROW 0x0004  /* prevent file from growing */
> >  #define F_SEAL_WRITE   0x0008  /* prevent writes */
> >  #define F_SEAL_FUTURE_WRITE 0x0010  /* prevent future writes while 
> > mapped */
> > +#define F_SEAL_AUTO_ALLOCATE   0x0020  /* prevent allocation for writes */
> 
> I think this should also be added to tools/include/uapi/linux/fcntl.h

Yes, thanks.

Chao
> 
> Cheers,
> /fuad
> 
> 
> >  /* (1U << 31) is reserved for signed error codes */
> >
> >  /*
> > diff --git a/mm/memfd.c b/mm/memfd.c
> > index 08f5f8304746..2afd898798e4 100644
> > --- a/mm/memfd.c
> > +++ b/mm/memfd.c
> > @@ -150,7 +150,8 @@ static unsigned int *memfd_file_seals_ptr(struct file 
> > *file)
> >  F_SEAL_SHRINK | \
> >  F_SEAL_GROW | \
> >  F_SEAL_WRITE | \
> > -F_SEAL_FUTURE_WRITE)
> > +F_SEAL_FUTURE_WRITE | \
> > +F_SEAL_AUTO_ALLOCATE)
> >
> >  static int memfd_add_seals(struct file *file, unsigned int seals)
> >  {
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index a6f565308133..6c8aef15a17d 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -2051,6 +2051,8 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
> > struct vm_area_struct *vma = vmf->vma;
> > struct inode *inode = file_inode(vma->vm_file);
> > gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
> > +   struct shmem_inode_info *info = SHMEM_I(inode);
> > +   enum sgp_type sgp;
> > int err;
> > vm_fault_t ret = VM_FAULT_LOCKED;
> >
> > @@ -2113,7 +2115,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
> > spin_unlock(&inode->i_lock);
> > }
> >
> > -   err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
> > +   if (unlikely(info->seals & F_SEAL_AUTO_ALLOCATE))
> > +   sgp = SGP_NOALLOC;
> > +   else
> > +   sgp = SGP_CACHE;
> > +
> > +   err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
> >   gfp, vma, vmf, &ret);
> > if (err)
> > return vmf_error(err);
> > @@ -2459,6 +2466,7 @@ shmem_write_begin(struct file *file, struct 
> > address_space *mapping,
> > struct inode *inode = mapping->host;
> > struct shmem_inode_info *info = SHMEM_I(inode);
> > pgoff_t index = pos >> PAGE_SHIFT;
> > +   enum sgp_type sgp;
> > int ret = 0;
> >
> > /* i_rwsem is held by caller */
> > @@ -2470,7 +2478,11 @@ shmem_write_begin(struct file *file, struct 
> > address_space *mapping,
> > return -EPERM;
> > }
> >
> > -   ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
> > +   if (unlikely(info->seals & F_SEAL_AUTO_ALLOCATE))
> > +   sgp = SGP_NOALLOC;
> > +   else
> > +   sgp = SGP_WRITE;
> > +   ret = shmem_getpage(inode, index, pagep, sgp);
> >
> > if (ret)
> > return ret;
> > --
> > 2.25.1
> >



Re: New feature shout outs for KVM Forum QEMU Status Report

2022-08-29 Thread Stefan Hajnoczi
block/
- VDUSE export was added so qemu-storage-daemon can export disk images
  to the host kernel and guests. See qemu-storage-daemon(1) man page for
  details.

hw/remote/
- vfio-user server support was added so PCI devices can be emulated in a
  separate QEMU process. vfio-user client support is not yet available
  in QEMU so guests are currently unable to connect to the server.




Re: [PATCH 22/51] tests/qtest: qmp-test: Skip running test_qmp_oob for win32

2022-08-29 Thread Markus Armbruster
Bin Meng  writes:

> Hi Markus,
>
> On Mon, Aug 29, 2022 at 9:14 PM Markus Armbruster  wrote:
>>
>> Bin Meng  writes:
>>
>> > From: Bin Meng 
>> >
>> > The test_qmp_oob test case calls mkfifo() which does not exist on
>> > win32. Exclude it.
>> >
>> > Signed-off-by: Bin Meng 
>> > ---
>> >
>> >  tests/qtest/qmp-test.c | 6 ++
>> >  1 file changed, 6 insertions(+)
>> >
>> > diff --git a/tests/qtest/qmp-test.c b/tests/qtest/qmp-test.c
>> > index b950dbafaf..4a165447f8 100644
>> > --- a/tests/qtest/qmp-test.c
>> > +++ b/tests/qtest/qmp-test.c
>> > @@ -159,6 +159,8 @@ static void test_qmp_protocol(void)
>> >  qtest_quit(qts);
>> >  }
>> >
>> > +#ifndef _WIN32
>> > +
>> >  /* Out-of-band tests */
>> >
>> >  char *tmpdir;
>> > @@ -279,6 +281,8 @@ static void test_qmp_oob(void)
>> >  qtest_quit(qts);
>> >  }
>> >
>> > +#endif /* _WIN32 */
>> > +
>> >  /* Preconfig tests */
>> >
>> >  static void test_qmp_preconfig(void)
>> > @@ -338,7 +342,9 @@ int main(int argc, char *argv[])
>> >  g_test_init(&argc, &argv, NULL);
>> >
>> >  qtest_add_func("qmp/protocol", test_qmp_protocol);
>> > +#ifndef _WIN32
>> >  qtest_add_func("qmp/oob", test_qmp_oob);
>> > +#endif
>> >  qtest_add_func("qmp/preconfig", test_qmp_preconfig);
>> >  qtest_add_func("qmp/missing-any-arg", test_qmp_missing_any_arg);
>>
>> I'd appreciate a comment explaining why we have to disable this test on
>> Windows.
>
> The reason is explained in the commit message.

Yes, and putting it there is a good idea.  But I'd appreciate if you
*also* put it in the code, so future readers of the code don't have to
dig through git history.
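
Something along these lines would do (wording is just a suggestion based on
the commit message):

    #ifndef _WIN32
    /*
     * The out-of-band tests use a FIFO created with mkfifo(), which does
     * not exist on win32, so they are excluded there.
     */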




[PATCH] pci: Abort if pci_add_capability fails

2022-08-29 Thread 小田喜陽彦
From: Akihiko Odaki 

pci_add_capability appears in most PCI devices. The error handling required
lots of code, and led to inconsistent behaviors such as:
- passing error_abort
- passing error_fatal
- asserting the returned value
- propagating the error to the caller
- skipping the rest of the function
- just ignoring

pci_add_capability fails if the new capability overlaps with an existing
one. It happens only if the device implementation is wrong, so
pci_add_capability can just abort instead of returning to the caller in
that case, fixing inconsistencies and removing extra code.

Signed-off-by: Akihiko Odaki 
---
 docs/pcie_sriov.txt|  4 +--
 hw/display/bochs-display.c |  4 +--
 hw/i386/amd_iommu.c| 18 ++-
 hw/ide/ich.c   |  8 ++---
 hw/net/e1000e.c| 22 +++--
 hw/net/eepro100.c  |  7 +---
 hw/nvme/ctrl.c | 10 +-
 hw/pci-bridge/cxl_downstream.c |  9 ++
 hw/pci-bridge/cxl_upstream.c   |  9 ++
 hw/pci-bridge/i82801b11.c  | 15 ++---
 hw/pci-bridge/pci_bridge_dev.c |  2 +-
 hw/pci-bridge/pcie_pci_bridge.c| 17 +++---
 hw/pci-bridge/pcie_root_port.c | 16 ++---
 hw/pci-bridge/xio3130_downstream.c | 15 ++---
 hw/pci-bridge/xio3130_upstream.c   | 15 ++---
 hw/pci-host/designware.c   |  3 +-
 hw/pci-host/xilinx-pcie.c  |  5 +--
 hw/pci/msi.c   |  9 +-
 hw/pci/msix.c  |  8 ++---
 hw/pci/pci.c   | 14 +++-
 hw/pci/pci_bridge.c| 22 -
 hw/pci/pcie.c  | 52 --
 hw/pci/shpc.c  | 16 +++--
 hw/pci/slotid_cap.c|  8 ++---
 hw/usb/hcd-xhci-pci.c  |  3 +-
 hw/vfio/pci-quirks.c   | 15 ++---
 hw/vfio/pci.c  | 12 +++
 hw/virtio/virtio-pci.c | 22 -
 include/hw/pci/pci.h   |  5 ++-
 include/hw/pci/pci_bridge.h|  5 ++-
 include/hw/pci/pcie.h  | 11 +++
 include/hw/pci/shpc.h  |  5 ++-
 include/hw/virtio/virtio-pci.h |  2 +-
 33 files changed, 98 insertions(+), 290 deletions(-)

diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt
index 11158dbf88..728a73ba7b 100644
--- a/docs/pcie_sriov.txt
+++ b/docs/pcie_sriov.txt
@@ -49,7 +49,7 @@ setting up a BAR for a VF.
pci_your_pf_dev_realize( ... )
{
   ...
-  int ret = pcie_endpoint_cap_init(d, 0x70);
+  pcie_endpoint_cap_init(d, 0x70);
   ...
   pcie_ari_init(d, 0x100, 1);
   ...
@@ -79,7 +79,7 @@ setting up a BAR for a VF.
pci_your_vf_dev_realize( ... )
{
   ...
-  int ret = pcie_endpoint_cap_init(d, 0x60);
+  pcie_endpoint_cap_init(d, 0x60);
   ...
   pcie_ari_init(d, 0x100, 1);
   ...
diff --git a/hw/display/bochs-display.c b/hw/display/bochs-display.c
index 8ed734b195..111cabcfb3 100644
--- a/hw/display/bochs-display.c
+++ b/hw/display/bochs-display.c
@@ -265,7 +265,6 @@ static void bochs_display_realize(PCIDevice *dev, Error 
**errp)
 {
 BochsDisplayState *s = BOCHS_DISPLAY(dev);
 Object *obj = OBJECT(dev);
-int ret;
 
 if (s->vgamem < 4 * MiB) {
 error_setg(errp, "bochs-display: video memory too small");
@@ -302,8 +301,7 @@ static void bochs_display_realize(PCIDevice *dev, Error 
**errp)
 }
 
 if (pci_bus_is_express(pci_get_bus(dev))) {
-ret = pcie_endpoint_cap_init(dev, 0x80);
-assert(ret > 0);
+pcie_endpoint_cap_init(dev, 0x80);
 } else {
 dev->cap_present &= ~QEMU_PCI_CAP_EXPRESS;
 }
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 725f69095b..256ecba1c3 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1553,23 +1553,11 @@ static void amdvi_sysbus_realize(DeviceState *dev, 
Error **errp)
 if (!qdev_realize(DEVICE(&s->pci), &bus->qbus, errp)) {
 return;
 }
-ret = pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0,
- AMDVI_CAPAB_SIZE, errp);
-if (ret < 0) {
-return;
-}
+pci_add_capability(&s->pci.dev, AMDVI_CAPAB_ID_SEC, 0, AMDVI_CAPAB_SIZE);
 s->capab_offset = ret;
 
-ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0,
- AMDVI_CAPAB_REG_SIZE, errp);
-if (ret < 0) {
-return;
-}
-ret = pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0,
- AMDVI_CAPAB_REG_SIZE, errp);
-if (ret < 0) {
-return;
-}
+pci_add_capability(&s->pci.dev, PCI_CAP_ID_MSI, 0, AMDVI_CAPAB_REG_SIZE);
+pci_add_capability(&s->pci.dev, PCI_CAP_ID_HT, 0, AMDVI_CAPAB_REG_SIZE);
 
 /* Pseudo address space under root PCI bus. */
 x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID);
diff --git a/hw/ide/ich.c b/hw/ide/ich.c
index 1007a51fcb..7349faa78f 100644
--- 

[PATCH 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Ricky Zhou
Many instructions which load/store 128-bit values are supposed to
raise #GP when the memory operand isn't 16-byte aligned. This includes:
 - Instructions explicitly requiring memory alignment (Exceptions Type 1
   in the "AVX and SSE Instruction Exception Specification" section of
   the SDM)
 - Legacy SSE instructions that load/store 128-bit values (Exceptions
   Types 2 and 4).

This change adds a raise_gp_if_unaligned helper which raises #GP if an
address is not properly aligned. This helper is called before 128-bit
loads/stores where appropriate.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/217
Signed-off-by: Ricky Zhou 
---
 target/i386/helper.h |  1 +
 target/i386/tcg/mem_helper.c |  8 
 target/i386/tcg/translate.c  | 38 +---
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/target/i386/helper.h b/target/i386/helper.h
index ac3b4d1ee3..17d78f2b0d 100644
--- a/target/i386/helper.h
+++ b/target/i386/helper.h
@@ -213,6 +213,7 @@ DEF_HELPER_1(update_mxcsr, void, env)
 DEF_HELPER_1(enter_mmx, void, env)
 DEF_HELPER_1(emms, void, env)
 DEF_HELPER_3(movq, void, env, ptr, ptr)
+DEF_HELPER_3(raise_gp_if_unaligned, void, env, tl, tl)
 
 #define SHIFT 0
 #include "ops_sse_header.h"
diff --git a/target/i386/tcg/mem_helper.c b/target/i386/tcg/mem_helper.c
index e3cdafd2d4..79259abef3 100644
--- a/target/i386/tcg/mem_helper.c
+++ b/target/i386/tcg/mem_helper.c
@@ -181,3 +181,11 @@ void helper_boundl(CPUX86State *env, target_ulong a0, int 
v)
 raise_exception_ra(env, EXCP05_BOUND, GETPC());
 }
 }
+
+void helper_raise_gp_if_unaligned(CPUX86State *env, target_ulong addr,
+  target_ulong align_mask)
+{
+if (unlikely((addr & align_mask) != 0)) {
+raise_exception_ra(env, EXCP0D_GPF, GETPC());
+}
+}
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index b7972f0ff5..de13f483b6 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -3054,7 +3054,7 @@ static const struct SSEOpHelper_epp sse_op_table6[256] = {
 [0x25] = SSE41_OP(pmovsxdq),
 [0x28] = SSE41_OP(pmuldq),
 [0x29] = SSE41_OP(pcmpeqq),
-[0x2a] = SSE41_SPECIAL, /* movntqda */
+[0x2a] = SSE41_SPECIAL, /* movntdqa */
 [0x2b] = SSE41_OP(packusdw),
 [0x30] = SSE41_OP(pmovzxbw),
 [0x31] = SSE41_OP(pmovzxbd),
@@ -3194,10 +3194,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 break;
 case 0x1e7: /* movntdq */
 case 0x02b: /* movntps */
-case 0x12b: /* movntps */
+case 0x12b: /* movntpd */
 if (mod == 3)
 goto illegal_op;
 gen_lea_modrm(env, s, modrm);
+gen_helper_raise_gp_if_unaligned(cpu_env, s->A0, 
tcg_const_tl(0xf));
 gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
 break;
 case 0x3f0: /* lddqu */
@@ -3273,6 +3274,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 case 0x26f: /* movdqu xmm, ea */
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
+/* movaps, movapd, movdqa */
+if (b == 0x028 || b == 0x128 || b == 0x16f) {
+gen_helper_raise_gp_if_unaligned(cpu_env, s->A0,
+ tcg_const_tl(0xf));
+}
 gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
 } else {
 rm = (modrm & 7) | REX_B(s);
@@ -3331,6 +3337,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 case 0x212: /* movsldup */
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
+if (!(s->prefix & PREFIX_VEX)) {
+gen_helper_raise_gp_if_unaligned(cpu_env, s->A0,
+ tcg_const_tl(0xf));
+}
 gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
 } else {
 rm = (modrm & 7) | REX_B(s);
@@ -3373,6 +3383,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 case 0x216: /* movshdup */
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
+if (!(s->prefix & PREFIX_VEX)) {
+gen_helper_raise_gp_if_unaligned(cpu_env, s->A0,
+ tcg_const_tl(0xf));
+}
 gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg]));
 } else {
 rm = (modrm & 7) | REX_B(s);
@@ -3465,6 +3479,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, 
int b,
 case 0x27f: /* movdqu ea, xmm */
 if (mod != 3) {
 gen_lea_modrm(env, s, modrm);
+if (b == 0x029 || b == 0x129 || b == 0x17f) {
+

[PATCH 0/1] target/i386: Raise #GP on unaligned m128 accesses when required.

2022-08-29 Thread Ricky Zhou
This is a change to raise #GP on unaligned m128 loads/stores when
required by the spec. Some notes on this change:

1. I considered making use of the existing support for enforcing memory
   alignment (setting MO_ALIGN_16 in the load/store's MemOp), but
   rejected this approach. There are at least two scenarios where we
   might want to do alignment checks in x86:
   
   a. Loads/stores when the AC flag is enabled (which should raise #AC
  on misalignment)
   b. SSE/AVX instructions which require memory alignment (which raise
  #GP on misalignment)
   
   The MemOp alignment checking mechanism can only handle one of these
   scenarios, since they require different exceptions to be raised. I
   think it makes more sense to use the existing memory alignment support
   for implementing (a), since helper_unaligned_{ld,st} already
   triggers SIGBUS in qemu-user. This is why I ended up implementing (b)
   with a helper.

2. It is often the case that legacy SSE instructions require 16 byte
   alignment of 128-bit memory operands, but AVX versions of the
   instructions do not (e.g. movsldup requires alignment and vmovsldup
   does not). From what I can tell, QEMU currently doesn't appear to
   report AVX support in cpuid, but it still seems to emulate some of
   these instructions if you tell it to execute them. This change
   attempts to distinguish between legacy SSE instructions and AVX
   instructions by conditioning on !(s->prefix & PREFIX_VEX). Not
   sure this is very future-proof though - for example, it may need to
   be updated if support for EVEX prefixes is added. LMK if there's a
   nicer way to do this.

3. I tested this by running a Linux VM in qemu-system-x86_64 and
   verifying that movaps on a misaligned address triggers a segfault.
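
   For reference, a standalone guest-side test along these lines reproduces
   the fault (a sketch only, assuming an x86-64 compiler and that the guest
   kernel delivers the #GP as SIGSEGV):

       /* build with: gcc -O2 -o movaps-misaligned movaps-misaligned.c */
       #include <stdint.h>
       #include <stdio.h>

       int main(void)
       {
           /* 32 aligned bytes so that buf + 1 is guaranteed misaligned */
           static uint8_t buf[32] __attribute__((aligned(16)));
           uint8_t *p = buf + 1;

           /* Legacy-SSE movaps requires a 16-byte-aligned memory operand,
            * so this must raise #GP (seen as SIGSEGV in the guest). */
           __asm__ volatile("movaps (%0), %%xmm0" : : "r"(p) : "xmm0");

           printf("no fault: alignment was not enforced\n");
           return 0;
       }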

Ricky Zhou (1):
  target/i386: Raise #GP on unaligned m128 accesses when required.

 target/i386/helper.h |  1 +
 target/i386/tcg/mem_helper.c |  8 
 target/i386/tcg/translate.c  | 38 +---
 3 files changed, 44 insertions(+), 3 deletions(-)

-- 
2.37.2




Re: [PATCH 22/51] tests/qtest: qmp-test: Skip running test_qmp_oob for win32

2022-08-29 Thread Bin Meng
Hi Markus,

On Mon, Aug 29, 2022 at 9:14 PM Markus Armbruster  wrote:
>
> Bin Meng  writes:
>
> > From: Bin Meng 
> >
> > The test_qmp_oob test case calls mkfifo() which does not exist on
> > win32. Exclude it.
> >
> > Signed-off-by: Bin Meng 
> > ---
> >
> >  tests/qtest/qmp-test.c | 6 ++
> >  1 file changed, 6 insertions(+)
> >
> > diff --git a/tests/qtest/qmp-test.c b/tests/qtest/qmp-test.c
> > index b950dbafaf..4a165447f8 100644
> > --- a/tests/qtest/qmp-test.c
> > +++ b/tests/qtest/qmp-test.c
> > @@ -159,6 +159,8 @@ static void test_qmp_protocol(void)
> >  qtest_quit(qts);
> >  }
> >
> > +#ifndef _WIN32
> > +
> >  /* Out-of-band tests */
> >
> >  char *tmpdir;
> > @@ -279,6 +281,8 @@ static void test_qmp_oob(void)
> >  qtest_quit(qts);
> >  }
> >
> > +#endif /* _WIN32 */
> > +
> >  /* Preconfig tests */
> >
> >  static void test_qmp_preconfig(void)
> > @@ -338,7 +342,9 @@ int main(int argc, char *argv[])
> >  g_test_init(&argc, &argv, NULL);
> >
> >  qtest_add_func("qmp/protocol", test_qmp_protocol);
> > +#ifndef _WIN32
> >  qtest_add_func("qmp/oob", test_qmp_oob);
> > +#endif
> >  qtest_add_func("qmp/preconfig", test_qmp_preconfig);
> >  qtest_add_func("qmp/missing-any-arg", test_qmp_missing_any_arg);
>
> I'd appreciate a comment explaining why we have to disable this test on
> Windows.

The reason is explained in the commit message.

Regards,
Bin



Re: [RFC PATCH v2 0/8] qapi: add generator for Golang interface

2022-08-29 Thread Victor Toso
Hi,

On Mon, Aug 29, 2022 at 01:53:51PM +0200, Markus Armbruster wrote:
> Victor Toso  writes:
>
> > Hi,
> >
> > On Tue, Jul 05, 2022 at 08:46:34AM -0700, Andrea Bolognani wrote:
> >> I've commented in detail to the single patches, just a couple of
> >> additional points.
> >>
> >> On Fri, Jun 17, 2022 at 02:19:24PM +0200, Victor Toso wrote:
> >> > * 7) Flat structs by removing embed types. Discussion with Andrea
> >> >  Thread: 
> >> > https://lists.gnu.org/archive/html/qemu-devel/2022-05/msg01590.html
> >> >
> >> >  No one required it but I decided to give it a try. Major issue that
> >> >  I see with this approach is to have generated a few 'Base' structs
> >> >  that are now useless. Overall, less nested structs seems better to
> >> >  me. Opnions?
> >> >
> >> >  Example:
> >> >   | /* This is now useless, should be removed? */
> >> >   | type InetSocketAddressBase struct {
> >> >   | Host string `json:"host"`
> >> >   | Port string `json:"port"`
> >> >   | }
> >>
> >> Can we somehow keep track, in the generator, of types that are
> >> only used as building blocks for other types, and prevent them
> >> from showing up in the generated code?
> >
> > I'm not 100% sure it is good to remove them from generated code
> > because technically it is a valid qapi type. If all @base types
> > are embed types and they don't show in other way or form, sure we
> > can remove them from generated code... I'm not sure if it is
> > possible to guarantee this.
> >
> > But yes, if possible, I'd like to remove what seems useless type
> > definitions.
>
> The existing C generators have to generate all the types, because the
> generated code is for QEMU's own use, where we need all the types.
>
> The existing introspection generator generates only the types visible in
> QAPI/QMP introspection.
>
> The former generate for internal use (where we want all the types), and
> the latter for external use (where only the types visible in the
> external interface are actually useful).

My doubts are about types that might be okay to hide because
they are embedded in other types, like InetSocketAddressBase.

Note that what I mean by the struct being embedded is that the
actual fields of InetSocketAddressBase are added to the type
which uses it, like InetSocketAddress.

  | type InetSocketAddressBase struct {
  | Host string `json:"host"`
  | Port string `json:"port"`
  | }
  |
  | type InetSocketAddress struct {
  | // Base fields
  | Host string `json:"host"`
  | Port string `json:"port"`
  |
  | Numeric   *bool   `json:"numeric,omitempty"`
  | To*uint16 `json:"to,omitempty"`
  | Ipv4  *bool   `json:"ipv4,omitempty"`
  | Ipv6  *bool   `json:"ipv6,omitempty"`
  | KeepAlive *bool   `json:"keep-alive,omitempty"`
  | Mptcp *bool   `json:"mptcp,omitempty"`
  | }

Andrea's suggestion is to have the generator track whether a given
type is always embedded, in which case we can skip generating it in
the Go module.

I think that could work indeed. In the hypothetical case that a
hidden struct like InetSocketAddressBase becomes a command parameter
in the future, the generator would know and start generating this
type from that point onwards.

> >> Finally, looking at the repository containing the generated
> >> code I see that the generated type are sorted by kind, e.g. all
> >> unions are in a file, all events in another one and so on. I
> >> believe the structure should match more closely that of the
> >> QAPI schema, so e.g.  block-related types should all go in one
> >> file, net-related types in another one and so on.
> >
> > That's something I don't mind adding but some hardcoded mapping
> > is needed. If you look into git history of qapi/ folder, .json
> > files can come and go, types be moved around, etc. So, we need to
> > proper map types in a way that the generated code would be kept
> > stable even if qapi files would have been rearranged. What I
> > proposed was only the simplest solution.
> >
> > Also, the generator takes a qapi-schema.json as input. We are
> > more focused in qemu/qapi/qapi-schema.json generated coded but
> > would not hurt to think we could even use it for qemu-guest-agent
> > from qemu/qga/qapi-schema.json -- this to say that the hardcoded
> > mapping needs to take into account non qemu qapi schemas too.
>
> In the beginning, the QAPI schema was monolithic.
> qga/qapi-schema.json still is.
>
> When keeping everything in a single qapi-schema.json became
> unwieldy, we split it into "modules" tied together with a
> simple include directive.  Generated code remained monolithic.

> When monolithic generated code became too annoying (touch
> schema, recompile everything), we made it match the module
> structure: code for FOO.json goes into *-FOO.c and *-FOO.h,
> where the *-FOO.h #include the generated headers for the .json
> modules FOO.json includes.
>
> Schema code motion hasn't been much of a 

Re: [RFC PATCH v2 2/8] qapi: golang: Generate qapi's alternate types in Go

2022-08-29 Thread Victor Toso
Hi,

On Mon, Aug 29, 2022 at 01:27:06PM +0200, Markus Armbruster wrote:
> Victor Toso  writes:
>
> > Hi,
> >
> > On Fri, Aug 19, 2022 at 11:27:13AM -0500, Andrea Bolognani wrote:
> >> On Wed, Aug 17, 2022 at 04:04:19PM +0200, Victor Toso wrote:
> >> > On Tue, Jul 05, 2022 at 08:45:06AM -0700, Andrea Bolognani wrote:
> >> > > On Fri, Jun 17, 2022 at 02:19:26PM +0200, Victor Toso wrote:
> >> > > > func (s *BlockdevRef) UnmarshalJSON(data []byte) error {
> >> > > > // Check for json-null first
> >> > > > if string(data) == "null" {
> >> > > > return errors.New(`null not supported for BlockdevRef`)
> >> > > > }
> >> > > > // Check for BlockdevOptions
> >> > > > {
> >> > > > s.Definition = new(BlockdevOptions)
> >> > > > if err := StrictDecode(s.Definition, data); err == nil {
> >> > > > return nil
> >> > > > }
> >> > >
> >> > > The use of StrictDecode() here means that we won't be able to
> >> > > parse an alternate produced by a version of QEMU where
> >> > > BlockdevOptions has gained additional fields, doesn't it?
> >> >
> >> > That's correct. This means that with this RFCv2 proposal, qapi-go
> >> > based on qemu version 7.1 might not be able to decode a qmp
> >> > message from qemu version 7.2 if it has introduced a new field.
> >> >
> >> > This needs fixing, not sure yet the way to go.
> >> >
> >> > > Considering that we will happily parse such a BlockdevOptions
> >> > > outside of the context of BlockdevRef, I think we should be
> >> > > consistent and allow the same to happen here.
> >> >
> >> > StrictDecode is only used with alternates because, unlike unions,
> >> > Alternate types don't have a 'discriminator' field that would
> >> > allow us to know what data type to expect.
> >> >
> >> > With this in mind, theoretically speaking, we could have very
> >> > similar struct types as Alternate fields and we have to find on
> >> > runtime which type is that underlying byte stream.
> >> >
> >> > So, to reply to your suggestion, if we allow BlockdevRef without
> >> > StrictDecode we might find ourselves in a situation that it
> >> > matched a few fields of BlockdevOptions but it the byte stream
> >> > was actually another type.
> >>
> >> IIUC your concern is that the QAPI schema could gain a new
> >> type, TotallyNotBlockdevOptions, which looks exactly like
> >> BlockdevOptions except for one or more extra fields.
> >>
> >> If QEMU then produced a JSON like
> >>
> >>   { "description": { /* a TotallyNotBlockdevOptions here */ } }
> >>
> >> and we'd try to deserialize it with Go code like
> >>
> >>   ref := BlockdevRef{}
> >>   json.Unmarsal()
> >>
> >> we'd end up mistakenly parsing the TotallyNotBlockdevOptions as a
> >> valid BlockdevOptions, dropping the extra fields in the process.
> >>
> >> Does that correctly describe the reason why you feel that the use of
> >> StrictDecode is necessary?
> >
> > Not quite. The problem here is related to the Alternate types of
> > the QAPI specification [0], just to name a simple in-use example,
> > BlockdevRefOrNul [1].
> >
> > [0] 
> > https://gitlab.com/qemu-project/qemu/-/blob/master/docs/devel/qapi-code-gen.rst?plain=1#L387
> > [1] 
> > https://gitlab.com/qemu-project/qemu/-/blob/master/qapi/block-core.json#L4349
> >
> > To exemplify the problem that I try to solve with StrictDecode,
> > let's say there is a DeviceRef alternate type that looks like:
> >
> > { 'alternate': 'DeviceRef',
> >   'data': { 'memory': 'BlockdevRefInMemory',
> > 'disk': 'BlockdevRefInDisk',
> > 'cloud': 'BlockdevRefInCloud' } }
> >
> > Just a quick recap, at runtime we don't have data's payload name
> > (e.g: disk).  We need to check the actual data and try to find
> > what is the payload type.
> >
> > type BlockdevRefInMemory struct {
> > Name  *string
> > Size  uint64
> > Start uint64
> > End   uint64
> > }
> >
> > type BlockdevRefInDisk struct {
> > Name  *string
> > Size  uint64
> > Path  *string
> > }
> >
> > type BlockdevRefInCloud struct {
> > Name  *string
> > Size  uint64
> > Uri   *string
> > }
> >
> > All types have unique fields but they all share some fields too.
>
> Quick intercession (I merely skimmed the review thread; forgive me if
> it's not useful or not new):
>
> An alternate type is like a union type, except there is no
> discriminator on the wire.  Instead, the branch to use is inferred
> from the value.  An alternate can only express a choice between types
> represented differently on the wire.
>
> This is docs/devel/qapi-code-gen.rst.  Implied there: the inference is
> based on the JSON type *only*, i.e. no two branches can have the same
> JSON type on the wire.  Since all complex types (struct or union) are
> JSON object on the wire, at most one alternate branch can be of complex
> type.

Ah, I've missed this bit. Thank you, it does make it much
simpler.

> More sophisticated inference would be possible if we need it.
> So 

Re: [PATCH 22/51] tests/qtest: qmp-test: Skip running test_qmp_oob for win32

2022-08-29 Thread Markus Armbruster
Bin Meng  writes:

> From: Bin Meng 
>
> The test_qmp_oob test case calls mkfifo() which does not exist on
> win32. Exclude it.
>
> Signed-off-by: Bin Meng 
> ---
>
>  tests/qtest/qmp-test.c | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/tests/qtest/qmp-test.c b/tests/qtest/qmp-test.c
> index b950dbafaf..4a165447f8 100644
> --- a/tests/qtest/qmp-test.c
> +++ b/tests/qtest/qmp-test.c
> @@ -159,6 +159,8 @@ static void test_qmp_protocol(void)
>  qtest_quit(qts);
>  }
>  
> +#ifndef _WIN32
> +
>  /* Out-of-band tests */
>  
>  char *tmpdir;
> @@ -279,6 +281,8 @@ static void test_qmp_oob(void)
>  qtest_quit(qts);
>  }
>  
> +#endif /* _WIN32 */
> +
>  /* Preconfig tests */
>  
>  static void test_qmp_preconfig(void)
> @@ -338,7 +342,9 @@ int main(int argc, char *argv[])
>  g_test_init(&argc, &argv, NULL);
>  
>  qtest_add_func("qmp/protocol", test_qmp_protocol);
> +#ifndef _WIN32
>  qtest_add_func("qmp/oob", test_qmp_oob);
> +#endif
>  qtest_add_func("qmp/preconfig", test_qmp_preconfig);
>  qtest_add_func("qmp/missing-any-arg", test_qmp_missing_any_arg);

I'd appreciate a comment explaining why we have to disable this test on
Windows.




Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-08-29 Thread Sam Li
Sam Li  于2022年8月29日周一 20:53写道:
>
> By adding zone management operations in BlockDriver, storage controller
> emulation can use the new block layer APIs including Report Zone and
> four zone management operations (open, close, finish, reset).
>
> Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
> zone_close(zc), zone_reset(zrs), zone_finish(zf).
>
> For example, to test zone_report, use the following command:
> $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
> -c "zrp offset nr_zones"
>
> Signed-off-by: Sam Li 
> Reviewed-by: Hannes Reinecke 
> ---
>  block/block-backend.c |  51 +
>  block/file-posix.c| 326 +-
>  block/io.c|  41 
>  include/block/block-io.h  |   7 +
>  include/block/block_int-common.h  |  21 ++
>  include/block/raw-aio.h   |   6 +-
>  include/sysemu/block-backend-io.h |  17 ++
>  meson.build   |   1 +
>  qapi/block-core.json  |   8 +-
>  qemu-io-cmds.c| 143 +
>  10 files changed, 617 insertions(+), 4 deletions(-)
>
> diff --git a/block/block-backend.c b/block/block-backend.c
> index d4a5df2ac2..c5798651df 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1775,6 +1775,57 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>  return ret;
>  }
>
> +/*
> + * Send a zone_report command.
> + * offset is a byte offset from the start of the device. No alignment
> + * required for offset.
> + * nr_zones represents IN maximum and OUT actual.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +unsigned int *nr_zones,
> +BlockZoneDescriptor *zones)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk); /* increase before waiting */
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +blk_dec_in_flight(blk);
> +return -ENOMEDIUM;
> +}
> +ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
> +/*
> + * Send a zone_management command.
> + * op is the zone operation.
> + * offset is the starting zone specified as a sector offset.
> + * len is the maximum number of sectors the command should operate on. It
> + * should be aligned with the zone sector size.
> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +int64_t offset, int64_t len)
> +{
> +int ret;
> +IO_CODE();
> +
> +
> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +
> +ret = blk_check_byte_request(blk, offset, len);
> +if (ret < 0) {
> +return ret;
> +}
> +
> +ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 0a8b4b426e..e3efba6db7 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,9 @@
>  #include 
>  #include 
>  #include 
> +#if defined(CONFIG_BLKZONED)
> +#include 
> +#endif
>  #include 
>  #include 
>  #include 
> @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
>  PreallocMode prealloc;
>  Error **errp;
>  } truncate;
> +struct {
> +unsigned int *nr_zones;
> +BlockZoneDescriptor *zones;
> +} zone_report;
> +struct {
> +unsigned long zone_op;
> +} zone_mgmt;
>  };
>  } RawPosixAIOData;
>
> @@ -1339,7 +1349,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  #endif
>
>  if (bs->sg || S_ISBLK(st.st_mode)) {
> -int ret = hdev_get_max_hw_transfer(s->fd, &st);
> +ret = hdev_get_max_hw_transfer(s->fd, &st);
>
>  if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
>  bs->bl.max_hw_transfer = ret;
> @@ -1356,6 +1366,27 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  zoned = BLK_Z_NONE;
>  }
>  bs->bl.zoned = zoned;
> +if (zoned != BLK_Z_NONE) {
> +ret = get_sysfs_long_val(, "chunk_sectors");
> +if (ret > 0) {
> +bs->bl.zone_sectors = ret;
> +}
> +
> +ret = get_sysfs_long_val(, "zone_append_max_bytes");
> +if (ret > 0) {
> +bs->bl.zone_append_max_bytes = ret;
> +}
> +
> +ret = get_sysfs_long_val(, "max_open_zones");
> +if (ret >= 0) {
> +bs->bl.max_open_zones = ret;
> +}
> +
> +ret = get_sysfs_long_val(, "max_active_zones");
> +if (ret >= 0) {
> +bs->bl.max_active_zones = ret;
> +}
> +}
>  }
>
>  static int check_for_dasd(int fd)
> @@ -1850,6 +1881,136 @@ static off_t copy_file_range(int in_fd, 

[PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-08-29 Thread Sam Li
By adding zone management operations in BlockDriver, storage controller
emulation can use the new block layer APIs including Report Zone and
four zone management operations (open, close, finish, reset).

Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
zone_close(zc), zone_reset(zrs), zone_finish(zf).

For example, to test zone_report, use the following command:
$ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0
-c "zrp offset nr_zones"

Signed-off-by: Sam Li 
Reviewed-by: Hannes Reinecke 
---
 block/block-backend.c |  51 +
 block/file-posix.c| 326 +-
 block/io.c|  41 
 include/block/block-io.h  |   7 +
 include/block/block_int-common.h  |  21 ++
 include/block/raw-aio.h   |   6 +-
 include/sysemu/block-backend-io.h |  17 ++
 meson.build   |   1 +
 qapi/block-core.json  |   8 +-
 qemu-io-cmds.c| 143 +
 10 files changed, 617 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index d4a5df2ac2..c5798651df 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1775,6 +1775,57 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
 return ret;
 }
 
+/*
+ * Send a zone_report command.
+ * offset is a byte offset from the start of the device. No alignment
+ * required for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+unsigned int *nr_zones,
+BlockZoneDescriptor *zones)
+{
+int ret;
+IO_CODE();
+
+blk_inc_in_flight(blk); /* increase before waiting */
+blk_wait_while_drained(blk);
+if (!blk_is_available(blk)) {
+blk_dec_in_flight(blk);
+return -ENOMEDIUM;
+}
+ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
+blk_dec_in_flight(blk);
+return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * op is the zone operation.
+ * offset is the starting zone specified as a sector offset.
+ * len is the maximum number of sectors the command should operate on. It
+ * should be aligned with the zone sector size.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
+int64_t offset, int64_t len)
+{
+int ret;
+IO_CODE();
+
+
+blk_inc_in_flight(blk);
+blk_wait_while_drained(blk);
+
+ret = blk_check_byte_request(blk, offset, len);
+if (ret < 0) {
+return ret;
+}
+
+ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
+blk_dec_in_flight(blk);
+return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/block/file-posix.c b/block/file-posix.c
index 0a8b4b426e..e3efba6db7 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -67,6 +67,9 @@
 #include 
 #include 
 #include 
+#if defined(CONFIG_BLKZONED)
+#include 
+#endif
 #include 
 #include 
 #include 
@@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
 PreallocMode prealloc;
 Error **errp;
 } truncate;
+struct {
+unsigned int *nr_zones;
+BlockZoneDescriptor *zones;
+} zone_report;
+struct {
+unsigned long zone_op;
+} zone_mgmt;
 };
 } RawPosixAIOData;
 
@@ -1339,7 +1349,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 #endif
 
 if (bs->sg || S_ISBLK(st.st_mode)) {
-int ret = hdev_get_max_hw_transfer(s->fd, &st);
+ret = hdev_get_max_hw_transfer(s->fd, &st);
 
 if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
 bs->bl.max_hw_transfer = ret;
@@ -1356,6 +1366,27 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 zoned = BLK_Z_NONE;
 }
 bs->bl.zoned = zoned;
+if (zoned != BLK_Z_NONE) {
+ret = get_sysfs_long_val(, "chunk_sectors");
+if (ret > 0) {
+bs->bl.zone_sectors = ret;
+}
+
+ret = get_sysfs_long_val(, "zone_append_max_bytes");
+if (ret > 0) {
+bs->bl.zone_append_max_bytes = ret;
+}
+
+ret = get_sysfs_long_val(, "max_open_zones");
+if (ret >= 0) {
+bs->bl.max_open_zones = ret;
+}
+
+ret = get_sysfs_long_val(, "max_active_zones");
+if (ret >= 0) {
+bs->bl.max_active_zones = ret;
+}
+}
 }
 
 static int check_for_dasd(int fd)
@@ -1850,6 +1881,136 @@ static off_t copy_file_range(int in_fd, off_t *in_off, 
int out_fd,
 }
 #endif
 
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+#if defined(CONFIG_BLKZONED)
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+  const struct blk_zone *blkz) {
+zone->start = blkz->start;
+zone->length = blkz->len;
+zone->cap = 

Re: [PATCH 0/1] Update vfio-user module to the latest

2022-08-29 Thread Thomas Huth

On 07/08/2022 12.39, John Levon wrote:

On Fri, Aug 05, 2022 at 09:24:56AM +0100, Daniel P. Berrangé wrote:

[...]

If we do add something as a submodule for some reason, I'd like us to
say upfront that this is for a fixed time period (ie maximum of 3
releases aka 1 year) only after which we'll remove it no matter what.


I'm not clear on the costs of having the submodule: could you please explain why
it's an issue exactly?


Some reasoning can be found here:


https://lore.kernel.org/qemu-devel/d7a7b28f-a665-2567-0fb6-e31e7ecbb...@redhat.com/


I can only think of the flaky test issue (for which I'm
sorry).


Speaking of that test issue, yes, it would be good to get this patch 
included now as soon as the 7.1 release has been done. Who's going to send a 
pull request for this?


 Thomas




[PATCH] tests/avocado/migration: Get find_free_port() from the ports

2022-08-29 Thread Thomas Huth
In upstream Avocado, the find_free_port() function is not available
from "network" anymore, but must be used via "ports", see:

 https://github.com/avocado-framework/avocado/commit/22fc98c6ff76cc55c48

To be able to update to a newer Avocado version later, let's use
the new way for accessing the find_free_port() function here.

Signed-off-by: Thomas Huth 
---
 tests/avocado/migration.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/avocado/migration.py b/tests/avocado/migration.py
index 584d6ef53f..4b25680c50 100644
--- a/tests/avocado/migration.py
+++ b/tests/avocado/migration.py
@@ -14,7 +14,7 @@
 from avocado_qemu import QemuSystemTest
 from avocado import skipUnless
 
-from avocado.utils import network
+from avocado.utils.network import ports
 from avocado.utils import wait
 from avocado.utils.path import find_command
 
@@ -57,7 +57,7 @@ def do_migrate(self, dest_uri, src_uri=None):
 self.assert_migration(source_vm, dest_vm)
 
 def _get_free_port(self):
-port = network.find_free_port()
+port = ports.find_free_port()
 if port is None:
 self.cancel('Failed to find a free port')
 return port
-- 
2.31.1




Re: [PATCH v5 18/18] s390x: pv: Add dump support

2022-08-29 Thread Thomas Huth

On 11/08/2022 14.11, Janosch Frank wrote:

Sometimes dumping a guest from the outside is the only way to get the
data that is needed. This can be the case if a dumping mechanism like
KDUMP hasn't been configured or data needs to be fetched at a specific
point. Dumping a protected guest from the outside without help from
fw/hw doesn't yield sufficient data to be useful. Hence we now
introduce PV dump support.

The PV dump support works by integrating the firmware into the dump
process. New Ultravisor calls are used to initiate the dump process,
dump cpu data, dump memory state and lastly complete the dump process.
The UV calls are exposed by KVM via the new KVM_PV_DUMP command and
its subcommands. The guest's data is fully encrypted and can only be
decrypted by the entity that owns the customer communication key for
the dumped guest. Also dumping needs to be allowed via a flag in the
SE header.

On the QEMU side of things we store the PV dump data in the newly
introduced architecture ELF sections (storage state and completion
data) and the cpu notes (for cpu dump data).

Users can use the zgetdump tool to convert the encrypted QEMU dump to an
unencrypted one.

Signed-off-by: Janosch Frank 
---

[...]

@@ -207,22 +226,41 @@ static int s390x_write_elf64_notes(const char *note_name,
 DumpState *s,
 const NoteFuncDesc *funcs)
  {
-Note note;
+Note note, *notep;
  const NoteFuncDesc *nf;
-int note_size;
+int note_size, content_size;
  int ret = -1;
  
  assert(strlen(note_name) < sizeof(note.name));
  
  for (nf = funcs; nf->note_contents_func; nf++) {

-memset(&note, 0, sizeof(note));
-note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
-note.hdr.n_descsz = cpu_to_be32(nf->contents_size);
-g_strlcpy(note.name, note_name, sizeof(note.name));
-(*nf->note_contents_func)(&note, cpu, id);
+notep = &note;
+if (nf->pvonly && !s390_is_pv()) {
+continue;
+}
  
-note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size;

-ret = f(&note, note_size, s);
+content_size = nf->contents_size ? nf->contents_size : 
nf->note_size_func();
+note_size = sizeof(note) - sizeof(notep->contents) + content_size;
+
+/* Notes with dynamic sizes need to allocate a note */
+if (nf->note_size_func) {
+notep = g_malloc0(note_size);


Either use g_malloc() here (without the trailing "0") ...


+}
+
+memset(notep, 0, sizeof(note));


... or put the memset() in an "else" block.


+/* Setup note header data */
+notep->hdr.n_descsz = cpu_to_be32(content_size);
+notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1);
+g_strlcpy(notep->name, note_name, sizeof(notep->name));
+
+/* Get contents and write them out */
+(*nf->note_contents_func)(notep, cpu, id);
+ret = f(notep, note_size, s);
+
+if (nf->note_size_func) {
+g_free(notep);
+}
  
  if (ret < 0) {

  return -1;
@@ -247,12 +285,161 @@ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, 
CPUState *cs,
  return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux);
  }
  
+/* PV dump section size functions */

+static uint64_t get_dump_stor_state_size_from_len(uint64_t len)
+{
+return (len / (1 << 20)) * kvm_s390_pv_dmp_get_size_stor_state();


Use "MiB" instead of "1 << 20" ?


+}
+
+static uint64_t get_size_stor_state(DumpState *s)
+{
+return get_dump_stor_state_size_from_len(s->total_size);
+}


 Thomas




Re: [RFC PATCH v2 0/8] qapi: add generator for Golang interface

2022-08-29 Thread Markus Armbruster
Victor Toso  writes:

> Hi,
>
> On Tue, Jul 05, 2022 at 08:46:34AM -0700, Andrea Bolognani wrote:
>> I've commented in detail to the single patches, just a couple of
>> additional points.
>>
>> On Fri, Jun 17, 2022 at 02:19:24PM +0200, Victor Toso wrote:
>> > * 7) Flat structs by removing embed types. Discussion with Andrea
>> >  Thread: 
>> > https://lists.gnu.org/archive/html/qemu-devel/2022-05/msg01590.html
>> >
>> >  No one required it but I decided to give it a try. Major issue that
>> >  I see with this approach is to have generated a few 'Base' structs
>> >  that are now useless. Overall, less nested structs seems better to
>> >  me. Opnions?
>> >
>> >  Example:
>> >   | /* This is now useless, should be removed? */
>> >   | type InetSocketAddressBase struct {
>> >   | Host string `json:"host"`
>> >   | Port string `json:"port"`
>> >   | }
>>
>> Can we somehow keep track, in the generator, of types that are
>> only used as building blocks for other types, and prevent them
>> from showing up in the generated code?
>
> I'm not 100% sure it is good to remove them from generated code
> because technically it is a valid qapi type. If all @base types
> are embed types and they don't show in other way or form, sure we
> can remove them from generated code... I'm not sure if it is
> possible to guarantee this.
>
> But yes, if possible, I'd like to remove what seems useless type
> definitions.

The existing C generators have to generate all the types, because the
generated code is for QEMU's own use, where we need all the types.

The existing introspection generator generates only the types visible in
QAPI/QMP introspection.

The former generate for internal use (where we want all the types), and
the latter for external use (where only the types visible in the
external interface are actually useful).

>> Finally, looking at the repository containing the generated
>> code I see that the generated type are sorted by kind, e.g. all
>> unions are in a file, all events in another one and so on. I
>> believe the structure should match more closely that of the
>> QAPI schema, so e.g.  block-related types should all go in one
>> file, net-related types in another one and so on.
>
> That's something I don't mind adding but some hardcoded mapping
> is needed. If you look into git history of qapi/ folder, .json
> files can come and go, types be moved around, etc. So, we need to
> proper map types in a way that the generated code would be kept
> stable even if qapi files would have been rearranged. What I
> proposed was only the simplest solution.
>
> Also, the generator takes a qapi-schema.json as input. We are
> more focused in qemu/qapi/qapi-schema.json generated coded but
> would not hurt to think we could even use it for qemu-guest-agent
> from qemu/qga/qapi-schema.json -- this to say that the hardcoded
> mapping needs to take into account non qemu qapi schemas too.

In the beginning, the QAPI schema was monolithic.  qga/qapi-schema.json
still is.

When keeping everything in a single qapi-schema.json became unwieldy, we
split it into "modules" tied together with a simple include directive.
Generated code remained monolithic.

When monolithic generated code became too annoying (touch schema,
recompile everything), we made it match the module structure: code for
FOO.json goes into *-FOO.c and *-FOO.h, where the *-FOO.h #include the
generated headers for the .json modules FOO.json includes.

Schema code motion hasn't been much of a problem.  Moving from FOO.json
to one of the modules it includes is transparent.  Non-transparent moves
are relatively rare as long as the split into modules actually makes
sense.

>> Looking forward to the next iteration :)
>
> Me too, thanks again!
>
> Cheers,
> Victor




Re: [PATCH] MAINTAINERS: Update Akihiko Odaki's email address

2022-08-29 Thread Marc-André Lureau
Hi

On Mon, Aug 29, 2022 at 12:33 PM Akihiko Odaki 
wrote:

> I am now employed by Daynix. Although my role as a reviewer of
> macOS-related change is not very relevant to the employment, I decided
> to use the company email address to avoid confusions from different
> addresses.
>
> Signed-off-by: Akihiko Odaki 
>

Congrats :)
Reviewed-by: Marc-André Lureau 



> ---
>  MAINTAINERS | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 5ce4227ff6..fd9bd1dca5 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2451,7 +2451,7 @@ Core Audio framework backend
>  M: Gerd Hoffmann 
>  M: Philippe Mathieu-Daudé 
>  R: Christian Schoenebeck 
> -R: Akihiko Odaki 
> +R: Akihiko Odaki 
>  S: Odd Fixes
>  F: audio/coreaudio.c
>
> @@ -2730,7 +2730,7 @@ F: util/drm.c
>  Cocoa graphics
>  M: Peter Maydell 
>  M: Philippe Mathieu-Daudé 
> -R: Akihiko Odaki 
> +R: Akihiko Odaki 
>  S: Odd Fixes
>  F: ui/cocoa.m
>
> --
> 2.37.2
>
>
>

-- 
Marc-André Lureau


Re: [PATCH] tests/qtest/ac97-test: Correct reference to driver

2022-08-29 Thread Marc-André Lureau
On Mon, Aug 29, 2022 at 12:38 PM Akihiko Odaki 
wrote:

> Signed-off-by: Akihiko Odaki 


Reviewed-by: Marc-André Lureau 


> ---
>  tests/qtest/ac97-test.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tests/qtest/ac97-test.c b/tests/qtest/ac97-test.c
> index b084e31bff..74103efdfa 100644
> --- a/tests/qtest/ac97-test.c
> +++ b/tests/qtest/ac97-test.c
> @@ -28,7 +28,7 @@ static void *ac97_get_driver(void *obj, const char
> *interface)
>  return >dev;
>  }
>
> -fprintf(stderr, "%s not present in e1000e\n", interface);
> +fprintf(stderr, "%s not present in ac97\n", interface);
>  g_assert_not_reached();
>  }
>
> --
> 2.37.2
>
>
>

-- 
Marc-André Lureau


Re: [PATCH v5 16/18] s390x: Introduce PV query interface

2022-08-29 Thread Thomas Huth

On 11/08/2022 14.11, Janosch Frank wrote:

Introduce an interface over which we can get information about UV data.

Signed-off-by: Janosch Frank 
Reviewed-by: Steffen Eiden 
---
  hw/s390x/pv.c  | 61 ++
  hw/s390x/s390-virtio-ccw.c |  6 
  include/hw/s390x/pv.h  | 10 +++
  3 files changed, 77 insertions(+)


Acked-by: Thomas Huth 





Re: [PATCH v5 15/18] s390x: Add protected dump cap

2022-08-29 Thread Thomas Huth

On 11/08/2022 14.11, Janosch Frank wrote:

Add a protected dump capability for later feature checking.

Signed-off-by: Janosch Frank 
Reviewed-by: Steffen Eiden 
---
  target/s390x/kvm/kvm.c   | 7 +++
  target/s390x/kvm/kvm_s390x.h | 1 +
  2 files changed, 8 insertions(+)


Reviewed-by: Thomas Huth 




Re: [RFC PATCH v2 2/8] qapi: golang: Generate qapi's alternate types in Go

2022-08-29 Thread Markus Armbruster
Victor Toso  writes:

> Hi,
>
> On Fri, Aug 19, 2022 at 11:27:13AM -0500, Andrea Bolognani wrote:
>> On Wed, Aug 17, 2022 at 04:04:19PM +0200, Victor Toso wrote:
>> > On Tue, Jul 05, 2022 at 08:45:06AM -0700, Andrea Bolognani wrote:
>> > > On Fri, Jun 17, 2022 at 02:19:26PM +0200, Victor Toso wrote:
>> > > > func (s *BlockdevRef) UnmarshalJSON(data []byte) error {
>> > > > // Check for json-null first
>> > > > if string(data) == "null" {
>> > > > return errors.New(`null not supported for BlockdevRef`)
>> > > > }
>> > > > // Check for BlockdevOptions
>> > > > {
>> > > > s.Definition = new(BlockdevOptions)
>> > > > if err := StrictDecode(s.Definition, data); err == nil {
>> > > > return nil
>> > > > }
>> > >
>> > > The use of StrictDecode() here means that we won't be able to
>> > > parse an alternate produced by a version of QEMU where
>> > > BlockdevOptions has gained additional fields, doesn't it?
>> >
>> > That's correct. This means that with this RFCv2 proposal, qapi-go
>> > based on qemu version 7.1 might not be able to decode a qmp
>> > message from qemu version 7.2 if it has introduced a new field.
>> >
>> > This needs fixing, not sure yet the way to go.
>> >
>> > > Considering that we will happily parse such a BlockdevOptions
>> > > outside of the context of BlockdevRef, I think we should be
>> > > consistent and allow the same to happen here.
>> >
>> > StrictDecode is only used with alternates because, unlike unions,
>> > Alternate types don't have a 'discriminator' field that would
>> > allow us to know what data type to expect.
>> >
>> > With this in mind, theoretically speaking, we could have very
>> > similar struct types as Alternate fields and we have to find on
>> > runtime which type is that underlying byte stream.
>> >
>> > So, to reply to your suggestion, if we allow BlockdevRef without
>> > StrictDecode we might find ourselves in a situation that it
>> > matched a few fields of BlockdevOptions but it the byte stream
>> > was actually another type.
>>
>> IIUC your concern is that the QAPI schema could gain a new
>> type, TotallyNotBlockdevOptions, which looks exactly like
>> BlockdevOptions except for one or more extra fields.
>>
>> If QEMU then produced a JSON like
>>
>>   { "description": { /* a TotallyNotBlockdevOptions here */ } }
>>
>> and we'd try to deserialize it with Go code like
>>
>>   ref := BlockdevRef{}
>>   json.Unmarsal()
>>
>> we'd end up mistakenly parsing the TotallyNotBlockdevOptions as a
>> valid BlockdevOptions, dropping the extra fields in the process.
>>
>> Does that correctly describe the reason why you feel that the use of
>> StrictDecode is necessary?
>
> Not quite. The problem here is related to the Alternate types of
> the QAPI specification [0], just to name a simple in-use example,
> BlockdevRefOrNul [1].
>
> [0] 
> https://gitlab.com/qemu-project/qemu/-/blob/master/docs/devel/qapi-code-gen.rst?plain=1#L387
> [1] 
> https://gitlab.com/qemu-project/qemu/-/blob/master/qapi/block-core.json#L4349
>
> To exemplify the problem that I try to solve with StrictDecode,
> let's say there is a DeviceRef alternate type that looks like:
>
> { 'alternate': 'DeviceRef',
>   'data': { 'memory': 'BlockdevRefInMemory',
> 'disk': 'BlockdevRefInDisk',
> 'cloud': 'BlockdevRefInCloud' } }
>
> Just a quick recap, at runtime we don't have data's payload name
> (e.g: disk).  We need to check the actual data and try to find
> what is the payload type.
>
> type BlockdevRefInMemory struct {
> Name  *string
> Size  uint64
> Start uint64
> End   uint64
> }
>
> type BlockdevRefInDisk struct {
> Name  *string
> Size  uint64
> Path  *string
> }
>
> type BlockdevRefInCloud struct {
> Name  *string
> Size  uint64
> Uri   *string
> }
>
> All types have unique fields but they all share some fields too.

Quick intercession (I merely skimmed the review thread; forgive me if
it's not useful or not new):

An alternate type is like a union type, except there is no
discriminator on the wire.  Instead, the branch to use is inferred
from the value.  An alternate can only express a choice between types
represented differently on the wire.

This is docs/devel/qapi-code-gen.rst.  Implied there: the inference is
based on the JSON type *only*, i.e. no two branches can have the same
JSON type on the wire.  Since all complex types (struct or union) are
JSON object on the wire, at most one alternate branch can be of complex
type.

More sophisticated inference would be possible if we need it.  So far we
haven't.

> Golang, without StrictDecode would happily decode a "disk"
> payload into either "memory" or "cloud" fields, matching only
> what the json provides and ignoring the rest. StrictDecode would
> error if the payload had fields that don't belong to that Type so
> we could try to find a perfect match.
>
> While this should work reliably with current version of 

[PATCH v7 09/10] parallels: Move statistic collection to a separate function

2022-08-29 Thread Alexander Ivanov
We will add more and more checks, so we need a better code structure
in parallels_co_check. Let each check be performed in a separate loop
in a separate helper.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/parallels.c | 53 +++
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 1874045c51..eacfdea4b6 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -526,47 +526,56 @@ static int parallels_check_leak(BlockDriverState *bs,
 return 0;
 }
 
-static int coroutine_fn parallels_co_check(BlockDriverState *bs,
-   BdrvCheckResult *res,
-   BdrvCheckMode fix)
+static void parallels_collect_statistics(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ BdrvCheckMode fix)
 {
 BDRVParallelsState *s = bs->opaque;
-int64_t prev_off;
-int ret;
+int64_t off, prev_off;
 uint32_t i;
 
-qemu_co_mutex_lock(&s->lock);
-
-parallels_check_unclean(bs, res, fix);
-
-ret = parallels_check_outside_image(bs, res, fix);
-if (ret < 0) {
-goto out;
-}
-
-ret = parallels_check_leak(bs, res, fix);
-if (ret < 0) {
-goto out;
-}
-
 res->bfi.total_clusters = s->bat_size;
 res->bfi.compressed_clusters = 0; /* compression is not supported */
 
 prev_off = 0;
 for (i = 0; i < s->bat_size; i++) {
-int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+off = bat2sect(s, i) << BDRV_SECTOR_BITS;
 if (off == 0) {
 prev_off = 0;
 continue;
 }
 
-res->bfi.allocated_clusters++;
-
 if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
 res->bfi.fragmented_clusters++;
 }
+
 prev_off = off;
+res->bfi.allocated_clusters++;
 }
+}
+
+static int coroutine_fn parallels_co_check(BlockDriverState *bs,
+   BdrvCheckResult *res,
+   BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+int ret;
+
+qemu_co_mutex_lock(&s->lock);
+
+parallels_check_unclean(bs, res, fix);
+
+ret = parallels_check_outside_image(bs, res, fix);
+if (ret < 0) {
+goto out;
+}
+
+ret = parallels_check_leak(bs, res, fix);
+if (ret < 0) {
+goto out;
+}
+
+parallels_collect_statistics(bs, res, fix);
 
 out:
 qemu_co_mutex_unlock(&s->lock);
-- 
2.34.1




[PATCH v7 10/10] parallels: Replace qemu_co_mutex_lock by WITH_QEMU_LOCK_GUARD

2022-08-29 Thread Alexander Ivanov
Replace the way we use the mutex in parallels_co_check() for simpler
and less error-prone code.
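
For readers unfamiliar with scoped lock guards, a minimal standalone
sketch of the underlying idea (pthreads plus the GCC/Clang cleanup
attribute; an illustration of the pattern, not QEMU's actual
WITH_QEMU_LOCK_GUARD implementation) looks roughly like this:

    #include <pthread.h>
    #include <stdio.h>

    static void unlock_cleanup(pthread_mutex_t **m)
    {
        pthread_mutex_unlock(*m);
    }

    /* Run the following block with the mutex held; unlock on every exit
     * path, including early return, because the cleanup handler fires
     * when guard_ goes out of scope. */
    #define WITH_LOCK_GUARD(m) \
        for (pthread_mutex_t *guard_ __attribute__((cleanup(unlock_cleanup))) = \
                 (pthread_mutex_lock(m), (m)), *once_ = (m); \
             once_; once_ = NULL)

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static int checked_work(int fail)
    {
        WITH_LOCK_GUARD(&lock) {
            if (fail) {
                return -1;              /* early return still unlocks */
            }
            puts("work done under lock");
        }
        return 0;
    }

    int main(void)
    {
        checked_work(0);
        checked_work(1);
        return 0;
    }

This is why the goto-out/unlock pattern in the diff below can simply
disappear.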

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 33 ++---
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index eacfdea4b6..8943eccbf5 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -561,30 +561,25 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 BDRVParallelsState *s = bs->opaque;
 int ret;
 
-qemu_co_mutex_lock(&s->lock);
+WITH_QEMU_LOCK_GUARD(&s->lock) {
+parallels_check_unclean(bs, res, fix);
 
-parallels_check_unclean(bs, res, fix);
+ret = parallels_check_outside_image(bs, res, fix);
+if (ret < 0) {
+return ret;
+}
 
-ret = parallels_check_outside_image(bs, res, fix);
-if (ret < 0) {
-goto out;
-}
+ret = parallels_check_leak(bs, res, fix);
+if (ret < 0) {
+return ret;
+}
 
-ret = parallels_check_leak(bs, res, fix);
-if (ret < 0) {
-goto out;
+parallels_collect_statistics(bs, res, fix);
 }
 
-parallels_collect_statistics(bs, res, fix);
-
-out:
-qemu_co_mutex_unlock(&s->lock);
-
-if (ret == 0) {
-ret = bdrv_co_flush(bs);
-if (ret < 0) {
-res->check_errors++;
-}
+ret = bdrv_co_flush(bs);
+if (ret < 0) {
+res->check_errors++;
 }
 
 return ret;
-- 
2.34.1




[PATCH v7 04/10] parallels: create parallels_set_bat_entry_helper() to assign BAT value

2022-08-29 Thread Alexander Ivanov
This helper will be reused in the next patches during the parallels_co_check
rework to simplify its code.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/parallels.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index c1ff8bb5f0..52a5cce46c 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -165,6 +165,13 @@ static int64_t block_status(BDRVParallelsState *s, int64_t 
sector_num,
 return start_off;
 }
 
+static void parallels_set_bat_entry(BDRVParallelsState *s,
+uint32_t index, uint32_t offset)
+{
+s->bat_bitmap[index] = cpu_to_le32(offset);
+bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 
1);
+}
+
 static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
  int nb_sectors, int *pnum)
 {
@@ -250,10 +257,8 @@ static int64_t allocate_clusters(BlockDriverState *bs, 
int64_t sector_num,
 }
 
 for (i = 0; i < to_allocate; i++) {
-s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
+parallels_set_bat_entry(s, idx + i, s->data_end / s->off_multiplier);
 s->data_end += s->tracks;
-bitmap_set(s->bat_dirty_bmap,
-   bat_entry_off(idx + i) / s->bat_dirty_block, 1);
 }
 
 return bat2sect(s, idx) + sector_num % s->tracks;
-- 
2.34.1




[PATCH v7 08/10] parallels: Move check of leaks to a separate function

2022-08-29 Thread Alexander Ivanov
We will add more and more checks, so we need a better code structure
in parallels_co_check. Let each check be performed in a separate loop,
in a separate helper.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 84 +--
 1 file changed, 52 insertions(+), 32 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index f50cd232aa..1874045c51 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -475,14 +475,14 @@ static int parallels_check_outside_image(BlockDriverState 
*bs,
 return 0;
 }
 
-static int coroutine_fn parallels_co_check(BlockDriverState *bs,
-   BdrvCheckResult *res,
-   BdrvCheckMode fix)
+static int parallels_check_leak(BlockDriverState *bs,
+BdrvCheckResult *res,
+BdrvCheckMode fix)
 {
 BDRVParallelsState *s = bs->opaque;
-int64_t size, prev_off, high_off;
-int ret;
+int64_t size, off, high_off, count;
 uint32_t i;
+int ret;
 
 size = bdrv_getlength(bs->file->bs);
 if (size < 0) {
@@ -490,41 +490,16 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 return size;
 }
 
-qemu_co_mutex_lock(&s->lock);
-
-parallels_check_unclean(bs, res, fix);
-
-ret = parallels_check_outside_image(bs, res, fix);
-if (ret < 0) {
-goto out;
-}
-
-res->bfi.total_clusters = s->bat_size;
-res->bfi.compressed_clusters = 0; /* compression is not supported */
-
 high_off = 0;
-prev_off = 0;
 for (i = 0; i < s->bat_size; i++) {
-int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS;
-if (off == 0) {
-prev_off = 0;
-continue;
-}
-
-res->bfi.allocated_clusters++;
+off = bat2sect(s, i) << BDRV_SECTOR_BITS;
 if (off > high_off) {
 high_off = off;
 }
-
-if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
-res->bfi.fragmented_clusters++;
-}
-prev_off = off;
 }
 
 res->image_end_offset = high_off + s->cluster_size;
 if (size > res->image_end_offset) {
-int64_t count;
 count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size);
 fprintf(stderr, "%s space leaked at the end of the image %" PRId64 
"\n",
 fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR",
@@ -542,12 +517,57 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 if (ret < 0) {
 error_report_err(local_err);
 res->check_errors++;
-goto out;
+return ret;
 }
 res->leaks_fixed += count;
 }
 }
 
+return 0;
+}
+
+static int coroutine_fn parallels_co_check(BlockDriverState *bs,
+   BdrvCheckResult *res,
+   BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+int64_t prev_off;
+int ret;
+uint32_t i;
+
+qemu_co_mutex_lock(&s->lock);
+
+parallels_check_unclean(bs, res, fix);
+
+ret = parallels_check_outside_image(bs, res, fix);
+if (ret < 0) {
+goto out;
+}
+
+ret = parallels_check_leak(bs, res, fix);
+if (ret < 0) {
+goto out;
+}
+
+res->bfi.total_clusters = s->bat_size;
+res->bfi.compressed_clusters = 0; /* compression is not supported */
+
+prev_off = 0;
+for (i = 0; i < s->bat_size; i++) {
+int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+if (off == 0) {
+prev_off = 0;
+continue;
+}
+
+res->bfi.allocated_clusters++;
+
+if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
+res->bfi.fragmented_clusters++;
+}
+prev_off = off;
+}
+
 out:
 qemu_co_mutex_unlock(&s->lock);
 
-- 
2.34.1




Re: [RFC PATCH v2] tests/9p: introduce declarative function calls

2022-08-29 Thread Greg Kurz
Hi Christian,

On Thu, 18 Aug 2022 16:13:40 +0200
Christian Schoenebeck  wrote:

> On Montag, 18. Juli 2022 16:02:31 CEST Christian Schoenebeck wrote:
> > On Montag, 18. Juli 2022 15:10:55 CEST Christian Schoenebeck wrote:
> > > There are currently 4 different functions for sending a 9p 'Twalk'
> > > request. They are all doing the same thing, just in a slightly different
> > > way and with slightly different function arguments.
> > > 
> > > Merge those 4 functions into a single function by using a struct for
> > > function call arguments and use designated initializers when calling this
> > > function to turn usage into a declarative approach, which is more
> > > readable and easier to maintain.
> > > 
> > > Signed-off-by: Christian Schoenebeck 
> > > ---
> > > 
> > > v1 -> v2:
> > >   * Also merge low-level function v9fs_twalk().
> > >   
> > >   * Lower case twalk() function name.
> > >   
> > >   * Lower case rwalk struct field.
> > >   
> > >   * Add result struct TWalkRes.
> > >   
> > >   NOTE: I have not separated rwalk struct, because it would have
> > >   simplified code at one place, but complicated it at another one.
> > 
> > BTW, I also toyed around with virtio-9p-test.c -> virtio-9p-test.cpp, due to
> > the advantages described in the v1 discussion; however, there are quite a bunch
> > of C header files included which would need refactoring (C++ keywords used like
> > 'export', 'class', 'private', and missing type casts from void*).
> > 
> > I also saw no easy way to separate those as an alternative (like a C low-level
> > unit with a C++ unit on top), so I decided that it was not worth it.
> 
> Not sure if you are on summer vacation right now, but I guess I'll just go ahead
> and convert the rest of the 9p test code in the same way? At least I haven't
> seen that you were opposed to the suggested idea in general.
> 

Yeah I was on vacation indeed. Please go ahead. I'll do my best to
review.

Cheers,

--
Greg

> Best regards,
> Christian Schoenebeck
> 
> > >  tests/qtest/virtio-9p-test.c | 251 +--
> > >  1 file changed, 154 insertions(+), 97 deletions(-)
> > > 
> > > diff --git a/tests/qtest/virtio-9p-test.c b/tests/qtest/virtio-9p-test.c
> > > index 25305a4cf7..69b1c27268 100644
> > > --- a/tests/qtest/virtio-9p-test.c
> > > +++ b/tests/qtest/virtio-9p-test.c
> > > @@ -72,6 +72,7 @@ static int split(const char *in, const char *delim, char
> > > ***out) static void split_free(char ***out)
> > > 
> > >  {
> > >  
> > >  int i;
> > > 
> > > +if (!*out) return;
> > > 
> > >  for (i = 0; (*out)[i]; ++i) {
> > >  
> > >  g_free((*out)[i]);
> > >  
> > >  }
> > > 
> > > @@ -390,31 +391,6 @@ static void v9fs_rattach(P9Req *req, v9fs_qid *qid)
> > > 
> > >  v9fs_req_free(req);
> > >  
> > >  }
> > > 
> > > -/* size[4] Twalk tag[2] fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
> > > -static P9Req *v9fs_twalk(QVirtio9P *v9p, uint32_t fid, uint32_t newfid,
> > > - uint16_t nwname, char *const wnames[], uint16_t
> > > tag) -{
> > > -P9Req *req;
> > > -int i;
> > > -uint32_t body_size = 4 + 4 + 2;
> > > -
> > > -for (i = 0; i < nwname; i++) {
> > > -uint16_t wname_size = v9fs_string_size(wnames[i]);
> > > -
> > > -g_assert_cmpint(body_size, <=, UINT32_MAX - wname_size);
> > > -body_size += wname_size;
> > > -}
> > > -req = v9fs_req_init(v9p,  body_size, P9_TWALK, tag);
> > > -v9fs_uint32_write(req, fid);
> > > -v9fs_uint32_write(req, newfid);
> > > -v9fs_uint16_write(req, nwname);
> > > -for (i = 0; i < nwname; i++) {
> > > -v9fs_string_write(req, wnames[i]);
> > > -}
> > > -v9fs_req_send(req);
> > > -return req;
> > > -}
> > > -
> > > 
> > >  /* size[4] Rwalk tag[2] nwqid[2] nwqid*(wqid[13]) */
> > >  static void v9fs_rwalk(P9Req *req, uint16_t *nwqid, v9fs_qid **wqid)
> > >  {
> > > 
> > > @@ -432,6 +408,98 @@ static void v9fs_rwalk(P9Req *req, uint16_t *nwqid,
> > > v9fs_qid **wqid) v9fs_req_free(req);
> > > 
> > >  }
> > > 
> > > +/* options for 'Twalk' 9p request */
> > > +typedef struct TWalkOpt {
> > > +/* 9P client being used (mandatory) */
> > > +QVirtio9P *client;
> > > +/* user supplied tag number being returned with response (optional)
> > > */
> > > +uint16_t tag;
> > > +/* file ID of directory from where walk should start (optional) */
> > > +uint32_t fid;
> > > +/* file ID for target directory being walked to (optional) */
> > > +uint32_t newfid;
> > > +/* low level variant of path to walk to (optional) */
> > > +uint16_t nwname;
> > > +char **wnames;
> > > +/* high level variant of path to walk to (optional) */
> > > +const char *path;
> > > +/* data being received from 9p server as 'Rwalk' response (optional)
> > > */ +struct {
> > > +uint16_t *nwqid;
> > > +v9fs_qid **wqid;
> > > +} rwalk;
> > > +/* only send Twalk request but not wait for a 
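
The options-struct-with-designated-initializers pattern discussed in the
9p thread above can be sketched standalone (hypothetical names, not the
actual qtest code) as:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct WalkOpt {
        uint32_t fid;        /* directory to start the walk from, 0 if unused */
        uint32_t newfid;     /* fid assigned to the walk target */
        const char *path;    /* high-level path to walk to, may be NULL */
        uint16_t tag;        /* request tag, 0 lets the callee pick one */
    } WalkOpt;

    static void do_walk(WalkOpt opt)
    {
        printf("walk fid=%u newfid=%u path=%s tag=%u\n",
               (unsigned)opt.fid, (unsigned)opt.newfid,
               opt.path ? opt.path : "(none)", (unsigned)opt.tag);
    }

    int main(void)
    {
        /* Unset fields default to zero, so call sites read declaratively. */
        do_walk((WalkOpt) { .newfid = 1, .path = "subdir/file" });
        do_walk((WalkOpt) { .fid = 7, .newfid = 8, .tag = 42 });
        return 0;
    }

Each call site names only the arguments it cares about, which is what the
merged twalk() aims for instead of four ad-hoc wrappers.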

[PATCH v7 07/10] parallels: Move check of cluster outside image to a separate function

2022-08-29 Thread Alexander Ivanov
We will add more and more checks, so we need a better code structure
in parallels_co_check. Let each check be performed in a separate loop,
in a separate helper.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 59 ++-
 1 file changed, 43 insertions(+), 16 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index eea318f809..f50cd232aa 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -438,13 +438,50 @@ static void parallels_check_unclean(BlockDriverState *bs,
 }
 }
 
+static int parallels_check_outside_image(BlockDriverState *bs,
+ BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+uint32_t i;
+int64_t off, high_off, size;
+
+size = bdrv_getlength(bs->file->bs);
+if (size < 0) {
+res->check_errors++;
+return size;
+}
+
+high_off = 0;
+for (i = 0; i < s->bat_size; i++) {
+off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+if (off > size) {
+fprintf(stderr, "%s cluster %u is outside image\n",
+fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
+res->corruptions++;
+if (fix & BDRV_FIX_ERRORS) {
+parallels_set_bat_entry(s, i, 0);
+res->corruptions_fixed++;
+}
+continue;
+}
+if (high_off < off) {
+high_off = off;
+}
+}
+
+s->data_end = (high_off + s->cluster_size) >> BDRV_SECTOR_BITS;
+
+return 0;
+}
+
 static int coroutine_fn parallels_co_check(BlockDriverState *bs,
BdrvCheckResult *res,
BdrvCheckMode fix)
 {
 BDRVParallelsState *s = bs->opaque;
 int64_t size, prev_off, high_off;
-int ret = 0;
+int ret;
 uint32_t i;
 
 size = bdrv_getlength(bs->file->bs);
@@ -457,6 +494,11 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 
 parallels_check_unclean(bs, res, fix);
 
+ret = parallels_check_outside_image(bs, res, fix);
+if (ret < 0) {
+goto out;
+}
+
 res->bfi.total_clusters = s->bat_size;
 res->bfi.compressed_clusters = 0; /* compression is not supported */
 
@@ -469,19 +511,6 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 continue;
 }
 
-/* cluster outside the image */
-if (off > size) {
-fprintf(stderr, "%s cluster %u is outside image\n",
-fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
-res->corruptions++;
-if (fix & BDRV_FIX_ERRORS) {
-parallels_set_bat_entry(s, i, 0);
-res->corruptions_fixed++;
-}
-prev_off = 0;
-continue;
-}
-
 res->bfi.allocated_clusters++;
 if (off > high_off) {
 high_off = off;
@@ -519,8 +548,6 @@ static int coroutine_fn parallels_co_check(BlockDriverState 
*bs,
 }
 }
 
-s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS;
-
 out:
 qemu_co_mutex_unlock(&s->lock);
 
-- 
2.34.1




[PATCH v7 06/10] parallels: Move check of unclean image to a separate function

2022-08-29 Thread Alexander Ivanov
We will add more and more checks, so we need a better code structure
in parallels_co_check. Let each check be performed in a separate loop,
in a separate helper.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/parallels.c | 31 +--
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index b4a85b8aa7..eea318f809 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -418,6 +418,25 @@ static coroutine_fn int 
parallels_co_readv(BlockDriverState *bs,
 return ret;
 }
 
+static void parallels_check_unclean(BlockDriverState *bs,
+BdrvCheckResult *res,
+BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+
+if (!s->header_unclean) {
+return;
+}
+
+fprintf(stderr, "%s image was not closed correctly\n",
+fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
+res->corruptions++;
+if (fix & BDRV_FIX_ERRORS) {
+/* parallels_close will do the job right */
+res->corruptions_fixed++;
+s->header_unclean = false;
+}
+}
 
 static int coroutine_fn parallels_co_check(BlockDriverState *bs,
BdrvCheckResult *res,
@@ -435,16 +454,8 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 }
 
 qemu_co_mutex_lock(&s->lock);
-if (s->header_unclean) {
-fprintf(stderr, "%s image was not closed correctly\n",
-fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
-res->corruptions++;
-if (fix & BDRV_FIX_ERRORS) {
-/* parallels_close will do the job right */
-res->corruptions_fixed++;
-s->header_unclean = false;
-}
-}
+
+parallels_check_unclean(bs, res, fix);
 
 res->bfi.total_clusters = s->bat_size;
 res->bfi.compressed_clusters = 0; /* compression is not supported */
-- 
2.34.1




[PATCH v7 11/10] parallels: Incorrect condition in out-of-image check

2022-08-29 Thread Alexander Ivanov
All the offsets in the BAT must be lower than the file size;
an offset equal to the file size already points past the end of the file.
Fix the check condition accordingly.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/parallels.c b/block/parallels.c
index 8943eccbf5..e6e8b9e369 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -455,7 +455,7 @@ static int parallels_check_outside_image(BlockDriverState 
*bs,
 high_off = 0;
 for (i = 0; i < s->bat_size; i++) {
 off = bat2sect(s, i) << BDRV_SECTOR_BITS;
-if (off > size) {
+if (off >= size) {
 fprintf(stderr, "%s cluster %u is outside image\n",
 fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
 res->corruptions++;
-- 
2.34.1




Re: [PATCH v7 02/10] parallels: Fix high_off calculation in parallels_co_check()

2022-08-29 Thread Alexander Ivanov

Please ignore the patches in this branch.



[PATCH v7 01/10] parallels: Out of image offset in BAT leads to image inflation

2022-08-29 Thread Alexander Ivanov
The data_end field in BDRVParallelsState is set to the biggest offset present
in the BAT. If this offset is outside of the image, any further write
will create a cluster at this offset and/or the image will be truncated
to this offset on close. This is definitely not correct.
Raise an error in parallels_open() if data_end points outside the image
and we are not running a check (let the check repair the image).
Set data_end to the end of the cluster with the last correct offset.
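
As a rough illustration of the inflation problem (hypothetical numbers,
units simplified to bytes; not the actual driver code), deriving data_end
from the largest BAT offset lets a single corrupted entry drag it far
past the real end of the file:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t file_size = 64ULL * 1024 * 1024;            /* 64 MiB image file */
        uint64_t bat[] = { 0x100000, 0x200000, 1ULL << 40 }; /* last entry corrupted */
        uint64_t data_end = 0;

        for (unsigned i = 0; i < sizeof(bat) / sizeof(bat[0]); i++) {
            if (bat[i] > data_end) {
                data_end = bat[i];
            }
        }
        /* A later write or close that trusts data_end would grow the file
         * from 64 MiB to roughly 1 TiB. */
        printf("file size: %llu, data_end: %llu\n",
               (unsigned long long)file_size, (unsigned long long)data_end);
        return 0;
    }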

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/block/parallels.c b/block/parallels.c
index a229c06f25..93bc2750ef 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -732,6 +732,7 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 BDRVParallelsState *s = bs->opaque;
 ParallelsHeader ph;
 int ret, size, i;
+int64_t file_size;
 QemuOpts *opts = NULL;
 Error *local_err = NULL;
 char *buf;
@@ -742,6 +743,12 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 return -EINVAL;
 }
 
+file_size = bdrv_getlength(bs->file->bs);
+if (file_size < 0) {
+return -EINVAL;
+}
+file_size >>= BDRV_SECTOR_BITS;
+
 ret = bdrv_pread(bs->file, 0, sizeof(ph), &ph, 0);
 if (ret < 0) {
 goto fail;
@@ -806,6 +813,16 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 
 for (i = 0; i < s->bat_size; i++) {
 int64_t off = bat2sect(s, i);
+if (off >= file_size) {
+if (flags & BDRV_O_CHECK) {
+continue;
+}
+error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry "
+   "is larger than file size (%" PRIi64 ")",
+   off, i, file_size);
+ret = -EINVAL;
+goto fail;
+}
 if (off >= s->data_end) {
 s->data_end = off + s->tracks;
 }
-- 
2.34.1




[PATCH v7 05/10] parallels: Use generic infrastructure for BAT writing in parallels_co_check()

2022-08-29 Thread Alexander Ivanov
The BAT is written in the context of conventional operations over
the image inside bdrv_co_flush(), when it calls the
parallels_co_flush_to_os() callback. Thus we should not
modify the BAT array directly, but call the parallels_set_bat_entry()
helper and bdrv_co_flush() further on. After that there is no
need to write the BAT manually and track its modification.

This makes the code more generic and allows splitting
parallels_set_bat_entry() into independent pieces.
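
The underlying pattern (updates go through a helper that marks the
containing block of the table dirty, and a later flush writes back only
the dirty blocks) can be sketched standalone roughly like this
(hypothetical names and sizes, not the parallels driver itself):

    #include <stdint.h>
    #include <stdio.h>

    #define BAT_ENTRIES     64
    #define ENTRIES_PER_BLK 16   /* table entries covered by one dirty flag */

    static uint32_t bat[BAT_ENTRIES];
    static uint8_t  dirty[BAT_ENTRIES / ENTRIES_PER_BLK];

    static void set_bat_entry(uint32_t index, uint32_t offset)
    {
        bat[index] = offset;
        dirty[index / ENTRIES_PER_BLK] = 1;   /* remember which block to flush */
    }

    static void flush_bat(void)
    {
        for (unsigned blk = 0; blk < sizeof(dirty); blk++) {
            if (dirty[blk]) {
                /* a real driver would pwrite() this block of the table here */
                printf("writing BAT block %u\n", blk);
                dirty[blk] = 0;
            }
        }
    }

    int main(void)
    {
        set_bat_entry(3, 0x1000);
        set_bat_entry(40, 0x9000);
        flush_bat();                 /* writes only blocks 0 and 2 */
        return 0;
    }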

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 23 ++-
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 52a5cce46c..b4a85b8aa7 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -425,9 +425,8 @@ static int coroutine_fn parallels_co_check(BlockDriverState 
*bs,
 {
 BDRVParallelsState *s = bs->opaque;
 int64_t size, prev_off, high_off;
-int ret;
+int ret = 0;
 uint32_t i;
-bool flush_bat = false;
 
 size = bdrv_getlength(bs->file->bs);
 if (size < 0) {
@@ -465,9 +464,8 @@ static int coroutine_fn parallels_co_check(BlockDriverState 
*bs,
 fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
 res->corruptions++;
 if (fix & BDRV_FIX_ERRORS) {
-s->bat_bitmap[i] = 0;
+parallels_set_bat_entry(s, i, 0);
 res->corruptions_fixed++;
-flush_bat = true;
 }
 prev_off = 0;
 continue;
@@ -484,15 +482,6 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 prev_off = off;
 }
 
-ret = 0;
-if (flush_bat) {
-ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0);
-if (ret < 0) {
-res->check_errors++;
-goto out;
-}
-}
-
 res->image_end_offset = high_off + s->cluster_size;
 if (size > res->image_end_offset) {
 int64_t count;
@@ -523,6 +512,14 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 
 out:
 qemu_co_mutex_unlock(&s->lock);
+
+if (ret == 0) {
+ret = bdrv_co_flush(bs);
+if (ret < 0) {
+res->check_errors++;
+}
+}
+
 return ret;
 }
 
-- 
2.34.1




[PATCH v7 02/10] parallels: Fix high_off calculation in parallels_co_check()

2022-08-29 Thread Alexander Ivanov
Don't let high_off be more than the file size
even if we don't fix the image.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 93bc2750ef..7e8cdbbc3a 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -460,12 +460,12 @@ static int coroutine_fn 
parallels_co_check(BlockDriverState *bs,
 fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
 res->corruptions++;
 if (fix & BDRV_FIX_ERRORS) {
-prev_off = 0;
 s->bat_bitmap[i] = 0;
 res->corruptions_fixed++;
 flush_bat = true;
-continue;
 }
+prev_off = 0;
+continue;
 }
 
 res->bfi.allocated_clusters++;
-- 
2.34.1




[PATCH v7 03/10] parallels: Fix data_end after out-of-image check

2022-08-29 Thread Alexander Ivanov
Set data_end to the end of the last cluster inside the image.
This way we can be sure that corrupted offsets in the BAT
can't affect the image size.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/parallels.c b/block/parallels.c
index 7e8cdbbc3a..c1ff8bb5f0 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -514,6 +514,8 @@ static int coroutine_fn parallels_co_check(BlockDriverState 
*bs,
 }
 }
 
+s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS;
+
 out:
 qemu_co_mutex_unlock(&s->lock);
 return ret;
-- 
2.34.1



