[PATCH v3 2/2] target/i386: Raise #GP on unaligned m128 accesses when required.
Many instructions which load/store 128-bit values are supposed to raise #GP when the memory operand isn't 16-byte aligned. This includes: - Instructions explicitly requiring memory alignment (Exceptions Type 1 in the "AVX and SSE Instruction Exception Specification" section of the SDM) - Legacy SSE instructions that load/store 128-bit values (Exceptions Types 2 and 4). This change sets MO_ALIGN_16 on 128-bit memory accesses that require 16-byte alignment. It adds cpu_record_sigbus and cpu_do_unaligned_access hooks that simulate a #GP exception in qemu-user and qemu-system, respectively. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/217 Reviewed-by: Richard Henderson Signed-off-by: Ricky Zhou --- target/i386/tcg/excp_helper.c| 13 target/i386/tcg/helper-tcg.h | 28 ++--- target/i386/tcg/sysemu/excp_helper.c | 8 + target/i386/tcg/tcg-cpu.c| 2 ++ target/i386/tcg/translate.c | 45 +--- target/i386/tcg/user/excp_helper.c | 7 + 6 files changed, 74 insertions(+), 29 deletions(-) diff --git a/target/i386/tcg/excp_helper.c b/target/i386/tcg/excp_helper.c index c1ffa1c0ef..7c3c8dc7fe 100644 --- a/target/i386/tcg/excp_helper.c +++ b/target/i386/tcg/excp_helper.c @@ -140,3 +140,16 @@ G_NORETURN void raise_exception_ra(CPUX86State *env, int exception_index, { raise_interrupt2(env, exception_index, 0, 0, 0, retaddr); } + +G_NORETURN void handle_unaligned_access(CPUX86State *env, vaddr vaddr, +MMUAccessType access_type, +uintptr_t retaddr) +{ +/* + * Unaligned accesses are currently only triggered by SSE/AVX + * instructions that impose alignment requirements on memory + * operands. These instructions raise #GP(0) upon accessing an + * unaligned address. 
+ */ +raise_exception_ra(env, EXCP0D_GPF, retaddr); +} diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h index 34167e2e29..cd1723389a 100644 --- a/target/i386/tcg/helper-tcg.h +++ b/target/i386/tcg/helper-tcg.h @@ -42,17 +42,6 @@ void x86_cpu_do_interrupt(CPUState *cpu); bool x86_cpu_exec_interrupt(CPUState *cpu, int int_req); #endif -/* helper.c */ -#ifdef CONFIG_USER_ONLY -void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr, -MMUAccessType access_type, -bool maperr, uintptr_t ra); -#else -bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size, - MMUAccessType access_type, int mmu_idx, - bool probe, uintptr_t retaddr); -#endif - void breakpoint_handler(CPUState *cs); /* n must be a constant to be efficient */ @@ -78,6 +67,23 @@ G_NORETURN void raise_exception_err_ra(CPUX86State *env, int exception_index, int error_code, uintptr_t retaddr); G_NORETURN void raise_interrupt(CPUX86State *nenv, int intno, int is_int, int error_code, int next_eip_addend); +G_NORETURN void handle_unaligned_access(CPUX86State *env, vaddr vaddr, +MMUAccessType access_type, +uintptr_t retaddr); +#ifdef CONFIG_USER_ONLY +void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr, +MMUAccessType access_type, +bool maperr, uintptr_t ra); +void x86_cpu_record_sigbus(CPUState *cs, vaddr addr, + MMUAccessType access_type, uintptr_t ra); +#else +bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size, + MMUAccessType access_type, int mmu_idx, + bool probe, uintptr_t retaddr); +G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, +MMUAccessType access_type, +int mmu_idx, uintptr_t retaddr); +#endif /* cc_helper.c */ extern const uint8_t parity_table[256]; diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c index 48feba7e75..796dc2a1f3 100644 --- a/target/i386/tcg/sysemu/excp_helper.c +++ b/target/i386/tcg/sysemu/excp_helper.c @@ -439,3 +439,11 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size, } return true; 
} + +G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, +MMUAccessType access_type, +int mmu_idx, uintptr_t retaddr) +{ +X86CPU *cpu = X86_CPU(cs); +handle_unaligned_access(&cpu->env, vaddr, access_type, retaddr); +} diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c index 6fdfdf9598..d3c2b8fb49 100644 --- a/target/i386/tcg/tcg-cpu.c +++ b/target/i386/tcg/tcg-cpu.c @@ -75,10 +75,12 @@ static const struct TCGCPUOps x86_tcg_ops = { #ifdef CONFIG_USER_ONLY
[PATCH v3 1/2] target/i386: Read 8 bytes from cvttps2pi/cvtps2pi memory operands
Before this change, emulation of cvttps2pi and cvtps2pi instructions would read 16 bytes of memory instead of 8. The SDM states that cvttps2pi takes a 64-bit memory location. The documentation for cvtps2pi claims that it takes a 128-bit memory location, but as with cvttps2pi, the operand is written as xmm/m64. I double-checked on real hardware that both of these instructions only read 8 bytes. Reviewed-by: Richard Henderson Signed-off-by: Ricky Zhou --- target/i386/tcg/translate.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index b7972f0ff5..3ba5f76156 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3621,7 +3621,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if (mod != 3) { gen_lea_modrm(env, s, modrm); op2_offset = offsetof(CPUX86State,xmm_t0); -gen_ldo_env_A0(s, op2_offset); +if (b1) { +gen_ldo_env_A0(s, op2_offset); +} else { +gen_ldq_env_A0(s, op2_offset); +} } else { rm = (modrm & 7) | REX_B(s); op2_offset = offsetof(CPUX86State,xmm_regs[rm]); -- 2.37.2
[PATCH] hcd-ohci: Fix inconsistency when resetting ohci root hubs
I found an assertion failure in usb_cancel_packet() and posted my analysis in https://gitlab.com/qemu-project/qemu/-/issues/1180. I think this issue is because of the inconsistency when resetting ohci root hubs. There are two ways to reset ohci root hubs: 1) through HcRhPortStatus, 2) through HcControl. However, when the packet's status is USB_PACKET_ASYNC, resetting through HcRhPortStatus will complete the packet and thus resetting through HcControl will fail. That is because IMO resetting through HcRhPortStatus should first detach the port and then invoke usb_device_reset() just like through HcControl. Therefore, I change usb_device_reset() to usb_port_reset() where usb_detach() and usb_device_reset() are invoked consequently. Fixes: d28f4e2d8631 ("usb: kill USB_MSG_RESET") Reported-by: Qiang Liu Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1180 Signed-off-by: Qiang Liu --- hw/usb/hcd-ohci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c index 895b29fb86..72df917834 100644 --- a/hw/usb/hcd-ohci.c +++ b/hw/usb/hcd-ohci.c @@ -1426,7 +1426,7 @@ static void ohci_port_set_status(OHCIState *ohci, int portnum, uint32_t val) if (ohci_port_set_if_connected(ohci, portnum, val & OHCI_PORT_PRS)) { trace_usb_ohci_port_reset(portnum); -usb_device_reset(port->port.dev); +usb_port_reset(&port->port); port->ctrl &= ~OHCI_PORT_PRS; /* ??? Should this also set OHCI_PORT_PESC. */ port->ctrl |= OHCI_PORT_PES | OHCI_PORT_PRSC; -- 2.25.1
Re: [PATCH v2 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.
On 8/29/22 19:11, Ricky Zhou wrote: Many instructions which load/store 128-bit values are supposed to raise #GP when the memory operand isn't 16-byte aligned. This includes: - Instructions explicitly requiring memory alignment (Exceptions Type 1 in the "AVX and SSE Instruction Exception Specification" section of the SDM) - Legacy SSE instructions that load/store 128-bit values (Exceptions Types 2 and 4). This change sets MO_ALIGN_16 on 128-bit memory accesses that require 16-byte alignment. It adds cpu_record_sigbus and cpu_do_unaligned_access handlers that simulate a #GP exception in qemu-user and qemu-system, respectively. One minor behavior change apart from what is described above: Prior to this change, emulation of cvttps2pi and cvtps2pi instructions would incorrectly read 16 bytes of memory instead of 8. I double-checked on real hardware that these instructions only read 8 bytes (and do not have any address alignment requirements). This should really be split out to a separate patch. @@ -3621,7 +3629,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if (mod != 3) { gen_lea_modrm(env, s, modrm); op2_offset = offsetof(CPUX86State,xmm_t0); -gen_ldo_env_A0(s, op2_offset); +if ((b >> 8) & 1) { Aka b1. Otherwise, Reviewed-by: Richard Henderson r~
[PATCH v2 0/1] target/i386: Raise #GP on unaligned m128 accesses when required.
Thanks Richard for the detailed comments/code pointers! I've switched to using MO_ALIGN_16 and implemented record_sigbus and do_unaligned_access hooks to simulate #GP(0) as suggested. Given what was said about the low likelihood of implementing #AC anytime soon, I have hardcoded #GP(0) in these hooks for now rather than plumbing through an extra bit in MemOp. Let me know if that seems reasonable, thanks! Ricky Zhou (1): target/i386: Raise #GP on unaligned m128 accesses when required. target/i386/tcg/excp_helper.c| 13 target/i386/tcg/helper-tcg.h | 28 +--- target/i386/tcg/sysemu/excp_helper.c | 8 + target/i386/tcg/tcg-cpu.c| 1 + target/i386/tcg/translate.c | 49 ++-- target/i386/tcg/user/excp_helper.c | 7 6 files changed, 77 insertions(+), 29 deletions(-) -- 2.37.2
Re: [PATCH for-7.2 v4 15/21] qmp/hmp, device_tree.c: introduce 'info fdt' command
On Mon, Aug 29, 2022 at 07:00:55PM -0300, Daniel Henrique Barboza wrote: > > > On 8/29/22 00:34, David Gibson wrote: > > On Fri, Aug 26, 2022 at 11:11:44AM -0300, Daniel Henrique Barboza wrote: > > > Reading the FDT requires that the user saves the fdt_blob and then use > > > 'dtc' to read the contents. Saving the file and using 'dtc' is a strong > > > use case when we need to compare two FDTs, but it's a lot of steps if > > > you want to do quick check on a certain node or property. > > > > > > 'info fdt' retrieves FDT nodes (and properties, later on) and print it > > > to the user. This can be used to check the FDT on running machines > > > without having to save the blob and use 'dtc'. > > > > > > The implementation is based on the premise that the machine thas a FDT > > > created using libfdt and pointed by 'machine->fdt'. As long as this > > > pre-requisite is met the machine should be able to support it. > > > > > > For now we're going to add the required QMP/HMP boilerplate and the > > > capability of printing the name of the properties of a given node. Next > > > patches will extend 'info fdt' to be able to print nodes recursively, > > > and then individual properties. > > > > > > This command will always be executed in-band (i.e. holding BQL), > > > avoiding potential race conditions with machines that might change the > > > FDT during runtime (e.g. PowerPC 'pseries' machine). > > > > > > 'info fdt' is not something that we expect to be used aside from > > > debugging, > > > so we're implementing it in QMP as 'x-query-fdt'. 
> > > > > > This is an example of 'info fdt' fetching the '/chosen' node of the > > > pSeries machine: > > > > > > (qemu) info fdt /chosen > > > chosen { > > > ibm,architecture-vec-5; > > > rng-seed; > > > ibm,arch-vec-5-platform-support; > > > linux,pci-probe-only; > > > stdout-path; > > > linux,stdout-path; > > > qemu,graphic-depth; > > > qemu,graphic-height; > > > qemu,graphic-width; > > > }; > > > > > > And the same node for the aarch64 'virt' machine: > > > > > > (qemu) info fdt /chosen > > > chosen { > > > stdout-path; > > > rng-seed; > > > kaslr-seed; > > > }; > > > > So, I'm reasonably convinced allowing dumping the whole dtb from > > qmp/hmp is useful. I'm less convined that info fdt is worth the > > additional complexity it incurs. Note that as well as being able to > > decompile a whole dtb using dtc, you can also extract and list > > specific properties from a dtb blob using the 'fdtget' tool which is > > part of the dtc tree. > > What's your opinion on patch 21/21, where 'dumpdtb' can write a formatted > FDT in a file with an extra option? That was possible because of the > format helpers introduced for 'info fdt'. The idea is that since we're > able to format a FDT in DTS format, we can also write the FDT in text > format without relying on DTC to decode it. Since it's mostly the same code, I think it's reasonable to throw in if the info fdt stuff is there, but I don't think it's worth including without that. As a whole, I remain dubious that (info fdt + dumpdts) is worth the complexity cost. People with more practical experience debugging the embedded ARM platforms might have a different opinion if they thing info fdt would be really useful though. > If we think that this 'dumpdtb' capability is worth having, I can respin > the patches without 'info fdt' but adding these helpers to enable this > 'dumpdtb' support. If not, then we can just remove patches 15-21 and > be done with it. > > > Thanks, > > > Daniel > > > > > > > > > Cc: Dr. 
David Alan Gilbert > > > Acked-by: Dr. David Alan Gilbert > > > Signed-off-by: Daniel Henrique Barboza > > > --- > > > hmp-commands-info.hx | 13 ++ > > > include/monitor/hmp.h| 1 + > > > include/sysemu/device_tree.h | 4 +++ > > > monitor/hmp-cmds.c | 13 ++ > > > monitor/qmp-cmds.c | 12 + > > > qapi/machine.json| 19 +++ > > > softmmu/device_tree.c| 47 > > > 7 files changed, 109 insertions(+) > > > > > > diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx > > > index 188d9ece3b..743b48865d 100644 > > > --- a/hmp-commands-info.hx > > > +++ b/hmp-commands-info.hx > > > @@ -921,3 +921,16 @@ SRST > > > ``stats`` > > > Show runtime-collected statistics > > > ERST > > > + > > > +{ > > > +.name = "fdt", > > > +.args_type = "nodepath:s", > > > +.params = "nodepath", > > > +.help = "show firmware device tree node given its path", > > > +.cmd= hmp_info_fdt, > > > +}, > > > + > > > +SRST > > > + ``info fdt`` > > > +Show a firmware device tree node given its path. Requires libfdt. > > > +ERST > > > diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h > > > index d7f324da59..c0883dd1e3 100644 > > > ---
[PATCH v2 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.
Many instructions which load/store 128-bit values are supposed to raise #GP when the memory operand isn't 16-byte aligned. This includes: - Instructions explicitly requiring memory alignment (Exceptions Type 1 in the "AVX and SSE Instruction Exception Specification" section of the SDM) - Legacy SSE instructions that load/store 128-bit values (Exceptions Types 2 and 4). This change sets MO_ALIGN_16 on 128-bit memory accesses that require 16-byte alignment. It adds cpu_record_sigbus and cpu_do_unaligned_access handlers that simulate a #GP exception in qemu-user and qemu-system, respectively. One minor behavior change apart from what is described above: Prior to this change, emulation of cvttps2pi and cvtps2pi instructions would incorrectly read 16 bytes of memory instead of 8. I double-checked on real hardware that these instructions only read 8 bytes (and do not have any address alignment requirements). Resolves: https://gitlab.com/qemu-project/qemu/-/issues/217 Signed-off-by: Ricky Zhou --- target/i386/tcg/excp_helper.c| 13 target/i386/tcg/helper-tcg.h | 28 +--- target/i386/tcg/sysemu/excp_helper.c | 8 + target/i386/tcg/tcg-cpu.c| 2 ++ target/i386/tcg/translate.c | 49 ++-- target/i386/tcg/user/excp_helper.c | 7 6 files changed, 78 insertions(+), 29 deletions(-) diff --git a/target/i386/tcg/excp_helper.c b/target/i386/tcg/excp_helper.c index c1ffa1c0ef..7c3c8dc7fe 100644 --- a/target/i386/tcg/excp_helper.c +++ b/target/i386/tcg/excp_helper.c @@ -140,3 +140,16 @@ G_NORETURN void raise_exception_ra(CPUX86State *env, int exception_index, { raise_interrupt2(env, exception_index, 0, 0, 0, retaddr); } + +G_NORETURN void handle_unaligned_access(CPUX86State *env, vaddr vaddr, +MMUAccessType access_type, +uintptr_t retaddr) +{ +/* + * Unaligned accesses are currently only triggered by SSE/AVX + * instructions that impose alignment requirements on memory + * operands. These instructions raise #GP(0) upon accessing an + * unaligned address. 
+ */ +raise_exception_ra(env, EXCP0D_GPF, retaddr); +} diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h index 34167e2e29..cd1723389a 100644 --- a/target/i386/tcg/helper-tcg.h +++ b/target/i386/tcg/helper-tcg.h @@ -42,17 +42,6 @@ void x86_cpu_do_interrupt(CPUState *cpu); bool x86_cpu_exec_interrupt(CPUState *cpu, int int_req); #endif -/* helper.c */ -#ifdef CONFIG_USER_ONLY -void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr, -MMUAccessType access_type, -bool maperr, uintptr_t ra); -#else -bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size, - MMUAccessType access_type, int mmu_idx, - bool probe, uintptr_t retaddr); -#endif - void breakpoint_handler(CPUState *cs); /* n must be a constant to be efficient */ @@ -78,6 +67,23 @@ G_NORETURN void raise_exception_err_ra(CPUX86State *env, int exception_index, int error_code, uintptr_t retaddr); G_NORETURN void raise_interrupt(CPUX86State *nenv, int intno, int is_int, int error_code, int next_eip_addend); +G_NORETURN void handle_unaligned_access(CPUX86State *env, vaddr vaddr, +MMUAccessType access_type, +uintptr_t retaddr); +#ifdef CONFIG_USER_ONLY +void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr, +MMUAccessType access_type, +bool maperr, uintptr_t ra); +void x86_cpu_record_sigbus(CPUState *cs, vaddr addr, + MMUAccessType access_type, uintptr_t ra); +#else +bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size, + MMUAccessType access_type, int mmu_idx, + bool probe, uintptr_t retaddr); +G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, +MMUAccessType access_type, +int mmu_idx, uintptr_t retaddr); +#endif /* cc_helper.c */ extern const uint8_t parity_table[256]; diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c index 48feba7e75..796dc2a1f3 100644 --- a/target/i386/tcg/sysemu/excp_helper.c +++ b/target/i386/tcg/sysemu/excp_helper.c @@ -439,3 +439,11 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size, } return true; 
} + +G_NORETURN void x86_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, +MMUAccessType access_type, +int mmu_idx, uintptr_t retaddr) +{ +X86CPU *cpu = X86_CPU(cs); +handle_unaligned_access(&cpu->env, vaddr,
Re: [PATCH v1 15/25] Deprecate 32 bit big-endian MIPS
Reviewed-by: Huacai Chen On Tue, Aug 30, 2022 at 7:39 AM Philippe Mathieu-Daudé wrote: > > Hi Alex, > > (+Aleksandar/Huacai) > > On 26/8/22 19:21, Alex Bennée wrote: > > It's becoming harder to maintain a cross-compiler to test this host > > architecture as the old stable Debian 10 ("Buster") moved into LTS > > which supports fewer architectures. For now: > > > >- mark it's deprecation in the docs > >- downgrade the containers to build TCG tests only > >- drop the cross builds from our CI > > > > Users with an appropriate toolchain and user-space can still take > > their chances building it. > > > > Signed-off-by: Alex Bennée > > --- > > docs/about/build-platforms.rst| 2 +- > > docs/about/deprecated.rst | 13 ++ > > .gitlab-ci.d/container-cross.yml | 1 - > > .gitlab-ci.d/crossbuilds.yml | 14 --- > > tests/docker/Makefile.include | 5 +-- > > .../dockerfiles/debian-mips-cross.docker | 40 +-- > > 6 files changed, 27 insertions(+), 48 deletions(-) > > > > diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst > > index 26028756d0..1ca9144a7d 100644 > > --- a/docs/about/build-platforms.rst > > +++ b/docs/about/build-platforms.rst > > @@ -41,7 +41,7 @@ Those hosts are officially supported, with various > > accelerators: > >- Accelerators > > * - Arm > >- kvm (64 bit only), tcg, xen > > - * - MIPS > > + * - MIPS (LE only) > >- kvm, tcg > > * - PPC > >- kvm, tcg > > diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst > > index 91b03115ee..22c2f4f4de 100644 > > --- a/docs/about/deprecated.rst > > +++ b/docs/about/deprecated.rst > > @@ -213,6 +213,19 @@ MIPS ``Trap-and-Emul`` KVM support (since 6.0) > > The MIPS ``Trap-and-Emul`` KVM host and guest support has been removed > > from Linux upstream kernel, declare it deprecated. 
> > > > +Host Architectures > > +-- > > + > > +BE MIPS (since 7.2) > > +''' > > + > > +A Debian 10 ("Buster") moved into LTS the big endian 32 bit version of > > +MIPS moved out of support making it hard to maintain our > > +cross-compilation CI tests of the architecture. As we no longer have > > +CI coverage support may bitrot away before the deprecation process > > +completes. The little endian variants of MIPS (both 32 and 64 bit) are > > +still a supported host architecture. > > For completeness we should update meson.build to consider > host_machine.endian() and adapt this section: > > >if not supported_cpus.contains(cpu) > message() > warning('SUPPORT FOR THIS HOST CPU WILL GO AWAY IN FUTURE RELEASES!') > message() > message('CPU host architecture ' + cpu + ' support is not currently > maintained.') >... > > This can be done later, and I might be able to do so in few weeks, > so meanwhile (with Thomas comment addressed): > Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH v8 0/7] Add support for zoned device
Stefan Hajnoczi 于2022年8月30日周二 03:44写道: > > On Fri, Aug 26, 2022 at 11:15:29PM +0800, Sam Li wrote: > > Zoned Block Devices (ZBDs) divide the LBA space to block regions called > > zones > > that are larger than the LBA size. It can only allow sequential writes, > > which > > reduces write amplification in SSD, leading to higher throughput and > > increased > > capacity. More details about ZBDs can be found at: > > > > https://zonedstorage.io/docs/introduction/zoned-storage > > > > The zoned device support aims to let guests (virtual machines) access zoned > > storage devices on the host (hypervisor) through a virtio-blk device. This > > involves extending QEMU's block layer and virtio-blk emulation code. In its > > current status, the virtio-blk device is not aware of ZBDs but the guest > > sees > > host-managed drives as regular drive that will run correctly under the most > > common write workloads. > > > > This patch series extend the block layer APIs with the minimum set of zoned > > commands that are necessary to support zoned devices. The commands are - > > Report > > Zones, four zone operations and Zone Append (developing). > > > > It can be tested on a null_blk device using qemu-io or qemu-iotests. For > > example, the command line for zone report using qemu-io is: > > $ path/to/qemu-io --image-opts -n > > driver=zoned_host_device,filename=/dev/nullb0 > > -c "zrp offset nr_zones" > > > > v8: > > - address review comments > > * solve patch conflicts and merge sysfs helper funcations into one patch > > * add cache.direct=on check in config > > Hi Sam, > I have left a few comments. That's great! Thanks for reviewing. I'll send a revision soon. Sam
Re: [PATCH 1/5] Update version for v7.1.0-rc4 release
On 8/29/22 18:40, Antonio Caggiano wrote: > From: Richard Henderson > > Signed-off-by: Richard Henderson > --- > VERSION | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/VERSION b/VERSION > index 1c944b9863..b8d5f3ebb6 100644 > --- a/VERSION > +++ b/VERSION > @@ -1 +1 @@ > -7.0.93 > +7.0.94 This patch shouldn't be here. -- Best regards, Dmitry
Re: [PATCH v1 13/25] gitlab-ci/custom-runners: Disable -static-pie for ubuntu-20.04-aarch64
On 8/29/22 16:16, Philippe Mathieu-Daudé wrote: Shouldn't "--extra-cflags='-fno-pie -no-pie'" be handled by the configure script while processing the --disable-pie option? I think configure just passes b_pie=off to meson, but yes, this could be improved -- there's definitely a disconnect somewhere. r~
Re: [PATCH v1 15/25] Deprecate 32 bit big-endian MIPS
Hi Alex, (+Aleksandar/Huacai) On 26/8/22 19:21, Alex Bennée wrote: It's becoming harder to maintain a cross-compiler to test this host architecture as the old stable Debian 10 ("Buster") moved into LTS which supports fewer architectures. For now: - mark it's deprecation in the docs - downgrade the containers to build TCG tests only - drop the cross builds from our CI Users with an appropriate toolchain and user-space can still take their chances building it. Signed-off-by: Alex Bennée --- docs/about/build-platforms.rst| 2 +- docs/about/deprecated.rst | 13 ++ .gitlab-ci.d/container-cross.yml | 1 - .gitlab-ci.d/crossbuilds.yml | 14 --- tests/docker/Makefile.include | 5 +-- .../dockerfiles/debian-mips-cross.docker | 40 +-- 6 files changed, 27 insertions(+), 48 deletions(-) diff --git a/docs/about/build-platforms.rst b/docs/about/build-platforms.rst index 26028756d0..1ca9144a7d 100644 --- a/docs/about/build-platforms.rst +++ b/docs/about/build-platforms.rst @@ -41,7 +41,7 @@ Those hosts are officially supported, with various accelerators: - Accelerators * - Arm - kvm (64 bit only), tcg, xen - * - MIPS + * - MIPS (LE only) - kvm, tcg * - PPC - kvm, tcg diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst index 91b03115ee..22c2f4f4de 100644 --- a/docs/about/deprecated.rst +++ b/docs/about/deprecated.rst @@ -213,6 +213,19 @@ MIPS ``Trap-and-Emul`` KVM support (since 6.0) The MIPS ``Trap-and-Emul`` KVM host and guest support has been removed from Linux upstream kernel, declare it deprecated. +Host Architectures +-- + +BE MIPS (since 7.2) +''' + +A Debian 10 ("Buster") moved into LTS the big endian 32 bit version of +MIPS moved out of support making it hard to maintain our +cross-compilation CI tests of the architecture. As we no longer have +CI coverage support may bitrot away before the deprecation process +completes. The little endian variants of MIPS (both 32 and 64 bit) are +still a supported host architecture. 
For completeness we should update meson.build to consider host_machine.endian() and adapt this section: if not supported_cpus.contains(cpu) message() warning('SUPPORT FOR THIS HOST CPU WILL GO AWAY IN FUTURE RELEASES!') message() message('CPU host architecture ' + cpu + ' support is not currently maintained.') ... This can be done later, and I might be able to do so in few weeks, so meanwhile (with Thomas comment addressed): Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH v1 13/25] gitlab-ci/custom-runners: Disable -static-pie for ubuntu-20.04-aarch64
On 26/8/22 19:21, Alex Bennée wrote: From: Richard Henderson The project has reached the magic size at which we see /usr/aarch64-linux-gnu/lib/libc.a(init-first.o): in function `__libc_init_first': (.text+0x10): relocation truncated to fit: R_AARCH64_LD64_GOTPAGE_LO15 against \ symbol `__environ' defined in .bss section in /usr/aarch64-linux-gnu/lib/libc.a(environ.o) /usr/bin/ld: (.text+0x10): warning: too many GOT entries for -fpic, please recompile with -fPIC The bug has been reported upstream, but in the meantime there is nothing we can do except build a non-pie executable. Signed-off-by: Richard Henderson Signed-off-by: Alex Bennée Message-Id: <20220823210329.1969895-1-richard.hender...@linaro.org> --- .gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml b/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml index 3d878914e7..85a234801a 100644 --- a/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml +++ b/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml @@ -16,7 +16,9 @@ ubuntu-20.04-aarch64-all-linux-static: # --disable-glusterfs is needed because there's no static version of those libs in distro supplied packages - mkdir build - cd build - - ../configure --enable-debug --static --disable-system --disable-glusterfs --disable-libssh + # Disable -static-pie due to build error with system libc: + # https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1987438 + - ../configure --enable-debug --static --disable-system --disable-glusterfs --disable-libssh --disable-pie --extra-cflags='-fno-pie -no-pie' Shouldn't "--extra-cflags='-fno-pie -no-pie'" be handled by the configure script while processing the --disable-pie option?
Re: [PATCH v1 12/25] tests/vm: Remove obsolete Fedora VM test
On 26/8/22 19:21, Alex Bennée wrote: From: Thomas Huth It's still based on Fedora 30 - which is not supported anymore by QEMU since years. Seems like nobody is using (and refreshing) this, and it's easier to test this via a container anyway, so let's remove this now. Signed-off-by: Thomas Huth Message-Id: <20220822175317.190551-1-th...@redhat.com> Signed-off-by: Alex Bennée --- tests/vm/Makefile.include | 3 +- tests/vm/fedora | 190 -- 2 files changed, 1 insertion(+), 192 deletions(-) delete mode 100755 tests/vm/fedora Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH] MAINTAINERS: Update Akihiko Odaki's email address
On 29/8/22 10:31, Akihiko Odaki wrote: I am now employed by Daynix. Although my role as a reviewer of macOS-related change is not very relevant to the employment, I decided to use the company email address to avoid confusions from different addresses. Signed-off-by: Akihiko Odaki --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH] tests/avocado/migration: Get find_free_port() from the ports
On 29/8/22 14:19, Thomas Huth wrote: In upstream Avocado, the find_free_port() function is not available from "network" anymore, but must be used via "ports", see: https://github.com/avocado-framework/avocado/commit/22fc98c6ff76cc55c48 To be able to update to a newer Avocado version later, let's use the new way for accessing the find_free_port() function here. Signed-off-by: Thomas Huth --- tests/avocado/migration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.
On 8/29/22 13:46, Ricky Zhou wrote: Thanks for taking a look at this - did you see the bit in the cover letter where I discuss doing this via alignment requirements on the memory operation? My logic was that the memop alignment checks seem to be more oriented towards triggering #AC exceptions (even though this is not currently implemented), I missed that in the cover. However... implementing #AC is pretty hypothetical. It's not something that I've ever seen used, and not something that anyone has asked for. One slightly more involved way to use alignment on the MemOp could be to arrange to pass the problematic MemOp to do_unaligned_access and helper_unaligned_{ld,st}. Then we could allow CPUs to handle misalignment of different MemOps differently (e.g. raise #GP/SIGSEGV for certain ops and #AC/SIGBUS for others). For this change to x86, we could maybe get away with making MO_ALIGN_16 and above trigger #GP/SIGSEGV and everything else trigger #AC/SIGBUS. If that's a little hacky, we could instead add some dedicated bits to MemOp that distinguish different types of unaligned accesses. There's another related problem that actually has gotten a bug report in the past: when the form of the address should raise #SS instead of #GP in system mode. My initial thought was to record information about "the" memory access in the per-insn unwind info, until I realized that there are insns with multiple memory operations requiring different treatment. E.g. "push (%rax)", where the read might raise #GP and the write might raise #SS. So I think we'd need to encode #GP vs #SS into the mmu_idx used (e.g. in the lsb). However, I don't think there are any similar situations of multiple memory types affecting SSE, so #AC vs #GP could in fact be encoded into the per-insn unwind info. As for SIGBUS vs SIGSEGV for SSE and user-only, you only need implement the x86_cpu_ops.record_sigbus hook. C.f. the s390x version which raises PGM_SPECIFICATION -> SIGILL for unaligned atomic operations. r~
Re: [PATCH for-7.2 v4 15/21] qmp/hmp, device_tree.c: introduce 'info fdt' command
On 8/29/22 00:34, David Gibson wrote: On Fri, Aug 26, 2022 at 11:11:44AM -0300, Daniel Henrique Barboza wrote: Reading the FDT requires that the user saves the fdt_blob and then use 'dtc' to read the contents. Saving the file and using 'dtc' is a strong use case when we need to compare two FDTs, but it's a lot of steps if you want to do quick check on a certain node or property. 'info fdt' retrieves FDT nodes (and properties, later on) and print it to the user. This can be used to check the FDT on running machines without having to save the blob and use 'dtc'. The implementation is based on the premise that the machine thas a FDT created using libfdt and pointed by 'machine->fdt'. As long as this pre-requisite is met the machine should be able to support it. For now we're going to add the required QMP/HMP boilerplate and the capability of printing the name of the properties of a given node. Next patches will extend 'info fdt' to be able to print nodes recursively, and then individual properties. This command will always be executed in-band (i.e. holding BQL), avoiding potential race conditions with machines that might change the FDT during runtime (e.g. PowerPC 'pseries' machine). 'info fdt' is not something that we expect to be used aside from debugging, so we're implementing it in QMP as 'x-query-fdt'. This is an example of 'info fdt' fetching the '/chosen' node of the pSeries machine: (qemu) info fdt /chosen chosen { ibm,architecture-vec-5; rng-seed; ibm,arch-vec-5-platform-support; linux,pci-probe-only; stdout-path; linux,stdout-path; qemu,graphic-depth; qemu,graphic-height; qemu,graphic-width; }; And the same node for the aarch64 'virt' machine: (qemu) info fdt /chosen chosen { stdout-path; rng-seed; kaslr-seed; }; So, I'm reasonably convinced allowing dumping the whole dtb from qmp/hmp is useful. I'm less convined that info fdt is worth the additional complexity it incurs. 
Note that as well as being able to decompile a whole dtb using dtc, you can also extract and list specific properties from a dtb blob using the 'fdtget' tool which is part of the dtc tree. What's your opinion on patch 21/21, where 'dumpdtb' can write a formatted FDT in a file with an extra option? That was possible because of the format helpers introduced for 'info fdt'. The idea is that since we're able to format a FDT in DTS format, we can also write the FDT in text format without relying on DTC to decode it. If we think that this 'dumpdtb' capability is worth having, I can respin the patches without 'info fdt' but adding these helpers to enable this 'dumpdtb' support. If not, then we can just remove patches 15-21 and be done with it. Thanks, Daniel Cc: Dr. David Alan Gilbert Acked-by: Dr. David Alan Gilbert Signed-off-by: Daniel Henrique Barboza --- hmp-commands-info.hx | 13 ++ include/monitor/hmp.h| 1 + include/sysemu/device_tree.h | 4 +++ monitor/hmp-cmds.c | 13 ++ monitor/qmp-cmds.c | 12 + qapi/machine.json| 19 +++ softmmu/device_tree.c| 47 7 files changed, 109 insertions(+) diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx index 188d9ece3b..743b48865d 100644 --- a/hmp-commands-info.hx +++ b/hmp-commands-info.hx @@ -921,3 +921,16 @@ SRST ``stats`` Show runtime-collected statistics ERST + +{ +.name = "fdt", +.args_type = "nodepath:s", +.params = "nodepath", +.help = "show firmware device tree node given its path", +.cmd= hmp_info_fdt, +}, + +SRST + ``info fdt`` +Show a firmware device tree node given its path. Requires libfdt. 
+ERST diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h index d7f324da59..c0883dd1e3 100644 --- a/include/monitor/hmp.h +++ b/include/monitor/hmp.h @@ -135,6 +135,7 @@ void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict); void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict); void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict); void hmp_dumpdtb(Monitor *mon, const QDict *qdict); +void hmp_info_fdt(Monitor *mon, const QDict *qdict); void hmp_human_readable_text_helper(Monitor *mon, HumanReadableText *(*qmp_handler)(Error **)); void hmp_info_stats(Monitor *mon, const QDict *qdict); diff --git a/include/sysemu/device_tree.h b/include/sysemu/device_tree.h index bf7684e4ed..057d13e397 100644 --- a/include/sysemu/device_tree.h +++ b/include/sysemu/device_tree.h @@ -14,6 +14,8 @@ #ifndef DEVICE_TREE_H #define DEVICE_TREE_H +#include "qapi/qapi-types-common.h" + void *create_device_tree(int *sizep); void *load_device_tree(const char *filename_path, int *sizep); #ifdef CONFIG_LINUX @@ -137,6 +139,8 @@ int qemu_fdt_add_path(void *fdt, const char *path); void
Re: [PATCH] ui/console: Get tab completion working again in the SDL monitor vc
Hi Gerd, Can you take a look at this and let me know what you think? Thanks, -Cal On Thu, 11 Aug 2022, Cal Peake wrote: > Define a QEMU special key constant for the tab key and add an entry for > it in the qcode_to_keysym table. This allows tab completion to work again > in the SDL monitor virtual console, which has been broken ever since the > migration from SDL1 to SDL2. > > Signed-off-by: Cal Peake > --- > include/ui/console.h | 1 + > ui/console.c | 1 + > 2 files changed, 2 insertions(+) > > diff --git a/include/ui/console.h b/include/ui/console.h > index c0520c694c..e400ee9fa7 100644 > --- a/include/ui/console.h > +++ b/include/ui/console.h > @@ -70,6 +70,7 @@ void hmp_mouse_set(Monitor *mon, const QDict *qdict); > /* keysym is a unicode code except for special keys (see QEMU_KEY_xxx > constants) */ > #define QEMU_KEY_ESC1(c) ((c) | 0xe100) > +#define QEMU_KEY_TAB0x0009 > #define QEMU_KEY_BACKSPACE 0x007f > #define QEMU_KEY_UP QEMU_KEY_ESC1('A') > #define QEMU_KEY_DOWN QEMU_KEY_ESC1('B') > diff --git a/ui/console.c b/ui/console.c > index e139f7115e..addaafba28 100644 > --- a/ui/console.c > +++ b/ui/console.c > @@ -1368,6 +1368,7 @@ static const int qcode_to_keysym[Q_KEY_CODE__MAX] = { > [Q_KEY_CODE_PGUP] = QEMU_KEY_PAGEUP, > [Q_KEY_CODE_PGDN] = QEMU_KEY_PAGEDOWN, > [Q_KEY_CODE_DELETE] = QEMU_KEY_DELETE, > +[Q_KEY_CODE_TAB]= QEMU_KEY_TAB, > [Q_KEY_CODE_BACKSPACE] = QEMU_KEY_BACKSPACE, > }; > > -- > 2.35.3 > >
Re: [PATCH 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.
On Mon, Aug 29, 2022 at 9:45 AM Richard Henderson wrote: > > On 8/29/22 07:23, Ricky Zhou wrote: > This trap should be raised via the memory operation: > ... > Only the first of the two loads/stores must be aligned, as the other is known > to be +8. > You then must fill in the x86_tcg_ops.do_unaligned_access hook to raise #GP. Thanks for taking a look at this - did you see the bit in the cover letter where I discuss doing this via alignment requirements on the memory operation? My logic was that the memop alignment checks seem to be more oriented towards triggering #AC exceptions (even though this is not currently implemented), since qemu-user's unaligned access handlers (helper_unaligned_{ld,st}) already trigger SIGBUS as opposed to SIGSEGV. I was concerned that implementing this via MO_ALIGN_16 would get in the way of a hypothetical future implementation of the AC flag, since do_unaligned_access would need to raise #AC instead of #GP for that. One slightly more involved way to use alignment on the MemOp could be to arrange to pass the problematic MemOp to do_unaligned_access and helper_unaligned_{ld,st}. Then we could allow CPUs to handle misalignment of different MemOps differently (e.g. raise #GP/SIGSEGV for certain ops and #AC/SIGBUS for others). For this change to x86, we could maybe get away with making MO_ALIGN_16 and above trigger #GP/SIGSEGV and everything else trigger #AC/SIGBUS. If that's a little hacky, we could instead add some dedicated bits to MemOp that distinguish different types of unaligned accesses. What do you think? Happy to implement whichever approach is preferred! Thanks, Ricky
Re: [PATCH v5 09/18] dump: Use a buffer for ELF section data and headers
On Thu, 2022-08-11 at 12:11 +, Janosch Frank wrote: > Currently we're writing the NULL section header if we overflow the > physical header number in the ELF header. But in the future we'll add > custom section headers AND section data. > > To facilitate this we need to rearange section handling a bit. As with > the other ELF headers we split the code into a prepare and a write > step. > > Signed-off-by: Janosch Frank > --- > dump/dump.c | 83 +-- > include/sysemu/dump.h | 2 ++ > 2 files changed, 58 insertions(+), 27 deletions(-) > > diff --git a/dump/dump.c b/dump/dump.c > index a905316fe5..0051c71d08 100644 > --- a/dump/dump.c > +++ b/dump/dump.c > @@ -380,30 +380,57 @@ static void write_elf_phdr_note(DumpState *s, Error > **errp) > } > } > > -static void write_elf_section(DumpState *s, int type, Error **errp) > +static void prepare_elf_section_hdr_zero(DumpState *s) > { > -Elf32_Shdr shdr32; > -Elf64_Shdr shdr64; > -int shdr_size; > -void *shdr; > +if (dump_is_64bit(s)) { > +Elf64_Shdr *shdr64 = s->elf_section_hdrs; > + > +shdr64->sh_info = cpu_to_dump32(s, s->phdr_num); > +} else { > +Elf32_Shdr *shdr32 = s->elf_section_hdrs; > + > +shdr32->sh_info = cpu_to_dump32(s, s->phdr_num); > +} > +} > + > +static void prepare_elf_section_hdrs(DumpState *s) > +{ > +size_t len, sizeof_shdr; > + > +/* > + * Section ordering: > + * - HDR zero > + */ > +sizeof_shdr = dump_is_64bit(s) ? sizeof(Elf64_Shdr) : sizeof(Elf32_Shdr); > +len = sizeof_shdr * s->shdr_num; > +s->elf_section_hdrs = g_malloc0(len); I'm not seeing this being freed. > + > +/* > + * The first section header is ALWAYS a special initial section > + * header. > + * > + * The header should be 0 with one exception being that if > + * phdr_num is PN_XNUM then the sh_info field contains the real > + * number of segment entries. > + * > + * As we zero allocate the buffer we will only need to modify > + * sh_info for the PN_XNUM case. 
> + */ > +if (s->phdr_num >= PN_XNUM) { > +prepare_elf_section_hdr_zero(s); > +} > +} > + > +static void write_elf_section_headers(DumpState *s, Error **errp) [...] > @@ -579,6 +606,12 @@ static void dump_begin(DumpState *s, Error **errp) > return; > } > > +/* write section headers to vmcore */ > +write_elf_section_headers(s, errp); > +if (*errp) { > +return; > +} > + > /* write PT_NOTE to vmcore */ > write_elf_phdr_note(s, errp); > if (*errp) { > @@ -591,14 +624,6 @@ static void dump_begin(DumpState *s, Error **errp) > return; > } > > -/* write section to vmcore */ > -if (s->shdr_num) { > -write_elf_section(s, 1, errp); > -if (*errp) { > -return; > -} > -} > - Here you change the order of the headers, but the elf header is only fixed in patch 11. I agree that this should be a separate patch, with an explanation on why it's necessary. So basically squashed into patch 11, except I think the comment change in that one should go into another patch. > /* write notes to vmcore */ > write_elf_notes(s, errp); > } > @@ -674,7 +699,11 @@ static void create_vmcore(DumpState *s, Error **errp) > return; > } > > +/* Iterate over memory and dump it to file */ > dump_iterate(s, errp); > +if (*errp) { > +return; > +} > } > > static int write_start_flat_header(int fd) > diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h > index b62513d87d..9995f65dc8 100644 > --- a/include/sysemu/dump.h > +++ b/include/sysemu/dump.h > @@ -177,6 +177,8 @@ typedef struct DumpState { > int64_t filter_area_begin; /* Start address of partial guest memory > area */ > int64_t filter_area_length; /* Length of partial guest memory area */ > > +void *elf_section_hdrs; /* Pointer to section header buffer */ > + > uint8_t *note_buf; /* buffer for notes */ > size_t note_buf_offset; /* the writing place in note_buf */ > uint32_t nr_cpus; /* number of guest's cpu */
Re: [PATCH v5 04/18] dump: Rework get_start_block
On Thu, 2022-08-11 at 12:10 +, Janosch Frank wrote: > get_start_block() returns the start address of the first memory block > or -1. > > With the GuestPhysBlock iterator conversion we don't need to set the > start address and can therefore remove that code and the "start" > DumpState struct member. The only functionality left is the validation > of the start block so it only makes sense to re-name the function to > validate_start_block() Nit, since you don't return an address anymore, I find retaining the - 1/0 return value instead of true/false weird. > > Signed-off-by: Janosch Frank > Reviewed-by: Marc-André Lureau > Reviewed-by: Janis Schoetterl-Glausch > --- > dump/dump.c | 20 ++-- > include/sysemu/dump.h | 2 -- > 2 files changed, 6 insertions(+), 16 deletions(-) > > diff --git a/dump/dump.c b/dump/dump.c > index 340de5a1e7..e204912a89 100644 > --- a/dump/dump.c > +++ b/dump/dump.c > @@ -1500,30 +1500,22 @@ static void create_kdump_vmcore(DumpState *s, Error > **errp) > } > } > > -static ram_addr_t get_start_block(DumpState *s) > +static int validate_start_block(DumpState *s) > { > GuestPhysBlock *block; > > if (!s->has_filter) { > -s->next_block = QTAILQ_FIRST(>guest_phys_blocks.head); > return 0; > } > > QTAILQ_FOREACH(block, >guest_phys_blocks.head, next) { > +/* This block is out of the range */ > if (block->target_start >= s->begin + s->length || > block->target_end <= s->begin) { > -/* This block is out of the range */ > continue; > } > - > -s->next_block = block; > -if (s->begin > block->target_start) { > -s->start = s->begin - block->target_start; > -} else { > -s->start = 0; > -} > -return s->start; > -} > +return 0; > + } > > return -1; > } > @@ -1670,8 +1662,8 @@ static void dump_init(DumpState *s, int fd, bool > has_format, > goto cleanup; > } > > -s->start = get_start_block(s); > -if (s->start == -1) { > +/* Is the filter filtering everything? 
*/ > +if (validate_start_block(s) == -1) { > error_setg(errp, QERR_INVALID_PARAMETER, "begin"); > goto cleanup; > } > diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h > index ffc2ea1072..7fce1d4af6 100644 > --- a/include/sysemu/dump.h > +++ b/include/sysemu/dump.h > @@ -166,8 +166,6 @@ typedef struct DumpState { > hwaddr memory_offset; > int fd; > > -GuestPhysBlock *next_block; > -ram_addr_t start; > bool has_filter; > int64_t begin; > int64_t length;
Re: [PATCH v8 0/7] Add support for zoned device
On Fri, Aug 26, 2022 at 11:15:29PM +0800, Sam Li wrote: > Zoned Block Devices (ZBDs) divide the LBA space to block regions called zones > that are larger than the LBA size. It can only allow sequential writes, which > reduces write amplification in SSD, leading to higher throughput and increased > capacity. More details about ZBDs can be found at: > > https://zonedstorage.io/docs/introduction/zoned-storage > > The zoned device support aims to let guests (virtual machines) access zoned > storage devices on the host (hypervisor) through a virtio-blk device. This > involves extending QEMU's block layer and virtio-blk emulation code. In its > current status, the virtio-blk device is not aware of ZBDs but the guest sees > host-managed drives as regular drives that will run correctly under the most > common write workloads. > > This patch series extends the block layer APIs with the minimum set of zoned > commands that are necessary to support zoned devices. The commands are - > Report > Zones, four zone operations and Zone Append (developing). > > It can be tested on a null_blk device using qemu-io or qemu-iotests. For > example, the command line for zone report using qemu-io is: > $ path/to/qemu-io --image-opts -n > driver=zoned_host_device,filename=/dev/nullb0 > -c "zrp offset nr_zones" > > v8: > - address review comments > * solve patch conflicts and merge sysfs helper functions into one patch > * add cache.direct=on check in config Hi Sam, I have left a few comments. Stefan signature.asc Description: PGP signature
Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
On Sat, Aug 27, 2022 at 12:17:04AM +0800, Sam Li wrote: > +/* > + * Send a zone_management command. > + * op is the zone operation. > + * offset is the starting zone specified as a sector offset. Does "sector offset" mean "byte offset from the start of the device" or does it mean in 512B sector units? For consistency this should be in bytes. > + * len is the maximum number of sectors the command should operate on. It > + * should be aligned with the zone sector size. Please use bytes for consistency with QEMU's block layer APIs. > @@ -3022,6 +3183,118 @@ static void raw_account_discard(BDRVRawState *s, > uint64_t nbytes, int ret) > } > } > > +/* > + * zone report - Get a zone block device's information in the form > + * of an array of zone descriptors. > + * > + * @param bs: passing zone block device file descriptor > + * @param zones: an array of zone descriptors to hold zone > + * information on reply > + * @param offset: offset can be any byte within the zone size. This isn't an offset within a zone, it's an offset within the entire device, so I think "zone size" is confusing here. > + * @param len: (not sure yet. Please remove this and document nr_zones instead. 
> + * @return 0 on success, -1 on failure > + */ > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t > offset, > + unsigned int *nr_zones, > + BlockZoneDescriptor *zones) { > +#if defined(CONFIG_BLKZONED) > +BDRVRawState *s = bs->opaque; > +RawPosixAIOData acb; > + > +acb = (RawPosixAIOData) { > +.bs = bs, > +.aio_fildes = s->fd, > +.aio_type = QEMU_AIO_ZONE_REPORT, > +.aio_offset = offset, > +.zone_report= { > +.nr_zones = nr_zones, > +.zones = zones, > +}, > +}; > + > +return raw_thread_pool_submit(bs, handle_aiocb_zone_report, ); > +#else > +return -ENOTSUP; > +#endif > +} > + > +/* > + * zone management operations - Execute an operation on a zone > + */ > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp > op, > +int64_t offset, int64_t len) { > +#if defined(CONFIG_BLKZONED) > +BDRVRawState *s = bs->opaque; > +RawPosixAIOData acb; > +int64_t zone_sector, zone_sector_mask; > +const char *ioctl_name; > +unsigned long zone_op; > +int ret; > + > +struct stat st; > +if (fstat(s->fd, ) < 0) { > +ret = -errno; > +return ret; > +} st is not used and can be removed. 
> +zone_sector = bs->bl.zone_sectors; > +zone_sector_mask = zone_sector - 1; > +if (offset & zone_sector_mask) { > +error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId64 "", offset, zone_sector); > +return -EINVAL; > +} > + > +if (len & zone_sector_mask) { > +error_report("number of sectors %" PRId64 " is not aligned to zone > size" > + " %" PRId64 "", len, zone_sector); > +return -EINVAL; > +} > + > +switch (op) { > +case BLK_ZO_OPEN: > +ioctl_name = "BLKOPENZONE"; > +zone_op = BLKOPENZONE; > +break; > +case BLK_ZO_CLOSE: > +ioctl_name = "BLKCLOSEZONE"; > +zone_op = BLKCLOSEZONE; > +break; > +case BLK_ZO_FINISH: > +ioctl_name = "BLKFINISHZONE"; > +zone_op = BLKFINISHZONE; > +break; > +case BLK_ZO_RESET: > +ioctl_name = "BLKRESETZONE"; > +zone_op = BLKRESETZONE; > +break; > +default: > +error_report("Invalid zone operation 0x%x", op); > +return -EINVAL; > +} > + > +acb = (RawPosixAIOData) { > +.bs = bs, > +.aio_fildes = s->fd, > +.aio_type = QEMU_AIO_ZONE_MGMT, > +.aio_offset = offset, > +.aio_nbytes = len, > +.zone_mgmt = { > +.zone_op = zone_op, > +}, > +}; > + > +ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, ); > +if (ret != 0) { > +error_report("ioctl %s failed %d", ioctl_name, errno); > +return -errno; ret contains a negative errno value. The errno variable is not used by raw_thread_pool_submit(). I suggest simplifying it to: return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, ); That's what most of the other raw_thread_pool_submit() callers. signature.asc Description: PGP signature
Re: [PATCH v8 2/7] file-posix: introduce helper funcations for sysfs attributes
On Sat, Aug 27, 2022 at 12:11:21AM +0800, Sam Li wrote: If you send another revision please fix the "funcations" typo in the commit message. signature.asc Description: PGP signature
Re: [PATCH 4/9] hw/isa/vt82c686: QOM'ify via-ide creation
Am 29. August 2022 19:04:06 MESZ schrieb BALATON Zoltan : >On Mon, 29 Aug 2022, BB wrote: >> Am 25. August 2022 01:18:56 MESZ schrieb BALATON Zoltan : >>> On Thu, 25 Aug 2022, Bernhard Beschow wrote: On Wed, Aug 24, 2022 at 3:54 PM BALATON Zoltan wrote: > On Tue, 23 Aug 2022, Bernhard Beschow wrote: >> The IDE function is closely tied to the ISA function (e.g. the IDE >> interrupt routing happens there), so it makes sense that the IDE >> function is instantiated within the southbridge itself. As a side effect, >> duplicated code in the boards is resolved. >> >> Signed-off-by: Bernhard Beschow >> --- >> configs/devices/mips64el-softmmu/default.mak | 1 - >> hw/isa/Kconfig | 1 + >> hw/isa/vt82c686.c| 18 ++ >> hw/mips/fuloong2e.c | 3 --- >> hw/ppc/Kconfig | 1 - >> hw/ppc/pegasos2.c| 4 >> 6 files changed, 19 insertions(+), 9 deletions(-) >> >> diff --git a/configs/devices/mips64el-softmmu/default.mak > b/configs/devices/mips64el-softmmu/default.mak >> index c610749ac1..d5188f7ea5 100644 >> --- a/configs/devices/mips64el-softmmu/default.mak >> +++ b/configs/devices/mips64el-softmmu/default.mak >> @@ -1,7 +1,6 @@ >> # Default configuration for mips64el-softmmu >> >> include ../mips-softmmu/common.mak >> -CONFIG_IDE_VIA=y >> CONFIG_FULOONG=y >> CONFIG_LOONGSON3V=y >> CONFIG_ATI_VGA=y >> diff --git a/hw/isa/Kconfig b/hw/isa/Kconfig >> index d42143a991..20de7e9294 100644 >> --- a/hw/isa/Kconfig >> +++ b/hw/isa/Kconfig >> @@ -53,6 +53,7 @@ config VT82C686 >> select I8254 >> select I8257 >> select I8259 >> +select IDE_VIA >> select MC146818RTC >> select PARALLEL >> >> diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c >> index 5582c0b179..37d9ed635d 100644 >> --- a/hw/isa/vt82c686.c >> +++ b/hw/isa/vt82c686.c >> @@ -17,6 +17,7 @@ >> #include "hw/isa/vt82c686.h" >> #include "hw/pci/pci.h" >> #include "hw/qdev-properties.h" >> +#include "hw/ide/pci.h" >> #include "hw/isa/isa.h" >> #include "hw/isa/superio.h" >> #include "hw/intc/i8259.h" >> @@ -544,6 +545,7 @@ struct ViaISAState 
{ >> qemu_irq cpu_intr; >> qemu_irq *isa_irqs; >> ViaSuperIOState via_sio; >> +PCIIDEState ide; >> }; >> >> static const VMStateDescription vmstate_via = { >> @@ -556,10 +558,18 @@ static const VMStateDescription vmstate_via = { >> } >> }; >> >> +static void via_isa_init(Object *obj) >> +{ >> +ViaISAState *s = VIA_ISA(obj); >> + >> +object_initialize_child(obj, "ide", >ide, "via-ide"); >> +} >> + >> static const TypeInfo via_isa_info = { >> .name = TYPE_VIA_ISA, >> .parent= TYPE_PCI_DEVICE, >> .instance_size = sizeof(ViaISAState), >> +.instance_init = via_isa_init, >> .abstract = true, >> .interfaces= (InterfaceInfo[]) { >> { INTERFACE_CONVENTIONAL_PCI_DEVICE }, >> @@ -583,6 +593,7 @@ static void via_isa_realize(PCIDevice *d, Error > **errp) >> { >> ViaISAState *s = VIA_ISA(d); >> DeviceState *dev = DEVICE(d); >> +PCIBus *pci_bus = pci_get_bus(d); >> qemu_irq *isa_irq; >> ISABus *isa_bus; >> int i; >> @@ -607,6 +618,13 @@ static void via_isa_realize(PCIDevice *d, Error > **errp) >> if (!qdev_realize(DEVICE(>via_sio), BUS(isa_bus), errp)) { >> return; >> } >> + >> +/* Function 1: IDE */ >> +qdev_prop_set_int32(DEVICE(>ide), "addr", d->devfn + 1); >> +if (!qdev_realize(DEVICE(>ide), BUS(pci_bus), errp)) { >> +return; >> +} >> +pci_ide_create_devs(PCI_DEVICE(>ide)); > > I'm not sure about moving pci_ide_create_devs() here. This is usally > called from board code and only piix4 seems to do this. Maybe that's wrong > because if all IDE devices did this then one machine could not have more > than one different ide devices (like having an on-board ide and adding a > pci ide controoler with -device) so this probably belongs to the board > code to add devices to its default ide controller only as this is machine > specific. Unless I'm wrong in which case somebody will correct me. > Grepping the code it can be seen that it's always called right after creating the IDE controllers. 
The only notable exception is the "sii3112" device in the sam460ex board which is not emulated yet. Since the IDE >>> >>> The problem with sii3112 is that it only has 2 channels becuase I did not >>> bother to implement more so pci_ide_create_devs() probably would not
Re: [PATCH 8/9] hw/isa/vt82c686: QOM'ify RTC creation
Am 29. August 2022 19:50:10 MESZ schrieb BALATON Zoltan : >On Mon, 29 Aug 2022, BB wrote: >> Am 24. August 2022 01:23:14 MESZ schrieb BALATON Zoltan : >>> On Tue, 23 Aug 2022, Bernhard Beschow wrote: On Tue, Aug 23, 2022 at 2:20 AM BALATON Zoltan wrote: > On Tue, 23 Aug 2022, Bernhard Beschow wrote: >> Signed-off-by: Bernhard Beschow >> --- >> hw/isa/vt82c686.c | 12 +++- >> 1 file changed, 11 insertions(+), 1 deletion(-) >> >> diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c >> index 47f2fd2669..ee745d5d49 100644 >> --- a/hw/isa/vt82c686.c >> +++ b/hw/isa/vt82c686.c >> @@ -546,6 +546,7 @@ struct ViaISAState { >> qemu_irq cpu_intr; >> qemu_irq *isa_irqs; >> ViaSuperIOState via_sio; >> +RTCState rtc; >> PCIIDEState ide; >> UHCIState uhci[2]; >> ViaPMState pm; >> @@ -567,6 +568,7 @@ static void via_isa_init(Object *obj) >> { >> ViaISAState *s = VIA_ISA(obj); >> >> +object_initialize_child(obj, "rtc", >rtc, TYPE_MC146818_RTC); >> object_initialize_child(obj, "ide", >ide, "via-ide"); >> object_initialize_child(obj, "uhci1", >uhci[0], > "vt82c686b-usb-uhci"); >> object_initialize_child(obj, "uhci2", >uhci[1], > "vt82c686b-usb-uhci"); >> @@ -615,7 +617,15 @@ static void via_isa_realize(PCIDevice *d, Error > **errp) >> isa_bus_irqs(isa_bus, s->isa_irqs); >> i8254_pit_init(isa_bus, 0x40, 0, NULL); >> i8257_dma_init(isa_bus, 0); >> -mc146818_rtc_init(isa_bus, 2000, NULL); >> + >> +/* RTC */ >> +qdev_prop_set_int32(DEVICE(>rtc), "base_year", 2000); >> +if (!qdev_realize(DEVICE(>rtc), BUS(isa_bus), errp)) { >> +return; >> +} >> +object_property_add_alias(qdev_get_machine(), "rtc-time", > OBJECT(>rtc), >> + "date"); >> +isa_connect_gpio_out(ISA_DEVICE(>rtc), 0, s->rtc.isairq); >> >> for (i = 0; i < PCI_CONFIG_HEADER_SIZE; i++) { >> if (i < PCI_COMMAND || i >= PCI_REVISION_ID) { >> > > This actually introduces code duplication as all other places except piix4 > seem to still use the init function (probably to ensure that the rtc-rime > alias on the machine is properly set) so 
I'd keep this the same as > everything else and drop this patch until this init function is removed > from all other places as well. > Hi Zoltan, Thanks for the fast reply! Regarding code homogeneity and duplication I've made a similar argument for mc146818_rtc_init() in the past [1] and I've learnt that my patch went backwards. Incidentally, Peter mentioned vt686c as a candidate for the embed-the-device-struct style which - again incidentally - I've now done. >>> >>> I've seen patches embedding devices recently but in this case it looked not >>> that simple because of the rtc-time alias. >>> The rtc-time alias is actually only used by a couple of PPC machines where Pegasos II is one of them. So the alias actually needs to be created only for these machines, and identifying the cases where it has to be preserved requires a lot of careful investigation. In the Pegasos II case this seems especially complicated since one needs to look through several layers of devices. During my work on the VT82xx south bridges I've gained some knowledge such that I'd like to make this simplifying contribution. >>> >>> I've used it to implement the get-time-of-day rtas call with VOF in >>> pegasos2 because otherwise it would need to access internals of the RTC >>> model and/or duplicate some code. Here's the message discussing this: >>> >>> https://lists.nongnu.org/archive/html/qemu-ppc/2021-10/msg00170.html >>> >>> so this alias still seems to be the simplest way. >>> >>> I think the primary function of this alias is not these ppc machines but >>> some QMP/HMP command or to make the guest time available from the monitor >>> or something like that so it's probably also used from there and therefore >>> all rtc probably should have it but I'm not sure about it. >> >> Indeed, the alias seems to be a convenience for some QMP/HMP commands. >> AFAICS only the mc146818 sets the alias while it is probably not the only >> RTC modelled in QEMU. 
So I wonder why boards using another RTC don't need it >> and whether removing the alias constitutes a compatibility break. >> Our discussion makes me realize that the creation of the alias could now actually be moved to the Pegasos II board. This way, the Pegasos II board would both create and consume that alias, which seems to remove quite some cognitive load. Do you agree? Would moving the alias to the board work for you? >>> >>> Yes I think that would be better. This way the vt82xx and piix4 would be >>> similar and the alias would also
Re: [PATCH 8/9] hw/isa/vt82c686: QOM'ify RTC creation
On Mon, 29 Aug 2022, BB wrote: Am 24. August 2022 01:23:14 MESZ schrieb BALATON Zoltan : On Tue, 23 Aug 2022, Bernhard Beschow wrote: On Tue, Aug 23, 2022 at 2:20 AM BALATON Zoltan wrote: On Tue, 23 Aug 2022, Bernhard Beschow wrote: Signed-off-by: Bernhard Beschow --- hw/isa/vt82c686.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c index 47f2fd2669..ee745d5d49 100644 --- a/hw/isa/vt82c686.c +++ b/hw/isa/vt82c686.c @@ -546,6 +546,7 @@ struct ViaISAState { qemu_irq cpu_intr; qemu_irq *isa_irqs; ViaSuperIOState via_sio; +RTCState rtc; PCIIDEState ide; UHCIState uhci[2]; ViaPMState pm; @@ -567,6 +568,7 @@ static void via_isa_init(Object *obj) { ViaISAState *s = VIA_ISA(obj); +object_initialize_child(obj, "rtc", >rtc, TYPE_MC146818_RTC); object_initialize_child(obj, "ide", >ide, "via-ide"); object_initialize_child(obj, "uhci1", >uhci[0], "vt82c686b-usb-uhci"); object_initialize_child(obj, "uhci2", >uhci[1], "vt82c686b-usb-uhci"); @@ -615,7 +617,15 @@ static void via_isa_realize(PCIDevice *d, Error **errp) isa_bus_irqs(isa_bus, s->isa_irqs); i8254_pit_init(isa_bus, 0x40, 0, NULL); i8257_dma_init(isa_bus, 0); -mc146818_rtc_init(isa_bus, 2000, NULL); + +/* RTC */ +qdev_prop_set_int32(DEVICE(>rtc), "base_year", 2000); +if (!qdev_realize(DEVICE(>rtc), BUS(isa_bus), errp)) { +return; +} +object_property_add_alias(qdev_get_machine(), "rtc-time", OBJECT(>rtc), + "date"); +isa_connect_gpio_out(ISA_DEVICE(>rtc), 0, s->rtc.isairq); for (i = 0; i < PCI_CONFIG_HEADER_SIZE; i++) { if (i < PCI_COMMAND || i >= PCI_REVISION_ID) { This actually introduces code duplication as all other places except piix4 seem to still use the init function (probably to ensure that the rtc-rime alias on the machine is properly set) so I'd keep this the same as everything else and drop this patch until this init function is removed from all other places as well. Hi Zoltan, Thanks for the fast reply! 
Regarding code homogeneity and duplication I've made a similar argument for mc146818_rtc_init() in the past [1] and I've learnt that my patch went backwards. Incidentally, Peter mentioned vt686c as a candidate for the embed-the-device-struct style which - again incidentally - I've now done. I've seen patches embedding devices recently but in this case it looked not that simple because of the rtc-time alias. The rtc-time alias is actually only used by a couple of PPC machines where Pegasos II is one of them. So the alias actually needs to be created only for these machines, and identifying the cases where it has to be preserved requires a lot of careful investigation. In the Pegasos II case this seems especially complicated since one needs to look through several layers of devices. During my work on the VT82xx south bridges I've gained some knowledge such that I'd like to make this simplifying contribution. I've used it to implement the get-time-of-day rtas call with VOF in pegasos2 because otherwise it would need to access internals of the RTC model and/or duplicate some code. Here's the message discussing this: https://lists.nongnu.org/archive/html/qemu-ppc/2021-10/msg00170.html so this alias still seems to be the simplest way. I think the primary function of this alias is not these ppc machines but some QMP/HMP command or to make the guest time available from the monitor or something like that so it's probably also used from there and therefore all rtc probably should have it but I'm not sure about it. Indeed, the alias seems to be a convenience for some QMP/HMP commands. AFAICS only the mc146818 sets the alias while it is probably not the only RTC modelled in QEMU. So I wonder why boards using another RTC don't need it and whether removing the alias constitutes a compatibility break. Our discussion makes me realize that the creation of the alias could now actually be moved to the Pegasos II board. 
This way, the Pegasos II board would both create and consume that alias, which seems to remove quite some cognitive load. Do you agree? Would moving the alias to the board work for you? Yes I think that would be better. This way the vt82xx and piix4 would be similar and the alias would also be clear within the pegasos2 code and it also has the machine directly at that point so it's clearer that way. All in all I wonder if we need to preserve the alias for the fuloong2e board? I don't know. A quick investigation shows that it seems to be added by commit 654a36d857ff94 which suggests something may use it (or was intended to use it back then, but not sure if things have changed in the meantime). I don't think any management app cares about fuloong2e but if this should be a generic thing then all machine may need it. Then it's also mentioned in commit 29551fdcf4d99 that suggests one ought to
Re: [PATCH RFC 00/13] migration: Postcopy Preempt-Full
On Mon, Aug 29, 2022 at 12:56:46PM -0400, Peter Xu wrote: > This is a RFC series. Tree is here: > > https://github.com/xzpeter/qemu/tree/preempt-full > > It's not complete because there're still something we need to do which will > be attached to the end of this cover letter, however this series can > already safely pass qtest and any of my test. > > Comparing to the recently merged preempt mode I called it "preempt-full" > because it threadifies the postcopy channels so now urgent pages can be > fully handled separately outside of the ram save loop. Sorry to have the > same name as the PREEMPT_FULL in the Linux RT world, it's just that we > needed a name for the capability and it was named as preempt already > anyway.. > > The existing preempt code has reduced ramdom page req latency over 10Gbps > network from ~12ms to ~500us which has already landed. > > This preempt-full series can further reduces that ~500us to ~230us per my > initial test. More to share below. > > Note that no new capability is needed, IOW it's fully compatible with the > existing preempt mode. So the naming is actually not important but just to > identify the difference on the binaries. It's because this series only > reworks the sender side code and does not change the migration protocol, it > just runs faster. > > IOW, old "preempt" QEMU can also migrate to "preempt-full" QEMU, vice versa. > > - When old "preempt" mode QEMU migrates to "preempt-full" QEMU, it'll be > the same as running both old "preempt" QEMUs. > > - When "preempt-full" QEMU migrates to old "preempt" QEMU, it'll be the > same as running both "preempt-full". > > The logic of the series is quite simple too: simply moving the existing > preempt channel page sends to rp-return thread. It can slow down rp-return > thread on receiving pages, but I don't really see a major issue with it so > far. > > This latency number is getting close to the extreme of 4K page request > latency of any TCP roundtrip of the 10Gbps nic I have. 
The 'extreme > number' is something I get from mig_mon tool which has a mode [1] to > emulate the extreme tcp roundtrips of page requests. > > Performance > === > > Page request latencies has distributions as below, with a VM of 20G mem, 20 > cores, 10Gbps nic, 18G fully random writes: > > Postcopy Vanilla > > > Average: 12093 (us) > @delay_us: > [1]1 | > | > [2, 4) 0 | > | > [4, 8) 0 | > | > [8, 16)0 | > | > [16, 32) 1 | > | > [32, 64) 8 | > | > [64, 128) 11 | > | > [128, 256)14 | > | > [256, 512)19 | > | > [512, 1K) 14 | > | > [1K, 2K) 35 | > | > [2K, 4K) 18 | > | > [4K, 8K) 87 |@ > | > [8K, 16K) 2397 > || > [16K, 32K) 7 | > | > [32K, 64K) 2 | > | > [64K, 128K) 20 | > | > [128K, 256K) 6 | > | > > Postcopy Preempt > > > Average: 496 (us) > > @delay_us: > [32, 64) 2 | > | > [64, 128) 2306 | > | > [128, 256) 25422 > || > [256, 512) 8238 | > | > [512, 1K) 1066 |@@ > | > [1K, 2K)2167 | > | > [2K, 4K)3329 |@@ > | > [4K, 8K) 109 | > | > [8K, 16K) 48 | > | > > Postcopy Preempt-Full > - > > Average: 229 (us) > > @delay_us: > [8, 16)1 | > | > [16, 32) 3 |
Re: [PATCH 1/1] monitor/hmp: print trace as option in help for log command
Sorry that the format for "none" should be changed as well. I have sent a v2: https://lists.gnu.org/archive/html/qemu-devel/2022-08/msg04445.html Thank you very much! Dongli Zhang On 8/29/22 3:04 AM, Dongli Zhang wrote: > The below is printed when printing help information in qemu-system-x86_64 > command line, and when CONFIG_TRACE_LOG is enabled: > > $ qemu-system-x86_64 -d help > ... ... > trace:PATTERN enable trace events > > Use "-d trace:help" to get a list of trace events. > > However, they are not printed in hmp "help log" command. > > Cc: Joe Jin > Signed-off-by: Dongli Zhang > --- > monitor/hmp.c | 7 ++- > 1 file changed, 6 insertions(+), 1 deletion(-) > > diff --git a/monitor/hmp.c b/monitor/hmp.c > index 15ca047..9f48b70 100644 > --- a/monitor/hmp.c > +++ b/monitor/hmp.c > @@ -287,8 +287,13 @@ void help_cmd(Monitor *mon, const char *name) > monitor_printf(mon, "Log items (comma separated):\n"); > monitor_printf(mon, "%-10s %s\n", "none", "remove all logs"); > for (item = qemu_log_items; item->mask != 0; item++) { > -monitor_printf(mon, "%-10s %s\n", item->name, item->help); > +monitor_printf(mon, "%-15s %s\n", item->name, item->help); > } > +#ifdef CONFIG_TRACE_LOG > +monitor_printf(mon, "trace:PATTERN enable trace events\n"); > +monitor_printf(mon, "\nUse \"info trace-events\" to get a list > of " > +"trace events.\n\n"); > +#endif > return; > } > >
Re: [PATCH 8/9] hw/isa/vt82c686: QOM'ify RTC creation
Am 24. August 2022 01:23:14 MESZ schrieb BALATON Zoltan : >On Tue, 23 Aug 2022, Bernhard Beschow wrote: >> On Tue, Aug 23, 2022 at 2:20 AM BALATON Zoltan wrote: >>> On Tue, 23 Aug 2022, Bernhard Beschow wrote: Signed-off-by: Bernhard Beschow --- hw/isa/vt82c686.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c index 47f2fd2669..ee745d5d49 100644 --- a/hw/isa/vt82c686.c +++ b/hw/isa/vt82c686.c @@ -546,6 +546,7 @@ struct ViaISAState { qemu_irq cpu_intr; qemu_irq *isa_irqs; ViaSuperIOState via_sio; +RTCState rtc; PCIIDEState ide; UHCIState uhci[2]; ViaPMState pm; @@ -567,6 +568,7 @@ static void via_isa_init(Object *obj) { ViaISAState *s = VIA_ISA(obj); +object_initialize_child(obj, "rtc", >rtc, TYPE_MC146818_RTC); object_initialize_child(obj, "ide", >ide, "via-ide"); object_initialize_child(obj, "uhci1", >uhci[0], >>> "vt82c686b-usb-uhci"); object_initialize_child(obj, "uhci2", >uhci[1], >>> "vt82c686b-usb-uhci"); @@ -615,7 +617,15 @@ static void via_isa_realize(PCIDevice *d, Error >>> **errp) isa_bus_irqs(isa_bus, s->isa_irqs); i8254_pit_init(isa_bus, 0x40, 0, NULL); i8257_dma_init(isa_bus, 0); -mc146818_rtc_init(isa_bus, 2000, NULL); + +/* RTC */ +qdev_prop_set_int32(DEVICE(>rtc), "base_year", 2000); +if (!qdev_realize(DEVICE(>rtc), BUS(isa_bus), errp)) { +return; +} +object_property_add_alias(qdev_get_machine(), "rtc-time", >>> OBJECT(>rtc), + "date"); +isa_connect_gpio_out(ISA_DEVICE(>rtc), 0, s->rtc.isairq); for (i = 0; i < PCI_CONFIG_HEADER_SIZE; i++) { if (i < PCI_COMMAND || i >= PCI_REVISION_ID) { >>> >>> This actually introduces code duplication as all other places except piix4 >>> seem to still use the init function (probably to ensure that the rtc-rime >>> alias on the machine is properly set) so I'd keep this the same as >>> everything else and drop this patch until this init function is removed >>> from all other places as well. >>> >> >> Hi Zoltan, >> >> Thanks for the fast reply! 
Regarding code homogeneity and duplication I've >> made a similar argument for mc146818_rtc_init() in the past [1] and I've >> learnt that my patch went backwards. Incidentally, Peter mentioned vt686c >> as a candidate for the embed-the-device-struct style which - again >> incidentally - I've now done. > >I've seen patches embedding devices recently but in this case it looked not >that simple because of the rtc-time alias. > >> The rtc-time alias is actually only used by a couple of PPC machines where >> Pegasos II is one of them. So the alias actually needs to be created only >> for these machines, and identifying the cases where it has to be preserved >> requires a lot of careful investigation. In the Pegasos II case this seems >> especially complicated since one needs to look through several layers of >> devices. During my work on the VT82xx south bridges I've gained some >> knowledge such that I'd like to make this simplifying contribution. > >I've used it to implement the get-time-of-day rtas call with VOF in pegasos2 >because otherwise it would need to access internals of the RTC model and/or >duplicate some code. Here's the message discussing this: > >https://lists.nongnu.org/archive/html/qemu-ppc/2021-10/msg00170.html > >so this alias still seems to be the simplest way. > >I think the primary function of this alias is not these ppc machines but some >QMP/HMP command or to make the guest time available from the monitor or >something like that so it's probably also used from there and therefore all >rtc probably should have it but I'm not sure about it. Indeed, the alias seems to be a convenience for some QMP/HMP commands. AFAICS only the mc146818 sets the alias while it is probably not the only RTC modelled in QEMU. So I wonder why boards using another RTC don't need it and whether removing the alias constitutes a compatibility break. >> Our discussion makes me realize that the creation of the alias could now >> actually be moved to the Pegasos II board. 
This way, the Pegasos II board >> would both create and consume that alias, which seems to remove quite some >> cognitive load. Do you agree? Would moving the alias to the board work for >> you? > >Yes I think that would be better. This way the vt82xx and piix4 would be >similar and the alias would also be clear within the pegasos2 code and it also >has the machine directly at that point so it's clearer that way. All in all I wonder if we need to preserve the alias for the fuloong2e board? Best regards, Bernhard > >Regards, >BALATON Zoltan
[PATCH v2 1/1] monitor/hmp: print trace as option in help for log command
The below is printed when printing help information in qemu-system-x86_64 command line, and when CONFIG_TRACE_LOG is enabled: $ qemu-system-x86_64 -d help ... ... trace:PATTERN enable trace events Use "-d trace:help" to get a list of trace events. However, they are not printed in hmp "help log" command. Cc: Joe Jin Signed-off-by: Dongli Zhang --- Changed since v1: - change format for "none" as well. monitor/hmp.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/monitor/hmp.c b/monitor/hmp.c index 15ca047..467fc84 100644 --- a/monitor/hmp.c +++ b/monitor/hmp.c @@ -285,10 +285,15 @@ void help_cmd(Monitor *mon, const char *name) if (!strcmp(name, "log")) { const QEMULogItem *item; monitor_printf(mon, "Log items (comma separated):\n"); -monitor_printf(mon, "%-10s %s\n", "none", "remove all logs"); +monitor_printf(mon, "%-15s %s\n", "none", "remove all logs"); for (item = qemu_log_items; item->mask != 0; item++) { -monitor_printf(mon, "%-10s %s\n", item->name, item->help); +monitor_printf(mon, "%-15s %s\n", item->name, item->help); } +#ifdef CONFIG_TRACE_LOG +monitor_printf(mon, "trace:PATTERN enable trace events\n"); +monitor_printf(mon, "\nUse \"info trace-events\" to get a list of " +"trace events.\n\n"); +#endif return; } -- 1.8.3.1
[PATCH v2 2/4] tests/x86: Add 'q35' machine type to ivshmem-test
Configure pci bridge setting to test ivshmem on 'q35'. Signed-off-by: Michael Labiuk --- tests/qtest/ivshmem-test.c | 30 ++ 1 file changed, 30 insertions(+) diff --git a/tests/qtest/ivshmem-test.c b/tests/qtest/ivshmem-test.c index e23a97fa8e..c4ca7efc62 100644 --- a/tests/qtest/ivshmem-test.c +++ b/tests/qtest/ivshmem-test.c @@ -378,6 +378,32 @@ static void test_ivshmem_server(void) close(thread.pipe[0]); } +static void device_del(QTestState *qtest, const char *id) +{ +QDict *resp; + +resp = qtest_qmp(qtest, + "{'execute': 'device_del'," + " 'arguments': { 'id': %s } }", id); + +g_assert(qdict_haskey(resp, "return")); +qobject_unref(resp); +} + +static void test_ivshmem_hotplug_q35(void) +{ +QTestState *qts = qtest_init("-object memory-backend-ram,size=1M,id=mb1 " + "-device pcie-root-port,id=p1 " + "-device pcie-pci-bridge,bus=p1,id=b1 " + "-machine q35"); + +qtest_qmp_device_add(qts, "ivshmem-plain", "iv1", + "{'memdev': 'mb1', 'bus': 'b1'}"); +device_del(qts, "iv1"); + +qtest_quit(qts); +} + #define PCI_SLOT_HP 0x06 static void test_ivshmem_hotplug(void) @@ -469,6 +495,7 @@ int main(int argc, char **argv) { int ret, fd; gchar dir[] = "/tmp/ivshmem-test.XX"; +const char *arch = qtest_get_arch(); g_test_init(, , NULL); @@ -494,6 +521,9 @@ int main(int argc, char **argv) qtest_add_func("/ivshmem/pair", test_ivshmem_pair); qtest_add_func("/ivshmem/server", test_ivshmem_server); } +if (!strcmp(arch, "x86_64")) { +qtest_add_func("/ivshmem/hotplug-q35", test_ivshmem_hotplug_q35); +} out: ret = g_test_run(); -- 2.34.1
[PATCH v2 3/4] tests/x86: Add 'q35' machine type to hd-geo-test
Add pci bridge setting to test hotplug. Duplicate tests for plugging scsi and virtio devices for q35 machine type. Signed-off-by: Michael Labiuk --- tests/qtest/hd-geo-test.c | 148 ++ 1 file changed, 148 insertions(+) diff --git a/tests/qtest/hd-geo-test.c b/tests/qtest/hd-geo-test.c index 413cf964c0..256450729f 100644 --- a/tests/qtest/hd-geo-test.c +++ b/tests/qtest/hd-geo-test.c @@ -874,6 +874,78 @@ static void test_override_scsi_hot_unplug(void) g_free(args); } +static void test_override_scsi_hot_unplug_q35(void) +{ +QTestState *qts; +char *joined_args; +QFWCFG *fw_cfg; +QDict *response; +int i; +TestArgs *args = create_args(); +CHSResult expected[] = { +{ +"/pci@i0cf8/pci-bridge@1/pci-bridge@0/scsi@2/channel@0/disk@0,0", +{1, 120, 30} +}, +{ +"/pci@i0cf8/pci-bridge@1/pci-bridge@0/scsi@2/channel@0/disk@1,0", +{20, 20, 20} +}, +{NULL, {0, 0, 0} } +}; +CHSResult expected2[] = { +{ +"/pci@i0cf8/pci-bridge@1/pci-bridge@0/scsi@2/channel@0/disk@1,0", +{20, 20, 20} +}, +{NULL, {0, 0, 0} } +}; +add_drive_with_mbr(args, empty_mbr, 1); +add_drive_with_mbr(args, empty_mbr, 1); +add_scsi_controller(args, "virtio-scsi-pci", "b1", 2); +add_scsi_disk(args, 0, 0, 0, 0, 0, 1, 120, 30); +add_scsi_disk(args, 1, 0, 0, 1, 0, 20, 20, 20); + +joined_args = g_strjoinv(" ", args->argv); + +qts = qtest_initf("-device pcie-root-port,id=p0 " + "-device pcie-pci-bridge,bus=p0,id=b1 " + "-machine q35 %s", joined_args); +fw_cfg = pc_fw_cfg_init(qts); + +read_bootdevices(fw_cfg, expected); + +/* unplug device and restart */ +response = qtest_qmp(qts, + "{ 'execute': 'device_del'," + " 'arguments': {'id': 'scsi-disk0' }}"); +g_assert(response); +g_assert(!qdict_haskey(response, "error")); +qobject_unref(response); +response = qtest_qmp(qts, + "{ 'execute': 'system_reset', 'arguments': { }}"); +g_assert(response); +g_assert(!qdict_haskey(response, "error")); +qobject_unref(response); + +qtest_qmp_eventwait(qts, "RESET"); + +read_bootdevices(fw_cfg, expected2); + +g_free(joined_args); 
+qtest_quit(qts); + +g_free(fw_cfg); + +for (i = 0; i < args->n_drives; i++) { +unlink(args->drives[i]); +free(args->drives[i]); +} +g_free(args->drives); +g_strfreev(args->argv); +g_free(args); +} + static void test_override_virtio_hot_unplug(void) { QTestState *qts; @@ -934,6 +1006,77 @@ static void test_override_virtio_hot_unplug(void) g_free(args); } +static void test_override_virtio_hot_unplug_q35(void) +{ +QTestState *qts; +char *joined_args; +QFWCFG *fw_cfg; +QDict *response; +int i; +TestArgs *args = create_args(); +CHSResult expected[] = { +{ +"/pci@i0cf8/pci-bridge@2/pci-bridge@0/scsi@2/disk@0,0", +{1, 120, 30} +}, +{ +"/pci@i0cf8/pci-bridge@2/pci-bridge@0/scsi@3/disk@0,0", +{20, 20, 20} +}, +{NULL, {0, 0, 0} } +}; +CHSResult expected2[] = { +{ +"/pci@i0cf8/pci-bridge@2/pci-bridge@0/scsi@3/disk@0,0", +{20, 20, 20} +}, +{NULL, {0, 0, 0} } +}; +add_drive_with_mbr(args, empty_mbr, 1); +add_drive_with_mbr(args, empty_mbr, 1); +add_virtio_disk(args, 0, "b1", 2, 1, 120, 30); +add_virtio_disk(args, 1, "b1", 3, 20, 20, 20); + +joined_args = g_strjoinv(" ", args->argv); + +qts = qtest_initf("-device pcie-root-port,id=p0 " + "-device pcie-pci-bridge,bus=p0,id=b1 " + "-machine q35 %s", joined_args); +fw_cfg = pc_fw_cfg_init(qts); + +read_bootdevices(fw_cfg, expected); + +/* unplug device and restart */ +response = qtest_qmp(qts, + "{ 'execute': 'device_del'," + " 'arguments': {'id': 'virtio-disk0' }}"); +g_assert(response); +g_assert(!qdict_haskey(response, "error")); +qobject_unref(response); +response = qtest_qmp(qts, + "{ 'execute': 'system_reset', 'arguments': { }}"); +g_assert(response); +g_assert(!qdict_haskey(response, "error")); +qobject_unref(response); + +qtest_qmp_eventwait(qts, "RESET"); + +read_bootdevices(fw_cfg, expected2); + +g_free(joined_args); +qtest_quit(qts); + +g_free(fw_cfg); + +for (i = 0; i < args->n_drives; i++) { +unlink(args->drives[i]); +free(args->drives[i]); +} +g_free(args->drives); +g_strfreev(args->argv); +g_free(args); +} + int 
main(int argc, char **argv) { Backend i; @@ -974,8 +1117,13 @@ int main(int argc, char **argv)
Re: [PATCH 4/9] hw/isa/vt82c686: QOM'ify via-ide creation
On Mon, 29 Aug 2022, BB wrote: Am 25. August 2022 01:18:56 MESZ schrieb BALATON Zoltan : On Thu, 25 Aug 2022, Bernhard Beschow wrote: On Wed, Aug 24, 2022 at 3:54 PM BALATON Zoltan wrote: On Tue, 23 Aug 2022, Bernhard Beschow wrote: The IDE function is closely tied to the ISA function (e.g. the IDE interrupt routing happens there), so it makes sense that the IDE function is instantiated within the southbridge itself. As a side effect, duplicated code in the boards is resolved. Signed-off-by: Bernhard Beschow --- configs/devices/mips64el-softmmu/default.mak | 1 - hw/isa/Kconfig | 1 + hw/isa/vt82c686.c| 18 ++ hw/mips/fuloong2e.c | 3 --- hw/ppc/Kconfig | 1 - hw/ppc/pegasos2.c| 4 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/configs/devices/mips64el-softmmu/default.mak b/configs/devices/mips64el-softmmu/default.mak index c610749ac1..d5188f7ea5 100644 --- a/configs/devices/mips64el-softmmu/default.mak +++ b/configs/devices/mips64el-softmmu/default.mak @@ -1,7 +1,6 @@ # Default configuration for mips64el-softmmu include ../mips-softmmu/common.mak -CONFIG_IDE_VIA=y CONFIG_FULOONG=y CONFIG_LOONGSON3V=y CONFIG_ATI_VGA=y diff --git a/hw/isa/Kconfig b/hw/isa/Kconfig index d42143a991..20de7e9294 100644 --- a/hw/isa/Kconfig +++ b/hw/isa/Kconfig @@ -53,6 +53,7 @@ config VT82C686 select I8254 select I8257 select I8259 +select IDE_VIA select MC146818RTC select PARALLEL diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c index 5582c0b179..37d9ed635d 100644 --- a/hw/isa/vt82c686.c +++ b/hw/isa/vt82c686.c @@ -17,6 +17,7 @@ #include "hw/isa/vt82c686.h" #include "hw/pci/pci.h" #include "hw/qdev-properties.h" +#include "hw/ide/pci.h" #include "hw/isa/isa.h" #include "hw/isa/superio.h" #include "hw/intc/i8259.h" @@ -544,6 +545,7 @@ struct ViaISAState { qemu_irq cpu_intr; qemu_irq *isa_irqs; ViaSuperIOState via_sio; +PCIIDEState ide; }; static const VMStateDescription vmstate_via = { @@ -556,10 +558,18 @@ static const VMStateDescription vmstate_via = { } }; +static void 
via_isa_init(Object *obj) +{ +ViaISAState *s = VIA_ISA(obj); + +object_initialize_child(obj, "ide", >ide, "via-ide"); +} + static const TypeInfo via_isa_info = { .name = TYPE_VIA_ISA, .parent= TYPE_PCI_DEVICE, .instance_size = sizeof(ViaISAState), +.instance_init = via_isa_init, .abstract = true, .interfaces= (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, @@ -583,6 +593,7 @@ static void via_isa_realize(PCIDevice *d, Error **errp) { ViaISAState *s = VIA_ISA(d); DeviceState *dev = DEVICE(d); +PCIBus *pci_bus = pci_get_bus(d); qemu_irq *isa_irq; ISABus *isa_bus; int i; @@ -607,6 +618,13 @@ static void via_isa_realize(PCIDevice *d, Error **errp) if (!qdev_realize(DEVICE(>via_sio), BUS(isa_bus), errp)) { return; } + +/* Function 1: IDE */ +qdev_prop_set_int32(DEVICE(>ide), "addr", d->devfn + 1); +if (!qdev_realize(DEVICE(>ide), BUS(pci_bus), errp)) { +return; +} +pci_ide_create_devs(PCI_DEVICE(>ide)); I'm not sure about moving pci_ide_create_devs() here. This is usally called from board code and only piix4 seems to do this. Maybe that's wrong because if all IDE devices did this then one machine could not have more than one different ide devices (like having an on-board ide and adding a pci ide controoler with -device) so this probably belongs to the board code to add devices to its default ide controller only as this is machine specific. Unless I'm wrong in which case somebody will correct me. Grepping the code it can be seen that it's always called right after creating the IDE controllers. The only notable exception is the "sii3112" device in the sam460ex board which is not emulated yet. Since the IDE The problem with sii3112 is that it only has 2 channels becuase I did not bother to implement more so pci_ide_create_devs() probably would not work as it assumes 4 channels. AFAIK this means that the short -hda, -cdrom, etc. convenience options don't work with sam460ex but yhou have to use the long way of creating ide-hd and ide-cd devices on the command line. 
I think there's a version of this controller with 4 channels, maybe called sii3114 or similar and it would be easy to enhance the current model for that but I haven't done that. What's not emulated on sam460ex is the on-board SATA ports of the SoC because it's too complex and all guest OSes have sii31xx drivers so it was simpler to implement that instead. The network port is the same as we already have working PCI network cards so I did not try to implement the 460EX network ports. controllers are often created in board code this means pci_ide_create_devs() is called there as well. Grouping these calls
[PATCH v2 1/4] tests/x86: Add subtest with 'q35' machine type to device-plug-test
Configure pci bridge setting to plug pci device and unplug. Signed-off-by: Michael Labiuk --- tests/qtest/device-plug-test.c | 26 ++ 1 file changed, 26 insertions(+) diff --git a/tests/qtest/device-plug-test.c b/tests/qtest/device-plug-test.c index 2e3137843e..2f07b37ba1 100644 --- a/tests/qtest/device-plug-test.c +++ b/tests/qtest/device-plug-test.c @@ -165,6 +165,26 @@ static void test_spapr_phb_unplug_request(void) qtest_quit(qtest); } +static void test_q35_pci_unplug_request(void) +{ + +QTestState *qtest = qtest_initf("-machine q35 " +"-device pcie-root-port,id=p1 " +"-device pcie-pci-bridge,bus=p1,id=b1 " +"-device virtio-mouse-pci,bus=b1,id=dev0"); + +/* + * Request device removal. As the guest is not running, the request won't + * be processed. However during system reset, the removal will be + * handled, removing the device. + */ +device_del(qtest, "dev0"); +system_reset(qtest); +wait_device_deleted_event(qtest, "dev0"); + +qtest_quit(qtest); +} + int main(int argc, char **argv) { const char *arch = qtest_get_arch(); @@ -195,5 +215,11 @@ int main(int argc, char **argv) test_spapr_phb_unplug_request); } +if (!strcmp(arch, "x86_64")) { +qtest_add_func("/device-plug/q35-pci-unplug-request", + test_q35_pci_unplug_request); + +} + return g_test_run(); } -- 2.34.1
[PATCH v2 4/4] tests/x86: Add 'q35' machine type to drive_del-test
Configure pci bridge setting Also run tests on 'q35' machine type. Signed-off-by: Michael Labiuk --- tests/qtest/drive_del-test.c | 111 +++ 1 file changed, 111 insertions(+) diff --git a/tests/qtest/drive_del-test.c b/tests/qtest/drive_del-test.c index 5e6d58b4dd..3a2ddecf22 100644 --- a/tests/qtest/drive_del-test.c +++ b/tests/qtest/drive_del-test.c @@ -258,6 +258,27 @@ static void test_cli_device_del(void) qtest_quit(qts); } +static void test_cli_device_del_q35(void) +{ +QTestState *qts; + +/* + * -drive/-device and device_del. Start with a drive used by a + * device that unplugs after reset. + */ +qts = qtest_initf("-drive if=none,id=drive0,file=null-co://," + "file.read-zeroes=on,format=raw " + "-machine q35 -device pcie-root-port,id=p1 " + "-device pcie-pci-bridge,bus=p1,id=b1 " + "-device virtio-blk-%s,drive=drive0,bus=b1,id=dev0", + qvirtio_get_dev_type()); + +device_del(qts, true); +g_assert(!has_drive(qts)); + +qtest_quit(qts); +} + static void test_empty_device_del(void) { QTestState *qts; @@ -294,6 +315,45 @@ static void test_device_add_and_del(void) qtest_quit(qts); } +static void device_add_q35(QTestState *qts) +{ +QDict *response; +char driver[32]; +snprintf(driver, sizeof(driver), "virtio-blk-%s", + qvirtio_get_dev_type()); + +response = qtest_qmp(qts, "{'execute': 'device_add'," + " 'arguments': {" + " 'driver': %s," + " 'drive': 'drive0'," + " 'id': 'dev0'," + " 'bus': 'b1'" + "}}", driver); +g_assert(response); +g_assert(qdict_haskey(response, "return")); +qobject_unref(response); +} + +static void test_device_add_and_del_q35(void) +{ +QTestState *qts; + +/* + * -drive/device_add and device_del. Start with a drive used by a + * device that unplugs after reset. 
+ */ +qts = qtest_initf("-machine q35 -device pcie-root-port,id=p1 " + "-device pcie-pci-bridge,bus=p1,id=b1 " + "-drive if=none,id=drive0,file=null-co://," + "file.read-zeroes=on,format=raw"); + +device_add_q35(qts); +device_del(qts, true); +g_assert(!has_drive(qts)); + +qtest_quit(qts); +} + static void test_drive_add_device_add_and_del(void) { QTestState *qts; + +qts = qtest_init("-machine q35 -device pcie-root-port,id=p1 " + "-device pcie-pci-bridge,bus=p1,id=b1"); + +/* + * drive_add/device_add and device_del. The drive is used by a + * device that unplugs after reset. + */ +drive_add_with_media(qts); +device_add_q35(qts); +device_del(qts, true); +g_assert(!has_drive(qts)); + +qtest_quit(qts); +} + static void test_blockdev_add_device_add_and_del(void) { QTestState *qts; @@ -342,8 +421,29 @@ static void test_blockdev_add_device_add_and_del(void) qtest_quit(qts); } +static void test_blockdev_add_device_add_and_del_q35(void) +{ +QTestState *qts; + +qts = qtest_init("-machine q35 -device pcie-root-port,id=p1 " + "-device pcie-pci-bridge,bus=p1,id=b1"); + +/* + * blockdev_add/device_add and device_del. The drive is used by a + * device that unplugs after reset, but it doesn't go away. 
+ */ +blockdev_add_with_media(qts); +device_add_q35(qts); +device_del(qts, true); +g_assert(has_blockdev(qts)); + +qtest_quit(qts); +} + int main(int argc, char **argv) { +const char *arch = qtest_get_arch(); + g_test_init(, , NULL); qtest_add_func("/drive_del/without-dev", test_drive_without_dev); @@ -363,6 +463,17 @@ int main(int argc, char **argv) test_empty_device_del); qtest_add_func("/device_del/blockdev", test_blockdev_add_device_add_and_del); + +if (!strcmp(arch, "x86_64")) { +qtest_add_func("/device_del/drive/cli_device_q35", + test_cli_device_del_q35); +qtest_add_func("/device_del/drive/device_add_q35", + test_device_add_and_del_q35); +qtest_add_func("/device_del/drive/drive_add_device_add_q35", + test_drive_add_device_add_and_del_q35); +qtest_add_func("/device_del/blockdev_q35", + test_blockdev_add_device_add_and_del_q35); +} } return g_test_run(); -- 2.34.1
[PATCH v2 0/4] Add 'q35' machine type to hotplug tests
Add pci bridge setting to run hotplug tests on q35 machine type. Hotplug tests was bounded to 'pc' machine type by commit 7b172333f1b Michael Labiuk (4): tests/x86: Add subtest with 'q35' machine type to device-plug-test tests/x86: Add 'q35' machine type to ivshmem-test tests/x86: Add 'q35' machine type to hd-geo-test tests/x86: Add 'q35' machine type to drive_del-test tests/qtest/device-plug-test.c | 26 ++ tests/qtest/drive_del-test.c | 111 + tests/qtest/hd-geo-test.c | 148 + tests/qtest/ivshmem-test.c | 30 +++ 4 files changed, 315 insertions(+) -- 2.34.1
[PATCH RFC 13/13] migration: Send requested page directly in rp-return thread
With all the facilities ready, send the requested page directly in the rp-return thread rather than queuing it in the request queue, if and only if postcopy preempt is enabled. It can achieve so because it uses separate channel for sending urgent pages. The only shared data is bitmap and it's protected by the bitmap_mutex. Signed-off-by: Peter Xu --- migration/ram.c | 108 1 file changed, 108 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index ef89812c69..e731a70255 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -539,6 +539,8 @@ static QemuThread *decompress_threads; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; +static int ram_save_host_page_urgent(PageSearchStatus *pss); + static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, ram_addr_t offset, uint8_t *source_buf); @@ -553,6 +555,16 @@ static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) pss->complete_round = false; } +/* + * Check whether two PSSs are actively sending the same page. Return true + * if it is, false otherwise. + */ +static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) +{ +return pss1->host_page_sending && pss2->host_page_sending && +(pss1->host_page_start == pss2->host_page_start); +} + static void *do_data_compress(void *opaque) { CompressParam *param = opaque; @@ -2250,6 +2262,53 @@ int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len) return -1; } +/* + * When with postcopy preempt, we send back the page directly in the + * rp-return thread. 
+ */ +if (postcopy_preempt_active()) { +ram_addr_t page_start = start >> TARGET_PAGE_BITS; +size_t page_size = qemu_ram_pagesize(ramblock); +PageSearchStatus *pss = _state->pss[RAM_CHANNEL_POSTCOPY]; +int ret = 0; + +qemu_mutex_lock(>bitmap_mutex); + +pss_init(pss, ramblock, page_start); +/* Always use the preempt channel, and make sure it's there */ +pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; +pss->postcopy_requested = true; +assert(pss->pss_channel); + +/* + * It must be either one or multiple of host page size. Just + * assert; if something wrong we're mostly split brain anyway. + */ +assert(len % page_size == 0); +while (len) { +if (ram_save_host_page_urgent(pss)) { +error_report("%s: ram_save_host_page_urgent() failed: " + "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, + __func__, ramblock->idstr, start); +ret = -1; +break; +} +/* + * NOTE: after ram_save_host_page_urgent() succeeded, pss->page + * will automatically be moved and point to the next host page + * we're going to send, so no need to update here. + * + * Normally QEMU never sends >1 host page in requests, so + * logically we don't even need that as the loop should only + * run once, but just to be consistent. + */ +len -= page_size; +}; +qemu_mutex_unlock(>bitmap_mutex); + +return ret; +} + struct RAMSrcPageRequest *new_entry = g_new0(struct RAMSrcPageRequest, 1); new_entry->rb = ramblock; @@ -2528,6 +2587,55 @@ static void pss_host_page_finish(PageSearchStatus *pss) pss->host_page_end = 0; } +/* + * Send an urgent host page specified by `pss'. Need to be called with + * bitmap_mutex held. + * + * Returns 0 if save host page succeeded, false otherwise. 
+ */ +static int ram_save_host_page_urgent(PageSearchStatus *pss) +{ +bool page_dirty, sent = false; +RAMState *rs = ram_state; +int ret = 0; + +trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); +pss_host_page_prepare(pss); + +/* + * If precopy is sending the same page, let it be done in precopy, or + * we could send the same page in two channels and none of them will + * receive the whole page. + */ +if (pss_overlap(pss, _state->pss[RAM_CHANNEL_PRECOPY])) { +trace_postcopy_preempt_hit(pss->block->idstr, + pss->page << TARGET_PAGE_BITS); +return 0; +} + +do { +page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); + +if (page_dirty) { +/* Be strict to return code; it must be 1, or what else? */ +if (ram_save_target_page(rs, pss) != 1) { +error_report_once("%s: ram_save_target_page failed", __func__); +ret = -1; +goto out; +} +sent = true; +} +pss_find_next_dirty(pss); +} while
[PATCH RFC 08/13] migration: Teach PSS about host page
Migration code has a lot to do with host pages. Teaching PSS core about the idea of host page helps a lot and makes the code clean. Meanwhile, this prepares for the future changes that can leverage the new PSS helpers that this patch introduces to send host page in another thread. Three more fields are introduced for this: (1) host_page_sending: this is set to true when QEMU is sending a host page, false otherwise. (2) host_page_{start|end}: this points to the end of host page, and it's only valid when host_page_sending==true. For example, when we look up the next dirty page on the ramblock, with host_page_sending==true, we'll not even try to look for anything beyond the current host page. This can be efficient than current code because currently we'll set pss->page to next dirty bit (which can be over current host page) and reset it to host page boundary if we found overflow. The latter is not efficient as we don't need to scan over host page boundary. Meanwhile with above, we can easily make migration_bitmap_find_dirty() self contained by updating pss->page properly. rs* parameter is removed because it's not even used in old code. When sending a host page, we should use the pss helpers like this: - pss_host_page_prepare(pss): called before sending host page - pss_within_range(pss): whether we're still working on the cur host page? - pss_host_page_finish(pss): called after sending a host page If there'll be another function to send host page (e.g. in return path thread) in the future, it should follow the same style. Signed-off-by: Peter Xu --- migration/ram.c | 91 +++-- 1 file changed, 73 insertions(+), 18 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 2f37520be4..e2b922ad59 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -474,6 +474,11 @@ struct PageSearchStatus { * postcopy pages via postcopy preempt channel. 
*/ bool postcopy_target_channel; +/* Whether we're sending a host page */ +bool host_page_sending; +/* The start/end of current host page. Invalid if host_page_sending==false */ +unsigned long host_page_start; +unsigned long host_page_end; }; typedef struct PageSearchStatus PageSearchStatus; @@ -851,26 +856,36 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, } /** - * migration_bitmap_find_dirty: find the next dirty page from start + * pss_find_next_dirty: find the next dirty page of current ramblock * - * Returns the page offset within memory region of the start of a dirty page + * This function updates pss->page to point to the next dirty page index + * within the ramblock, or the end of ramblock when nothing found. * * @rs: current RAM state - * @rb: RAMBlock where to search for dirty pages - * @start: page where we start the search + * @pss: the current page search status */ -static inline -unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, - unsigned long start) +static void pss_find_next_dirty(PageSearchStatus *pss) { +RAMBlock *rb = pss->block; unsigned long size = rb->used_length >> TARGET_PAGE_BITS; unsigned long *bitmap = rb->bmap; if (ramblock_is_ignored(rb)) { -return size; +/* Points directly to the end, so we know no dirty page */ +pss->page = size; +return; } -return find_next_bit(bitmap, size, start); +/* + * If during sending a host page, only look for dirty pages within the + * current host page being send. 
+ */ +if (pss->host_page_sending) { +assert(pss->host_page_end); +size = MIN(size, pss->host_page_end); +} + +pss->page = find_next_bit(bitmap, size, pss->page); } static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, @@ -1555,7 +1570,9 @@ static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again) pss->postcopy_requested = false; pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY; -pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); +/* Update pss->page for the next dirty bit in ramblock */ +pss_find_next_dirty(pss); + if (pss->complete_round && pss->block == rs->last_seen_block && pss->page >= rs->last_page) { /* @@ -2445,6 +2462,44 @@ static void postcopy_preempt_reset_channel(RAMState *rs) } } +/* Should be called before sending a host page */ +static void pss_host_page_prepare(PageSearchStatus *pss) +{ +/* How many guest pages are there in one host page? */ +size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; + +pss->host_page_sending = true; +pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); +pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); +} + +/* + * Whether the page pointed by PSS is within the host page being sent. + * Must be called
[PATCH RFC 12/13] migration: Move last_sent_block into PageSearchStatus
Since we use PageSearchStatus to represent a channel, it makes perfect sense to keep last_sent_block (aka, leverage RAM_SAVE_FLAG_CONTINUE) to be per-channel rather than global because each channel can be sending different pages on ramblocks. Hence move it from RAMState into PageSearchStatus. Signed-off-by: Peter Xu --- migration/ram.c | 71 - 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 2be9b91ffc..ef89812c69 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -89,6 +89,8 @@ XBZRLECacheStats xbzrle_counters; struct PageSearchStatus { /* The migration channel used for a specific host page */ QEMUFile*pss_channel; +/* Last block from where we have sent data */ +RAMBlock *last_sent_block; /* Current block being searched */ RAMBlock*block; /* Current page to search from */ @@ -368,8 +370,6 @@ struct RAMState { int uffdio_fd; /* Last block that we have visited searching for dirty pages */ RAMBlock *last_seen_block; -/* Last block from where we have sent data */ -RAMBlock *last_sent_block; /* Last dirty target page we have sent */ ram_addr_t last_page; /* last ram version we have seen */ @@ -677,16 +677,17 @@ exit: * * Returns the number of bytes written * - * @f: QEMUFile where to send the data + * @pss: current PSS channel status * @block: block that contains the page we want to send * @offset: offset inside the block for the page * in the lower bits, it contains flags */ -static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, +static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block, ram_addr_t offset) { size_t size, len; -bool same_block = (block == rs->last_sent_block); +bool same_block = (block == pss->last_sent_block); +QEMUFile *f = pss->pss_channel; if (same_block) { offset |= RAM_SAVE_FLAG_CONTINUE; @@ -699,7 +700,7 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, qemu_put_byte(f, len); qemu_put_buffer(f, (uint8_t *)block->idstr, len); size 
+= 1 + len; -rs->last_sent_block = block; +pss->last_sent_block = block; } return size; } @@ -783,17 +784,19 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) * -1 means that xbzrle would be longer than normal * * @rs: current RAM state + * @pss: current PSS channel * @current_data: pointer to the address of the page contents * @current_addr: addr of the page * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_xbzrle_page(RAMState *rs, QEMUFile *file, +static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, uint8_t **current_data, ram_addr_t current_addr, RAMBlock *block, ram_addr_t offset) { int encoded_len = 0, bytes_xbzrle; uint8_t *prev_cached_page; +QEMUFile *file = pss->pss_channel; if (!cache_is_cached(XBZRLE.cache, current_addr, ram_counters.dirty_sync_count)) { @@ -858,7 +861,7 @@ static int save_xbzrle_page(RAMState *rs, QEMUFile *file, } /* Send XBZRLE based compressed page */ -bytes_xbzrle = save_page_header(rs, file, block, +bytes_xbzrle = save_page_header(pss, block, offset | RAM_SAVE_FLAG_XBZRLE); qemu_put_byte(file, ENCODING_FLAG_XBZRLE); qemu_put_be16(file, encoded_len); @@ -1286,19 +1289,19 @@ static void ram_release_page(const char *rbname, uint64_t offset) * Returns the size of data written to the file, 0 means the page is not * a zero page * - * @rs: current RAM state - * @file: the file where the data is saved + * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, +static int save_zero_page_to_file(PageSearchStatus *pss, RAMBlock *block, ram_addr_t offset) { uint8_t *p = block->host + offset; +QEMUFile *file = pss->pss_channel; int len = 0; if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { -len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO); +len += save_page_header(pss, block, 
offset | RAM_SAVE_FLAG_ZERO); qemu_put_byte(file, 0); len += 1; ram_release_page(block->idstr, offset); @@ -1311,14 +1314,14 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, * * Returns the number of pages written. * - * @rs: current RAM state + * @pss: current PSS channel * @block: block that contains the page we want to send * @offset: offset inside the block for the page */
[PATCH RFC 11/13] migration: Make PageSearchStatus part of RAMState
We used to allocate PSS structure on the stack for precopy when sending pages. Make it static, so as to describe per-channel ram migration status. Here we declared RAM_CHANNEL_MAX instances, preparing for postcopy to use it, even though this patch has not yet to start using the 2nd instance. This should not have any functional change per se, but it already starts to export PSS information via the RAMState, so that e.g. one PSS channel can start to reference the other PSS channel. Always protect PSS access using the same RAMState.bitmap_mutex. We already do so, so no code change needed, just some comment update. Maybe we should consider renaming bitmap_mutex some day as it's going to be a more commonly and big mutex we use for ram states, but just leave it for later. Signed-off-by: Peter Xu --- migration/ram.c | 116 ++-- 1 file changed, 63 insertions(+), 53 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index bdfcc6171a..2be9b91ffc 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -85,6 +85,46 @@ XBZRLECacheStats xbzrle_counters; +/* used by the search for pages to send */ +struct PageSearchStatus { +/* The migration channel used for a specific host page */ +QEMUFile*pss_channel; +/* Current block being searched */ +RAMBlock*block; +/* Current page to search from */ +unsigned long page; +/* Set once we wrap around */ +bool complete_round; +/* + * [POSTCOPY-ONLY] Whether current page is explicitly requested by + * postcopy. When set, the request is "urgent" because the dest QEMU + * threads are waiting for us. + */ +bool postcopy_requested; +/* + * [POSTCOPY-ONLY] The target channel to use to send current page. + * + * Note: This may _not_ match with the value in postcopy_requested + * above. Let's imagine the case where the postcopy request is exactly + * the page that we're sending in progress during precopy. 
In this case + * we'll have postcopy_requested set to true but the target channel + * will be the precopy channel (so that we don't split brain on that + * specific page since the precopy channel already contains partial of + * that page data). + * + * Besides that specific use case, postcopy_target_channel should + * always be equal to postcopy_requested, because by default we send + * postcopy pages via postcopy preempt channel. + */ +bool postcopy_target_channel; +/* Whether we're sending a host page */ +bool host_page_sending; +/* The start/end of current host page. Invalid if host_page_sending==false */ +unsigned long host_page_start; +unsigned long host_page_end; +}; +typedef struct PageSearchStatus PageSearchStatus; + /* struct contains XBZRLE cache and a static page used by the compression */ static struct { @@ -319,6 +359,11 @@ typedef struct { struct RAMState { /* QEMUFile used for this migration */ QEMUFile *f; +/* + * PageSearchStatus structures for the channels when send pages. + * Protected by the bitmap_mutex. 
+ */ +PageSearchStatus pss[RAM_CHANNEL_MAX]; /* UFFD file descriptor, used in 'write-tracking' migration */ int uffdio_fd; /* Last block that we have visited searching for dirty pages */ @@ -362,7 +407,12 @@ struct RAMState { uint64_t target_page_count; /* number of dirty bits in the bitmap */ uint64_t migration_dirty_pages; -/* Protects modification of the bitmap and migration dirty pages */ +/* + * Protects: + * - dirty/clear bitmap + * - migration_dirty_pages + * - pss structures + */ QemuMutex bitmap_mutex; /* The RAMBlock used in the last src_page_requests */ RAMBlock *last_req_rb; @@ -444,46 +494,6 @@ void dirty_sync_missed_zero_copy(void) ram_counters.dirty_sync_missed_zero_copy++; } -/* used by the search for pages to send */ -struct PageSearchStatus { -/* The migration channel used for a specific host page */ -QEMUFile*pss_channel; -/* Current block being searched */ -RAMBlock*block; -/* Current page to search from */ -unsigned long page; -/* Set once we wrap around */ -bool complete_round; -/* - * [POSTCOPY-ONLY] Whether current page is explicitly requested by - * postcopy. When set, the request is "urgent" because the dest QEMU - * threads are waiting for us. - */ -bool postcopy_requested; -/* - * [POSTCOPY-ONLY] The target channel to use to send current page. - * - * Note: This may _not_ match with the value in postcopy_requested - * above. Let's imagine the case where the postcopy request is exactly - * the page that we're sending in progress during precopy. In this case - * we'll have postcopy_requested set to true but the target channel - * will be the precopy channel (so that we
[PATCH RFC 09/13] migration: Introduce pss_channel
Introduce pss_channel for PageSearchStatus, define it as "the migration channel to be used to transfer this host page". We used to have rs->f, which is a mirror to MigrationState.to_dst_file. After the initial postcopy preempt version, rs->f can be dynamically changed depending on which channel we want to use. But that later work still doesn't grant full concurrency of sending pages in e.g. different threads, because rs->f can either be the PRECOPY channel or POSTCOPY channel. This needs to be per-thread too. PageSearchStatus is actually a good piece of struct which we can leverage if we want to have multiple threads sending pages. Sending a single guest page may not make sense, so we make the granule "host page", and in the PSS structure we allow specifying a QEMUFile* to migrate a specific host page. Then we open the possibility to specify different channels in different threads with different PSS structures. The PSS prefix can be slightly misleading here because e.g. for the upcoming usage of postcopy channel/thread it's not "searching" (or, scanning) at all but sending the explicit page that was requested. However, since PSS has existed for some years, keep it as-is until someone complains. This patch mostly (simply) replaces rs->f with pss->pss_channel only. No functional change intended for this patch yet. But it does prepare to finally drop rs->f, and make ram_save_guest_page() thread safe. 
Signed-off-by: Peter Xu --- migration/ram.c | 70 +++-- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index e2b922ad59..adcc57c584 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -446,6 +446,8 @@ void dirty_sync_missed_zero_copy(void) /* used by the search for pages to send */ struct PageSearchStatus { +/* The migration channel used for a specific host page */ +QEMUFile*pss_channel; /* Current block being searched */ RAMBlock*block; /* Current page to search from */ @@ -768,9 +770,9 @@ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, -ram_addr_t current_addr, RAMBlock *block, -ram_addr_t offset) +static int save_xbzrle_page(RAMState *rs, QEMUFile *file, +uint8_t **current_data, ram_addr_t current_addr, +RAMBlock *block, ram_addr_t offset) { int encoded_len = 0, bytes_xbzrle; uint8_t *prev_cached_page; @@ -838,11 +840,11 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, } /* Send XBZRLE based compressed page */ -bytes_xbzrle = save_page_header(rs, rs->f, block, +bytes_xbzrle = save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_XBZRLE); -qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE); -qemu_put_be16(rs->f, encoded_len); -qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len); +qemu_put_byte(file, ENCODING_FLAG_XBZRLE); +qemu_put_be16(file, encoded_len); +qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); bytes_xbzrle += encoded_len + 1 + 2; /* * Like compressed_size (please see update_compress_thread_counts), @@ -1295,9 +1297,10 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile *file, * @block: block that contains the page we want to send * @offset: offset inside the block for the page */ -static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) +static int 
save_zero_page(RAMState *rs, QEMUFile *file, RAMBlock *block, + ram_addr_t offset) { -int len = save_zero_page_to_file(rs, rs->f, block, offset); +int len = save_zero_page_to_file(rs, file, block, offset); if (len) { ram_counters.duplicate++; @@ -1314,15 +1317,15 @@ static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) * * Return true if the pages has been saved, otherwise false is returned. */ -static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, - int *pages) +static bool control_save_page(PageSearchStatus *pss, RAMBlock *block, + ram_addr_t offset, int *pages) { uint64_t bytes_xmit = 0; int ret; *pages = -1; -ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE, -_xmit); +ret = ram_control_save_page(pss->pss_channel, block->offset, offset, +TARGET_PAGE_SIZE, _xmit); if (ret == RAM_SAVE_CONTROL_NOT_SUPP) { return false; } @@ -1356,17 +1359,17 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset, * @buf: the page to be
[PATCH RFC 07/13] migration: Remove RAMState.f references in compression code
Removing referencing to RAMState.f in compress_page_with_multi_thread() and flush_compressed_data(). Compression code by default isn't compatible with having >1 channels (or it won't currently know which channel to flush the compressed data), so to make it simple we always flush on the default to_dst_file port until someone wants to add >1 ports support, as rs->f right now can really change (after postcopy preempt is introduced). There should be no functional change at all after patch applied, since as long as rs->f referenced in compression code, it must be to_dst_file. Signed-off-by: Peter Xu --- migration/ram.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 43893d0a40..2f37520be4 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1461,6 +1461,7 @@ static bool save_page_use_compression(RAMState *rs); static void flush_compressed_data(RAMState *rs) { +MigrationState *ms = migrate_get_current(); int idx, len, thread_count; if (!save_page_use_compression(rs)) { @@ -1479,7 +1480,7 @@ static void flush_compressed_data(RAMState *rs) for (idx = 0; idx < thread_count; idx++) { qemu_mutex_lock(_param[idx].mutex); if (!comp_param[idx].quit) { -len = qemu_put_qemu_file(rs->f, comp_param[idx].file); +len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file); /* * it's safe to fetch zero_page without holding comp_done_lock * as there is no further request submitted to the thread, @@ -1498,11 +1499,11 @@ static inline void set_compress_params(CompressParam *param, RAMBlock *block, param->offset = offset; } -static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block, - ram_addr_t offset) +static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset) { int idx, thread_count, bytes_xmit = -1, pages = -1; bool wait = migrate_compress_wait_thread(); +MigrationState *ms = migrate_get_current(); thread_count = migrate_compress_threads(); qemu_mutex_lock(_done_lock); @@ -1510,7 
+1511,8 @@ retry: for (idx = 0; idx < thread_count; idx++) { if (comp_param[idx].done) { comp_param[idx].done = false; -bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file); +bytes_xmit = qemu_put_qemu_file(ms->to_dst_file, +comp_param[idx].file); qemu_mutex_lock(_param[idx].mutex); set_compress_params(_param[idx], block, offset); qemu_cond_signal(_param[idx].cond); @@ -2263,7 +2265,7 @@ static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset) return false; } -if (compress_page_with_multi_thread(rs, block, offset) > 0) { +if (compress_page_with_multi_thread(block, offset) > 0) { return true; } -- 2.32.0
[PATCH RFC 06/13] migration: Trivial cleanup save_page_header() on same block check
The 2nd check on RAM_SAVE_FLAG_CONTINUE is a bit redundant. Use a boolean to be clearer. Signed-off-by: Peter Xu --- migration/ram.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 612c7dd708..43893d0a40 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -661,14 +661,15 @@ static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block, ram_addr_t offset) { size_t size, len; +bool same_block = (block == rs->last_sent_block); -if (block == rs->last_sent_block) { +if (same_block) { offset |= RAM_SAVE_FLAG_CONTINUE; } qemu_put_be64(f, offset); size = 8; -if (!(offset & RAM_SAVE_FLAG_CONTINUE)) { +if (!same_block) { len = strlen(block->idstr); qemu_put_byte(f, len); qemu_put_buffer(f, (uint8_t *)block->idstr, len); -- 2.32.0
[PATCH RFC 04/13] migration: Cleanup xbzrle zero page cache update logic
The major change is to replace "!save_page_use_compression()" with "xbzrle_enabled" to make it clear. Reasonings: (1) When compression is enabled, "!save_page_use_compression()" is exactly the same as checking "xbzrle_enabled". (2) When compression is disabled, "!save_page_use_compression()" always returns true. We used to try calling the xbzrle code, but after this change we won't, and we shouldn't need to. While at it, drop the xbzrle_enabled check in xbzrle_cache_zero_page() because with this change it's not needed anymore. Signed-off-by: Peter Xu --- migration/ram.c | 6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 9e96a46323..612c7dd708 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -741,10 +741,6 @@ void mig_throttle_counter_reset(void) */ static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr) { -if (!rs->xbzrle_enabled) { -return; -} - /* We don't care if this fails to allocate a new cache page * as long as it updated an old one */ cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, @@ -2301,7 +2297,7 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) /* Must let xbzrle know, otherwise a previous (now 0'd) cached * page would be stale */ -if (!save_page_use_compression(rs)) { +if (rs->xbzrle_enabled) { XBZRLE_cache_lock(); xbzrle_cache_zero_page(rs, block->offset + offset); XBZRLE_cache_unlock(); -- 2.32.0
[PATCH RFC 10/13] migration: Add pss_init()
Helper to init PSS structures. Signed-off-by: Peter Xu --- migration/ram.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index adcc57c584..bdfcc6171a 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -535,6 +535,14 @@ static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block, static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss, bool postcopy_requested); +/* NOTE: page is the PFN not real ram_addr_t. */ +static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) +{ +pss->block = rb; +pss->page = page; +pss->complete_round = false; +} + static void *do_data_compress(void *opaque) { CompressParam *param = opaque; @@ -2625,9 +2633,7 @@ static int ram_find_and_save_block(RAMState *rs) return pages; } -pss.block = rs->last_seen_block; -pss.page = rs->last_page; -pss.complete_round = false; +pss_init(, rs->last_seen_block, rs->last_page); if (!pss.block) { pss.block = QLIST_FIRST_RCU(_list.blocks); -- 2.32.0
[PATCH RFC 05/13] migration: Disallow postcopy preempt to be used with compress
The preempt mode requires the capability to assign channel for each of the page, while the compression logic will currently assign pages to different compress thread/local-channel so potentially they're incompatible. Signed-off-by: Peter Xu --- migration/migration.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index bb8bbddfe4..844bca1ff6 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1336,6 +1336,17 @@ static bool migrate_caps_check(bool *cap_list, error_setg(errp, "Postcopy preempt requires postcopy-ram"); return false; } + +/* + * Preempt mode requires urgent pages to be sent in separate + * channel, OTOH compression logic will disorder all pages into + * different compression channels, which is not compatible with the + * preempt assumptions on channel assignments. + */ +if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { +error_setg(errp, "Postcopy preempt not compatible with compress"); +return false; +} } return true; -- 2.32.0
[PATCH RFC 02/13] migration: Add postcopy_preempt_active()
Add the helper to show that postcopy preempt enabled, meanwhile active. Signed-off-by: Peter Xu --- migration/ram.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..8c5d5332e8 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -162,6 +162,11 @@ out: return ret; } +static bool postcopy_preempt_active(void) +{ +return migrate_postcopy_preempt() && migration_in_postcopy(); +} + bool ramblock_is_ignored(RAMBlock *block) { return !qemu_ram_is_migratable(block) || @@ -2434,7 +2439,7 @@ static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss) /* We need to make sure rs->f always points to the default channel elsewhere */ static void postcopy_preempt_reset_channel(RAMState *rs) { -if (migrate_postcopy_preempt() && migration_in_postcopy()) { +if (postcopy_preempt_active()) { rs->postcopy_channel = RAM_CHANNEL_PRECOPY; rs->f = migrate_get_current()->to_dst_file; trace_postcopy_preempt_reset_channel(); @@ -2472,7 +2477,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) return 0; } -if (migrate_postcopy_preempt() && migration_in_postcopy()) { +if (postcopy_preempt_active()) { postcopy_preempt_choose_channel(rs, pss); } -- 2.32.0
[PATCH RFC 01/13] migration: Use non-atomic ops for clear log bitmap
Since we already have bitmap_mutex to protect either the dirty bitmap or the clear log bitmap, we don't need atomic operations to set/clear/test on the clear log bitmap. Switching all ops from atomic to non-atomic versions, meanwhile touch up the comments to show which lock is in charge. Introduced non-atomic version of bitmap_test_and_clear_atomic(), mostly the same as the atomic version but simplified a few places, e.g. dropped the "old_bits" variable, and also the explicit memory barriers. Signed-off-by: Peter Xu --- include/exec/ram_addr.h | 11 +- include/qemu/bitmap.h | 1 + util/bitmap.c | 45 + 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index f3e0c78161..5092a2e0ff 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -42,7 +42,8 @@ static inline long clear_bmap_size(uint64_t pages, uint8_t shift) } /** - * clear_bmap_set: set clear bitmap for the page range + * clear_bmap_set: set clear bitmap for the page range. Must be with + * bitmap_mutex held. * * @rb: the ramblock to operate on * @start: the start page number @@ -55,12 +56,12 @@ static inline void clear_bmap_set(RAMBlock *rb, uint64_t start, { uint8_t shift = rb->clear_bmap_shift; -bitmap_set_atomic(rb->clear_bmap, start >> shift, - clear_bmap_size(npages, shift)); +bitmap_set(rb->clear_bmap, start >> shift, clear_bmap_size(npages, shift)); } /** - * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set + * clear_bmap_test_and_clear: test clear bitmap for the page, clear if set. + * Must be with bitmap_mutex held. 
* * @rb: the ramblock to operate on * @page: the page number to check @@ -71,7 +72,7 @@ static inline bool clear_bmap_test_and_clear(RAMBlock *rb, uint64_t page) { uint8_t shift = rb->clear_bmap_shift; -return bitmap_test_and_clear_atomic(rb->clear_bmap, page >> shift, 1); +return bitmap_test_and_clear(rb->clear_bmap, page >> shift, 1); } static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset) diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h index 82a1d2f41f..3ccb00865f 100644 --- a/include/qemu/bitmap.h +++ b/include/qemu/bitmap.h @@ -253,6 +253,7 @@ void bitmap_set(unsigned long *map, long i, long len); void bitmap_set_atomic(unsigned long *map, long i, long len); void bitmap_clear(unsigned long *map, long start, long nr); bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr); +bool bitmap_test_and_clear(unsigned long *map, long start, long nr); void bitmap_copy_and_clear_atomic(unsigned long *dst, unsigned long *src, long nr); unsigned long bitmap_find_next_zero_area(unsigned long *map, diff --git a/util/bitmap.c b/util/bitmap.c index f81d8057a7..8d12e90a5a 100644 --- a/util/bitmap.c +++ b/util/bitmap.c @@ -240,6 +240,51 @@ void bitmap_clear(unsigned long *map, long start, long nr) } } +bool bitmap_test_and_clear(unsigned long *map, long start, long nr) +{ +unsigned long *p = map + BIT_WORD(start); +const long size = start + nr; +int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); +unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); +bool dirty = false; + +assert(start >= 0 && nr >= 0); + +/* First word */ +if (nr - bits_to_clear > 0) { +if ((*p) & mask_to_clear) { +dirty = true; +} +*p &= ~mask_to_clear; +nr -= bits_to_clear; +bits_to_clear = BITS_PER_LONG; +p++; +} + +/* Full words */ +if (bits_to_clear == BITS_PER_LONG) { +while (nr >= BITS_PER_LONG) { +if (*p) { +dirty = true; +*p = 0; +} +nr -= BITS_PER_LONG; +p++; +} +} + +/* Last word */ +if (nr) { +mask_to_clear &= 
BITMAP_LAST_WORD_MASK(size); +if ((*p) & mask_to_clear) { +dirty = true; +} +*p &= ~mask_to_clear; +} + +return dirty; +} + bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr) { unsigned long *p = map + BIT_WORD(start); -- 2.32.0
[PATCH RFC 03/13] migration: Yield bitmap_mutex properly when sending/sleeping
Don't take the bitmap mutex when sending pages, or when being throttled by migration_rate_limit() (which is a bit tricky to call it here in ram code, but seems still helpful). It prepares for the possibility of concurrently sending pages in >1 threads using the function ram_save_host_page() because all threads may need the bitmap_mutex to operate on bitmaps, so that either sendmsg() or any kind of qemu_sem_wait() blocking for one thread will not block the other from progressing. Signed-off-by: Peter Xu --- migration/ram.c | 42 +++--- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 8c5d5332e8..9e96a46323 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2470,6 +2470,7 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) unsigned long hostpage_boundary = QEMU_ALIGN_UP(pss->page + 1, pagesize_bits); unsigned long start_page = pss->page; +bool page_dirty; int res; if (ramblock_is_ignored(pss->block)) { @@ -2487,22 +2488,41 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) break; } +page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); +/* + * Properly yield the lock only in postcopy preempt mode because + * both migration thread and rp-return thread can operate on the + * bitmaps. + */ +if (postcopy_preempt_active()) { +qemu_mutex_unlock(>bitmap_mutex); +} + /* Check the pages is dirty and if it is send it */ -if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) { +if (page_dirty) { tmppages = ram_save_target_page(rs, pss); -if (tmppages < 0) { -return tmppages; +if (tmppages >= 0) { +pages += tmppages; +/* + * Allow rate limiting to happen in the middle of huge pages if + * something is sent in the current iteration. 
+ */ +if (pagesize_bits > 1 && tmppages > 0) { +migration_rate_limit(); +} } +} else { +tmppages = 0; +} -pages += tmppages; -/* - * Allow rate limiting to happen in the middle of huge pages if - * something is sent in the current iteration. - */ -if (pagesize_bits > 1 && tmppages > 0) { -migration_rate_limit(); -} +if (postcopy_preempt_active()) { +qemu_mutex_lock(>bitmap_mutex); } + +if (tmppages < 0) { +return tmppages; +} + pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page); } while ((pss->page < hostpage_boundary) && offset_in_ramblock(pss->block, -- 2.32.0
[PATCH RFC 00/13] migration: Postcopy Preempt-Full
This is a RFC series. Tree is here: https://github.com/xzpeter/qemu/tree/preempt-full It's not complete because there're still something we need to do which will be attached to the end of this cover letter, however this series can already safely pass qtest and any of my test. Comparing to the recently merged preempt mode I called it "preempt-full" because it threadifies the postcopy channels so now urgent pages can be fully handled separately outside of the ram save loop. Sorry to have the same name as the PREEMPT_FULL in the Linux RT world, it's just that we needed a name for the capability and it was named as preempt already anyway.. The existing preempt code has reduced ramdom page req latency over 10Gbps network from ~12ms to ~500us which has already landed. This preempt-full series can further reduces that ~500us to ~230us per my initial test. More to share below. Note that no new capability is needed, IOW it's fully compatible with the existing preempt mode. So the naming is actually not important but just to identify the difference on the binaries. It's because this series only reworks the sender side code and does not change the migration protocol, it just runs faster. IOW, old "preempt" QEMU can also migrate to "preempt-full" QEMU, vice versa. - When old "preempt" mode QEMU migrates to "preempt-full" QEMU, it'll be the same as running both old "preempt" QEMUs. - When "preempt-full" QEMU migrates to old "preempt" QEMU, it'll be the same as running both "preempt-full". The logic of the series is quite simple too: simply moving the existing preempt channel page sends to rp-return thread. It can slow down rp-return thread on receiving pages, but I don't really see a major issue with it so far. This latency number is getting close to the extreme of 4K page request latency of any TCP roundtrip of the 10Gbps nic I have. The 'extreme number' is something I get from mig_mon tool which has a mode [1] to emulate the extreme tcp roundtrips of page requests. 
Performance === Page request latencies has distributions as below, with a VM of 20G mem, 20 cores, 10Gbps nic, 18G fully random writes: Postcopy Vanilla Average: 12093 (us) @delay_us: [1]1 || [2, 4) 0 || [4, 8) 0 || [8, 16)0 || [16, 32) 1 || [32, 64) 8 || [64, 128) 11 || [128, 256)14 || [256, 512)19 || [512, 1K) 14 || [1K, 2K) 35 || [2K, 4K) 18 || [4K, 8K) 87 |@ | [8K, 16K) 2397 || [16K, 32K) 7 || [32K, 64K) 2 || [64K, 128K) 20 || [128K, 256K) 6 || Postcopy Preempt Average: 496 (us) @delay_us: [32, 64) 2 || [64, 128) 2306 || [128, 256) 25422 || [256, 512) 8238 || [512, 1K) 1066 |@@ | [1K, 2K)2167 || [2K, 4K)3329 |@@ | [4K, 8K) 109 || [8K, 16K) 48 || Postcopy Preempt-Full - Average: 229 (us) @delay_us: [8, 16)1 || [16, 32) 3 || [32, 64) 2 || [64, 128) 11956 |@@ | [128, 256) 60403 || [256, 512) 15047 |
Re: [PATCH] linux-user: fix readlinkat handling with magic exe symlink
bump? This helps fix one of the libuv tests when run under qemu https://github.com/libuv/libuv/pull/2941#issuecomment-1207145306 On Mon, Aug 8, 2022 at 3:07 PM Jameson Nash wrote: > Exactly the same as f17f4989fa193fa8279474c5462289a3cfe69aea before was > for readlink. I suppose this was simply missed at the time. > > Signed-off-by: Jameson Nash > --- > linux-user/syscall.c | 15 +-- > 1 file changed, 13 insertions(+), 2 deletions(-) > > diff --git a/linux-user/syscall.c b/linux-user/syscall.c > index ef53feb5ab..6ef4e42b21 100644 > --- a/linux-user/syscall.c > +++ b/linux-user/syscall.c > @@ -9894,11 +9894,22 @@ static abi_long do_syscall1(CPUArchState *cpu_env, > int num, abi_long arg1, > p2 = lock_user(VERIFY_WRITE, arg3, arg4, 0); > if (!p || !p2) { > ret = -TARGET_EFAULT; > +} else if (!arg4) { > +/* Short circuit this for the magic exe check. */ > +ret = -TARGET_EINVAL; > } else if (is_proc_myself((const char *)p, "exe")) { > char real[PATH_MAX], *temp; > temp = realpath(exec_path, real); > -ret = temp == NULL ? get_errno(-1) : strlen(real) ; > -snprintf((char *)p2, arg4, "%s", real); > +/* Return value is # of bytes that we wrote to the > buffer. */ > +if (temp == NULL) { > +ret = get_errno(-1); > +} else { > +/* Don't worry about sign mismatch as earlier mapping > + * logic would have thrown a bad address error. */ > +ret = MIN(strlen(real), arg4); > +/* We cannot NUL terminate the string. */ > +memcpy(p2, real, ret); > +} > } else { > ret = get_errno(readlinkat(arg1, path(p), p2, arg4)); > } > -- > 2.25.1 > >
Re: [PATCH 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.
On 8/29/22 07:23, Ricky Zhou wrote: Many instructions which load/store 128-bit values are supposed to raise #GP when the memory operand isn't 16-byte aligned. This includes: - Instructions explicitly requiring memory alignment (Exceptions Type 1 in the "AVX and SSE Instruction Exception Specification" section of the SDM) - Legacy SSE instructions that load/store 128-bit values (Exceptions Types 2 and 4). This change adds a raise_gp_if_unaligned helper which raises #GP if an address is not properly aligned. This helper is called before 128-bit loads/stores where appropriate. Resolves:https://gitlab.com/qemu-project/qemu/-/issues/217 Signed-off-by: Ricky Zhou --- target/i386/helper.h | 1 + target/i386/tcg/mem_helper.c | 8 target/i386/tcg/translate.c | 38 +--- 3 files changed, 44 insertions(+), 3 deletions(-) This trap should be raised via the memory operation: - static inline void gen_ldo_env_A0(DisasContext *s, int offset) + static inline void gen_ldo_env_A0(DisasContext *s, int offset, bool aligned) { int mem_index = s->mem_index; - tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, MO_LEUQ); + tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, + MO_LEUQ | (aligned ? MO_ALIGN_16 : 0)); tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(0))); tcg_gen_addi_tl(s->tmp0, s->A0, 8); tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(ZMMReg, ZMM_Q(1))); } Only the first of the two loads/stores must be aligned, as the other is known to be +8. You then must fill in the x86_tcg_ops.do_unaligned_access hook to raise #GP. r~
Re: [PATCH 4/9] hw/isa/vt82c686: QOM'ify via-ide creation
Am 25. August 2022 01:18:56 MESZ schrieb BALATON Zoltan : >On Thu, 25 Aug 2022, Bernhard Beschow wrote: >> On Wed, Aug 24, 2022 at 3:54 PM BALATON Zoltan wrote: >>> On Tue, 23 Aug 2022, Bernhard Beschow wrote: The IDE function is closely tied to the ISA function (e.g. the IDE interrupt routing happens there), so it makes sense that the IDE function is instantiated within the southbridge itself. As a side effect, duplicated code in the boards is resolved. Signed-off-by: Bernhard Beschow --- configs/devices/mips64el-softmmu/default.mak | 1 - hw/isa/Kconfig | 1 + hw/isa/vt82c686.c| 18 ++ hw/mips/fuloong2e.c | 3 --- hw/ppc/Kconfig | 1 - hw/ppc/pegasos2.c| 4 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/configs/devices/mips64el-softmmu/default.mak >>> b/configs/devices/mips64el-softmmu/default.mak index c610749ac1..d5188f7ea5 100644 --- a/configs/devices/mips64el-softmmu/default.mak +++ b/configs/devices/mips64el-softmmu/default.mak @@ -1,7 +1,6 @@ # Default configuration for mips64el-softmmu include ../mips-softmmu/common.mak -CONFIG_IDE_VIA=y CONFIG_FULOONG=y CONFIG_LOONGSON3V=y CONFIG_ATI_VGA=y diff --git a/hw/isa/Kconfig b/hw/isa/Kconfig index d42143a991..20de7e9294 100644 --- a/hw/isa/Kconfig +++ b/hw/isa/Kconfig @@ -53,6 +53,7 @@ config VT82C686 select I8254 select I8257 select I8259 +select IDE_VIA select MC146818RTC select PARALLEL diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c index 5582c0b179..37d9ed635d 100644 --- a/hw/isa/vt82c686.c +++ b/hw/isa/vt82c686.c @@ -17,6 +17,7 @@ #include "hw/isa/vt82c686.h" #include "hw/pci/pci.h" #include "hw/qdev-properties.h" +#include "hw/ide/pci.h" #include "hw/isa/isa.h" #include "hw/isa/superio.h" #include "hw/intc/i8259.h" @@ -544,6 +545,7 @@ struct ViaISAState { qemu_irq cpu_intr; qemu_irq *isa_irqs; ViaSuperIOState via_sio; +PCIIDEState ide; }; static const VMStateDescription vmstate_via = { @@ -556,10 +558,18 @@ static const VMStateDescription vmstate_via = { } }; +static void via_isa_init(Object 
*obj) +{ +ViaISAState *s = VIA_ISA(obj); + +object_initialize_child(obj, "ide", >ide, "via-ide"); +} + static const TypeInfo via_isa_info = { .name = TYPE_VIA_ISA, .parent= TYPE_PCI_DEVICE, .instance_size = sizeof(ViaISAState), +.instance_init = via_isa_init, .abstract = true, .interfaces= (InterfaceInfo[]) { { INTERFACE_CONVENTIONAL_PCI_DEVICE }, @@ -583,6 +593,7 @@ static void via_isa_realize(PCIDevice *d, Error >>> **errp) { ViaISAState *s = VIA_ISA(d); DeviceState *dev = DEVICE(d); +PCIBus *pci_bus = pci_get_bus(d); qemu_irq *isa_irq; ISABus *isa_bus; int i; @@ -607,6 +618,13 @@ static void via_isa_realize(PCIDevice *d, Error >>> **errp) if (!qdev_realize(DEVICE(>via_sio), BUS(isa_bus), errp)) { return; } + +/* Function 1: IDE */ +qdev_prop_set_int32(DEVICE(>ide), "addr", d->devfn + 1); +if (!qdev_realize(DEVICE(>ide), BUS(pci_bus), errp)) { +return; +} +pci_ide_create_devs(PCI_DEVICE(>ide)); >>> >>> I'm not sure about moving pci_ide_create_devs() here. This is usally >>> called from board code and only piix4 seems to do this. Maybe that's wrong >>> because if all IDE devices did this then one machine could not have more >>> than one different ide devices (like having an on-board ide and adding a >>> pci ide controoler with -device) so this probably belongs to the board >>> code to add devices to its default ide controller only as this is machine >>> specific. Unless I'm wrong in which case somebody will correct me. >>> >> >> Grepping the code it can be seen that it's always called right after >> creating the IDE controllers. The only notable exception is the "sii3112" >> device in the sam460ex board which is not emulated yet. Since the IDE > >The problem with sii3112 is that it only has 2 channels becuase I did not >bother to implement more so pci_ide_create_devs() probably would not work as >it assumes 4 channels. AFAIK this means that the short -hda, -cdrom, etc. 
>convenience options don't work with sam460ex but you have to use the long way >of creating ide-hd and ide-cd devices on the command line. I think there's a >version of this controller with 4 channels, maybe called sii3114 or similar >and it
Re: [PATCH v7 2/2] target/s390x: support PRNO_TRNG instruction
On Fri, Aug 26, 2022 at 01:28:11PM +0200, Thomas Huth wrote: > > +qemu_guest_getrandom_nofail(tmp, block); > > +for (size_t i = 0; i < block; ++i) { > > +cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra); > > +*buf_reg = deposit64(*buf_reg, 0, message_reg_len, *buf_reg + > > 1); > > +--*len_reg; > > I know it's annoying, but technically, you must not touch the upper bits of > the len_reg if running in 31- or 24-bit addressing mode. The Principles of > Operations say: > > "In either the 24- or 31-bit addressing mode, bits 32-63 of the odd-numbered > register are decremented by the number > of bytes processed for the respective operand, and > bits 0-31 of the register remain unchanged." > This is what I was trying to do with the use of deposit64, following David's guidance. Did I mess something up? > > +} > > +len -= block; > > +} > > +} > > + > > uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, > > uint32_t r3, > >uint32_t type) > > { > > Don't you also need to modify the "query" part to signal the availability of > the function? Doesn't Linux in the guest check the availability first before > using it? I think this is already handled at the upper layers. Linux detects it fine. > > > @@ -209,6 +235,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, > > uint32_t r2, uint32_t r3, > > return klmd_sha512(env, ra, env->regs[1], >regs[r2], > > >regs[r2 + 1]); > > } > > break; > > +case 114: /* CPACF_PRNO_TRNG */ > > +fill_buf_random(env, ra, >regs[r1], >regs[r1 + 1]); > > +fill_buf_random(env, ra, >regs[r2], >regs[r2 + 1]); > > +break; > > default: > > /* we don't implement any other subfunction yet */ > > g_assert_not_reached(); > > Maybe one more thing to check (according the "Special Conditions" section in > the Principles of Operation): > > "A specification exception is recognized and no other > action is taken if any of the following conditions exist: > > ... > > 2. 
The R1 or R2 fields designate an odd-numbered > register or general register 0. This exception is > recognized regardless of the function code. > " This is taken care of already by the function that calls into this function. Jason
Re: [PATCH v7 1/2] target/s390x: support SHA-512 extensions
On Fri, Aug 26, 2022 at 12:21:36PM +0200, Thomas Huth wrote: > > + * Copyright (C) 2022 Jason A. Donenfeld . All Rights > > Reserved. > > Please drop the "All rights reserved" ... it does not have any legal meaning No. > > +{ > > +enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep > > interactivity. */ > > +uint64_t z[8], b[8], a[8], w[16], t; > > +uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, > > processed = 0; > > The line is very long, could you please declare message and len on separate > lines? Will do. > > > +int i, j, message_reg_len = 64, blocks = 0, cc = 0; > > + > > +if (!(env->psw.mask & PSW_MASK_64)) { > > +len = (uint32_t)len; > > +message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > > +} > > + > > +for (i = 0; i < 8; ++i) { > > +z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, > > parameter_block + 8 * i), ra); > > Quite a long line again, maybe split it like this: > >abi_ptr addr = wrap_address(env, parameter_block + 8 * i); >z[i] = a[i] = cpu_ldq_be_data_ra(env, addr, ra); Sure. > > > +} > > + > > +while (len >= 128) { > > +if (++blocks > MAX_BLOCKS_PER_RUN) { > > +cc = 3; > > +break; > > +} > > + > > +for (i = 0; i < 16; ++i) { > > +if (message) { > > +w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + > > 8 * i), ra); > > Long line again, please split. Okay. > > cpu_stb_data_ra(env, param_addr, subfunc[i], ra); > > So for KIMD and KLMD, I think we now have to set the bit that corresponds to > SHA-512 in the query status information, too? Otherwise the guest might not > use the function if it thinks that it is not available? That's already taken care of generically I think. This works fine from Linux's autodetection. Jason
Re: [PATCH v7 11/14] KVM: Register/unregister the guest private memory regions
On Fri, Aug 26, 2022 at 04:19:43PM +0100, Fuad Tabba wrote: > > +bool __weak kvm_arch_private_mem_supported(struct kvm *kvm) > > +{ > > + return false; > > +} > > + > > static int check_memory_region_flags(const struct kvm_user_mem_region *mem) > > { > > u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; > > @@ -4689,6 +4729,22 @@ static long kvm_vm_ioctl(struct file *filp, > > r = kvm_vm_ioctl_set_memory_region(kvm, ); > > break; > > } > > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM > > + case KVM_MEMORY_ENCRYPT_REG_REGION: > > + case KVM_MEMORY_ENCRYPT_UNREG_REGION: { > > + struct kvm_enc_region region; > > + > > + if (!kvm_arch_private_mem_supported(kvm)) > > + goto arch_vm_ioctl; > > + > > + r = -EFAULT; > > + if (copy_from_user(, argp, sizeof(region))) > > + goto out; > > + > > + r = kvm_vm_ioctl_set_encrypted_region(kvm, ioctl, ); > > + break; > > + } > > +#endif > > case KVM_GET_DIRTY_LOG: { > > struct kvm_dirty_log log; > > > > @@ -4842,6 +4898,7 @@ static long kvm_vm_ioctl(struct file *filp, > > r = kvm_vm_ioctl_get_stats_fd(kvm); > > break; > > default: > > +arch_vm_ioctl: > > It might be good to make this label conditional on > CONFIG_HAVE_KVM_PRIVATE_MEM, otherwise you get a warning if > CONFIG_HAVE_KVM_PRIVATE_MEM isn't defined. > > +#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM > arch_vm_ioctl: > +#endif Right, as the bot already complains. Chao > > Cheers, > /fuad > > > > > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > } > > out: > > -- > > 2.25.1 > >
Re: [PATCH] target/sh4: Fix TB_FLAG_UNALIGN
On 8/29/22 02:05, BALATON Zoltan wrote: On Sun, 28 Aug 2022, Richard Henderson wrote: The value previously chosen overlaps GUSA_MASK. Cc: qemu-sta...@nongnu.org Fixes: 4da06fb3062 ("target/sh4: Implement prctl_unalign_sigbus") Resolves: https://gitlab.com/qemu-project/qemu/-/issues/856 Signed-off-by: Richard Henderson --- target/sh4/cpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h index 9f15ef913c..e79cbc59e2 100644 --- a/target/sh4/cpu.h +++ b/target/sh4/cpu.h @@ -84,7 +84,7 @@ #define DELAY_SLOT_RTE (1 << 2) #define TB_FLAG_PENDING_MOVCA (1 << 3) -#define TB_FLAG_UNALIGN (1 << 4) +#define TB_FLAG_UNALIGN (1 << 13) Is it worth a comment to note why that value to avoid the same problem if another flag is added in the future? Hmm, or perhaps move it down below, so that we see bit 3 used, then bits 4-12, then bit 13. r~
Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory
On Fri, Aug 26, 2022 at 04:19:25PM +0100, Fuad Tabba wrote: > Hi, > > On Wed, Jul 6, 2022 at 9:24 AM Chao Peng wrote: > > > > This is the v7 of this series which tries to implement the fd-based KVM > > guest private memory. The patches are based on latest kvm/queue branch > > commit: > > > > b9b71f43683a (kvm/queue) KVM: x86/mmu: Buffer nested MMU > > split_desc_cache only by default capacity > > > > Introduction > > > > In general this patch series introduce fd-based memslot which provides > > guest memory through memory file descriptor fd[offset,size] instead of > > hva/size. The fd can be created from a supported memory filesystem > > like tmpfs/hugetlbfs etc. which we refer as memory backing store. KVM > > and the the memory backing store exchange callbacks when such memslot > > gets created. At runtime KVM will call into callbacks provided by the > > backing store to get the pfn with the fd+offset. Memory backing store > > will also call into KVM callbacks when userspace punch hole on the fd > > to notify KVM to unmap secondary MMU page table entries. > > > > Comparing to existing hva-based memslot, this new type of memslot allows > > guest memory unmapped from host userspace like QEMU and even the kernel > > itself, therefore reduce attack surface and prevent bugs. > > > > Based on this fd-based memslot, we can build guest private memory that > > is going to be used in confidential computing environments such as Intel > > TDX and AMD SEV. When supported, the memory backing store can provide > > more enforcement on the fd and KVM can use a single memslot to hold both > > the private and shared part of the guest memory. > > > > mm extension > > - > > Introduces new MFD_INACCESSIBLE flag for memfd_create(), the file > > created with these flags cannot read(), write() or mmap() etc via normal > > MMU operations. The file content can only be used with the newly > > introduced memfile_notifier extension. 
> > > > The memfile_notifier extension provides two sets of callbacks for KVM to > > interact with the memory backing store: > > - memfile_notifier_ops: callbacks for memory backing store to notify > > KVM when memory gets invalidated. > > - backing store callbacks: callbacks for KVM to call into memory > > backing store to request memory pages for guest private memory. > > > > The memfile_notifier extension also provides APIs for memory backing > > store to register/unregister itself and to trigger the notifier when the > > bookmarked memory gets invalidated. > > > > The patchset also introduces a new memfd seal F_SEAL_AUTO_ALLOCATE to > > prevent double allocation caused by unintentional guest when we only > > have a single side of the shared/private memfds effective. > > > > memslot extension > > - > > Add the private fd and the fd offset to existing 'shared' memslot so > > that both private/shared guest memory can live in one single memslot. > > A page in the memslot is either private or shared. Whether a guest page > > is private or shared is maintained through reusing existing SEV ioctls > > KVM_MEMORY_ENCRYPT_{UN,}REG_REGION. > > > > I'm on the Android pKVM team at Google, and we've been looking into > how this approach fits with what we've been doing with pkvm/arm64. > I've had a go at porting your patches, along with some fixes and > additions so it would go on top of our latest pkvm patch series [1] to > see how well this proposal fits with what we’re doing. You can find > the ported code at this link [2]. > > In general, an fd-based approach fits very well with pKVM for the > reasons you mention. It means that we don't necessarily need to map > the guest memory, and with the new extensions it allows the host > kernel to control whether to restrict migration and swapping. Good to hear that. 
> > For pKVM, we would also need the guest private memory not to be > GUP’able by the kernel so that userspace can’t trick the kernel into > accessing guest private memory in a context where it isn’t prepared to > handle the fault injected by the hypervisor. We’re looking at whether > we could use memfd_secret to achieve this, or maybe whether extending > your work might solve the problem. This is interesting and can be a valuable addition to this series. > > However, during the porting effort, the main issue we've encountered > is that many of the details of this approach seem to be targeted at > TDX/SEV and don’t readily align with the design of pKVM. My knowledge > on TDX is very rudimentary, so please bear with me if I get things > wrong. No doubt this series is initially designed for confidential computing usages, but pKVM can definitely extend it if it finds useful. > > The idea of the memslot having two references to the backing memory, > the (new) private_fd (a file descriptor) as well as the userspace_addr > (a memory address), with the meaning changing depending on whether the > memory is private or shared. Both can
[PATCH 5/5] virtio-gpu: Don't require udmabuf when blob support is enabled
From: Dmitry Osipenko Host blobs don't need udmabuf, it's only needed by guest blobs. The host blobs are utilized by the Mesa virgl driver when persistent memory mapping is needed by a GL buffer, otherwise virgl driver doesn't use blobs. Persistent mapping support bumps GL version from 4.3 to 4.5 in guest. Relax the udmabuf requirement. Signed-off-by: Dmitry Osipenko Reviewed-by: Antonio Caggiano --- hw/display/virtio-gpu.c | 20 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index 527c0aeede..4c2a9b7ea7 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -367,7 +367,9 @@ static void virtio_gpu_resource_create_blob(VirtIOGPU *g, return; } -virtio_gpu_init_udmabuf(res); +if (cblob.blob_mem == VIRTIO_GPU_BLOB_MEM_GUEST) { +virtio_gpu_init_udmabuf(res); +} QTAILQ_INSERT_HEAD(>reslist, res, next); } @@ -1315,19 +1317,13 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error **errp) VirtIODevice *vdev = VIRTIO_DEVICE(qdev); VirtIOGPU *g = VIRTIO_GPU(qdev); -if (virtio_gpu_blob_enabled(g->parent_obj.conf)) { -if (!virtio_gpu_have_udmabuf()) { -error_setg(errp, "cannot enable blob resources without udmabuf"); -return; -} - #ifndef HAVE_VIRGL_RESOURCE_BLOB -if (virtio_gpu_virgl_enabled(g->parent_obj.conf)) { -error_setg(errp, "Linked virglrenderer does not support blob resources"); -return; -} -#endif +if (virtio_gpu_blob_enabled(g->parent_obj.conf) && +virtio_gpu_virgl_enabled(g->parent_obj.conf)) { +error_setg(errp, "Linked virglrenderer does not support blob resources"); +return; } +#endif if (!virtio_gpu_base_device_realize(qdev, virtio_gpu_handle_ctrl_cb, -- 2.34.1
[PATCH 0/5] virtio-gpu: Blob resources
Add shared memory and support blob resource creation, mapping and unmapping through virglrenderer new stable APIs[0] when available. [0] https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/891 Antonio Caggiano (1): virtio-gpu: Handle resource blob commands Dmitry Osipenko (1): virtio-gpu: Don't require udmabuf when blob support is enabled Dr. David Alan Gilbert (1): virtio: Add shared memory capability Gerd Hoffmann (1): virtio-gpu: hostmem Richard Henderson (1): Update version for v7.1.0-rc4 release VERSION | 2 +- hw/display/virtio-gpu-pci.c | 15 +++ hw/display/virtio-gpu-virgl.c| 169 +++ hw/display/virtio-gpu.c | 25 ++-- hw/display/virtio-vga.c | 33 -- hw/virtio/virtio-pci.c | 18 +++ include/hw/virtio/virtio-gpu-bswap.h | 18 +++ include/hw/virtio/virtio-gpu.h | 11 ++ include/hw/virtio/virtio-pci.h | 4 + meson.build | 5 + 10 files changed, 276 insertions(+), 24 deletions(-) -- 2.34.1
[PATCH 4/5] virtio-gpu: Handle resource blob commands
Support BLOB resources creation by calling virgl_renderer_resource_create_blob. Signed-off-by: Antonio Caggiano Signed-off-by: Dmitry Osipenko --- hw/display/virtio-gpu-virgl.c| 169 +++ hw/display/virtio-gpu.c | 8 +- include/hw/virtio/virtio-gpu-bswap.h | 18 +++ include/hw/virtio/virtio-gpu.h | 6 + meson.build | 5 + 5 files changed, 202 insertions(+), 4 deletions(-) diff --git a/hw/display/virtio-gpu-virgl.c b/hw/display/virtio-gpu-virgl.c index 73cb92c8d5..c4c2c31d76 100644 --- a/hw/display/virtio-gpu-virgl.c +++ b/hw/display/virtio-gpu-virgl.c @@ -16,6 +16,8 @@ #include "trace.h" #include "hw/virtio/virtio.h" #include "hw/virtio/virtio-gpu.h" +#include "hw/virtio/virtio-gpu-bswap.h" +#include "hw/virtio/virtio-iommu.h" #include @@ -398,6 +400,162 @@ static void virgl_cmd_get_capset(VirtIOGPU *g, g_free(resp); } +#ifdef HAVE_VIRGL_RESOURCE_BLOB + +static void virgl_cmd_resource_create_blob(VirtIOGPU *g, + struct virtio_gpu_ctrl_command *cmd) +{ +struct virtio_gpu_simple_resource *res; +struct virtio_gpu_resource_create_blob cblob; +int ret; + +VIRTIO_GPU_FILL_CMD(cblob); +virtio_gpu_create_blob_bswap(); +trace_virtio_gpu_cmd_res_create_blob(cblob.resource_id, cblob.size); + +if (cblob.resource_id == 0) { +qemu_log_mask(LOG_GUEST_ERROR, "%s: resource id 0 is not allowed\n", + __func__); +cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +return; +} + +res = virtio_gpu_find_resource(g, cblob.resource_id); +if (res) { +qemu_log_mask(LOG_GUEST_ERROR, "%s: resource already exists %d\n", + __func__, cblob.resource_id); +cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +return; +} + +res = g_new0(struct virtio_gpu_simple_resource, 1); +res->resource_id = cblob.resource_id; +res->blob_size = cblob.size; + +if (res->iov) { +cmd->error = VIRTIO_GPU_RESP_ERR_UNSPEC; +return; +} + +if (cblob.blob_mem != VIRTIO_GPU_BLOB_MEM_HOST3D) { +ret = virtio_gpu_create_mapping_iov(g, cblob.nr_entries, sizeof(cblob), +cmd, >addrs, >iov, +>iov_cnt); +if (ret != 0) { +cmd->error = 
VIRTIO_GPU_RESP_ERR_UNSPEC; +return; +} +} + +if (cblob.blob_mem == VIRTIO_GPU_BLOB_MEM_GUEST) { +virtio_gpu_init_udmabuf(res); +} +QTAILQ_INSERT_HEAD(>reslist, res, next); + +const struct virgl_renderer_resource_create_blob_args virgl_args = { +.res_handle = cblob.resource_id, +.ctx_id = cblob.hdr.ctx_id, +.blob_mem = cblob.blob_mem, +.blob_id = cblob.blob_id, +.blob_flags = cblob.blob_flags, +.size = cblob.size, +.iovecs = res->iov, +.num_iovs = res->iov_cnt, +}; +ret = virgl_renderer_resource_create_blob(_args); +if (ret) { +g_print("Virgl blob create error: %s\n", strerror(-ret)); +} +} + +static void virgl_cmd_resource_map_blob(VirtIOGPU *g, +struct virtio_gpu_ctrl_command *cmd) +{ +struct virtio_gpu_simple_resource *res; +struct virtio_gpu_resource_map_blob mblob; +int ret; +void *data; +uint64_t size; +struct virtio_gpu_resp_map_info resp; + +VIRTIO_GPU_FILL_CMD(mblob); +virtio_gpu_map_blob_bswap(); + +if (mblob.resource_id == 0) { +qemu_log_mask(LOG_GUEST_ERROR, "%s: resource id 0 is not allowed\n", + __func__); +cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +return; +} + +res = virtio_gpu_find_resource(g, mblob.resource_id); +if (!res) { +qemu_log_mask(LOG_GUEST_ERROR, "%s: resource does not exist %d\n", + __func__, mblob.resource_id); +cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +return; +} + +ret = virgl_renderer_resource_map(res->resource_id, , ); +if (ret) { +g_print("Virgl blob resource map error: %s\n", strerror(-ret)); +cmd->error = VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID; +return; +} + +memory_region_init_ram_device_ptr(>region, OBJECT(g), NULL, size, data); +memory_region_add_subregion(>parent_obj.hostmem, mblob.offset, >region); +memory_region_set_enabled(>region, true); + +memset(, 0, sizeof(resp)); +resp.hdr.type = VIRTIO_GPU_RESP_OK_MAP_INFO; +virgl_renderer_resource_get_map_info(mblob.resource_id, _info); +virtio_gpu_ctrl_response(g, cmd, , sizeof(resp)); + +res->mapped = true; +} + +static void 
virgl_cmd_resource_unmap_blob(VirtIOGPU *g, +struct virtio_gpu_ctrl_command *cmd) +{ +struct virtio_gpu_simple_resource *res; +struct
[PATCH 3/5] virtio-gpu: hostmem
From: Gerd Hoffmann Use VIRTIO_GPU_SHM_ID_HOST_VISIBLE as id for virtio-gpu. v2: Formatting fixes Signed-off-by: Antonio Caggiano Acked-by: Michael S. Tsirkin --- hw/display/virtio-gpu-pci.c| 15 +++ hw/display/virtio-gpu.c| 1 + hw/display/virtio-vga.c| 33 - include/hw/virtio/virtio-gpu.h | 5 + 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c index 93f214ff58..2cbbacd7fe 100644 --- a/hw/display/virtio-gpu-pci.c +++ b/hw/display/virtio-gpu-pci.c @@ -33,6 +33,21 @@ static void virtio_gpu_pci_base_realize(VirtIOPCIProxy *vpci_dev, Error **errp) DeviceState *vdev = DEVICE(g); int i; +if (virtio_gpu_hostmem_enabled(g->conf)) { +vpci_dev->msix_bar_idx = 1; +vpci_dev->modern_mem_bar_idx = 2; +memory_region_init(>hostmem, OBJECT(g), "virtio-gpu-hostmem", + g->conf.hostmem); +pci_register_bar(_dev->pci_dev, 4, + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_PREFETCH | + PCI_BASE_ADDRESS_MEM_TYPE_64, + >hostmem); +virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem, + VIRTIO_GPU_SHM_ID_HOST_VISIBLE); +} + +qdev_set_parent_bus(vdev, BUS(_dev->bus), errp); virtio_pci_force_virtio_1(vpci_dev); if (!qdev_realize(vdev, BUS(_dev->bus), errp)) { return; diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index 20cc703dcc..506b3b8eef 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -1424,6 +1424,7 @@ static Property virtio_gpu_properties[] = { 256 * MiB), DEFINE_PROP_BIT("blob", VirtIOGPU, parent_obj.conf.flags, VIRTIO_GPU_FLAG_BLOB_ENABLED, false), +DEFINE_PROP_SIZE("hostmem", VirtIOGPU, parent_obj.conf.hostmem, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c index 4dcb34c4a7..aa8d1ab993 100644 --- a/hw/display/virtio-vga.c +++ b/hw/display/virtio-vga.c @@ -115,17 +115,32 @@ static void virtio_vga_base_realize(VirtIOPCIProxy *vpci_dev, Error **errp) pci_register_bar(_dev->pci_dev, 0, PCI_BASE_ADDRESS_MEM_PREFETCH, 
>vram); -/* - * Configure virtio bar and regions - * - * We use bar #2 for the mmio regions, to be compatible with stdvga. - * virtio regions are moved to the end of bar #2, to make room for - * the stdvga mmio registers at the start of bar #2. - */ -vpci_dev->modern_mem_bar_idx = 2; -vpci_dev->msix_bar_idx = 4; vpci_dev->modern_io_bar_idx = 5; +if (!virtio_gpu_hostmem_enabled(g->conf)) { +/* + * Configure virtio bar and regions + * + * We use bar #2 for the mmio regions, to be compatible with stdvga. + * virtio regions are moved to the end of bar #2, to make room for + * the stdvga mmio registers at the start of bar #2. + */ +vpci_dev->modern_mem_bar_idx = 2; +vpci_dev->msix_bar_idx = 4; +} else { +vpci_dev->msix_bar_idx = 1; +vpci_dev->modern_mem_bar_idx = 2; +memory_region_init(>hostmem, OBJECT(g), "virtio-gpu-hostmem", + g->conf.hostmem); +pci_register_bar(_dev->pci_dev, 4, + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_PREFETCH | + PCI_BASE_ADDRESS_MEM_TYPE_64, + >hostmem); +virtio_pci_add_shm_cap(vpci_dev, 4, 0, g->conf.hostmem, + VIRTIO_GPU_SHM_ID_HOST_VISIBLE); +} + if (!(vpci_dev->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ)) { /* * with page-per-vq=off there is no padding space we can use diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h index 2e28507efe..eafce75b04 100644 --- a/include/hw/virtio/virtio-gpu.h +++ b/include/hw/virtio/virtio-gpu.h @@ -102,12 +102,15 @@ enum virtio_gpu_base_conf_flags { (_cfg.flags & (1 << VIRTIO_GPU_FLAG_DMABUF_ENABLED)) #define virtio_gpu_blob_enabled(_cfg) \ (_cfg.flags & (1 << VIRTIO_GPU_FLAG_BLOB_ENABLED)) +#define virtio_gpu_hostmem_enabled(_cfg) \ +(_cfg.hostmem > 0) struct virtio_gpu_base_conf { uint32_t max_outputs; uint32_t flags; uint32_t xres; uint32_t yres; +uint64_t hostmem; }; struct virtio_gpu_ctrl_command { @@ -131,6 +134,8 @@ struct VirtIOGPUBase { int renderer_blocked; int enable; +MemoryRegion hostmem; + struct virtio_gpu_scanout scanout[VIRTIO_GPU_MAX_SCANOUTS]; int 
enabled_output_bitmask; -- 2.34.1
[PATCH 1/5] Update version for v7.1.0-rc4 release
From: Richard Henderson Signed-off-by: Richard Henderson --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 1c944b9863..b8d5f3ebb6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -7.0.93 +7.0.94 -- 2.34.1
[PATCH 2/5] virtio: Add shared memory capability
From: "Dr. David Alan Gilbert" Define a new capability type 'VIRTIO_PCI_CAP_SHARED_MEMORY_CFG' and the data structure 'virtio_pci_shm_cap' to go with it. They allow defining shared memory regions with sizes and offsets of 2^32 and more. Multiple instances of the capability are allowed and distinguished by a device-specific 'id'. v2: Remove virtio_pci_shm_cap as virtio_pci_cap64 is used instead. v3: No need for mask32 as cpu_to_le32 truncates the value. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Antonio Caggiano --- hw/virtio/virtio-pci.c | 18 ++ include/hw/virtio/virtio-pci.h | 4 2 files changed, 22 insertions(+) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index a50c5a57d7..377bb06fec 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1169,6 +1169,24 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, return offset; } +int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, + uint8_t bar, uint64_t offset, uint64_t length, + uint8_t id) +{ +struct virtio_pci_cap64 cap = { +.cap.cap_len = sizeof cap, +.cap.cfg_type = VIRTIO_PCI_CAP_SHARED_MEMORY_CFG, +}; + +cap.cap.bar = bar; +cap.cap.length = cpu_to_le32(length); +cap.length_hi = cpu_to_le32(length >> 32); +cap.cap.offset = cpu_to_le32(offset); +cap.offset_hi = cpu_to_le32(offset >> 32); +cap.cap.id = id; +return virtio_pci_add_mem_cap(proxy, ); +} + static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, unsigned size) { diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h index 2446dcd9ae..5e5c4a4c6d 100644 --- a/include/hw/virtio/virtio-pci.h +++ b/include/hw/virtio/virtio-pci.h @@ -252,4 +252,8 @@ void virtio_pci_types_register(const VirtioPCIDeviceTypeInfo *t); */ unsigned virtio_pci_optimal_num_queues(unsigned fixed_queues); +int virtio_pci_add_shm_cap(VirtIOPCIProxy *proxy, + uint8_t bar, uint64_t offset, uint64_t length, + uint8_t id); + #endif -- 2.34.1
Re: [PATCH v7 01/14] mm: Add F_SEAL_AUTO_ALLOCATE seal to memfd
On Fri, Aug 26, 2022 at 04:19:32PM +0100, Fuad Tabba wrote: > Hi Chao, > > On Wed, Jul 6, 2022 at 9:25 AM Chao Peng wrote: > > > > Normally, a write to unallocated space of a file or the hole of a sparse > > file automatically causes space allocation, for memfd, this equals to > > memory allocation. This new seal prevents such automatically allocating, > > either this is from a direct write() or a write on the previously > > mmap-ed area. The seal does not prevent fallocate() so an explicit > > fallocate() can still cause allocating and can be used to reserve > > memory. > > > > This is used to prevent unintentional allocation from userspace on a > > stray or careless write and any intentional allocation should use an > > explicit fallocate(). One of the main usecases is to avoid memory double > > allocation for confidential computing usage where we use two memfds to > > back guest memory and at a single point only one memfd is alive and we > > want to prevent memory allocation for the other memfd which may have > > been mmap-ed previously. More discussion can be found at: > > > > https://lkml.org/lkml/2022/6/14/1255 > > > > Suggested-by: Sean Christopherson > > Signed-off-by: Chao Peng > > --- > > include/uapi/linux/fcntl.h | 1 + > > mm/memfd.c | 3 ++- > > mm/shmem.c | 16 ++-- > > 3 files changed, 17 insertions(+), 3 deletions(-) > > > > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h > > index 2f86b2ad6d7e..98bdabc8e309 100644 > > --- a/include/uapi/linux/fcntl.h > > +++ b/include/uapi/linux/fcntl.h > > @@ -43,6 +43,7 @@ > > #define F_SEAL_GROW0x0004 /* prevent file from growing */ > > #define F_SEAL_WRITE 0x0008 /* prevent writes */ > > #define F_SEAL_FUTURE_WRITE0x0010 /* prevent future writes while > > mapped */ > > +#define F_SEAL_AUTO_ALLOCATE 0x0020 /* prevent allocation for writes */ > > I think this should also be added to tools/include/uapi/linux/fcntl.h Yes, thanks. 
Chao > > Cheers, > /fuad > > > > /* (1U << 31) is reserved for signed error codes */ > > > > /* > > diff --git a/mm/memfd.c b/mm/memfd.c > > index 08f5f8304746..2afd898798e4 100644 > > --- a/mm/memfd.c > > +++ b/mm/memfd.c > > @@ -150,7 +150,8 @@ static unsigned int *memfd_file_seals_ptr(struct file > > *file) > > F_SEAL_SHRINK | \ > > F_SEAL_GROW | \ > > F_SEAL_WRITE | \ > > -F_SEAL_FUTURE_WRITE) > > +F_SEAL_FUTURE_WRITE | \ > > +F_SEAL_AUTO_ALLOCATE) > > > > static int memfd_add_seals(struct file *file, unsigned int seals) > > { > > diff --git a/mm/shmem.c b/mm/shmem.c > > index a6f565308133..6c8aef15a17d 100644 > > --- a/mm/shmem.c > > +++ b/mm/shmem.c > > @@ -2051,6 +2051,8 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) > > struct vm_area_struct *vma = vmf->vma; > > struct inode *inode = file_inode(vma->vm_file); > > gfp_t gfp = mapping_gfp_mask(inode->i_mapping); > > + struct shmem_inode_info *info = SHMEM_I(inode); > > + enum sgp_type sgp; > > int err; > > vm_fault_t ret = VM_FAULT_LOCKED; > > > > @@ -2113,7 +2115,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) > > spin_unlock(>i_lock); > > } > > > > - err = shmem_getpage_gfp(inode, vmf->pgoff, >page, SGP_CACHE, > > + if (unlikely(info->seals & F_SEAL_AUTO_ALLOCATE)) > > + sgp = SGP_NOALLOC; > > + else > > + sgp = SGP_CACHE; > > + > > + err = shmem_getpage_gfp(inode, vmf->pgoff, >page, sgp, > > gfp, vma, vmf, ); > > if (err) > > return vmf_error(err); > > @@ -2459,6 +2466,7 @@ shmem_write_begin(struct file *file, struct > > address_space *mapping, > > struct inode *inode = mapping->host; > > struct shmem_inode_info *info = SHMEM_I(inode); > > pgoff_t index = pos >> PAGE_SHIFT; > > + enum sgp_type sgp; > > int ret = 0; > > > > /* i_rwsem is held by caller */ > > @@ -2470,7 +2478,11 @@ shmem_write_begin(struct file *file, struct > > address_space *mapping, > > return -EPERM; > > } > > > > - ret = shmem_getpage(inode, index, pagep, SGP_WRITE); > > + if (unlikely(info->seals & 
F_SEAL_AUTO_ALLOCATE)) > > + sgp = SGP_NOALLOC; > > + else > > + sgp = SGP_WRITE; > > + ret = shmem_getpage(inode, index, pagep, sgp); > > > > if (ret) > > return ret; > > -- > > 2.25.1 > >
Re: New feature shout outs for KVM Forum QEMU Status Report
block/ - VDUSE export was added so qemu-storage-daemon can export disk images to the host kernel and guests. See qemu-storage-daemon(1) man page for details. hw/remote/ - vfio-user server support was added so PCI devices can be emulated in a separate QEMU process. vfio-user client support is not yet available in QEMU so guests are currently unable to connect to the server. signature.asc Description: PGP signature
Re: [PATCH 22/51] tests/qtest: qmp-test: Skip running test_qmp_oob for win32
Bin Meng writes: > Hi Markus, > > On Mon, Aug 29, 2022 at 9:14 PM Markus Armbruster wrote: >> >> Bin Meng writes: >> >> > From: Bin Meng >> > >> > The test_qmp_oob test case calls mkfifo() which does not exist on >> > win32. Exclude it. >> > >> > Signed-off-by: Bin Meng >> > --- >> > >> > tests/qtest/qmp-test.c | 6 ++ >> > 1 file changed, 6 insertions(+) >> > >> > diff --git a/tests/qtest/qmp-test.c b/tests/qtest/qmp-test.c >> > index b950dbafaf..4a165447f8 100644 >> > --- a/tests/qtest/qmp-test.c >> > +++ b/tests/qtest/qmp-test.c >> > @@ -159,6 +159,8 @@ static void test_qmp_protocol(void) >> > qtest_quit(qts); >> > } >> > >> > +#ifndef _WIN32 >> > + >> > /* Out-of-band tests */ >> > >> > char *tmpdir; >> > @@ -279,6 +281,8 @@ static void test_qmp_oob(void) >> > qtest_quit(qts); >> > } >> > >> > +#endif /* _WIN32 */ >> > + >> > /* Preconfig tests */ >> > >> > static void test_qmp_preconfig(void) >> > @@ -338,7 +342,9 @@ int main(int argc, char *argv[]) >> > g_test_init(, , NULL); >> > >> > qtest_add_func("qmp/protocol", test_qmp_protocol); >> > +#ifndef _WIN32 >> > qtest_add_func("qmp/oob", test_qmp_oob); >> > +#endif >> > qtest_add_func("qmp/preconfig", test_qmp_preconfig); >> > qtest_add_func("qmp/missing-any-arg", test_qmp_missing_any_arg); >> >> I'd appreciate a comment explaining why we have to disable this test on >> Windows. > > The reason is explained in the commit message. Yes, and putting it there is a good idea. But I'd appreciate if you *also* put it in the code, so future readers of the code don't have to dig through git history.
[PATCH] pci: Abort if pci_add_capability fails
From: Akihiko Odaki pci_add_capability appears most PCI devices. The error handling required lots of code, and led to inconsistent behaviors such as: - passing error_abort - passing error_fatal - asserting the returned value - propagating the error to the caller - skipping the rest of the function - just ignoring pci_add_capability fails if the new capability overlaps with an existing one. It happens only if the device implementation is wrong so pci_add_capability can just abort instead of returning to the caller in the case, fixing inconsistencies and removing extra code. Signed-off-by: Akihiko Odaki --- docs/pcie_sriov.txt| 4 +-- hw/display/bochs-display.c | 4 +-- hw/i386/amd_iommu.c| 18 ++- hw/ide/ich.c | 8 ++--- hw/net/e1000e.c| 22 +++-- hw/net/eepro100.c | 7 +--- hw/nvme/ctrl.c | 10 +- hw/pci-bridge/cxl_downstream.c | 9 ++ hw/pci-bridge/cxl_upstream.c | 9 ++ hw/pci-bridge/i82801b11.c | 15 ++--- hw/pci-bridge/pci_bridge_dev.c | 2 +- hw/pci-bridge/pcie_pci_bridge.c| 17 +++--- hw/pci-bridge/pcie_root_port.c | 16 ++--- hw/pci-bridge/xio3130_downstream.c | 15 ++--- hw/pci-bridge/xio3130_upstream.c | 15 ++--- hw/pci-host/designware.c | 3 +- hw/pci-host/xilinx-pcie.c | 5 +-- hw/pci/msi.c | 9 +- hw/pci/msix.c | 8 ++--- hw/pci/pci.c | 14 +++- hw/pci/pci_bridge.c| 22 - hw/pci/pcie.c | 52 -- hw/pci/shpc.c | 16 +++-- hw/pci/slotid_cap.c| 8 ++--- hw/usb/hcd-xhci-pci.c | 3 +- hw/vfio/pci-quirks.c | 15 ++--- hw/vfio/pci.c | 12 +++ hw/virtio/virtio-pci.c | 22 - include/hw/pci/pci.h | 5 ++- include/hw/pci/pci_bridge.h| 5 ++- include/hw/pci/pcie.h | 11 +++ include/hw/pci/shpc.h | 5 ++- include/hw/virtio/virtio-pci.h | 2 +- 33 files changed, 98 insertions(+), 290 deletions(-) diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt index 11158dbf88..728a73ba7b 100644 --- a/docs/pcie_sriov.txt +++ b/docs/pcie_sriov.txt @@ -49,7 +49,7 @@ setting up a BAR for a VF. pci_your_pf_dev_realize( ... ) { ... - int ret = pcie_endpoint_cap_init(d, 0x70); + pcie_endpoint_cap_init(d, 0x70); ... 
pcie_ari_init(d, 0x100, 1); ... @@ -79,7 +79,7 @@ setting up a BAR for a VF. pci_your_vf_dev_realize( ... ) { ... - int ret = pcie_endpoint_cap_init(d, 0x60); + pcie_endpoint_cap_init(d, 0x60); ... pcie_ari_init(d, 0x100, 1); ... diff --git a/hw/display/bochs-display.c b/hw/display/bochs-display.c index 8ed734b195..111cabcfb3 100644 --- a/hw/display/bochs-display.c +++ b/hw/display/bochs-display.c @@ -265,7 +265,6 @@ static void bochs_display_realize(PCIDevice *dev, Error **errp) { BochsDisplayState *s = BOCHS_DISPLAY(dev); Object *obj = OBJECT(dev); -int ret; if (s->vgamem < 4 * MiB) { error_setg(errp, "bochs-display: video memory too small"); @@ -302,8 +301,7 @@ static void bochs_display_realize(PCIDevice *dev, Error **errp) } if (pci_bus_is_express(pci_get_bus(dev))) { -ret = pcie_endpoint_cap_init(dev, 0x80); -assert(ret > 0); +pcie_endpoint_cap_init(dev, 0x80); } else { dev->cap_present &= ~QEMU_PCI_CAP_EXPRESS; } diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 725f69095b..256ecba1c3 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1553,23 +1553,11 @@ static void amdvi_sysbus_realize(DeviceState *dev, Error **errp) if (!qdev_realize(DEVICE(>pci), >qbus, errp)) { return; } -ret = pci_add_capability(>pci.dev, AMDVI_CAPAB_ID_SEC, 0, - AMDVI_CAPAB_SIZE, errp); -if (ret < 0) { -return; -} +pci_add_capability(>pci.dev, AMDVI_CAPAB_ID_SEC, 0, AMDVI_CAPAB_SIZE); s->capab_offset = ret; -ret = pci_add_capability(>pci.dev, PCI_CAP_ID_MSI, 0, - AMDVI_CAPAB_REG_SIZE, errp); -if (ret < 0) { -return; -} -ret = pci_add_capability(>pci.dev, PCI_CAP_ID_HT, 0, - AMDVI_CAPAB_REG_SIZE, errp); -if (ret < 0) { -return; -} +pci_add_capability(>pci.dev, PCI_CAP_ID_MSI, 0, AMDVI_CAPAB_REG_SIZE); +pci_add_capability(>pci.dev, PCI_CAP_ID_HT, 0, AMDVI_CAPAB_REG_SIZE); /* Pseudo address space under root PCI bus. 
*/ x86ms->ioapic_as = amdvi_host_dma_iommu(bus, s, AMDVI_IOAPIC_SB_DEVID); diff --git a/hw/ide/ich.c b/hw/ide/ich.c index 1007a51fcb..7349faa78f 100644 ---
[PATCH 1/1] target/i386: Raise #GP on unaligned m128 accesses when required.
Many instructions which load/store 128-bit values are supposed to raise #GP when the memory operand isn't 16-byte aligned. This includes: - Instructions explicitly requiring memory alignment (Exceptions Type 1 in the "AVX and SSE Instruction Exception Specification" section of the SDM) - Legacy SSE instructions that load/store 128-bit values (Exceptions Types 2 and 4). This change adds a raise_gp_if_unaligned helper which raises #GP if an address is not properly aligned. This helper is called before 128-bit loads/stores where appropriate. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/217 Signed-off-by: Ricky Zhou --- target/i386/helper.h | 1 + target/i386/tcg/mem_helper.c | 8 target/i386/tcg/translate.c | 38 +--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/target/i386/helper.h b/target/i386/helper.h index ac3b4d1ee3..17d78f2b0d 100644 --- a/target/i386/helper.h +++ b/target/i386/helper.h @@ -213,6 +213,7 @@ DEF_HELPER_1(update_mxcsr, void, env) DEF_HELPER_1(enter_mmx, void, env) DEF_HELPER_1(emms, void, env) DEF_HELPER_3(movq, void, env, ptr, ptr) +DEF_HELPER_3(raise_gp_if_unaligned, void, env, tl, tl) #define SHIFT 0 #include "ops_sse_header.h" diff --git a/target/i386/tcg/mem_helper.c b/target/i386/tcg/mem_helper.c index e3cdafd2d4..79259abef3 100644 --- a/target/i386/tcg/mem_helper.c +++ b/target/i386/tcg/mem_helper.c @@ -181,3 +181,11 @@ void helper_boundl(CPUX86State *env, target_ulong a0, int v) raise_exception_ra(env, EXCP05_BOUND, GETPC()); } } + +void helper_raise_gp_if_unaligned(CPUX86State *env, target_ulong addr, + target_ulong align_mask) +{ +if (unlikely((addr & align_mask) != 0)) { +raise_exception_ra(env, EXCP0D_GPF, GETPC()); +} +} diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index b7972f0ff5..de13f483b6 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -3054,7 +3054,7 @@ static const struct SSEOpHelper_epp sse_op_table6[256] = { [0x25] = SSE41_OP(pmovsxdq), [0x28] = 
SSE41_OP(pmuldq), [0x29] = SSE41_OP(pcmpeqq), -[0x2a] = SSE41_SPECIAL, /* movntqda */ +[0x2a] = SSE41_SPECIAL, /* movntdqa */ [0x2b] = SSE41_OP(packusdw), [0x30] = SSE41_OP(pmovzxbw), [0x31] = SSE41_OP(pmovzxbd), @@ -3194,10 +3194,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; case 0x1e7: /* movntdq */ case 0x02b: /* movntps */ -case 0x12b: /* movntps */ +case 0x12b: /* movntpd */ if (mod == 3) goto illegal_op; gen_lea_modrm(env, s, modrm); +gen_helper_raise_gp_if_unaligned(cpu_env, s->A0, tcg_const_tl(0xf)); gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); break; case 0x3f0: /* lddqu */ @@ -3273,6 +3274,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x26f: /* movdqu xmm, ea */ if (mod != 3) { gen_lea_modrm(env, s, modrm); +/* movaps, movapd, movdqa */ +if (b == 0x028 || b == 0x128 || b == 0x16f) { +gen_helper_raise_gp_if_unaligned(cpu_env, s->A0, + tcg_const_tl(0xf)); +} gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); } else { rm = (modrm & 7) | REX_B(s); @@ -3331,6 +3337,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x212: /* movsldup */ if (mod != 3) { gen_lea_modrm(env, s, modrm); +if (!(s->prefix & PREFIX_VEX)) { +gen_helper_raise_gp_if_unaligned(cpu_env, s->A0, + tcg_const_tl(0xf)); +} gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); } else { rm = (modrm & 7) | REX_B(s); @@ -3373,6 +3383,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x216: /* movshdup */ if (mod != 3) { gen_lea_modrm(env, s, modrm); +if (!(s->prefix & PREFIX_VEX)) { +gen_helper_raise_gp_if_unaligned(cpu_env, s->A0, + tcg_const_tl(0xf)); +} gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); } else { rm = (modrm & 7) | REX_B(s); @@ -3465,6 +3479,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x27f: /* movdqu ea, xmm */ if (mod != 3) { gen_lea_modrm(env, s, modrm); +if (b == 0x029 || b == 0x129 || b == 0x17f) { +
[PATCH 0/1] target/i386: Raise #GP on unaligned m128 accesses when required.
This is a change to raise #GP on unaligned m128 loads/stores when required by the spec. Some notes on this change: 1. I considered making use of the existing support for enforcing memory alignment (setting MO_ALIGN_16 in the load/store's MemOp), but rejected this approach. There are at least two scenarios where we might want to do alignment checks in x86: a. Loads/stores when the AC flag is enabled (which should raise #AC on misalignment) b. SSE/AVX instructions which require memory alignment (which raise #GP on misalignment) The MemOp alignment checking mechanism can only handle one of these scenarios, since they require different exceptions to be raised. I think it makes more sense to use the existing memory alignment support for implementing (a), since helper_unaligned_{ld,st} already triggers SIGBUS in qemu-user. This is why I ended up implementing (b) with a helper. 2. It is often the case that legacy SSE instructions require 16 byte alignment of 128-bit memory operands, but AVX versions of the instructions do not (e.g. movsldup requires alignment and vmovsldup does not). From what I can tell, QEMU currently doesn't appear to report AVX support in cpuid, but it still seems to emulate some of these instructions if you tell it to execute them. This change attempts to distinguish between legacy SSE instructions and AVX instructions by conditioning on !(s->prefix & PREFIX_VEX). Not sure this is very future-proof though - for example, it may need to be updated if support for EVEX prefixes is added. LMK if there's a nicer way to do this. 3. I tested this by running a Linux VM in qemu-system-x86_64 and verifying that movaps on a misaligned address triggers a segfault. Ricky Zhou (1): target/i386: Raise #GP on unaligned m128 accesses when required. target/i386/helper.h | 1 + target/i386/tcg/mem_helper.c | 8 target/i386/tcg/translate.c | 38 +--- 3 files changed, 44 insertions(+), 3 deletions(-) -- 2.37.2
Re: [PATCH 22/51] tests/qtest: qmp-test: Skip running test_qmp_oob for win32
Hi Markus, On Mon, Aug 29, 2022 at 9:14 PM Markus Armbruster wrote: > > Bin Meng writes: > > > From: Bin Meng > > > > The test_qmp_oob test case calls mkfifo() which does not exist on > > win32. Exclude it. > > > > Signed-off-by: Bin Meng > > --- > > > > tests/qtest/qmp-test.c | 6 ++ > > 1 file changed, 6 insertions(+) > > > > diff --git a/tests/qtest/qmp-test.c b/tests/qtest/qmp-test.c > > index b950dbafaf..4a165447f8 100644 > > --- a/tests/qtest/qmp-test.c > > +++ b/tests/qtest/qmp-test.c > > @@ -159,6 +159,8 @@ static void test_qmp_protocol(void) > > qtest_quit(qts); > > } > > > > +#ifndef _WIN32 > > + > > /* Out-of-band tests */ > > > > char *tmpdir; > > @@ -279,6 +281,8 @@ static void test_qmp_oob(void) > > qtest_quit(qts); > > } > > > > +#endif /* _WIN32 */ > > + > > /* Preconfig tests */ > > > > static void test_qmp_preconfig(void) > > @@ -338,7 +342,9 @@ int main(int argc, char *argv[]) > > g_test_init(, , NULL); > > > > qtest_add_func("qmp/protocol", test_qmp_protocol); > > +#ifndef _WIN32 > > qtest_add_func("qmp/oob", test_qmp_oob); > > +#endif > > qtest_add_func("qmp/preconfig", test_qmp_preconfig); > > qtest_add_func("qmp/missing-any-arg", test_qmp_missing_any_arg); > > I'd appreciate a comment explaining why we have to disable this test on > Windows. The reason is explained in the commit message. Regards, Bin
Re: [RFC PATCH v2 0/8] qapi: add generator for Golang interface
Hi, On Mon, Aug 29, 2022 at 01:53:51PM +0200, Markus Armbruster wrote: > Victor Toso writes: > > > Hi, > > > > On Tue, Jul 05, 2022 at 08:46:34AM -0700, Andrea Bolognani wrote: > >> I've commented in detail to the single patches, just a couple of > >> additional points. > >> > >> On Fri, Jun 17, 2022 at 02:19:24PM +0200, Victor Toso wrote: > >> > * 7) Flat structs by removing embed types. Discussion with Andrea > >> > Thread: > >> > https://lists.gnu.org/archive/html/qemu-devel/2022-05/msg01590.html > >> > > >> > No one required it but I decided to give it a try. Major issue that > >> > I see with this approach is to have generated a few 'Base' structs > >> > that are now useless. Overall, less nested structs seems better to > >> > me. Opnions? > >> > > >> > Example: > >> > | /* This is now useless, should be removed? */ > >> > | type InetSocketAddressBase struct { > >> > | Host string `json:"host"` > >> > | Port string `json:"port"` > >> > | } > >> > >> Can we somehow keep track, in the generator, of types that are > >> only used as building blocks for other types, and prevent them > >> from showing up in the generated code? > > > > I'm not 100% sure it is good to remove them from generated code > > because technically it is a valid qapi type. If all @base types > > are embed types and they don't show in other way or form, sure we > > can remove them from generated code... I'm not sure if it is > > possible to guarantee this. > > > > But yes, if possible, I'd like to remove what seems useless type > > definitions. > > The existing C generators have to generate all the types, because the > generated code is for QEMU's own use, where we need all the types. > > The existing introspection generator generates only the types visible in > QAPI/QMP introspection. > > The former generate for internal use (where we want all the types), and > the latter for external use (where only the types visible in the > external interface are actually useful). 
My doubts are on types that might be okay to be hidden because they are embedded in other types, like InetSocketAddressBase. Note that what I mean with the struct being embedded is that the actual fields of InetSocketAddressBase are added to the type which uses it, like InetSocketAddress. | type InetSocketAddressBase struct { | Host string `json:"host"` | Port string `json:"port"` | } | | type InetSocketAddress struct { | // Base fields | Host string `json:"host"` | Port string `json:"port"` | | Numeric *bool `json:"numeric,omitempty"` | To*uint16 `json:"to,omitempty"` | Ipv4 *bool `json:"ipv4,omitempty"` | Ipv6 *bool `json:"ipv6,omitempty"` | KeepAlive *bool `json:"keep-alive,omitempty"` | Mptcp *bool `json:"mptcp,omitempty"` | } Andrea's suggestion is to have the generator track if a given type is always embedded, in which case we can skip generating it in the Go module. I think that could work indeed. In the hypothetical case that hidden structs like InetSocketAddressBase become a parameter on a command in the future, the generator would know and start generating this type from that point onwards. > >> Finally, looking at the repository containing the generated > >> code I see that the generated type are sorted by kind, e.g. all > >> unions are in a file, all events in another one and so on. I > >> believe the structure should match more closely that of the > >> QAPI schema, so e.g. block-related types should all go in one > >> file, net-related types in another one and so on. > > > > That's something I don't mind adding but some hardcoded mapping > > is needed. If you look into git history of qapi/ folder, .json > > files can come and go, types be moved around, etc. So, we need to > > proper map types in a way that the generated code would be kept > > stable even if qapi files would have been rearranged. What I > > proposed was only the simplest solution. > > > > Also, the generator takes a qapi-schema.json as input.
We are > > more focused in qemu/qapi/qapi-schema.json generated coded but > > would not hurt to think we could even use it for qemu-guest-agent > > from qemu/qga/qapi-schema.json -- this to say that the hardcoded > > mapping needs to take into account non qemu qapi schemas too. > > In the beginning, the QAPI schema was monolithic. > qga/qapi-schema.json still is. > > When keeping everything in a single qapi-schema.json became > unwieldy, we split it into "modules" tied together with a > simple include directive. Generated code remained monolithic. > When monolithic generated code became too annoying (touch > schema, recompile everything), we made it match the module > structure: code for FOO.json goes into *-FOO.c and *-FOO.h, > where the *-FOO.h #include the generated headers for the .json > modules FOO.json includes. > > Schema code motion hasn't been much of a
Re: [RFC PATCH v2 2/8] qapi: golang: Generate qapi's alternate types in Go
Hi, On Mon, Aug 29, 2022 at 01:27:06PM +0200, Markus Armbruster wrote: > Victor Toso writes: > > > Hi, > > > > On Fri, Aug 19, 2022 at 11:27:13AM -0500, Andrea Bolognani wrote: > >> On Wed, Aug 17, 2022 at 04:04:19PM +0200, Victor Toso wrote: > >> > On Tue, Jul 05, 2022 at 08:45:06AM -0700, Andrea Bolognani wrote: > >> > > On Fri, Jun 17, 2022 at 02:19:26PM +0200, Victor Toso wrote: > >> > > > func (s *BlockdevRef) UnmarshalJSON(data []byte) error { > >> > > > // Check for json-null first > >> > > > if string(data) == "null" { > >> > > > return errors.New(`null not supported for BlockdevRef`) > >> > > > } > >> > > > // Check for BlockdevOptions > >> > > > { > >> > > > s.Definition = new(BlockdevOptions) > >> > > > if err := StrictDecode(s.Definition, data); err == nil { > >> > > > return nil > >> > > > } > >> > > > >> > > The use of StrictDecode() here means that we won't be able to > >> > > parse an alternate produced by a version of QEMU where > >> > > BlockdevOptions has gained additional fields, doesn't it? > >> > > >> > That's correct. This means that with this RFCv2 proposal, qapi-go > >> > based on qemu version 7.1 might not be able to decode a qmp > >> > message from qemu version 7.2 if it has introduced a new field. > >> > > >> > This needs fixing, not sure yet the way to go. > >> > > >> > > Considering that we will happily parse such a BlockdevOptions > >> > > outside of the context of BlockdevRef, I think we should be > >> > > consistent and allow the same to happen here. > >> > > >> > StrictDecode is only used with alternates because, unlike unions, > >> > Alternate types don't have a 'discriminator' field that would > >> > allow us to know what data type to expect. > >> > > >> > With this in mind, theoretically speaking, we could have very > >> > similar struct types as Alternate fields and we have to find on > >> > runtime which type is that underlying byte stream. 
> >> > > >> > So, to reply to your suggestion, if we allow BlockdevRef without > >> > StrictDecode we might find ourselves in a situation that it > >> > matched a few fields of BlockdevOptions but it the byte stream > >> > was actually another type. > >> > >> IIUC your concern is that the QAPI schema could gain a new > >> type, TotallyNotBlockdevOptions, which looks exactly like > >> BlockdevOptions except for one or more extra fields. > >> > >> If QEMU then produced a JSON like > >> > >> { "description": { /* a TotallyNotBlockdevOptions here */ } } > >> > >> and we'd try to deserialize it with Go code like > >> > >> ref := BlockdevRef{} > >> json.Unmarsal() > >> > >> we'd end up mistakenly parsing the TotallyNotBlockdevOptions as a > >> valid BlockdevOptions, dropping the extra fields in the process. > >> > >> Does that correctly describe the reason why you feel that the use of > >> StrictDecode is necessary? > > > > Not quite. The problem here is related to the Alternate types of > > the QAPI specification [0], just to name a simple in-use example, > > BlockdevRefOrNul [1]. > > > > [0] > > https://gitlab.com/qemu-project/qemu/-/blob/master/docs/devel/qapi-code-gen.rst?plain=1#L387 > > [1] > > https://gitlab.com/qemu-project/qemu/-/blob/master/qapi/block-core.json#L4349 > > > > To exemplify the problem that I try to solve with StrictDecode, > > let's say there is a DeviceRef alternate type that looks like: > > > > { 'alternate': 'DeviceRef', > > 'data': { 'memory': 'BlockdevRefInMemory', > > 'disk': 'BlockdevRefInDisk', > > 'cloud': 'BlockdevRefInCloud' } } > > > > Just a quick recap, at runtime we don't have data's payload name > > (e.g: disk). We need to check the actual data and try to find > > what is the payload type. 
> > > > type BlockdevRefInMemory struct { > > Name *string > > Size uint64 > > Start uint64 > > End uint64 > > } > > > > type BlockdevRefInDisk struct { > > Name *string > > Size uint64 > > Path *string > > } > > > > type BlockdevRefInCloud struct { > > Name *string > > Size uint64 > > Uri *string > > } > > > > All types have unique fields but they all share some fields too. > > Quick intercession (I merely skimmed the review thread; forgive me if > it's not useful or not new): > > An alternate type is like a union type, except there is no > discriminator on the wire. Instead, the branch to use is inferred > from the value. An alternate can only express a choice between types > represented differently on the wire. > > This is docs/devel/qapi-code-gen.rst. Implied there: the inference is > based on the JSON type *only*, i.e. no two branches can have the same > JSON type on the wire. Since all complex types (struct or union) are > JSON object on the wire, at most one alternate branch can be of complex > type. Ah, I've missed this bit. Thank you, it does make it much simpler. > More sophisticated inference would be possible if we need it. > So
Re: [PATCH 22/51] tests/qtest: qmp-test: Skip running test_qmp_oob for win32
Bin Meng writes: > From: Bin Meng > > The test_qmp_oob test case calls mkfifo() which does not exist on > win32. Exclude it. > > Signed-off-by: Bin Meng > --- > > tests/qtest/qmp-test.c | 6 ++ > 1 file changed, 6 insertions(+) > > diff --git a/tests/qtest/qmp-test.c b/tests/qtest/qmp-test.c > index b950dbafaf..4a165447f8 100644 > --- a/tests/qtest/qmp-test.c > +++ b/tests/qtest/qmp-test.c > @@ -159,6 +159,8 @@ static void test_qmp_protocol(void) > qtest_quit(qts); > } > > +#ifndef _WIN32 > + > /* Out-of-band tests */ > > char *tmpdir; > @@ -279,6 +281,8 @@ static void test_qmp_oob(void) > qtest_quit(qts); > } > > +#endif /* _WIN32 */ > + > /* Preconfig tests */ > > static void test_qmp_preconfig(void) > @@ -338,7 +342,9 @@ int main(int argc, char *argv[]) > g_test_init(, , NULL); > > qtest_add_func("qmp/protocol", test_qmp_protocol); > +#ifndef _WIN32 > qtest_add_func("qmp/oob", test_qmp_oob); > +#endif > qtest_add_func("qmp/preconfig", test_qmp_preconfig); > qtest_add_func("qmp/missing-any-arg", test_qmp_missing_any_arg); I'd appreciate a comment explaining why we have to disable this test on Windows.
Re: [PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
Sam Li 于2022年8月29日周一 20:53写道: > > By adding zone management operations in BlockDriver, storage controller > emulation can use the new block layer APIs including Report Zone and > four zone management operations (open, close, finish, reset). > > Add zoned storage commands of the device: zone_report(zrp), zone_open(zo), > zone_close(zc), zone_reset(zrs), zone_finish(zf). > > For example, to test zone_report, use following command: > $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0 > -c "zrp offset nr_zones" > > Signed-off-by: Sam Li > Reviewed-by: Hannes Reinecke > --- > block/block-backend.c | 51 + > block/file-posix.c| 326 +- > block/io.c| 41 > include/block/block-io.h | 7 + > include/block/block_int-common.h | 21 ++ > include/block/raw-aio.h | 6 +- > include/sysemu/block-backend-io.h | 17 ++ > meson.build | 1 + > qapi/block-core.json | 8 +- > qemu-io-cmds.c| 143 + > 10 files changed, 617 insertions(+), 4 deletions(-) > > diff --git a/block/block-backend.c b/block/block-backend.c > index d4a5df2ac2..c5798651df 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1775,6 +1775,57 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) > return ret; > } > > +/* > + * Send a zone_report command. > + * offset is a byte offset from the start of the device. No alignment > + * required for offset. > + * nr_zones represents IN maximum and OUT actual. > + */ > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, > +unsigned int *nr_zones, > +BlockZoneDescriptor *zones) > +{ > +int ret; > +IO_CODE(); > + > +blk_inc_in_flight(blk); /* increase before waiting */ > +blk_wait_while_drained(blk); > +if (!blk_is_available(blk)) { > +blk_dec_in_flight(blk); > +return -ENOMEDIUM; > +} > +ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); > +blk_dec_in_flight(blk); > +return ret; > +} > + > +/* > + * Send a zone_management command. > + * op is the zone operation. 
> + * offset is the starting zone specified as a sector offset. > + * len is the maximum number of sectors the command should operate on. It > + * should be aligned with the zone sector size. > + */ > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, > +int64_t offset, int64_t len) > +{ > +int ret; > +IO_CODE(); > + > + > +blk_inc_in_flight(blk); > +blk_wait_while_drained(blk); > + > +ret = blk_check_byte_request(blk, offset, len); > +if (ret < 0) { > +return ret; > +} > + > +ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); > +blk_dec_in_flight(blk); > +return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 0a8b4b426e..e3efba6db7 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -67,6 +67,9 @@ > #include > #include > #include > +#if defined(CONFIG_BLKZONED) > +#include > +#endif > #include > #include > #include > @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData { > PreallocMode prealloc; > Error **errp; > } truncate; > +struct { > +unsigned int *nr_zones; > +BlockZoneDescriptor *zones; > +} zone_report; > +struct { > +unsigned long zone_op; > +} zone_mgmt; > }; > } RawPosixAIOData; > > @@ -1339,7 +1349,7 @@ static void raw_refresh_limits(BlockDriverState *bs, > Error **errp) > #endif > > if (bs->sg || S_ISBLK(st.st_mode)) { > -int ret = hdev_get_max_hw_transfer(s->fd, ); > +ret = hdev_get_max_hw_transfer(s->fd, ); > > if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { > bs->bl.max_hw_transfer = ret; > @@ -1356,6 +1366,27 @@ static void raw_refresh_limits(BlockDriverState *bs, > Error **errp) > zoned = BLK_Z_NONE; > } > bs->bl.zoned = zoned; > +if (zoned != BLK_Z_NONE) { > +ret = get_sysfs_long_val(, "chunk_sectors"); > +if (ret > 0) { > +bs->bl.zone_sectors = ret; > +} > + > +ret = get_sysfs_long_val(, "zone_append_max_bytes"); > +if (ret > 0) { > +bs->bl.zone_append_max_bytes = ret; > +} > + > +ret = 
get_sysfs_long_val(, "max_open_zones"); > +if (ret >= 0) { > +bs->bl.max_open_zones = ret; > +} > + > +ret = get_sysfs_long_val(, "max_active_zones"); > +if (ret >= 0) { > +bs->bl.max_active_zones = ret; > +} > +} > } > > static int check_for_dasd(int fd) > @@ -1850,6 +1881,136 @@ static off_t copy_file_range(int in_fd,
[PATCH v8 3/7] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls
By adding zone management operations in BlockDriver, storage controller emulation can use the new block layer APIs including Report Zone and four zone management operations (open, close, finish, reset). Add zoned storage commands of the device: zone_report(zrp), zone_open(zo), zone_close(zc), zone_reset(zrs), zone_finish(zf). For example, to test zone_report, use following command: $ ./build/qemu-io --image-opts driver=zoned_host_device, filename=/dev/nullb0 -c "zrp offset nr_zones" Signed-off-by: Sam Li Reviewed-by: Hannes Reinecke --- block/block-backend.c | 51 + block/file-posix.c| 326 +- block/io.c| 41 include/block/block-io.h | 7 + include/block/block_int-common.h | 21 ++ include/block/raw-aio.h | 6 +- include/sysemu/block-backend-io.h | 17 ++ meson.build | 1 + qapi/block-core.json | 8 +- qemu-io-cmds.c| 143 + 10 files changed, 617 insertions(+), 4 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index d4a5df2ac2..c5798651df 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1775,6 +1775,57 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) return ret; } +/* + * Send a zone_report command. + * offset is a byte offset from the start of the device. No alignment + * required for offset. + * nr_zones represents IN maximum and OUT actual. + */ +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, +unsigned int *nr_zones, +BlockZoneDescriptor *zones) +{ +int ret; +IO_CODE(); + +blk_inc_in_flight(blk); /* increase before waiting */ +blk_wait_while_drained(blk); +if (!blk_is_available(blk)) { +blk_dec_in_flight(blk); +return -ENOMEDIUM; +} +ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); +blk_dec_in_flight(blk); +return ret; +} + +/* + * Send a zone_management command. + * op is the zone operation. + * offset is the starting zone specified as a sector offset. + * len is the maximum number of sectors the command should operate on. It + * should be aligned with the zone sector size. 
+ */ +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, +int64_t offset, int64_t len) +{ +int ret; +IO_CODE(); + + +blk_inc_in_flight(blk); +blk_wait_while_drained(blk); + +ret = blk_check_byte_request(blk, offset, len); +if (ret < 0) { +return ret; +} + +ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); +blk_dec_in_flight(blk); +return ret; +} + void blk_drain(BlockBackend *blk) { BlockDriverState *bs = blk_bs(blk); diff --git a/block/file-posix.c b/block/file-posix.c index 0a8b4b426e..e3efba6db7 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -67,6 +67,9 @@ #include #include #include +#if defined(CONFIG_BLKZONED) +#include +#endif #include #include #include @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData { PreallocMode prealloc; Error **errp; } truncate; +struct { +unsigned int *nr_zones; +BlockZoneDescriptor *zones; +} zone_report; +struct { +unsigned long zone_op; +} zone_mgmt; }; } RawPosixAIOData; @@ -1339,7 +1349,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) #endif if (bs->sg || S_ISBLK(st.st_mode)) { -int ret = hdev_get_max_hw_transfer(s->fd, ); +ret = hdev_get_max_hw_transfer(s->fd, ); if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { bs->bl.max_hw_transfer = ret; @@ -1356,6 +1366,27 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp) zoned = BLK_Z_NONE; } bs->bl.zoned = zoned; +if (zoned != BLK_Z_NONE) { +ret = get_sysfs_long_val(, "chunk_sectors"); +if (ret > 0) { +bs->bl.zone_sectors = ret; +} + +ret = get_sysfs_long_val(, "zone_append_max_bytes"); +if (ret > 0) { +bs->bl.zone_append_max_bytes = ret; +} + +ret = get_sysfs_long_val(, "max_open_zones"); +if (ret >= 0) { +bs->bl.max_open_zones = ret; +} + +ret = get_sysfs_long_val(, "max_active_zones"); +if (ret >= 0) { +bs->bl.max_active_zones = ret; +} +} } static int check_for_dasd(int fd) @@ -1850,6 +1881,136 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd, } #endif +/* + * parse_zone - Fill a zone 
descriptor + */ +#if defined(CONFIG_BLKZONED) +static inline void parse_zone(struct BlockZoneDescriptor *zone, + const struct blk_zone *blkz) { +zone->start = blkz->start; +zone->length = blkz->len; +zone->cap =
Re: [PATCH 0/1] Update vfio-user module to the latest
On 07/08/2022 12.39, John Levon wrote: On Fri, Aug 05, 2022 at 09:24:56AM +0100, Daniel P. Berrangé wrote: [...] If we do add something as a submodule for some reason, I'd like us to say upfront that this is for a fixed time period (ie maximum of 3 releases aka 1 year) only after which we'll remove it no matter what. I'm not clear on the costs of having the submodule: could you please explain why it's an issue exactly? Some reasoning can be found here: https://lore.kernel.org/qemu-devel/d7a7b28f-a665-2567-0fb6-e31e7ecbb...@redhat.com/ I can only think of the flaky test issue (for which I'm sorry). Speaking of that test issue, yes, it would be good to get this patch included now as soon as the 7.1 release has been done. Who's going to send a pull request for this? Thomas
[PATCH] tests/avocado/migration: Get find_free_port() from the ports
In upstream Avocado, the find_free_port() function is not available from "network" anymore, but must be used via "ports", see: https://github.com/avocado-framework/avocado/commit/22fc98c6ff76cc55c48 To be able to update to a newer Avocado version later, let's use the new way for accessing the find_free_port() function here. Signed-off-by: Thomas Huth --- tests/avocado/migration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/avocado/migration.py b/tests/avocado/migration.py index 584d6ef53f..4b25680c50 100644 --- a/tests/avocado/migration.py +++ b/tests/avocado/migration.py @@ -14,7 +14,7 @@ from avocado_qemu import QemuSystemTest from avocado import skipUnless -from avocado.utils import network +from avocado.utils.network import ports from avocado.utils import wait from avocado.utils.path import find_command @@ -57,7 +57,7 @@ def do_migrate(self, dest_uri, src_uri=None): self.assert_migration(source_vm, dest_vm) def _get_free_port(self): -port = network.find_free_port() +port = ports.find_free_port() if port is None: self.cancel('Failed to find a free port') return port -- 2.31.1
Re: [PATCH v5 18/18] s390x: pv: Add dump support
On 11/08/2022 14.11, Janosch Frank wrote: Sometimes dumping a guest from the outside is the only way to get the data that is needed. This can be the case if a dumping mechanism like KDUMP hasn't been configured or data needs to be fetched at a specific point. Dumping a protected guest from the outside without help from fw/hw doesn't yield sufficient data to be useful. Hence we now introduce PV dump support. The PV dump support works by integrating the firmware into the dump process. New Ultravisor calls are used to initiate the dump process, dump cpu data, dump memory state and lastly complete the dump process. The UV calls are exposed by KVM via the new KVM_PV_DUMP command and its subcommands. The guest's data is fully encrypted and can only be decrypted by the entity that owns the customer communication key for the dumped guest. Also dumping needs to be allowed via a flag in the SE header. On the QEMU side of things we store the PV dump data in the newly introduced architecture ELF sections (storage state and completion data) and the cpu notes (for cpu dump data). Users can use the zgetdump tool to convert the encrypted QEMU dump to an unencrypted one. Signed-off-by: Janosch Frank --- [...] @@ -207,22 +226,41 @@ static int s390x_write_elf64_notes(const char *note_name, DumpState *s, const NoteFuncDesc *funcs) { -Note note; +Note note, *notep; const NoteFuncDesc *nf; -int note_size; +int note_size, content_size; int ret = -1; assert(strlen(note_name) < sizeof(note.name)); for (nf = funcs; nf->note_contents_func; nf++) { -memset(, 0, sizeof(note)); -note.hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1); -note.hdr.n_descsz = cpu_to_be32(nf->contents_size); -g_strlcpy(note.name, note_name, sizeof(note.name)); -(*nf->note_contents_func)(, cpu, id); +notep = +if (nf->pvonly && !s390_is_pv()) { +continue; +} -note_size = sizeof(note) - sizeof(note.contents) + nf->contents_size; -ret = f(, note_size, s); +content_size = nf->contents_size ? 
nf->contents_size : nf->note_size_func(); +note_size = sizeof(note) - sizeof(notep->contents) + content_size; + +/* Notes with dynamic sizes need to allocate a note */ +if (nf->note_size_func) { +notep = g_malloc0(note_size); Either use g_malloc() here (without the trailing "0") ... +} + +memset(notep, 0, sizeof(note)); ... or put the memset() in an "else" block. +/* Setup note header data */ +notep->hdr.n_descsz = cpu_to_be32(content_size); +notep->hdr.n_namesz = cpu_to_be32(strlen(note_name) + 1); +g_strlcpy(notep->name, note_name, sizeof(notep->name)); + +/* Get contents and write them out */ +(*nf->note_contents_func)(notep, cpu, id); +ret = f(notep, note_size, s); + +if (nf->note_size_func) { +g_free(notep); +} if (ret < 0) { return -1; @@ -247,12 +285,161 @@ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, return s390x_write_elf64_notes("LINUX", f, cpu, cpuid, s, note_linux); } +/* PV dump section size functions */ +static uint64_t get_dump_stor_state_size_from_len(uint64_t len) +{ +return (len / (1 << 20)) * kvm_s390_pv_dmp_get_size_stor_state(); Use "MiB" instead of "1 << 20" ? +} + +static uint64_t get_size_stor_state(DumpState *s) +{ +return get_dump_stor_state_size_from_len(s->total_size); +} Thomas
Re: [RFC PATCH v2 0/8] qapi: add generator for Golang interface
Victor Toso writes: > Hi, > > On Tue, Jul 05, 2022 at 08:46:34AM -0700, Andrea Bolognani wrote: >> I've commented in detail to the single patches, just a couple of >> additional points. >> >> On Fri, Jun 17, 2022 at 02:19:24PM +0200, Victor Toso wrote: >> > * 7) Flat structs by removing embed types. Discussion with Andrea >> > Thread: >> > https://lists.gnu.org/archive/html/qemu-devel/2022-05/msg01590.html >> > >> > No one required it but I decided to give it a try. Major issue that >> > I see with this approach is to have generated a few 'Base' structs >> > that are now useless. Overall, less nested structs seems better to >> > me. Opnions? >> > >> > Example: >> > | /* This is now useless, should be removed? */ >> > | type InetSocketAddressBase struct { >> > | Host string `json:"host"` >> > | Port string `json:"port"` >> > | } >> >> Can we somehow keep track, in the generator, of types that are >> only used as building blocks for other types, and prevent them >> from showing up in the generated code? > > I'm not 100% sure it is good to remove them from generated code > because technically it is a valid qapi type. If all @base types > are embed types and they don't show in other way or form, sure we > can remove them from generated code... I'm not sure if it is > possible to guarantee this. > > But yes, if possible, I'd like to remove what seems useless type > definitions. The existing C generators have to generate all the types, because the generated code is for QEMU's own use, where we need all the types. The existing introspection generator generates only the types visible in QAPI/QMP introspection. The former generate for internal use (where we want all the types), and the latter for external use (where only the types visible in the external interface are actually useful). >> Finally, looking at the repository containing the generated >> code I see that the generated type are sorted by kind, e.g. all >> unions are in a file, all events in another one and so on. 
I >> believe the structure should match more closely that of the >> QAPI schema, so e.g. block-related types should all go in one >> file, net-related types in another one and so on. > > That's something I don't mind adding but some hardcoded mapping > is needed. If you look into git history of qapi/ folder, .json > files can come and go, types be moved around, etc. So, we need to > proper map types in a way that the generated code would be kept > stable even if qapi files would have been rearranged. What I > proposed was only the simplest solution. > > Also, the generator takes a qapi-schema.json as input. We are > more focused in qemu/qapi/qapi-schema.json generated coded but > would not hurt to think we could even use it for qemu-guest-agent > from qemu/qga/qapi-schema.json -- this to say that the hardcoded > mapping needs to take into account non qemu qapi schemas too. In the beginning, the QAPI schema was monolithic. qga/qapi-schema.json still is. When keeping everything in a single qapi-schema.json became unwieldy, we split it into "modules" tied together with a simple include directive. Generated code remained monolithic. When monolithic generated code became too annoying (touch schema, recompile everything), we made it match the module structure: code for FOO.json goes into *-FOO.c and *-FOO.h, where the *-FOO.h #include the generated headers for the .json modules FOO.json includes. Schema code motion hasn't been much of a problem. Moving from FOO.json to one of the modules it includes is transparent. Non-transparent moves are relatively rare as long as the split into modules actually makes sense. >> Looking forward to the next iteration :) > > Me too, thanks again! > > Cheers, > Victor
Re: [PATCH] MAINTAINERS: Update Akihiko Odaki's email address
Hi On Mon, Aug 29, 2022 at 12:33 PM Akihiko Odaki wrote: > I am now employed by Daynix. Although my role as a reviewer of > macOS-related change is not very relevant to the employment, I decided > to use the company email address to avoid confusions from different > addresses. > > Signed-off-by: Akihiko Odaki > Congrats :) Reviewed-by: Marc-André Lureau > --- > MAINTAINERS | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/MAINTAINERS b/MAINTAINERS > index 5ce4227ff6..fd9bd1dca5 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -2451,7 +2451,7 @@ Core Audio framework backend > M: Gerd Hoffmann > M: Philippe Mathieu-Daudé > R: Christian Schoenebeck > -R: Akihiko Odaki > +R: Akihiko Odaki > S: Odd Fixes > F: audio/coreaudio.c > > @@ -2730,7 +2730,7 @@ F: util/drm.c > Cocoa graphics > M: Peter Maydell > M: Philippe Mathieu-Daudé > -R: Akihiko Odaki > +R: Akihiko Odaki > S: Odd Fixes > F: ui/cocoa.m > > -- > 2.37.2 > > > -- Marc-André Lureau
Re: [PATCH] tests/qtest/ac97-test: Correct reference to driver
On Mon, Aug 29, 2022 at 12:38 PM Akihiko Odaki wrote: > Signed-off-by: Akihiko Odaki Reviewed-by: Marc-André Lureau > --- > tests/qtest/ac97-test.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/tests/qtest/ac97-test.c b/tests/qtest/ac97-test.c > index b084e31bff..74103efdfa 100644 > --- a/tests/qtest/ac97-test.c > +++ b/tests/qtest/ac97-test.c > @@ -28,7 +28,7 @@ static void *ac97_get_driver(void *obj, const char > *interface) > return >dev; > } > > -fprintf(stderr, "%s not present in e1000e\n", interface); > +fprintf(stderr, "%s not present in ac97\n", interface); > g_assert_not_reached(); > } > > -- > 2.37.2 > > > -- Marc-André Lureau
Re: [PATCH v5 16/18] s390x: Introduce PV query interface
On 11/08/2022 14.11, Janosch Frank wrote: Introduce an interface over which we can get information about UV data. Signed-off-by: Janosch Frank Reviewed-by: Steffen Eiden --- hw/s390x/pv.c | 61 ++ hw/s390x/s390-virtio-ccw.c | 6 include/hw/s390x/pv.h | 10 +++ 3 files changed, 77 insertions(+) Acked-by: Thomas Huth
Re: [PATCH v5 15/18] s390x: Add protected dump cap
On 11/08/2022 14.11, Janosch Frank wrote: Add a protected dump capability for later feature checking. Signed-off-by: Janosch Frank Reviewed-by: Steffen Eiden --- target/s390x/kvm/kvm.c | 7 +++ target/s390x/kvm/kvm_s390x.h | 1 + 2 files changed, 8 insertions(+) Reviewed-by: Thomas Huth
Re: [RFC PATCH v2 2/8] qapi: golang: Generate qapi's alternate types in Go
Victor Toso writes: > Hi, > > On Fri, Aug 19, 2022 at 11:27:13AM -0500, Andrea Bolognani wrote: >> On Wed, Aug 17, 2022 at 04:04:19PM +0200, Victor Toso wrote: >> > On Tue, Jul 05, 2022 at 08:45:06AM -0700, Andrea Bolognani wrote: >> > > On Fri, Jun 17, 2022 at 02:19:26PM +0200, Victor Toso wrote: >> > > > func (s *BlockdevRef) UnmarshalJSON(data []byte) error { >> > > > // Check for json-null first >> > > > if string(data) == "null" { >> > > > return errors.New(`null not supported for BlockdevRef`) >> > > > } >> > > > // Check for BlockdevOptions >> > > > { >> > > > s.Definition = new(BlockdevOptions) >> > > > if err := StrictDecode(s.Definition, data); err == nil { >> > > > return nil >> > > > } >> > > >> > > The use of StrictDecode() here means that we won't be able to >> > > parse an alternate produced by a version of QEMU where >> > > BlockdevOptions has gained additional fields, doesn't it? >> > >> > That's correct. This means that with this RFCv2 proposal, qapi-go >> > based on qemu version 7.1 might not be able to decode a qmp >> > message from qemu version 7.2 if it has introduced a new field. >> > >> > This needs fixing, not sure yet the way to go. >> > >> > > Considering that we will happily parse such a BlockdevOptions >> > > outside of the context of BlockdevRef, I think we should be >> > > consistent and allow the same to happen here. >> > >> > StrictDecode is only used with alternates because, unlike unions, >> > Alternate types don't have a 'discriminator' field that would >> > allow us to know what data type to expect. >> > >> > With this in mind, theoretically speaking, we could have very >> > similar struct types as Alternate fields and we have to find on >> > runtime which type is that underlying byte stream. >> > >> > So, to reply to your suggestion, if we allow BlockdevRef without >> > StrictDecode we might find ourselves in a situation that it >> > matched a few fields of BlockdevOptions but it the byte stream >> > was actually another type. 
>> >> IIUC your concern is that the QAPI schema could gain a new >> type, TotallyNotBlockdevOptions, which looks exactly like >> BlockdevOptions except for one or more extra fields. >> >> If QEMU then produced a JSON like >> >> { "description": { /* a TotallyNotBlockdevOptions here */ } } >> >> and we'd try to deserialize it with Go code like >> >> ref := BlockdevRef{} >> json.Unmarsal() >> >> we'd end up mistakenly parsing the TotallyNotBlockdevOptions as a >> valid BlockdevOptions, dropping the extra fields in the process. >> >> Does that correctly describe the reason why you feel that the use of >> StrictDecode is necessary? > > Not quite. The problem here is related to the Alternate types of > the QAPI specification [0], just to name a simple in-use example, > BlockdevRefOrNul [1]. > > [0] > https://gitlab.com/qemu-project/qemu/-/blob/master/docs/devel/qapi-code-gen.rst?plain=1#L387 > [1] > https://gitlab.com/qemu-project/qemu/-/blob/master/qapi/block-core.json#L4349 > > To exemplify the problem that I try to solve with StrictDecode, > let's say there is a DeviceRef alternate type that looks like: > > { 'alternate': 'DeviceRef', > 'data': { 'memory': 'BlockdevRefInMemory', > 'disk': 'BlockdevRefInDisk', > 'cloud': 'BlockdevRefInCloud' } } > > Just a quick recap, at runtime we don't have data's payload name > (e.g: disk). We need to check the actual data and try to find > what is the payload type. > > type BlockdevRefInMemory struct { > Name *string > Size uint64 > Start uint64 > End uint64 > } > > type BlockdevRefInDisk struct { > Name *string > Size uint64 > Path *string > } > > type BlockdevRefInCloud struct { > Name *string > Size uint64 > Uri *string > } > > All types have unique fields but they all share some fields too. Quick intercession (I merely skimmed the review thread; forgive me if it's not useful or not new): An alternate type is like a union type, except there is no discriminator on the wire. Instead, the branch to use is inferred from the value. 
An alternate can only express a choice between types represented differently on the wire. This is docs/devel/qapi-code-gen.rst. Implied there: the inference is based on the JSON type *only*, i.e. no two branches can have the same JSON type on the wire. Since all complex types (struct or union) are JSON object on the wire, at most one alternate branch can be of complex type. More sophisticated inference would be possible if we need it. So far we haven't. > Golang, without StrictDecode would happily decode a "disk" > payload into either "memory" or "cloud" fields, matching only > what the json provides and ignoring the rest. StrictDecode would > error if the payload had fields that don't belong to that Type so > we could try to find a perfect match. > > While this should work reliably with current version of
[PATCH v7 09/10] parallels: Move statistic collection to a separate function
We will add more and more checks so we need a better code structure in parallels_co_check. Let each check performs in a separate loop in a separate helper. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/parallels.c | 53 +++ 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index 1874045c51..eacfdea4b6 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -526,47 +526,56 @@ static int parallels_check_leak(BlockDriverState *bs, return 0; } -static int coroutine_fn parallels_co_check(BlockDriverState *bs, - BdrvCheckResult *res, - BdrvCheckMode fix) +static void parallels_collect_statistics(BlockDriverState *bs, + BdrvCheckResult *res, + BdrvCheckMode fix) { BDRVParallelsState *s = bs->opaque; -int64_t prev_off; -int ret; +int64_t off, prev_off; uint32_t i; -qemu_co_mutex_lock(>lock); - -parallels_check_unclean(bs, res, fix); - -ret = parallels_check_outside_image(bs, res, fix); -if (ret < 0) { -goto out; -} - -ret = parallels_check_leak(bs, res, fix); -if (ret < 0) { -goto out; -} - res->bfi.total_clusters = s->bat_size; res->bfi.compressed_clusters = 0; /* compression is not supported */ prev_off = 0; for (i = 0; i < s->bat_size; i++) { -int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS; +off = bat2sect(s, i) << BDRV_SECTOR_BITS; if (off == 0) { prev_off = 0; continue; } -res->bfi.allocated_clusters++; - if (prev_off != 0 && (prev_off + s->cluster_size) != off) { res->bfi.fragmented_clusters++; } + prev_off = off; +res->bfi.allocated_clusters++; } +} + +static int coroutine_fn parallels_co_check(BlockDriverState *bs, + BdrvCheckResult *res, + BdrvCheckMode fix) +{ +BDRVParallelsState *s = bs->opaque; +int ret; + +qemu_co_mutex_lock(>lock); + +parallels_check_unclean(bs, res, fix); + +ret = parallels_check_outside_image(bs, res, fix); +if (ret < 0) { +goto out; +} + +ret = parallels_check_leak(bs, res, fix); +if (ret < 0) { +goto out; +} + 
+parallels_collect_statistics(bs, res, fix); out: qemu_co_mutex_unlock(>lock); -- 2.34.1
[PATCH v7 10/10] parallels: Replace qemu_co_mutex_lock by WITH_QEMU_LOCK_GUARD
Replace the way we use mutex in parallels_co_check() for simplier and less error prone code. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev --- block/parallels.c | 33 ++--- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index eacfdea4b6..8943eccbf5 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -561,30 +561,25 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, BDRVParallelsState *s = bs->opaque; int ret; -qemu_co_mutex_lock(>lock); +WITH_QEMU_LOCK_GUARD(>lock) { +parallels_check_unclean(bs, res, fix); -parallels_check_unclean(bs, res, fix); +ret = parallels_check_outside_image(bs, res, fix); +if (ret < 0) { +return ret; +} -ret = parallels_check_outside_image(bs, res, fix); -if (ret < 0) { -goto out; -} +ret = parallels_check_leak(bs, res, fix); +if (ret < 0) { +return ret; +} -ret = parallels_check_leak(bs, res, fix); -if (ret < 0) { -goto out; +parallels_collect_statistics(bs, res, fix); } -parallels_collect_statistics(bs, res, fix); - -out: -qemu_co_mutex_unlock(>lock); - -if (ret == 0) { -ret = bdrv_co_flush(bs); -if (ret < 0) { -res->check_errors++; -} +ret = bdrv_co_flush(bs); +if (ret < 0) { +res->check_errors++; } return ret; -- 2.34.1
[PATCH v7 04/10] parallels: create parallels_set_bat_entry_helper() to assign BAT value
This helper will be reused in next patches during parallels_co_check rework to simplify its code. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/parallels.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index c1ff8bb5f0..52a5cce46c 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -165,6 +165,13 @@ static int64_t block_status(BDRVParallelsState *s, int64_t sector_num, return start_off; } +static void parallels_set_bat_entry(BDRVParallelsState *s, +uint32_t index, uint32_t offset) +{ +s->bat_bitmap[index] = cpu_to_le32(offset); +bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 1); +} + static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { @@ -250,10 +257,8 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, } for (i = 0; i < to_allocate; i++) { -s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier); +parallels_set_bat_entry(s, idx + i, s->data_end / s->off_multiplier); s->data_end += s->tracks; -bitmap_set(s->bat_dirty_bmap, - bat_entry_off(idx + i) / s->bat_dirty_block, 1); } return bat2sect(s, idx) + sector_num % s->tracks; -- 2.34.1
[PATCH v7 08/10] parallels: Move check of leaks to a separate function
We will add more and more checks so we need a better code structure in parallels_co_check. Let each check performs in a separate loop in a separate helper. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev --- block/parallels.c | 84 +-- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index f50cd232aa..1874045c51 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -475,14 +475,14 @@ static int parallels_check_outside_image(BlockDriverState *bs, return 0; } -static int coroutine_fn parallels_co_check(BlockDriverState *bs, - BdrvCheckResult *res, - BdrvCheckMode fix) +static int parallels_check_leak(BlockDriverState *bs, +BdrvCheckResult *res, +BdrvCheckMode fix) { BDRVParallelsState *s = bs->opaque; -int64_t size, prev_off, high_off; -int ret; +int64_t size, off, high_off, count; uint32_t i; +int ret; size = bdrv_getlength(bs->file->bs); if (size < 0) { @@ -490,41 +490,16 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, return size; } -qemu_co_mutex_lock(>lock); - -parallels_check_unclean(bs, res, fix); - -ret = parallels_check_outside_image(bs, res, fix); -if (ret < 0) { -goto out; -} - -res->bfi.total_clusters = s->bat_size; -res->bfi.compressed_clusters = 0; /* compression is not supported */ - high_off = 0; -prev_off = 0; for (i = 0; i < s->bat_size; i++) { -int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS; -if (off == 0) { -prev_off = 0; -continue; -} - -res->bfi.allocated_clusters++; +off = bat2sect(s, i) << BDRV_SECTOR_BITS; if (off > high_off) { high_off = off; } - -if (prev_off != 0 && (prev_off + s->cluster_size) != off) { -res->bfi.fragmented_clusters++; -} -prev_off = off; } res->image_end_offset = high_off + s->cluster_size; if (size > res->image_end_offset) { -int64_t count; count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size); fprintf(stderr, "%s space leaked at the end of the image %" PRId64 "\n", fix & BDRV_FIX_LEAKS ? 
"Repairing" : "ERROR", @@ -542,12 +517,57 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, if (ret < 0) { error_report_err(local_err); res->check_errors++; -goto out; +return ret; } res->leaks_fixed += count; } } +return 0; +} + +static int coroutine_fn parallels_co_check(BlockDriverState *bs, + BdrvCheckResult *res, + BdrvCheckMode fix) +{ +BDRVParallelsState *s = bs->opaque; +int64_t prev_off; +int ret; +uint32_t i; + +qemu_co_mutex_lock(>lock); + +parallels_check_unclean(bs, res, fix); + +ret = parallels_check_outside_image(bs, res, fix); +if (ret < 0) { +goto out; +} + +ret = parallels_check_leak(bs, res, fix); +if (ret < 0) { +goto out; +} + +res->bfi.total_clusters = s->bat_size; +res->bfi.compressed_clusters = 0; /* compression is not supported */ + +prev_off = 0; +for (i = 0; i < s->bat_size; i++) { +int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS; +if (off == 0) { +prev_off = 0; +continue; +} + +res->bfi.allocated_clusters++; + +if (prev_off != 0 && (prev_off + s->cluster_size) != off) { +res->bfi.fragmented_clusters++; +} +prev_off = off; +} + out: qemu_co_mutex_unlock(>lock); -- 2.34.1
Re: [RFC PATCH v2] tests/9p: introduce declarative function calls
Hi Christian, On Thu, 18 Aug 2022 16:13:40 +0200 Christian Schoenebeck wrote: > On Montag, 18. Juli 2022 16:02:31 CEST Christian Schoenebeck wrote: > > On Montag, 18. Juli 2022 15:10:55 CEST Christian Schoenebeck wrote: > > > There are currently 4 different functions for sending a 9p 'Twalk' > > > request. They are all doing the same thing, just in a slightly different > > > way and with slightly different function arguments. > > > > > > Merge those 4 functions into a single function by using a struct for > > > function call arguments and use designated initializers when calling this > > > function to turn usage into a declarative apporach, which is better > > > readable and easier to maintain. > > > > > > Signed-off-by: Christian Schoenebeck > > > --- > > > > > > v1 -> v2: > > > * Also merge low-level function v9fs_twalk(). > > > > > > * Lower case twalk() function name. > > > > > > * Lower case rwalk struct field. > > > > > > * Add result struct TWalkRes. > > > > > > NOTE: I have not separated rwalk struct, because it would have > > > simplified code at one place, but complicated it at another one. > > > > BTW, I also toyed around with virtio-9p-test.c -> virtio-9p-test.cpp, due to > > advantages described in v1 discussion, however there are quite a bunch of C > > header files included which would need refactoring (C++ keywords used like > > 'export', 'class', 'private' and missing type casts from void*). > > > > I also saw no easy way to separate those as alternative (like C low level > > unit, C++ unit on top). so I decided that it was not worth it. > > Not sure if you are on summer vacation right now, but I guess I just go ahead > and convert the rest of the 9p test code in the same way? At least I haven't > seen that you were opposed about the suggested idea in general. > Yeah I was on vacation indeed. Please go ahead. I'll do my best to review. 
Cheers, -- Greg > Best regards, > Christian Schoenebeck > > > > tests/qtest/virtio-9p-test.c | 251 +-- > > > 1 file changed, 154 insertions(+), 97 deletions(-) > > > > > > diff --git a/tests/qtest/virtio-9p-test.c b/tests/qtest/virtio-9p-test.c > > > index 25305a4cf7..69b1c27268 100644 > > > --- a/tests/qtest/virtio-9p-test.c > > > +++ b/tests/qtest/virtio-9p-test.c > > > @@ -72,6 +72,7 @@ static int split(const char *in, const char *delim, char > > > ***out) static void split_free(char ***out) > > > > > > { > > > > > > int i; > > > > > > +if (!*out) return; > > > > > > for (i = 0; (*out)[i]; ++i) { > > > > > > g_free((*out)[i]); > > > > > > } > > > > > > @@ -390,31 +391,6 @@ static void v9fs_rattach(P9Req *req, v9fs_qid *qid) > > > > > > v9fs_req_free(req); > > > > > > } > > > > > > -/* size[4] Twalk tag[2] fid[4] newfid[4] nwname[2] nwname*(wname[s]) */ > > > -static P9Req *v9fs_twalk(QVirtio9P *v9p, uint32_t fid, uint32_t newfid, > > > - uint16_t nwname, char *const wnames[], uint16_t > > > tag) -{ > > > -P9Req *req; > > > -int i; > > > -uint32_t body_size = 4 + 4 + 2; > > > - > > > -for (i = 0; i < nwname; i++) { > > > -uint16_t wname_size = v9fs_string_size(wnames[i]); > > > - > > > -g_assert_cmpint(body_size, <=, UINT32_MAX - wname_size); > > > -body_size += wname_size; > > > -} > > > -req = v9fs_req_init(v9p, body_size, P9_TWALK, tag); > > > -v9fs_uint32_write(req, fid); > > > -v9fs_uint32_write(req, newfid); > > > -v9fs_uint16_write(req, nwname); > > > -for (i = 0; i < nwname; i++) { > > > -v9fs_string_write(req, wnames[i]); > > > -} > > > -v9fs_req_send(req); > > > -return req; > > > -} > > > - > > > > > > /* size[4] Rwalk tag[2] nwqid[2] nwqid*(wqid[13]) */ > > > static void v9fs_rwalk(P9Req *req, uint16_t *nwqid, v9fs_qid **wqid) > > > { > > > > > > @@ -432,6 +408,98 @@ static void v9fs_rwalk(P9Req *req, uint16_t *nwqid, > > > v9fs_qid **wqid) v9fs_req_free(req); > > > > > > } > > > > > > +/* options for 'Twalk' 9p request */ > > > +typedef struct 
TWalkOpt { > > > +/* 9P client being used (mandatory) */ > > > +QVirtio9P *client; > > > +/* user supplied tag number being returned with response (optional) > > > */ > > > +uint16_t tag; > > > +/* file ID of directory from where walk should start (optional) */ > > > +uint32_t fid; > > > +/* file ID for target directory being walked to (optional) */ > > > +uint32_t newfid; > > > +/* low level variant of path to walk to (optional) */ > > > +uint16_t nwname; > > > +char **wnames; > > > +/* high level variant of path to walk to (optional) */ > > > +const char *path; > > > +/* data being received from 9p server as 'Rwalk' response (optional) > > > */ +struct { > > > +uint16_t *nwqid; > > > +v9fs_qid **wqid; > > > +} rwalk; > > > +/* only send Twalk request but not wait for a
[PATCH v7 07/10] parallels: Move check of cluster outside image to a separate function
We will add more and more checks so we need a better code structure in parallels_co_check. Let each check be performed in a separate loop in a separate helper. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev --- block/parallels.c | 59 ++- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index eea318f809..f50cd232aa 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -438,13 +438,50 @@ static void parallels_check_unclean(BlockDriverState *bs, } } +static int parallels_check_outside_image(BlockDriverState *bs, + BdrvCheckResult *res, + BdrvCheckMode fix) +{ +BDRVParallelsState *s = bs->opaque; +uint32_t i; +int64_t off, high_off, size; + +size = bdrv_getlength(bs->file->bs); +if (size < 0) { +res->check_errors++; +return size; +} + +high_off = 0; +for (i = 0; i < s->bat_size; i++) { +off = bat2sect(s, i) << BDRV_SECTOR_BITS; +if (off > size) { +fprintf(stderr, "%s cluster %u is outside image\n", +fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); +res->corruptions++; +if (fix & BDRV_FIX_ERRORS) { +parallels_set_bat_entry(s, i, 0); +res->corruptions_fixed++; +} +continue; +} +if (high_off < off) { +high_off = off; +} +} + +s->data_end = (high_off + s->cluster_size) >> BDRV_SECTOR_BITS; + +return 0; +} + static int coroutine_fn parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { BDRVParallelsState *s = bs->opaque; int64_t size, prev_off, high_off; -int ret = 0; +int ret; uint32_t i; size = bdrv_getlength(bs->file->bs); @@ -457,6 +494,11 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, parallels_check_unclean(bs, res, fix); +ret = parallels_check_outside_image(bs, res, fix); +if (ret < 0) { +goto out; +} + res->bfi.total_clusters = s->bat_size; res->bfi.compressed_clusters = 0; /* compression is not supported */ @@ -469,19 +511,6 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, continue; } -/* cluster outside the image */ -if (off > 
size) { -fprintf(stderr, "%s cluster %u is outside image\n", -fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); -res->corruptions++; -if (fix & BDRV_FIX_ERRORS) { -parallels_set_bat_entry(s, i, 0); -res->corruptions_fixed++; -} -prev_off = 0; -continue; -} - res->bfi.allocated_clusters++; if (off > high_off) { high_off = off; @@ -519,8 +548,6 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, } } -s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS; - out: qemu_co_mutex_unlock(&s->lock); -- 2.34.1
[PATCH v7 06/10] parallels: Move check of unclean image to a separate function
We will add more and more checks so we need a better code structure in parallels_co_check. Let each check be performed in a separate loop in a separate helper. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/parallels.c | 31 +-- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index b4a85b8aa7..eea318f809 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -418,6 +418,25 @@ static coroutine_fn int parallels_co_readv(BlockDriverState *bs, return ret; } +static void parallels_check_unclean(BlockDriverState *bs, +BdrvCheckResult *res, +BdrvCheckMode fix) +{ +BDRVParallelsState *s = bs->opaque; + +if (!s->header_unclean) { +return; +} + +fprintf(stderr, "%s image was not closed correctly\n", +fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR"); +res->corruptions++; +if (fix & BDRV_FIX_ERRORS) { +/* parallels_close will do the job right */ +res->corruptions_fixed++; +s->header_unclean = false; +} +} static int coroutine_fn parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res, @@ -435,16 +454,8 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, } qemu_co_mutex_lock(&s->lock); -if (s->header_unclean) { -fprintf(stderr, "%s image was not closed correctly\n", -fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR"); -res->corruptions++; -if (fix & BDRV_FIX_ERRORS) { -/* parallels_close will do the job right */ -res->corruptions_fixed++; -s->header_unclean = false; -} -} + +parallels_check_unclean(bs, res, fix); res->bfi.total_clusters = s->bat_size; res->bfi.compressed_clusters = 0; /* compression is not supported */ -- 2.34.1
[PATCH v7 11/10] parallels: Incorrect condition in out-of-image check
All the offsets in the BAT must be lower than the file size. Fix the check condition so that the check is correct. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev --- block/parallels.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/parallels.c b/block/parallels.c index 8943eccbf5..e6e8b9e369 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -455,7 +455,7 @@ static int parallels_check_outside_image(BlockDriverState *bs, high_off = 0; for (i = 0; i < s->bat_size; i++) { off = bat2sect(s, i) << BDRV_SECTOR_BITS; -if (off > size) { +if (off >= size) { fprintf(stderr, "%s cluster %u is outside image\n", fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); res->corruptions++; -- 2.34.1
Re: [PATCH v7 02/10] parallels: Fix high_off calculation in parallels_co_check()
Please ignore the patches in this branch.
[PATCH v7 01/10] parallels: Out of image offset in BAT leads to image inflation
data_end field in BDRVParallelsState is set to the biggest offset present in BAT. If this offset is outside of the image, any further write will create the cluster at this offset and/or the image will be truncated to this offset on close. This is definitely not correct. Raise an error in parallels_open() if data_end points outside the image and it is not a check (let the check repair the image). Set data_end to the end of the cluster with the last correct offset. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev --- block/parallels.c | 17 + 1 file changed, 17 insertions(+) diff --git a/block/parallels.c b/block/parallels.c index a229c06f25..93bc2750ef 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -732,6 +732,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, BDRVParallelsState *s = bs->opaque; ParallelsHeader ph; int ret, size, i; +int64_t file_size; QemuOpts *opts = NULL; Error *local_err = NULL; char *buf; @@ -742,6 +743,12 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } +file_size = bdrv_getlength(bs->file->bs); +if (file_size < 0) { +return -EINVAL; +} +file_size >>= BDRV_SECTOR_BITS; + ret = bdrv_pread(bs->file, 0, sizeof(ph), , 0); if (ret < 0) { goto fail; @@ -806,6 +813,16 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, for (i = 0; i < s->bat_size; i++) { int64_t off = bat2sect(s, i); +if (off >= file_size) { +if (flags & BDRV_O_CHECK) { +continue; +} +error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry " + "is larger than file size (%" PRIi64 ")", + off, i, file_size); +ret = -EINVAL; +goto fail; +} if (off >= s->data_end) { s->data_end = off + s->tracks; } -- 2.34.1
[PATCH v7 05/10] parallels: Use generic infrastructure for BAT writing in parallels_co_check()
BAT is written in the context of conventional operations over the image inside bdrv_co_flush() when it calls parallels_co_flush_to_os() callback. Thus we should not modify BAT array directly, but call parallels_set_bat_entry() helper and bdrv_co_flush() further on. After that there is no need to manually write BAT and track its modification. This makes code more generic and allows splitting parallels_set_bat_entry() into independent pieces. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev --- block/parallels.c | 23 ++- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index 52a5cce46c..b4a85b8aa7 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -425,9 +425,8 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, { BDRVParallelsState *s = bs->opaque; int64_t size, prev_off, high_off; -int ret; +int ret = 0; uint32_t i; -bool flush_bat = false; size = bdrv_getlength(bs->file->bs); if (size < 0) { @@ -465,9 +464,8 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); res->corruptions++; if (fix & BDRV_FIX_ERRORS) { -s->bat_bitmap[i] = 0; +parallels_set_bat_entry(s, i, 0); res->corruptions_fixed++; -flush_bat = true; } prev_off = 0; continue; @@ -484,15 +482,6 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, prev_off = off; } -ret = 0; -if (flush_bat) { -ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0); -if (ret < 0) { -res->check_errors++; -goto out; -} -} - res->image_end_offset = high_off + s->cluster_size; if (size > res->image_end_offset) { int64_t count; @@ -523,6 +512,14 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, out: qemu_co_mutex_unlock(&s->lock); + +if (ret == 0) { +ret = bdrv_co_flush(bs); +if (ret < 0) { +res->check_errors++; +} +} + return ret; } -- 2.34.1
[PATCH v7 02/10] parallels: Fix high_off calculation in parallels_co_check()
Don't let high_off be more than the file size even if we don't fix the image. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev --- block/parallels.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/parallels.c b/block/parallels.c index 93bc2750ef..7e8cdbbc3a 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -460,12 +460,12 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); res->corruptions++; if (fix & BDRV_FIX_ERRORS) { -prev_off = 0; s->bat_bitmap[i] = 0; res->corruptions_fixed++; flush_bat = true; -continue; } +prev_off = 0; +continue; } res->bfi.allocated_clusters++; -- 2.34.1
[PATCH v7 03/10] parallels: Fix data_end after out-of-image check
Set data_end to the end of the last cluster inside the image. In such a way we can be sure that corrupted offsets in the BAT can't affect the image size. Signed-off-by: Alexander Ivanov Reviewed-by: Denis V. Lunev --- block/parallels.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/parallels.c b/block/parallels.c index 7e8cdbbc3a..c1ff8bb5f0 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -514,6 +514,8 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs, } } +s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS; + out: qemu_co_mutex_unlock(&s->lock); return ret; -- 2.34.1