Pierrick Bouvier <pierrick.bouv...@linaro.org> writes: (Added Philip to CC)
> On 5/28/24 12:57, Alex Bennée wrote: >> Pierrick Bouvier <pierrick.bouv...@linaro.org> writes: >> >>> On 5/28/24 12:14, Alex Bennée wrote: >>>> Pierrick Bouvier <pierrick.bouv...@linaro.org> writes: >>>> >>>>> This plugin uses the new time control interface to make decisions >>>>> about the state of time during the emulation. The algorithm is >>>>> currently very simple. The user specifies an ips rate which applies >>>>> per core. <snip> >>>>> +static void vcpu_init(qemu_plugin_id_t id, unsigned int cpu_index) >>>>> +{ >>>>> + vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index); >>>>> + /* ensure start time is set first */ >>>>> + set_start_time(); >>>>> + /* start counter from absolute time reference */ >>>>> + vcpu->counter = num_insn_during(uptime_ns()); >>>>> + vcpu_set_state(vcpu, EXECUTING); >>>>> +} >>>>> + >>>>> +static void vcpu_idle(qemu_plugin_id_t id, unsigned int cpu_index) >>>>> +{ >>>>> + vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index); >>>>> + vcpu_set_state(vcpu, IDLE); >>>>> +} >>>>> + >>>>> +static void vcpu_resume(qemu_plugin_id_t id, unsigned int cpu_index) >>>>> +{ >>>>> + vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index); >>>>> + g_assert(vcpu->state == IDLE); >>>> I'm triggering a weird race here: >>>> (gdb) b vcpu_init >>>> Breakpoint 1 at 0x7ffff7fa15f7: file >>>> /home/alex/lsrc/qemu.git/contrib/plugins/ips.c, line 127. >>>> (gdb) r >>>> The program being debugged has been started already. >>>> Start it from the beginning? (y or n) y >>>> Starting program: >>>> /home/alex/lsrc/qemu.git/builds/arm.debug/qemu-system-aarch64 >>>> -machine type=virt,virtualization=on,pflash0=rom,pflash1=efivars >>>> -cpu cortex-a57 -smp 32 -accel tcg -device >>>> virtio-net-pci,netdev=unet -device virtio-scsi-pci -device >>>> scsi-hd,drive=hd -netdev user,id=unet,hostfwd=tcp::2222-:22 >>>> -blockdev >>>> driver=raw,node-name=hd,file.driver=host_device,file.filename=/dev/zen-ssd2/trixie-arm64,discard=unmap >>>> -serial mon:stdio -blockdev >>>> node-name=rom,driver=file,filename=/home/alex/lsrc/qemu.git/builds/arm.debug/pc-bios/edk2-aarch64-code.fd,read-only=true >>>> -blockdev >>>> node-name=efivars,driver=file,filename=/home/alex/images/qemu-arm64-efivars >>>> -m 8192 -object memory-backend-memfd,id=mem,size=8G,share=on -kernel >>>> /home/alex/lsrc/linux.git/builds/arm64/arch/arm64/boot/Image -append >>>> root=/dev/sda2\ console=ttyAMA0 -plugin >>>> contrib/plugins/libips.so,ips=1000000000 >>>> [Thread debugging using libthread_db enabled] >>>> Using host libthread_db library >>>> "/lib/x86_64-linux-gnu/libthread_db.so.1". >>>> [New Thread 0x7fffe72006c0 (LWP 360538)] >>>> [New Thread 0x7fffe68006c0 (LWP 360540)] >>>> [New Thread 0x7fffe5e006c0 (LWP 360541)] >>>> [New Thread 0x7fffe54006c0 (LWP 360542)] >>>> [New Thread 0x7fffe4a006c0 (LWP 360543)] >>>> [New Thread 0x7fffdfe006c0 (LWP 360544)] >>>> [New Thread 0x7fffdf4006c0 (LWP 360545)] >>>> [New Thread 0x7fffdea006c0 (LWP 360546)] >>>> [Switching to Thread 0x7fffdf4006c0 (LWP 360545)] >>>> Thread 8 "qemu-system-aar" hit Breakpoint 1, vcpu_init >>>> (id=10457908569352202058, cpu_index=0) at >>>> /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127 >>>> 127 vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, >>>> cpu_index); >>>> (gdb) c >>>> Continuing. >>>> [New Thread 0x7fffde0006c0 (LWP 360548)] >>>> [Switching to Thread 0x7fffdea006c0 (LWP 360546)] >>>> Thread 9 "qemu-system-aar" hit Breakpoint 1, vcpu_init >>>> (id=10457908569352202058, cpu_index=1) at >>>> /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127 >>>> 127 vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, >>>> cpu_index); >>>> (gdb) >>>> Continuing. >>>> [New Thread 0x7fffdd6006c0 (LWP 360549)] >>>> [Switching to Thread 0x7fffde0006c0 (LWP 360548)] >>>> Thread 10 "qemu-system-aar" hit Breakpoint 1, vcpu_init >>>> (id=10457908569352202058, cpu_index=2) at >>>> /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127 >>>> 127 vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, >>>> cpu_index); >>>> (gdb) >>>> Continuing. >>>> [New Thread 0x7fffdcc006c0 (LWP 360550)] >>>> [Switching to Thread 0x7fffdd6006c0 (LWP 360549)] >>>> Thread 11 "qemu-system-aar" hit Breakpoint 1, vcpu_init >>>> (id=10457908569352202058, cpu_index=3) at >>>> /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127 >>>> 127 vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, >>>> cpu_index); >>>> (gdb) >>>> Continuing. >>>> [New Thread 0x7fffd3e006c0 (LWP 360551)] >>>> [Switching to Thread 0x7fffdcc006c0 (LWP 360550)] >>>> Thread 12 "qemu-system-aar" hit Breakpoint 1, vcpu_init >>>> (id=10457908569352202058, cpu_index=4) at >>>> /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127 >>>> 127 vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, >>>> cpu_index); >>>> (gdb) n >>>> 129 set_start_time(); >>>> (gdb) >>>> 131 vcpu->counter = num_insn_during(uptime_ns()); >>>> (gdb) >>>> 132 vcpu_set_state(vcpu, EXECUTING); >>>> (gdb) >>>> 133 } >>>> (gdb) p vcpu->state >>>> $1 = EXECUTING >>>> (gdb) p &vcpu->state >>>> $2 = (vCPUState *) 0x555557c6b5d0 >>>> (gdb) watch *(vCPUState *) 0x555557c6b5d0 >>>> Hardware watchpoint 2: *(vCPUState *) 0x555557c6b5d0 >>>> (gdb) c >>>> Continuing. >>>> [Thread 0x7fffdfe006c0 (LWP 360544) exited] >>>> [Thread 0x7fffe5e006c0 (LWP 360541) exited] >>>> Thread 12 "qemu-system-aar" hit Hardware watchpoint 2: >>>> *(vCPUState *) 0x555557c6b5d0 >>>> Old value = EXECUTING >>>> New value = IDLE >>>> vcpu_set_state (vcpu=0x555557c6b5c0, new_state=IDLE) at >>>> /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:83 >>>> 83 } >>>> (gdb) c >>>> Continuing. >>>> [New Thread 0x7fffdfe006c0 (LWP 360591)] >>>> [New Thread 0x7fffe5e006c0 (LWP 360592)] >>>> [New Thread 0x7fffd34006c0 (LWP 360593)] >>>> [Switching to Thread 0x7fffdfe006c0 (LWP 360591)] >>>> Thread 14 "qemu-system-aar" hit Breakpoint 1, vcpu_init >>>> (id=10457908569352202058, cpu_index=6) at >>>> /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127 >>>> 127 vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, >>>> cpu_index); >>>> (gdb) >>>> Continuing. >>>> [New Thread 0x7fffd2a006c0 (LWP 360594)] >>>> ** >>>> ERROR:/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:144:vcpu_resume: >>>> assertion failed: (vcpu->state == IDLE) >>>> Bail out! >>>> ERROR:/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:144:vcpu_resume: >>>> assertion failed: (vcpu->state == IDLE) >>>> Thread 13 "qemu-system-aar" received signal SIGABRT, Aborted. >>>> [Switching to Thread 0x7fffd3e006c0 (LWP 360551)] >>>> __pthread_kill_implementation (threadid=<optimized out>, >>>> signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44 >>>> 44 ./nptl/pthread_kill.c: No such file or directory. >>>> (gdb) bt >>>> #0 __pthread_kill_implementation (threadid=<optimized out>, >>>> signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44 >>>> #1 0x00007ffff4ca9e8f in __pthread_kill_internal (signo=6, >>>> threadid=<optimized out>) at ./nptl/pthread_kill.c:78 >>>> #2 0x00007ffff4c5afb2 in __GI_raise (sig=sig@entry=6) at >>>> ../sysdeps/posix/raise.c:26 >>>> #3 0x00007ffff4c45472 in __GI_abort () at ./stdlib/abort.c:79 >>>> #4 0x00007ffff6e46ec8 in () at /lib/x86_64-linux-gnu/libglib-2.0.so.0 >>>> #5 0x00007ffff6ea6e1a in g_assertion_message_expr () at >>>> /lib/x86_64-linux-gnu/libglib-2.0.so.0 >>>> #6 0x00007ffff7fa16e2 in vcpu_resume (id=10457908569352202058, >>>> cpu_index=5) at /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:144 >>>> #7 0x00005555562c3202 in plugin_vcpu_cb__simple (cpu=0x5555583d9540, >>>> ev=QEMU_PLUGIN_EV_VCPU_RESUME) at ../../plugins/core.c:111 >>>> #8 0x00005555562c43f7 in qemu_plugin_vcpu_resume_cb >>>> (cpu=0x5555583d9540) at ../../plugins/core.c:523 >>>> #9 0x0000555555d9150e in qemu_wait_io_event (cpu=0x5555583d9540) at >>>> ../../system/cpus.c:465 >>>> #10 0x00005555562c67d0 in mttcg_cpu_thread_fn (arg=0x5555583d9540) at >>>> ../../accel/tcg/tcg-accel-ops-mttcg.c:118 >>>> #11 0x00005555565120ff in qemu_thread_start (args=0x5555584716e0) at >>>> ../../util/qemu-thread-posix.c:541 >>>> #12 0x00007ffff4ca8134 in start_thread (arg=<optimized out>) at >>>> ./nptl/pthread_create.c:442 >>>> #13 0x00007ffff4d287dc in clone3 () at >>>> ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81 >>>> (gdb) >>>> But I don't understand how we can ever hit the idle callback without >>>> first hitting the init callback. >>>> >>> >>> More exactly, the assert we hit means that the idle callback, was not >>> called before the resume callback. >>> Any chance you can check what is the current vcpu->state value? I >>> wonder if it's not an exited cpu, that gets resumed after. >> No when I looked at it it was set as UNKNOWN. Unfortunately it >> doesn't >> trigger if I stick breakpoints in or run under rr. >> > > By using rr record --chaos mode maybe? Ahh yes that triggered it. It looks like we have a race: (rr) c Continuing. Thread 1 hit Hardware watchpoint 7: *(int *) 0x5559ba6899b0 Old value = -1 New value = 10 machvirt_init (machine=0x5559b9ac3c00) at ../../hw/arm/virt.c:2214 2214 numa_cpu_pre_plug(&possible_cpus->cpus[cs->cpu_index], DEVICE(cpuobj), (rr) c Continuing. [New Thread 650125.650142] [Switching to Thread 650125.650142] Thread 44 hit Breakpoint 8, mttcg_cpu_thread_fn (arg=0x5559ba6896e0) at ../../accel/tcg/tcg-accel-ops-mttcg.c:70 70 assert(tcg_enabled()); (rr) p cpu->cpu_index $24 = 10 (rr) c Continuing. [Switching to Thread 650125.650125] Thread 1 hit Breakpoint 10, cpu_common_realizefn (dev=0x5559ba6896e0, errp=0x7fff02322c10) at ../../hw/core/cpu-common.c:205 205 Object *machine = qdev_get_machine(); (rr) p cpu->cpu_index $25 = 10 (rr) bt #0 cpu_common_realizefn (dev=0x5559ba6896e0, errp=0x7fff02322c10) at ../../hw/core/cpu-common.c:205 #1 0x00005559b606e6a5 in arm_cpu_realizefn (dev=0x5559ba6896e0, errp=0x7fff02322c10) at ../../target/arm/cpu.c:2552 #2 0x00005559b63dd5d0 in device_set_realized (obj=0x5559ba6896e0, value=true, errp=0x7fff02322d20) at ../../hw/core/qdev.c:510 #3 0x00005559b63e84e7 in property_set_bool (obj=0x5559ba6896e0, v=0x5559ba681540, name=0x5559b68fafe1 "realized", opaque=0x5559b97f73c0, errp=0x7fff02322d20) at ../../qom/object.c:2354 #4 0x00005559b63e6065 in object_property_set (obj=0x5559ba6896e0, name=0x5559b68fafe1 "realized", v=0x5559ba681540, errp=0x7fff02322d20) at ../../qom/object.c:1463 #5 0x00005559b63ead1d in object_property_set_qobject (obj=0x5559ba6896e0, name=0x5559b68fafe1 "realized", value=0x5559ba681420, errp=0x5559b7810320 <error_fatal>) at ../../qom/qom-qobject.c:28 #6 0x00005559b63e640a in object_property_set_bool (obj=0x5559ba6896e0, name=0x5559b68fafe1 "realized", value=true, errp=0x5559b7810320 <error_fatal>) at ../../qom/object.c:1533 #7 0x00005559b63dccbd in qdev_realize (dev=0x5559ba6896e0, bus=0x0, errp=0x5559b7810320 <error_fatal>) at ../../hw/core/qdev.c:291 #8 0x00005559b5fe4243 in machvirt_init (machine=0x5559b9ac3c00) at ../../hw/arm/virt.c:2295 #9 0x00005559b5ada5d6 in machine_run_board_init (machine=0x5559b9ac3c00, mem_path=0x0, errp=0x7fff02322f30) at ../../hw/core/machine.c:1576 #10 0x00005559b5eb416f in qemu_init_board () at ../../system/vl.c:2621 #11 0x00005559b5eb4482 in qmp_x_exit_preconfig (errp=0x5559b7810320 <error_fatal>) at ../../system/vl.c:2713 #12 0x00005559b5eb7088 in qemu_init (argc=35, argv=0x7fff02323268) at ../../system/vl.c:3759 #13 0x00005559b63d7ff2 in main (argc=35, argv=0x7fff02323268) at ../../system/main.c:47 Because looking at the order things happen in the specific CPU realisation: qemu_init_vcpu(cs); cpu_reset(cs); acc->parent_realize(dev, errp); And the common realize function doesn't get to do: /* Plugin initialization must wait until the cpu start executing code */ #ifdef CONFIG_PLUGIN if (tcg_enabled()) { cpu->plugin_state = qemu_plugin_create_vcpu_state(); async_run_on_cpu(cpu, qemu_plugin_vcpu_init__async, RUN_ON_CPU_NULL); } #endif before we've started executing..... -- Alex Bennée Virtualisation Tech Lead @ Linaro