On 5/29/24 05:13, Alex Bennée wrote:
Pierrick Bouvier <pierrick.bouv...@linaro.org> writes:

(Added Philip to CC)

On 5/28/24 12:57, Alex Bennée wrote:
Pierrick Bouvier <pierrick.bouv...@linaro.org> writes:

On 5/28/24 12:14, Alex Bennée wrote:
Pierrick Bouvier <pierrick.bouv...@linaro.org> writes:

This plugin uses the new time control interface to make decisions
about the state of time during the emulation. The algorithm is
currently very simple. The user specifies an ips rate which applies
per core.

<snip>

+static void vcpu_init(qemu_plugin_id_t id, unsigned int cpu_index)
+{
+    vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
+    /* ensure start time is set first */
+    set_start_time();
+    /* start counter from absolute time reference */
+    vcpu->counter = num_insn_during(uptime_ns());
+    vcpu_set_state(vcpu, EXECUTING);
+}
+
+static void vcpu_idle(qemu_plugin_id_t id, unsigned int cpu_index)
+{
+    vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
+    vcpu_set_state(vcpu, IDLE);
+}
+
+static void vcpu_resume(qemu_plugin_id_t id, unsigned int cpu_index)
+{
+    vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
+    g_assert(vcpu->state == IDLE);
I'm triggering a weird race here:
     (gdb) b vcpu_init
     Breakpoint 1 at 0x7ffff7fa15f7: file 
/home/alex/lsrc/qemu.git/contrib/plugins/ips.c, line 127.
     (gdb) r
     The program being debugged has been started already.
     Start it from the beginning? (y or n) y
     Starting program:
/home/alex/lsrc/qemu.git/builds/arm.debug/qemu-system-aarch64
-machine type=virt,virtualization=on,pflash0=rom,pflash1=efivars
-cpu cortex-a57 -smp 32 -accel tcg -device
virtio-net-pci,netdev=unet -device virtio-scsi-pci -device
scsi-hd,drive=hd -netdev user,id=unet,hostfwd=tcp::2222-:22
-blockdev
driver=raw,node-name=hd,file.driver=host_device,file.filename=/dev/zen-ssd2/trixie-arm64,discard=unmap
-serial mon:stdio -blockdev
node-name=rom,driver=file,filename=/home/alex/lsrc/qemu.git/builds/arm.debug/pc-bios/edk2-aarch64-code.fd,read-only=true
-blockdev
node-name=efivars,driver=file,filename=/home/alex/images/qemu-arm64-efivars
-m 8192 -object memory-backend-memfd,id=mem,size=8G,share=on -kernel
/home/alex/lsrc/linux.git/builds/arm64/arch/arm64/boot/Image -append
root=/dev/sda2\ console=ttyAMA0 -plugin
contrib/plugins/libips.so,ips=1000000000
     [Thread debugging using libthread_db enabled]
     Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
     [New Thread 0x7fffe72006c0 (LWP 360538)]
     [New Thread 0x7fffe68006c0 (LWP 360540)]
     [New Thread 0x7fffe5e006c0 (LWP 360541)]
     [New Thread 0x7fffe54006c0 (LWP 360542)]
     [New Thread 0x7fffe4a006c0 (LWP 360543)]
     [New Thread 0x7fffdfe006c0 (LWP 360544)]
     [New Thread 0x7fffdf4006c0 (LWP 360545)]
     [New Thread 0x7fffdea006c0 (LWP 360546)]
     [Switching to Thread 0x7fffdf4006c0 (LWP 360545)]
     Thread 8 "qemu-system-aar" hit Breakpoint 1, vcpu_init
(id=10457908569352202058, cpu_index=0) at
/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127
     127         vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
     (gdb) c
     Continuing.
     [New Thread 0x7fffde0006c0 (LWP 360548)]
     [Switching to Thread 0x7fffdea006c0 (LWP 360546)]
     Thread 9 "qemu-system-aar" hit Breakpoint 1, vcpu_init
(id=10457908569352202058, cpu_index=1) at
/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127
     127         vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
     (gdb)
     Continuing.
     [New Thread 0x7fffdd6006c0 (LWP 360549)]
     [Switching to Thread 0x7fffde0006c0 (LWP 360548)]
     Thread 10 "qemu-system-aar" hit Breakpoint 1, vcpu_init
(id=10457908569352202058, cpu_index=2) at
/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127
     127         vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
     (gdb)
     Continuing.
     [New Thread 0x7fffdcc006c0 (LWP 360550)]
     [Switching to Thread 0x7fffdd6006c0 (LWP 360549)]
     Thread 11 "qemu-system-aar" hit Breakpoint 1, vcpu_init
(id=10457908569352202058, cpu_index=3) at
/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127
     127         vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
     (gdb)
     Continuing.
     [New Thread 0x7fffd3e006c0 (LWP 360551)]
     [Switching to Thread 0x7fffdcc006c0 (LWP 360550)]
     Thread 12 "qemu-system-aar" hit Breakpoint 1, vcpu_init
(id=10457908569352202058, cpu_index=4) at
/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127
     127         vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
     (gdb) n
     129         set_start_time();
     (gdb)
     131         vcpu->counter = num_insn_during(uptime_ns());
     (gdb)
     132         vcpu_set_state(vcpu, EXECUTING);
     (gdb)
     133     }
     (gdb) p vcpu->state
     $1 = EXECUTING
     (gdb) p &vcpu->state
     $2 = (vCPUState *) 0x555557c6b5d0
     (gdb) watch *(vCPUState *) 0x555557c6b5d0
     Hardware watchpoint 2: *(vCPUState *) 0x555557c6b5d0
     (gdb) c
     Continuing.
     [Thread 0x7fffdfe006c0 (LWP 360544) exited]
     [Thread 0x7fffe5e006c0 (LWP 360541) exited]
     Thread 12 "qemu-system-aar" hit Hardware watchpoint 2:
*(vCPUState *) 0x555557c6b5d0
     Old value = EXECUTING
     New value = IDLE
     vcpu_set_state (vcpu=0x555557c6b5c0, new_state=IDLE) at 
/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:83
     83      }
     (gdb) c
     Continuing.
     [New Thread 0x7fffdfe006c0 (LWP 360591)]
     [New Thread 0x7fffe5e006c0 (LWP 360592)]
     [New Thread 0x7fffd34006c0 (LWP 360593)]
     [Switching to Thread 0x7fffdfe006c0 (LWP 360591)]
     Thread 14 "qemu-system-aar" hit Breakpoint 1, vcpu_init
(id=10457908569352202058, cpu_index=6) at
/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:127
     127         vCPUTime *vcpu = qemu_plugin_scoreboard_find(vcpus, cpu_index);
     (gdb)
     Continuing.
     [New Thread 0x7fffd2a006c0 (LWP 360594)]
     **
     ERROR:/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:144:vcpu_resume: 
assertion failed: (vcpu->state == IDLE)
     Bail out! 
ERROR:/home/alex/lsrc/qemu.git/contrib/plugins/ips.c:144:vcpu_resume: assertion 
failed: (vcpu->state == IDLE)
     Thread 13 "qemu-system-aar" received signal SIGABRT, Aborted.
     [Switching to Thread 0x7fffd3e006c0 (LWP 360551)]
     __pthread_kill_implementation (threadid=<optimized out>, 
signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44
     44      ./nptl/pthread_kill.c: No such file or directory.
     (gdb) bt
     #0  __pthread_kill_implementation (threadid=<optimized out>, 
signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44
     #1  0x00007ffff4ca9e8f in __pthread_kill_internal (signo=6, 
threadid=<optimized out>) at ./nptl/pthread_kill.c:78
     #2  0x00007ffff4c5afb2 in __GI_raise (sig=sig@entry=6) at 
../sysdeps/posix/raise.c:26
     #3  0x00007ffff4c45472 in __GI_abort () at ./stdlib/abort.c:79
     #4  0x00007ffff6e46ec8 in  () at /lib/x86_64-linux-gnu/libglib-2.0.so.0
     #5  0x00007ffff6ea6e1a in g_assertion_message_expr () at 
/lib/x86_64-linux-gnu/libglib-2.0.so.0
     #6  0x00007ffff7fa16e2 in vcpu_resume (id=10457908569352202058, 
cpu_index=5) at /home/alex/lsrc/qemu.git/contrib/plugins/ips.c:144
     #7  0x00005555562c3202 in plugin_vcpu_cb__simple (cpu=0x5555583d9540, 
ev=QEMU_PLUGIN_EV_VCPU_RESUME) at ../../plugins/core.c:111
     #8  0x00005555562c43f7 in qemu_plugin_vcpu_resume_cb (cpu=0x5555583d9540) 
at ../../plugins/core.c:523
     #9  0x0000555555d9150e in qemu_wait_io_event (cpu=0x5555583d9540) at 
../../system/cpus.c:465
     #10 0x00005555562c67d0 in mttcg_cpu_thread_fn (arg=0x5555583d9540) at 
../../accel/tcg/tcg-accel-ops-mttcg.c:118
     #11 0x00005555565120ff in qemu_thread_start (args=0x5555584716e0) at 
../../util/qemu-thread-posix.c:541
     #12 0x00007ffff4ca8134 in start_thread (arg=<optimized out>) at 
./nptl/pthread_create.c:442
     #13 0x00007ffff4d287dc in clone3 () at 
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
     (gdb)
But I don't understand how we can ever hit the idle callback without
first hitting the init callback.


More exactly, the assert we hit means that the idle callback, was not
called before the resume callback.
Any chance you can check what is the current vcpu->state value? I
wonder if it's not an exited cpu, that gets resumed after.
No when I looked at it it was set as UNKNOWN. Unfortunately it
doesn't
trigger if I stick breakpoints in or run under rr.


By using rr record --chaos mode maybe?

Ahh yes that triggered it.

It looks like we have a race:

   (rr) c
   Continuing.

   Thread 1 hit Hardware watchpoint 7: *(int *) 0x5559ba6899b0

   Old value = -1
   New value = 10
   machvirt_init (machine=0x5559b9ac3c00) at ../../hw/arm/virt.c:2214
   2214            numa_cpu_pre_plug(&possible_cpus->cpus[cs->cpu_index], 
DEVICE(cpuobj),
   (rr) c
   Continuing.
   [New Thread 650125.650142]
   [Switching to Thread 650125.650142]

   Thread 44 hit Breakpoint 8, mttcg_cpu_thread_fn (arg=0x5559ba6896e0) at 
../../accel/tcg/tcg-accel-ops-mttcg.c:70
   70          assert(tcg_enabled());
   (rr) p cpu->cpu_index
   $24 = 10
   (rr) c
   Continuing.
   [Switching to Thread 650125.650125]

   Thread 1 hit Breakpoint 10, cpu_common_realizefn (dev=0x5559ba6896e0, 
errp=0x7fff02322c10) at ../../hw/core/cpu-common.c:205
   205         Object *machine = qdev_get_machine();
   (rr) p cpu->cpu_index
   $25 = 10
   (rr) bt
   #0  cpu_common_realizefn (dev=0x5559ba6896e0, errp=0x7fff02322c10) at 
../../hw/core/cpu-common.c:205
   #1  0x00005559b606e6a5 in arm_cpu_realizefn (dev=0x5559ba6896e0, 
errp=0x7fff02322c10) at ../../target/arm/cpu.c:2552
   #2  0x00005559b63dd5d0 in device_set_realized (obj=0x5559ba6896e0, 
value=true, errp=0x7fff02322d20) at ../../hw/core/qdev.c:510
   #3  0x00005559b63e84e7 in property_set_bool (obj=0x5559ba6896e0, v=0x5559ba681540, 
name=0x5559b68fafe1 "realized", opaque=0x5559b97f73c0, errp=0x7fff02322d20)
       at ../../qom/object.c:2354
   #4  0x00005559b63e6065 in object_property_set (obj=0x5559ba6896e0, name=0x5559b68fafe1 
"realized", v=0x5559ba681540, errp=0x7fff02322d20) at ../../qom/object.c:1463
   #5  0x00005559b63ead1d in object_property_set_qobject (obj=0x5559ba6896e0, 
name=0x5559b68fafe1 "realized", value=0x5559ba681420, errp=0x5559b7810320 
<error_fatal>)
       at ../../qom/qom-qobject.c:28
   #6  0x00005559b63e640a in object_property_set_bool (obj=0x5559ba6896e0, name=0x5559b68fafe1 
"realized", value=true, errp=0x5559b7810320 <error_fatal>)
       at ../../qom/object.c:1533
   #7  0x00005559b63dccbd in qdev_realize (dev=0x5559ba6896e0, bus=0x0, 
errp=0x5559b7810320 <error_fatal>) at ../../hw/core/qdev.c:291
   #8  0x00005559b5fe4243 in machvirt_init (machine=0x5559b9ac3c00) at 
../../hw/arm/virt.c:2295
   #9  0x00005559b5ada5d6 in machine_run_board_init (machine=0x5559b9ac3c00, 
mem_path=0x0, errp=0x7fff02322f30) at ../../hw/core/machine.c:1576
   #10 0x00005559b5eb416f in qemu_init_board () at ../../system/vl.c:2621
   #11 0x00005559b5eb4482 in qmp_x_exit_preconfig (errp=0x5559b7810320 
<error_fatal>) at ../../system/vl.c:2713
   #12 0x00005559b5eb7088 in qemu_init (argc=35, argv=0x7fff02323268) at 
../../system/vl.c:3759
   #13 0x00005559b63d7ff2 in main (argc=35, argv=0x7fff02323268) at 
../../system/main.c:47

Because looking at the order things happen in the specific CPU realisation:

     qemu_init_vcpu(cs);
     cpu_reset(cs);

     acc->parent_realize(dev, errp);

And the common realize function doesn't get to do:

     /* Plugin initialization must wait until the cpu start executing code */
#ifdef CONFIG_PLUGIN
     if (tcg_enabled()) {
         cpu->plugin_state = qemu_plugin_create_vcpu_state();
         async_run_on_cpu(cpu, qemu_plugin_vcpu_init__async, RUN_ON_CPU_NULL);
     }
#endif

before we've started executing.....


Thanks for the analysis Alex.
It seems to me this is integrating the recent series from Philippe.
Maybe the race has been introduced there.
Could you try without this series?

Reply via email to