Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-05-06 Thread Anthony Harivel
Anthony Harivel, Apr 26, 2024 at 10:36:
>
> Hi Paolo,
>
> Daniel P. Berrangé, Apr 25, 2024 at 17:42:
> > On Thu, Apr 25, 2024 at 05:34:52PM +0200, Anthony Harivel wrote:
> > > Hi Daniel,
> > > 
> > > Daniel P. Berrangé, Apr 18, 2024 at 18:42:
> > > 
> > > > > +if (kvm_is_rapl_feat_enable(cs)) {
> > > > > +if (!IS_INTEL_CPU(env)) {
> > > > > +error_setg(errp, "RAPL feature can only be\
> > > > > +  enabled with Intel CPU models");
> > > > > +return false;
> > > > > +}
> > > > > +}
> > > >
> > > > I see a crash in kvm_is_rapl_feat_enable() from this caller,
> > > > when I run with this kind of command line:
> > > >
> > > >  $ qemu-system-x86_64 \
> > > >   -kernel /lib/modules/6.6.9-100.fc38.x86_64/vmlinuz \
> > > >   -initrd tiny-initrd.img  -m 2000 -serial stdio -nodefaults \
> > > >   -display none -accel kvm -append "console=ttyS0 quiet"
> > > >
> > > >
> > > > #0  0x55bc14b7 in kvm_is_rapl_feat_enable 
> > > > (cs=cs@entry=0x57b83470) at ../target/i386/kvm/kvm.c:2531
> > > > #1  0x55bc7534 in kvm_cpu_realizefn (cs=0x57b83470, 
> > > > errp=0x7fffd2a0) at ../target/i386/kvm/kvm-cpu.c:54
> > > > #2  0x55d2432a in accel_cpu_common_realize (cpu=0x57b83470, 
> > > > errp=0x7fffd2a0) at ../accel/accel-target.c:130
> > > > #3  0x55cdd955 in cpu_exec_realizefn 
> > > > (cpu=cpu@entry=0x57b83470, errp=errp@entry=0x7fffd2a0) at 
> > > > ../cpu-target.c:137
> > > > #4  0x55c14b89 in x86_cpu_realizefn (dev=0x57b83470, 
> > > > errp=0x7fffd310) at ../target/i386/cpu.c:7320
> > > > #5  0x55d58f4b in device_set_realized (obj=, 
> > > > value=, errp=0x7fffd390) at ../hw/core/qdev.c:510
> > > > #6  0x55d5d78d in property_set_bool (obj=0x57b83470, 
> > > > v=, name=, opaque=0x578558e0, 
> > > > errp=0x7fffd390)
> > > > at ../qom/object.c:2358
> > > > #7  0x55d60b0b in object_property_set 
> > > > (obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 
> > > > "realized", v=v@entry=0x57b8ccb0, errp=0x7fffd390, 
> > > > errp@entry=0x56e210d8 ) at ../qom/object.c:1472
> > > > #8  0x55d6444f in object_property_set_qobject
> > > > (obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 
> > > > "realized", value=value@entry=0x57854800, 
> > > > errp=errp@entry=0x56e210d8 )
> > > > at ../qom/qom-qobject.c:28
> > > > #9  0x55d61174 in object_property_set_bool
> > > > (obj=0x57b83470, name=name@entry=0x5607c799 "realized", 
> > > > value=value@entry=true, errp=errp@entry=0x56e210d8 ) 
> > > > at ../qom/object.c:1541
> > > > #10 0x55d59a3c in qdev_realize (dev=, 
> > > > bus=bus@entry=0x0, errp=errp@entry=0x56e210d8 ) at 
> > > > ../hw/core/qdev.c:292
> > > > #11 0x55bd51e0 in x86_cpu_new (x86ms=, 
> > > > apic_id=0, errp=0x56e210d8 ) at ../hw/i386/x86.c:105
> > > > #12 0x55bd52ce in x86_cpus_init 
> > > > (x86ms=x86ms@entry=0x57aaed30, default_cpu_version=) 
> > > > at ../hw/i386/x86.c:156
> > > > #13 0x55bdc1a7 in pc_init1 (machine=0x57aaed30, 
> > > > pci_type=0x5604aa61 "i440FX") at ../hw/i386/pc_piix.c:185
> > > > #14 0x55947a11 in machine_run_board_init 
> > > > (machine=0x57aaed30, mem_path=, errp= > > > out>, errp@entry=0x56e210d8 )
> > > > at ../hw/core/machine.c:1547
> > > > #15 0x55b020ed in qemu_init_board () at ../system/vl.c:2613
> > > > #16 qmp_x_exit_preconfig (errp=0x56e210d8 ) at 
> > > > ../system/vl.c:2705
> > > > #17 0x55b0611e in qemu_init (argc=, 
> > > > argv=) at ../system/vl.c:3739
> > > > #18 0x55897ca9 in main (argc=, argv= > > > out>) at ../system/main.c:47
> > > >
> > > > The problem is that 'cs->kvm_state' is NULL here
> > > >
> > > 
> > > After some investigation it seems that kvm_state is not yet committed 
> > > at this point. Shame, because GDB showed me that we have already pass 
> > > the kvm_accel_instance_init() in accel/kvm/kvm-all.c that sets the 
> > > value "msr_energy.enable" in kvm_state...
> > > 
> > > So should I dig more to still do the sanity check in kvm_cpu_realizefn() 
> > > or should I already move the RAPL feature  from -kvm to -cpu 
> > > like suggested by Zhao from Intel and then access it from the CPUState ?
> >
> > I'm not so sure about that question. I think Paolo is best placed
> > to suggest which is better, as the KVM maintainer.
> >
>
> I'm facing an issue that either require a simple change or a more 
> complex one depending on the decision.
>
> TL;DR: Should I move from -kvm to -cpu the rapl feature ? 
>
> Zhao from Intel suggested me that enabling the rapl feature looks more 
> natural than through -kvm feature because it is indeed a cpu feature. 
>
> From the point of view of the QEMU architecture, it's more easier to 
> enable the feature with -kvm because it is kvm dependent; but 

Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-26 Thread Anthony Harivel


Hi Paolo,

Daniel P. Berrangé, Apr 25, 2024 at 17:42:
> On Thu, Apr 25, 2024 at 05:34:52PM +0200, Anthony Harivel wrote:
> > Hi Daniel,
> > 
> > Daniel P. Berrangé, Apr 18, 2024 at 18:42:
> > 
> > > > +if (kvm_is_rapl_feat_enable(cs)) {
> > > > +if (!IS_INTEL_CPU(env)) {
> > > > +error_setg(errp, "RAPL feature can only be\
> > > > +  enabled with Intel CPU models");
> > > > +return false;
> > > > +}
> > > > +}
> > >
> > > I see a crash in kvm_is_rapl_feat_enable() from this caller,
> > > when I run with this kind of command line:
> > >
> > >  $ qemu-system-x86_64 \
> > >   -kernel /lib/modules/6.6.9-100.fc38.x86_64/vmlinuz \
> > >   -initrd tiny-initrd.img  -m 2000 -serial stdio -nodefaults \
> > >   -display none -accel kvm -append "console=ttyS0 quiet"
> > >
> > >
> > > #0  0x55bc14b7 in kvm_is_rapl_feat_enable 
> > > (cs=cs@entry=0x57b83470) at ../target/i386/kvm/kvm.c:2531
> > > #1  0x55bc7534 in kvm_cpu_realizefn (cs=0x57b83470, 
> > > errp=0x7fffd2a0) at ../target/i386/kvm/kvm-cpu.c:54
> > > #2  0x55d2432a in accel_cpu_common_realize (cpu=0x57b83470, 
> > > errp=0x7fffd2a0) at ../accel/accel-target.c:130
> > > #3  0x55cdd955 in cpu_exec_realizefn 
> > > (cpu=cpu@entry=0x57b83470, errp=errp@entry=0x7fffd2a0) at 
> > > ../cpu-target.c:137
> > > #4  0x55c14b89 in x86_cpu_realizefn (dev=0x57b83470, 
> > > errp=0x7fffd310) at ../target/i386/cpu.c:7320
> > > #5  0x55d58f4b in device_set_realized (obj=, 
> > > value=, errp=0x7fffd390) at ../hw/core/qdev.c:510
> > > #6  0x55d5d78d in property_set_bool (obj=0x57b83470, 
> > > v=, name=, opaque=0x578558e0, 
> > > errp=0x7fffd390)
> > > at ../qom/object.c:2358
> > > #7  0x55d60b0b in object_property_set 
> > > (obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 "realized", 
> > > v=v@entry=0x57b8ccb0, errp=0x7fffd390, 
> > > errp@entry=0x56e210d8 ) at ../qom/object.c:1472
> > > #8  0x55d6444f in object_property_set_qobject
> > > (obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 
> > > "realized", value=value@entry=0x57854800, 
> > > errp=errp@entry=0x56e210d8 )
> > > at ../qom/qom-qobject.c:28
> > > #9  0x55d61174 in object_property_set_bool
> > > (obj=0x57b83470, name=name@entry=0x5607c799 "realized", 
> > > value=value@entry=true, errp=errp@entry=0x56e210d8 ) at 
> > > ../qom/object.c:1541
> > > #10 0x55d59a3c in qdev_realize (dev=, 
> > > bus=bus@entry=0x0, errp=errp@entry=0x56e210d8 ) at 
> > > ../hw/core/qdev.c:292
> > > #11 0x55bd51e0 in x86_cpu_new (x86ms=, apic_id=0, 
> > > errp=0x56e210d8 ) at ../hw/i386/x86.c:105
> > > #12 0x55bd52ce in x86_cpus_init 
> > > (x86ms=x86ms@entry=0x57aaed30, default_cpu_version=) 
> > > at ../hw/i386/x86.c:156
> > > #13 0x55bdc1a7 in pc_init1 (machine=0x57aaed30, 
> > > pci_type=0x5604aa61 "i440FX") at ../hw/i386/pc_piix.c:185
> > > #14 0x55947a11 in machine_run_board_init (machine=0x57aaed30, 
> > > mem_path=, errp=, errp@entry=0x56e210d8 
> > > )
> > > at ../hw/core/machine.c:1547
> > > #15 0x55b020ed in qemu_init_board () at ../system/vl.c:2613
> > > #16 qmp_x_exit_preconfig (errp=0x56e210d8 ) at 
> > > ../system/vl.c:2705
> > > #17 0x55b0611e in qemu_init (argc=, 
> > > argv=) at ../system/vl.c:3739
> > > #18 0x55897ca9 in main (argc=, argv= > > out>) at ../system/main.c:47
> > >
> > > The problem is that 'cs->kvm_state' is NULL here
> > >
> > 
> > After some investigation it seems that kvm_state is not yet committed 
> > at this point. Shame, because GDB showed me that we have already pass 
> > the kvm_accel_instance_init() in accel/kvm/kvm-all.c that sets the 
> > value "msr_energy.enable" in kvm_state...
> > 
> > So should I dig more to still do the sanity check in kvm_cpu_realizefn() 
> > or should I already move the RAPL feature  from -kvm to -cpu 
> > like suggested by Zhao from Intel and then access it from the CPUState ?
>
> I'm not so sure about that question. I think Paolo is best placed
> to suggest which is better, as the KVM maintainer.
>

I'm facing an issue that either require a simple change or a more 
complex one depending on the decision.

TL;DR: Should I move from -kvm to -cpu the rapl feature ? 

Zhao from Intel suggested me that enabling the rapl feature looks more 
natural than through -kvm feature because it is indeed a cpu feature. 

>From the point of view of the QEMU architecture, it's more easier to 
enable the feature with -kvm because it is kvm dependent; but maybe from 
the point of view of the user, it's more natural to enable this at -cpu 
level. 

The issue I'm facing above is from a suggestion from Daniel to do the 
sanity check at kvm_cpu_realizefn() level. However GDB showed me that 
kvm_st

Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-25 Thread Daniel P . Berrangé
On Thu, Apr 25, 2024 at 05:34:52PM +0200, Anthony Harivel wrote:
> Hi Daniel,
> 
> Daniel P. Berrangé, Apr 18, 2024 at 18:42:
> 
> > > +if (kvm_is_rapl_feat_enable(cs)) {
> > > +if (!IS_INTEL_CPU(env)) {
> > > +error_setg(errp, "RAPL feature can only be\
> > > +  enabled with Intel CPU models");
> > > +return false;
> > > +}
> > > +}
> >
> > I see a crash in kvm_is_rapl_feat_enable() from this caller,
> > when I run with this kind of command line:
> >
> >  $ qemu-system-x86_64 \
> >   -kernel /lib/modules/6.6.9-100.fc38.x86_64/vmlinuz \
> >   -initrd tiny-initrd.img  -m 2000 -serial stdio -nodefaults \
> >   -display none -accel kvm -append "console=ttyS0 quiet"
> >
> >
> > #0  0x55bc14b7 in kvm_is_rapl_feat_enable 
> > (cs=cs@entry=0x57b83470) at ../target/i386/kvm/kvm.c:2531
> > #1  0x55bc7534 in kvm_cpu_realizefn (cs=0x57b83470, 
> > errp=0x7fffd2a0) at ../target/i386/kvm/kvm-cpu.c:54
> > #2  0x55d2432a in accel_cpu_common_realize (cpu=0x57b83470, 
> > errp=0x7fffd2a0) at ../accel/accel-target.c:130
> > #3  0x55cdd955 in cpu_exec_realizefn (cpu=cpu@entry=0x57b83470, 
> > errp=errp@entry=0x7fffd2a0) at ../cpu-target.c:137
> > #4  0x55c14b89 in x86_cpu_realizefn (dev=0x57b83470, 
> > errp=0x7fffd310) at ../target/i386/cpu.c:7320
> > #5  0x55d58f4b in device_set_realized (obj=, 
> > value=, errp=0x7fffd390) at ../hw/core/qdev.c:510
> > #6  0x55d5d78d in property_set_bool (obj=0x57b83470, 
> > v=, name=, opaque=0x578558e0, 
> > errp=0x7fffd390)
> > at ../qom/object.c:2358
> > #7  0x55d60b0b in object_property_set 
> > (obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 "realized", 
> > v=v@entry=0x57b8ccb0, errp=0x7fffd390, 
> > errp@entry=0x56e210d8 ) at ../qom/object.c:1472
> > #8  0x55d6444f in object_property_set_qobject
> > (obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 
> > "realized", value=value@entry=0x57854800, 
> > errp=errp@entry=0x56e210d8 )
> > at ../qom/qom-qobject.c:28
> > #9  0x55d61174 in object_property_set_bool
> > (obj=0x57b83470, name=name@entry=0x5607c799 "realized", 
> > value=value@entry=true, errp=errp@entry=0x56e210d8 ) at 
> > ../qom/object.c:1541
> > #10 0x55d59a3c in qdev_realize (dev=, 
> > bus=bus@entry=0x0, errp=errp@entry=0x56e210d8 ) at 
> > ../hw/core/qdev.c:292
> > #11 0x55bd51e0 in x86_cpu_new (x86ms=, apic_id=0, 
> > errp=0x56e210d8 ) at ../hw/i386/x86.c:105
> > #12 0x55bd52ce in x86_cpus_init (x86ms=x86ms@entry=0x57aaed30, 
> > default_cpu_version=) at ../hw/i386/x86.c:156
> > #13 0x55bdc1a7 in pc_init1 (machine=0x57aaed30, 
> > pci_type=0x5604aa61 "i440FX") at ../hw/i386/pc_piix.c:185
> > #14 0x55947a11 in machine_run_board_init (machine=0x57aaed30, 
> > mem_path=, errp=, errp@entry=0x56e210d8 
> > )
> > at ../hw/core/machine.c:1547
> > #15 0x55b020ed in qemu_init_board () at ../system/vl.c:2613
> > #16 qmp_x_exit_preconfig (errp=0x56e210d8 ) at 
> > ../system/vl.c:2705
> > #17 0x55b0611e in qemu_init (argc=, argv= > out>) at ../system/vl.c:3739
> > #18 0x55897ca9 in main (argc=, argv=) 
> > at ../system/main.c:47
> >
> > The problem is that 'cs->kvm_state' is NULL here
> >
> 
> After some investigation it seems that kvm_state is not yet committed 
> at this point. Shame, because GDB showed me that we have already pass 
> the kvm_accel_instance_init() in accel/kvm/kvm-all.c that sets the 
> value "msr_energy.enable" in kvm_state...
> 
> So should I dig more to still do the sanity check in kvm_cpu_realizefn() 
> or should I already move the RAPL feature  from -kvm to -cpu 
> like suggested by Zhao from Intel and then access it from the CPUState ?

I'm not so sure about that question. I think Paolo is best placed
to suggest which is better, as the KVM maintainer.


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-25 Thread Anthony Harivel
Hi Daniel,

Daniel P. Berrangé, Apr 18, 2024 at 18:42:

> > +if (kvm_is_rapl_feat_enable(cs)) {
> > +if (!IS_INTEL_CPU(env)) {
> > +error_setg(errp, "RAPL feature can only be\
> > +  enabled with Intel CPU models");
> > +return false;
> > +}
> > +}
>
> I see a crash in kvm_is_rapl_feat_enable() from this caller,
> when I run with this kind of command line:
>
>  $ qemu-system-x86_64 \
>   -kernel /lib/modules/6.6.9-100.fc38.x86_64/vmlinuz \
>   -initrd tiny-initrd.img  -m 2000 -serial stdio -nodefaults \
>   -display none -accel kvm -append "console=ttyS0 quiet"
>
>
> #0  0x55bc14b7 in kvm_is_rapl_feat_enable 
> (cs=cs@entry=0x57b83470) at ../target/i386/kvm/kvm.c:2531
> #1  0x55bc7534 in kvm_cpu_realizefn (cs=0x57b83470, 
> errp=0x7fffd2a0) at ../target/i386/kvm/kvm-cpu.c:54
> #2  0x55d2432a in accel_cpu_common_realize (cpu=0x57b83470, 
> errp=0x7fffd2a0) at ../accel/accel-target.c:130
> #3  0x55cdd955 in cpu_exec_realizefn (cpu=cpu@entry=0x57b83470, 
> errp=errp@entry=0x7fffd2a0) at ../cpu-target.c:137
> #4  0x55c14b89 in x86_cpu_realizefn (dev=0x57b83470, 
> errp=0x7fffd310) at ../target/i386/cpu.c:7320
> #5  0x55d58f4b in device_set_realized (obj=, 
> value=, errp=0x7fffd390) at ../hw/core/qdev.c:510
> #6  0x55d5d78d in property_set_bool (obj=0x57b83470, v= out>, name=, opaque=0x578558e0, errp=0x7fffd390)
> at ../qom/object.c:2358
> #7  0x55d60b0b in object_property_set (obj=obj@entry=0x57b83470, 
> name=name@entry=0x5607c799 "realized", v=v@entry=0x57b8ccb0, 
> errp=0x7fffd390, 
> errp@entry=0x56e210d8 ) at ../qom/object.c:1472
> #8  0x55d6444f in object_property_set_qobject
> (obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 "realized", 
> value=value@entry=0x57854800, errp=errp@entry=0x56e210d8 
> )
> at ../qom/qom-qobject.c:28
> #9  0x55d61174 in object_property_set_bool
> (obj=0x57b83470, name=name@entry=0x5607c799 "realized", 
> value=value@entry=true, errp=errp@entry=0x56e210d8 ) at 
> ../qom/object.c:1541
> #10 0x55d59a3c in qdev_realize (dev=, 
> bus=bus@entry=0x0, errp=errp@entry=0x56e210d8 ) at 
> ../hw/core/qdev.c:292
> #11 0x55bd51e0 in x86_cpu_new (x86ms=, apic_id=0, 
> errp=0x56e210d8 ) at ../hw/i386/x86.c:105
> #12 0x55bd52ce in x86_cpus_init (x86ms=x86ms@entry=0x57aaed30, 
> default_cpu_version=) at ../hw/i386/x86.c:156
> #13 0x55bdc1a7 in pc_init1 (machine=0x57aaed30, 
> pci_type=0x5604aa61 "i440FX") at ../hw/i386/pc_piix.c:185
> #14 0x55947a11 in machine_run_board_init (machine=0x57aaed30, 
> mem_path=, errp=, errp@entry=0x56e210d8 
> )
> at ../hw/core/machine.c:1547
> #15 0x55b020ed in qemu_init_board () at ../system/vl.c:2613
> #16 qmp_x_exit_preconfig (errp=0x56e210d8 ) at 
> ../system/vl.c:2705
> #17 0x55b0611e in qemu_init (argc=, argv= out>) at ../system/vl.c:3739
> #18 0x55897ca9 in main (argc=, argv=) 
> at ../system/main.c:47
>
> The problem is that 'cs->kvm_state' is NULL here
>

After some investigation it seems that kvm_state is not yet committed 
at this point. Shame, because GDB showed me that we have already pass 
the kvm_accel_instance_init() in accel/kvm/kvm-all.c that sets the 
value "msr_energy.enable" in kvm_state...

So should I dig more to still do the sanity check in kvm_cpu_realizefn() 
or should I already move the RAPL feature  from -kvm to -cpu 
like suggested by Zhao from Intel and then access it from the CPUState ?

The last one would require more work but if I can skip a new iteration 
because I would need to do it anyway that would save me time in this end. 

Thanks

Regards,
Anthony





Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-19 Thread Zhao Liu
Hi Anthony,

On Thu, Apr 18, 2024 at 12:52:14PM +0200, Anthony Harivel wrote:
> Date: Thu, 18 Apr 2024 12:52:14 +0200
> From: Anthony Harivel 
> Subject: Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
> 
> > The package energy consumption includes core part and uncore part, where
> > uncore part consumption may not be able to be scaled based on vCPU
> > runtime ratio.
> >
> > When the uncore part consumption is small, the error in this part is
> > small, but if it is large, then the error generated by scaling by vCPU
> > runtime will be large.
> >
> 
> So far we can only work with what Intel is giving us i.e Package power 
> plane and DRAM power plane on server, which is the main target of 
> this feature. Maybe in the future, Intel will expand the core power 
> plane and the uncore power plane to server class CPU ?

Not future features, I'd like to illustrate the impact of the uncore
part (iGPU/NPU on Client or various accelerators on Server) on this
algorithm. Because the consumption of the uncore part is complex and not
necessarily linearly related to the vCPU task running time.

It might be worth to state potential impact on accuracy of uncore parts
to doc (I doubt that heavy uncore consumption will even affect the
consistency of the energy trend as you said).

Anyway, clearer scenarios help this feature get used.

> > May I ask what your usage scenario is? Is there significant uncore
> > consumption (e.g. GPU)?
> >
> 
> Same answer as above: uncore/graphics power plane is only available on 
> client class CPU. 

Yes, iGPU is, but server may have other accelerators, e.g., DSA/IAA/QAT
on SPR.

> > Also, I think of a generic question is whether the error in this
> > calculation is measurable? Like comparing the RAPL status of the same
> > workload on Guest and bare metal to check the error.
> >
> > IIUC, this calculation is highly affected by native/sibling Guests,
> > especially in cloud scenarios where there are multiple Guests, the
> > accuracy of this algorithm needs to be checked.
> >
> 
> Indeed, depending on where your vCPUs are running within the package (on 
> the native or sibling CPU), you might observe different power 
> consumption levels. However, I don't consider this to be a problem, as 
> the ratio calculation takes into account the vCPU's location.
> 
> We also need to approach the measurement differently. Due to the 
> complexity of factors influencing power consumption, we must compare 
> what is comparable. If you require precise power consumption data, 
> use a power meter on the PSU of the server.It will provide the 
> ultimate judgment. However, if you need an estimation to optimize 
> software workloads in a guest, then this feature could be useful. All my 
> tests have consistently shown reproducible output in terms of power 
> consumption, which has convinced me that we can effectively work with 
> it.

Thanks, another mail in which you illustrated that the trend is
consistent.

[snip]

> >
> > In addition, RAPL is basically a CPU feature, I think it would be more
> > appropriate to make it as a x86 CPU's property.
> >
> > Your RAPL support actually provides a framework for assisting KVM
> > emulation in userspace, so this informs other feature support (maybe model
> > specific, or architectural) in the future. Enabling/disabling CPU features
> > via -cpu looks more natural.
> 
> This is totally dependant of KVM because it used the KVM MSR Filtering 
> to access userspace when a specific MSR is required.

Yes, but in other words, other KVM based features (completely hardware
virtualization) are also configured by -cpu. This RAPL is still a CPU
feature and just need KVM's help.
 
> I can try to find a way to use -cpu for this feature and check if KVM is 
> activated or not. 
>

[snip]

> >
> > I understand tick would ignore frequency changes, e.g., HWP's auto-pilot
> > or turbo boost. All these CPU frequency change would impact on core energy
> > consumption.
> >
> > I think the better way would be to use APERF counter, but unfortunately it
> > lacks virtualization support (for Intel).
> >
> > Due to such considerations, it may be more worthwhile to evaluate the
> > accuracy of this tick-based algorithm.
> >
> 
> I've evaluated such things with another tool called Kepler [1]. This 
> tool calculate the power ratio with metrics from RAPL and uses either 
> eBPF or the tick based systems for time metrics.

Thanks for this information! I understand current tick based algorithm
is a common approximation in the industry (like Kepler), right?

> The eBPF part [2] is 
> triggered on each 'fin

Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-18 Thread Anthony Harivel


Hi Daniel,

Daniel P. Berrangé, Apr 18, 2024 at 18:42:
> On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:
> > Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> > interface (Running Average Power Limit) for advertising the accumulated
> > energy consumption of various power domains (e.g. CPU packages, DRAM,
> > etc.).
> > 
> > The consumption is reported via MSRs (model specific registers) like
> > MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> > 64 bits registers that represent the accumulated energy consumption in
> > micro Joules. They are updated by microcode every ~1ms.
> > 
> > For now, KVM always returns 0 when the guest requests the value of
> > these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle
> > these MSRs dynamically in userspace.
> > 
> > To limit the amount of system calls for every MSR call, create a new
> > thread in QEMU that updates the "virtual" MSR values asynchronously.
> > 
> > Each vCPU has its own vMSR to reflect the independence of vCPUs. The
> > thread updates the vMSR values with the ratio of energy consumed of
> > the whole physical CPU package the vCPU thread runs on and the
> > thread's utime and stime values.
> > 
> > All other non-vCPU threads are also taken into account. Their energy
> > consumption is evenly distributed among all vCPUs threads running on
> > the same physical CPU package.
> > 
> > To overcome the problem that reading the RAPL MSR requires priviliged
> > access, a socket communication between QEMU and the qemu-vmsr-helper is
> > mandatory. You can specified the socket path in the parameter.
> > 
> > This feature is activated with -accel kvm,rapl=true,path=/path/sock.sock
> > 
> > Actual limitation:
> > - Works only on Intel host CPU because AMD CPUs are using different MSR
> >   adresses.
> > 
> > - Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at
> >   the moment.
> > 
> > Signed-off-by: Anthony Harivel 
> > ---
> >  accel/kvm/kvm-all.c   |  27 +++
> >  docs/specs/index.rst  |   1 +
> >  docs/specs/rapl-msr.rst   | 155 
> >  include/sysemu/kvm.h  |   2 +
> >  include/sysemu/kvm_int.h  |  32 +++
> >  target/i386/cpu.h |   8 +
> >  target/i386/kvm/kvm-cpu.c |   9 +
> >  target/i386/kvm/kvm.c | 428 ++
> >  target/i386/kvm/meson.build   |   1 +
> >  target/i386/kvm/vmsr_energy.c | 335 ++
> >  target/i386/kvm/vmsr_energy.h |  99 
> >  11 files changed, 1097 insertions(+)
> >  create mode 100644 docs/specs/rapl-msr.rst
> >  create mode 100644 target/i386/kvm/vmsr_energy.c
> >  create mode 100644 target/i386/kvm/vmsr_energy.h
> > 
>
> > diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
> > index 9c791b7b0520..eafb625839b8 100644
> > --- a/target/i386/kvm/kvm-cpu.c
> > +++ b/target/i386/kvm/kvm-cpu.c
> > @@ -50,6 +50,15 @@ static bool kvm_cpu_realizefn(CPUState *cs, Error **errp)
> > MSR_IA32_UCODE_REV);
> >  }
> >  }
> > +
> > +if (kvm_is_rapl_feat_enable(cs)) {
> > +if (!IS_INTEL_CPU(env)) {
> > +error_setg(errp, "RAPL feature can only be\
> > +  enabled with Intel CPU models");
> > +return false;
> > +}
> > +}
>
> I see a crash in kvm_is_rapl_feat_enable() from this caller,
> when I run with this kind of command line:
>
>  $ qemu-system-x86_64 \
>   -kernel /lib/modules/6.6.9-100.fc38.x86_64/vmlinuz \
>   -initrd tiny-initrd.img  -m 2000 -serial stdio -nodefaults \
>   -display none -accel kvm -append "console=ttyS0 quiet"
>
>
> #0  0x55bc14b7 in kvm_is_rapl_feat_enable 
> (cs=cs@entry=0x57b83470) at ../target/i386/kvm/kvm.c:2531
> #1  0x55bc7534 in kvm_cpu_realizefn (cs=0x57b83470, 
> errp=0x7fffd2a0) at ../target/i386/kvm/kvm-cpu.c:54
> #2  0x55d2432a in accel_cpu_common_realize (cpu=0x57b83470, 
> errp=0x7fffd2a0) at ../accel/accel-target.c:130
> #3  0x55cdd955 in cpu_exec_realizefn (cpu=cpu@entry=0x57b83470, 
> errp=errp@entry=0x7fffd2a0) at ../cpu-target.c:137
> #4  0x55c14b89 in x86_cpu_realizefn (dev=0x57b83470, 
> errp=0x7fffd310) at ../target/i386/cpu.c:7320
> #5  0x55d58f4b in device_set_realized (obj=, 
> value=, errp=0x7fffd390) at ../hw/core/qdev.c:510
> #6  0x55d5d78d in property_set_bool (obj=0x57b83470, v= out>, name=, opaque=0x578558e0, errp=0x7fffd390)
> at ../qom/object.c:2358
> #7  0x55d60b0b in object_property_set (obj=obj@entry=0x57b83470, 
> name=name@entry=0x5607c799 "realized", v=v@entry=0x57b8ccb0, 
> errp=0x7fffd390, 
> errp@entry=0x56e210d8 ) at ../qom/object.c:1472
> #8  0x55d6444f in object_property_set_qobject
> (obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 "re

Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-18 Thread Daniel P . Berrangé
On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:
> Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> interface (Running Average Power Limit) for advertising the accumulated
> energy consumption of various power domains (e.g. CPU packages, DRAM,
> etc.).
> 
> The consumption is reported via MSRs (model specific registers) like
> MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> 64 bits registers that represent the accumulated energy consumption in
> micro Joules. They are updated by microcode every ~1ms.
> 
> For now, KVM always returns 0 when the guest requests the value of
> these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle
> these MSRs dynamically in userspace.
> 
> To limit the amount of system calls for every MSR call, create a new
> thread in QEMU that updates the "virtual" MSR values asynchronously.
> 
> Each vCPU has its own vMSR to reflect the independence of vCPUs. The
> thread updates the vMSR values with the ratio of energy consumed of
> the whole physical CPU package the vCPU thread runs on and the
> thread's utime and stime values.
> 
> All other non-vCPU threads are also taken into account. Their energy
> consumption is evenly distributed among all vCPUs threads running on
> the same physical CPU package.
> 
> To overcome the problem that reading the RAPL MSR requires priviliged
> access, a socket communication between QEMU and the qemu-vmsr-helper is
> mandatory. You can specified the socket path in the parameter.
> 
> This feature is activated with -accel kvm,rapl=true,path=/path/sock.sock
> 
> Actual limitation:
> - Works only on Intel host CPU because AMD CPUs are using different MSR
>   adresses.
> 
> - Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at
>   the moment.
> 
> Signed-off-by: Anthony Harivel 
> ---
>  accel/kvm/kvm-all.c   |  27 +++
>  docs/specs/index.rst  |   1 +
>  docs/specs/rapl-msr.rst   | 155 
>  include/sysemu/kvm.h  |   2 +
>  include/sysemu/kvm_int.h  |  32 +++
>  target/i386/cpu.h |   8 +
>  target/i386/kvm/kvm-cpu.c |   9 +
>  target/i386/kvm/kvm.c | 428 ++
>  target/i386/kvm/meson.build   |   1 +
>  target/i386/kvm/vmsr_energy.c | 335 ++
>  target/i386/kvm/vmsr_energy.h |  99 
>  11 files changed, 1097 insertions(+)
>  create mode 100644 docs/specs/rapl-msr.rst
>  create mode 100644 target/i386/kvm/vmsr_energy.c
>  create mode 100644 target/i386/kvm/vmsr_energy.h
> 

> diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
> index 9c791b7b0520..eafb625839b8 100644
> --- a/target/i386/kvm/kvm-cpu.c
> +++ b/target/i386/kvm/kvm-cpu.c
> @@ -50,6 +50,15 @@ static bool kvm_cpu_realizefn(CPUState *cs, Error **errp)
> MSR_IA32_UCODE_REV);
>  }
>  }
> +
> +if (kvm_is_rapl_feat_enable(cs)) {
> +if (!IS_INTEL_CPU(env)) {
> +error_setg(errp, "RAPL feature can only be\
> +  enabled with Intel CPU models");
> +return false;
> +}
> +}

I see a crash in kvm_is_rapl_feat_enable() from this caller,
when I run with this kind of command line:

 $ qemu-system-x86_64 \
  -kernel /lib/modules/6.6.9-100.fc38.x86_64/vmlinuz \
  -initrd tiny-initrd.img  -m 2000 -serial stdio -nodefaults \
  -display none -accel kvm -append "console=ttyS0 quiet"


#0  0x55bc14b7 in kvm_is_rapl_feat_enable (cs=cs@entry=0x57b83470) 
at ../target/i386/kvm/kvm.c:2531
#1  0x55bc7534 in kvm_cpu_realizefn (cs=0x57b83470, 
errp=0x7fffd2a0) at ../target/i386/kvm/kvm-cpu.c:54
#2  0x55d2432a in accel_cpu_common_realize (cpu=0x57b83470, 
errp=0x7fffd2a0) at ../accel/accel-target.c:130
#3  0x55cdd955 in cpu_exec_realizefn (cpu=cpu@entry=0x57b83470, 
errp=errp@entry=0x7fffd2a0) at ../cpu-target.c:137
#4  0x55c14b89 in x86_cpu_realizefn (dev=0x57b83470, 
errp=0x7fffd310) at ../target/i386/cpu.c:7320
#5  0x55d58f4b in device_set_realized (obj=, 
value=, errp=0x7fffd390) at ../hw/core/qdev.c:510
#6  0x55d5d78d in property_set_bool (obj=0x57b83470, v=, name=, opaque=0x578558e0, errp=0x7fffd390)
at ../qom/object.c:2358
#7  0x55d60b0b in object_property_set (obj=obj@entry=0x57b83470, 
name=name@entry=0x5607c799 "realized", v=v@entry=0x57b8ccb0, 
errp=0x7fffd390, 
errp@entry=0x56e210d8 ) at ../qom/object.c:1472
#8  0x55d6444f in object_property_set_qobject
(obj=obj@entry=0x57b83470, name=name@entry=0x5607c799 "realized", 
value=value@entry=0x57854800, errp=errp@entry=0x56e210d8 )
at ../qom/qom-qobject.c:28
#9  0x55d61174 in object_property_set_bool
(obj=0x57b83470, name=name@entry=0x5607c799 "realized", 
value=value@entry=true, errp=errp@entry

Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-18 Thread Anthony Harivel


Hi Zhao,

Zhao Liu, Apr 17, 2024 at 12:07:
> Hi Anthony,
>
> May I ask what your usage scenario is? Is it to measure Guest's energy
> consumption and to charged per watt consumed? ;-)

See previous email from Daniel.

> On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:
> > Date: Thu, 11 Apr 2024 14:14:34 +0200
> > From: Anthony Harivel 
> > Subject: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
> > 
> > Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> > interface (Running Average Power Limit) for advertising the accumulated
> > energy consumption of various power domains (e.g. CPU packages, DRAM,
> > etc.).
> >
> > The consumption is reported via MSRs (model specific registers) like
> > MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> > 64 bits registers that represent the accumulated energy consumption in
> > micro Joules. They are updated by microcode every ~1ms.
>
> What is your current target platform?
>
> On future Xeon platforms (EMR and beyond) RAPL will support TPMI (an MMIO
> interface) and the TPMI based RAPL will be preferred in the future as
> well:
> * TPMI doc: https://github.com/intel/tpmi_power_management
> * TPMI based RAPL driver: drivers/powercap/intel_rapl_tpmi.c
>
> So do you have the plan to support RAPL-TPMI?

Yes, I guess this would be inevitable in the future. But right now the 
lack of HW with this TPMI make hard to integrate it on day 1.

> > For now, KVM always returns 0 when the guest requests the value of
> > these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle
> > these MSRs dynamically in userspace.
> > 
> > To limit the amount of system calls for every MSR call, create a new
> > thread in QEMU that updates the "virtual" MSR values asynchronously.
> > 
> > Each vCPU has its own vMSR to reflect the independence of vCPUs. The
> > thread updates the vMSR values with the ratio of energy consumed of
> > the whole physical CPU package the vCPU thread runs on and the
> > thread's utime and stime values.
> > 
> > All other non-vCPU threads are also taken into account. Their energy
> > consumption is evenly distributed among all vCPUs threads running on
> > the same physical CPU package.
>
> The package energy consumption includes core part and uncore part, where
> uncore part consumption may not be able to be scaled based on vCPU
> runtime ratio.
>
> When the uncore part consumption is small, the error in this part is
> small, but if it is large, then the error generated by scaling by vCPU
> runtime will be large.
>

So far we can only work with what Intel is giving us i.e Package power 
plane and DRAM power plane on server, which is the main target of 
this feature. Maybe in the future, Intel will expand the core power 
plane and the uncore power plane to server class CPU ?

> May I ask what your usage scenario is? Is there significant uncore
> consumption (e.g. GPU)?
>

Same answer as above: uncore/graphics power plane is only available on 
client class CPU. 

> Also, I think of a generic question is whether the error in this
> calculation is measurable? Like comparing the RAPL status of the same
> workload on Guest and bare metal to check the error.
>
> IIUC, this calculation is highly affected by native/sibling Guests,
> especially in cloud scenarios where there are multiple Guests, the
> accuracy of this algorithm needs to be checked.
>

Indeed, depending on where your vCPUs are running within the package (on 
the native or sibling CPU), you might observe different power 
consumption levels. However, I don't consider this to be a problem, as 
the ratio calculation takes into account the vCPU's location.

We also need to approach the measurement differently. Due to the 
complexity of factors influencing power consumption, we must compare 
what is comparable. If you require precise power consumption data, 
use a power meter on the PSU of the server.It will provide the 
ultimate judgment. However, if you need an estimation to optimize 
software workloads in a guest, then this feature could be useful. All my 
tests have consistently shown reproducible output in terms of power 
consumption, which has convinced me that we can effectively work with 
it.

> > To overcome the problem that reading the RAPL MSR requires priviliged
> > access, a socket communication between QEMU and the qemu-vmsr-helper is
> > mandatory. You can specified the socket path in the parameter.
> > 
> > This feature is activated with -accel kvm,rapl=true,path=/path/sock.sock
>
> Based on the above comment, I suggest to rename this option as "rapl-msr"
> to distinguish i

Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-18 Thread Anthony Harivel


Hi Zhao, Daniel,

Zhao Liu, Apr 17, 2024 at 17:13:
> Hi Daniel,
>
> On Wed, Apr 17, 2024 at 01:27:03PM +0100, Daniel P. Berrangé wrote:
> > Date: Wed, 17 Apr 2024 13:27:03 +0100
> > From: "Daniel P. Berrangé" 
> > Subject: Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
> > 
> > On Wed, Apr 17, 2024 at 06:07:02PM +0800, Zhao Liu wrote:
> > > Hi Anthony,
> > > 
> > > May I ask what your usage scenario is? Is it to measure Guest's energy
> > > consumption and to charged per watt consumed? ;-)
> > > 
> > > On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:
> > > > Date: Thu, 11 Apr 2024 14:14:34 +0200
> > > > From: Anthony Harivel 
> > > > Subject: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
> > > > 
> > > > Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> > > > interface (Running Average Power Limit) for advertising the accumulated
> > > > energy consumption of various power domains (e.g. CPU packages, DRAM,
> > > > etc.).
> > > >
> > > > The consumption is reported via MSRs (model specific registers) like
> > > > MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> > > > 64 bits registers that represent the accumulated energy consumption in
> > > > micro Joules. They are updated by microcode every ~1ms.
> > > 
> > > What is your current target platform?
> > 
> > I think we can assume /all/ future CPUs are conceptially in scope
> > for this.
> > 
> > The use case is to allow guest owners to monitor the power consumption
> > of their workloads, so they can take steps to optimize their guest VM
> > workloads to reduce power consumed.
>
> Thanks for the explanation! 
>

Thanks Daniel for stepping in on the explanation.


> > > On future Xeon platforms (EMR and beyond) RAPL will support TPMI (an MMIO
> > > interface) and the TPMI based RAPL will be preferred in the future as
> > > well:
> > 
> > Is the MSR based interface likely to be removed in future silicon,
> > or it will be remain for back compat ?
>
> For Xeon, GNR will have both TMPI & MSR RAPL, but eventually MSR RAPL
> will be removed. Therefore, if RAPL support is desired for all future
> Xeons, then it's necessary to consider TMPI as the next plan.
>
> Alternatively, the whole RAPL scope can be split into rapl-msr and
> rapl-tpmi features.
>

I'm aware of the MSR/TPMI RAPL that will appear in the future, and 
I would like to share my perspective on that.

Firstly, we can safely assume that it will take years before all server 
hardware is transitioned to the new GNR (or future XEON without RAPL 
MSR). It may be around 2024 when these features could be integrated into 
QEMU. While the adoption of this feature might take some time, I'm 
optimistic that once it's implemented, people will finally have the 
tools to optimize workloads inside VMs and start reducing power 
consumption.

Secondly, the second-hand server market is substantial. This means that 
with the Virtual RAPL MSR, all XEON processors starting from Sandy 
Bridge (2012!) will have the potential for software optimization. Making 
the most of existing resources is essential for sustainability.

Lastly, when the TPMI becomes available in hardware in the future, the 
RAPL interface and ratio calculation will remain the same, with only the 
method of obtaining host values changing. This transition should be 
manageable.

Regards,
Anthony




Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-17 Thread Daniel P . Berrangé
On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:


> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index fad9a7e8ff30..37f68c496807 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -544,4 +544,6 @@ uint32_t kvm_dirty_ring_size(void);
>   * reported for the VM.
>   */
>  bool kvm_hwpoisoned_mem(void);
> +
> +bool kvm_is_rapl_feat_enable(CPUState *cs);
>  #endif
> diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
> index 882e37e12c5b..8dbeda473c6c 100644
> --- a/include/sysemu/kvm_int.h
> +++ b/include/sysemu/kvm_int.h
> @@ -14,6 +14,9 @@
>  #include "qemu/accel.h"
>  #include "qemu/queue.h"
>  #include "sysemu/kvm.h"
> +#include "hw/boards.h"
> +#include "hw/i386/topology.h"
> +#include "io/channel-socket.h"
>  
>  typedef struct KVMSlot
>  {
> @@ -48,6 +51,34 @@ typedef struct KVMMemoryListener {
>  
>  #define KVM_MSI_HASHTAB_SIZE256
>  
> +typedef struct KVMHostTopoInfo {
> +/* Number of package on the Host */
> +unsigned int maxpkgs;
> +/* Number of cpus on the Host */
> +unsigned int maxcpus;
> +/* Number of cpus on each different package */
> +unsigned int *pkg_cpu_count;
> +/* Each package can have different maxticks */
> +unsigned int *maxticks;
> +} KVMHostTopoInfo;
> +
> +struct KVMMsrEnergy {
> +pid_t pid;
> +bool enable;
> +char *socket_path;
> +QIOChannelSocket *sioc;
> +QemuThread msr_thr;
> +unsigned int vcpus;
> +unsigned int vsockets;

Add 'guest_' prefix on to these two, to make it clearer

> +X86CPUTopoInfo topo_info;

Lets call this 'guest_topo' too, to make it more explicitly
distinguished from the next 'host_topo' field.

> +KVMHostTopoInfo host_topo;
> +const CPUArchIdList *cpu_list;

This name choice has an unfortunate side effect.

cpu.h has a '#define cpu_list x86_cpu_list' for the purposes
of selecting the target specific  cpu_list function.

Unfortunately that macro affects your field name here, which
is why the code is wierdly able to set an 'x86_cpu_list' field
despite you having declared it 'cpu_list'.

I'd suggest perhaps calling this field 'guest_cpus' to avoid
this confusing side-effect, and again also making clear that
this is tracking guest, not host, CPUs.

> +uint64_t *msr_value;
> +uint64_t msr_unit;
> +uint64_t msr_limit;
> +uint64_t msr_info;
> +};
> +
>  enum KVMDirtyRingReaperState {
>  KVM_DIRTY_RING_REAPER_NONE = 0,
>  /* The reaper is sleeping */
> @@ -114,6 +145,7 @@ struct KVMState
>  bool kvm_dirty_ring_with_bitmap;
>  uint64_t kvm_eager_split_size;  /* Eager Page Splitting chunk size */
>  struct KVMDirtyRingReaper reaper;
> +struct KVMMsrEnergy msr_energy;
>  NotifyVmexitOption notify_vmexit;
>  uint32_t notify_window;
>  uint32_t xen_version;

> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index e68cbe929302..3de69caa229e 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c

> @@ -2513,6 +2565,339 @@ static void register_smram_listener(Notifier *n, void 
> *unused)
>   &smram_address_space, 1, "kvm-smram");
>  }
>  
> +static void *kvm_msr_energy_thread(void *data)
> +{
> +KVMState *s = data;
> +struct KVMMsrEnergy *vmsr = &s->msr_energy;
> +
> +g_autofree package_energy_stat *pkg_stat = NULL;
> +g_autofree thread_stat *thd_stat = NULL;
> +g_autofree pid_t *thread_ids = NULL;
> +g_autofree CPUState *cpu = NULL;
> +g_autofree unsigned int *vpkgs_energy_stat = NULL;
> +unsigned int num_threads = 0;
> +unsigned int tmp_num_threads = 0;
> +
> +X86CPUTopoIDs topo_ids;
> +
> +rcu_register_thread();
> +
> +/* Allocate memory for each package energy status */
> +pkg_stat = g_new0(package_energy_stat, vmsr->host_topo.maxpkgs);
> +
> +/* Allocate memory for thread stats */
> +thd_stat = g_new0(thread_stat, 1);
> +
> +/* Allocate memory for holding virtual package energy counter */
> +vpkgs_energy_stat = g_new0(unsigned int, vmsr->vsockets);
> +
> +/* Populate the max tick of each packages */
> +for (int i = 0; i <= vmsr->host_topo.maxpkgs; i++) {

The 'maxticks' array is allocated as 'maxpkgs' in size, so this
limit condition needs to be '<' not '<=', to avoid an out of
bounds write.

> +/*
> + * Max numbers of ticks per package
> + * Time in second * Number of ticks/second * Number of cores/package
> + * ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
> + */
> +vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 100)
> +* sysconf(_SC_CLK_TCK)
> +* vmsr->host_topo.pkg_cpu_count[i];
> +}
> +
> +while (true) {
> +/* Get all qemu threads id */
> +thread_ids = vmsr_get_thread_ids(vmsr->pid, &num_threads);

Since you're re-allocating thread_ids on each loop iteration, you
need to free it, not overw

Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-17 Thread Zhao Liu
Hi Daniel,

On Wed, Apr 17, 2024 at 01:27:03PM +0100, Daniel P. Berrangé wrote:
> Date: Wed, 17 Apr 2024 13:27:03 +0100
> From: "Daniel P. Berrangé" 
> Subject: Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
> 
> On Wed, Apr 17, 2024 at 06:07:02PM +0800, Zhao Liu wrote:
> > Hi Anthony,
> > 
> > May I ask what your usage scenario is? Is it to measure Guest's energy
> > consumption and to charged per watt consumed? ;-)
> > 
> > On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:
> > > Date: Thu, 11 Apr 2024 14:14:34 +0200
> > > From: Anthony Harivel 
> > > Subject: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
> > > 
> > > Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> > > interface (Running Average Power Limit) for advertising the accumulated
> > > energy consumption of various power domains (e.g. CPU packages, DRAM,
> > > etc.).
> > >
> > > The consumption is reported via MSRs (model specific registers) like
> > > MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> > > 64 bits registers that represent the accumulated energy consumption in
> > > micro Joules. They are updated by microcode every ~1ms.
> > 
> > What is your current target platform?
> 
> I think we can assume /all/ future CPUs are conceptially in scope
> for this.
> 
> The use case is to allow guest owners to monitor the power consumption
> of their workloads, so they can take steps to optimize their guest VM
> workloads to reduce power consumed.

Thanks for the explanation! 

> > On future Xeon platforms (EMR and beyond) RAPL will support TPMI (an MMIO
> > interface) and the TPMI based RAPL will be preferred in the future as
> > well:
> 
> Is the MSR based interface likely to be removed in future silicon,
> or it will be remain for back compat ?

For Xeon, GNR will have both TMPI & MSR RAPL, but eventually MSR RAPL
will be removed. Therefore, if RAPL support is desired for all future
Xeons, then it's necessary to consider TMPI as the next plan.

Alternatively, the whole RAPL scope can be split into rapl-msr and
rapl-tpmi features.

> > * TPMI doc: https://github.com/intel/tpmi_power_management
> > * TPMI based RAPL driver: drivers/powercap/intel_rapl_tpmi.c
> >

Regards,
Zhao




Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-17 Thread Daniel P . Berrangé
On Wed, Apr 17, 2024 at 06:07:02PM +0800, Zhao Liu wrote:
> Hi Anthony,
> 
> May I ask what your usage scenario is? Is it to measure Guest's energy
> consumption and to charged per watt consumed? ;-)
> 
> On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:
> > Date: Thu, 11 Apr 2024 14:14:34 +0200
> > From: Anthony Harivel 
> > Subject: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
> > 
> > Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> > interface (Running Average Power Limit) for advertising the accumulated
> > energy consumption of various power domains (e.g. CPU packages, DRAM,
> > etc.).
> >
> > The consumption is reported via MSRs (model specific registers) like
> > MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> > 64 bits registers that represent the accumulated energy consumption in
> > micro Joules. They are updated by microcode every ~1ms.
> 
> What is your current target platform?

I think we can assume /all/ future CPUs are conceptially in scope
for this.

The use case is to allow guest owners to monitor the power consumption
of their workloads, so they can take steps to optimize their guest VM
workloads to reduce power consumed.

> On future Xeon platforms (EMR and beyond) RAPL will support TPMI (an MMIO
> interface) and the TPMI based RAPL will be preferred in the future as
> well:

Is the MSR based interface likely to be removed in future silicon,
or it will be remain for back compat ?

> * TPMI doc: https://github.com/intel/tpmi_power_management
> * TPMI based RAPL driver: drivers/powercap/intel_rapl_tpmi.c
> 

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-17 Thread Zhao Liu
Hi Anthony,

May I ask what your usage scenario is? Is it to measure Guest's energy
consumption and to charged per watt consumed? ;-)

On Thu, Apr 11, 2024 at 02:14:34PM +0200, Anthony Harivel wrote:
> Date: Thu, 11 Apr 2024 14:14:34 +0200
> From: Anthony Harivel 
> Subject: [PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu
> 
> Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> interface (Running Average Power Limit) for advertising the accumulated
> energy consumption of various power domains (e.g. CPU packages, DRAM,
> etc.).
>
> The consumption is reported via MSRs (model specific registers) like
> MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> 64 bits registers that represent the accumulated energy consumption in
> micro Joules. They are updated by microcode every ~1ms.

What is your current target platform?

On future Xeon platforms (EMR and beyond) RAPL will support TPMI (an MMIO
interface) and the TPMI based RAPL will be preferred in the future as
well:
* TPMI doc: https://github.com/intel/tpmi_power_management
* TPMI based RAPL driver: drivers/powercap/intel_rapl_tpmi.c

So do you have the plan to support RAPL-TPMI?
 
> For now, KVM always returns 0 when the guest requests the value of
> these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle
> these MSRs dynamically in userspace.
> 
> To limit the amount of system calls for every MSR call, create a new
> thread in QEMU that updates the "virtual" MSR values asynchronously.
> 
> Each vCPU has its own vMSR to reflect the independence of vCPUs. The
> thread updates the vMSR values with the ratio of energy consumed of
> the whole physical CPU package the vCPU thread runs on and the
> thread's utime and stime values.
> 
> All other non-vCPU threads are also taken into account. Their energy
> consumption is evenly distributed among all vCPUs threads running on
> the same physical CPU package.

The package energy consumption includes core part and uncore part, where
uncore part consumption may not be able to be scaled based on vCPU
runtime ratio.

When the uncore part consumption is small, the error in this part is
small, but if it is large, then the error generated by scaling by vCPU
runtime will be large.

May I ask what your usage scenario is? Is there significant uncore
consumption (e.g. GPU)?

Also, I think of a generic question is whether the error in this
calculation is measurable? Like comparing the RAPL status of the same
workload on Guest and bare metal to check the error.

IIUC, this calculation is highly affected by native/sibling Guests,
especially in cloud scenarios where there are multiple Guests, the
accuracy of this algorithm needs to be checked.

> To overcome the problem that reading the RAPL MSR requires priviliged
> access, a socket communication between QEMU and the qemu-vmsr-helper is
> mandatory. You can specified the socket path in the parameter.
> 
> This feature is activated with -accel kvm,rapl=true,path=/path/sock.sock

Based on the above comment, I suggest to rename this option as "rapl-msr"
to distinguish it from rapl-tpmi.

In addition, RAPL is basically a CPU feature, I think it would be more
appropriate to make it as a x86 CPU's property.

Your RAPL support actually provides a framework for assisting KVM
emulation in userspace, so this informs other feature support (maybe model
specific, or architectural) in the future. Enabling/disabling CPU features
via -cpu looks more natural.

[snip]

> +High level implementation
> +-
> +
> +In order to update the value of the virtual MSR, a QEMU thread is created.
> +The thread is basically just an infinity loop that does:
> +
> +1. Snapshot of the time metrics of all QEMU threads (Time spent scheduled in
> +   Userspace and System)
>
> +2. Snapshot of the actual MSR_PKG_ENERGY_STATUS counter of all packages where
> +   the QEMU threads are running on.
> +
> +3. Sleep for 1 second - During this pause the vcpu and other non-vcpu threads
> +   will do what they have to do and so the energy counter will increase.
> +
> +4. Repeat 2. and 3. and calculate the delta of every metrics representing the
> +   time spent scheduled for each QEMU thread *and* the energy spent by the
> +   packages during the pause.
> +
> +5. Filter the vcpu threads and the non-vcpu threads.
> +
> +6. Retrieve the topology of the Virtual Machine. This helps identify which
> +   vCPU is running on which virtual package.
> +
> +7. The total energy spent by the non-vcpu threads is divided by the number
> +   of vcpu threads so that each vcpu thread will get an equal part of the
> +   energy spent by the QEMU workers.
> +
> +8. Calculate the ratio of energy spent per vcpu threads.
> +
> +9. Cal

[PATCH v5 3/3] Add support for RAPL MSRs in KVM/Qemu

2024-04-11 Thread Anthony Harivel
Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
interface (Running Average Power Limit) for advertising the accumulated
energy consumption of various power domains (e.g. CPU packages, DRAM,
etc.).

The consumption is reported via MSRs (model specific registers) like
MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
64 bits registers that represent the accumulated energy consumption in
micro Joules. They are updated by microcode every ~1ms.

For now, KVM always returns 0 when the guest requests the value of
these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle
these MSRs dynamically in userspace.

To limit the amount of system calls for every MSR call, create a new
thread in QEMU that updates the "virtual" MSR values asynchronously.

Each vCPU has its own vMSR to reflect the independence of vCPUs. The
thread updates the vMSR values with the ratio of energy consumed of
the whole physical CPU package the vCPU thread runs on and the
thread's utime and stime values.

All other non-vCPU threads are also taken into account. Their energy
consumption is evenly distributed among all vCPUs threads running on
the same physical CPU package.

To overcome the problem that reading the RAPL MSR requires priviliged
access, a socket communication between QEMU and the qemu-vmsr-helper is
mandatory. You can specified the socket path in the parameter.

This feature is activated with -accel kvm,rapl=true,path=/path/sock.sock

Actual limitation:
- Works only on Intel host CPU because AMD CPUs are using different MSR
  adresses.

- Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at
  the moment.

Signed-off-by: Anthony Harivel 
---
 accel/kvm/kvm-all.c   |  27 +++
 docs/specs/index.rst  |   1 +
 docs/specs/rapl-msr.rst   | 155 
 include/sysemu/kvm.h  |   2 +
 include/sysemu/kvm_int.h  |  32 +++
 target/i386/cpu.h |   8 +
 target/i386/kvm/kvm-cpu.c |   9 +
 target/i386/kvm/kvm.c | 428 ++
 target/i386/kvm/meson.build   |   1 +
 target/i386/kvm/vmsr_energy.c | 335 ++
 target/i386/kvm/vmsr_energy.h |  99 
 11 files changed, 1097 insertions(+)
 create mode 100644 docs/specs/rapl-msr.rst
 create mode 100644 target/i386/kvm/vmsr_energy.c
 create mode 100644 target/i386/kvm/vmsr_energy.h

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index a8cecd040ebc..7649f226767a 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -3613,6 +3613,21 @@ static void kvm_set_device(Object *obj,
 s->device = g_strdup(value);
 }
 
+static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
+{
+KVMState *s = KVM_STATE(obj);
+s->msr_energy.enable = value;
+}
+
+static void kvm_set_kvm_rapl_socket_path(Object *obj,
+ const char *str,
+ Error **errp)
+{
+KVMState *s = KVM_STATE(obj);
+g_free(s->msr_energy.socket_path);
+s->msr_energy.socket_path = g_strdup(str);
+}
+
 static void kvm_accel_instance_init(Object *obj)
 {
 KVMState *s = KVM_STATE(obj);
@@ -3632,6 +3647,7 @@ static void kvm_accel_instance_init(Object *obj)
 s->xen_gnttab_max_frames = 64;
 s->xen_evtchn_max_pirq = 256;
 s->device = NULL;
+s->msr_energy.enable = false;
 }
 
 /**
@@ -3676,6 +3692,17 @@ static void kvm_accel_class_init(ObjectClass *oc, void 
*data)
 object_class_property_set_description(oc, "device",
 "Path to the device node to use (default: /dev/kvm)");
 
+object_class_property_add_bool(oc, "rapl",
+   NULL,
+   kvm_set_kvm_rapl);
+object_class_property_set_description(oc, "rapl",
+"Allow energy related MSRs for RAPL interface in Guest");
+
+object_class_property_add_str(oc, "rapl-helper-socket", NULL,
+  kvm_set_kvm_rapl_socket_path);
+object_class_property_set_description(oc, "rapl-helper-socket",
+"Socket Path for comminucating with the Virtual MSR helper daemon");
+
 kvm_arch_accel_class_init(oc);
 }
 
diff --git a/docs/specs/index.rst b/docs/specs/index.rst
index 1484e3e76077..e738ea7d102f 100644
--- a/docs/specs/index.rst
+++ b/docs/specs/index.rst
@@ -33,3 +33,4 @@ guest hardware that is specific to QEMU.
virt-ctlr
vmcoreinfo
vmgenid
+   rapl-msr
diff --git a/docs/specs/rapl-msr.rst b/docs/specs/rapl-msr.rst
new file mode 100644
index ..1202ee89bee0
--- /dev/null
+++ b/docs/specs/rapl-msr.rst
@@ -0,0 +1,155 @@
+
+RAPL MSR support
+
+
+The RAPL interface (Running Average Power Limit) is advertising the accumulated
+energy consumption of various power domains (e.g. CPU packages, DRAM, etc.).
+
+The consumption is reported via MSRs (model specific registers) like
+MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs ar