Introduce support for exposing and enabling APERF/MPERF MSR passthrough for x86 QEMU guests when running under KVM.
The Linux kernel supports a KVM capability allowing the hypervisor to disable read intercepts on the IA32_APERF and IA32_MPERF MSRs, enabling guests to track effective frequency directly without VM-exits. QEMU currently lacks a native way to request this capability or expose the corresponding feature bit to the guest. This patch adds the `aperfmperf` feature flag via `FEAT_6_ECX` (CPUID.06H:ECX[bit 0]). To ensure safe tracking across power states, the flag ties into QEMU's existing host power management framework. When host CPU power management is explicitly requested by the user (via `-overcommit cpu-pm=on`) and the `+aperfmperf` flag is provided to the CPU, QEMU will invoke the KVM ioctl to drop the APERF/MPERF MSR read intercepts. This implementation allows guest operating systems (such as FreeBSD or Linux) to dynamically calculate CPU utilization and turbo-boost metrics without incurring performance overhead from hypervisor trap-and-emulate loops. Signed-off-by: Anderson Nascimento <[email protected]> --- Changes in v2 - Added migration flags - Link to v1: https://lore.kernel.org/all/[email protected]/ target/i386/cpu.c | 18 +++++++++++++++++- target/i386/cpu.h | 2 ++ target/i386/kvm/kvm.c | 5 ++++- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 8929a75c7c..544738d406 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1544,6 +1544,22 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .cpuid = { .eax = 6, .reg = R_EAX, }, .tcg_features = TCG_6_EAX_FEATURES, }, + [FEAT_6_ECX] = { + .type = CPUID_FEATURE_WORD, + .feat_names = { + "aperfmperf", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + }, + .cpuid = { .eax = 6, .reg = R_ECX, }, + .tcg_features = 0, + .unmigratable_flags = CPUID_6_ECX_APERFMPERF, + }, [FEAT_XSAVE_XCR0_LO] = { .type = 
CPUID_FEATURE_WORD, .cpuid = { @@ -8770,7 +8786,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, /* Thermal and Power Leaf */ *eax = env->features[FEAT_6_EAX]; *ebx = 0; - *ecx = 0; + *ecx = env->features[FEAT_6_ECX]; *edx = 0; break; case 7: diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 67e2ecf325..87864969c7 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -700,6 +700,7 @@ typedef enum FeatureWord { FEAT_SVM, /* CPUID[8000_000A].EDX */ FEAT_XSAVE, /* CPUID[EAX=0xd,ECX=1].EAX */ FEAT_6_EAX, /* CPUID[6].EAX */ + FEAT_6_ECX, /* CPUID[6].ECX */ FEAT_XSAVE_XCR0_LO, /* CPUID[EAX=0xd,ECX=0].EAX */ FEAT_XSAVE_XCR0_HI, /* CPUID[EAX=0xd,ECX=0].EDX */ FEAT_ARCH_CAPABILITIES, @@ -1232,6 +1233,7 @@ uint64_t x86_cpu_get_supported_feature_word(X86CPU *cpu, FeatureWord w); #define CPUID_XSAVE_XFD (1U << 4) #define CPUID_6_EAX_ARAT (1U << 2) +#define CPUID_6_ECX_APERFMPERF (1U << 0) /* CPUID[0x80000007].EDX flags: */ #define CPUID_APM_INVTSC (1U << 8) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 9e352882c8..ca722ff9e9 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -498,6 +498,8 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, } } else if (function == 6 && reg == R_EAX) { ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */ + } else if (function == 6 && reg == R_ECX) { + ret |= CPUID_6_ECX_APERFMPERF; } else if (function == 7 && index == 0 && reg == R_EBX) { /* Not new instructions, just an optimization. */ uint32_t ebx; @@ -3291,7 +3293,8 @@ static int kvm_vm_enable_disable_exits(KVMState *s) disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT | KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | - KVM_X86_DISABLE_EXITS_CSTATE); + KVM_X86_DISABLE_EXITS_CSTATE | + KVM_X86_DISABLE_EXITS_APERFMPERF); } return kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0, -- 2.54.0
