On 1/9/2026 3:53 PM, Dongli Zhang wrote:
> QEMU uses the kvm_get_msrs() function to save Intel PMU registers from KVM
> and kvm_put_msrs() to restore them to KVM. However, there is no support for
> AMD PMU registers. Currently, pmu_version and num_pmu_gp_counters are
> initialized based on cpuid(0xa), which does not apply to AMD processors.
> For AMD CPUs, prior to PerfMonV2, the number of general-purpose registers
> is determined based on the CPU version.
>
> To address this issue, we need to add support for AMD PMU registers.
> Without this support, the following problems can arise:
>
> 1. If the VM is reset (e.g., via QEMU system_reset or VM kdump/kexec) while
>    running "perf top", the PMU registers are not disabled properly.
>
> 2. Despite x86_cpu_reset() resetting many registers to zero, kvm_put_msrs()
>    does not handle AMD PMU registers, causing some PMU events to remain
>    enabled in KVM.
>
> 3. The KVM kvm_pmc_speculative_in_use() function consistently returns true,
>    preventing the reclamation of these events. Consequently, the
>    kvm_pmc->perf_event remains active.
>
> 4. After a reboot, the VM kernel may report the following error:
>
>    [ 0.092011] Performance Events: Fam17h+ core perfctr, Broken BIOS
>    detected, complain to your hardware vendor.
>    [ 0.092023] [Firmware Bug]: the BIOS has corrupted hw-PMU resources (MSR
>    c0010200 is 530076)
>
> 5. In the worst case, the active kvm_pmc->perf_event may inject unknown
>    NMIs randomly into the VM kernel:
>
>    [...] Uhhuh. NMI received for unknown reason 30 on CPU 0.
>
> To resolve these issues, we propose resetting AMD PMU registers during the
> VM reset process.
>
> Signed-off-by: Dongli Zhang <[email protected]>
> ---
> Changed since v1:
>   - Modify "MSR_K7_EVNTSEL0 + 3" and "MSR_K7_PERFCTR0 + 3" by using
>     AMD64_NUM_COUNTERS (suggested by Sandipan Das).
>   - Use "AMD64_NUM_COUNTERS_CORE * 2 - 1", not "MSR_F15H_PERF_CTL0 + 0xb"
>     (suggested by Sandipan Das).
>   - Switch back to "-pmu" instead of using a global "pmu-cap-disabled".
>   - Don't initialize PMU info if kvm.enable_pmu=N.
> Changed since v2:
>   - Remove 'static' from host_cpuid_vendorX.
>   - Change has_pmu_version to pmu_version.
>   - Use object_property_get_int() to get CPU family.
>   - Use cpuid_find_entry() instead of cpu_x86_cpuid().
>   - Send error log when host and guest are from different vendors.
>   - Move "if (!cpu->enable_pmu)" to the beginning of the function. Add
>     comments to remind developers.
>   - Add support to Zhaoxin. Change is_same_vendor() to
>     is_host_compat_vendor().
>   - Didn't add Reviewed-by from Sandipan because the change isn't minor.
> Changed since v3:
>   - Use host_cpu_vendor_fms() from Zhao's patch.
>   - Check AMD directly to make the "compat" rule clear.
>   - Add comment to MAX_GP_COUNTERS.
>   - Skip PMU info initialization if !kvm_pmu_disabled.
> Changed since v4:
>   - Add Reviewed-by from Zhao and Sandipan.
> Changed since v6:
>   - Add Reviewed-by from Dapeng Mi.
> Changed since v8:
>   - Remove the usage of 'kvm_pmu_disabled' as suggested by Zide Chen.
>   - Remove Reviewed-by from Zhao Liu, Sandipan Das and Dapeng Mi, as the
>     usage of 'kvm_pmu_disabled' is removed.
>
>  target/i386/cpu.h     |  12 +++
>  target/i386/kvm/kvm.c | 168 +++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 176 insertions(+), 4 deletions(-)
>
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 2bbc977d90..0960b98960 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -506,6 +506,14 @@ typedef enum X86Seg {
>  #define MSR_CORE_PERF_GLOBAL_CTRL       0x38f
>  #define MSR_CORE_PERF_GLOBAL_OVF_CTRL   0x390
>  
> +#define MSR_K7_EVNTSEL0                 0xc0010000
> +#define MSR_K7_PERFCTR0                 0xc0010004
> +#define MSR_F15H_PERF_CTL0              0xc0010200
> +#define MSR_F15H_PERF_CTR0              0xc0010201
> +
> +#define AMD64_NUM_COUNTERS              4
> +#define AMD64_NUM_COUNTERS_CORE         6
> +
>  #define MSR_MC0_CTL                     0x400
>  #define MSR_MC0_STATUS                  0x401
>  #define MSR_MC0_ADDR                    0x402
> @@ -1737,6 +1745,10 @@ typedef struct {
>  #endif
>  
>  #define MAX_FIXED_COUNTERS 3
> +/*
> + * This formula is based on Intel's MSR. The current size also meets AMD's
> + * needs.
> + */
>  #define MAX_GP_COUNTERS    (MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0)
>  
>  #define NB_OPMASK_REGS 8
>
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 3b803c662d..fb7b672a9d 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -2096,7 +2096,7 @@ int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp)
>      return 0;
>  }
>  
> -static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid)
> +static void kvm_init_pmu_info_intel(struct kvm_cpuid2 *cpuid)
>  {
>      struct kvm_cpuid_entry2 *c;
>  
> @@ -2129,6 +2129,89 @@ static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid)
>      }
>  }
>  
> +static void kvm_init_pmu_info_amd(struct kvm_cpuid2 *cpuid, X86CPU *cpu)
> +{
> +    struct kvm_cpuid_entry2 *c;
> +    int64_t family;
> +
> +    family = object_property_get_int(OBJECT(cpu), "family", NULL);
> +    if (family < 0) {
> +        return;
> +    }
> +
> +    if (family < 6) {
> +        error_report("AMD performance-monitoring is supported from "
> +                     "K7 and later");
> +        return;
> +    }
> +
> +    pmu_version = 1;
> +    num_pmu_gp_counters = AMD64_NUM_COUNTERS;
> +
> +    c = cpuid_find_entry(cpuid, 0x80000001, 0);
> +    if (!c) {
> +        return;
> +    }
> +
> +    if (!(c->ecx & CPUID_EXT3_PERFCORE)) {
> +        return;
> +    }
> +
> +    num_pmu_gp_counters = AMD64_NUM_COUNTERS_CORE;
> +}
> +
> +static bool is_host_compat_vendor(CPUX86State *env)
> +{
> +    char host_vendor[CPUID_VENDOR_SZ + 1];
> +
> +    host_cpu_vendor_fms(host_vendor, NULL, NULL, NULL);
> +
> +    /*
> +     * Intel and Zhaoxin are compatible.
> +     */
> +    if ((g_str_equal(host_vendor, CPUID_VENDOR_INTEL) ||
> +         g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN1) ||
> +         g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN2)) &&
> +        (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env))) {
> +        return true;
> +    }
> +
> +    return g_str_equal(host_vendor, CPUID_VENDOR_AMD) &&
> +           IS_AMD_CPU(env);
> +}
> +
> +static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid, X86CPU *cpu)
> +{
> +    CPUX86State *env = &cpu->env;
> +
> +    /*
> +     * If KVM_CAP_PMU_CAPABILITY is not supported, there is no way to
> +     * disable the AMD PMU virtualization.
> +     *
> +     * Assume the user is aware of this when !cpu->enable_pmu. AMD PMU
> +     * registers are not going to reset, even they are still available to
> +     * guest VM.
> +     */
> +    if (!cpu->enable_pmu) {
> +        return;
> +    }
> +
> +    /*
> +     * It is not supported to virtualize AMD PMU registers on Intel
> +     * processors, nor to virtualize Intel PMU registers on AMD processors.
> +     */
> +    if (!is_host_compat_vendor(env)) {
> +        error_report("host doesn't support requested feature: vPMU");
> +        return;
> +    }
> +
> +    if (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) {
> +        kvm_init_pmu_info_intel(cpuid);
> +    } else if (IS_AMD_CPU(env)) {
> +        kvm_init_pmu_info_amd(cpuid, cpu);
> +    }
> +}
> +
>  int kvm_arch_init_vcpu(CPUState *cs)
>  {
>      struct {
> @@ -2319,7 +2402,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
>      cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
>      cpuid_data.cpuid.nent = cpuid_i;
>  
> -    kvm_init_pmu_info(&cpuid_data.cpuid);
> +    kvm_init_pmu_info(&cpuid_data.cpuid, cpu);
>  
>      if (x86_cpu_family(env->cpuid_version) >= 6
>          && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
> @@ -4094,7 +4177,7 @@ static int kvm_put_msrs(X86CPU *cpu, KvmPutState level)
>              kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL,
>                                env->poll_control_msr);
>          }
>  
> -        if (pmu_version > 0) {
> +        if ((IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) && pmu_version > 0) {
>              if (pmu_version > 1) {
>                  /* Stop the counter. */
>                  kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> @@ -4125,6 +4208,38 @@ static int kvm_put_msrs(X86CPU *cpu, KvmPutState level)
>                                    env->msr_global_ctrl);
>              }
>          }
> +
> +        if (IS_AMD_CPU(env) && pmu_version > 0) {
> +            uint32_t sel_base = MSR_K7_EVNTSEL0;
> +            uint32_t ctr_base = MSR_K7_PERFCTR0;
> +            /*
> +             * The address of the next selector or counter register is
> +             * obtained by incrementing the address of the current selector
> +             * or counter register by one.
> +             */
> +            uint32_t step = 1;
> +
> +            /*
> +             * When PERFCORE is enabled, AMD PMU uses a separate set of
> +             * addresses for the selector and counter registers.
> +             * Additionally, the address of the next selector or counter
> +             * register is determined by incrementing the address of the
> +             * current register by two.
> +             */
> +            if (num_pmu_gp_counters == AMD64_NUM_COUNTERS_CORE) {
> +                sel_base = MSR_F15H_PERF_CTL0;
> +                ctr_base = MSR_F15H_PERF_CTR0;
> +                step = 2;
> +            }
> +
> +            for (i = 0; i < num_pmu_gp_counters; i++) {
> +                kvm_msr_entry_add(cpu, ctr_base + i * step,
> +                                  env->msr_gp_counters[i]);
> +                kvm_msr_entry_add(cpu, sel_base + i * step,
> +                                  env->msr_gp_evtsel[i]);
> +            }
> +        }
> +
>          /*
>           * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
>           * only sync them to KVM on the first cpu
> @@ -4629,7 +4744,8 @@ static int kvm_get_msrs(X86CPU *cpu)
>      if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) {
>          kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
>      }
> -    if (pmu_version > 0) {
> +
> +    if ((IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) && pmu_version > 0) {
>          if (pmu_version > 1) {
>              kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
>              kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
> @@ -4645,6 +4761,35 @@ static int kvm_get_msrs(X86CPU *cpu)
>          }
>      }
>  
> +    if (IS_AMD_CPU(env) && pmu_version > 0) {
> +        uint32_t sel_base = MSR_K7_EVNTSEL0;
> +        uint32_t ctr_base = MSR_K7_PERFCTR0;
> +        /*
> +         * The address of the next selector or counter register is
> +         * obtained by incrementing the address of the current selector
> +         * or counter register by one.
> +         */
> +        uint32_t step = 1;
> +
> +        /*
> +         * When PERFCORE is enabled, AMD PMU uses a separate set of
> +         * addresses for the selector and counter registers.
> +         * Additionally, the address of the next selector or counter
> +         * register is determined by incrementing the address of the
> +         * current register by two.
> +         */
> +        if (num_pmu_gp_counters == AMD64_NUM_COUNTERS_CORE) {
> +            sel_base = MSR_F15H_PERF_CTL0;
> +            ctr_base = MSR_F15H_PERF_CTR0;
> +            step = 2;
> +        }
> +
> +        for (i = 0; i < num_pmu_gp_counters; i++) {
> +            kvm_msr_entry_add(cpu, ctr_base + i * step, 0);
> +            kvm_msr_entry_add(cpu, sel_base + i * step, 0);
> +        }
> +    }
> +
>      if (env->mcg_cap) {
>          kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
>          kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
> @@ -4975,6 +5120,21 @@ static int kvm_get_msrs(X86CPU *cpu)
>          case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
>              env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
>              break;
> +        case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL0 + AMD64_NUM_COUNTERS - 1:
> +            env->msr_gp_evtsel[index - MSR_K7_EVNTSEL0] = msrs[i].data;
> +            break;
> +        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR0 + AMD64_NUM_COUNTERS - 1:
> +            env->msr_gp_counters[index - MSR_K7_PERFCTR0] = msrs[i].data;
> +            break;
> +        case MSR_F15H_PERF_CTL0 ...
> +             MSR_F15H_PERF_CTL0 + AMD64_NUM_COUNTERS_CORE * 2 - 1:
> +            index = index - MSR_F15H_PERF_CTL0;
> +            if (index & 0x1) {
> +                env->msr_gp_counters[index] = msrs[i].data;
> +            } else {
> +                env->msr_gp_evtsel[index] = msrs[i].data;
> +            }
> +            break;
>          case HV_X64_MSR_HYPERCALL:
>              env->msr_hv_hypercall = msrs[i].data;
>              break;
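
As a side note (not part of the patch, just how I sanity-checked the addressing for myself): the small standalone program below mirrors the sel_base/ctr_base/step logic in the AMD branches and prints the MSR addresses that would be saved and restored, for both the legacy K7 layout and the PERFCORE layout. The constants are copied from the cpu.h hunk; nothing here is QEMU API, and dump_layout() is only an illustrative helper.

/*
 * Standalone sketch: print the event-select and counter MSR addresses for
 * the two AMD layouts handled by the patch.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MSR_K7_EVNTSEL0        0xc0010000
#define MSR_K7_PERFCTR0        0xc0010004
#define MSR_F15H_PERF_CTL0     0xc0010200
#define MSR_F15H_PERF_CTR0     0xc0010201

#define AMD64_NUM_COUNTERS      4
#define AMD64_NUM_COUNTERS_CORE 6

static void dump_layout(bool perfcore)
{
    uint32_t sel_base = MSR_K7_EVNTSEL0;
    uint32_t ctr_base = MSR_K7_PERFCTR0;
    uint32_t step = 1;                  /* legacy: consecutive MSRs */
    int n = AMD64_NUM_COUNTERS;

    if (perfcore) {
        sel_base = MSR_F15H_PERF_CTL0;  /* PERFCORE: CTL/CTR pairs interleave */
        ctr_base = MSR_F15H_PERF_CTR0;
        step = 2;
        n = AMD64_NUM_COUNTERS_CORE;
    }

    printf("%s:\n", perfcore ? "PERFCORE" : "legacy K7");
    for (int i = 0; i < n; i++) {
        printf("  evtsel%d=0x%x  ctr%d=0x%x\n",
               i, sel_base + i * step, i, ctr_base + i * step);
    }
}

int main(void)
{
    dump_layout(false);
    dump_layout(true);
    return 0;
}

The printed ranges line up with the case bounds in kvm_get_msrs(): MSR_K7_EVNTSEL0/MSR_K7_PERFCTR0 + AMD64_NUM_COUNTERS - 1 for the legacy layout, and MSR_F15H_PERF_CTL0 + AMD64_NUM_COUNTERS_CORE * 2 - 1 for the interleaved PERFCORE layout.
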
The Intel-related code looks good to me. I'll leave the AMD part to Sandipan. Thanks.

Reviewed-by: Dapeng Mi <[email protected]>
