Hi Zhao, Sandipan and Dapeng,
FYI: I have removed your Reviewed-by tags since the previous version,
because the following code has been removed, as suggested by Zide:
+ /*
+ * The PMU virtualization is disabled by kvm.enable_pmu=N.
+ */
+ if (kvm_pmu_disabled) {
+ return;
+ }
Thank you very much!
Dongli Zhang
On 1/8/26 11:53 PM, Dongli Zhang wrote:
> QEMU uses the kvm_get_msrs() function to save Intel PMU registers from KVM
> and kvm_put_msrs() to restore them to KVM. However, there is no support for
> AMD PMU registers. Currently, pmu_version and num_pmu_gp_counters are
> initialized based on cpuid(0xa), which does not apply to AMD processors.
> For AMD CPUs, prior to PerfMonV2, the number of general-purpose counters
> is instead determined by the CPU family.
>
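
[ For context, a minimal user-space sketch (not QEMU code) of how the
  patch derives the AMD general-purpose counter count: default to 4
  (AMD64_NUM_COUNTERS) and bump to 6 (AMD64_NUM_COUNTERS_CORE) when
  CPUID leaf 0x80000001 ECX bit 23 (PERFCORE) is set. ]

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
      unsigned int eax, ebx, ecx, edx;
      unsigned int counters = 4;            /* AMD64_NUM_COUNTERS (K7) */

      /* CPUID_EXT3_PERFCORE is bit 23 of ECX in leaf 0x80000001. */
      if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx) &&
          (ecx & (1u << 23))) {
          counters = 6;                     /* AMD64_NUM_COUNTERS_CORE */
      }
      printf("general-purpose PMU counters: %u\n", counters);
      return 0;
  }
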
> To address this issue, we need to add support for AMD PMU registers.
> Without this support, the following problems can arise:
>
> 1. If the VM is reset (e.g., via QEMU system_reset or VM kdump/kexec) while
> running "perf top", the PMU registers are not disabled properly.
>
> 2. Despite x86_cpu_reset() resetting many registers to zero, kvm_put_msrs()
> does not handle AMD PMU registers, causing some PMU events to remain
> enabled in KVM.
>
> 3. The KVM kvm_pmc_speculative_in_use() function consistently returns true,
> preventing the reclamation of these events. Consequently, the
> kvm_pmc->perf_event remains active.
>
> 4. After a reboot, the VM kernel may report the following error:
>
> [ 0.092011] Performance Events: Fam17h+ core perfctr, Broken BIOS
> detected, complain to your hardware vendor.
> [ 0.092023] [Firmware Bug]: the BIOS has corrupted hw-PMU resources (MSR
> c0010200 is 530076)
>
> 5. In the worst case, the active kvm_pmc->perf_event may inject unknown
> NMIs randomly into the VM kernel:
>
> [...] Uhhuh. NMI received for unknown reason 30 on CPU 0.
>
> To resolve these issues, we propose resetting AMD PMU registers during the
> VM reset process.
>
> Signed-off-by: Dongli Zhang <[email protected]>
> ---
> Changed since v1:
> - Modify "MSR_K7_EVNTSEL0 + 3" and "MSR_K7_PERFCTR0 + 3" by using
> AMD64_NUM_COUNTERS (suggested by Sandipan Das).
> - Use "AMD64_NUM_COUNTERS_CORE * 2 - 1", not "MSR_F15H_PERF_CTL0 + 0xb".
> (suggested by Sandipan Das).
> - Switch back to "-pmu" instead of using a global "pmu-cap-disabled".
> - Don't initialize PMU info if kvm.enable_pmu=N.
> Changed since v2:
> - Remove 'static' from host_cpuid_vendorX.
> - Change has_pmu_version to pmu_version.
> - Use object_property_get_int() to get CPU family.
> - Use cpuid_find_entry() instead of cpu_x86_cpuid().
> - Send error log when host and guest are from different vendors.
> - Move "if (!cpu->enable_pmu)" to begin of function. Add comments to
> reminder developers.
> - Add support for Zhaoxin. Change is_same_vendor() to
> is_host_compat_vendor().
> - Didn't add Reviewed-by from Sandipan because the change isn't minor.
> Changed since v3:
> - Use host_cpu_vendor_fms() from Zhao's patch.
> - Check AMD directly to make the "compat" rule clear.
> - Add comment to MAX_GP_COUNTERS.
> - Skip PMU info initialization if kvm_pmu_disabled.
> Changed since v4:
> - Add Reviewed-by from Zhao and Sandipan.
> Changed since v6:
> - Add Reviewed-by from Dapeng Mi.
> Changed since v8:
> - Remove the usage of 'kvm_pmu_disabled' as suggested by Zide Chen.
> - Remove Reviewed-by from Zhao Liu, Sandipan Das and Dapeng Mi, as the
> usage of 'kvm_pmu_disabled' is removed.
>
> target/i386/cpu.h | 12 +++
> target/i386/kvm/kvm.c | 168 +++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 176 insertions(+), 4 deletions(-)
>
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 2bbc977d90..0960b98960 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -506,6 +506,14 @@ typedef enum X86Seg {
> #define MSR_CORE_PERF_GLOBAL_CTRL 0x38f
> #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x390
>
> +#define MSR_K7_EVNTSEL0 0xc0010000
> +#define MSR_K7_PERFCTR0 0xc0010004
> +#define MSR_F15H_PERF_CTL0 0xc0010200
> +#define MSR_F15H_PERF_CTR0 0xc0010201
> +
> +#define AMD64_NUM_COUNTERS 4
> +#define AMD64_NUM_COUNTERS_CORE 6
> +
> #define MSR_MC0_CTL 0x400
> #define MSR_MC0_STATUS 0x401
> #define MSR_MC0_ADDR 0x402
> @@ -1737,6 +1745,10 @@ typedef struct {
> #endif
>
> #define MAX_FIXED_COUNTERS 3
> +/*
> + * This formula is based on Intel's MSR layout (0x198 - 0x186 = 18
> + * slots). The current size also meets AMD's needs.
> + */
> #define MAX_GP_COUNTERS (MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0)
>
> #define NB_OPMASK_REGS 8
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 3b803c662d..fb7b672a9d 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -2096,7 +2096,7 @@ int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp)
> return 0;
> }
>
> -static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid)
> +static void kvm_init_pmu_info_intel(struct kvm_cpuid2 *cpuid)
> {
> struct kvm_cpuid_entry2 *c;
>
> @@ -2129,6 +2129,89 @@ static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid)
> }
> }
>
> +static void kvm_init_pmu_info_amd(struct kvm_cpuid2 *cpuid, X86CPU *cpu)
> +{
> + struct kvm_cpuid_entry2 *c;
> + int64_t family;
> +
> + family = object_property_get_int(OBJECT(cpu), "family", NULL);
> + if (family < 0) {
> + return;
> + }
> +
> + if (family < 6) {
> + error_report("AMD performance-monitoring is supported from "
> + "K7 and later");
> + return;
> + }
> +
> + pmu_version = 1;
> + num_pmu_gp_counters = AMD64_NUM_COUNTERS;
> +
> + c = cpuid_find_entry(cpuid, 0x80000001, 0);
> + if (!c) {
> + return;
> + }
> +
> + if (!(c->ecx & CPUID_EXT3_PERFCORE)) {
> + return;
> + }
> +
> + num_pmu_gp_counters = AMD64_NUM_COUNTERS_CORE;
> +}
> +
> +static bool is_host_compat_vendor(CPUX86State *env)
> +{
> + char host_vendor[CPUID_VENDOR_SZ + 1];
> +
> + host_cpu_vendor_fms(host_vendor, NULL, NULL, NULL);
> +
> + /*
> + * Intel and Zhaoxin are compatible.
> + */
> + if ((g_str_equal(host_vendor, CPUID_VENDOR_INTEL) ||
> + g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN1) ||
> + g_str_equal(host_vendor, CPUID_VENDOR_ZHAOXIN2)) &&
> + (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env))) {
> + return true;
> + }
> +
> + return g_str_equal(host_vendor, CPUID_VENDOR_AMD) &&
> + IS_AMD_CPU(env);
> +}
> +
> +static void kvm_init_pmu_info(struct kvm_cpuid2 *cpuid, X86CPU *cpu)
> +{
> + CPUX86State *env = &cpu->env;
> +
> + /*
> + * If KVM_CAP_PMU_CAPABILITY is not supported, there is no way to
> + * disable the AMD PMU virtualization.
> + *
> + * Assume the user is aware of this when !cpu->enable_pmu: the AMD
> + * PMU registers will not be reset, even though they remain
> + * available to the guest VM.
> + */
> + if (!cpu->enable_pmu) {
> + return;
> + }
> +
> + /*
> + * It is not supported to virtualize AMD PMU registers on Intel
> + * processors, nor to virtualize Intel PMU registers on AMD processors.
> + */
> + if (!is_host_compat_vendor(env)) {
> + error_report("host doesn't support requested feature: vPMU");
> + return;
> + }
> +
> + if (IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) {
> + kvm_init_pmu_info_intel(cpuid);
> + } else if (IS_AMD_CPU(env)) {
> + kvm_init_pmu_info_amd(cpuid, cpu);
> + }
> +}
> +
> int kvm_arch_init_vcpu(CPUState *cs)
> {
> struct {
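
[ For reference, a small sketch (not QEMU code) of the string that the
  is_host_compat_vendor() check above compares: CPUID leaf 0 packs the
  12-byte vendor string into EBX, EDX, ECX, in that order, and
  host_cpu_vendor_fms() is assumed to hand back the same string. ]

  #include <cpuid.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      unsigned int eax, ebx, ecx, edx;
      char vendor[13];

      if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
          return 1;
      }
      memcpy(vendor + 0, &ebx, 4);
      memcpy(vendor + 4, &edx, 4);
      memcpy(vendor + 8, &ecx, 4);
      vendor[12] = '\0';
      printf("host vendor: %s\n", vendor);  /* e.g. "AuthenticAMD" */
      return 0;
  }
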
> @@ -2319,7 +2402,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
> cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
> cpuid_data.cpuid.nent = cpuid_i;
>
> - kvm_init_pmu_info(&cpuid_data.cpuid);
> + kvm_init_pmu_info(&cpuid_data.cpuid, cpu);
>
> if (x86_cpu_family(env->cpuid_version) >= 6
> && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
> @@ -4094,7 +4177,7 @@ static int kvm_put_msrs(X86CPU *cpu, KvmPutState level)
> kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL,
> env->poll_control_msr);
> }
>
> - if (pmu_version > 0) {
> + if ((IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) && pmu_version > 0) {
> if (pmu_version > 1) {
> /* Stop the counter. */
> kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> @@ -4125,6 +4208,38 @@ static int kvm_put_msrs(X86CPU *cpu, KvmPutState level)
> env->msr_global_ctrl);
> }
> }
> +
> + if (IS_AMD_CPU(env) && pmu_version > 0) {
> + uint32_t sel_base = MSR_K7_EVNTSEL0;
> + uint32_t ctr_base = MSR_K7_PERFCTR0;
> + /*
> + * The address of the next selector or counter register is
> + * obtained by incrementing the address of the current selector
> + * or counter register by one.
> + */
> + uint32_t step = 1;
> +
> + /*
> + * When PERFCORE is enabled, AMD PMU uses a separate set of
> + * addresses for the selector and counter registers.
> + * Additionally, the address of the next selector or counter
> + * register is determined by incrementing the address of the
> + * current register by two.
> + */
> + if (num_pmu_gp_counters == AMD64_NUM_COUNTERS_CORE) {
> + sel_base = MSR_F15H_PERF_CTL0;
> + ctr_base = MSR_F15H_PERF_CTR0;
> + step = 2;
> + }
> +
> + for (i = 0; i < num_pmu_gp_counters; i++) {
> + kvm_msr_entry_add(cpu, ctr_base + i * step,
> + env->msr_gp_counters[i]);
> + kvm_msr_entry_add(cpu, sel_base + i * step,
> + env->msr_gp_evtsel[i]);
> + }
> + }
> +
> /*
> * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
> * only sync them to KVM on the first cpu
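
[ A quick sketch (not QEMU code) of the MSR addresses the PERFCORE
  branch of the loop above touches: with step = 2, the CTL/CTR pairs
  are interleaved, from CTL 0xc0010200 / CTR 0xc0010201 for counter 0
  up to CTL 0xc001020a / CTR 0xc001020b for counter 5. ]

  #include <stdio.h>

  int main(void)
  {
      unsigned int sel_base = 0xc0010200;   /* MSR_F15H_PERF_CTL0 */
      unsigned int ctr_base = 0xc0010201;   /* MSR_F15H_PERF_CTR0 */
      unsigned int step = 2, i;

      for (i = 0; i < 6; i++) {             /* AMD64_NUM_COUNTERS_CORE */
          printf("counter %u: CTL %#x, CTR %#x\n",
                 i, sel_base + i * step, ctr_base + i * step);
      }
      return 0;
  }
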
> @@ -4629,7 +4744,8 @@ static int kvm_get_msrs(X86CPU *cpu)
> if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) {
> kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
> }
> - if (pmu_version > 0) {
> +
> + if ((IS_INTEL_CPU(env) || IS_ZHAOXIN_CPU(env)) && pmu_version > 0) {
> if (pmu_version > 1) {
> kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
> kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
> @@ -4645,6 +4761,35 @@ static int kvm_get_msrs(X86CPU *cpu)
> }
> }
>
> + if (IS_AMD_CPU(env) && pmu_version > 0) {
> + uint32_t sel_base = MSR_K7_EVNTSEL0;
> + uint32_t ctr_base = MSR_K7_PERFCTR0;
> + /*
> + * The address of the next selector or counter register is
> + * obtained by incrementing the address of the current selector
> + * or counter register by one.
> + */
> + uint32_t step = 1;
> +
> + /*
> + * When PERFCORE is enabled, AMD PMU uses a separate set of
> + * addresses for the selector and counter registers.
> + * Additionally, the address of the next selector or counter
> + * register is determined by incrementing the address of the
> + * current register by two.
> + */
> + if (num_pmu_gp_counters == AMD64_NUM_COUNTERS_CORE) {
> + sel_base = MSR_F15H_PERF_CTL0;
> + ctr_base = MSR_F15H_PERF_CTR0;
> + step = 2;
> + }
> +
> + for (i = 0; i < num_pmu_gp_counters; i++) {
> + kvm_msr_entry_add(cpu, ctr_base + i * step, 0);
> + kvm_msr_entry_add(cpu, sel_base + i * step, 0);
> + }
> + }
> +
> if (env->mcg_cap) {
> kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
> kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
> @@ -4975,6 +5120,21 @@ static int kvm_get_msrs(X86CPU *cpu)
> case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
> env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
> break;
> + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL0 + AMD64_NUM_COUNTERS - 1:
> + env->msr_gp_evtsel[index - MSR_K7_EVNTSEL0] = msrs[i].data;
> + break;
> + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR0 + AMD64_NUM_COUNTERS - 1:
> + env->msr_gp_counters[index - MSR_K7_PERFCTR0] = msrs[i].data;
> + break;
> + case MSR_F15H_PERF_CTL0 ...
> + MSR_F15H_PERF_CTL0 + AMD64_NUM_COUNTERS_CORE * 2 - 1:
> + index = index - MSR_F15H_PERF_CTL0;
> + if (index & 0x1) {
> + env->msr_gp_counters[index / 2] = msrs[i].data;
> + } else {
> + env->msr_gp_evtsel[index / 2] = msrs[i].data;
> + }
> + break;
> case HV_X64_MSR_HYPERCALL:
> env->msr_hv_hypercall = msrs[i].data;
> break;
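
[ And a matching sketch (not QEMU code) of the readback decode in the
  MSR_F15H_PERF_CTL0 case above: even offsets map to msr_gp_evtsel[],
  odd offsets to msr_gp_counters[], and index / 2 recovers the counter
  number, mirroring the indexing used on the put side. ]

  #include <stdio.h>

  int main(void)
  {
      unsigned int msr;

      for (msr = 0xc0010200; msr <= 0xc001020b; msr++) {
          unsigned int index = msr - 0xc0010200; /* MSR_F15H_PERF_CTL0 */
          printf("MSR %#x -> %s[%u]\n", msr,
                 (index & 0x1) ? "msr_gp_counters" : "msr_gp_evtsel",
                 index / 2);
      }
      return 0;
  }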