Since commit 7d5905dc14a ("x86 / CPU: Always show current CPU frequency
in /proc/cpuinfo"), the APERF/MPERF MSRs are used to report the current
CPU frequency. A guest kernel, however, always reports a fixed VCPU
frequency in /proc/cpuinfo, which may confuse users, especially when
turbo is enabled on the host.

Emulate the guest APERF/MPERF capability by deriving their values from
the host MSRs: the host deltas accumulated while the vCPU runs in guest
mode are added to the guest's virtual counters.

Co-developed-by: Li RongQing <[email protected]>
Signed-off-by: Li RongQing <[email protected]>
Reviewed-by: Chai Wen <[email protected]>
Reviewed-by: Jia Lina <[email protected]>
Signed-off-by: Like Xu <[email protected]>
---
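For context, and not part of the patch itself: since 7d5905dc14a the
guest kernel estimates its effective frequency from APERF/MPERF deltas
over a short sample window, roughly along the lines of
arch/x86/kernel/cpu/aperfmperf.c. A minimal sketch (variable names are
illustrative):

    u64 aperf0, mperf0, aperf1, mperf1, freq_khz;

    rdmsrl(MSR_IA32_APERF, aperf0);
    rdmsrl(MSR_IA32_MPERF, mperf0);
    msleep(10);                             /* sample window */
    rdmsrl(MSR_IA32_APERF, aperf1);
    rdmsrl(MSR_IA32_MPERF, mperf1);

    /* effective kHz = base kHz * dAPERF / dMPERF */
    freq_khz = div64_u64(cpu_khz * (aperf1 - aperf0),
                         mperf1 - mperf0);

With this patch those reads hit the emulated MSRs, whose deltas track
the host counters accumulated across guest-mode sections, so the ratio
reflects the real (e.g. turbo) pCPU frequency instead of a constant.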
 arch/x86/include/asm/kvm_host.h | 12 ++++++
 arch/x86/kvm/cpuid.c            |  8 +++-
 arch/x86/kvm/x86.c              | 76 ++++++++++++++++++++++++++++++++-
 3 files changed, 94 insertions(+), 2 deletions(-)
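
Also for context (not part of the patch): get_amperf_delta() below
computes the modular distance between two samples of a free-running
64-bit counter. A self-contained userspace check of that arithmetic,
using a hypothetical helper that mirrors the logic:

    #include <stdint.h>
    #include <stdio.h>

    /* exit - enter modulo 2^64, spelling out the wrapped case */
    static uint64_t amperf_delta(uint64_t enter, uint64_t exit)
    {
            return exit >= enter ? exit - enter
                                 : (UINT64_MAX - enter) + exit + 1;
    }

    int main(void)
    {
            /* no wrap: plain difference; expect 1000 */
            printf("%llu\n",
                   (unsigned long long)amperf_delta(5000, 6000));
            /* wrap: counter rolled over near UINT64_MAX; expect 20 */
            printf("%llu\n",
                   (unsigned long long)amperf_delta(UINT64_MAX - 9, 10));
            return 0;
    }

Note the "+ 1" in the wrapped case: "ULONG_MAX - enter + exit" alone
undercounts the true delta by one.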

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f852ee350beb..c48b9a0a086e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -539,6 +539,16 @@ struct kvm_vcpu_hv_stimer {
        bool msg_pending;
 };
 
+/* vCPU thermal and power context */
+struct kvm_vcpu_hwp {
+       /* Hardware Coordination Feedback Capability (Presence of APERF/MPERF) */
+       bool hw_coord_fb_cap;
+       /* MPERF increases with a fixed frequency */
+       u64 mperf;
+       /* APERF increases with the current/actual frequency */
+       u64 aperf;
+};
+
 /* Hyper-V synthetic interrupt controller (SynIC)*/
 struct kvm_vcpu_hv_synic {
        u64 version;
@@ -829,6 +839,8 @@ struct kvm_vcpu_arch {
 
        /* AMD MSRC001_0015 Hardware Configuration */
        u64 msr_hwcr;
+
+       struct kvm_vcpu_hwp hwp;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a294f9747aa..7057809e7cfd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -78,6 +78,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
                        apic->lapic_timer.timer_mode_mask = 1 << 17;
        }
 
+       best = kvm_find_cpuid_entry(vcpu, 0x6, 0);
+       if (best && best->function == 0x6 &&
+           boot_cpu_has(X86_FEATURE_APERFMPERF) && (best->ecx & 0x1))
+               vcpu->arch.hwp.hw_coord_fb_cap = true;
+
        best = kvm_find_cpuid_entry(vcpu, 7, 0);
        if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
                cpuid_entry_change(best, X86_FEATURE_OSPKE,
@@ -561,7 +566,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
        case 6: /* Thermal management */
                entry->eax = 0x4; /* allow ARAT */
                entry->ebx = 0;
-               entry->ecx = 0;
+               /* allow aperf/mperf to report the true VCPU frequency. */
+               entry->ecx = boot_cpu_has(X86_FEATURE_APERFMPERF) ? 0x1 : 0;
                entry->edx = 0;
                break;
        /* function 7 has additional index. */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00c88c2f34e4..d220d9cc904a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3056,6 +3056,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vcpu->arch.msr_misc_features_enables = data;
                break;
+       case MSR_IA32_MPERF:
+               if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+                       return 1;
+               vcpu->arch.hwp.mperf = 0;
+               return 0;
+       case MSR_IA32_APERF:
+               if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+                       return 1;
+               vcpu->arch.hwp.aperf = 0;
+               return 0;
        default:
                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
                        return xen_hvm_config(vcpu, data);
@@ -3323,6 +3333,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_K7_HWCR:
                msr_info->data = vcpu->arch.msr_hwcr;
                break;
+       case MSR_IA32_MPERF:
+               if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+                       return 1;
+               msr_info->data = vcpu->arch.hwp.mperf;
+               break;
+       case MSR_IA32_APERF:
+               if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+                       return 1;
+               msr_info->data = vcpu->arch.hwp.aperf;
+               break;
        default:
                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
                        return kvm_pmu_get_msr(vcpu, msr_info);
@@ -8300,6 +8320,50 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
 
+static inline void get_host_amperf(u64 *mperf, u64 *aperf)
+{
+       rdmsrl(MSR_IA32_MPERF, *mperf);
+       rdmsrl(MSR_IA32_APERF, *aperf);
+}
+
+static inline u64 get_amperf_delta(u64 enter, u64 exit)
+{
+       return (exit >= enter) ? (exit - enter) : (ULONG_MAX - enter + exit + 1);
+}
+
+static inline void vcpu_update_amperf(struct kvm_vcpu *vcpu, u64 adelta, u64 mdelta)
+{
+       u64 aperf_left, mperf_left, delta, tmp;
+
+       aperf_left = ULONG_MAX - vcpu->arch.hwp.aperf;
+       mperf_left = ULONG_MAX - vcpu->arch.hwp.mperf;
+
+       /* fast path when neither MSR overflows */
+       if (adelta <= aperf_left && mdelta <= mperf_left) {
+               vcpu->arch.hwp.aperf += adelta;
+               vcpu->arch.hwp.mperf += mdelta;
+               return;
+       }
+
+       /* when either MSR overflows, both MSRs are reset to zero and continue to increment. */
+       delta = min(adelta, mdelta);
+       if (delta > aperf_left || delta > mperf_left) {
+               tmp = max(vcpu->arch.hwp.aperf, vcpu->arch.hwp.mperf);
+               tmp = delta - (ULONG_MAX - tmp) - 1;
+               vcpu->arch.hwp.aperf = tmp + adelta - delta;
+               vcpu->arch.hwp.mperf = tmp + mdelta - delta;
+               return;
+       }
+
+       if (mdelta > mperf_left) {
+               vcpu->arch.hwp.mperf = mdelta - mperf_left - 1;
+               vcpu->arch.hwp.aperf = 0;
+       } else {
+               vcpu->arch.hwp.mperf = 0;
+               vcpu->arch.hwp.aperf = adelta - aperf_left - 1;
+       }
+}
+
 /*
  * Returns 1 to let vcpu_run() continue the guest execution loop without
  * exiting to the userspace.  Otherwise, the value will be returned to the
@@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                dm_request_for_irq_injection(vcpu) &&
                kvm_cpu_accept_dm_intr(vcpu);
        fastpath_t exit_fastpath;
-
+       u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
        bool req_immediate_exit = false;
 
        if (kvm_request_pending(vcpu)) {
@@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
+       if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
+               get_host_amperf(&enter_mperf, &enter_aperf);
+
        exit_fastpath = kvm_x86_ops.run(vcpu);
 
+       if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
+               get_host_amperf(&exit_mperf, &exit_aperf);
+               vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
+                       get_amperf_delta(enter_mperf, exit_mperf));
+       }
+
        /*
         * Do this here before restoring debug registers on the host.  And
         * since we do this before handling the vmexit, a DR access vmexit
@@ -9482,6 +9555,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
        vcpu->arch.pending_external_vector = -1;
        vcpu->arch.preempted_in_kernel = false;
+       vcpu->arch.hwp.hw_coord_fb_cap = false;
 
        kvm_hv_vcpu_init(vcpu);
 
-- 
2.21.3
