On 3/24/2025 11:01 PM, Mingwei Zhang wrote:
> From: Dapeng Mi <[email protected]>
>
> Check if rdpmc can be intercepted for mediated vPMU. Simply speaking,
> if guest own all PMU counters in mediated vPMU, then rdpmc interception
> should be disabled to mitigate the performance impact, otherwise rdpmc
> has to be intercepted to avoid guest obtain host counter's data via
> rdpmc instruction.
>
> Co-developed-by: Mingwei Zhang <[email protected]>
> Signed-off-by: Mingwei Zhang <[email protected]>
> Co-developed-by: Sandipan Das <[email protected]>
> Signed-off-by: Sandipan Das <[email protected]>
> Signed-off-by: Dapeng Mi <[email protected]>
> ---
> arch/x86/include/asm/msr-index.h | 1 +
> arch/x86/kvm/pmu.c | 34 ++++++++++++++++++++++++++++++++
> arch/x86/kvm/pmu.h | 19 ++++++++++++++++++
> arch/x86/kvm/svm/pmu.c | 14 ++++++++++++-
> arch/x86/kvm/vmx/pmu_intel.c | 18 ++++++++---------
> 5 files changed, 76 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/include/asm/msr-index.h
> b/arch/x86/include/asm/msr-index.h
> index ca70846ffd55..337f4b0a2998 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -312,6 +312,7 @@
> #define PERF_CAP_PEBS_FORMAT 0xf00
> #define PERF_CAP_FW_WRITES BIT_ULL(13)
> #define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
> +#define PERF_CAP_PERF_METRICS BIT_ULL(15)
> #define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG
> | \
> PERF_CAP_PEBS_FORMAT |
> PERF_CAP_PEBS_BASELINE)
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> index 92c742ead663..6ad71752be4b 100644
> --- a/arch/x86/kvm/pmu.c
> +++ b/arch/x86/kvm/pmu.c
> @@ -604,6 +604,40 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx,
> u64 *data)
> return 0;
> }
>
> +inline bool kvm_rdpmc_in_guest(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> +
> + if (!kvm_mediated_pmu_enabled(vcpu))
> + return false;
> +
> + /*
> + * VMware allows access to these Pseduo-PMCs even when read via RDPMC
> + * in Ring3 when CR4.PCE=0.
> + */
> + if (enable_vmware_backdoor)
> + return false;
> +
> + /*
> + * FIXME: In theory, perf metrics is always combined with fixed
> + * counter 3. it's fair enough to compare the guest and host
> + * fixed counter number and don't need to check perf metrics
> + * explicitly. However kvm_pmu_cap.num_counters_fixed is limited
> + * KVM_MAX_NR_FIXED_COUNTERS (3) as fixed counter 3 is not
> + * supported now. perf metrics is still needed to be checked
> + * explicitly here. Once fixed counter 3 is supported, the perf
> + * metrics checking can be removed.
> + */
> + return pmu->nr_arch_gp_counters == kvm_pmu_cap.num_counters_gp &&
> + pmu->nr_arch_fixed_counters == kvm_pmu_cap.num_counters_fixed &&
> + vcpu_has_perf_metrics(vcpu) == kvm_host_has_perf_metrics() &&
> + pmu->counter_bitmask[KVM_PMC_GP] ==
> + (BIT_ULL(kvm_pmu_cap.bit_width_gp) - 1) &&
> + pmu->counter_bitmask[KVM_PMC_FIXED] ==
> + (BIT_ULL(kvm_pmu_cap.bit_width_fixed) - 1);
> +}
> +EXPORT_SYMBOL_GPL(kvm_rdpmc_in_guest);
> +
> void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
> {
> if (lapic_in_kernel(vcpu)) {
> diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
> index e1d0096f249b..509c995b7871 100644
> --- a/arch/x86/kvm/pmu.h
> +++ b/arch/x86/kvm/pmu.h
> @@ -271,6 +271,24 @@ static inline bool pmc_is_globally_enabled(struct
> kvm_pmc *pmc)
> return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
> }
>
> +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
> +{
> + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
> + return 0;
> +
> + return vcpu->arch.perf_capabilities;
> +}
> +
> +static inline bool vcpu_has_perf_metrics(struct kvm_vcpu *vcpu)
> +{
> + return !!(vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PERF_METRICS);
> +}
> +
> +static inline bool kvm_host_has_perf_metrics(void)
> +{
> + return !!(kvm_host.perf_capabilities & PERF_CAP_PERF_METRICS);
> +}
> +
> void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
> void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
> int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
> @@ -287,6 +305,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64
> eventsel);
> bool vcpu_pmu_can_enable(struct kvm_vcpu *vcpu);
>
> bool is_vmware_backdoor_pmc(u32 pmc_idx);
> +bool kvm_rdpmc_in_guest(struct kvm_vcpu *vcpu);
>
> extern struct kvm_pmu_ops intel_pmu_ops;
> extern struct kvm_pmu_ops amd_pmu_ops;
> diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
> index c8b9fd9b5350..153972e944eb 100644
> --- a/arch/x86/kvm/svm/pmu.c
> +++ b/arch/x86/kvm/svm/pmu.c
> @@ -173,7 +173,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct
> msr_data *msr_info)
> return 1;
> }
>
> -static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> +static void __amd_pmu_refresh(struct kvm_vcpu *vcpu)
> {
> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> union cpuid_0x80000022_ebx ebx;
> @@ -212,6 +212,18 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
> }
>
> +static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> +
> + __amd_pmu_refresh(vcpu);
> +
> + if (kvm_rdpmc_in_guest(vcpu))
> + svm_clr_intercept(svm, INTERCEPT_RDPMC);
> + else
> + svm_set_intercept(svm, INTERCEPT_RDPMC);
> +}
> +
After putting kprobes on kvm_pmu_rdpmc(), I noticed that RDPMC instructions were
getting intercepted for the secondary vCPUs. This happens because when secondary
vCPUs come up, kvm_vcpu_reset() gets called after guest CPUID has been updated.
While RDPMC interception is initially disabled in the kvm_pmu_refresh() path, it
gets re-enabled in the kvm_vcpu_reset() path as svm_vcpu_reset() calls
init_vmcb().
We should consider adding the following change to avoid that.
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 6f9142063cc4..1c9c183092f3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1354,7 +1354,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_set_intercept(svm, INTERCEPT_SMI);
svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
- svm_set_intercept(svm, INTERCEPT_RDPMC);
svm_set_intercept(svm, INTERCEPT_CPUID);
svm_set_intercept(svm, INTERCEPT_INVD);
svm_set_intercept(svm, INTERCEPT_INVLPG);
> static void amd_pmu_init(struct kvm_vcpu *vcpu)
> {
> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
> index fc017e9a6a0c..2a5f79206b02 100644
> --- a/arch/x86/kvm/vmx/pmu_intel.c
> +++ b/arch/x86/kvm/vmx/pmu_intel.c
> @@ -108,14 +108,6 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct
> kvm_vcpu *vcpu,
> return &counters[array_index_nospec(idx, num_counters)];
> }
>
> -static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
> -{
> - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
> - return 0;
> -
> - return vcpu->arch.perf_capabilities;
> -}
> -
> static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
> {
> return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0;
> @@ -456,7 +448,7 @@ static void intel_pmu_enable_fixed_counter_bits(struct
> kvm_pmu *pmu, u64 bits)
> pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
> }
>
> -static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> +static void __intel_pmu_refresh(struct kvm_vcpu *vcpu)
> {
> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
> @@ -564,6 +556,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> }
> }
>
> +static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
> +{
> + __intel_pmu_refresh(vcpu);
> +
> + exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING,
> + !kvm_rdpmc_in_guest(vcpu));
> +}
> +
> static void intel_pmu_init(struct kvm_vcpu *vcpu)
> {
> int i;