XFD allows the kernel to enable a feature state in XCR0 and to receive a #NM trap when a task uses instructions accessing that state. The kernel defines "struct fpu.state_mask" to indicate which xstates are saved in the buffer, and interacts with the XFD hardware via a simple conversion from that mask when needed. Once first use of a dynamic feature is detected via #NM, "state_mask" is expanded and "state_ptr" is dynamically allocated to hold the whole state. Once a state has left its INIT state, the corresponding XFD bit must no longer be armed.
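
As an illustration of that conversion (xfd_from_state_mask() below is a hypothetical helper used only for this description, not code added by this patch), the IA32_XFD value for a task can be thought of as:

	/*
	 * Hypothetical sketch: arm XFD for every dynamic feature that is
	 * not yet tracked in the task's state_mask; once first use is
	 * detected and state_mask/state_ptr are expanded, the bit is no
	 * longer armed.
	 */
	static inline u64 xfd_from_state_mask(u64 state_mask)
	{
		return xfeatures_mask_user_dynamic & ~state_mask;
	}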
In KVM, "guest_fpu" serves for any guest task working on this vcpu during vmexit and vmenter. We provide a pre-allocated guest_fpu space and entire "guest_fpu.state_mask" to avoid each dynamic features detection on each vcpu task. Meanwhile, to ensure correctly xsaves/xrstors guest state, set IA32_XFD as zero during vmexit and vmenter. For "current->thread.fpu", since host and guest probably have different state and mask, it also need be switched to the right context when fpu load and put. Signed-off-by: Jing Liu <jing2....@linux.intel.com> --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kernel/fpu/init.c | 1 + arch/x86/kernel/fpu/xstate.c | 2 + arch/x86/kvm/vmx/vmx.c | 76 +++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.h | 1 + arch/x86/kvm/x86.c | 69 +++++++++++++++++++++++++----- 6 files changed, 141 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e5f33a0d0e2..6dedf3d22659 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1203,6 +1203,9 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); + void (*xfd_load)(struct kvm_vcpu *vcpu); + void (*xfd_put)(struct kvm_vcpu *vcpu); + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7e0c68043ce3..fbb761fc13ec 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -145,6 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_kernel_xstate_min_size); * can be dynamically expanded to include some states up to this size. */ unsigned int fpu_kernel_xstate_max_size; +EXPORT_SYMBOL_GPL(fpu_kernel_xstate_max_size); /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 080f3be9a5e6..9c471a0364e2 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -77,12 +77,14 @@ static struct xfeature_capflag_info xfeature_capflags[] __initdata = { * XSAVE buffer, both supervisor and user xstates. */ u64 xfeatures_mask_all __read_mostly; +EXPORT_SYMBOL_GPL(xfeatures_mask_all); /* * This represents user xstates, a subset of xfeatures_mask_all, saved in a * dynamic kernel XSAVE buffer. */ u64 xfeatures_mask_user_dynamic __read_mostly; +EXPORT_SYMBOL_GPL(xfeatures_mask_user_dynamic); static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7fa54e78c45c..be3cc0f3ec6d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1167,6 +1167,75 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } +static void vmx_xfd_load(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (guest_cpuid_has(vcpu, X86_FEATURE_XFD)) { + vmx->host_ia32_xfd = xfirstuse_not_detected(vcpu->arch.user_fpu); + /* + * Keep IA32_XFD as zero in hypervisor. + * Guest non-zero IA32_XFD is restored until kvm_x86_ops.run + */ + if (vmx->host_ia32_xfd) + wrmsrl(MSR_IA32_XFD, 0); + } +} + +static void vmx_xfd_put(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (guest_cpuid_has(vcpu, X86_FEATURE_XFD)) { + /* IA32_XFD register is kept as zero in hypervisor. 
*/ + if (vmx->host_ia32_xfd) + wrmsrl(MSR_IA32_XFD, vmx->host_ia32_xfd); + /* User (qemu) IA32_XFD_ERR should be zero. */ + if (vmx->msr_ia32_xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, 0); + } +} + +/* Load guest XFD MSRs before entering. */ +static void xfd_guest_enter(struct vcpu_vmx *vmx) +{ + if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_XFD)) { + if (vmx->msr_ia32_xfd) + wrmsrl(MSR_IA32_XFD, vmx->msr_ia32_xfd); + /* + * We do not rdmsr here since in most cases + * IA32_XFD_ERR is zero. One rare exception is that, + * this vmenter follows a vmexit with non-zero + * MSR_IA32_XFD_ERR and it doesn't change during + * this interval. + * + * So just simply load the non-zero guest value. + */ + if (vmx->msr_ia32_xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, vmx->msr_ia32_xfd_err); + } +} + +/* + * Save guest XFD MSRs once vmexit since the registers may be changed + * when control is transferred out of KVM, e.g. preemption. + */ +static void xfd_guest_exit(struct vcpu_vmx *vmx) +{ + if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_XFD)) { + rdmsrl(MSR_IA32_XFD, vmx->msr_ia32_xfd); + rdmsrl(MSR_IA32_XFD_ERR, vmx->msr_ia32_xfd_err); + /* + * Clear the MSR_IA32_XFD to ensure correctly protect guest + * fpu context in hypervisor. + * No need to reset MSR_IA32_XFD_ERR in hypervisor since it + * has no impact on others. + */ + if (vmx->msr_ia32_xfd) + wrmsrl(MSR_IA32_XFD, 0); + } +} + void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base) { @@ -6735,6 +6804,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_load_guest_xsave_state(vcpu); + xfd_guest_enter(vmx); + pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); @@ -6804,6 +6875,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_exit(vmx); + xfd_guest_exit(vmx); + kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; @@ -7644,6 +7717,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, + .xfd_load = vmx_xfd_load, + .xfd_put = vmx_xfd_put, + .update_exception_bitmap = update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d487f5a53a08..9a9ea37a29b1 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -288,6 +288,7 @@ struct vcpu_vmx { } shadow_msr_intercept; /* eXtended Feature Disabling (XFD) MSRs */ + u64 host_ia32_xfd; u64 msr_ia32_xfd; u64 msr_ia32_xfd_err; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ca8b1e58afa..15908bc65d1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9220,22 +9220,44 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) static void kvm_save_current_fpu(struct fpu *fpu) { - struct fpu *src_fpu = ¤t->thread.fpu; + struct fpu *cur_fpu = ¤t->thread.fpu; + fpu->state_ptr = cur_fpu->state_ptr; + fpu->state_mask = cur_fpu->state_mask; /* * If the target FPU state is not resident in the CPU registers, just * memcpy() from current, else save CPU state directly to the target. */ if (test_thread_flag(TIF_NEED_FPU_LOAD)) { - memcpy(&fpu->state, &src_fpu->state, - fpu_kernel_xstate_min_size); + /* + * No need to copy if dynamic feature is used, because + * they just simply point to the same recent state. 
+ */ + if (!cur_fpu->state_ptr) + memcpy(&fpu->state, &cur_fpu->state, + fpu_kernel_xstate_min_size); } else { - if (fpu->state_mask != src_fpu->state_mask) - fpu->state_mask = src_fpu->state_mask; copy_fpregs_to_fpstate(fpu); } } +/* + * Swap fpu context to next fpu role. + * + * "current" fpu acts two roles: user contexts and guest contexts. + * Swap "current" fpu to next role to ensure correctly handle + * dynamic state buffers, e.g. in preemption case. + */ +static void kvm_load_next_fpu(struct fpu *next_fpu, u64 mask) +{ + struct fpu *cur_fpu = ¤t->thread.fpu; + + cur_fpu->state_ptr = next_fpu->state_ptr; + cur_fpu->state_mask = next_fpu->state_mask; + + __copy_kernel_to_fpregs(__xstate(next_fpu), mask); +} + /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { @@ -9243,9 +9265,11 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_save_current_fpu(vcpu->arch.user_fpu); + if (static_cpu_has(X86_FEATURE_XFD) && kvm_x86_ops.xfd_load) + kvm_x86_ops.xfd_load(vcpu); + /* PKRU is separately restored in kvm_x86_ops.run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); + kvm_load_next_fpu(vcpu->arch.guest_fpu, ~XFEATURE_MASK_PKRU); fpregs_mark_activate(); fpregs_unlock(); @@ -9260,7 +9284,10 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) kvm_save_current_fpu(vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(vcpu->arch.user_fpu); + if (static_cpu_has(X86_FEATURE_XFD) && kvm_x86_ops.xfd_put) + kvm_x86_ops.xfd_put(vcpu); + + kvm_load_next_fpu(vcpu->arch.user_fpu, -1); fpregs_mark_activate(); fpregs_unlock(); @@ -9840,11 +9867,13 @@ static int sync_regs(struct kvm_vcpu *vcpu) static void fx_init(struct kvm_vcpu *vcpu) { + struct xregs_state *xsave; + + xsave = __xsave(vcpu->arch.guest_fpu); fpstate_init(vcpu->arch.guest_fpu); if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = + xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - /* * Ensure guest xcr0 is valid for loading */ @@ -9920,6 +9949,21 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) pr_err("kvm: failed to allocate vcpu's fpu\n"); goto free_user_fpu; } + + vcpu->arch.guest_fpu->state_mask = xfeatures_mask_all & + ~xfeatures_mask_user_dynamic; + + /* If have dynamic features, initialize full context. 
*/ + if (xfeatures_mask_user_dynamic) { + vcpu->arch.guest_fpu->state_ptr = + kmalloc(fpu_kernel_xstate_max_size, GFP_KERNEL); + if (!vcpu->arch.guest_fpu->state_ptr) + goto free_guest_fpu; + + vcpu->arch.guest_fpu->state_mask |= + xfeatures_mask_user_dynamic; + } + fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); @@ -9936,7 +9980,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) r = kvm_x86_ops.vcpu_create(vcpu); if (r) - goto free_guest_fpu; + goto free_guest_fpu_exp; vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; @@ -9947,6 +9991,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu_put(vcpu); return 0; +free_guest_fpu_exp: + kfree(vcpu->arch.guest_fpu->state_ptr); free_guest_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: @@ -10002,6 +10048,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kfree(vcpu->arch.guest_fpu->state_ptr); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); kvm_hv_vcpu_uninit(vcpu); -- 2.18.4