Add and use a new API for mapping a private pfn from guest_memfd into the
TDP MMU from TDX's post-populate hook instead of partially open-coding the
functionality into the TDX code.  Sharing code with the pre-fault path
sounded good on paper, but it's fatally flawed: simulating a fault loses
the pfn, and calling back into gmem to re-retrieve the pfn creates locking
problems, e.g. kvm_gmem_populate() already holds the gmem invalidation
lock.
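
For reference, the flow being eliminated looks roughly like this (a
sketch, with intermediate callees elided):

	kvm_gmem_populate()              /* takes the gmem invalidate_lock */
	  tdx_gmem_post_populate()
	    kvm_tdp_map_page()           /* simulates a fault; the pfn is lost */
	      ...
	        kvm_gmem_get_pfn()       /* re-enters gmem while the
	                                    invalidate_lock is already held */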

Providing a dedicated API also allows removing several MMU exports that
ideally would not be exposed outside of the MMU, let alone to vendor code.
On that topic, opportunistically drop the kvm_mmu_load() export.  Leave
kvm_tdp_mmu_gpa_is_mapped() alone for now; the entire commit that added
it will be removed in the near future.

Gate the API on CONFIG_KVM_GUEST_MEMFD=y as private memory _must_ be backed
by guest_memfd.  Add a lockdep-only assert that the incoming pfn is
indeed backed by guest_memfd, and that the gmem instance's invalidate lock
is held (which, combined with slots_lock being held, obviates the need to
check for a stale "fault").
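
For illustration, a (hypothetical) post-populate hook using the new API
would look like the sketch below; kvm_gmem_populate() invokes the hook
with slots_lock held and with the gmem invalidate_lock taken, so the
lockdep assertions are naturally satisfied:

	/* Hypothetical hook, modeled on tdx_gmem_post_populate(). */
	static int vendor_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
					void __user *src, int order, void *_arg)
	{
		struct kvm_vcpu *vcpu = _arg;	/* vCPU stashed by the caller */
		int ret;

		/* Map the exact pfn gmem handed over; no re-faulting needed. */
		ret = kvm_tdp_mmu_map_private_pfn(vcpu, gfn, pfn);
		if (ret < 0)
			return ret;

		/* ...vendor-specific initialization of the mapped page... */
		return 0;
	}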

Cc: Michael Roth <[email protected]>
Cc: Yan Zhao <[email protected]>
Cc: Ira Weiny <[email protected]>
Cc: Vishal Annapurve <[email protected]>
Cc: Rick Edgecombe <[email protected]>
Reviewed-by: Rick Edgecombe <[email protected]>
Reviewed-by: Kai Huang <[email protected]>
Link: https://lore.kernel.org/all/[email protected]
Signed-off-by: Sean Christopherson <[email protected]>
---
 arch/x86/kvm/mmu.h     |  1 +
 arch/x86/kvm/mmu/mmu.c | 81 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.c | 10 ++----
 3 files changed, 84 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f63074048ec6..2f108e381959 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -259,6 +259,7 @@ extern bool tdp_mmu_enabled;
 
 bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
 int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
 
 static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 18d69d48bc55..bad0480bdb0d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5014,6 +5014,86 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
        return min(range->size, end - range->gpa);
 }
 
+#ifdef CONFIG_KVM_GUEST_MEMFD
+static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_PROVE_LOCKING
+       if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
+           WARN_ON_ONCE(!slot->gmem.file) ||
+           WARN_ON_ONCE(!file_count(slot->gmem.file)))
+               return;
+
+       lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
+#endif
+}
+
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+       struct kvm_page_fault fault = {
+               .addr = gfn_to_gpa(gfn),
+               .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
+               .prefetch = true,
+               .is_tdp = true,
+               .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
+
+               .max_level = PG_LEVEL_4K,
+               .req_level = PG_LEVEL_4K,
+               .goal_level = PG_LEVEL_4K,
+               .is_private = true,
+
+               .gfn = gfn,
+               .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+               .pfn = pfn,
+               .map_writable = true,
+       };
+       struct kvm *kvm = vcpu->kvm;
+       int r;
+
+       lockdep_assert_held(&kvm->slots_lock);
+
+       /*
+        * Mapping a pre-determined private pfn is intended only for use when
+        * populating a guest_memfd instance.  Assert that the slot is backed
+        * by guest_memfd and that the gmem instance's invalidate_lock is held.
+        */
+       kvm_assert_gmem_invalidate_lock_held(fault.slot);
+
+       if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
+               return -EIO;
+
+       if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn))
+               return -EPERM;
+
+       r = kvm_mmu_reload(vcpu);
+       if (r)
+               return r;
+
+       r = mmu_topup_memory_caches(vcpu, false);
+       if (r)
+               return r;
+
+       do {
+               if (signal_pending(current))
+                       return -EINTR;
+
+               if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu))
+                       return -EIO;
+
+               cond_resched();
+
+               guard(read_lock)(&kvm->mmu_lock);
+
+               r = kvm_tdp_mmu_map(vcpu, &fault);
+       } while (r == RET_PF_RETRY);
+
+       if (r != RET_PF_FIXED)
+               return -EIO;
+
+       return 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
+#endif /* CONFIG_KVM_GUEST_MEMFD */
+
 static void nonpaging_init_context(struct kvm_mmu *context)
 {
        context->page_fault = nonpaging_page_fault;
@@ -5997,7 +6077,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 out:
        return r;
 }
-EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 4c3014befe9f..29f344af4cc2 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3157,15 +3157,12 @@ struct tdx_gmem_post_populate_arg {
 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
                                  void __user *src, int order, void *_arg)
 {
-       u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
-       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
        struct tdx_gmem_post_populate_arg *arg = _arg;
-       struct kvm_vcpu *vcpu = arg->vcpu;
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       u64 err, entry, level_state;
        gpa_t gpa = gfn_to_gpa(gfn);
-       u8 level = PG_LEVEL_4K;
        struct page *src_page;
        int ret, i;
-       u64 err, entry, level_state;
 
        /*
         * Get the source page if it has been faulted in. Return failure if the
@@ -3177,7 +3174,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
        if (ret != 1)
                return -ENOMEM;
 
-       ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+       ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
        if (ret < 0)
                goto out;
 
@@ -3240,7 +3237,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
            !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
                return -EINVAL;
 
-       kvm_mmu_reload(vcpu);
        ret = 0;
        while (region.nr_pages) {
                if (signal_pending(current)) {
-- 
2.51.1.930.gacf6e81ea2-goog

