On 1/27/26 10:57, Stanislav Kinsburskii wrote:
On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
On 1/26/26 10:15, Stanislav Kinsburskii wrote:
On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
On 1/20/26 17:53, Stanislav Kinsburskii wrote:
On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
From: Mukesh Rathor <[email protected]>
Upon guest access, in case of a missing mmio mapping, the hypervisor
generates an unmapped gpa intercept. In this path, look up the PCI
resource pfn for the guest gpa and ask the hypervisor to map it via
hypercall. The PCI resource pfn is maintained by the VFIO driver and
is obtained via a fixup_user_fault() call (similar to KVM).
Signed-off-by: Mukesh Rathor <[email protected]>
---
drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
1 file changed, 115 insertions(+)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 03f3aa9f5541..4c8bc7cd0888 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -56,6 +56,14 @@ struct hv_stats_page {
};
} __packed;
+bool hv_nofull_mmio;	/* don't map entire mmio region upon fault */
+static int __init setup_hv_nofull_mmio(char *str)
+{
+	hv_nofull_mmio = true;
+	return 1;	/* option handled, don't pass it on to init */
+}
+__setup("hv_nofull_mmio", setup_hv_nofull_mmio);
+
struct mshv_root mshv_root;
enum hv_scheduler_type hv_scheduler_type;
@@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
}
#ifdef CONFIG_X86_64
+
+/*
+ * Check if uaddr is for an mmio range. If yes, return 0 with *mmio_pfnp
+ * filled in, else just return -errno.
+ */
+static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
+				       u64 *mmio_pfnp)
+{
+	struct vm_area_struct *vma;
+	bool is_mmio;
+	u64 uaddr;
+	struct mshv_mem_region *mreg;
+	struct follow_pfnmap_args pfnmap_args;
+	int rc = -EINVAL;
+
+	/*
+	 * Do not allow the mem region to be deleted beneath us. VFIO uses
+	 * the useraddr vma to look up the pci bar pfn.
+	 */
+	spin_lock(&pt->pt_mem_regions_lock);
+
+	/* Get the region again under the lock */
+	mreg = mshv_partition_region_by_gfn(pt, gfn);
+	if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
+		goto unlock_pt_out;
+
+	uaddr = mreg->start_uaddr +
+		((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
+
+	mmap_read_lock(current->mm);
Semaphore can't be taken under spinlock.
Yeah, something didn't feel right here and I meant to recheck; now I
regret rushing to submit the patch.
Rethinking this, I think pt_mem_regions_lock is not needed to protect
the uaddr, because unmap will properly serialize via the mm lock.
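Roughly, an untested sketch of what I mean, just dropping the spinlock and
relying on the mmap lock for the uaddr walk (names are the same as in the
patch above):

	/* No pt_mem_regions_lock here, so nothing sleeps under a spinlock */
	uaddr = mreg->start_uaddr +
		((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);

	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, uaddr);
	if (!vma || !(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
		mmap_read_unlock(current->mm);
		return -EINVAL;
	}
	/* follow_pfnmap_start()/fixup_user_fault() exactly as before */
	mmap_read_unlock(current->mm);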
+	vma = vma_lookup(current->mm, uaddr);
+	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
Why is this check needed again?
To make sure the region did not change. This check is under the lock.
How can this happen? One can't change a VMA's type without unmapping it
first. And unmapping it leaves kernel MMIO region state dangling
around without a corresponding user space mapping.
Right, and vm_flags would not have the expected mmio bits then.
This is similar to dangling pinned regions and should likely be
addressed the same way, by utilizing MMU notifiers to destroy memory
regions if the VMA is detached.
I don't think we need that. Either it succeeds if the region did not
change at all, or just fails.
I'm afraid we do: if the driver mapped a page with the previous
memory region and the region is then unmapped, the page will stay
mapped in the hypervisor but will be considered free by the kernel,
which in turn will lead to a GPF upon the next allocation.
There are no ram pages for mmio regions. Also, we don't do much with
mmio regions other than telling the hyp about them.
Thanks,
-Mukesh
With pinned regions the issue is similar but less impactful: pages can't
be released by user space unmapping and thus will simply be leaked, but
the system stays intact.
MMIO regions are similar to movable regions in this regard: they don't
reference the user pages, and thus this guest region replacement is a
straight way to a kernel panic.
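Very roughly, something like the below (untested sketch, needs
<linux/mmu_notifier.h>; pt_mmu_notifier and mshv_region_destroy_range()
are made-up names, only to illustrate the MMU notifier idea):

	static int mshv_invalidate_range_start(struct mmu_notifier *mn,
				const struct mmu_notifier_range *range)
	{
		struct mshv_partition *pt =
			container_of(mn, struct mshv_partition, pt_mmu_notifier);

		/*
		 * Tear down any region state backed by the user range that
		 * is about to be unmapped, so the hypervisor mapping never
		 * outlives the user space mapping.
		 */
		mshv_region_destroy_range(pt, range->start, range->end);
		return 0;
	}

	static const struct mmu_notifier_ops mshv_mmu_notifier_ops = {
		.invalidate_range_start	= mshv_invalidate_range_start,
	};

	/* at partition/mm association time */
	pt->pt_mmu_notifier.ops = &mshv_mmu_notifier_ops;
	ret = mmu_notifier_register(&pt->pt_mmu_notifier, current->mm);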
The region type is stored on the region itself.
And the type is checked on the caller side.
+	if (!is_mmio)
+		goto unlock_mmap_out;
+
+	pfnmap_args.vma = vma;
+	pfnmap_args.address = uaddr;
+
+	rc = follow_pfnmap_start(&pfnmap_args);
+	if (rc) {
+		rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
+				      NULL);
+		if (rc)
+			goto unlock_mmap_out;
+
+		rc = follow_pfnmap_start(&pfnmap_args);
+		if (rc)
+			goto unlock_mmap_out;
+	}
+
+	*mmio_pfnp = pfnmap_args.pfn;
+	follow_pfnmap_end(&pfnmap_args);
+
+unlock_mmap_out:
+	mmap_read_unlock(current->mm);
+unlock_pt_out:
+	spin_unlock(&pt->pt_mem_regions_lock);
+	return rc;
+}
+
+/*
+ * At present, the only unmapped gpa expected here is mmio space. Verify
+ * that it is mmio and resolve it if possible.
+ * Returns: true if it was a valid mmio intercept and was handled, else false
+ */
+static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
+{
+	struct hv_message *hvmsg = vp->vp_intercept_msg_page;
+	struct hv_x64_memory_intercept_message *msg;
+	union hv_x64_memory_access_info accinfo;
+	u64 gfn, mmio_spa, numpgs;
+	struct mshv_mem_region *mreg;
+	int rc;
+	struct mshv_partition *pt = vp->vp_partition;
+
+	msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
+	accinfo = msg->memory_access_info;
+
+	if (!accinfo.gva_gpa_valid)
+		return false;
+
+	/* Do a fast check and bail if non-mmio intercept */
+	gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
+	mreg = mshv_partition_region_by_gfn(pt, gfn);
This call needs to be protected by the spinlock.
This is sort of a fast path to bail. We recheck under the partition lock above.
Accessing the list of regions without the lock is unsafe.
I am not sure why. This check is done by a vcpu thread, so the regions
will not have just gone away.
This is a shared resource. Multiple VP threads get into this function
simultaneously, so there is a race already. But that one we can live
with without locking, as they don't mutate the list of regions.
The issue happens when the VMM adds or removes another region, as that
mutates the list and races with the VP threads doing this lookup.
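E.g. something like this (untested, only to illustrate doing the fast
check under the lock; is_mmio is a local bool added for the sketch):

	/*
	 * Fast check, but under the lock so it can't race with the VMM
	 * adding or removing regions.
	 */
	spin_lock(&pt->pt_mem_regions_lock);
	mreg = mshv_partition_region_by_gfn(pt, gfn);
	is_mmio = mreg && mreg->type == MSHV_REGION_TYPE_MMIO;
	spin_unlock(&pt->pt_mem_regions_lock);
	if (!is_mmio)
		return false;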
Thanks,
Stanislav
+	if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
+		return false;
+
+	rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
+	if (rc)
+		return false;
+
+	if (!hv_nofull_mmio) {	/* default case: map the whole region */
+		/* Rebase the spa to the region start before overwriting gfn */
+		mmio_spa -= gfn - mreg->start_gfn;
+		gfn = mreg->start_gfn;
+		numpgs = mreg->nr_pages;
+	} else {
+		numpgs = 1;
+	}
+
+	rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
+
+	return rc == 0;
+}
+
static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
{
@@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
return ret;
}
+
#else /* CONFIG_X86_64 */
+static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
#endif /* CONFIG_X86_64 */
static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
{
switch (vp->vp_intercept_msg_page->header.message_type) {
+	case HVMSG_UNMAPPED_GPA:
+		return mshv_handle_unmapped_gpa(vp);
case HVMSG_GPA_INTERCEPT:
return mshv_handle_gpa_intercept(vp);
}
--
2.51.2.vfs.0.1