On Fri, Jul 03, 2026 at 12:05:17AM +0800, Yu Zhang wrote:

> +static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap)
> +{
> +     switch (cap) {
> +     case IOMMU_CAP_CACHE_COHERENCY:
> +             return true;
> +     case IOMMU_CAP_DEFERRED_FLUSH:
> +             return true;

This CAP isn't necessary anymore

> +static struct iommu_device *hv_iommu_probe_device(struct device *dev)
> +{
> +     struct pci_dev *pdev;
> +     struct hv_iommu_endpoint *vdev;
> +     struct hv_output_get_logical_device_property device_iommu_property = 
> {0};
> +
> +     if (!dev_is_pci(dev))
> +             return ERR_PTR(-ENODEV);
> +
> +     pdev = to_pci_dev(dev);
> +
> +     if (hv_iommu_get_logical_device_property(dev,
> +                                              
> HV_LOGICAL_DEVICE_PROPERTY_PVIOMMU,
> +                                              &device_iommu_property) ||
> +         !(device_iommu_property.device_iommu & HV_DEVICE_IOMMU_ENABLED))
> +             return ERR_PTR(-ENODEV);
> +
> +     vdev = kzalloc_obj(*vdev, GFP_KERNEL);
> +     if (!vdev)
> +             return ERR_PTR(-ENOMEM);
> +
> +     vdev->dev = dev;
> +     vdev->hv_iommu = hv_iommu_device;
> +     dev_iommu_priv_set(dev, vdev);
> +
> +     if (hv_iommu_ats_supported(hv_iommu_device->cap) &&
> +         pci_ats_supported(pdev))
> +             pci_enable_ats(pdev, __ffs(hv_iommu_device->pgsize_bitmap));

This can probably just be PAGE_SHIFT

Also ATS shouldn't be enabled until a translation is installed,
otherwise the driver cannot participate in the ATS error handling
Nicolin is working on.

> +static void hv_iommu_release_device(struct device *dev)
> +{
> +     struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> +     struct pci_dev *pdev = to_pci_dev(dev);
> +
> +     if (pdev->ats_enabled)
> +             pci_disable_ats(pdev);
> +
> +     dev_iommu_priv_set(dev, NULL);

Not necessary, the caller does it

> +static struct iommu_group *hv_iommu_device_group(struct device *dev)
> +{
> +     if (dev_is_pci(dev))
> +             return pci_device_group(dev);
> +
> +     WARN_ON_ONCE(1);
> +     return generic_device_group(dev);

I think you can just return failure here instead of WARN_ON ?

> +static int __init hv_initialize_static_domains(void)
> +{
> +     int ret;
> +     struct hv_iommu_domain *hv_domain;
> +
> +     /* Default stage-1 identity domain */
> +     hv_domain = &hv_identity_domain;
> +
> +     ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1);
> +     if (ret)
> +             return ret;
> +
> +     ret = hv_configure_device_domain(hv_domain, IOMMU_DOMAIN_IDENTITY);
> +     if (ret)
> +             goto delete_identity_domain;

IMHO I would change this around to have a single function that accepts
a struct hv_input_configure_device_domain as input and does both of
the hypercalls inside. Then here it is easy to directly construct the
hv_input_configure_device_domain for blocking and identity.

I'd be happy if this never touched domain_type, drivers shouldn't be
touching that.

> +static void __init hv_init_iommu_device(struct hv_iommu_dev *hv_iommu,
> +                     struct hv_output_get_iommu_capabilities *hv_iommu_cap)
> +{
> +     ida_init(&hv_iommu->domain_ids);
> +
> +     hv_iommu->cap = hv_iommu_cap->iommu_cap;
> +     hv_iommu->max_iova_width = hv_iommu_cap->max_iova_width;
> +     if (!hv_iommu_5lvl_supported(hv_iommu->cap) &&
> +         hv_iommu->max_iova_width > 48) {
> +             pr_info("5-level paging not supported, limiting iova width to 
> 48.\n");
> +             hv_iommu->max_iova_width = 48;
> +     }
> +
> +     hv_iommu->geometry = (struct iommu_domain_geometry) {
> +             .aperture_start = 0,
> +             .aperture_end = (((u64)1) << hv_iommu->max_iova_width) - 1,
> +             .force_aperture = true,
> +     };

I don't see anything reading this, I don't expect this to be used?

The max_iova_width has to be passed into the iommupt creation, which
it does:

 +      cfg.common.hw_max_vasz_lg2 = hv_iommu_device->max_iova_width;
 +      cfg.common.hw_max_oasz_lg2 = 52;
 +      cfg.top_level = (hv_iommu_device->max_iova_width > 48) ? 4 : 3;
 +      ret = pt_iommu_x86_64_init(&hv_domain->pt_iommu_x86_64, &cfg, 
GFP_KERNEL);
 +      if (ret)

So just delete hv_iommu->geometry.

Also, VT-d has a quirk where the HW can require a 4-level table while
only 3 levels' worth of IOVA width is being used. This was a
real-world bug we hit when converting to iommupt. This interaction
with the HV doesn't seem able to represent that.

> +     /*
> +      * The page table code only maps x86 page sizes (4K/2M/1G); require the
> +      * hypervisor to advertise a non-empty subset of exactly those.
> +      */
> +     if (!hv_iommu_cap.pgsize_bitmap ||
> +         (hv_iommu_cap.pgsize_bitmap & ~(u64)(SZ_4K | SZ_2M | SZ_1G))) {
> +             pr_err("unsupported page sizes: pgsize_bitmap=0x%llx\n",
> +                    hv_iommu_cap.pgsize_bitmap);
> +             return -ENODEV;
> +     }

This can just be

if (!(hv_iommu_cap.pgsize_bitmap & PAGE_SIZE)) {
                pr_err("unsupported page sizes: pgsize_bitmap=0x%llx\n",
                       hv_iommu_cap.pgsize_bitmap);
                return -ENODEV;
}

Which is all you really need. If the HV doesn't support 1G it is
perfectly fine, the iommupt page bitmap is already masked by this. 

> +     ret = iommu_device_register(&hv_iommu->iommu, &hv_iommu_ops, NULL);
> +     if (ret) {
> +             pr_err("iommu_device_register failed: %d\n", ret);
> +             goto err_sysfs_remove;
> +     }
> +
> +     pr_info("successfully initialized\n");

Don't log something so vague?

Jason

Reply via email to