On Fri, Jul 03, 2026 at 12:05:17AM +0800, Yu Zhang wrote:
> +static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap)
> +{
> + switch (cap) {
> + case IOMMU_CAP_CACHE_COHERENCY:
> + return true;
> + case IOMMU_CAP_DEFERRED_FLUSH:
> + return true;
This CAP isn't necessary anymore
> +static struct iommu_device *hv_iommu_probe_device(struct device *dev)
> +{
> + struct pci_dev *pdev;
> + struct hv_iommu_endpoint *vdev;
> + struct hv_output_get_logical_device_property device_iommu_property =
> {0};
> +
> + if (!dev_is_pci(dev))
> + return ERR_PTR(-ENODEV);
> +
> + pdev = to_pci_dev(dev);
> +
> + if (hv_iommu_get_logical_device_property(dev,
> +
> HV_LOGICAL_DEVICE_PROPERTY_PVIOMMU,
> + &device_iommu_property) ||
> + !(device_iommu_property.device_iommu & HV_DEVICE_IOMMU_ENABLED))
> + return ERR_PTR(-ENODEV);
> +
> + vdev = kzalloc_obj(*vdev, GFP_KERNEL);
> + if (!vdev)
> + return ERR_PTR(-ENOMEM);
> +
> + vdev->dev = dev;
> + vdev->hv_iommu = hv_iommu_device;
> + dev_iommu_priv_set(dev, vdev);
> +
> + if (hv_iommu_ats_supported(hv_iommu_device->cap) &&
> + pci_ats_supported(pdev))
> + pci_enable_ats(pdev, __ffs(hv_iommu_device->pgsize_bitmap));
This can probably just be PAGE_SHIFT
Also ATS shouldn't be enabled until a translation is installed,
otherwise the driver cannot participate in the ATS error handling
Nicolin is working on.
> +static void hv_iommu_release_device(struct device *dev)
> +{
> + struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> + struct pci_dev *pdev = to_pci_dev(dev);
> +
> + if (pdev->ats_enabled)
> + pci_disable_ats(pdev);
> +
> + dev_iommu_priv_set(dev, NULL);
Not necessary, the caller does it
> +static struct iommu_group *hv_iommu_device_group(struct device *dev)
> +{
> + if (dev_is_pci(dev))
> + return pci_device_group(dev);
> +
> + WARN_ON_ONCE(1);
> + return generic_device_group(dev);
I think you can just return failure here instead of WARN_ON ?
> +static int __init hv_initialize_static_domains(void)
> +{
> + int ret;
> + struct hv_iommu_domain *hv_domain;
> +
> + /* Default stage-1 identity domain */
> + hv_domain = &hv_identity_domain;
> +
> + ret = hv_create_device_domain(hv_domain, HV_DEVICE_DOMAIN_TYPE_S1);
> + if (ret)
> + return ret;
> +
> + ret = hv_configure_device_domain(hv_domain, IOMMU_DOMAIN_IDENTITY);
> + if (ret)
> + goto delete_identity_domain;
IMHO I would change this around to have a single function that accepts
a struct hv_input_configure_device_domain as input and does both of
the hypercalls inside. Then here it is easy to directly construct the
hv_input_configure_device_domain for blocking and identity.
I'd be happy if this never touched domain_type, drivers shouldn't be
touching that.
> +static void __init hv_init_iommu_device(struct hv_iommu_dev *hv_iommu,
> + struct hv_output_get_iommu_capabilities *hv_iommu_cap)
> +{
> + ida_init(&hv_iommu->domain_ids);
> +
> + hv_iommu->cap = hv_iommu_cap->iommu_cap;
> + hv_iommu->max_iova_width = hv_iommu_cap->max_iova_width;
> + if (!hv_iommu_5lvl_supported(hv_iommu->cap) &&
> + hv_iommu->max_iova_width > 48) {
> + pr_info("5-level paging not supported, limiting iova width to
> 48.\n");
> + hv_iommu->max_iova_width = 48;
> + }
> +
> + hv_iommu->geometry = (struct iommu_domain_geometry) {
> + .aperture_start = 0,
> + .aperture_end = (((u64)1) << hv_iommu->max_iova_width) - 1,
> + .force_aperture = true,
> + };
I don't see anything reading this, I don't expect this to be used?
The max_iova_width has to be passed into the iommupt creation, which
it does:
+ cfg.common.hw_max_vasz_lg2 = hv_iommu_device->max_iova_width;
+ cfg.common.hw_max_oasz_lg2 = 52;
+ cfg.top_level = (hv_iommu_device->max_iova_width > 48) ? 4 : 3;
+ ret = pt_iommu_x86_64_init(&hv_domain->pt_iommu_x86_64, &cfg,
GFP_KERNEL);
+ if (ret)
So just delete hv_iommu->geometry.
Also, VT-d has weirdness where the HW can require a 4 level table but
only a 3 level worth of IOVA width is being used. This was a
real-world bug we hit when converting to iommupt. This interaction
with the HV doesn't seem able to represent that.
> + /*
> + * The page table code only maps x86 page sizes (4K/2M/1G); require the
> + * hypervisor to advertise a non-empty subset of exactly those.
> + */
> + if (!hv_iommu_cap.pgsize_bitmap ||
> + (hv_iommu_cap.pgsize_bitmap & ~(u64)(SZ_4K | SZ_2M | SZ_1G))) {
> + pr_err("unsupported page sizes: pgsize_bitmap=0x%llx\n",
> + hv_iommu_cap.pgsize_bitmap);
> + return -ENODEV;
> + }
This can just be
if (!(hv_iommu_cap.pgsize_bitmap & PAGE_SIZE)) {
	pr_err("unsupported page sizes: pgsize_bitmap=0x%llx\n",
	       hv_iommu_cap.pgsize_bitmap);
	return -ENODEV;
}
Which is all you really need. If the HV doesn't support 1G it is
perfectly fine, the iommupt page bitmap is already masked by this.
> + ret = iommu_device_register(&hv_iommu->iommu, &hv_iommu_ops, NULL);
> + if (ret) {
> + pr_err("iommu_device_register failed: %d\n", ret);
> + goto err_sysfs_remove;
> + }
> +
> + pr_info("successfully initialized\n");
Don't log something so vague?
Jason