From: Manish Honap <[email protected]>

Detect a vendor-specific CXL device at vfio-pci bind time and probe
its HDM decoder register block.

vfio_cxl_create_device_state() allocates per-device state via devm and
reads MEM_CAPABLE and CACHE_CAPABLE from the CXL DVSEC.

vfio_cxl_setup_regs() locates the component register block, temporarily
maps it, calls cxl_probe_component_regs() to find the HDM block, then
releases the mapping.

vfio_pci_cxl_detect_and_init() chains these two steps. If either fails,
vdev->cxl stays NULL and the device falls back to plain vfio-pci.

Signed-off-by: Manish Honap <[email protected]>
---
 drivers/vfio/pci/cxl/vfio_cxl_core.c | 217 +++++++++++++++++++++++++++
 drivers/vfio/pci/cxl/vfio_cxl_priv.h |  12 ++
 2 files changed, 229 insertions(+)

diff --git a/drivers/vfio/pci/cxl/vfio_cxl_core.c b/drivers/vfio/pci/cxl/vfio_cxl_core.c
index d12afec82ecd..b1c7603590b5 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_core.c
+++ b/drivers/vfio/pci/cxl/vfio_cxl_core.c
@@ -21,6 +21,158 @@
 #include "../vfio_pci_priv.h"
 #include "vfio_cxl_priv.h"
 
+/*
+ * vfio_cxl_create_device_state - Allocate and validate CXL device state
+ *
+ * Returns a pointer to the allocated vfio_pci_cxl_state on success, or
+ * ERR_PTR on failure.  The allocation uses devm; the caller must call
+ * devm_kfree(&pdev->dev, cxl) on any subsequent setup failure to release
+ * the resource before device unbind.  Using devm_kfree() to undo a devm
+ * allocation early is explicitly supported by the devres API.
+ *
+ * The caller assigns vdev->cxl only after all setup steps succeed, preventing
+ * partially-initialised state from being visible through vdev->cxl on any
+ * failure path.
+ */
+static struct vfio_pci_cxl_state *
+vfio_cxl_create_device_state(struct pci_dev *pdev, u16 dvsec)
+{
+       struct vfio_pci_cxl_state *cxl;
+       u16 cap_word;
+       u32 hdr1;
+
+       /* Freed automatically when pdev->dev is released. */
+       cxl = devm_cxl_dev_state_create(&pdev->dev,
+                                       CXL_DEVTYPE_DEVMEM,
+                                       pdev->dev.id, dvsec,
+                                       struct vfio_pci_cxl_state,
+                                       cxlds, false);
+       if (!cxl)
+               return ERR_PTR(-ENOMEM);
+
+       pci_read_config_dword(pdev, dvsec + PCI_DVSEC_HEADER1, &hdr1);
+       cxl->dvsec_len = PCI_DVSEC_HEADER1_LEN(hdr1);
+
+       pci_read_config_word(pdev, dvsec + CXL_DVSEC_CAPABILITY_OFFSET,
+                            &cap_word);
+
+       /*
+        * Only handle vendor devices (class != 0x0502) with Mem_Capable set.
+        * CACHE_CAPABLE is forwarded to the VMM so it knows whether a WBI
+        * sequence is needed before FLR.
+        */
+       if (!FIELD_GET(CXL_DVSEC_MEM_CAPABLE, cap_word) ||
+           (pdev->class >> 8) == PCI_CLASS_MEMORY_CXL) {
+               devm_kfree(&pdev->dev, cxl);
+               return ERR_PTR(-ENODEV);
+       }
+
+       cxl->cache_capable = FIELD_GET(CXL_DVSEC_CACHE_CAPABLE, cap_word);
+
+       return cxl;
+}
+
+static int vfio_cxl_setup_regs(struct vfio_pci_core_device *vdev,
+                              struct vfio_pci_cxl_state *cxl)
+{
+       struct cxl_register_map *map = &cxl->cxlds.reg_map;
+       resource_size_t offset, bar_offset, size;
+       struct pci_dev *pdev = vdev->pdev;
+       void __iomem *base;
+       int ret;
+       u8 count;
+       u8 bar;
+
+       if (WARN_ON_ONCE(!pci_is_enabled(pdev)))
+               return -EINVAL;
+
+       /* Find component register block via Register Locator DVSEC */
+       ret = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, map);
+       if (ret)
+               return ret;
+
+       /*
+        * Request the region and map.  This is a transient mapping
+        * used only to probe register capabilities; released immediately
+        * after cxl_probe_component_regs() returns.
+        */
+       if (!request_mem_region(map->resource, map->max_size, "vfio-cxl-probe"))
+               return -EBUSY;
+
+       base = ioremap(map->resource, map->max_size);
+       if (!base) {
+               ret = -ENOMEM;
+               goto failed_release;
+       }
+
+       /* Probe component register capabilities */
+       cxl_probe_component_regs(&pdev->dev, base, &map->component_map);
+
+       /* Check if HDM decoder was found */
+       if (!map->component_map.hdm_decoder.valid) {
+               ret = -ENODEV;
+               goto failed_unmap;
+       }
+
+       pci_dbg(pdev, "vfio_cxl: HDM decoder at offset=0x%lx, size=0x%lx\n",
+               map->component_map.hdm_decoder.offset,
+               map->component_map.hdm_decoder.size);
+
+       /* Get HDM register info */
+       ret = cxl_get_hdm_info(&cxl->cxlds, &count, &offset, &size);
+       if (ret)
+               goto failed_unmap;
+
+       if (!count || !size) {
+               ret = -ENODEV;
+               goto failed_unmap;
+       }
+
+       cxl->hdm_count = count;
+       /*
+        * cxl_get_hdm_info() returns rmap->offset = CXL_CM_OFFSET + <hdm_within_cm>
+        * (see cxl_probe_component_regs() which does base += CXL_CM_OFFSET before
+        * reading caps and stores CXL_CM_OFFSET + cap_ptr as the offset).
+        * Subtract CXL_CM_OFFSET so hdm_reg_offset is relative to the CXL.mem
+        * register area start, which is where comp_reg_virt[0] is anchored.
+        * The physical BAR address for hdm_iobase is recovered by adding
+        * CXL_CM_OFFSET back in vfio_cxl_setup_virt_regs().
+        */
+       cxl->hdm_reg_offset = offset - CXL_CM_OFFSET;
+       cxl->hdm_reg_size = size;
+
+       ret = cxl_regblock_get_bar_info(map, &bar, &bar_offset);
+       if (ret)
+               goto failed_unmap;
+
+       cxl->comp_reg_bar = bar;
+       cxl->comp_reg_offset = bar_offset;
+       cxl->comp_reg_size = CXL_COMPONENT_REG_BLOCK_SIZE;
+
+       iounmap(base);
+       release_mem_region(map->resource, map->max_size);
+
+       return 0;
+
+failed_unmap:
+       iounmap(base);
+failed_release:
+       release_mem_region(map->resource, map->max_size);
+
+       return ret;
+}
+
+/*
+ * Free CXL state early on probe failure.  devm_kfree() on a live devres
+ * allocation removes it from the list immediately, so the normal devres
+ * teardown at unbind time won't double-free it.
+ */
+static void vfio_cxl_dev_state_free(struct pci_dev *pdev,
+                                   struct vfio_pci_cxl_state *cxl)
+{
+       devm_kfree(&pdev->dev, cxl);
+}
+
 /**
  * vfio_pci_cxl_detect_and_init - Detect and initialize a vendor-specific
  *                                CXL.mem device
@@ -32,10 +184,75 @@
  */
 void vfio_pci_cxl_detect_and_init(struct vfio_pci_core_device *vdev)
 {
+       struct pci_dev *pdev = vdev->pdev;
+       struct vfio_pci_cxl_state *cxl;
+       u16 dvsec;
+       int ret;
+
+       if (!pcie_is_cxl(pdev))
+               return;
+
+       dvsec = pci_find_dvsec_capability(pdev,
+                                         PCI_VENDOR_ID_CXL,
+                                         PCI_DVSEC_CXL_DEVICE);
+       if (!dvsec)
+               return;
+
+       /*
+        * CXL DVSEC found: any failure from here is a hard probe error on
+        * a confirmed CXL-capable device, not a silent non-CXL fallback.
+        * Warn the operator so misconfiguration is visible.
+        */
+       cxl = vfio_cxl_create_device_state(pdev, dvsec);
+       if (IS_ERR(cxl)) {
+               if (PTR_ERR(cxl) != -ENODEV)
+                       pci_warn(pdev,
+                                "vfio-cxl: CXL device state allocation failed: %ld\n",
+                                PTR_ERR(cxl));
+               return;
+       }
+
+       /*
+        * Required for ioremap of the component register block and
+        * calls to cxl_probe_component_regs().
+        */
+       ret = pci_enable_device_mem(pdev);
+       if (ret) {
+               pci_warn(pdev,
+                        "vfio-cxl: pci_enable_device_mem failed: %d\n", ret);
+               goto free_cxl;
+       }
+
+       ret = vfio_cxl_setup_regs(vdev, cxl);
+       if (ret) {
+               pci_warn(pdev,
+                        "vfio-cxl: HDM register probing failed: %d\n", ret);
+               pci_disable_device(pdev);
+               goto free_cxl;
+       }
+
+       pci_disable_device(pdev);
+
+       /*
+        * Register probing succeeded.  Assign vdev->cxl now so that
+        * all subsequent helpers can access state via vdev->cxl.
+        * No failure paths remain after this point; later teardown is
+        * handled by vfio_pci_cxl_cleanup() at device release time.
+        */
+       vdev->cxl = cxl;
+
+       return;
+
+free_cxl:
+       vfio_cxl_dev_state_free(pdev, cxl);
 }
 
 void vfio_pci_cxl_cleanup(struct vfio_pci_core_device *vdev)
 {
+       struct vfio_pci_cxl_state *cxl = vdev->cxl;
+
+       if (!cxl)
+               return;
 }
 
 MODULE_IMPORT_NS("CXL");
diff --git a/drivers/vfio/pci/cxl/vfio_cxl_priv.h b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
index 4cecc25db410..54b1f6d885aa 100644
--- a/drivers/vfio/pci/cxl/vfio_cxl_priv.h
+++ b/drivers/vfio/pci/cxl/vfio_cxl_priv.h
@@ -21,8 +21,20 @@ struct vfio_pci_cxl_state {
        size_t                       hdm_reg_size;
        resource_size_t              comp_reg_offset;
        size_t                       comp_reg_size;
+       u16                          dvsec_len;
        u8                           hdm_count;
        u8                           comp_reg_bar;
+       bool                         cache_capable;
 };
 
+/*
+ * CXL DVSEC for CXL Devices - register offsets within the DVSEC
+ * (CXL 4.0 8.1.3).
+ * Offsets are relative to the DVSEC capability base (cxl->dvsec).
+ */
+#define CXL_DVSEC_CAPABILITY_OFFSET 0xa
+#define CXL_DVSEC_MEM_CAPABLE      BIT(2)
+/* CXL DVSEC Capability register bit 0: device supports CXL.cache (HDM-DB) */
+#define CXL_DVSEC_CACHE_CAPABLE            BIT(0)
+
 #endif /* __LINUX_VFIO_CXL_PRIV_H */
-- 
2.25.1


Reply via email to