Hi Leon,

> Subject: [PATCH 10/10] vfio/pci: Add dma-buf export support for MMIO
> regions
> 
> From: Leon Romanovsky <leo...@nvidia.com>
> 
> Add support for exporting PCI device MMIO regions through dma-buf,
> enabling safe sharing of non-struct page memory with controlled
> lifetime management. This allows RDMA and other subsystems to import
> dma-buf FDs and build them into memory regions for PCI P2P operations.
> 
> The implementation provides a revocable attachment mechanism using
> dma-buf move operations. MMIO regions are normally pinned as BARs
> don't change physical addresses, but access is revoked when the VFIO
> device is closed or a PCI reset is issued. This ensures kernel
> self-defense against potentially hostile userspace.
> 
> Signed-off-by: Jason Gunthorpe <j...@nvidia.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasire...@intel.com>
> Signed-off-by: Leon Romanovsky <leo...@nvidia.com>
> ---
>  drivers/vfio/pci/Kconfig           |  20 ++
>  drivers/vfio/pci/Makefile          |   2 +
>  drivers/vfio/pci/vfio_pci_config.c |  22 +-
>  drivers/vfio/pci/vfio_pci_core.c   |  25 ++-
>  drivers/vfio/pci/vfio_pci_dmabuf.c | 321 +++++++++++++++++++++++++++++
>  drivers/vfio/pci/vfio_pci_priv.h   |  23 +++
>  include/linux/dma-buf.h            |   1 +
>  include/linux/vfio_pci_core.h      |   3 +
>  include/uapi/linux/vfio.h          |  19 ++
>  9 files changed, 431 insertions(+), 5 deletions(-)
>  create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
> 
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 2b0172f546652..55ae888bf26ae 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -55,6 +55,26 @@ config VFIO_PCI_ZDEV_KVM
> 
>         To enable s390x KVM vfio-pci extensions, say Y.
> 
> +config VFIO_PCI_DMABUF
> +     bool "VFIO PCI extensions for DMA-BUF"
> +     depends on VFIO_PCI_CORE
> +     depends on PCI_P2PDMA && DMA_SHARED_BUFFER
> +     default y
> +     help
> +       Enable support for VFIO PCI extensions that allow exporting
> +       device MMIO regions as DMA-BUFs for peer devices to access via
> +       peer-to-peer (P2P) DMA.
> +
> +       This feature enables a VFIO-managed PCI device to export a portion
> +       of its MMIO BAR as a DMA-BUF file descriptor, which can be passed
> +       to other userspace drivers or kernel subsystems capable of
> +       initiating DMA to that region.
> +
> +       Say Y here if you want to enable VFIO DMABUF-based MMIO export
> +       support for peer-to-peer DMA use cases.
> +
> +       If unsure, say N.
> +
>  source "drivers/vfio/pci/mlx5/Kconfig"
> 
>  source "drivers/vfio/pci/hisilicon/Kconfig"
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index cf00c0a7e55c8..f9155e9c5f630 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -2,7 +2,9 @@
> 
>  vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o
> vfio_pci_config.o
>  vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
> +
>  obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
> +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
> 
>  vfio-pci-y := vfio_pci.o
>  vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
> diff --git a/drivers/vfio/pci/vfio_pci_config.c
> b/drivers/vfio/pci/vfio_pci_config.c
> index 8f02f236b5b4b..7e23387a43b4d 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct
> vfio_pci_core_device *vdev, int pos,
>               virt_mem = !!(le16_to_cpu(*virt_cmd) &
> PCI_COMMAND_MEMORY);
>               new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
> 
> -             if (!new_mem)
> +             if (!new_mem) {
>                       vfio_pci_zap_and_down_write_memory_lock(vdev);
> -             else
> +                     vfio_pci_dma_buf_move(vdev, true);
> +             } else {
>                       down_write(&vdev->memory_lock);
> +             }
> 
>               /*
>                * If the user is writing mem/io enable (new_mem/io) and we
> @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct
> vfio_pci_core_device *vdev, int pos,
>               *virt_cmd &= cpu_to_le16(~mask);
>               *virt_cmd |= cpu_to_le16(new_cmd & mask);
> 
> +             if (__vfio_pci_memory_enabled(vdev))
> +                     vfio_pci_dma_buf_move(vdev, false);
>               up_write(&vdev->memory_lock);
>       }
> 
> @@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct
> perm_bits *perm)
>  static void vfio_lock_and_set_power_state(struct vfio_pci_core_device
> *vdev,
>                                         pci_power_t state)
>  {
> -     if (state >= PCI_D3hot)
> +     if (state >= PCI_D3hot) {
>               vfio_pci_zap_and_down_write_memory_lock(vdev);
> -     else
> +             vfio_pci_dma_buf_move(vdev, true);
> +     } else {
>               down_write(&vdev->memory_lock);
> +     }
> 
>       vfio_pci_set_power_state(vdev, state);
> +     if (__vfio_pci_memory_enabled(vdev))
> +             vfio_pci_dma_buf_move(vdev, false);
>       up_write(&vdev->memory_lock);
>  }
> 
> @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct
> vfio_pci_core_device *vdev, int pos,
> 
>               if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) {
>                       vfio_pci_zap_and_down_write_memory_lock(vdev);
> +                     vfio_pci_dma_buf_move(vdev, true);
>                       pci_try_reset_function(vdev->pdev);
> +                     if (__vfio_pci_memory_enabled(vdev))
> +                             vfio_pci_dma_buf_move(vdev, true);
>                       up_write(&vdev->memory_lock);
>               }
>       }
> @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct
> vfio_pci_core_device *vdev, int pos,
> 
>               if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP))
> {
>                       vfio_pci_zap_and_down_write_memory_lock(vdev);
> +                     vfio_pci_dma_buf_move(vdev, true);
>                       pci_try_reset_function(vdev->pdev);
> +                     if (__vfio_pci_memory_enabled(vdev))
> +                             vfio_pci_dma_buf_move(vdev, true);
>                       up_write(&vdev->memory_lock);
>               }
>       }
> diff --git a/drivers/vfio/pci/vfio_pci_core.c 
> b/drivers/vfio/pci/vfio_pci_core.c
> index 5512d13bb8899..e5ab5d1cafd9c 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -29,7 +29,9 @@
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
>  #include <linux/iommufd.h>
> +#ifdef CONFIG_VFIO_PCI_DMABUF
>  #include <linux/pci-p2pdma.h>
> +#endif
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -288,6 +290,8 @@ static int vfio_pci_runtime_pm_entry(struct
> vfio_pci_core_device *vdev,
>        * semaphore.
>        */
>       vfio_pci_zap_and_down_write_memory_lock(vdev);
> +     vfio_pci_dma_buf_move(vdev, true);
> +
>       if (vdev->pm_runtime_engaged) {
>               up_write(&vdev->memory_lock);
>               return -EINVAL;
> @@ -371,6 +375,8 @@ static void vfio_pci_runtime_pm_exit(struct
> vfio_pci_core_device *vdev)
>        */
>       down_write(&vdev->memory_lock);
>       __vfio_pci_runtime_pm_exit(vdev);
> +     if (__vfio_pci_memory_enabled(vdev))
> +             vfio_pci_dma_buf_move(vdev, false);
>       up_write(&vdev->memory_lock);
>  }
> 
> @@ -691,6 +697,8 @@ void vfio_pci_core_close_device(struct vfio_device
> *core_vdev)
>  #endif
>       vfio_pci_core_disable(vdev);
> 
> +     vfio_pci_dma_buf_cleanup(vdev);
> +
>       mutex_lock(&vdev->igate);
>       if (vdev->err_trigger) {
>               eventfd_ctx_put(vdev->err_trigger);
> @@ -1223,7 +1231,10 @@ static int vfio_pci_ioctl_reset(struct
> vfio_pci_core_device *vdev,
>        */
>       vfio_pci_set_power_state(vdev, PCI_D0);
> 
> +     vfio_pci_dma_buf_move(vdev, true);
>       ret = pci_try_reset_function(vdev->pdev);
> +     if (__vfio_pci_memory_enabled(vdev))
> +             vfio_pci_dma_buf_move(vdev, false);
>       up_write(&vdev->memory_lock);
> 
>       return ret;
> @@ -1512,6 +1523,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device
> *device, u32 flags,
>               return vfio_pci_core_pm_exit(vdev, flags, arg, argsz);
>       case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>               return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
> +     case VFIO_DEVICE_FEATURE_DMA_BUF:
> +             return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
>       default:
>               return -ENOTTY;
>       }
> @@ -2088,9 +2101,13 @@ int vfio_pci_core_init_dev(struct vfio_device
> *core_vdev)
>       INIT_LIST_HEAD(&vdev->dummy_resources_list);
>       INIT_LIST_HEAD(&vdev->ioeventfds_list);
>       INIT_LIST_HEAD(&vdev->sriov_pfs_item);
> +#ifdef CONFIG_VFIO_PCI_DMABUF
>       vdev->provider = pci_p2pdma_enable(vdev->pdev);
>       if (IS_ERR(vdev->provider))
>               return PTR_ERR(vdev->provider);
> +
> +     INIT_LIST_HEAD(&vdev->dmabufs);
> +#endif
>       init_rwsem(&vdev->memory_lock);
>       xa_init(&vdev->ctx);
> 
> @@ -2473,11 +2490,17 @@ static int vfio_pci_dev_set_hot_reset(struct
> vfio_device_set *dev_set,
>        * cause the PCI config space reset without restoring the original
>        * state (saved locally in 'vdev->pm_save').
>        */
> -     list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +     list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
> +             vfio_pci_dma_buf_move(vdev, true);
>               vfio_pci_set_power_state(vdev, PCI_D0);
> +     }
> 
>       ret = pci_reset_bus(pdev);
> 
> +     list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
> +             if (__vfio_pci_memory_enabled(vdev))
> +                     vfio_pci_dma_buf_move(vdev, false);
> +
>       vdev = list_last_entry(&dev_set->device_list,
>                              struct vfio_pci_core_device, vdev.dev_set_list);
> 
> diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c
> b/drivers/vfio/pci/vfio_pci_dmabuf.c
> new file mode 100644
> index 0000000000000..5fefcdecd1329
> --- /dev/null
> +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
> @@ -0,0 +1,321 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
> + */
> +#include <linux/dma-buf.h>
> +#include <linux/pci-p2pdma.h>
> +#include <linux/dma-resv.h>
> +
> +#include "vfio_pci_priv.h"
> +
> +MODULE_IMPORT_NS("DMA_BUF");
> +
> +struct vfio_pci_dma_buf {
> +     struct dma_buf *dmabuf;
> +     struct vfio_pci_core_device *vdev;
> +     struct list_head dmabufs_elm;
> +     struct phys_vec phys_vec;
> +     u8 revoked : 1;
> +};
> +
> +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
> +                                struct dma_buf_attachment *attachment)
> +{
> +     struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +     if (!attachment->peer2peer)
> +             return -EOPNOTSUPP;
> +
> +     if (priv->revoked)
> +             return -ENODEV;
> +
> +     switch (pci_p2pdma_map_type(priv->vdev->provider, attachment-
> >dev)) {
> +     case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
> +             break;
> +     case PCI_P2PDMA_MAP_BUS_ADDR:
> +             /*
> +              * There is no need in IOVA at all for this flow.
> +              * We rely on attachment->priv == NULL as a marker
> +              * for this mode.
> +              */
> +             return 0;
> +     default:
> +             return -EINVAL;
> +     }
> +
> +     attachment->priv = kzalloc(sizeof(struct dma_iova_state),
> GFP_KERNEL);
> +     if (!attachment->priv)
> +             return -ENOMEM;
> +
> +     dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv-
> >phys_vec.len);
> +     return 0;
> +}
> +
> +static void vfio_pci_dma_buf_detach(struct dma_buf *dmabuf,
> +                                 struct dma_buf_attachment *attachment)
> +{
> +     kfree(attachment->priv);
> +}
> +
> +static void fill_sg_entry(struct scatterlist *sgl, unsigned int length,
> +                      dma_addr_t addr)
> +{
> +     sg_set_page(sgl, NULL, length, 0);
> +     sg_dma_address(sgl) = addr;
> +     sg_dma_len(sgl) = length;
> +}
> +
> +static struct sg_table *
> +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
> +                  enum dma_data_direction dir)
> +{
> +     struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +     struct p2pdma_provider *provider = priv->vdev->provider;
> +     struct dma_iova_state *state = attachment->priv;
> +     struct phys_vec *phys_vec = &priv->phys_vec;
> +     struct scatterlist *sgl;
> +     struct sg_table *sgt;
> +     dma_addr_t addr;
> +     int ret;
> +
> +     dma_resv_assert_held(priv->dmabuf->resv);
> +
> +     sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> +     if (!sgt)
> +             return ERR_PTR(-ENOMEM);
> +
> +     ret = sg_alloc_table(sgt, 1, GFP_KERNEL | __GFP_ZERO);
> +     if (ret)
> +             goto err_kfree_sgt;
> +
> +     sgl = sgt->sgl;
> +
> +     if (!state) {
> +             addr = pci_p2pdma_bus_addr_map(provider, phys_vec-
> >paddr);
> +     } else if (dma_use_iova(state)) {
> +             ret = dma_iova_link(attachment->dev, state, phys_vec->paddr,
> 0,
> +                                 phys_vec->len, dir,
> DMA_ATTR_SKIP_CPU_SYNC);
> +             if (ret)
> +                     goto err_free_table;
> +
> +             ret = dma_iova_sync(attachment->dev, state, 0, phys_vec-
> >len);
> +             if (ret)
> +                     goto err_unmap_dma;
> +
> +             addr = state->addr;
> +     } else {
> +             addr = dma_map_phys(attachment->dev, phys_vec->paddr,
> +                                 phys_vec->len, dir,
> DMA_ATTR_SKIP_CPU_SYNC);
> +             ret = dma_mapping_error(attachment->dev, addr);
> +             if (ret)
> +                     goto err_free_table;
> +     }
> +
> +     fill_sg_entry(sgl, phys_vec->len, addr);
> +     return sgt;
> +
> +err_unmap_dma:
> +     dma_iova_destroy(attachment->dev, state, phys_vec->len, dir,
> +                      DMA_ATTR_SKIP_CPU_SYNC);
> +err_free_table:
> +     sg_free_table(sgt);
> +err_kfree_sgt:
> +     kfree(sgt);
> +     return ERR_PTR(ret);
> +}
> +
> +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment
> *attachment,
> +                                struct sg_table *sgt,
> +                                enum dma_data_direction dir)
> +{
> +     struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
> +     struct dma_iova_state *state = attachment->priv;
> +     struct scatterlist *sgl;
> +     int i;
> +
> +     if (!state)
> +             ; /* Do nothing */
> +     else if (dma_use_iova(state))
> +             dma_iova_destroy(attachment->dev, state, priv->phys_vec.len,
> +                              dir, DMA_ATTR_SKIP_CPU_SYNC);
> +     else
> +             for_each_sgtable_dma_sg(sgt, sgl, i)
> +                     dma_unmap_phys(attachment->dev,
> sg_dma_address(sgl),
> +                                    sg_dma_len(sgl), dir,
> +                                    DMA_ATTR_SKIP_CPU_SYNC);
> +
> +     sg_free_table(sgt);
> +     kfree(sgt);
> +}
> +
> +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
> +{
> +     struct vfio_pci_dma_buf *priv = dmabuf->priv;
> +
> +     /*
> +      * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
> +      * The refcount prevents both.
> +      */
> +     if (priv->vdev) {
> +             down_write(&priv->vdev->memory_lock);
> +             list_del_init(&priv->dmabufs_elm);
> +             up_write(&priv->vdev->memory_lock);
> +             vfio_device_put_registration(&priv->vdev->vdev);
> +     }
> +     kfree(priv);
> +}
> +
> +static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
> +     .attach = vfio_pci_dma_buf_attach,
> +     .detach = vfio_pci_dma_buf_detach,
> +     .map_dma_buf = vfio_pci_dma_buf_map,
> +     .release = vfio_pci_dma_buf_release,
> +     .unmap_dma_buf = vfio_pci_dma_buf_unmap,
> +};
> +
> +static void dma_ranges_to_p2p_phys(struct vfio_pci_dma_buf *priv,
> +                                struct vfio_device_feature_dma_buf
> *dma_buf)
> +{
> +     struct pci_dev *pdev = priv->vdev->pdev;
> +
> +     priv->phys_vec.len = dma_buf->length;
> +     priv->phys_vec.paddr = pci_resource_start(pdev, dma_buf-
> >region_index);
> +     priv->phys_vec.paddr += dma_buf->offset;
> +}
> +
> +static int validate_dmabuf_input(struct vfio_pci_core_device *vdev,
> +                              struct vfio_device_feature_dma_buf *dma_buf)
> +{
> +     struct pci_dev *pdev = vdev->pdev;
> +     u32 bar = dma_buf->region_index;
> +     u64 offset = dma_buf->offset;
> +     u64 len = dma_buf->length;
> +     resource_size_t bar_size;
> +     u64 sum;
> +
> +     /*
> +      * For PCI the region_index is the BAR number like  everything else.
> +      */
> +     if (bar >= VFIO_PCI_ROM_REGION_INDEX)
> +             return -ENODEV;
> +
> +     if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
> +             return -EINVAL;
> +
> +     if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> +             return -EINVAL;
> +
> +     bar_size = pci_resource_len(pdev, bar);
> +     if (check_add_overflow(offset, len, &sum) || sum > bar_size)
> +             return -EINVAL;
> +
> +     return 0;
> +}
> +
> +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32
> flags,
> +                               struct vfio_device_feature_dma_buf __user
> *arg,
> +                               size_t argsz)
> +{
> +     struct vfio_device_feature_dma_buf get_dma_buf = {};
> +     DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
> +     struct vfio_pci_dma_buf *priv;
> +     int ret;
> +
> +     ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
> +                              sizeof(get_dma_buf));
> +     if (ret != 1)
> +             return ret;
> +
> +     if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
> +             return -EFAULT;
> +
> +     ret = validate_dmabuf_input(vdev, &get_dma_buf);
> +     if (ret)
> +             return ret;
> +
> +     priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +     if (!priv)
> +             return -ENOMEM;
> +
> +     priv->vdev = vdev;
> +     dma_ranges_to_p2p_phys(priv, &get_dma_buf);
> +
> +     if (!vfio_device_try_get_registration(&vdev->vdev)) {
> +             ret = -ENODEV;
> +             goto err_free_priv;
> +     }
> +
> +     exp_info.ops = &vfio_pci_dmabuf_ops;
> +     exp_info.size = priv->phys_vec.len;
> +     exp_info.flags = get_dma_buf.open_flags;
> +     exp_info.priv = priv;
> +
> +     priv->dmabuf = dma_buf_export(&exp_info);
> +     if (IS_ERR(priv->dmabuf)) {
> +             ret = PTR_ERR(priv->dmabuf);
> +             goto err_dev_put;
> +     }
> +
> +     /* dma_buf_put() now frees priv */
> +     INIT_LIST_HEAD(&priv->dmabufs_elm);
> +     down_write(&vdev->memory_lock);
> +     dma_resv_lock(priv->dmabuf->resv, NULL);
> +     priv->revoked = !__vfio_pci_memory_enabled(vdev);
> +     list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
> +     dma_resv_unlock(priv->dmabuf->resv);
> +     up_write(&vdev->memory_lock);
> +
> +     /*
> +      * dma_buf_fd() consumes the reference, when the file closes the
> dmabuf
> +      * will be released.
> +      */
> +     return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
> +
> +err_dev_put:
> +     vfio_device_put_registration(&vdev->vdev);
> +err_free_priv:
> +     kfree(priv);
> +     return ret;
> +}
> +
> +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool
> revoked)
> +{
> +     struct vfio_pci_dma_buf *priv;
> +     struct vfio_pci_dma_buf *tmp;
> +
> +     lockdep_assert_held_write(&vdev->memory_lock);
> +
> +     list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> +             if (!get_file_active(&priv->dmabuf->file))
> +                     continue;
> +
> +             if (priv->revoked != revoked) {
> +                     dma_resv_lock(priv->dmabuf->resv, NULL);
> +                     priv->revoked = revoked;
> +                     dma_buf_move_notify(priv->dmabuf);
> +                     dma_resv_unlock(priv->dmabuf->resv);
> +             }
> +             dma_buf_put(priv->dmabuf);
> +     }
> +}
> +
> +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
> +{
> +     struct vfio_pci_dma_buf *priv;
> +     struct vfio_pci_dma_buf *tmp;
> +
> +     down_write(&vdev->memory_lock);
> +     list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
> +             if (!get_file_active(&priv->dmabuf->file))
> +                     continue;
> +
> +             dma_resv_lock(priv->dmabuf->resv, NULL);
> +             list_del_init(&priv->dmabufs_elm);
> +             priv->vdev = NULL;
> +             priv->revoked = true;
> +             dma_buf_move_notify(priv->dmabuf);
> +             dma_resv_unlock(priv->dmabuf->resv);
> +             vfio_device_put_registration(&vdev->vdev);
> +             dma_buf_put(priv->dmabuf);
> +     }
> +     up_write(&vdev->memory_lock);
> +}
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h 
> b/drivers/vfio/pci/vfio_pci_priv.h
> index a9972eacb2936..28a405f8b97c9 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -107,4 +107,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev
> *pdev)
>       return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA;
>  }
> 
> +#ifdef CONFIG_VFIO_PCI_DMABUF
> +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32
> flags,
> +                               struct vfio_device_feature_dma_buf __user
> *arg,
> +                               size_t argsz);
> +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
> +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool
> revoked);
> +#else
> +static inline int
> +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32
> flags,
> +                           struct vfio_device_feature_dma_buf __user *arg,
> +                           size_t argsz)
> +{
> +     return -ENOTTY;
> +}
> +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device
> *vdev)
> +{
> +}
> +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device
> *vdev,
> +                                      bool revoked)
> +{
> +}
> +#endif
> +
>  #endif
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index d58e329ac0e71..f14b413aae48d 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -483,6 +483,7 @@ struct dma_buf_attach_ops {
>   * @dev: device attached to the buffer.
>   * @node: list of dma_buf_attachment, protected by dma_resv lock of the
> dmabuf.
>   * @peer2peer: true if the importer can handle peer resources without pages.
> + * #state: DMA structure to provide support for physical addresses DMA
> interface
>   * @priv: exporter specific attachment data.
>   * @importer_ops: importer operations for this attachment, if provided
>   * dma_buf_map/unmap_attachment() must be called with the dma_resv
> lock held.
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index b017fae251811..548cbb51bf146 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -94,7 +94,10 @@ struct vfio_pci_core_device {
>       struct vfio_pci_core_device     *sriov_pf_core_dev;
>       struct notifier_block   nb;
>       struct rw_semaphore     memory_lock;
> +#ifdef CONFIG_VFIO_PCI_DMABUF
>       struct p2pdma_provider  *provider;
> +     struct list_head        dmabufs;
> +#endif
>  };
> 
>  /* Will be exported for vfio pci drivers usage */
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 5764f315137f9..ad8e303697f97 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1468,6 +1468,25 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
> 
> +/**
> + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
> + * regions selected.
> + *
> + * open_flags are the typical flags passed to open(2), eg O_RDWR,
> O_CLOEXEC,
> + * etc. offset/length specify a slice of the region to create the dmabuf 
> from.
> + * nr_ranges is the total number of (P2P DMA) ranges that comprise the
> dmabuf.
Any particular reason why you dropped the option (nr_ranges) of creating a
single dmabuf from multiple ranges of an MMIO region?

Restricting the dmabuf to a single range (or having to create multiple dmabufs
to represent the multiple regions/ranges associated with a single scattered
buffer) would be very limiting and may not work in all cases. For instance, in
my use-case, I am trying to share a large (4K mode) framebuffer (FB) located in
a GPU's VRAM between two (P2P-compatible) GPU devices. This would probably not
work, given that allocating a large contiguous FB (nr_ranges = 1) in VRAM may
not be feasible when there is memory pressure.

Furthermore, since you are adding a new UAPI with this patch/feature, as you
know, we cannot go back and tweak it (to add support for nr_ranges > 1) should
there be a need in the future, whereas a UAPI that accepts multiple ranges can
always be used with nr_ranges = 1. Therefore, I think it makes sense to be
flexible in terms of the number of ranges to include while creating a dmabuf,
instead of restricting ourselves to one range.

Thanks,
Vivek

> + *
> + * Return: The fd number on success, -1 and errno is set on failure.
> + */
> +#define VFIO_DEVICE_FEATURE_DMA_BUF 11
> +
> +struct vfio_device_feature_dma_buf {
> +     __u32   region_index;
> +     __u32   open_flags;
> +     __u64   offset;
> +     __u64   length;
> +};
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
> 
>  /**
> --
> 2.50.1

Reply via email to