On 5/11/26 16:22, Yifan Zhang wrote:
> During Mode 1 reset, the ASIC undergoes a reset cycle and becomes temporarily
> inaccessible via PCIe. Any attempt to access framebuffer or MMIO registers 
> during
> this window can result in uncompleted PCIe transactions, leading to NMI 
> panics or
> system hangs.
> 
> To prevent this, Unmap all of the applications mappings of the framebuffer
> and doorbell BARs before mode1 reset. Also prevent new mappings from coming in
> during the reset process.
> 
> v2: remove inode in kfd_dev (Christian)
> v3: correct unmap offset (Felix), remove prevent new mappings part to avoid 
> deadlock (Christian)
> 
> Signed-off-by: Yifan Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 22 ++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 ++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   | 22 ++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  1 +
>  5 files changed, 52 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 7b10bbe28caf..d1dac3412a66 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -36,6 +36,7 @@
>  #include "amdgpu_ras.h"
>  #include "amdgpu_umc.h"
>  #include "amdgpu_reset.h"
> +#include "kfd_priv.h"
>  
>  /* Total memory size in system memory and all GPU VRAM. Used to
>   * estimate worst case amount of memory to reserve for page tables
> @@ -320,6 +321,27 @@ void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
>               (void)amdgpu_reset_domain_schedule(adev->reset_domain, 
> &adev->kfd.reset_work);
>  }
>  
> +void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev)
> +{
> +     struct kfd_dev *kfd = adev->kfd.dev;
> +     unsigned int i;
> +
> +     if (!kfd)
> +             return;
> +
> +     for (i = 0; i < kfd->num_nodes; i++) {
> +             struct kfd_node *node = kfd->nodes[i];
> +
> +             kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_DOORBELL |
> +                                         KFD_MMAP_GPU_ID(node->id),
> +                                         kfd_doorbell_process_slice(kfd));
> +             kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_MMIO |
> +                                         KFD_MMAP_GPU_ID(node->id),
> +                                         PAGE_SIZE);
> +     }
> +}
> +
> +
>  int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
>                               u32 domain, void **mem_obj, uint64_t *gpu_addr,
>                               void **cpu_ptr, bool cp_mqd_gfx9)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 2bf6a31c194d..5333e052d56d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -360,6 +360,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
> *adev,
>               uint64_t size, u32 alloc_flag, int8_t xcp_id);
>  void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
>               uint64_t size, u32 alloc_flag, int8_t xcp_id);
> +void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev);
>  
>  u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 1202a72ff063..6760c9331f46 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5844,6 +5844,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>       /* We need to lock reset domain only once both for XGMI and single 
> device */
>       amdgpu_device_recovery_get_reset_lock(adev, &device_list);
>  
> +     /* unmap all the mappings of doorbell and framebuffer to prevent user 
> space from
> +      * accessing them
> +      */
> +     unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
> +     amdgpu_amdkfd_clear_kfd_mapping(adev);
> +
>       amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
>                                     hive, need_emergency_restart);
>       if (need_emergency_restart)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 84b9bde7f371..1be1b1dd2341 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -69,6 +69,21 @@ static const struct class kfd_class = {
>       .name = kfd_dev_name,
>  };
>  
> +/*
> + * Cache the address space of the chardev on first open so that the reset
> + * path can drop all userspace mappings of doorbell and MMIO ranges via
> + * unmap_mapping_range().
> + */
> +static struct address_space *kfd_dev_mapping;
> +
> +void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const 
> holelen)
> +{
> +     struct address_space *mapping = READ_ONCE(kfd_dev_mapping);
> +
> +     if (mapping)
> +             unmap_mapping_range(mapping, holebegin, holelen, 1);
> +}
> +
>  static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct 
> kfd_process *p, __u32 gpu_id)
>  {
>       struct kfd_process_device *pdd;
> @@ -135,6 +150,13 @@ static int kfd_open(struct inode *inode, struct file 
> *filep)
>       if (iminor(inode) != 0)
>               return -ENODEV;
>  
> +     /*
> +      * /dev/kfd is a single chardev so all opens share one inode. Cache
> +      * its address_space on the first open for use by the reset path.
> +      */
> +     if (!READ_ONCE(kfd_dev_mapping))
> +             cmpxchg(&kfd_dev_mapping, NULL, inode->i_mapping);

That stuff looks really odd. Mostly @Felix why is that necessary?

Apart from that the patch looks good to me.

Regards,
Christian.


> +
>       is_32bit_user_mode = in_compat_syscall();
>  
>       if (is_32bit_user_mode) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index a6ff1db477f9..f037062c33ea 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -399,6 +399,7 @@ enum kfd_mempool {
>  /* Character device interface */
>  int kfd_chardev_init(void);
>  void kfd_chardev_exit(void);
> +void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const 
> holelen);
>  
>  /**
>   * enum kfd_unmap_queues_filter - Enum for queue filters.

Reply via email to