On 3/21/2025 4:22 PM, Emily Deng wrote:
> We need to check whether kq has been initialized correctly in
> kq_acquire_packet_buffer.
> Otherwise it will hit memory corruption during recovery, because recovery
> uninitializes
> kq first.
> 
> We need to flush the TLB after recovery succeeds, as BOs may have been
> created and mapped during recovery.

Is this related to any specific type of 'reset'? For mode-2/mode-1 types
of reset, the expectation is that the GC as a whole is reset, which includes
the GPU VM block.

Thanks,
Lijo

> 
> Signed-off-by: Emily Deng <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  4 ++++
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 22 +++++++++++++++++++
>  4 files changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index b9c82be6ce13..eb2df5842618 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -1000,6 +1000,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
>               return 0;
>  
>       for (i = 0; i < kfd->num_nodes; i++) {
> +             kfd_flush_all_processes(kfd->nodes[i]);
>               ret = kfd_resume(kfd->nodes[i]);
>               if (ret)
>                       return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2b0a830f5b29..5e4ae969818e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -238,6 +238,10 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq,
>       uint64_t wptr64;
>       unsigned int *queue_address;
>  
> +     if (!kq) {
> +             pr_debug("kq has not been initialized\n");
> +             goto err_no_space;
> +     }
>       /* When rptr == wptr, the buffer is empty.
>        * When rptr == wptr + 1, the buffer is full.
>        * It is always rptr that advances to the position of wptr, rather than
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f6aedf69c644..6c073ead2b06 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1059,7 +1059,7 @@ int kfd_process_evict_queues(struct kfd_process *p, 
> uint32_t trigger);
>  int kfd_process_restore_queues(struct kfd_process *p);
>  void kfd_suspend_all_processes(void);
>  int kfd_resume_all_processes(void);
> -
> +void kfd_flush_all_processes(struct kfd_node *node);
>  struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process 
> *process,
>                                                        uint32_t gpu_id);
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 7c0c24732481..4ed03359020b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -2110,6 +2110,28 @@ int kfd_resume_all_processes(void)
>       return ret;
>  }
>  
> +void kfd_flush_all_processes(struct kfd_node *node)
> +{
> +     struct kfd_process *p;
> +     struct kfd_process_device *pdd;
> +     unsigned int temp;
> +     int idx = srcu_read_lock(&kfd_processes_srcu);
> +     struct amdgpu_vm *vm;
> +
> +     hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
> +             pdd = kfd_get_process_device_data(node, p);
> +             if (!pdd)
> +                     continue;
> +             vm = drm_priv_to_vm(pdd->drm_priv);
> +             if (!vm)
> +                     continue;
> +             atomic64_inc(&vm->tlb_seq);
> +             kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
> +     }
> +     srcu_read_unlock(&kfd_processes_srcu, idx);
> +
> +}
> +
>  int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process,
>                         struct vm_area_struct *vma)
>  {

Reply via email to