On 6/18/2024 12:03 PM, YiPeng Chai wrote:
> Add completion to wait for ras reset to complete.
> 
> Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>  2 files changed, 12 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 898889600771..7f8e6ca07957 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if 
> *ras_block)
>  
>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>  
> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  120000 //ms
> +
>  enum amdgpu_ras_retire_page_reservation {
>       AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>       AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
> *work)
>               atomic_set(&hive->ras_recovery, 0);
>               amdgpu_put_xgmi_hive(hive);
>       }
> +
> +     complete_all(&ras->ras_recovery_completion);
>  }
>  
>  /* alloc/realloc bps array */
> @@ -2911,10 +2915,16 @@ static int 
> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>  
>               flush_delayed_work(&con->page_retirement_dwork);
>  
> +             reinit_completion(&con->ras_recovery_completion);
> +
>               con->gpu_reset_flags |= reset;
>               amdgpu_ras_reset_gpu(adev);
>  
>               *gpu_reset = reset;
> +             if (!wait_for_completion_timeout(&con->ras_recovery_completion,
> +                             
> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
> +                     dev_err(adev->dev, "Waiting for GPU to complete ras 
> reset timeout! reset:0x%x\n",
> +                             reset);

If a mode-1 reset gets to execute first, due to job timeout/HWS detect
cases in poison timeout, then the RAS handler will never get executed.
Why is this wait required?

Thanks,
Lijo

>       }
>  
>       return 0;
> @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>               }
>       }
>  
> +     init_completion(&con->ras_recovery_completion);
>       mutex_init(&con->page_rsv_lock);
>       INIT_KFIFO(con->poison_fifo);
>       mutex_init(&con->page_retirement_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 91daf48be03a..b47f03edac87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -537,6 +537,7 @@ struct amdgpu_ras {
>       DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
>       struct ras_ecc_log_info  umc_ecc_log;
>       struct delayed_work page_retirement_dwork;
> +     struct completion ras_recovery_completion;
>  
>       /* Fatal error detected flag */
>       atomic_t fed;

Reply via email to