I thought in the BACO reset function.

The top level reset function doesn't do much more than increment the 
vram_lost_counter either.

Christian.

Am 23.08.19 um 10:32 schrieb Liu, Monk:
>>> On the other hand wouldn't it be simpler to just increment 
>>> vram_lost_counter?
> In where ? if you mean in amdgpu_device_recover routine I won't do that since 
> the reset() can do any kind of reset like:
> 1) PF FLR
> 2) mode1/2 reset
> 3) magic reset through config space
> 4) BACO reset
>
> PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO reset is 
> definitely a vram lost reset
>
> If you increase the counter in general function that will be not accurate
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumer...@gmail.com>
> Sent: Friday, August 23, 2019 4:27 PM
> To: Liu, Monk <monk....@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset 
> function
>
> Am 23.08.19 um 05:34 schrieb Monk Liu:
>> for SOC15/vega10 the BACO reset would introduce vram lost in the high
>> end address range and current kmd's vram lost checking cannot catch it
>> since it only check visible frame buffer
>>
>> TODO:
>> to confirm if mode1/2 reset would introduce vram lost
> Looks good in general, but I would make the value mandatory or maybe use a 
> special return code instead.
>
> On the other hand wouldn't it be simpler to just increment vram_lost_counter?
>
> Christian.
>
>> Signed-off-by: Monk Liu <monk....@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----
>>    drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-
>>    drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---
>>    drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-
>>    drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-
>>    drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-
>>    7 files changed, 22 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index f6ae565..1fe3756 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {
>>      int (*read_register)(struct amdgpu_device *adev, u32 se_num,
>>                           u32 sh_num, u32 reg_offset, u32 *value);
>>      void (*set_vga_state)(struct amdgpu_device *adev, bool state);
>> -    int (*reset)(struct amdgpu_device *adev);
>> +    int (*reset)(struct amdgpu_device *adev, bool *lost);
>>      enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);
>>      /* get the reference clock */
>>      u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7 @@
>> int emu_soc_asic_init(struct amdgpu_device *adev);
>>     * ASICs macro.
>>     */
>>    #define amdgpu_asic_set_vga_state(adev, state)
>> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define
>> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))
>> +#define amdgpu_asic_reset(adev, lost)
>> +(adev)->asic_funcs->reset((adev), (lost))
>>    #define amdgpu_asic_reset_method(adev) 
>> (adev)->asic_funcs->reset_method((adev))
>>    #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))
>>    #define amdgpu_asic_set_uvd_clocks(adev, v, d)
>> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 02b3e7d..8668cb8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
>> work_struct *__work)
>>      struct amdgpu_device *adev =
>>              container_of(__work, struct amdgpu_device, xgmi_reset_work);
>>    
>> -    adev->asic_reset_res =  amdgpu_asic_reset(adev);
>> +    adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);
>>      if (adev->asic_reset_res)
>>              DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>>                       adev->asic_reset_res, adev->ddev->unique); @@ -2751,7 
>> +2751,7 @@
>> int amdgpu_device_init(struct amdgpu_device *adev,
>>       *  E.g., driver was not cleanly unloaded previously, etc.
>>       */
>>      if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>> -            r = amdgpu_asic_reset(adev);
>> +            r = amdgpu_asic_reset(adev, NULL);
>>              if (r) {
>>                      dev_err(adev->dev, "asic reset on init failed\n");
>>                      goto failed;
>> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
>> suspend, bool fbcon)
>>              pci_disable_device(dev->pdev);
>>              pci_set_power_state(dev->pdev, PCI_D3hot);
>>      } else {
>> -            r = amdgpu_asic_reset(adev);
>> +            r = amdgpu_asic_reset(adev, NULL);
>>              if (r)
>>                      DRM_ERROR("amdgpu asic reset failed\n");
>>      }
>> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct 
>> amdgpu_hive_info *hive,
>>                              if (!queue_work(system_highpri_wq, 
>> &tmp_adev->xgmi_reset_work))
>>                                      r = -EALREADY;
>>                      } else
>> -                            r = amdgpu_asic_reset(tmp_adev);
>> +                            r = amdgpu_asic_reset(tmp_adev, &vram_lost);
>>    
>>                      if (r) {
>>                              DRM_ERROR("ASIC reset failed with error, %d for 
>> drm dev, %s", @@
>> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
>> *hive,
>>                              if (r)
>>                                      goto out;
>>    
>> -                            vram_lost = 
>> amdgpu_device_check_vram_lost(tmp_adev);
>> +                            if (!vram_lost)
>> +                                    vram_lost = 
>> amdgpu_device_check_vram_lost(tmp_adev);
>> +
>>                              if (vram_lost) {
>>                                      DRM_INFO("VRAM is lost due to GPU 
>> reset!\n");
>>                                      
>> atomic_inc(&tmp_adev->vram_lost_counter);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c
>> b/drivers/gpu/drm/amd/amdgpu/cik.c
>> index 7b63d7a..0f25b82 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
>> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct 
>> amdgpu_device *adev)
>>     * to reset them.
>>     * Returns 0 for success.
>>     */
>> -static int cik_asic_reset(struct amdgpu_device *adev)
>> +static int cik_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>>    {
>>      int r;
>>    
>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c
>> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
>> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)
>>              return AMD_RESET_METHOD_MODE1;
>>    }
>>    
>> -static int nv_asic_reset(struct amdgpu_device *adev)
>> +static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>>    {
>>    
>>      /* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@
>> static int nv_asic_reset(struct amdgpu_device *adev)
>>      int ret = 0;
>>      struct smu_context *smu = &adev->smu;
>>    
>> -    if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>> +    if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
>> +            if (vramlost)
>> +                    *vramlost = true;
>>              ret = smu_baco_reset(smu);
>> -    else
>> +    }
>> +    else {
>>              ret = nv_asic_mode1_reset(adev);
>> +    }
>>    
>>      return ret;
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c
>> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/si.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/si.c
>> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device 
>> *adev,
>>    }
>>    
>>    //xxx: not implemented
>> -static int si_asic_reset(struct amdgpu_device *adev)
>> +static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>>    {
>>      return 0;
>>    }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
>> b/drivers/gpu/drm/amd/amdgpu/soc15.c
>> index fe2212df..12b2966 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
>> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
>>              return AMD_RESET_METHOD_MODE1;
>>    }
>>    
>> -static int soc15_asic_reset(struct amdgpu_device *adev)
>> +static int soc15_asic_reset(struct amdgpu_device *adev, bool
>> +*vramlost)
>>    {
>>      switch (soc15_asic_reset_method(adev)) {
>>              case AMD_RESET_METHOD_BACO:
>> +                    if (vramlost)
>> +                            *vramlost = true;
>>                      return soc15_asic_baco_reset(adev);
>>              case AMD_RESET_METHOD_MODE2:
>>                      return soc15_mode2_reset(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c
>> b/drivers/gpu/drm/amd/amdgpu/vi.c index 56c882b..8eceb00 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
>> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device 
>> *adev)
>>     * to reset them.
>>     * Returns 0 for success.
>>     */
>> -static int vi_asic_reset(struct amdgpu_device *adev)
>> +static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>>    {
>>      int r;
>>    

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to