Am 23.08.19 um 10:57 schrieb Liu, Monk:
>>> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>>>                                     if (vram_lost) {
>>>                                             DRM_INFO("VRAM is lost due to 
>>> GPU reset!\n");
>>>                                             
>>> atomic_inc(&tmp_adev->vram_lost_counter);
> Above is the original logic, if we increment the counter in BACO reset 
> routine, we would potentially
> Have another counter increasement if by coincidence the 
> "amdgpu_device_check_vram_lost" successfully detected the vram lost (although 
> right now it didn't ..)

Yeah, but would increment it twice be a problem? I don't think so.

> Do you mean we remove the amdgpu_device_check_vram_lost(tmp_adev) in 
> device_recovery() routine ?

Please no, that thing certainly proved to be useful. Maybe we could 
investigate why it failed to auto detect the lost VRAM.

Christian.

> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <christian.koe...@amd.com>
> Sent: Friday, August 23, 2019 4:34 PM
> To: Liu, Monk <monk....@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset 
> function
>
> I thought in the BACO reset function.
>
> The top level reset function doesn't do much more than increment the 
> vram_lost_counter either.
>
> Christian.
>
> Am 23.08.19 um 10:32 schrieb Liu, Monk:
>>>> On the other hand wouldn't it be simpler to just increment 
>>>> vram_lost_counter?
>> In where ? if you mean in amdgpu_device_recover routine I won't do that 
>> since the reset() can do any kind of reset like:
>> 1) PF FLR
>> 2) mode1/2 reset
>> 3) magic reset through config space
>> 4) BACO reset
>>
>> PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO
>> reset is definitely a vram lost reset
>>
>> If you increase the counter in general function that will be not
>> accurate _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumer...@gmail.com>
>> Sent: Friday, August 23, 2019 4:27 PM
>> To: Liu, Monk <monk....@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for
>> reset function
>>
>> Am 23.08.19 um 05:34 schrieb Monk Liu:
>>> for SOC15/vega10 the BACO reset would introduce vram lost in the high
>>> end address range and current kmd's vram lost checking cannot catch
>>> it since it only check visible frame buffer
>>>
>>> TODO:
>>> to confirm if mode1/2 reset would introduce vram lost
>> Looks good in general, but I would make the value mandatory or maybe use a 
>> special return code instead.
>>
>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?
>>
>> Christian.
>>
>>> Signed-off-by: Monk Liu <monk....@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----
>>>     drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-
>>>     drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---
>>>     drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-
>>>     drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-
>>>     drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-
>>>     7 files changed, 22 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index f6ae565..1fe3756 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {
>>>             int (*read_register)(struct amdgpu_device *adev, u32 se_num,
>>>                                  u32 sh_num, u32 reg_offset, u32 *value);
>>>             void (*set_vga_state)(struct amdgpu_device *adev, bool state);
>>> -   int (*reset)(struct amdgpu_device *adev);
>>> +   int (*reset)(struct amdgpu_device *adev, bool *lost);
>>>             enum amd_reset_method (*reset_method)(struct amdgpu_device 
>>> *adev);
>>>             /* get the reference clock */
>>>             u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7
>>> @@ int emu_soc_asic_init(struct amdgpu_device *adev);
>>>      * ASICs macro.
>>>      */
>>>     #define amdgpu_asic_set_vga_state(adev, state)
>>> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define
>>> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))
>>> +#define amdgpu_asic_reset(adev, lost)
>>> +(adev)->asic_funcs->reset((adev), (lost))
>>>     #define amdgpu_asic_reset_method(adev) 
>>> (adev)->asic_funcs->reset_method((adev))
>>>     #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))
>>>     #define amdgpu_asic_set_uvd_clocks(adev, v, d)
>>> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 02b3e7d..8668cb8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
>>> work_struct *__work)
>>>             struct amdgpu_device *adev =
>>>                     container_of(__work, struct amdgpu_device, 
>>> xgmi_reset_work);
>>>     
>>> -   adev->asic_reset_res =  amdgpu_asic_reset(adev);
>>> +   adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);
>>>             if (adev->asic_reset_res)
>>>                     DRM_WARN("ASIC reset failed with error, %d for drm dev, 
>>> %s",
>>>                              adev->asic_reset_res, adev->ddev->unique); @@ 
>>> -2751,7 +2751,7
>>> @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>              *  E.g., driver was not cleanly unloaded previously, etc.
>>>              */
>>>             if (!amdgpu_sriov_vf(adev) && 
>>> amdgpu_asic_need_reset_on_init(adev)) {
>>> -           r = amdgpu_asic_reset(adev);
>>> +           r = amdgpu_asic_reset(adev, NULL);
>>>                     if (r) {
>>>                             dev_err(adev->dev, "asic reset on init 
>>> failed\n");
>>>                             goto failed;
>>> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, 
>>> bool suspend, bool fbcon)
>>>                     pci_disable_device(dev->pdev);
>>>                     pci_set_power_state(dev->pdev, PCI_D3hot);
>>>             } else {
>>> -           r = amdgpu_asic_reset(adev);
>>> +           r = amdgpu_asic_reset(adev, NULL);
>>>                     if (r)
>>>                             DRM_ERROR("amdgpu asic reset failed\n");
>>>             }
>>> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct 
>>> amdgpu_hive_info *hive,
>>>                                     if (!queue_work(system_highpri_wq, 
>>> &tmp_adev->xgmi_reset_work))
>>>                                             r = -EALREADY;
>>>                             } else
>>> -                           r = amdgpu_asic_reset(tmp_adev);
>>> +                           r = amdgpu_asic_reset(tmp_adev, &vram_lost);
>>>     
>>>                             if (r) {
>>>                                     DRM_ERROR("ASIC reset failed with 
>>> error, %d for drm dev, %s",
>>> @@
>>> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
>>> *hive,
>>>                                     if (r)
>>>                                             goto out;
>>>     
>>> -                           vram_lost = 
>>> amdgpu_device_check_vram_lost(tmp_adev);
>>> +                           if (!vram_lost)
>>> +                                   vram_lost = 
>>> amdgpu_device_check_vram_lost(tmp_adev);
>>> +
>>>                                     if (vram_lost) {
>>>                                             DRM_INFO("VRAM is lost due to 
>>> GPU reset!\n");
>>>                                             
>>> atomic_inc(&tmp_adev->vram_lost_counter);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c
>>> b/drivers/gpu/drm/amd/amdgpu/cik.c
>>> index 7b63d7a..0f25b82 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
>>> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct 
>>> amdgpu_device *adev)
>>>      * to reset them.
>>>      * Returns 0 for success.
>>>      */
>>> -static int cik_asic_reset(struct amdgpu_device *adev)
>>> +static int cik_asic_reset(struct amdgpu_device *adev, bool
>>> +*vramlost)
>>>     {
>>>             int r;
>>>     
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c
>>> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
>>> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)
>>>                     return AMD_RESET_METHOD_MODE1;
>>>     }
>>>     
>>> -static int nv_asic_reset(struct amdgpu_device *adev)
>>> +static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>>>     {
>>>     
>>>             /* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@
>>> static int nv_asic_reset(struct amdgpu_device *adev)
>>>             int ret = 0;
>>>             struct smu_context *smu = &adev->smu;
>>>     
>>> -   if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>>> +   if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
>>> +           if (vramlost)
>>> +                   *vramlost = true;
>>>                     ret = smu_baco_reset(smu);
>>> -   else
>>> +   }
>>> +   else {
>>>                     ret = nv_asic_mode1_reset(adev);
>>> +   }
>>>     
>>>             return ret;
>>>     }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c
>>> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/si.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/si.c
>>> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct 
>>> amdgpu_device *adev,
>>>     }
>>>     
>>>     //xxx: not implemented
>>> -static int si_asic_reset(struct amdgpu_device *adev)
>>> +static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>>>     {
>>>             return 0;
>>>     }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> b/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> index fe2212df..12b2966 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
>>> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
>>>                     return AMD_RESET_METHOD_MODE1;
>>>     }
>>>     
>>> -static int soc15_asic_reset(struct amdgpu_device *adev)
>>> +static int soc15_asic_reset(struct amdgpu_device *adev, bool
>>> +*vramlost)
>>>     {
>>>             switch (soc15_asic_reset_method(adev)) {
>>>                     case AMD_RESET_METHOD_BACO:
>>> +                   if (vramlost)
>>> +                           *vramlost = true;
>>>                             return soc15_asic_baco_reset(adev);
>>>                     case AMD_RESET_METHOD_MODE2:
>>>                             return soc15_mode2_reset(adev); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
>>> index 56c882b..8eceb00 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
>>> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device 
>>> *adev)
>>>      * to reset them.
>>>      * Returns 0 for success.
>>>      */
>>> -static int vi_asic_reset(struct amdgpu_device *adev)
>>> +static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>>>     {
>>>             int r;
>>>     

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to