amdgpu: cleanups for vram lost handling

Andrey Grodzovsky Wed, 28 Feb 2018 05:41:54 -0800

Patch look good to me then.

Acked-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>



Andrey


On 02/28/2018 08:36 AM, Liu, Monk wrote:

                goto error;

amdgpu_irq_gpu_reset_resume_helper(adev);

        r = amdgpu_ib_ring_tests(adev);
-       if (r)
-               dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", 
r);

This 2 lines deletion seems unintentional

No, intentional, because if IB TEST failed the error will be reported by the 
fault IP

+       if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
+               atomic_inc(&adev->vram_lost_counter);
+               r = amdgpu_handle_vram_lost(adev);
        }

+error:

+
        return r;

Not doing incrementation of VRAM lost counter in case of error anymore is 
intentional ?


[ML] yeah, no need to increase the counter because GPU reset is failed so 
everything is meaningless, driver cannot used anymore



-----Original Message-----
From: Grodzovsky, Andrey
Sent: 2018年2月28日 21:29
To: Liu, Monk <monk....@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/4] drm/amdgpu: cleanups for vram lost handling



On 02/28/2018 02:21 AM, Monk Liu wrote:

1)create a routine "handle_vram_lost" to do the vram recovery, and put
it into amdgpu_device_reset/reset_sriov, this way no need of the extra
paramter to hold the VRAM LOST information and the related macros can
be removed.

3)show vram_recover failure if time out, and set TMO equal to
lockup_timeout if vram_recover is under SRIOV runtime mode.

4)report error if any ip reset failed for SR-IOV

Change-Id: I686e2b6133844c14948c206a2315c064a78c1d9c
Signed-off-by: Monk Liu <monk....@amd.com>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   4 -
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 136 
+++++++++++++++--------------
   2 files changed, 72 insertions(+), 68 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5bddfc1..abbc3f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -181,10 +181,6 @@ extern int amdgpu_cik_support;
   #define CIK_CURSOR_WIDTH 128
   #define CIK_CURSOR_HEIGHT 128

-/* GPU RESET flags */

-#define AMDGPU_RESET_INFO_VRAM_LOST  (1 << 0) -#define
AMDGPU_RESET_INFO_FULLRESET  (1 << 1)
-
   struct amdgpu_device;
   struct amdgpu_ib;
   struct amdgpu_cs_parser;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e9d81a8..39ece7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1591,6 +1591,8 @@ static int
amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)

r = block->version->funcs->hw_init(adev);

                        DRM_INFO("RE-INIT: %s %s\n", 
block->version->funcs->name,
r?"failed":"successed");
+                       if (r)
+                               return r;
                }
        }

@@ -1624,6 +1626,8 @@ static int

amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)

r = block->version->funcs->hw_init(adev);

                        DRM_INFO("RE-INIT: %s %s\n", 
block->version->funcs->name,
r?"failed":"successed");
+                       if (r)
+                               return r;
                }
        }

@@ -2471,17 +2475,71 @@ static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev,

        return r;
   }

+static int amdgpu_handle_vram_lost(struct amdgpu_device *adev) {

+       struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
+       struct amdgpu_bo *bo, *tmp;
+       struct dma_fence *fence = NULL, *next = NULL;
+       long r = 1;
+       int i = 0;
+       long tmo;
+
+       if (amdgpu_sriov_runtime(adev))
+               tmo = msecs_to_jiffies(amdgpu_lockup_timeout);
+       else
+               tmo = msecs_to_jiffies(100);
+
+       DRM_INFO("recover vram bo from shadow start\n");
+       mutex_lock(&adev->shadow_list_lock);
+       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+               next = NULL;
+               amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
+               if (fence) {
+                       r = dma_fence_wait_timeout(fence, false, tmo);
+                       if (r == 0)
+                               pr_err("wait fence %p[%d] timeout\n", fence, i);
+                       else if (r < 0)
+                               pr_err("wait fence %p[%d] interrupted\n", 
fence, i);
+                       if (r < 1) {
+                               dma_fence_put(fence);
+                               fence = next;
+                               break;
+                       }
+                       i++;
+               }
+
+               dma_fence_put(fence);
+               fence = next;
+       }
+       mutex_unlock(&adev->shadow_list_lock);
+
+       if (fence) {
+               r = dma_fence_wait_timeout(fence, false, tmo);
+               if (r == 0)
+                       pr_err("wait fence %p[%d] timeout\n", fence, i);
+               else if (r < 0)
+                       pr_err("wait fence %p[%d] interrupted\n", fence, i);
+
+       }
+       dma_fence_put(fence);
+
+       if (r > 0)
+               DRM_INFO("recover vram bo from shadow done\n");
+       else
+               DRM_ERROR("recover vram bo from shadow failed\n");
+
+       return (r > 0?0:1);
+}
+
   /*
    * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
    *
    * @adev: amdgpu device pointer
- * @reset_flags: output param tells caller the reset result
    *
    * attempt to do soft-reset or full-reset and reinitialize Asic
    * return 0 means successed otherwise failed
   */
-static int amdgpu_device_reset(struct amdgpu_device *adev,
-                              uint64_t* reset_flags)
+static int amdgpu_device_reset(struct amdgpu_device *adev)
   {
        bool need_full_reset, vram_lost = 0;
        int r;
@@ -2496,7 +2554,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
                        DRM_INFO("soft reset failed, will fallback to full 
reset!\n");
                        need_full_reset = true;
                }
-
        }

if (need_full_reset) {

@@ -2545,13 +2602,8 @@ static int amdgpu_device_reset(struct amdgpu_device 
*adev,
                }
        }

- if (reset_flags) {

-               if (vram_lost)
-                       (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
-
-               if (need_full_reset)
-                       (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
-       }
+       if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || 
vram_lost))
+               r = amdgpu_handle_vram_lost(adev);

return r;

   }
@@ -2560,14 +2612,11 @@ static int amdgpu_device_reset(struct amdgpu_device 
*adev,
    * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
    *
    * @adev: amdgpu device pointer
- * @reset_flags: output param tells caller the reset result
    *
    * do VF FLR and reinitialize Asic
    * return 0 means successed otherwise failed
   */
-static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
-                                    uint64_t *reset_flags,
-                                    bool from_hypervisor)
+static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool
+from_hypervisor)
   {
        int r;

@@ -2588,28 +2637,20 @@ static int amdgpu_device_reset_sriov(struct

amdgpu_device *adev,

/* now we are okay to resume SMC/CP/SDMA */

        r = amdgpu_device_ip_reinit_late_sriov(adev);
+       amdgpu_virt_release_full_gpu(adev, true);
        if (r)
                goto error;

amdgpu_irq_gpu_reset_resume_helper(adev);

        r = amdgpu_ib_ring_tests(adev);
-       if (r)
-               dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", 
r);

This 2 lines deletion seems unintentional

-error:

-       /* release full control of GPU after ib test */
-       amdgpu_virt_release_full_gpu(adev, true);
-
-       if (reset_flags) {
-               if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
-                       (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
-                       atomic_inc(&adev->vram_lost_counter);
-               }
-
-               /* VF FLR or hotlink reset is always full-reset */
-               (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
+       if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
+               atomic_inc(&adev->vram_lost_counter);
+               r = amdgpu_handle_vram_lost(adev);
        }

+error:

+
        return r;

Not doing incrementation of VRAM lost counter in case of error anymore is 
intentional ?

Andrey

@@ -2673,42 +2714,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

if (amdgpu_sriov_vf(adev))

-               r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : 
true);
+               r = amdgpu_device_reset_sriov(adev, job ? false : true);
        else
-               r = amdgpu_device_reset(adev, &reset_flags);
-
-       if (!r) {
-               if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags 
& AMD_IS_APU)) ||
-                       (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
-                       struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
-                       struct amdgpu_bo *bo, *tmp;
-                       struct dma_fence *fence = NULL, *next = NULL;
-
-                       DRM_INFO("recover vram bo from shadow\n");
-                       mutex_lock(&adev->shadow_list_lock);
-                       list_for_each_entry_safe(bo, tmp, &adev->shadow_list, 
shadow_list) {
-                               next = NULL;
-                               amdgpu_device_recover_vram_from_shadow(adev, ring, 
bo, &next);
-                               if (fence) {
-                                       r = dma_fence_wait(fence, false);
-                                       if (r) {
-                                               WARN(r, "recovery from shadow isn't 
completed\n");
-                                               break;
-                                       }
-                               }
-
-                               dma_fence_put(fence);
-                               fence = next;
-                       }
-                       mutex_unlock(&adev->shadow_list_lock);
-                       if (fence) {
-                               r = dma_fence_wait(fence, false);
-                               if (r)
-                                       WARN(r, "recovery from shadow isn't 
completed\n");
-                       }
-                       dma_fence_put(fence);
-               }
-       }
+               r = amdgpu_device_reset(adev);

for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

                struct amdgpu_ring *ring = adev->rings[i];


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 2/4] drm/amdgpu: cleanups for vram lost handling

Reply via email to