Re: [PATCH] drm/amdgpu: recreate retired page's bo if vram get lost in gpu reset

2019-09-30 Thread Christian König

Am 29.09.19 um 08:35 schrieb Zhou1, Tao:

the info of retired page's bo may be lost if vram lost is encountered
in gpu reset (gpu page table in vram may be lost), force to recreate
all bos


NAK, that is complete nonsense.

When VRAM is lost the content of BOs in VRAM is lost, but the BOs itself 
are still there.


Regards,
Christian.



Signed-off-by: Tao Zhou 
Suggested-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 44 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
  3 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62955156653c..a89f6d053b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3675,6 +3675,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU 
reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
+   amdgpu_ras_recovery_reset(tmp_adev);
}
  
  r = amdgpu_gtt_mgr_recover(

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 486568ded6d6..1b3b597aa973 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1573,6 +1573,50 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
  
  	return 0;

  }
+
+/*
+ * the info of retired page's bo may be lost if vram lost is encountered
+ * in gpu reset (gpu page table in vram may be lost), force to recreate
+ * all bos
+ */
+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   uint64_t bp;
+   struct amdgpu_bo *bo = NULL;
+   int i;
+
+   if (!con)
+   return;
+
+   data = con->eh_data;
+   if (!data)
+   return;
+
+   /* force to reserve all retired page again */
+   data->last_reserved = 0;
+
+   for (i = data->last_reserved; i < data->count; i++) {
+   bp = data->bps[i].retired_page;
+
+   /* There are three cases of reserve error should be ignored:
+* 1) a ras bad page has been allocated (used by someone);
+* 2) a ras bad page has been reserved (duplicate error 
injection
+*for one page);
+* 3) bo info isn't lost in gpu reset
+*/
+   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
+  AMDGPU_GPU_PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  &bo, NULL))
+   DRM_NOTE("RAS NOTE: recreate bo for retired page 0x%llx 
fail\n", bp);
+   else
+   data->bps_bo[i] = bo;
+   data->last_reserved = i + 1;
+   bo = NULL;
+   }
+}
  /* recovery end */
  
  /* return 0 if ras will reset gpu and repost.*/

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd3428c98..7a606d3be806 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -479,6 +479,7 @@ static inline int amdgpu_ras_is_supported(struct 
amdgpu_device *adev,
  }
  
  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);

+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev);
  int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
unsigned int block);
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH] drm/amdgpu: recreate retired page's bo if vram get lost in gpu reset

2019-09-29 Thread Zhou1, Tao


> -Original Message-
> From: Chen, Guchun 
> Sent: 2019年9月29日 15:06
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org;
> Grodzovsky, Andrey ; Zhang, Hawking
> 
> Subject: RE: [PATCH] drm/amdgpu: recreate retired page's bo if vram get lost
> in gpu reset
> 
> 
> -Original Message-
> From: Zhou1, Tao 
> Sent: Sunday, September 29, 2019 2:35 PM
> To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
> ; Chen, Guchun ;
> Zhang, Hawking 
> Cc: Zhou1, Tao 
> Subject: [PATCH] drm/amdgpu: recreate retired page's bo if vram get lost in
> gpu reset
> 
> the info of retired page's bo may be lost if vram lost is encountered in gpu
> reset (gpu page table in vram may be lost), force to recreate all bos
> 
> Signed-off-by: Tao Zhou 
> Suggested-by: Andrey Grodzovsky 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 44
> ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
>  3 files changed, 46 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 62955156653c..a89f6d053b3f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3675,6 +3675,7 @@ static int amdgpu_do_asic_reset(struct
> amdgpu_hive_info *hive,
>   if (vram_lost) {
>   DRM_INFO("VRAM is lost due to GPU
> reset!\n");
>   amdgpu_inc_vram_lost(tmp_adev);
> +
>   amdgpu_ras_recovery_reset(tmp_adev);
>   }
> 
>   r = amdgpu_gtt_mgr_recover(
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 486568ded6d6..1b3b597aa973 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1573,6 +1573,50 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
> 
>   return 0;
>  }
> +
> +/*
> + * the info of retired page's bo may be lost if vram lost is
> +encountered
> + * in gpu reset (gpu page table in vram may be lost), force to recreate
> + * all bos
> + */
> +void amdgpu_ras_recovery_reset(struct amdgpu_device *adev) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_err_handler_data *data;
> + uint64_t bp;
> + struct amdgpu_bo *bo = NULL;
> + int i;
> +
> + if (!con)
> + return;
> +
> + data = con->eh_data;
> + if (!data)
> + return;
> +
> + /* force to reserve all retired page again */
> + data->last_reserved = 0;
> +
> + for (i = data->last_reserved; i < data->count; i++) {
> + bp = data->bps[i].retired_page;
> +
> + /* There are three cases of reserve error should be ignored:
> +  * 1) a ras bad page has been allocated (used by someone);
> +  * 2) a ras bad page has been reserved (duplicate error
> injection
> +  *for one page);
> +  * 3) bo info isn't lost in gpu reset
> +  */
> + if (amdgpu_bo_create_kernel_at(adev, bp <<
> AMDGPU_GPU_PAGE_SHIFT,
> +AMDGPU_GPU_PAGE_SIZE,
> +AMDGPU_GEM_DOMAIN_VRAM,
> +&bo, NULL))
> + DRM_NOTE("RAS NOTE: recreate bo for retired page
> 0x%llx fail\n", bp);
> + else
> + data->bps_bo[i] = bo;
> + data->last_reserved = i + 1;
> + bo = NULL;
> + }
> [Guchun] In amdgpu_ras.c, we have another function
> amdgpu_ras_reserve_bad_pages, which has the same bo creation code as
> above.
> So can we put these common codes into another internal func dedicated for
> bo creating jobs to reduce code lines and get this internal func being called 
> by
> amdgpu_ras_reserve_bad_pages and amdgpu_ras_recovery_reset
> respectively?

[Tao] There are some minor differences, I'll add a new patch if the for loop 
can be reused.

> +}
>  /* recovery end */
> 
>  /* return 0 if ras will reset gpu and repost.*/ diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index f80fd3428c98..7a606d3be806 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -479,6 +479,7 @@ static inline int amdgpu_ras_is_supported(struct
> amdgpu_device *adev,  }
> 
>  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
> +void amdgpu_ras_recovery_reset(struct amdgpu_device *adev);
>  int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
>   unsigned int block);
> 
> --
> 2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH] drm/amdgpu: recreate retired page's bo if vram get lost in gpu reset

2019-09-29 Thread Chen, Guchun

-Original Message-
From: Zhou1, Tao  
Sent: Sunday, September 29, 2019 2:35 PM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey 
; Chen, Guchun ; Zhang, Hawking 

Cc: Zhou1, Tao 
Subject: [PATCH] drm/amdgpu: recreate retired page's bo if vram get lost in gpu 
reset

the info of retired page's bo may be lost if vram lost is encountered in gpu 
reset (gpu page table in vram may be lost), force to recreate all bos

Signed-off-by: Tao Zhou 
Suggested-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 44 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
 3 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62955156653c..a89f6d053b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3675,6 +3675,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU 
reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
+   amdgpu_ras_recovery_reset(tmp_adev);
}
 
r = amdgpu_gtt_mgr_recover(
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 486568ded6d6..1b3b597aa973 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1573,6 +1573,50 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
 
return 0;
 }
+
+/*
+ * the info of retired page's bo may be lost if vram lost is 
+encountered
+ * in gpu reset (gpu page table in vram may be lost), force to recreate
+ * all bos
+ */
+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   uint64_t bp;
+   struct amdgpu_bo *bo = NULL;
+   int i;
+
+   if (!con)
+   return;
+
+   data = con->eh_data;
+   if (!data)
+   return;
+
+   /* force to reserve all retired page again */
+   data->last_reserved = 0;
+
+   for (i = data->last_reserved; i < data->count; i++) {
+   bp = data->bps[i].retired_page;
+
+   /* There are three cases of reserve error should be ignored:
+* 1) a ras bad page has been allocated (used by someone);
+* 2) a ras bad page has been reserved (duplicate error 
injection
+*for one page);
+* 3) bo info isn't lost in gpu reset
+*/
+   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
+  AMDGPU_GPU_PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  &bo, NULL))
+   DRM_NOTE("RAS NOTE: recreate bo for retired page 0x%llx 
fail\n", bp);
+   else
+   data->bps_bo[i] = bo;
+   data->last_reserved = i + 1;
+   bo = NULL;
+   }
[Guchun] In amdgpu_ras.c, we have another function 
amdgpu_ras_reserve_bad_pages, which has the same bo creation code as above.
So can we put these common codes into another internal func dedicated for bo 
creating jobs to reduce code lines and get this internal func being called by 
amdgpu_ras_reserve_bad_pages and amdgpu_ras_recovery_reset respectively?
+}
 /* recovery end */
 
 /* return 0 if ras will reset gpu and repost.*/ diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd3428c98..7a606d3be806 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -479,6 +479,7 @@ static inline int amdgpu_ras_is_supported(struct 
amdgpu_device *adev,  }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
unsigned int block);
 
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amdgpu: recreate retired page's bo if vram get lost in gpu reset

2019-09-28 Thread Zhou1, Tao
the info of retired page's bo may be lost if vram lost is encountered
in gpu reset (gpu page table in vram may be lost), force to recreate
all bos

Signed-off-by: Tao Zhou 
Suggested-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 44 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
 3 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62955156653c..a89f6d053b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3675,6 +3675,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU 
reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
+   amdgpu_ras_recovery_reset(tmp_adev);
}
 
r = amdgpu_gtt_mgr_recover(
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 486568ded6d6..1b3b597aa973 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1573,6 +1573,50 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
 
return 0;
 }
+
+/*
+ * the info of retired page's bo may be lost if vram lost is encountered
+ * in gpu reset (gpu page table in vram may be lost), force to recreate
+ * all bos
+ */
+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   uint64_t bp;
+   struct amdgpu_bo *bo = NULL;
+   int i;
+
+   if (!con)
+   return;
+
+   data = con->eh_data;
+   if (!data)
+   return;
+
+   /* force to reserve all retired page again */
+   data->last_reserved = 0;
+
+   for (i = data->last_reserved; i < data->count; i++) {
+   bp = data->bps[i].retired_page;
+
+   /* There are three cases of reserve error should be ignored:
+* 1) a ras bad page has been allocated (used by someone);
+* 2) a ras bad page has been reserved (duplicate error 
injection
+*for one page);
+* 3) bo info isn't lost in gpu reset
+*/
+   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
+  AMDGPU_GPU_PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  &bo, NULL))
+   DRM_NOTE("RAS NOTE: recreate bo for retired page 0x%llx 
fail\n", bp);
+   else
+   data->bps_bo[i] = bo;
+   data->last_reserved = i + 1;
+   bo = NULL;
+   }
+}
 /* recovery end */
 
 /* return 0 if ras will reset gpu and repost.*/
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd3428c98..7a606d3be806 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -479,6 +479,7 @@ static inline int amdgpu_ras_is_supported(struct 
amdgpu_device *adev,
 }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
unsigned int block);
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx