RE: [PATCH] drm/amdgpu: avoid ras error injection for retired page

2019-09-30 Thread Zhou1, Tao


> -----Original Message-----
> From: Chen, Guchun 
> Sent: 2019年9月30日 15:14
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org;
> Zhang, Hawking 
> Subject: RE: [PATCH] drm/amdgpu: avoid ras error injection for retired page
> 
> 
> 
> 
> Regards,
> Guchun
> 
> -----Original Message-----
> From: Zhou1, Tao 
> Sent: Monday, September 30, 2019 2:58 PM
> To: amd-gfx@lists.freedesktop.org; Chen, Guchun
> ; Zhang, Hawking 
> Cc: Zhou1, Tao 
> Subject: [PATCH] drm/amdgpu: avoid ras error injection for retired page
> 
> check whether a page is bad page before error injection
> 
> Signed-off-by: Tao Zhou 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38
> +
>  1 file changed, 38 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index fe3a57e567c8..d50e565b0b20 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -71,6 +71,9 @@ const char *ras_block_string[] = {
> 
>  atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
> 
> +static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
> + uint64_t addr);
> +
>  static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
>   size_t size, loff_t *pos)
>  {
> @@ -290,6 +293,13 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct
> file *f, const char __user *
>   break;
>   }
> 
> + /* ce/ue error injection for a bad page is not allowed */
> + if (amdgpu_ras_check_bad_page(adev, data.inject.address))
> {
> + DRM_WARN("DRM WARN: 0x%llx has been marked
> as bad before error injection!\n",
> + data.inject.address);
> + break;
> + }
> +
>   /* data.inject.address is offset instead of absolute gpu
> address */
>   ret = amdgpu_ras_error_inject(adev, &data.inject);
>   break;
> @@ -1430,6 +1440,34 @@ static int amdgpu_ras_load_bad_pages(struct
> amdgpu_device *adev)
>   return ret;
>  }
> 
> +/* check if an address belongs to bad page */
> +static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
> + uint64_t addr)
> +{
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_err_handler_data *data;
> + int i, ret = false;
> [Guchun] It would be better to use bool type for the ret variable, to stay
> consistent with the function's return type.
> Apart from that, this patch is: Reviewed-by: Guchun Chen
> 

[Tao] Thanks, I'll correct it before submitting.

> 
> +
> + if (!con || !con->eh_data)
> + return ret;
> +
> + mutex_lock(&con->recovery_lock);
> + data = con->eh_data;
> + if (!data)
> + goto out;
> +
> + addr >>= AMDGPU_GPU_PAGE_SHIFT;
> + for (i = 0; i < data->count; i++)
> + if (addr == data->bps[i].retired_page) {
> + ret = true;
> + goto out;
> + }
> +
> +out:
> + mutex_unlock(&con->recovery_lock);
> + return ret;
> +}
> +
>  static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
> {
>   /* Note: the caller should guarantee con and data are not NULL */
> --
> 2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH] drm/amdgpu: avoid ras error injection for retired page

2019-09-30 Thread Chen, Guchun



Regards,
Guchun

-----Original Message-----
From: Zhou1, Tao  
Sent: Monday, September 30, 2019 2:58 PM
To: amd-gfx@lists.freedesktop.org; Chen, Guchun ; Zhang, 
Hawking 
Cc: Zhou1, Tao 
Subject: [PATCH] drm/amdgpu: avoid ras error injection for retired page

check whether a page is bad page before error injection

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fe3a57e567c8..d50e565b0b20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -71,6 +71,9 @@ const char *ras_block_string[] = {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+   uint64_t addr);
+
 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
 {
@@ -290,6 +293,13 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
break;
}
 
+   /* ce/ue error injection for a bad page is not allowed */
+   if (amdgpu_ras_check_bad_page(adev, data.inject.address)) {
+   DRM_WARN("DRM WARN: 0x%llx has been marked as bad 
before error injection!\n",
+   data.inject.address);
+   break;
+   }
+
/* data.inject.address is offset instead of absolute gpu 
address */
ret = amdgpu_ras_error_inject(adev, &data.inject);
break;
@@ -1430,6 +1440,34 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
return ret;
 }
 
+/* check if an address belongs to bad page */
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+   uint64_t addr)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   int i, ret = false;
[Guchun] It would be better to use bool type for the ret variable, to stay
consistent with the function's return type.
Apart from that, this patch is: Reviewed-by: Guchun Chen

+
+   if (!con || !con->eh_data)
+   return ret;
+
+   mutex_lock(&con->recovery_lock);
+   data = con->eh_data;
+   if (!data)
+   goto out;
+
+   addr >>= AMDGPU_GPU_PAGE_SHIFT;
+   for (i = 0; i < data->count; i++)
+   if (addr == data->bps[i].retired_page) {
+   ret = true;
+   goto out;
+   }
+
+out:
+   mutex_unlock(&con->recovery_lock);
+   return ret;
+}
+
 static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
 {
/* Note: the caller should guarantee con and data are not NULL */
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amdgpu: avoid ras error injection for retired page

2019-09-29 Thread Zhou1, Tao
check whether a page is bad page before error injection

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fe3a57e567c8..d50e565b0b20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -71,6 +71,9 @@ const char *ras_block_string[] = {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+   uint64_t addr);
+
 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
 {
@@ -290,6 +293,13 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
break;
}
 
+   /* ce/ue error injection for a bad page is not allowed */
+   if (amdgpu_ras_check_bad_page(adev, data.inject.address)) {
+   DRM_WARN("DRM WARN: 0x%llx has been marked as bad 
before error injection!\n",
+   data.inject.address);
+   break;
+   }
+
/* data.inject.address is offset instead of absolute gpu 
address */
ret = amdgpu_ras_error_inject(adev, &data.inject);
break;
@@ -1430,6 +1440,34 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
return ret;
 }
 
+/* check if an address belongs to bad page */
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+   uint64_t addr)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   int i, ret = false;
+
+   if (!con || !con->eh_data)
+   return ret;
+
+   mutex_lock(&con->recovery_lock);
+   data = con->eh_data;
+   if (!data)
+   goto out;
+
+   addr >>= AMDGPU_GPU_PAGE_SHIFT;
+   for (i = 0; i < data->count; i++)
+   if (addr == data->bps[i].retired_page) {
+   ret = true;
+   goto out;
+   }
+
+out:
+   mutex_unlock(&con->recovery_lock);
+   return ret;
+}
+
 static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
 {
/* Note: the caller should guarantee con and data are not NULL */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx