Re: [PATCH 2/2] drm/amdgpu: Add system auto reboot to RAS.

2019-08-29 Thread Christian König

Am 28.08.19 um 22:00 schrieb Andrey Grodzovsky:

In case of a RAS error, allow the user to configure automatic system
reboot through ras_ctrl.
This is also part of the temporary workaround for the RAS
hang problem.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
  3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3ecee10..f1cff47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3805,6 +3805,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool in_ras_intr = amdgpu_ras_intr_triggered();
  
+	/*

+* Flush RAM to disk so that after reboot
+* the user can read log and see why the system rebooted.
+*
+* Using user mode app call instead of kernel APIs such as
+* ksys_sync_helper for backward compatibility with earlier
+* kernels into which this is also intended.
+*/
+   if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+   char *envp[] = { "HOME=/", NULL };
+   char *argv[] = { "/bin/sync", NULL };
+
+   DRM_WARN("Emergency reboot.");
+
+   call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+   emergency_restart();
+   }
+
need_full_reset = job_signaled = false;
INIT_LIST_HEAD(_list);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 086e6df..423a1ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -30,6 +30,7 @@
  #include "amdgpu_ras.h"
  #include "amdgpu_atomfirmware.h"
  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include 
  
  const char *ras_error_string[] = {

"none",
@@ -154,6 +155,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file 
*f,
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
+   else if (sscanf(str, "reboot %32s", block_name) == 1)
+   op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;


This is actually becoming quite a mess. We should consider removing the 
parsing in the long term and using separate debugfs files for each action.


Christian.


@@ -287,6 +290,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
/* data.inject.address is offset instead of absolute gpu 
address */
ret = amdgpu_ras_error_inject(adev, );
break;
+   case 3:
+   amdgpu_ras_get_context(adev)->reboot = true;
+   break;
default:
ret = -EINVAL;
break;
@@ -1733,6 +1739,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
  void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
  {
if (atomic_cmpxchg(_ras_in_intr, 0, 1) == 0) {
-   DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! 
Stopping all GPU jobs.\n");
+   DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT 
detected!\n");
+
+   amdgpu_ras_reset_gpu(adev, false);
}
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c0e22af..e3f0764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -333,6 +333,7 @@ struct amdgpu_ras {
struct mutex recovery_lock;
  
  	uint32_t flags;

+   bool reboot;
  };
  
  struct ras_fs_data {


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 2/2] drm/amdgpu: Add system auto reboot to RAS.

2019-08-28 Thread Andrey Grodzovsky
In case of a RAS error, allow the user to configure automatic system
reboot through ras_ctrl.
This is also part of the temporary workaround for the RAS
hang problem.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3ecee10..f1cff47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3805,6 +3805,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool in_ras_intr = amdgpu_ras_intr_triggered();
 
+   /*
+* Flush RAM to disk so that after reboot
+* the user can read log and see why the system rebooted.
+*
+* Using user mode app call instead of kernel APIs such as
+* ksys_sync_helper for backward compatibility with earlier
+* kernels into which this is also intended.
+*/
+   if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+   char *envp[] = { "HOME=/", NULL };
+   char *argv[] = { "/bin/sync", NULL };
+
+   DRM_WARN("Emergency reboot.");
+
+   call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+   emergency_restart();
+   }
+
need_full_reset = job_signaled = false;
INIT_LIST_HEAD(_list);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 086e6df..423a1ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -30,6 +30,7 @@
 #include "amdgpu_ras.h"
 #include "amdgpu_atomfirmware.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include 
 
 const char *ras_error_string[] = {
"none",
@@ -154,6 +155,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file 
*f,
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
+   else if (sscanf(str, "reboot %32s", block_name) == 1)
+   op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;
@@ -287,6 +290,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
/* data.inject.address is offset instead of absolute gpu 
address */
ret = amdgpu_ras_error_inject(adev, );
break;
+   case 3:
+   amdgpu_ras_get_context(adev)->reboot = true;
+   break;
default:
ret = -EINVAL;
break;
@@ -1733,6 +1739,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
if (atomic_cmpxchg(_ras_in_intr, 0, 1) == 0) {
-   DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! 
Stopping all GPU jobs.\n");
+   DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT 
detected!\n");
+
+   amdgpu_ras_reset_gpu(adev, false);
}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c0e22af..e3f0764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -333,6 +333,7 @@ struct amdgpu_ras {
struct mutex recovery_lock;
 
uint32_t flags;
+   bool reboot;
 };
 
 struct ras_fs_data {
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx