[AMD Official Use Only]

Sorry for the confusing commit message。 
This interface is only for amdgpu ras tool new function. There is no impact on 
currently existing tools and scripts。

-----Original Message-----
From: Zhang, Hawking <hawking.zh...@amd.com> 
Sent: Tuesday, March 29, 2022 4:33 PM
To: Chai, Thomas <yipeng.c...@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <tao.zh...@amd.com>; Clements, John <john.cleme...@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

[AMD Official Use Only]

I'm not sure I understand the fix correctly - It seems to me it is trying to 
stop user/test cases that initiate error injection request back-to-back? But 
anyway, we shouldn't make the change or leverage debugfs for that purpose, and 
there is no guarantee test scripts/applications will follow the rule as well. 

I guess we need to identify the root cause case by case and stop the invalid 
request in kernel driver.

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <yipeng.c...@amd.com>
Sent: Tuesday, March 29, 2022 15:39
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <yipeng.c...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>; 
Zhou1, Tao <tao.zh...@amd.com>; Clements, John <john.cleme...@amd.com>; Chai, 
Thomas <yipeng.c...@amd.com>
Subject: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

Some AMDGPU RAS debugfs operations like UE injection can cause gpu reset. 
Before doing the next debugfs operation, the application should call poll to 
check if the gpu has finished recovering.

Signed-off-by: yipechai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 ++++++++++++++++++++++++-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..337e3e247a45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f,
 
                /* data.inject.address is offset instead of absolute gpu 
address */
                ret = amdgpu_ras_error_inject(adev, &data.inject);
+
+               if (!ret && (data.head.type == 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
+                       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+                       con->ras_ue_injected = 1;
+               }
                break;
        default:
                ret = -EINVAL;
@@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f,
        return size;
 }
 
+/**
+ * DOC: Support AMDGPU RAS debugfs poll interface
+ *
+ * Some AMDGPU RAS debugfs operations like UE injection
+ * can cause gpu reset. Before doing the next debugfs
+ * operation, the application should call poll to check
+ * if gpu is in recovering status.
+ */
+static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct 
+poll_table_struct *wait) {
+       struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       __poll_t mask = 0;
+
+       /* For UE injection, wait for gpu to finish recovery */
+       if (con->ras_ue_injected)
+               poll_wait(f, &con->gpu_ready_wait_wq, wait);
+
+       if (!atomic_read(&con->in_recovery))
+               mask = EPOLLIN | EPOLLRDNORM;
+
+       return mask;
+}
+
 /**
  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
  *
@@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file 
*f,
 
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
        .owner = THIS_MODULE,
+       .poll = amdgpu_ras_debugfs_ctrl_poll,
        .read = NULL,
        .write = amdgpu_ras_debugfs_ctrl_write,
        .llseek = default_llseek
@@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
        if (amdgpu_device_should_recover_gpu(ras->adev))
                amdgpu_device_gpu_recover(ras->adev, NULL);
        atomic_set(&ras->in_recovery, 0);
+
+       if (ras->ras_ue_injected) {
+               ras->ras_ue_injected = 0;
+               wake_up_all(&ras->gpu_ready_wait_wq);
+       }
 }
 
 /* alloc/realloc bps array */
@@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
        INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
        atomic_set(&con->ras_ce_count, 0);
        atomic_set(&con->ras_ue_count, 0);
-
+       init_waitqueue_head(&con->gpu_ready_wait_wq);
        con->objs = (struct ras_manager *)(con + 1);
 
        amdgpu_ras_set_context(adev, con);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 606df8869b89..aea6bbb71501 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -379,6 +379,12 @@ struct amdgpu_ras {
 
        /* Indicates smu whether need update bad channel info */
        bool update_channel_flag;
+
+       /* UE injection flag */
+       uint32_t  ras_ue_injected;
+
+       /* Waiting for gpu ready work queue */
+       wait_queue_head_t gpu_ready_wait_wq;
 };
 
 struct ras_fs_data {
--
2.25.1

Reply via email to