[Public]

KERNEL[173.150476] change   
/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0/drm/card1
 (drm)
ACTION=change
DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0/drm/card1
SUBSYSTEM=drm
WEDGED=none
DEVNAME=/dev/dri/card1
DEVTYPE=drm_minor
SEQNUM=6237
MAJOR=226
MINOR=1

Above is an example of a "drm_dev_wedged_event()" uevent.

You shouldn't discuss these together; they are two separate events occurring on 
different types of devices (a PCI device and a DRM device).
Software-defined devices and physical devices don't have a strict one-to-one 
mapping,
and on an XGMI system the device initiating the reset and the device that needs 
to be reset are different.
So, all independent PCI devices in the same XGMI link need to report events 
independently.

Best Regards,
Kevin

-----Original Message-----
From: Lazar, Lijo <[email protected]>
Sent: Friday, September 26, 2025 14:55
To: Wang, Yang(Kevin) <[email protected]>; [email protected]
Cc: Zhang, Hawking <[email protected]>; Deucher, Alexander 
<[email protected]>
Subject: RE: [PATCH] drm/amdgpu: notify amdgpu gpu reset state via uevent

[Public]

Presently, there is this one also - drm_dev_wedged_event. Perhaps it's better 
to modify this to include additional info like pre and post reset along with 
cause of reset?

Thanks,
Lijo
-----Original Message-----
From: amd-gfx <[email protected]> On Behalf Of Yang Wang
Sent: Friday, September 26, 2025 12:04 PM
To: [email protected]
Cc: Zhang, Hawking <[email protected]>; Deucher, Alexander 
<[email protected]>
Subject: [PATCH] drm/amdgpu: notify amdgpu gpu reset state via uevent

Use the uevent mechanism to expose the GPU reset state, so that system tools 
can more accurately monitor the device reset status.

example:
$ sudo cat /sys/kernel/debug/dri/<minor>/amdgpu_gpu_recover

KERNEL[172.053149] change   
/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 (pci)
ACTION=change
DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0
SUBSYSTEM=pci
RESET=1
DRIVER=amdgpu
PCI_CLASS=30000
PCI_ID=1002:73BF
PCI_SUBSYS_ID=1002:0E3A
PCI_SLOT_NAME=0000:05:00.0
MODALIAS=pci:v00001002d000073BFsv00001002sd00000E3Abc03sc00i00
SEQNUM=6235

KERNEL[173.137681] change   
/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 (pci)
ACTION=change
DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0
SUBSYSTEM=pci
RESET=0
DRIVER=amdgpu
PCI_CLASS=30000
PCI_ID=1002:73BF
PCI_SUBSYS_ID=1002:0E3A
PCI_SLOT_NAME=0000:05:00.0
MODALIAS=pci:v00001002d000073BFsv00001002sd00000E3Abc03sc00i00
SEQNUM=6236

Signed-off-by: Yang Wang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 39 ++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a0df4cabb99..73c946d9cbe1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1805,4 +1805,7 @@ void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
                           uint64_t uid);  uint64_t 
amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
                               enum amdgpu_uid_type type, uint8_t inst);
+
+int amdgpu_device_uevent_reset(struct amdgpu_device *adev, bool done);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a77000c2e0bb..300cc22dad91 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6318,6 +6318,7 @@ static int amdgpu_device_asic_reset(struct amdgpu_device 
*adev,

 retry: /* Rest of adevs pre asic reset from XGMI hive. */
        list_for_each_entry(tmp_adev, device_list, reset_list) {
+               amdgpu_device_uevent_reset(tmp_adev, false);
                r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
                /*TODO Should we stop ?*/
                if (r) {
@@ -6362,6 +6363,8 @@ static int amdgpu_device_asic_reset(struct amdgpu_device 
*adev,
                 * in before drm_sched_start.
                 */
                amdgpu_device_stop_pending_resets(tmp_adev);
+
+               amdgpu_device_uevent_reset(tmp_adev, true);
        }

        return r;
@@ -7669,3 +7672,39 @@ u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,

        return uid_info->uid[type][inst];  }
+
+__printf(3, 4)
+static int amdgpu_device_uevent_emit(struct amdgpu_device *adev, enum 
kobject_action action,
+                                    char *fmt, ...) {
+       struct kobject *kobj = &adev->dev->kobj;
+       char *uevent_env[2], *tmp;
+       va_list ap;
+       int ret;
+
+       va_start(ap, fmt);
+       tmp = kvasprintf(GFP_KERNEL, fmt, ap);
+       va_end(ap);
+
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       uevent_env[0] = tmp;
+       uevent_env[1] = NULL;
+
+       ret = kobject_uevent_env(kobj, action, uevent_env);
+
+       kvfree(tmp);
+
+out:
+       return ret;
+}
+
+int amdgpu_device_uevent_reset(struct amdgpu_device *adev, bool done) {
+       int val = done ? 0 : 1;
+
+       return amdgpu_device_uevent_emit(adev, KOBJ_CHANGE, "RESET=%d",
+val); }
--
2.34.1


Reply via email to