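The coredump needs to contain accurate data, so reporting a page fault
left over from a previous issue is incorrect.

Snapshot the VM fault info when the coredump is created, print it only
if a fault was actually recorded (status != 0), and clear the saved
fault info once the job timeout has been handled.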

Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-pra...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 13 ++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c          |  5 +++++
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index de70747a099d..6fa53e070b50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -273,11 +273,13 @@ __amdgpu_devcoredump_read(char *buffer, size_t count, struct amdgpu_coredump_inf
        }
 
        /* Add page fault information */
-       fault_info = &coredump->adev->vm_manager.fault_info;
-       drm_printf(&p, "\n[%s] Page fault observed\n",
-                  fault_info->vmhub ? "mmhub" : "gfxhub");
-       drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
-       drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
+       fault_info = &coredump->fault_info;
+       if (fault_info->status != 0) {
+               drm_printf(&p, "\n[%s] Page fault observed\n",
+                          fault_info->vmhub ? "mmhub" : "gfxhub");
+               drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
+               drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
+       }
 
        /* dump the ip state for each ip */
        drm_printf(&p, "IP Dump\n");
@@ -377,6 +379,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
 
        coredump->skip_vram_check = skip_vram_check;
        coredump->reset_vram_lost = vram_lost;
+       coredump->fault_info = adev->vm_manager.fault_info;
 
        if (job && job->pasid) {
                struct amdgpu_task_info *ti;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
index 33f2f6fdfcf7..38ccdd3d6213 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
@@ -37,6 +37,7 @@ struct amdgpu_coredump_info {
        struct timespec64               reset_time;
        bool                            skip_vram_check;
        bool                            reset_vram_lost;
+       struct amdgpu_vm_fault_info     fault_info;
        struct amdgpu_ring              *ring;
        /* Readable form of coredevdump, generate once to speed up
         * reading it (see drm_coredump_printer's documentation).
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index acb21fc8b3ce..5ee9d2cd74e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -199,6 +199,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 
 exit:
        drm_dev_exit(idx);
+
+       /* Clear fault info to avoid reporting the same fault. */
+       adev->vm_manager.fault_info.status = 0;
+       adev->vm_manager.fault_info.addr = 0;
+
        return DRM_GPU_SCHED_STAT_NOMINAL;
 }
 
-- 
2.43.0

Reply via email to