The coredump needs to contain accurate data and reporting a page fault from a previous issue is incorrect.
Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-pra...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 13 ++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 +++++ 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index de70747a099d..6fa53e070b50 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -273,11 +273,13 @@ __amdgpu_devcoredump_read(char *buffer, size_t count, struct amdgpu_coredump_inf } /* Add page fault information */ - fault_info = &coredump->adev->vm_manager.fault_info; - drm_printf(&p, "\n[%s] Page fault observed\n", - fault_info->vmhub ? "mmhub" : "gfxhub"); - drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr); - drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status); + fault_info = &coredump->fault_info; + if (fault_info->status != 0) { + drm_printf(&p, "\n[%s] Page fault observed\n", + fault_info->vmhub ? "mmhub" : "gfxhub"); + drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr); + drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status); + } /* dump the ip state for each ip */ drm_printf(&p, "IP Dump\n"); @@ -377,6 +379,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, coredump->skip_vram_check = skip_vram_check; coredump->reset_vram_lost = vram_lost; + coredump->fault_info = adev->vm_manager.fault_info; if (job && job->pasid) { struct amdgpu_task_info *ti; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h index 33f2f6fdfcf7..38ccdd3d6213 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h @@ -37,6 +37,7 @@ struct amdgpu_coredump_info { struct timespec64 reset_time; bool skip_vram_check; bool reset_vram_lost; + struct amdgpu_vm_fault_info fault_info; struct amdgpu_ring *ring; /* Readable form of coredevdump, generate once to speed up * reading it (see drm_coredump_printer's documentation). diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index acb21fc8b3ce..5ee9d2cd74e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -199,6 +199,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) exit: drm_dev_exit(idx); + + /* Clear fault info to avoid reporting the same fault. */ + adev->vm_manager.fault_info.status = 0; + adev->vm_manager.fault_info.addr = 0; + return DRM_GPU_SCHED_STAT_NOMINAL; } -- 2.43.0