Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump

2024-03-07 Thread Khatri, Sunil



On 3/7/2024 6:10 PM, Christian König wrote:

Am 07.03.24 um 09:37 schrieb Khatri, Sunil:


On 3/7/2024 1:47 PM, Christian König wrote:

Am 06.03.24 um 19:19 schrieb Sunil Khatri:

Add page fault information to the devcoredump.

Output of devcoredump:
 AMDGPU Device Coredump 
version: 1
kernel: 6.7.0-amd-staging-drm-next
module: amdgpu
time: 29.725011811
process_name: soft_recovery_p PID: 1720

Ring timed out details
IP Type: 0 Ring Name: gfx_0.0.0

[gfxhub] Page fault observed for GPU family:143
Faulty page starting at address 0x
Protection fault status register:0x301031

VRAM is lost due to GPU reset!

Signed-off-by: Sunil Khatri 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c

index 147100c27c2d..d7fea6cdf2f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
offset, size_t count,

 coredump->ring->name);
  }
  +    if (coredump->fault_info.status) {
+    struct amdgpu_vm_fault_info *fault_info = 
>fault_info;

+
+    drm_printf(, "\n[%s] Page fault observed for GPU 
family:%d\n",

+   fault_info->vmhub ? "mmhub" : "gfxhub",
+   coredump->adev->family);
+    drm_printf(, "Faulty page starting at address 0x%016llx\n",
+   fault_info->addr);
+    drm_printf(, "Protection fault status register:0x%x\n",
+   fault_info->status);
+    }
+
  if (coredump->reset_vram_lost)
-    drm_printf(, "VRAM is lost due to GPU reset!\n");
+    drm_printf(, "\nVRAM is lost due to GPU reset!\n");
  if (coredump->adev->reset_info.num_regs) {
  drm_printf(, "AMDGPU register dumps:\nOffset: 
Value:\n");
  @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device 
*adev, bool vram_lost,

  if (job) {
  s_job = >base;
  coredump->ring = to_amdgpu_ring(s_job->sched);
+    coredump->fault_info = job->vm->fault_info;


That's illegal. The VM pointer might already be stale at this point.

I think you need to add the fault info of the last fault globally in 
the VRAM manager or move this to the process info Shashank is 
working on.
Are you saying that during the reset or otherwise a vm which is part 
of this job could have been freed  and we might have a NULL 
dereference or invalid reference? Till now based on the resets and 
pagefaults that i have created till now using the same app which we 
are using for IH overflow i am able to get the valid vm only.


Assuming  amdgpu_vm is freed for this job or stale, are you 
suggesting to update this information in adev-> vm_manager along 
with existing per vm fault_info or only in vm_manager ?


Good question. having it both in the VM as well as the VM manager 
sounds like the simplest option for now.


Let me update the patch then with information in VM manager.

Regards
Sunil



Regards,
Christian.



Regards,
Christian.


  }
    coredump->adev = adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

index 60522963aaca..3197955264f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
  struct timespec64   reset_time;
  bool    reset_vram_lost;
  struct amdgpu_ring    *ring;
+    struct amdgpu_vm_fault_info    fault_info;
  };
  #endif






Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump

2024-03-07 Thread Christian König

Am 07.03.24 um 09:37 schrieb Khatri, Sunil:


On 3/7/2024 1:47 PM, Christian König wrote:

Am 06.03.24 um 19:19 schrieb Sunil Khatri:

Add page fault information to the devcoredump.

Output of devcoredump:
 AMDGPU Device Coredump 
version: 1
kernel: 6.7.0-amd-staging-drm-next
module: amdgpu
time: 29.725011811
process_name: soft_recovery_p PID: 1720

Ring timed out details
IP Type: 0 Ring Name: gfx_0.0.0

[gfxhub] Page fault observed for GPU family:143
Faulty page starting at address 0x
Protection fault status register:0x301031

VRAM is lost due to GPU reset!

Signed-off-by: Sunil Khatri 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c

index 147100c27c2d..d7fea6cdf2f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
offset, size_t count,

 coredump->ring->name);
  }
  +    if (coredump->fault_info.status) {
+    struct amdgpu_vm_fault_info *fault_info = 
>fault_info;

+
+    drm_printf(, "\n[%s] Page fault observed for GPU 
family:%d\n",

+   fault_info->vmhub ? "mmhub" : "gfxhub",
+   coredump->adev->family);
+    drm_printf(, "Faulty page starting at address 0x%016llx\n",
+   fault_info->addr);
+    drm_printf(, "Protection fault status register:0x%x\n",
+   fault_info->status);
+    }
+
  if (coredump->reset_vram_lost)
-    drm_printf(, "VRAM is lost due to GPU reset!\n");
+    drm_printf(, "\nVRAM is lost due to GPU reset!\n");
  if (coredump->adev->reset_info.num_regs) {
  drm_printf(, "AMDGPU register dumps:\nOffset: 
Value:\n");
  @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device 
*adev, bool vram_lost,

  if (job) {
  s_job = >base;
  coredump->ring = to_amdgpu_ring(s_job->sched);
+    coredump->fault_info = job->vm->fault_info;


That's illegal. The VM pointer might already be stale at this point.

I think you need to add the fault info of the last fault globally in 
the VRAM manager or move this to the process info Shashank is working 
on.
Are you saying that during the reset or otherwise a vm which is part 
of this job could have been freed  and we might have a NULL 
dereference or invalid reference? Till now based on the resets and 
pagefaults that i have created till now using the same app which we 
are using for IH overflow i am able to get the valid vm only.


Assuming  amdgpu_vm is freed for this job or stale, are you 
suggesting to update this information in adev-> vm_manager along with 
existing per vm fault_info or only in vm_manager ?


Good question. having it both in the VM as well as the VM manager sounds 
like the simplest option for now.


Regards,
Christian.



Regards,
Christian.


  }
    coredump->adev = adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

index 60522963aaca..3197955264f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
  struct timespec64   reset_time;
  bool    reset_vram_lost;
  struct amdgpu_ring    *ring;
+    struct amdgpu_vm_fault_info    fault_info;
  };
  #endif






Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump

2024-03-07 Thread Khatri, Sunil



On 3/7/2024 1:47 PM, Christian König wrote:

Am 06.03.24 um 19:19 schrieb Sunil Khatri:

Add page fault information to the devcoredump.

Output of devcoredump:
 AMDGPU Device Coredump 
version: 1
kernel: 6.7.0-amd-staging-drm-next
module: amdgpu
time: 29.725011811
process_name: soft_recovery_p PID: 1720

Ring timed out details
IP Type: 0 Ring Name: gfx_0.0.0

[gfxhub] Page fault observed for GPU family:143
Faulty page starting at address 0x
Protection fault status register:0x301031

VRAM is lost due to GPU reset!

Signed-off-by: Sunil Khatri 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c

index 147100c27c2d..d7fea6cdf2f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t 
offset, size_t count,

 coredump->ring->name);
  }
  +    if (coredump->fault_info.status) {
+    struct amdgpu_vm_fault_info *fault_info = 
>fault_info;

+
+    drm_printf(, "\n[%s] Page fault observed for GPU 
family:%d\n",

+   fault_info->vmhub ? "mmhub" : "gfxhub",
+   coredump->adev->family);
+    drm_printf(, "Faulty page starting at address 0x%016llx\n",
+   fault_info->addr);
+    drm_printf(, "Protection fault status register:0x%x\n",
+   fault_info->status);
+    }
+
  if (coredump->reset_vram_lost)
-    drm_printf(, "VRAM is lost due to GPU reset!\n");
+    drm_printf(, "\nVRAM is lost due to GPU reset!\n");
  if (coredump->adev->reset_info.num_regs) {
  drm_printf(, "AMDGPU register dumps:\nOffset: 
Value:\n");
  @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device 
*adev, bool vram_lost,

  if (job) {
  s_job = >base;
  coredump->ring = to_amdgpu_ring(s_job->sched);
+    coredump->fault_info = job->vm->fault_info;


That's illegal. The VM pointer might already be stale at this point.

I think you need to add the fault info of the last fault globally in 
the VRAM manager or move this to the process info Shashank is working on.
Are you saying that during the reset or otherwise a vm which is part 
of this job could have been freed  and we might have a NULL 
dereference or invalid reference? Till now based on the resets and 
pagefaults that i have created till now using the same app which we 
are using for IH overflow i am able to get the valid vm only.


Assuming  amdgpu_vm is freed for this job or stale, are you suggesting 
to update this information in adev-> vm_manager along with existing 
per vm fault_info or only in vm_manager ?


Regards,
Christian.


  }
    coredump->adev = adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

index 60522963aaca..3197955264f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
  struct timespec64   reset_time;
  bool    reset_vram_lost;
  struct amdgpu_ring    *ring;
+    struct amdgpu_vm_fault_info    fault_info;
  };
  #endif




Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump

2024-03-07 Thread Christian König

Am 06.03.24 um 19:19 schrieb Sunil Khatri:

Add page fault information to the devcoredump.

Output of devcoredump:
 AMDGPU Device Coredump 
version: 1
kernel: 6.7.0-amd-staging-drm-next
module: amdgpu
time: 29.725011811
process_name: soft_recovery_p PID: 1720

Ring timed out details
IP Type: 0 Ring Name: gfx_0.0.0

[gfxhub] Page fault observed for GPU family:143
Faulty page starting at address 0x
Protection fault status register:0x301031

VRAM is lost due to GPU reset!

Signed-off-by: Sunil Khatri 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 147100c27c2d..d7fea6cdf2f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, 
size_t count,
   coredump->ring->name);
}
  
+	if (coredump->fault_info.status) {

+   struct amdgpu_vm_fault_info *fault_info = >fault_info;
+
+   drm_printf(, "\n[%s] Page fault observed for GPU family:%d\n",
+  fault_info->vmhub ? "mmhub" : "gfxhub",
+  coredump->adev->family);
+   drm_printf(, "Faulty page starting at address 0x%016llx\n",
+  fault_info->addr);
+   drm_printf(, "Protection fault status register:0x%x\n",
+  fault_info->status);
+   }
+
if (coredump->reset_vram_lost)
-   drm_printf(, "VRAM is lost due to GPU reset!\n");
+   drm_printf(, "\nVRAM is lost due to GPU reset!\n");
if (coredump->adev->reset_info.num_regs) {
drm_printf(, "AMDGPU register dumps:\nOffset: Value:\n");
  
@@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,

if (job) {
s_job = >base;
coredump->ring = to_amdgpu_ring(s_job->sched);
+   coredump->fault_info = job->vm->fault_info;


That's illegal. The VM pointer might already be stale at this point.

I think you need to add the fault info of the last fault globally in the 
VRAM manager or move this to the process info Shashank is working on.


Regards,
Christian.


}
  
  	coredump->adev = adev;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 60522963aaca..3197955264f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
struct timespec64   reset_time;
boolreset_vram_lost;
struct amdgpu_ring  *ring;
+   struct amdgpu_vm_fault_info fault_info;
  };
  #endif
  




Re: [PATCH] drm/amdgpu: add vm fault information to devcoredump

2024-03-06 Thread Khatri, Sunil



On 3/7/2024 12:51 AM, Deucher, Alexander wrote:

[Public]


-Original Message-
From: Sunil Khatri 
Sent: Wednesday, March 6, 2024 1:20 PM
To: Deucher, Alexander ; Koenig, Christian
; Sharma, Shashank

Cc: amd-...@lists.freedesktop.org; dri-devel@lists.freedesktop.org; linux-
ker...@vger.kernel.org; Joshi, Mukul ; Paneer
Selvam, Arunpravin ; Khatri, Sunil

Subject: [PATCH] drm/amdgpu: add vm fault information to devcoredump

Add page fault information to the devcoredump.

Output of devcoredump:
 AMDGPU Device Coredump 
version: 1
kernel: 6.7.0-amd-staging-drm-next
module: amdgpu
time: 29.725011811
process_name: soft_recovery_p PID: 1720

Ring timed out details
IP Type: 0 Ring Name: gfx_0.0.0

[gfxhub] Page fault observed for GPU family:143 Faulty page starting at

I think we should add a separate section for the GPU identification information 
(family, PCI ids, IP versions, etc.).  For this patch, I think fine to just 
print the fault address and status.


Noted

Regards
Sunil


Alex


address 0x Protection fault status register:0x301031

VRAM is lost due to GPU reset!

Signed-off-by: Sunil Khatri 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 147100c27c2d..d7fea6cdf2f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t
offset, size_t count,
  coredump->ring->name);
   }

+ if (coredump->fault_info.status) {
+ struct amdgpu_vm_fault_info *fault_info = 

fault_info;

+
+ drm_printf(, "\n[%s] Page fault observed for GPU
family:%d\n",
+fault_info->vmhub ? "mmhub" : "gfxhub",
+coredump->adev->family);
+ drm_printf(, "Faulty page starting at address 0x%016llx\n",
+fault_info->addr);
+ drm_printf(, "Protection fault status register:0x%x\n",
+fault_info->status);
+ }
+
   if (coredump->reset_vram_lost)
- drm_printf(, "VRAM is lost due to GPU reset!\n");
+ drm_printf(, "\nVRAM is lost due to GPU reset!\n");
   if (coredump->adev->reset_info.num_regs) {
   drm_printf(, "AMDGPU register dumps:\nOffset:
Value:\n");

@@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device
*adev, bool vram_lost,
   if (job) {
   s_job = >base;
   coredump->ring = to_amdgpu_ring(s_job->sched);
+ coredump->fault_info = job->vm->fault_info;
   }

   coredump->adev = adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 60522963aaca..3197955264f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
   struct timespec64   reset_time;
   boolreset_vram_lost;
   struct amdgpu_ring  *ring;
+ struct amdgpu_vm_fault_info fault_info;
  };
  #endif

--
2.34.1


RE: [PATCH] drm/amdgpu: add vm fault information to devcoredump

2024-03-06 Thread Deucher, Alexander
[Public]

> -Original Message-
> From: Sunil Khatri 
> Sent: Wednesday, March 6, 2024 1:20 PM
> To: Deucher, Alexander ; Koenig, Christian
> ; Sharma, Shashank
> 
> Cc: amd-...@lists.freedesktop.org; dri-devel@lists.freedesktop.org; linux-
> ker...@vger.kernel.org; Joshi, Mukul ; Paneer
> Selvam, Arunpravin ; Khatri, Sunil
> 
> Subject: [PATCH] drm/amdgpu: add vm fault information to devcoredump
>
> Add page fault information to the devcoredump.
>
> Output of devcoredump:
>  AMDGPU Device Coredump 
> version: 1
> kernel: 6.7.0-amd-staging-drm-next
> module: amdgpu
> time: 29.725011811
> process_name: soft_recovery_p PID: 1720
>
> Ring timed out details
> IP Type: 0 Ring Name: gfx_0.0.0
>
> [gfxhub] Page fault observed for GPU family:143 Faulty page starting at

I think we should add a separate section for the GPU identification information 
(family, PCI ids, IP versions, etc.).  For this patch, I think fine to just 
print the fault address and status.

Alex

> address 0x Protection fault status register:0x301031
>
> VRAM is lost due to GPU reset!
>
> Signed-off-by: Sunil Khatri 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 15 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
>  2 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 147100c27c2d..d7fea6cdf2f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -203,8 +203,20 @@ amdgpu_devcoredump_read(char *buffer, loff_t
> offset, size_t count,
>  coredump->ring->name);
>   }
>
> + if (coredump->fault_info.status) {
> + struct amdgpu_vm_fault_info *fault_info = 
> >fault_info;
> +
> + drm_printf(, "\n[%s] Page fault observed for GPU
> family:%d\n",
> +fault_info->vmhub ? "mmhub" : "gfxhub",
> +coredump->adev->family);
> + drm_printf(, "Faulty page starting at address 0x%016llx\n",
> +fault_info->addr);
> + drm_printf(, "Protection fault status register:0x%x\n",
> +fault_info->status);
> + }
> +
>   if (coredump->reset_vram_lost)
> - drm_printf(, "VRAM is lost due to GPU reset!\n");
> + drm_printf(, "\nVRAM is lost due to GPU reset!\n");
>   if (coredump->adev->reset_info.num_regs) {
>   drm_printf(, "AMDGPU register dumps:\nOffset:
> Value:\n");
>
> @@ -253,6 +265,7 @@ void amdgpu_coredump(struct amdgpu_device
> *adev, bool vram_lost,
>   if (job) {
>   s_job = >base;
>   coredump->ring = to_amdgpu_ring(s_job->sched);
> + coredump->fault_info = job->vm->fault_info;
>   }
>
>   coredump->adev = adev;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 60522963aaca..3197955264f9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -98,6 +98,7 @@ struct amdgpu_coredump_info {
>   struct timespec64   reset_time;
>   boolreset_vram_lost;
>   struct amdgpu_ring  *ring;
> + struct amdgpu_vm_fault_info fault_info;
>  };
>  #endif
>
> --
> 2.34.1