Re: [PATCH] drm/amdgpu: Add more page fault info printing for GFX10

2019-08-12 Thread Christian König

Hi Yong,

that is an intentional differentiation.

The values behind the ":" are hardware values from the page faults 
interrupt vector.


All values printed without the ":" are additional things we get from the 
kernel task information based on the pasid.


Please keep that,
Christian.

Am 12.08.19 um 21:20 schrieb Zhao, Yong:

Hi Christian,

I feel with ":" it is better, because without it I found it is not easy
to interpret the printing. Moreover, it continues the format of the
former part.

It looks like this:

[  190.686668] amdgpu :03:00.0: [gfxhub0] retry page fault (src_id:0
ring:0 vmid:8 pasid:32771, for process:kfdtest pid:3273 thread:kfdtest
pid:3273)

vs without ":"

[  190.686668] amdgpu :03:00.0: [gfxhub0] retry page fault (src_id:0
ring:0 vmid:8 pasid:32771, for process kfdtest pid 3273 thread kfdtest
pid 3273)


If you are not convinced, I can change it to without ":".


Regards,

Yong

On 2019-08-12 3:12 p.m., Christian König wrote:

Am 12.08.19 um 21:05 schrieb Zhao, Yong:

The printing we did for GFX9 was not propogated to GFX10 somehow, so fix
it now.

Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca
Signed-off-by: Yong Zhao 
---
   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++
   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++-
   2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 4e3ac1084a94..f23be98e9897 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct
amdgpu_device *adev,
   }
     if (printk_ratelimit()) {
+    struct amdgpu_task_info task_info;
+
+    memset(&task_info, 0, sizeof(struct amdgpu_task_info));
+    amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+
   dev_err(adev->dev,
-    "[%s] VMC page fault (src_id:%u ring:%u vmid:%u
pasid:%u)\n",
+    "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, "
+    "for process:%s pid:%d thread:%s pid:%d)\n",
   entry->vmid_src ? "mmhub" : "gfxhub",
   entry->src_id, entry->ring_id, entry->vmid,
-    entry->pasid);
-    dev_err(adev->dev, "  at page 0x%016llx from %d\n",
+    entry->pasid, task_info.process_name, task_info.tgid,
+    task_info.task_name, task_info.pid);
+    dev_err(adev->dev, "  in page starting at address 0x%016llx
from client %d\n",
   addr, entry->client_id);
-    if (!amdgpu_sriov_vf(adev))
+    if (!amdgpu_sriov_vf(adev)) {
   dev_err(adev->dev,
-    "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
+    "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
   status);
+    dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",
+    REG_GET_FIELD(status,
+    GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS));
+    dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n",
+    REG_GET_FIELD(status,
+    GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR));
+    dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n",
+    REG_GET_FIELD(status,
+    GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS));
+    dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
+    REG_GET_FIELD(status,
+    GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
+    dev_err(adev->dev, "\t RW: 0x%lx\n",
+    REG_GET_FIELD(status,
+    GCVM_L2_PROTECTION_FAULT_STATUS, RW));
+    }
   }
     return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 296e2d982578..34c4c2d08550 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct
amdgpu_device *adev,
     dev_err(adev->dev,
   "[%s] %s page fault (src_id:%u ring:%u vmid:%u "
-    "pasid:%u, for process %s pid %d thread %s pid %d)\n",
+    "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n",

I think the text actually looks better without the ":".


   hub_name, retry_fault ? "retry" : "no-retry",
   entry->src_id, entry->ring_id, entry->vmid,
   entry->pasid, task_info.process_name, task_info.tgid,
@@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct
amdgpu_device *adev,
   dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
   REG_GET_FIELD(status,
   VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
+    dev_err(adev->dev, "\t RW: 0x%lx\n",
+    REG_GET_FIELD(status,
+    VM_L2_PROTECTION_FAULT_STATUS, RW));

That should probably be a separate patch since it is fixing gfx9.

Apart from that the patch looks good to me,
Christian.


   

Re: [PATCH] drm/amdgpu: Add more page fault info printing for GFX10

2019-08-12 Thread Zhao, Yong
Hi Christian,

I feel with ":" it is better, because without it I found it is not easy 
to interpret the printing. Moreover, it continues the format of the 
former part.

It looks like this:

[  190.686668] amdgpu :03:00.0: [gfxhub0] retry page fault (src_id:0 
ring:0 vmid:8 pasid:32771, for process:kfdtest pid:3273 thread:kfdtest 
pid:3273)

vs without ":"

[  190.686668] amdgpu :03:00.0: [gfxhub0] retry page fault (src_id:0 
ring:0 vmid:8 pasid:32771, for process kfdtest pid 3273 thread kfdtest 
pid 3273)


If you are not convinced, I can change it to without ":".


Regards,

Yong

On 2019-08-12 3:12 p.m., Christian König wrote:
> Am 12.08.19 um 21:05 schrieb Zhao, Yong:
>> The printing we did for GFX9 was not propogated to GFX10 somehow, so fix
>> it now.
>>
>> Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca
>> Signed-off-by: Yong Zhao 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++-
>>   2 files changed, 32 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 4e3ac1084a94..f23be98e9897 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>   }
>>     if (printk_ratelimit()) {
>> +    struct amdgpu_task_info task_info;
>> +
>> +    memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> +    amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> +
>>   dev_err(adev->dev,
>> -    "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
>> pasid:%u)\n",
>> +    "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, "
>> +    "for process:%s pid:%d thread:%s pid:%d)\n",
>>   entry->vmid_src ? "mmhub" : "gfxhub",
>>   entry->src_id, entry->ring_id, entry->vmid,
>> -    entry->pasid);
>> -    dev_err(adev->dev, "  at page 0x%016llx from %d\n",
>> +    entry->pasid, task_info.process_name, task_info.tgid,
>> +    task_info.task_name, task_info.pid);
>> +    dev_err(adev->dev, "  in page starting at address 0x%016llx 
>> from client %d\n",
>>   addr, entry->client_id);
>> -    if (!amdgpu_sriov_vf(adev))
>> +    if (!amdgpu_sriov_vf(adev)) {
>>   dev_err(adev->dev,
>> -    "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
>> +    "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
>>   status);
>> +    dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",
>> +    REG_GET_FIELD(status,
>> +    GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS));
>> +    dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n",
>> +    REG_GET_FIELD(status,
>> +    GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR));
>> +    dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n",
>> +    REG_GET_FIELD(status,
>> +    GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS));
>> +    dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
>> +    REG_GET_FIELD(status,
>> +    GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
>> +    dev_err(adev->dev, "\t RW: 0x%lx\n",
>> +    REG_GET_FIELD(status,
>> +    GCVM_L2_PROTECTION_FAULT_STATUS, RW));
>> +    }
>>   }
>>     return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index 296e2d982578..34c4c2d08550 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>     dev_err(adev->dev,
>>   "[%s] %s page fault (src_id:%u ring:%u vmid:%u "
>> -    "pasid:%u, for process %s pid %d thread %s pid %d)\n",
>> +    "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n",
>
> I think the text actually looks better without the ":".
>
>>   hub_name, retry_fault ? "retry" : "no-retry",
>>   entry->src_id, entry->ring_id, entry->vmid,
>>   entry->pasid, task_info.process_name, task_info.tgid,
>> @@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct 
>> amdgpu_device *adev,
>>   dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
>>   REG_GET_FIELD(status,
>>   VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
>> +    dev_err(adev->dev, "\t RW: 0x%lx\n",
>> +    REG_GET_FIELD(status,
>> +    VM_L2_PROTECTION_FAULT_STATUS, RW));
>
> That should probably be a separate patch since it is fixing gfx9.
>
> Apart from that the patch looks good to me,
> Christian.
>
>>     }
>>   }
>
___
amd-

Re: [PATCH] drm/amdgpu: Add more page fault info printing for GFX10

2019-08-12 Thread Christian König

Am 12.08.19 um 21:05 schrieb Zhao, Yong:

The printing we did for GFX9 was not propogated to GFX10 somehow, so fix
it now.

Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca
Signed-off-by: Yong Zhao 
---
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++-
  2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 4e3ac1084a94..f23be98e9897 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct 
amdgpu_device *adev,
}
  
  	if (printk_ratelimit()) {

+   struct amdgpu_task_info task_info;
+
+   memset(&task_info, 0, sizeof(struct amdgpu_task_info));
+   amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+
dev_err(adev->dev,
-   "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
pasid:%u)\n",
+   "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, "
+   "for process:%s pid:%d thread:%s pid:%d)\n",
entry->vmid_src ? "mmhub" : "gfxhub",
entry->src_id, entry->ring_id, entry->vmid,
-   entry->pasid);
-   dev_err(adev->dev, "  at page 0x%016llx from %d\n",
+   entry->pasid, task_info.process_name, task_info.tgid,
+   task_info.task_name, task_info.pid);
+   dev_err(adev->dev, "  in page starting at address 0x%016llx from 
client %d\n",
addr, entry->client_id);
-   if (!amdgpu_sriov_vf(adev))
+   if (!amdgpu_sriov_vf(adev)) {
dev_err(adev->dev,
-   "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
+   "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
status);
+   dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS));
+   dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR));
+   dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, 
PERMISSION_FAULTS));
+   dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, 
MAPPING_ERROR));
+   dev_err(adev->dev, "\t RW: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, RW));
+   }
}
  
  	return 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 296e2d982578..34c4c2d08550 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
  
  		dev_err(adev->dev,

"[%s] %s page fault (src_id:%u ring:%u vmid:%u "
-   "pasid:%u, for process %s pid %d thread %s pid %d)\n",
+   "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n",


I think the text actually looks better without the ":".


hub_name, retry_fault ? "retry" : "no-retry",
entry->src_id, entry->ring_id, entry->vmid,
entry->pasid, task_info.process_name, task_info.tgid,
@@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
REG_GET_FIELD(status,
VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
+   dev_err(adev->dev, "\t RW: 0x%lx\n",
+   REG_GET_FIELD(status,
+   VM_L2_PROTECTION_FAULT_STATUS, RW));


That should probably be a separate patch since it is fixing gfx9.

Apart from that the patch looks good to me,
Christian.

  
  		}

}


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amdgpu: Add more page fault info printing for GFX10

2019-08-12 Thread Zhao, Yong
The printing we did for GFX9 was not propogated to GFX10 somehow, so fix
it now.

Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca
Signed-off-by: Yong Zhao 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++-
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 4e3ac1084a94..f23be98e9897 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct 
amdgpu_device *adev,
}
 
if (printk_ratelimit()) {
+   struct amdgpu_task_info task_info;
+
+   memset(&task_info, 0, sizeof(struct amdgpu_task_info));
+   amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+
dev_err(adev->dev,
-   "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 
pasid:%u)\n",
+   "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, "
+   "for process:%s pid:%d thread:%s pid:%d)\n",
entry->vmid_src ? "mmhub" : "gfxhub",
entry->src_id, entry->ring_id, entry->vmid,
-   entry->pasid);
-   dev_err(adev->dev, "  at page 0x%016llx from %d\n",
+   entry->pasid, task_info.process_name, task_info.tgid,
+   task_info.task_name, task_info.pid);
+   dev_err(adev->dev, "  in page starting at address 0x%016llx 
from client %d\n",
addr, entry->client_id);
-   if (!amdgpu_sriov_vf(adev))
+   if (!amdgpu_sriov_vf(adev)) {
dev_err(adev->dev,
-   "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
+   "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
status);
+   dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS));
+   dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR));
+   dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, 
PERMISSION_FAULTS));
+   dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, 
MAPPING_ERROR));
+   dev_err(adev->dev, "\t RW: 0x%lx\n",
+   REG_GET_FIELD(status,
+   GCVM_L2_PROTECTION_FAULT_STATUS, RW));
+   }
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 296e2d982578..34c4c2d08550 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
 
dev_err(adev->dev,
"[%s] %s page fault (src_id:%u ring:%u vmid:%u "
-   "pasid:%u, for process %s pid %d thread %s pid %d)\n",
+   "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n",
hub_name, retry_fault ? "retry" : "no-retry",
entry->src_id, entry->ring_id, entry->vmid,
entry->pasid, task_info.process_name, task_info.tgid,
@@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
REG_GET_FIELD(status,
VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
+   dev_err(adev->dev, "\t RW: 0x%lx\n",
+   REG_GET_FIELD(status,
+   VM_L2_PROTECTION_FAULT_STATUS, RW));
 
}
}
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx