Re: [PATCH] drm/amdgpu: Add more page fault info printing for GFX10
Hi Yong, that is an intentional differentiation. The values behind the ":" are hardware values from the page faults interrupt vector. All values printed without the ":" are additional things we get from the kernel task information based on the pasid. Please keep that, Christian. Am 12.08.19 um 21:20 schrieb Zhao, Yong: Hi Christian, I feel with ":" it is better, because without it I found it is not easy to interpret the printing. Moreover, it continues the format of the former part. It looks like this: [ 190.686668] amdgpu :03:00.0: [gfxhub0] retry page fault (src_id:0 ring:0 vmid:8 pasid:32771, for process:kfdtest pid:3273 thread:kfdtest pid:3273) vs without ":" [ 190.686668] amdgpu :03:00.0: [gfxhub0] retry page fault (src_id:0 ring:0 vmid:8 pasid:32771, for process kfdtest pid 3273 thread kfdtest pid 3273) If you are not convinced, I can change it to without ":". Regards, Yong On 2019-08-12 3:12 p.m., Christian König wrote: Am 12.08.19 um 21:05 schrieb Zhao, Yong: The printing we did for GFX9 was not propogated to GFX10 somehow, so fix it now. Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca Signed-off-by: Yong Zhao --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +++- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 4e3ac1084a94..f23be98e9897 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, } if (printk_ratelimit()) { + struct amdgpu_task_info task_info; + + memset(&task_info, 0, sizeof(struct amdgpu_task_info)); + amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); + dev_err(adev->dev, - "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", + "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, " + "for process:%s pid:%d thread:%s pid:%d)\n", entry->vmid_src ? "mmhub" : "gfxhub", entry->src_id, entry->ring_id, entry->vmid, - entry->pasid); - dev_err(adev->dev, " at page 0x%016llx from %d\n", + entry->pasid, task_info.process_name, task_info.tgid, + task_info.task_name, task_info.pid); + dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n", addr, entry->client_id); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev)) { dev_err(adev->dev, - "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n", + "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n", status); + dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS)); + dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR)); + dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS)); + dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR)); + dev_err(adev->dev, "\t RW: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, RW)); + } } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 296e2d982578..34c4c2d08550 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, dev_err(adev->dev, "[%s] %s page fault (src_id:%u ring:%u vmid:%u " - "pasid:%u, for process %s pid %d thread %s pid %d)\n", + "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n", I think the text actually looks better without the ":". hub_name, retry_fault ? "retry" : "no-retry", entry->src_id, entry->ring_id, entry->vmid, entry->pasid, task_info.process_name, task_info.tgid, @@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n", REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR)); + dev_err(adev->dev, "\t RW: 0x%lx\n", + REG_GET_FIELD(status, + VM_L2_PROTECTION_FAULT_STATUS, RW)); That should probably be a separate patch since it is fixing gfx9. Apart from that the patch looks good to me, Christian.
Re: [PATCH] drm/amdgpu: Add more page fault info printing for GFX10
Hi Christian, I feel with ":" it is better, because without it I found it is not easy to interpret the printing. Moreover, it continues the format of the former part. It looks like this: [ 190.686668] amdgpu :03:00.0: [gfxhub0] retry page fault (src_id:0 ring:0 vmid:8 pasid:32771, for process:kfdtest pid:3273 thread:kfdtest pid:3273) vs without ":" [ 190.686668] amdgpu :03:00.0: [gfxhub0] retry page fault (src_id:0 ring:0 vmid:8 pasid:32771, for process kfdtest pid 3273 thread kfdtest pid 3273) If you are not convinced, I can change it to without ":". Regards, Yong On 2019-08-12 3:12 p.m., Christian König wrote: > Am 12.08.19 um 21:05 schrieb Zhao, Yong: >> The printing we did for GFX9 was not propogated to GFX10 somehow, so fix >> it now. >> >> Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca >> Signed-off-by: Yong Zhao >> --- >> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++ >> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +++- >> 2 files changed, 32 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c >> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c >> index 4e3ac1084a94..f23be98e9897 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c >> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c >> @@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct >> amdgpu_device *adev, >> } >> if (printk_ratelimit()) { >> + struct amdgpu_task_info task_info; >> + >> + memset(&task_info, 0, sizeof(struct amdgpu_task_info)); >> + amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); >> + >> dev_err(adev->dev, >> - "[%s] VMC page fault (src_id:%u ring:%u vmid:%u >> pasid:%u)\n", >> + "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, " >> + "for process:%s pid:%d thread:%s pid:%d)\n", >> entry->vmid_src ? "mmhub" : "gfxhub", >> entry->src_id, entry->ring_id, entry->vmid, >> - entry->pasid); >> - dev_err(adev->dev, " at page 0x%016llx from %d\n", >> + entry->pasid, task_info.process_name, task_info.tgid, >> + task_info.task_name, task_info.pid); >> + dev_err(adev->dev, " in page starting at address 0x%016llx >> from client %d\n", >> addr, entry->client_id); >> - if (!amdgpu_sriov_vf(adev)) >> + if (!amdgpu_sriov_vf(adev)) { >> dev_err(adev->dev, >> - "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n", >> + "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n", >> status); >> + dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n", >> + REG_GET_FIELD(status, >> + GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS)); >> + dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n", >> + REG_GET_FIELD(status, >> + GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR)); >> + dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n", >> + REG_GET_FIELD(status, >> + GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS)); >> + dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n", >> + REG_GET_FIELD(status, >> + GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR)); >> + dev_err(adev->dev, "\t RW: 0x%lx\n", >> + REG_GET_FIELD(status, >> + GCVM_L2_PROTECTION_FAULT_STATUS, RW)); >> + } >> } >> return 0; >> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >> index 296e2d982578..34c4c2d08550 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c >> @@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct >> amdgpu_device *adev, >> dev_err(adev->dev, >> "[%s] %s page fault (src_id:%u ring:%u vmid:%u " >> - "pasid:%u, for process %s pid %d thread %s pid %d)\n", >> + "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n", > > I think the text actually looks better without the ":". > >> hub_name, retry_fault ? "retry" : "no-retry", >> entry->src_id, entry->ring_id, entry->vmid, >> entry->pasid, task_info.process_name, task_info.tgid, >> @@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct >> amdgpu_device *adev, >> dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n", >> REG_GET_FIELD(status, >> VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR)); >> + dev_err(adev->dev, "\t RW: 0x%lx\n", >> + REG_GET_FIELD(status, >> + VM_L2_PROTECTION_FAULT_STATUS, RW)); > > That should probably be a separate patch since it is fixing gfx9. > > Apart from that the patch looks good to me, > Christian. > >> } >> } > ___ amd-
Re: [PATCH] drm/amdgpu: Add more page fault info printing for GFX10
Am 12.08.19 um 21:05 schrieb Zhao, Yong: The printing we did for GFX9 was not propogated to GFX10 somehow, so fix it now. Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca Signed-off-by: Yong Zhao --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +++- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 4e3ac1084a94..f23be98e9897 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, } if (printk_ratelimit()) { + struct amdgpu_task_info task_info; + + memset(&task_info, 0, sizeof(struct amdgpu_task_info)); + amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); + dev_err(adev->dev, - "[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n", + "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, " + "for process:%s pid:%d thread:%s pid:%d)\n", entry->vmid_src ? "mmhub" : "gfxhub", entry->src_id, entry->ring_id, entry->vmid, - entry->pasid); - dev_err(adev->dev, " at page 0x%016llx from %d\n", + entry->pasid, task_info.process_name, task_info.tgid, + task_info.task_name, task_info.pid); + dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n", addr, entry->client_id); - if (!amdgpu_sriov_vf(adev)) + if (!amdgpu_sriov_vf(adev)) { dev_err(adev->dev, - "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n", + "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n", status); + dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS)); + dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR)); + dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS)); + dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR)); + dev_err(adev->dev, "\t RW: 0x%lx\n", + REG_GET_FIELD(status, + GCVM_L2_PROTECTION_FAULT_STATUS, RW)); + } } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 296e2d982578..34c4c2d08550 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, dev_err(adev->dev, "[%s] %s page fault (src_id:%u ring:%u vmid:%u " - "pasid:%u, for process %s pid %d thread %s pid %d)\n", + "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n", I think the text actually looks better without the ":". hub_name, retry_fault ? "retry" : "no-retry", entry->src_id, entry->ring_id, entry->vmid, entry->pasid, task_info.process_name, task_info.tgid, @@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n", REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR)); + dev_err(adev->dev, "\t RW: 0x%lx\n", + REG_GET_FIELD(status, + VM_L2_PROTECTION_FAULT_STATUS, RW)); That should probably be a separate patch since it is fixing gfx9. Apart from that the patch looks good to me, Christian. } } ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx