RE: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR
[AMD Official Use Only - General] Reviewed-by: Emily Deng Emily Deng Best Wishes >-----Original Message----- >From: Li, Yunxiang (Teddy) >Sent: Saturday, April 27, 2024 2:27 AM >To: amd-gfx@lists.freedesktop.org >Cc: Deucher, Alexander ; Koenig, Christian >; Lazar, Lijo ; Kuehling, >Felix ; Deng, Emily ; Li, >Yunxiang (Teddy) >Subject: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR > >There are other reset sources that pass NULL as the job pointer, such as >amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the >FLR comes from the host does not work. > >Add a flag in reset_context to explicitly mark host triggered reset, and set >this flag when we receive host reset notification. > >Signed-off-by: Yunxiang Li >--- >v2: fix typo >v3: pass reset_context directly >v4: clear the flag in case we retry > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++----- >drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 1 + > 5 files changed, 12 insertions(+), 5 deletions(-) > >diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >index 8befd10bf007..33c889c027a5 100644 >--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct >amdgpu_device *adev) > * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf > * > * @adev: amdgpu_device pointer >- * @from_hypervisor: request from hypervisor >+ * @reset_context: amdgpu reset context pointer > * > * do VF FLR and reinitialize Asic > * return 0 means succeeded otherwise failed > */ > static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, >- bool from_hypervisor) >+ struct amdgpu_reset_context >*reset_context) > { > int r; > struct amdgpu_hive_info *hive = NULL; >@@ -5070,12 +5070,15 @@ static int 
amdgpu_device_reset_sriov(struct >amdgpu_device *adev, > retry: > amdgpu_amdkfd_pre_reset(adev); > >- if (from_hypervisor) >+ if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { >+ clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); > r = amdgpu_virt_request_full_gpu(adev, true); >- else >+ } else { > r = amdgpu_virt_reset_gpu(adev); >+ } > if (r) > return r; >+ > amdgpu_ras_set_fed(adev, false); > amdgpu_irq_gpu_reset_resume_helper(adev); > >@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct >amdgpu_device *adev, > /* Actual ASIC resets if needed.*/ > /* Host driver will handle XGMI hive reset for SRIOV */ > if (amdgpu_sriov_vf(adev)) { >- r = amdgpu_device_reset_sriov(adev, job ? false : true); >+ r = amdgpu_device_reset_sriov(adev, reset_context); > if (r) > adev->asic_reset_res = r; > >diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >index b11d190ece53..5a9cc043b858 100644 >--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h >@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS { > AMDGPU_NEED_FULL_RESET = 0, > AMDGPU_SKIP_HW_RESET = 1, > AMDGPU_SKIP_COREDUMP = 2, >+ AMDGPU_HOST_FLR = 3, > }; > > struct amdgpu_reset_context { >diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >index c5ba9c4757a8..f4c47492e0cd 100644 >--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c >@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct >work_struct *work) > reset_context.method = AMD_RESET_METHOD_NONE; > reset_context.reset_req_dev = adev; > clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); >+ set_bit(AMDGPU_HOST_FLR, &reset_context.flags); > > amdgpu_device_gpu_recover(adev, NULL, &reset_context); > } >diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >index fa9d1b02f391..14cc7910e5cf 100644 >--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >+++ 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c >@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct >work_struct *work) > reset_context.method = AMD_RESET_METHOD_NONE; > reset_context.reset_req_dev = adev; > clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); >+ set_bit(AMDGPU_HOST_FLR, &reset_context.flags); > > amdgpu_device_gpu_recover(adev, NULL, &reset_context); > } >diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c >index 14a065516ae4..78cd07744ebe 100644 >--- a/driver
RE: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR
[AMD Official Use Only - General] Reviewed-by: Zhigang Luo -----Original Message----- From: amd-gfx On Behalf Of Yunxiang Li Sent: Friday, April 26, 2024 2:27 PM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander; Koenig, Christian; Lazar, Lijo; Kuehling, Felix; Deng, Emily; Li, Yunxiang (Teddy) Subject: [PATCH v4 2/4] drm/amdgpu: Add reset_context flag for host FLR There are other reset sources that pass NULL as the job pointer, such as amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the FLR comes from the host does not work. Add a flag in reset_context to explicitly mark host triggered reset, and set this flag when we receive host reset notification. Signed-off-by: Yunxiang Li --- v2: fix typo v3: pass reset_context directly v4: clear the flag in case we retry drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 1 + 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8befd10bf007..33c889c027a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf * * @adev: amdgpu_device pointer - * @from_hypervisor: request from hypervisor + * @reset_context: amdgpu reset context pointer * * do VF FLR and reinitialize Asic * return 0 means succeeded otherwise failed */ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, -bool from_hypervisor) +struct amdgpu_reset_context *reset_context) { int r; struct amdgpu_hive_info *hive = NULL; @@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); - if 
(from_hypervisor) + if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { + clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); r = amdgpu_virt_request_full_gpu(adev, true); - else + } else { r = amdgpu_virt_reset_gpu(adev); + } if (r) return r; + amdgpu_ras_set_fed(adev, false); amdgpu_irq_gpu_reset_resume_helper(adev); @@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* Actual ASIC resets if needed.*/ /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { - r = amdgpu_device_reset_sriov(adev, job ? false : true); + r = amdgpu_device_reset_sriov(adev, reset_context); if (r) adev->asic_reset_res = r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index b11d190ece53..5a9cc043b858 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS { AMDGPU_NEED_FULL_RESET = 0, AMDGPU_SKIP_HW_RESET = 1, AMDGPU_SKIP_COREDUMP = 2, + AMDGPU_HOST_FLR = 3, }; struct amdgpu_reset_context { diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index c5ba9c4757a8..f4c47492e0cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index fa9d1b02f391..14cc7910e5cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; 
reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 14a065516ae4..78cd07744ebe 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -529,6