Re: [PATCH] drm/amdkfd: Extend gfx12 trap handler fix to gfx10/11
On 06/06/2024 00:16, Jay Cornwall wrote: In commit 6d1878882d2d ("drm/amdkfd: gfx12 context save/restore trap handler fixes") the following fix was introduced but incorrectly restricted to gfx12. The same issue and a corresponding fix apply to gfx10 and gfx11. Do not overwrite TRAPSTS.{SAVECTX,HOST_TRAP} when restoring this register. Both of these fields can assert while the wavefront is running the trap handler. Signed-off-by: Jay Cornwall Cc: Lancelot Six Hi Jay, Thanks, that looks good to me (tested on gfx10.3, 11 and 12). For gfx11+ there might be a risk of overriding perf_snapshot, but that seems fine as this is not used, and such sample from cwsr restore would mostly be meaningless anyway. Reviewed-by: Lancelot Six Best, Lancelot. --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 16 +--- .../amd/amdkfd/cwsr_trap_handler_gfx10.asm| 38 ++- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 665122d1bbbd..02f7ba8c93cd 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -1136,7 +1136,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x705d, 0x807c817c, 0x8070ff70, 0x0080, 0xbf0a7b7c, 0xbf85fff8, - 0xbf82013d, 0xbef4037e, + 0xbf82013f, 0xbef4037e, 0x8775ff7f, 0x, 0x8875ff75, 0x0004, 0xbef60380, 0xbef703ff, @@ -1275,7 +1275,8 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80788478, 0xbf8c, 0xb9eef815, 0xbefc036f, 0xbefe0370, 0xbeff0371, - 0xb9f9f816, 0xb9fbf803, + 0xb9f9f816, 0xb9fb4803, + 0x907b8b7b, 0xb9fba2c3, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -2544,7 +2545,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xe0704000, 0x705d, 0x807c817c, 0x8070ff70, 0x0080, 0xbf0a7b7c, - 0xbf85fff8, 0xbf820134, + 0xbf85fff8, 0xbf820136, 0xbef4037e, 0x8775ff7f, 0x, 0x8875ff75, 0x0004, 0xbef60380, @@ -2683,7 +2684,8 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xf000, 0x80788478, 0xbf8c, 0xb9eef815, 0xbefc036f, 0xbefe0370, - 0xbeff0371, 0xb9fbf803, + 0xbeff0371, 0xb9fb4803, + 0x907b8b7b, 0xb9fba2c3, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -2981,7 +2983,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x701d, 0x807d817d, 0x8070ff70, 0x0080, 0xbf0a7b7d, 0xbfa2fff8, - 0xbfa0013f, 0xbef4007e, + 0xbfa00143, 0xbef4007e, 0x8b75ff7f, 0x, 0x8c75ff75, 0x0004, 0xbef60080, 0xbef700ff, @@ -3123,7 +3125,9 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x80788478, 0xbf89, 0xb96ef815, 0xbefd006f, 0xbefe0070, 0xbeff0071, - 0xb97bf803, 0xb973f801, + 0xb97b4803, 0x857b8b7b, + 0xb97b22c3, 0x857b867b, + 0xb97b7443, 0xb973f801, 0xb8ee3b05, 0x806e816e, 0xbf0d9972, 0xbfa20002, 0x846e896e, 0xbfa1, diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index ac3702b8e3c4..44772eec9ef4 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -119,9 +119,12 @@ var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT = 7 var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT= 8 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 +var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT = 11 var SQ_WAVE_TRAPSTS_EXCP_HI_MASK = 0x7000 #if ASIC_FAMILY >= CHIP_PLUM_BONITO +var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT= 16 var SQ_WAVE_TRAPSTS_WAVE_START_MASK = 0x2 +var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT = 17 var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x4 var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK = 0x10 #endif @@ -137,14 +140,23 @@ var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 +var S_TRAPSTS_RESTORE_PART_1_SIZE = SQ_WAVE_TRAPSTS_SAVECTX_SHIFT +var S_TRAPSTS_RESTORE_PART_2_SHIFT = SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT + #if ASIC_FAMILY < CHIP_PLUM_BONITO var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK +var S_TRAPSTS_RESTORE_PART_2_SIZE = 32 - S_TRAPSTS_RESTORE_PART_2_SHIFT +var S_TRAPSTS_RESTORE_PART_3_SHIFT = 0 +var S_TRAPSTS_RESTORE_PART_3_SIZE = 0 #else var S_TRAPSTS_NON_MASKABLE_EXCP_MASK = SQ_WAVE_TRAPS
Re: [PATCH v2] drm/amdkfd: Handle deallocated VPGRs in gfx11+ trap handler
On 29/05/2024 20:35, Jay Cornwall wrote: A wavefront may deallocate its VGPRs at the end of a program while waiting for memory transactions to complete. If it subsequently receives a context save exception it will be unable to save, since this requires VGPRs. In this case the trap handler should terminate the wavefront. Fixes intermittent VM faults under context switching load. V2: Use S_ENDPGM instead of S_ENDPGM_SAVED for performance counters Hi Jay, Thanks for the V2. FYI,as far as I can see, the .h part of the patch does not seem to apply directly on current amd-staging-drm-next, but I guess we just have a different bases. Signed-off-by: Jay Cornwall Cc: Lancelot Six --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 695 +- .../amd/amdkfd/cwsr_trap_handler_gfx10.asm| 17 + 2 files changed, 366 insertions(+), 346 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 18e012e04493..ac3702b8e3c4 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -97,6 +97,7 @@ var S_STATUS_HALT_MASK= SQ_WAVE_STATE_PRIV_HALT_MASK var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF000 #endif +var SQ_WAVE_STATUS_NO_VGPRS_SHIFT = 24 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 8 @@ -451,6 +452,22 @@ L_EXIT_TRAP: s_rfe_b64 [ttmp0, ttmp1] L_SAVE: + // If VGPRs have been deallocated then terminate the wavefront. + // It has no remaining program to run and cannot save without VGPRs. +#if ASIC_FAMILY == CHIP_PLUM_BONITO + s_bitcmp1_b32 s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT + s_cbranch_scc0 L_HAVE_VGPRS + s_endpgm +L_HAVE_VGPRS: +#endif +#if ASIC_FAMILY >= CHIP_GFX12 This is mostly cosmetic, but you could use #elif ASIC_FAMILY >= CHIP_GFX12 instead on #endif + #if. Those 2 blocks are not independent, they achieve the same goal for different configurations. I do not mind if you prefer to keep this as it is. Either way, the change look good to me: Reviewed-by: Lancelot Six Best, Lancelot. + s_getreg_b32s_save_tmp, hwreg(HW_REG_WAVE_STATUS) + s_bitcmp1_b32 s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT + s_cbranch_scc0 L_HAVE_VGPRS + s_endpgm +L_HAVE_VGPRS: +#endif + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x //pc[47:32] s_mov_b32 s_save_tmp, 0 s_setreg_b32hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 1), s_save_tmp //clear saveCtx bit
Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
On 23/05/2024 20:31, Jay Cornwall wrote: On 5/23/2024 13:37, Lancelot SIX wrote: @@ -622,8 +638,15 @@ L_SAVE_HWREG: #if ASIC_FAMILY >= CHIP_GFX12 // Ensure no further changes to barrier or LDS state. + // STATE_PRIV.BARRIER_COMPLETE may change up to this point. s_barrier_signal -2 s_barrier_wait -2 + + // Re-read final state of BARRIER_COMPLETE field for save. + s_getreg_b32 s_save_tmp, hwreg(S_STATUS_HWREG) + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK Even if BARRIER_COMPLETE can be asserted while we are in the trap hadler, I do not think it can be cleared. That being said, it might be easier to just replace the bit, making it clearer. Yes, I chose to structure it this way to make the intent clearer. We don't gain much from dropping the s_andn2. Most of the time spent in the save handler is stalled on memory instructions. @@ -1351,7 +1369,17 @@ L_SKIP_BARRIER_RESTORE: s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask #endif +#if ASIC_FAMILY < CHIP_GFX12 s_setreg_b32 hwreg(S_TRAPSTS_HWREG), s_restore_trapsts Wouldn't other gfx1x architectures have a similar issue when writing TRAPSTS here? That is if TRAPSTS.SAVECTX is set while we are restoring, wouldn't we loose it? And for gfx11, there is TRAPSTS.HOST_TRAP that could have the same issue to some degree (not sure if we would loose the host trap completly, or re-enter with trap ID + HT bit set in ttmp1). Prior to gfx12 context save and host trap exceptions are not delivered to a wave until STATUS.PRIV=0, i.e. it leaves the trap handler. The changes needed for gfx12 are due to a design change in this area. Exceptions are now flagged immediately and cause re-entry to the trap if any are non-zero. Thanks for the clarifications. The patch looks good to me. Reviewed-by: Lancelot Six Best, Lancelot.
Re: [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions
On 23/05/2024 15:08, Jay Cornwall wrote: Newer assemblers reject S_WAITCNT. All instances of S_WAITCNT can be replaced by S_WAITCNT 0 (< gfx12) or S_WAIT_IDLE (>= gfx12) since there is no concurrency of different memory instruction classes. Signed-off-by: Jay Cornwall Cc: Lancelot Six Thanks, that looks good to me. Reviewed-by: Lancelot Six --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 140 +- .../amd/amdkfd/cwsr_trap_handler_gfx10.asm| 52 +++ 2 files changed, 97 insertions(+), 95 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 11d076eb770c..d61b2c3bd0ac 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -711,12 +711,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf0d8f7b, 0xbf840002, 0x887bff7b, 0x, 0xf4011bbd, 0xfa10, - 0xbf8cc07f, 0x8f6e976e, + 0xbf8c, 0x8f6e976e, 0x8a77ff77, 0x0080, 0x88776e77, 0xf4051bbd, - 0xfa00, 0xbf8cc07f, + 0xfa00, 0xbf8c, 0xf4051ebd, 0xfa08, - 0xbf8cc07f, 0x87ee6e6e, + 0xbf8c, 0x87ee6e6e, 0xbf840001, 0xbe80206e, 0x876eff6d, 0x00ff, 0xbf850008, 0x876eff6d, @@ -1185,7 +1185,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x785d, 0xe0304080, 0x785d0100, 0xe0304100, 0x785d0200, 0xe0304180, - 0x785d0300, 0xbf8c3f70, + 0x785d0300, 0xbf8c, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, @@ -1194,7 +1194,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x6e5d, 0xe0304080, 0x6e5d0100, 0xe0304100, 0x6e5d0200, 0xe0304180, - 0x6e5d0300, 0xbf8c3f70, + 0x6e5d0300, 0xbf8c, 0xbf820034, 0xbef603ff, 0x0100, 0xbeee0378, 0x8078ff78, 0x0400, @@ -1203,7 +1203,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x785d, 0xe0304100, 0x785d0100, 0xe0304200, 0x785d0200, 0xe0304300, - 0x785d0300, 0xbf8c3f70, + 0x785d0300, 0xbf8c, 0x7e008500, 0x7e028501, 0x7e048502, 0x7e068503, 0x807c847c, 0x8078ff78, @@ -1213,7 +1213,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x8f6f836f, 0x806f7c6f, 0xbefe03c1, 0xbeff0380, 0xe0304000, 0x785d, - 0xbf8c3f70, 0x7e008500, + 0xbf8c, 0x7e008500, 0x807c817c, 0x8078ff78, 0x0080, 0xbf0a6f7c, 0xbf85fff7, 0xbeff03c1, @@ -1221,7 +1221,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xe0304100, 0x6e5d0100, 0xe0304200, 0x6e5d0200, 0xe0304300, 0x6e5d0300, - 0xbf8c3f70, 0xb9783a05, + 0xbf8c, 0xb9783a05, 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, 0x8f788a78, @@ -1232,16 +1232,16 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x0100, 0xbefc03ff, 0x006c, 0x80f89078, 0xf429003a, 0xf000, - 0xbf8cc07f, 0x80fc847c, + 0xbf8c, 0x80fc847c, 0xbf80, 0xbe803100, 0xbe823102, 0x80f8a078, 0xf42d003a, 0xf000, - 0xbf8cc07f, 0x80fc887c, + 0xbf8c, 0x80fc887c, 0xbf80, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0x80f8c078, 0xf431003a, 0xf000, - 0xbf8cc07f, 0x80fc907c, + 0xbf8c, 0x80fc907c, 0xbf80, 0xbe803100, 0xbe823102, 0xbe843104, 0xbe863106, 0xbe883108, @@ -1271,9 +1271,9 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xf4211cfa, 0xf000, 0x80788478, 0xf4211bba, 0xf000, 0x80788478, - 0xbf8cc07f, 0xb9eef814, + 0xbf8c, 0xb9eef814, 0xf4211bba, 0xf000, - 0x80788478, 0xbf8cc07f, + 0x80788478, 0xbf8c, 0xb9eef815, 0xbefc036f, 0xbefe0370, 0xbeff0371, 0xb9f9f816, 0xb9fbf803, @@ -1288,7 +1288,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x, 0xf4091c37, 0xfa50, 0xf4091d37, 0xfa60, 0xf4011e77, - 0xfa74, 0xbf8cc07f, + 0xfa74, 0xbf8c, 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x0200, @@ -2299,12 +2299,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf0d8f7b, 0xbf840002, 0x887bff7b, 0x, 0xf4011bbd, 0xfa10, - 0xbf8cc07f, 0x8f6e976e, + 0xbf8c, 0x8f6e976e, 0x8a77ff77, 0x0080, 0x88776e77, 0xf4051bbd, - 0xfa00, 0xbf8cc07f, + 0xfa00, 0xbf8c, 0xf4051ebd, 0xfa08, - 0xbf8cc07f, 0x87ee6e6e, + 0xbf8c, 0x87ee6e6e, 0xbf840001, 0xbe80206e, 0x876eff6d, 0x00ff, 0xbf850008, 0x876eff6d, @@ -2319,7 +2319,7 @@ static
Re: [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source
On 23/05/2024 15:08, Jay Cornwall wrote: Source and binary have become mismatched during branch activity. Signed-off-by: Jay Cornwall Cc: Lancelot Six Thanks for doing this. This matches what I have when rebuilding the trap handlers. Reviewed-by: Lancelot Six --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 57 --- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 73d3772cdb76..11d076eb770c 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -718,12 +718,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xf4051ebd, 0xfa08, 0xbf8cc07f, 0x87ee6e6e, 0xbf840001, 0xbe80206e, - 0x876eff6d, 0x01ff, - 0xbf850005, 0x8878ff78, - 0x2000, 0x80ec886c, - 0x82ed806d, 0xbf820005, - 0x876eff6d, 0x0100, - 0xbf850002, 0x806c846c, + 0x876eff6d, 0x00ff, + 0xbf850008, 0x876eff6d, + 0x0100, 0xbf850007, + 0x8878ff78, 0x2000, + 0x80ec886c, 0x82ed806d, + 0xbf820002, 0x806c846c, 0x826d806d, 0x876dff6d, 0x, 0x907a8977, 0x877bff7a, 0x003f8000, @@ -1136,7 +1136,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xe0704000, 0x705d, 0x807c817c, 0x8070ff70, 0x0080, 0xbf0a7b7c, - 0xbf85fff8, 0xbf820144, + 0xbf85fff8, 0xbf82013e, 0xbef4037e, 0x8775ff7f, 0x, 0x8875ff75, 0x0004, 0xbef60380, @@ -1276,10 +1276,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80788478, 0xbf8cc07f, 0xb9eef815, 0xbefc036f, 0xbefe0370, 0xbeff0371, - 0x876f7bff, 0x03ff, - 0xb9ef4803, 0xb9f9f816, - 0x876f7bff, 0xf800, - 0x906f8b6f, 0xb9efa2c3, + 0xb9f9f816, 0xb9fbf803, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -2309,12 +2306,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xf4051ebd, 0xfa08, 0xbf8cc07f, 0x87ee6e6e, 0xbf840001, 0xbe80206e, - 0x876eff6d, 0x01ff, - 0xbf850005, 0x8878ff78, - 0x2000, 0x80ec886c, - 0x82ed806d, 0xbf820005, - 0x876eff6d, 0x0100, - 0xbf850002, 0x806c846c, + 0x876eff6d, 0x00ff, + 0xbf850008, 0x876eff6d, + 0x0100, 0xbf850007, + 0x8878ff78, 0x2000, + 0x80ec886c, 0x82ed806d, + 0xbf820002, 0x806c846c, 0x826d806d, 0x876dff6d, 0x, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, @@ -2549,7 +2546,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x705d, 0x807c817c, 0x8070ff70, 0x0080, 0xbf0a7b7c, 0xbf85fff8, - 0xbf82013b, 0xbef4037e, + 0xbf820135, 0xbef4037e, 0x8775ff7f, 0x, 0x8875ff75, 0x0004, 0xbef60380, 0xbef703ff, @@ -2688,10 +2685,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xf000, 0x80788478, 0xbf8cc07f, 0xb9eef815, 0xbefc036f, 0xbefe0370, - 0xbeff0371, 0x876f7bff, - 0x03ff, 0xb9ef4803, - 0x876f7bff, 0xf800, - 0x906f8b6f, 0xb9efa2c3, + 0xbeff0371, 0xb9fbf803, 0xb9f3f801, 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, @@ -2749,11 +2743,11 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0xf808, 0xbf89fc07, 0x8bee6e6e, 0xbfa10001, 0xbe80486e, 0x8b6eff6d, - 0x01ff, 0xbfa20005, - 0x8c78ff78, 0x2000, - 0x80ec886c, 0x82ed806d, - 0xbfa5, 0x8b6eff6d, - 0x0100, 0xbfa20002, + 0x00ff, 0xbfa20008, + 0x8b6eff6d, 0x0100, + 0xbfa20007, 0x8c78ff78, + 0x2000, 0x80ec886c, + 0x82ed806d, 0xbfa2, 0x806c846c, 0x826d806d, 0x8b6dff6d, 0x, 0x8bfe7e7e, 0x8bea6a6a, @@ -2988,7 +2982,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x701d, 0x807d817d, 0x8070ff70, 0x0080, 0xbf0a7b7d, 0xbfa2fff8, - 0xbfa00146, 0xbef4007e, + 0xbfa00140, 0xbef4007e, 0x8b75ff7f, 0x, 0x8c75ff75, 0x0004, 0xbef60080, 0xbef700ff, @@ -3130,10 +3124,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0xf000, 0x80788478, 0xbf89fc07, 0xb96ef815, 0xbefd006f, 0xbefe0070, - 0xbeff0071, 0x8b6f7bff, - 0x03ff, 0xb96f4803, - 0x8b6f7bff, 0xf800, - 0x856f8b6f, 0xb96fa2c3, + 0xbeff0071, 0xb97bf803, 0xb973f801, 0xb8ee3b05, 0x806e816e, 0xbf0d9972, 0xbfa20002, 0x846e896e, @@ -4119,7 +4110,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0x8b6dff6d, 0x, 0x8bfe7e7e, 0x8bea6a6a, 0xb97af804, 0xbe804a6c, - 0xbfb0, 0xbf9f
Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
Hi Jay, I have added a couple (minor) of comments below. On 23/05/2024 15:08, Jay Cornwall wrote: Fix LDS size interpretation: 512 bytes (>= gfx12) vs 256 (< gfx12). Ensure STATE_PRIV.BARRIER_COMPLETE cannot change after reading or before writing. Other waves in the threadgroup may cause this field to assert if they complete the barrier. Do not overwrite EXCP_FLAG_PRIV.{SAVE_CONTEXT,HOST_TRAP} when restoring this register. Both of these fields can assert while the wavefront is running the trap handler. Signed-off-by: Jay Cornwall Cc: Lancelot Six --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 1191 + .../amd/amdkfd/cwsr_trap_handler_gfx10.asm| 55 +- 2 files changed, 639 insertions(+), 607 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 77ae25b6753c..18e012e04493 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -75,17 +75,22 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x2 var SQ_WAVE_STATUS_TRAP_EN_SHIFT = 6 var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 8 var S_STATUS_HWREG= HW_REG_STATUS var S_STATUS_ALWAYS_CLEAR_MASK= SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK var S_STATUS_HALT_MASK= SQ_WAVE_STATUS_HALT_MASK var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF var S_SAVE_PC_HI_HT_MASK = 0x0100 #else +var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4 +var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00 var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000 var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK= 0x8000 +var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT= 15 var SQ_WAVE_STATUS_WAVE64_SHIFT = 29 var SQ_WAVE_STATUS_WAVE64_SIZE= 1 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9 var S_STATUS_HWREG= HW_REG_WAVE_STATE_PRIV var S_STATUS_ALWAYS_CLEAR_MASK= SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK var S_STATUS_HALT_MASK= SQ_WAVE_STATE_PRIV_HALT_MASK @@ -149,8 +154,10 @@ var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10 var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5 var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20 var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK = 0x40 +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT = 6 var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80 var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK= 0x100 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT= 8 var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK = 0x200 var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800 var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80 @@ -430,7 +437,16 @@ L_EXIT_TRAP: // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + +#if ASIC_FAMILY < CHIP_GFX12 s_setreg_b32hwreg(S_STATUS_HWREG), s_save_status +#else + // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it. + // Only restore fields which the trap handler changes. + s_lshr_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_SCC_SHIFT + s_setreg_b32hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \ + SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status +#endif s_rfe_b64 [ttmp0, ttmp1] @@ -622,8 +638,15 @@ L_SAVE_HWREG: #if ASIC_FAMILY >= CHIP_GFX12 // Ensure no further changes to barrier or LDS state. + // STATE_PRIV.BARRIER_COMPLETE may change up to this point. s_barrier_signal-2 s_barrier_wait -2 + + // Re-read final state of BARRIER_COMPLETE field for save. + s_getreg_b32s_save_tmp, hwreg(S_STATUS_HWREG) + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK Even if BARRIER_COMPLETE can be asserted while we are in the trap hadler, I do not think it can be cleared. That being said, it might be easier to just replace the bit, making it clearer. + s_or_b32s_save_status, s_save_status, s_save_tmp #endif write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) @@ -
[PATCH] drm/amdkfd: update buffer_{store, load}_* modifiers for gfx940
Instruction modifiers of the untyped vector memory buffer instructions (MUBUF encoded) changed in gfx940. The slc, scc and glc modifiers have been replaced with sc0, sc1 and nt. The current CWSR trap handler is written using pre-gfx940 modifier names, making the source incompatible with a strict gfx940 assembler. This patch updates the cwsr_trap_handler_gfx9.s source file to be compatible with all gfx9 variants of the ISA. The binary assembled code is unchanged (so the behaviour is unchanged as well), only the source representation is updated. Signed-off-by: Lancelot SIX --- .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 24 --- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index bb26338204f4..a2d597d7fb57 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -48,6 +48,12 @@ var ACK_SQC_STORE= 1 //workaround for suspected SQC store bug causing var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger var SINGLE_STEP_MISSED_WORKAROUND = (ASIC_FAMILY <= CHIP_ALDEBARAN) //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised +#if ASIC_FAMILY < CHIP_GC_9_4_3 +#define VMEM_MODIFIERS slc:1 glc:1 +#else +#define VMEM_MODIFIERS sc0:1 nt:1 +#endif + /**/ /* variables */ /**/ @@ -581,7 +587,7 @@ end L_SAVE_LDS_LOOP_VECTOR: ds_read_b64 v[0:1], v2 //x =LDS[a], byte address s_waitcnt lgkmcnt(0) - buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset VMEM_MODIFIERS offen:1 // s_waitcnt vmcnt(0) // v_add_u32 v2, vcc[0:1], v2, v3 v_add_u32 v2, v2, v3 @@ -979,17 +985,17 @@ L_TCP_STORE_CHECK_DONE: end function write_4vgprs_to_mem(s_rsrc, s_mem_offset) - buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3 + buffer_store_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS + buffer_store_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256 + buffer_store_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*2 + buffer_store_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*3 end function read_4vgprs_from_mem(s_rsrc, s_mem_offset) - buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 - buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256 - buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2 - buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3 + buffer_load_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS + buffer_load_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256 + buffer_load_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*2 + buffer_load_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256*3 s_waitcnt vmcnt(0) end base-commit: cf743996352e327f483dc7d66606c90276f57380 -- 2.34.1
[PATCH] drm/amdkfd: Flush the process wq before creating a kfd_process
There is a race condition when re-creating a kfd_process for a process. This has been observed when a process under the debugger executes exec(3). In this scenario: - The process executes exec. - This will eventually release the process's mm, which will cause the kfd_process object associated with the process to be freed (kfd_process_free_notifier decrements the reference count to the kfd_process to 0). This causes kfd_process_ref_release to enqueue kfd_process_wq_release to the kfd_process_wq. - The debugger receives the PTRACE_EVENT_EXEC notification, and tries to re-enable AMDGPU traps (KFD_IOC_DBG_TRAP_ENABLE). - When handling this request, KFD tries to re-create a kfd_process. This eventually calls kfd_create_process and kobject_init_and_add. At this point the call to kobject_init_and_add can fail because the old kfd_process.kobj has not been freed yet by kfd_process_wq_release. This patch proposes to avoid this race by making sure to drain kfd_process_wq before creating a new kfd_process object. This way, we know that any cleanup task is done executing when we reach kobject_init_and_add. Signed-off-by: Lancelot SIX --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 8 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 58c1fe542193..451bb058cc62 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -829,6 +829,14 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) if (process) { pr_debug("Process already found\n"); } else { + /* If the process just called exec(3), it is possible that the +* cleanup of the kfd_process (following the release of the mm +* of the old process image) is still in the cleanup work queue. +* Make sure to drain any job before trying to recreate any +* resource for this process. +*/ + flush_workqueue(kfd_process_wq); + process = create_process(thread); if (IS_ERR(process)) goto out; base-commit: cf743996352e327f483dc7d66606c90276f57380 -- 2.34.1
[PATCH] drm/amdkfd: Enable SQ watchpoint for gfx10
There are new control registers introduced in gfx10 used to configure hardware watchpoints triggered by SMEM instructions: SQ_WATCH{0,1,2,3}_{CNTL_ADDR_HI,ADDR_L}. Those registers work in a similar way as the TCP_WATCH* registers currently used for gfx9 and above. This patch adds support to program the SQ_WATCH registers for gfx10. The SQ_WATCH?_CNTL.MASK field has one bit more than TCP_WATCH?_CNTL.MASK, so SQ watchpoints can have a finer granularity than TCP_WATCH watchpoints. In this patch, we keep the capabilities advertised to the debugger unchanged (HSA_DBG_WATCH_ADDR_MASK_*_BIT_GFX10) as this reflects what both TCP and SQ watchpoints can do and both watchpoints are configured together. Signed-off-by: Lancelot SIX --- .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 71 +++ 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 69810b3f1c63..3ab6c3aa0ad1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -881,6 +881,7 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev, } #define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H) +#define SQ_WATCH_STRIDE (mmSQ_WATCH1_ADDR_H - mmSQ_WATCH0_ADDR_H) uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev, uint64_t watch_address, uint32_t watch_address_mask, @@ -889,55 +890,93 @@ uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev, uint32_t debug_vmid, uint32_t inst) { + /* SQ_WATCH?_ADDR_* and TCP_WATCH?_ADDR_* are programmed with the +* same values. +*/ uint32_t watch_address_high; uint32_t watch_address_low; - uint32_t watch_address_cntl; - - watch_address_cntl = 0; + uint32_t tcp_watch_address_cntl; + uint32_t sq_watch_address_cntl; watch_address_low = lower_32_bits(watch_address); watch_address_high = upper_32_bits(watch_address) & 0x; - watch_address_cntl = REG_SET_FIELD(watch_address_cntl, + tcp_watch_address_cntl = 0; + tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl, TCP_WATCH0_CNTL, VMID, debug_vmid); - watch_address_cntl = REG_SET_FIELD(watch_address_cntl, + tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl, TCP_WATCH0_CNTL, MODE, watch_mode); - watch_address_cntl = REG_SET_FIELD(watch_address_cntl, + tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl, TCP_WATCH0_CNTL, MASK, watch_address_mask >> 7); + sq_watch_address_cntl = 0; + sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, + SQ_WATCH0_CNTL, + VMID, + debug_vmid); + sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, + SQ_WATCH0_CNTL, + MODE, + watch_mode); + sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, + SQ_WATCH0_CNTL, + MASK, + watch_address_mask >> 6); + /* Turning off this watch point until we set all the registers */ - watch_address_cntl = REG_SET_FIELD(watch_address_cntl, + tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl, TCP_WATCH0_CNTL, VALID, 0); - WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) + (watch_id * TCP_WATCH_STRIDE)), - watch_address_cntl); + tcp_watch_address_cntl); + + sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, + SQ_WATCH0_CNTL, + VALID, + 0); + WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) + + (watch_id * SQ_WATCH_STRIDE)), + sq_watch_address_cntl); + /* Program {TCP,SQ}_WATCH?_ADDR* */ WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) + (watch_id * TCP_WATCH_STRIDE)), watch_address_high); - WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) + (watch_id * TCP_WATCH_STRIDE)), watch_address_low); + WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_H) + + (watch_id * SQ_WATCH_STRIDE)), + wa
Re: [PATCH] drm/amdkfd: Clear the VALU exception state in the trap handler
The trap handler could be entered with pending VALU exceptions, so clear the exception state before issuing vector instructions. Signed-off-by: Laurent Morichetti Reviewed-by: Jay Cornwall Hi, FYI, I tested this and it fixes the issue. Best, Lancelot. Tested-by: Lancelot Six