Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
On 23/05/2024 20:31, Jay Cornwall wrote: On 5/23/2024 13:37, Lancelot SIX wrote: @@ -622,8 +638,15 @@ L_SAVE_HWREG: #if ASIC_FAMILY >= CHIP_GFX12 // Ensure no further changes to barrier or LDS state. + // STATE_PRIV.BARRIER_COMPLETE may change up to this point. s_barrier_signal -2 s_barrier_wait -2 + + // Re-read final state of BARRIER_COMPLETE field for save. + s_getreg_b32 s_save_tmp, hwreg(S_STATUS_HWREG) + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK Even if BARRIER_COMPLETE can be asserted while we are in the trap hadler, I do not think it can be cleared. That being said, it might be easier to just replace the bit, making it clearer. Yes, I chose to structure it this way to make the intent clearer. We don't gain much from dropping the s_andn2. Most of the time spent in the save handler is stalled on memory instructions. @@ -1351,7 +1369,17 @@ L_SKIP_BARRIER_RESTORE: s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask #endif +#if ASIC_FAMILY < CHIP_GFX12 s_setreg_b32 hwreg(S_TRAPSTS_HWREG), s_restore_trapsts Wouldn't other gfx1x architectures have a similar issue when writing TRAPSTS here? That is if TRAPSTS.SAVECTX is set while we are restoring, wouldn't we loose it? And for gfx11, there is TRAPSTS.HOST_TRAP that could have the same issue to some degree (not sure if we would loose the host trap completly, or re-enter with trap ID + HT bit set in ttmp1). Prior to gfx12 context save and host trap exceptions are not delivered to a wave until STATUS.PRIV=0, i.e. it leaves the trap handler. The changes needed for gfx12 are due to a design change in this area. Exceptions are now flagged immediately and cause re-entry to the trap if any are non-zero. Thanks for the clarifications. The patch looks good to me. Reviewed-by: Lancelot Six Best, Lancelot.
Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
On 5/23/2024 13:37, Lancelot SIX wrote: @@ -622,8 +638,15 @@ L_SAVE_HWREG: #if ASIC_FAMILY >= CHIP_GFX12 // Ensure no further changes to barrier or LDS state. + // STATE_PRIV.BARRIER_COMPLETE may change up to this point. s_barrier_signal -2 s_barrier_wait -2 + + // Re-read final state of BARRIER_COMPLETE field for save. + s_getreg_b32 s_save_tmp, hwreg(S_STATUS_HWREG) + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK Even if BARRIER_COMPLETE can be asserted while we are in the trap hadler, I do not think it can be cleared. That being said, it might be easier to just replace the bit, making it clearer. Yes, I chose to structure it this way to make the intent clearer. We don't gain much from dropping the s_andn2. Most of the time spent in the save handler is stalled on memory instructions. @@ -1351,7 +1369,17 @@ L_SKIP_BARRIER_RESTORE: s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask #endif +#if ASIC_FAMILY < CHIP_GFX12 s_setreg_b32 hwreg(S_TRAPSTS_HWREG), s_restore_trapsts Wouldn't other gfx1x architectures have a similar issue when writing TRAPSTS here? That is if TRAPSTS.SAVECTX is set while we are restoring, wouldn't we loose it? And for gfx11, there is TRAPSTS.HOST_TRAP that could have the same issue to some degree (not sure if we would loose the host trap completly, or re-enter with trap ID + HT bit set in ttmp1). Prior to gfx12 context save and host trap exceptions are not delivered to a wave until STATUS.PRIV=0, i.e. it leaves the trap handler. The changes needed for gfx12 are due to a design change in this area. Exceptions are now flagged immediately and cause re-entry to the trap if any are non-zero.
Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
Hi Jay, I have added a couple (minor) of comments below. On 23/05/2024 15:08, Jay Cornwall wrote: Fix LDS size interpretation: 512 bytes (>= gfx12) vs 256 (< gfx12). Ensure STATE_PRIV.BARRIER_COMPLETE cannot change after reading or before writing. Other waves in the threadgroup may cause this field to assert if they complete the barrier. Do not overwrite EXCP_FLAG_PRIV.{SAVE_CONTEXT,HOST_TRAP} when restoring this register. Both of these fields can assert while the wavefront is running the trap handler. Signed-off-by: Jay Cornwall Cc: Lancelot Six --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 1191 + .../amd/amdkfd/cwsr_trap_handler_gfx10.asm| 55 +- 2 files changed, 639 insertions(+), 607 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 77ae25b6753c..18e012e04493 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -75,17 +75,22 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x2 var SQ_WAVE_STATUS_TRAP_EN_SHIFT = 6 var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 8 var S_STATUS_HWREG= HW_REG_STATUS var S_STATUS_ALWAYS_CLEAR_MASK= SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK var S_STATUS_HALT_MASK= SQ_WAVE_STATUS_HALT_MASK var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF var S_SAVE_PC_HI_HT_MASK = 0x0100 #else +var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK = 0x4 +var SQ_WAVE_STATE_PRIV_SCC_SHIFT = 9 var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK = 0xC00 var SQ_WAVE_STATE_PRIV_HALT_MASK = 0x4000 var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK= 0x8000 +var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT= 15 var SQ_WAVE_STATUS_WAVE64_SHIFT = 29 var SQ_WAVE_STATUS_WAVE64_SIZE= 1 +var SQ_WAVE_LDS_ALLOC_GRANULARITY = 9 var S_STATUS_HWREG= HW_REG_WAVE_STATE_PRIV var S_STATUS_ALWAYS_CLEAR_MASK= SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK var S_STATUS_HALT_MASK= SQ_WAVE_STATE_PRIV_HALT_MASK @@ -149,8 +154,10 @@ var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK = 0x10 var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5 var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK = 0x20 var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK = 0x40 +var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT = 6 var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80 var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK= 0x100 +var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT= 8 var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK = 0x200 var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK = 0x800 var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80 @@ -430,7 +437,16 @@ L_EXIT_TRAP: // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + +#if ASIC_FAMILY < CHIP_GFX12 s_setreg_b32hwreg(S_STATUS_HWREG), s_save_status +#else + // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it. + // Only restore fields which the trap handler changes. + s_lshr_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_SCC_SHIFT + s_setreg_b32hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \ + SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status +#endif s_rfe_b64 [ttmp0, ttmp1] @@ -622,8 +638,15 @@ L_SAVE_HWREG: #if ASIC_FAMILY >= CHIP_GFX12 // Ensure no further changes to barrier or LDS state. + // STATE_PRIV.BARRIER_COMPLETE may change up to this point. s_barrier_signal-2 s_barrier_wait -2 + + // Re-read final state of BARRIER_COMPLETE field for save. + s_getreg_b32s_save_tmp, hwreg(S_STATUS_HWREG) + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK Even if BARRIER_COMPLETE can be asserted while we are in the trap hadler, I do not think it can be cleared. That being said, it might be easier to just replace the bit, making it clearer. + s_or_b32s_save_status, s_save_status, s_save_tmp #endif write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) @@ -764,8 +787,7 @@
[PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes
Fix LDS size interpretation: 512 bytes (>= gfx12) vs 256 (< gfx12). Ensure STATE_PRIV.BARRIER_COMPLETE cannot change after reading or before writing. Other waves in the threadgroup may cause this field to assert if they complete the barrier. Do not overwrite EXCP_FLAG_PRIV.{SAVE_CONTEXT,HOST_TRAP} when restoring this register. Both of these fields can assert while the wavefront is running the trap handler. Signed-off-by: Jay Cornwall Cc: Lancelot Six --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 1191 + .../amd/amdkfd/cwsr_trap_handler_gfx10.asm| 55 +- 2 files changed, 639 insertions(+), 607 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index d61b2c3bd0ac..85a41e121cce 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -678,7 +678,7 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { }; static const uint32_t cwsr_trap_nv1x_hex[] = { - 0xbf820001, 0xbf820394, + 0xbf820001, 0xbf820393, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, @@ -932,23 +932,48 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb97b4306, 0x877bc17b, - 0xbf840086, 0xbf8a, + 0xbf840085, 0xbf8a, 0x877aff6d, 0x8000, - 0xbf840082, 0x8f7b867b, - 0x8f7b827b, 0xbef6037b, - 0xb9703a05, 0x80708170, - 0xbf0d9973, 0xbf850002, - 0x8f708970, 0xbf820001, - 0x8f708a70, 0xb97a1e06, - 0x8f7a8a7a, 0x80707a70, - 0x8070ff70, 0x0200, - 0x8070ff70, 0x0080, - 0xbef603ff, 0x0100, - 0xd765, 0x000100c1, - 0xd766, 0x000200c1, - 0x1684, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbefc0380, 0xbf850033, + 0xbf840081, 0x8f7b887b, + 0xbef6037b, 0xb9703a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0x8070ff70, + 0x0200, 0x8070ff70, + 0x0080, 0xbef603ff, + 0x0100, 0xd765, + 0x000100c1, 0xd766, + 0x000200c1, 0x1684, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbefc0380, + 0xbf850033, 0xb97af803, + 0x8a7a7aff, 0x1000, + 0xbf85001d, 0xd8d8, + 0x0100, 0xbf8c, + 0xbe840380, 0xd760, + 0x0901, 0x80048104, + 0xd761, 0x0901, + 0x80048104, 0xd762, + 0x0901, 0x80048104, + 0xd763, 0x0901, + 0x80048104, 0xf469003a, + 0xe000, 0x80709070, + 0xbf06a004, 0xbf84ffef, + 0x807cff7c, 0x0080, + 0xd525, 0x0001ff00, + 0x0080, 0xbf0a7b7c, + 0xbf85ffe4, 0xbf820044, + 0xbe8303ff, 0x0080, + 0xbf80, 0xbf80, + 0xbf80, 0xd8d8, + 0x0100, 0xbf8c, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, + 0xd525, 0x0001ff00, + 0x0080, 0xbf0a7b7c, + 0xbf85fff4, 0xbf820032, 0xb97af803, 0x8a7a7aff, 0x1000, 0xbf85001d, 0xd8d8, 0x0100, @@ -960,24 +985,45 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x80048104, 0xd763, 0x0901, 0x80048104, 0xf469003a, 0xe000, - 0x80709070, 0xbf06a004, + 0x80709070, 0xbf06c004, 0xbf84ffef, 0x807cff7c, - 0x0080, 0xd525, - 0x0001ff00, 0x0080, + 0x0100, 0xd525, + 0x0001ff00, 0x0100, 0xbf0a7b7c, 0xbf85ffe4, - 0xbf820044, 0xbe8303ff, - 0x0080, 0xbf80, + 0xbf820011, 0xbe8303ff, + 0x0100, 0xbf80, 0xbf80, 0xbf80, 0xd8d8, 0x0100, 0xbf8c, 0xe0704000, 0x705d0100, 0x807c037c, 0x80700370, 0xd525, - 0x0001ff00, 0x0080, + 0x0001ff00, 0x0100, 0xbf0a7b7c, 0xbf85fff4, - 0xbf820032, 0xb97af803, - 0x8a7a7aff, 0x1000, - 0xbf85001d, 0xd8d8, - 0x0100, 0xbf8c, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850004, 0xbef003ff, + 0x0200, 0xbeff0380, + 0xbf820003, 0xbef003ff, + 0x0400, 0xbeff03c1, + 0xb97b3a05, 0x807b817b, + 0x8f7b827b, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf85006b, 0xbef603ff, + 0x0100, 0xbefc0384, + 0xbf0a7b7c, 0xbf8400fa, + 0xb97af803, 0x8a7a7aff, + 0x1000, 0xbf850050, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xbe840380, 0xd760, + 0x0900, 0x80048104, + 0xd761, 0x0900, + 0x80048104, 0xd762, + 0x0900, 0x80048104, + 0xd763, 0x0900, + 0x80048104, 0xf469003a, + 0xe000,