A trap may occur in the middle of a VOP3PX instruction co-issue. In that case the PC would be restored incorrectly if left unmodified.
Identify this case by examining the instruction opcode and rewind the PC 8 bytes if it occurs. Signed-off-by: Jay Cornwall <[email protected]> Cc: Lancelot Six <[email protected]> Cc: Vladimir Indic <[email protected]> Cc: Shweta Khatri <[email protected]> --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 197 +++++++++--------- .../amd/amdkfd/cwsr_trap_handler_gfx12.asm | 25 ++- 2 files changed, 121 insertions(+), 101 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 39bdc98b8b6d..54fa76f374c9 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -4587,14 +4587,14 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = { }; static const uint32_t cwsr_trap_gfx12_1_0_hex[] = { - 0xbfa00001, 0xbfa003ac, + 0xbfa00001, 0xbfa003be, 0xb0804009, 0xb8f8f804, 0x9178ff78, 0x00008c00, 0xb8fbf811, 0x8b6eff78, 0x00004000, 0xbfa10008, 0x8b6eff7b, 0x00000080, 0xbfa20018, 0x8b6ea07b, - 0xbfa200d1, 0xbf830010, + 0xbfa200da, 0xbf830010, 0xb8fbf811, 0xbfa0fffb, 0x8b6eff7b, 0x00000bd0, 0xbfa20010, 0xb8eef812, @@ -4605,7 +4605,7 @@ static const uint32_t cwsr_trap_gfx12_1_0_hex[] = { 0xf0000000, 0xbfa20005, 0x8b6fff6f, 0x00000200, 0xbfa20002, 0x8b6ea07b, - 0xbfa200bb, 0x9177ff77, + 0xbfa200c4, 0x9177ff77, 0x007fc000, 0xb8fa04a1, 0x847a967a, 0x8c777a77, 0xb8fa0421, 0x847a957a, @@ -4632,43 +4632,46 @@ static const uint32_t cwsr_trap_gfx12_1_0_hex[] = { 0xbfa00002, 0x806c846c, 0x826d806d, 0x8b6dff6d, 0x01ffffff, 0xb8fbf811, - 0xbf0d847b, 0xbfa20078, + 0xbf0d847b, 0xbfa20081, 0xf4003eb6, 0xf8000000, 0xbfc70000, 0xf4003bb6, 0xf8000008, 0x8b76ff7a, 0x80000000, 0xbfa20027, 0x9376ff7a, 0x00060019, 0x81f9a376, 0xbf0b8179, - 0xbfa20068, 0x81f9ac76, - 0xbf0b8179, 0xbfa20062, + 0xbfa2006e, 0x81f9ac76, + 0xbf0b8179, 0xbfa20068, 0x81f9b776, 0xbf0b8179, - 0xbfa2005f, 0x8b76ff7a, + 0xbfa20065, 0x8b76ff7a, 0x000001ff, 0xbf06ff76, - 0x000000fe, 0xbfa2005d, + 0x000000fe, 
0xbfa20063, 0xbf06ff76, 0x000000ff, - 0xbfa20057, 0xbf06ff76, - 0x000000fa, 0xbfa20054, + 0xbfa2005d, 0xbf06ff76, + 0x000000fa, 0xbfa2005a, 0x81f9ff76, 0x000000e9, - 0xbf0b8179, 0xbfa20050, + 0xbf0b8179, 0xbfa20056, 0x8b76ff7b, 0xffff0000, 0xbf06ff76, 0xbf860000, - 0xbfa10051, 0x9376ff7b, + 0xbfa1005a, 0x9376ff7b, 0x0002000e, 0x8b79ff7b, 0x00003f00, 0x85798679, 0x8c767976, 0xb9763b01, - 0xbfa00049, 0x8b76ff7a, + 0xbfa00052, 0x8b76ff7a, 0xfc000000, 0xbf06ff76, - 0xd4000000, 0xbfa20013, + 0xd4000000, 0xbfa20019, 0xbf06ff76, 0xc8000000, - 0xbfa20027, 0x8b76ff7a, + 0xbfa2002d, 0x8b76ff7a, 0xff000000, 0xbf06ff76, - 0xcf000000, 0xbfa20039, + 0xcf000000, 0xbfa2003f, 0x8b79ff7a, 0xffff0000, + 0xbf06ff79, 0xcc330000, + 0xbfa2003d, 0xbf06ff79, + 0xcc880000, 0xbfa2003a, 0xbf06ff79, 0xcc350000, - 0xbfa20037, 0xbf06ff79, - 0xcc3a0000, 0xbfa20034, + 0xbfa2003a, 0xbf06ff79, + 0xcc3a0000, 0xbfa20037, 0xbf06ff76, 0xcc000000, - 0xbfa10031, 0x8b76ff7b, + 0xbfa10034, 0x8b76ff7b, 0x000001ff, 0xbf06ff76, 0x000000ff, 0xbfa20029, 0xbf06ff76, 0x000000fa, @@ -4691,86 +4694,92 @@ static const uint32_t cwsr_trap_gfx12_1_0_hex[] = { 0x000001ff, 0xbf06ff76, 0x000000ff, 0xbfa20003, 0xbfc70000, 0xbefb006e, - 0xbfa0ffad, 0xbfc70000, - 0xbefb006f, 0xbfa0ffaa, - 0xbfc70000, 0x857a9677, - 0xb97a04a1, 0x857a9577, - 0xb97a0421, 0x857a8e77, - 0xb97a3021, 0x8bfe7e7e, - 0x8bea6a6a, 0x85788978, - 0xb9783244, 0xbe804a6c, - 0xb8faf802, 0xbf0d987a, - 0xbfa10001, 0xbfb00000, - 0x8b6dff6d, 0x01ffffff, - 0xbefa0080, 0xb97a0151, - 0x9177ff77, 0x007fc000, - 0xb8fa04a1, 0x847a967a, - 0x8c777a77, 0xb8fa0421, - 0x847a957a, 0x8c777a77, - 0xb8fa3021, 0x847a8e7a, - 0x8c777a77, 0xb980f821, - 0x00000000, 0xbf0d847b, - 0xbfa20078, 0xf4003eb6, - 0xf8000000, 0xbfc70000, - 0xf4003bb6, 0xf8000008, - 0x8b76ff7a, 0x80000000, - 0xbfa20027, 0x9376ff7a, - 0x00060019, 0x81f9a376, + 0xbfa0ffa7, 0xbfc70000, + 0xbefb006f, 0xbfa0ffa4, + 0x80ec886c, 0x82ed806d, + 0xbfa0fff7, 0xbfc70000, + 0x857a9677, 0xb97a04a1, + 0x857a9577, 0xb97a0421, 
+ 0x857a8e77, 0xb97a3021, + 0x8bfe7e7e, 0x8bea6a6a, + 0x85788978, 0xb9783244, + 0xbe804a6c, 0xb8faf802, + 0xbf0d987a, 0xbfa10001, + 0xbfb00000, 0x8b6dff6d, + 0x01ffffff, 0xbefa0080, + 0xb97a0151, 0x9177ff77, + 0x007fc000, 0xb8fa04a1, + 0x847a967a, 0x8c777a77, + 0xb8fa0421, 0x847a957a, + 0x8c777a77, 0xb8fa3021, + 0x847a8e7a, 0x8c777a77, + 0xb980f821, 0x00000000, + 0xbf0d847b, 0xbfa20081, + 0xf4003eb6, 0xf8000000, + 0xbfc70000, 0xf4003bb6, + 0xf8000008, 0x8b76ff7a, + 0x80000000, 0xbfa20027, + 0x9376ff7a, 0x00060019, + 0x81f9a376, 0xbf0b8179, + 0xbfa2006e, 0x81f9ac76, 0xbf0b8179, 0xbfa20068, - 0x81f9ac76, 0xbf0b8179, - 0xbfa20062, 0x81f9b776, - 0xbf0b8179, 0xbfa2005f, - 0x8b76ff7a, 0x000001ff, - 0xbf06ff76, 0x000000fe, + 0x81f9b776, 0xbf0b8179, + 0xbfa20065, 0x8b76ff7a, + 0x000001ff, 0xbf06ff76, + 0x000000fe, 0xbfa20063, + 0xbf06ff76, 0x000000ff, 0xbfa2005d, 0xbf06ff76, - 0x000000ff, 0xbfa20057, + 0x000000fa, 0xbfa2005a, + 0x81f9ff76, 0x000000e9, + 0xbf0b8179, 0xbfa20056, + 0x8b76ff7b, 0xffff0000, + 0xbf06ff76, 0xbf860000, + 0xbfa1005a, 0x9376ff7b, + 0x0002000e, 0x8b79ff7b, + 0x00003f00, 0x85798679, + 0x8c767976, 0xb9763b01, + 0xbfa00052, 0x8b76ff7a, + 0xfc000000, 0xbf06ff76, + 0xd4000000, 0xbfa20019, + 0xbf06ff76, 0xc8000000, + 0xbfa2002d, 0x8b76ff7a, + 0xff000000, 0xbf06ff76, + 0xcf000000, 0xbfa2003f, + 0x8b79ff7a, 0xffff0000, + 0xbf06ff79, 0xcc330000, + 0xbfa2003d, 0xbf06ff79, + 0xcc880000, 0xbfa2003a, + 0xbf06ff79, 0xcc350000, + 0xbfa2003a, 0xbf06ff79, + 0xcc3a0000, 0xbfa20037, + 0xbf06ff76, 0xcc000000, + 0xbfa10034, 0x8b76ff7b, + 0x000001ff, 0xbf06ff76, + 0x000000ff, 0xbfa20029, 0xbf06ff76, 0x000000fa, - 0xbfa20054, 0x81f9ff76, - 0x000000e9, 0xbf0b8179, - 0xbfa20050, 0x8b76ff7b, - 0xffff0000, 0xbf06ff76, - 0xbf860000, 0xbfa10051, - 0x9376ff7b, 0x0002000e, - 0x8b79ff7b, 0x00003f00, - 0x85798679, 0x8c767976, - 0xb9763b01, 0xbfa00049, - 0x8b76ff7a, 0xfc000000, - 0xbf06ff76, 0xd4000000, - 0xbfa20013, 0xbf06ff76, - 0xc8000000, 0xbfa20027, - 0x8b76ff7a, 0xff000000, - 
0xbf06ff76, 0xcf000000, - 0xbfa20039, 0x8b79ff7a, - 0xffff0000, 0xbf06ff79, - 0xcc350000, 0xbfa20037, - 0xbf06ff79, 0xcc3a0000, - 0xbfa20034, 0xbf06ff76, - 0xcc000000, 0xbfa10031, - 0x8b76ff7b, 0x000001ff, - 0xbf06ff76, 0x000000ff, - 0xbfa20029, 0xbf06ff76, - 0x000000fa, 0xbfa20026, - 0x81f6ff76, 0x000000e9, - 0xbf0b8176, 0xbfa20022, - 0x8b76ff7b, 0x0003fe00, - 0xbf06ff76, 0x0001fe00, - 0xbfa2001d, 0x8b76ff7b, - 0x07fc0000, 0xbf06ff76, - 0x03fc0000, 0xbfa20018, - 0xbfa00014, 0x9376ff7a, - 0x00040016, 0x81f68176, - 0xbf0b8176, 0xbfa20012, - 0x9376ff7a, 0x00050011, + 0xbfa20026, 0x81f6ff76, + 0x000000e9, 0xbf0b8176, + 0xbfa20022, 0x8b76ff7b, + 0x0003fe00, 0xbf06ff76, + 0x0001fe00, 0xbfa2001d, + 0x8b76ff7b, 0x07fc0000, + 0xbf06ff76, 0x03fc0000, + 0xbfa20018, 0xbfa00014, + 0x9376ff7a, 0x00040016, 0x81f68176, 0xbf0b8176, - 0xbfa2000d, 0x8b76ff7a, - 0x000001ff, 0xbf06ff76, - 0x000000ff, 0xbfa20008, - 0x8b76ff7b, 0x000001ff, + 0xbfa20012, 0x9376ff7a, + 0x00050011, 0x81f68176, + 0xbf0b8176, 0xbfa2000d, + 0x8b76ff7a, 0x000001ff, 0xbf06ff76, 0x000000ff, - 0xbfa20003, 0xbfc70000, - 0xbefb006e, 0xbfa0ffad, - 0xbfc70000, 0xbefb006f, - 0xbfa0ffaa, 0xbfc70000, + 0xbfa20008, 0x8b76ff7b, + 0x000001ff, 0xbf06ff76, + 0x000000ff, 0xbfa20003, + 0xbfc70000, 0xbefb006e, + 0xbfa0ffa7, 0xbfc70000, + 0xbefb006f, 0xbfa0ffa4, + 0x80ec886c, 0x82ed806d, + 0xbfa0fff7, 0xbfc70000, 0xbeee007e, 0xbeef007f, 0xbefe0180, 0xbefe4d84, 0xbf8a0000, 0x8b7aff7f, diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm index c33e7660d8f4..d38ff404277b 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm @@ -37,6 +37,7 @@ #define HAVE_CLUSTER_BARRIER (ASIC_FAMILY == CHIP_GC_12_0_3) #define CLUSTER_BARRIER_SERIALIZE_WORKAROUND (ASIC_FAMILY == CHIP_GC_12_0_3) #define RELAXED_SCHEDULING_IN_TRAP (ASIC_FAMILY == CHIP_GFX12) +#define HAVE_INSTRUCTION_FIXUP (ASIC_FAMILY == 
CHIP_GC_12_0_3) #define SINGLE_STEP_MISSED_WORKAROUND 1 //workaround for lost TRAP_AFTER_INST exception when SAVECTX raised #define HAVE_VALU_SGPR_HAZARD (ASIC_FAMILY == CHIP_GFX12) @@ -372,9 +373,9 @@ L_TRAP_CASE: L_EXIT_TRAP: s_and_b32 ttmp1, ttmp1, ADDRESS_HI32_MASK -#if HAVE_BANKED_VGPRS +#if HAVE_INSTRUCTION_FIXUP s_getreg_b32 s_save_excp_flag_priv, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) - fixup_vgpr_bank_selection() + fixup_instruction() #endif #if HAVE_XNACK @@ -415,8 +416,8 @@ L_HAVE_VGPRS: save_and_clear_xnack_state_priv(s_save_tmp) #endif -#if HAVE_BANKED_VGPRS - fixup_vgpr_bank_selection() +#if HAVE_INSTRUCTION_FIXUP + fixup_instruction() #endif /* inform SPI the readiness and wait for SPI's go signal */ @@ -1397,8 +1398,8 @@ L_BARRIER_RESTORE_LOOP: L_BARRIER_RESTORE_DONE: end -#if HAVE_BANKED_VGPRS -function fixup_vgpr_bank_selection +#if HAVE_INSTRUCTION_FIXUP +function fixup_instruction // PC read may fault if memory violation has been asserted. // In this case no further progress is expected so fixup is not needed. s_bitcmp1_b32 s_save_excp_flag_priv, SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_SHIFT @@ -1477,8 +1478,13 @@ L_FIXUP_NOT_VOP12C: s_cmp_eq_u32 ttmp10, 0xcf000000 // If 31:24 = 0xcf, this is VOPD3 s_cbranch_scc1 L_FIXUP_THREE_DWORD // If VOPD3, 3 DWORD inst // Not VOP1, VOP2, VOPC, VOP3, VOP3SD, VOPD, or VOPD3. - // Might be in VOP3P, but we must ensure we are not VOP3PX2 + // Check if we are in the middle of VOP3PX. 
s_and_b32 ttmp13, ttmp14, 0xffff0000 // Bits 31:16 + s_cmp_eq_u32 ttmp13, 0xcc330000 // If 31:16 = 0xcc33, this is 8 bytes past VOP3PX + s_cbranch_scc1 L_FIXUP_VOP3PX_MIDDLE + s_cmp_eq_u32 ttmp13, 0xcc880000 // If 31:16 = 0xcc88, this is 8 bytes past VOP3PX + s_cbranch_scc1 L_FIXUP_VOP3PX_MIDDLE + // Might be in VOP3P, but we must ensure we are not VOP3PX2 s_cmp_eq_u32 ttmp13, 0xcc350000 // If 31:16 = 0xcc35, this is VOP3PX2 s_cbranch_scc1 L_FIXUP_DONE // If VOP3PX2, no fixup needed s_cmp_eq_u32 ttmp13, 0xcc3a0000 // If 31:16 = 0xcc3a, this is VOP3PX2 @@ -1539,6 +1545,11 @@ L_FIXUP_THREE_DWORD: s_mov_b32 ttmp15, ttmp3 // Move possible S_SET_VGPR_MSB into ttmp15 s_branch L_FIXUP_ONE_DWORD // Go to common logic that checks if it is S_SET_VGPR_MSB +L_FIXUP_VOP3PX_MIDDLE: + s_sub_co_u32 ttmp0, ttmp0, 8 // Rewind PC 8 bytes to beginning of instruction + s_sub_co_ci_u32 ttmp1, ttmp1, 0 + s_branch L_FIXUP_TWO_DWORD // 2 DWORD inst (2nd half of a 4 DWORD inst) + L_FIXUP_DONE: s_wait_kmcnt 0 // Ensure load of ttmp2 and ttmp3 is done end -- 2.34.1
