Re: [PATCH] drm/amdkfd: Extend gfx12 trap handler fix to gfx10/11

2024-06-07 Thread Lancelot SIX




On 06/06/2024 00:16, Jay Cornwall wrote:

In commit 6d1878882d2d
("drm/amdkfd: gfx12 context save/restore trap handler fixes") the
following fix was introduced but incorrectly restricted to gfx12.
The same issue and a corresponding fix apply to gfx10 and gfx11.

Do not overwrite TRAPSTS.{SAVECTX,HOST_TRAP} when restoring this
register. Both of these fields can assert while the wavefront is
running the trap handler.

Signed-off-by: Jay Cornwall 
Cc: Lancelot Six 


Hi Jay,

Thanks, that looks good to me (tested on gfx10.3, 11 and 12).  For 
gfx11+ there might be a risk of overriding perf_snapshot, but that seems 
fine as this is not used, and such sample from cwsr restore would mostly 
be meaningless anyway.


Reviewed-by: Lancelot Six 

Best,
Lancelot.


---
  .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 16 +---
  .../amd/amdkfd/cwsr_trap_handler_gfx10.asm| 38 ++-
  2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 665122d1bbbd..02f7ba8c93cd 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -1136,7 +1136,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x705d, 0x807c817c,
0x8070ff70, 0x0080,
0xbf0a7b7c, 0xbf85fff8,
-   0xbf82013d, 0xbef4037e,
+   0xbf82013f, 0xbef4037e,
0x8775ff7f, 0x,
0x8875ff75, 0x0004,
0xbef60380, 0xbef703ff,
@@ -1275,7 +1275,8 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x80788478, 0xbf8c,
0xb9eef815, 0xbefc036f,
0xbefe0370, 0xbeff0371,
-   0xb9f9f816, 0xb9fbf803,
+   0xb9f9f816, 0xb9fb4803,
+   0x907b8b7b, 0xb9fba2c3,
0xb9f3f801, 0xb96e3a05,
0x806e816e, 0xbf0d9972,
0xbf850002, 0x8f6e896e,
@@ -2544,7 +2545,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xe0704000, 0x705d,
0x807c817c, 0x8070ff70,
0x0080, 0xbf0a7b7c,
-   0xbf85fff8, 0xbf820134,
+   0xbf85fff8, 0xbf820136,
0xbef4037e, 0x8775ff7f,
0x, 0x8875ff75,
0x0004, 0xbef60380,
@@ -2683,7 +2684,8 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xf000, 0x80788478,
0xbf8c, 0xb9eef815,
0xbefc036f, 0xbefe0370,
-   0xbeff0371, 0xb9fbf803,
+   0xbeff0371, 0xb9fb4803,
+   0x907b8b7b, 0xb9fba2c3,
0xb9f3f801, 0xb96e3a05,
0x806e816e, 0xbf0d9972,
0xbf850002, 0x8f6e896e,
@@ -2981,7 +2983,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
0x701d, 0x807d817d,
0x8070ff70, 0x0080,
0xbf0a7b7d, 0xbfa2fff8,
-   0xbfa0013f, 0xbef4007e,
+   0xbfa00143, 0xbef4007e,
0x8b75ff7f, 0x,
0x8c75ff75, 0x0004,
0xbef60080, 0xbef700ff,
@@ -3123,7 +3125,9 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
0x80788478, 0xbf89,
0xb96ef815, 0xbefd006f,
0xbefe0070, 0xbeff0071,
-   0xb97bf803, 0xb973f801,
+   0xb97b4803, 0x857b8b7b,
+   0xb97b22c3, 0x857b867b,
+   0xb97b7443, 0xb973f801,
0xb8ee3b05, 0x806e816e,
0xbf0d9972, 0xbfa20002,
0x846e896e, 0xbfa1,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index ac3702b8e3c4..44772eec9ef4 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -119,9 +119,12 @@ var SQ_WAVE_TRAPSTS_ADDR_WATCH_SHIFT   = 7
  var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
  var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT= 8
  var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
+var SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT = 11
  var SQ_WAVE_TRAPSTS_EXCP_HI_MASK  = 0x7000
  #if ASIC_FAMILY >= CHIP_PLUM_BONITO
+var SQ_WAVE_TRAPSTS_HOST_TRAP_SHIFT= 16
  var SQ_WAVE_TRAPSTS_WAVE_START_MASK   = 0x2
+var SQ_WAVE_TRAPSTS_WAVE_START_SHIFT   = 17
  var SQ_WAVE_TRAPSTS_WAVE_END_MASK = 0x4
  var SQ_WAVE_TRAPSTS_TRAP_AFTER_INST_MASK  = 0x10
  #endif
@@ -137,14 +140,23 @@ var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000
  
  var SQ_WAVE_MODE_DEBUG_EN_MASK			= 0x800
  
+var S_TRAPSTS_RESTORE_PART_1_SIZE		= SQ_WAVE_TRAPSTS_SAVECTX_SHIFT

+var S_TRAPSTS_RESTORE_PART_2_SHIFT = 
SQ_WAVE_TRAPSTS_ILLEGAL_INST_SHIFT
+
  #if ASIC_FAMILY < CHIP_PLUM_BONITO
  var S_TRAPSTS_NON_MASKABLE_EXCP_MASK  = 
SQ_WAVE_TRAPSTS_MEM_VIOL_MASK|SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
+var S_TRAPSTS_RESTORE_PART_2_SIZE  = 32 - 
S_TRAPSTS_RESTORE_PART_2_SHIFT
+var S_TRAPSTS_RESTORE_PART_3_SHIFT = 0
+var S_TRAPSTS_RESTORE_PART_3_SIZE  = 0
  #else
  var S_TRAPSTS_NON_MASKABLE_EXCP_MASK  = SQ_WAVE_TRAPS

Re: [PATCH v2] drm/amdkfd: Handle deallocated VPGRs in gfx11+ trap handler

2024-05-29 Thread Lancelot SIX




On 29/05/2024 20:35, Jay Cornwall wrote:

A wavefront may deallocate its VGPRs at the end of a program while
waiting for memory transactions to complete. If it subsequently
receives a context save exception it will be unable to save,
since this requires VGPRs. In this case the trap handler should
terminate the wavefront.

Fixes intermittent VM faults under context switching load.

V2: Use S_ENDPGM instead of S_ENDPGM_SAVED for performance counters


Hi Jay,

Thanks for the V2.

FYI,as far as I can see, the .h part of the patch does not seem to apply 
directly on current amd-staging-drm-next, but I guess we just have a 
different bases.




Signed-off-by: Jay Cornwall 
Cc: Lancelot Six 
---
  .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 695 +-
  .../amd/amdkfd/cwsr_trap_handler_gfx10.asm|  17 +
  2 files changed, 366 insertions(+), 346 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index 18e012e04493..ac3702b8e3c4 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -97,6 +97,7 @@ var S_STATUS_HALT_MASK= 
SQ_WAVE_STATE_PRIV_HALT_MASK
  var S_SAVE_PC_HI_TRAP_ID_MASK = 0xF000
  #endif
  
+var SQ_WAVE_STATUS_NO_VGPRS_SHIFT		= 24

  var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT  = 12
  var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE   = 9
  var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE  = 8
@@ -451,6 +452,22 @@ L_EXIT_TRAP:
s_rfe_b64   [ttmp0, ttmp1]
  
  L_SAVE:

+   // If VGPRs have been deallocated then terminate the wavefront.
+   // It has no remaining program to run and cannot save without VGPRs.
+#if ASIC_FAMILY == CHIP_PLUM_BONITO
+   s_bitcmp1_b32   s_save_status, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
+   s_cbranch_scc0  L_HAVE_VGPRS
+   s_endpgm
+L_HAVE_VGPRS:
+#endif
+#if ASIC_FAMILY >= CHIP_GFX12


This is mostly cosmetic, but you could use

#elif ASIC_FAMILY >= CHIP_GFX12

instead on #endif + #if.  Those 2 blocks are not independent, they 
achieve the same goal for different configurations.


I do not mind if you prefer to keep this as it is.  Either way, the 
change look good to me:


Reviewed-by: Lancelot Six 

Best,
Lancelot.


+   s_getreg_b32s_save_tmp, hwreg(HW_REG_WAVE_STATUS)
+   s_bitcmp1_b32   s_save_tmp, SQ_WAVE_STATUS_NO_VGPRS_SHIFT
+   s_cbranch_scc0  L_HAVE_VGPRS
+   s_endpgm
+L_HAVE_VGPRS:
+#endif
+
s_and_b32   s_save_pc_hi, s_save_pc_hi, 0x  
//pc[47:32]
s_mov_b32   s_save_tmp, 0
s_setreg_b32hwreg(S_TRAPSTS_HWREG, S_TRAPSTS_SAVE_CONTEXT_SHIFT, 
1), s_save_tmp //clear saveCtx bit


Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes

2024-05-23 Thread Lancelot SIX




On 23/05/2024 20:31, Jay Cornwall wrote:

On 5/23/2024 13:37, Lancelot SIX wrote:


@@ -622,8 +638,15 @@ L_SAVE_HWREG:
  #if ASIC_FAMILY >= CHIP_GFX12
  // Ensure no further changes to barrier or LDS state.
+    // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
  s_barrier_signal    -2
  s_barrier_wait    -2
+
+    // Re-read final state of BARRIER_COMPLETE field for save.
+    s_getreg_b32    s_save_tmp, hwreg(S_STATUS_HWREG)
+    s_and_b32    s_save_tmp, s_save_tmp, 
SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
+    s_andn2_b32    s_save_status, s_save_status, 
SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK


Even if BARRIER_COMPLETE can be asserted while we are in the trap 
hadler, I do not think it can be cleared.  That being said, it might 
be easier to just replace the bit, making it clearer.


Yes, I chose to structure it this way to make the intent clearer. We 
don't gain much from dropping the s_andn2. Most of the time spent in the 
save handler is stalled on memory instructions.



@@ -1351,7 +1369,17 @@ L_SKIP_BARRIER_RESTORE:
  s_setreg_b32    hwreg(HW_REG_SHADER_XNACK_MASK), 
s_restore_xnack_mask

  #endif
+#if ASIC_FAMILY < CHIP_GFX12
  s_setreg_b32    hwreg(S_TRAPSTS_HWREG), s_restore_trapsts


Wouldn't other gfx1x architectures have a similar issue when writing 
TRAPSTS here?  That is if TRAPSTS.SAVECTX is set while we are 
restoring, wouldn't we loose it?


And for gfx11, there is TRAPSTS.HOST_TRAP that could have the same 
issue to some degree (not sure if we would loose the host trap 
completly, or re-enter with trap ID + HT bit set in ttmp1).


Prior to gfx12 context save and host trap exceptions are not delivered 
to a wave until STATUS.PRIV=0, i.e. it leaves the trap handler.


The changes needed for gfx12 are due to a design change in this area. 
Exceptions are now flagged immediately and cause re-entry to the trap if 
any are non-zero.


Thanks for the clarifications.  The patch looks good to me.

Reviewed-by: Lancelot Six 

Best,
Lancelot.


Re: [PATCH 2/3] drm/amdkfd: Replace deprecated gfx12 trap handler instructions

2024-05-23 Thread Lancelot SIX




On 23/05/2024 15:08, Jay Cornwall wrote:

Newer assemblers reject S_WAITCNT. All instances of S_WAITCNT can be
replaced by S_WAITCNT 0 (< gfx12) or S_WAIT_IDLE (>= gfx12) since
there is no concurrency of different memory instruction classes.

Signed-off-by: Jay Cornwall 
Cc: Lancelot Six 


Thanks, that looks good to me.

Reviewed-by: Lancelot Six 


---
  .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 140 +-
  .../amd/amdkfd/cwsr_trap_handler_gfx10.asm|  52 +++
  2 files changed, 97 insertions(+), 95 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 11d076eb770c..d61b2c3bd0ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -711,12 +711,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0xbf0d8f7b, 0xbf840002,
0x887bff7b, 0x,
0xf4011bbd, 0xfa10,
-   0xbf8cc07f, 0x8f6e976e,
+   0xbf8c, 0x8f6e976e,
0x8a77ff77, 0x0080,
0x88776e77, 0xf4051bbd,
-   0xfa00, 0xbf8cc07f,
+   0xfa00, 0xbf8c,
0xf4051ebd, 0xfa08,
-   0xbf8cc07f, 0x87ee6e6e,
+   0xbf8c, 0x87ee6e6e,
0xbf840001, 0xbe80206e,
0x876eff6d, 0x00ff,
0xbf850008, 0x876eff6d,
@@ -1185,7 +1185,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x785d, 0xe0304080,
0x785d0100, 0xe0304100,
0x785d0200, 0xe0304180,
-   0x785d0300, 0xbf8c3f70,
+   0x785d0300, 0xbf8c,
0x7e008500, 0x7e028501,
0x7e048502, 0x7e068503,
0x807c847c, 0x8078ff78,
@@ -1194,7 +1194,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x6e5d, 0xe0304080,
0x6e5d0100, 0xe0304100,
0x6e5d0200, 0xe0304180,
-   0x6e5d0300, 0xbf8c3f70,
+   0x6e5d0300, 0xbf8c,
0xbf820034, 0xbef603ff,
0x0100, 0xbeee0378,
0x8078ff78, 0x0400,
@@ -1203,7 +1203,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x785d, 0xe0304100,
0x785d0100, 0xe0304200,
0x785d0200, 0xe0304300,
-   0x785d0300, 0xbf8c3f70,
+   0x785d0300, 0xbf8c,
0x7e008500, 0x7e028501,
0x7e048502, 0x7e068503,
0x807c847c, 0x8078ff78,
@@ -1213,7 +1213,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x8f6f836f, 0x806f7c6f,
0xbefe03c1, 0xbeff0380,
0xe0304000, 0x785d,
-   0xbf8c3f70, 0x7e008500,
+   0xbf8c, 0x7e008500,
0x807c817c, 0x8078ff78,
0x0080, 0xbf0a6f7c,
0xbf85fff7, 0xbeff03c1,
@@ -1221,7 +1221,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0xe0304100, 0x6e5d0100,
0xe0304200, 0x6e5d0200,
0xe0304300, 0x6e5d0300,
-   0xbf8c3f70, 0xb9783a05,
+   0xbf8c, 0xb9783a05,
0x80788178, 0xbf0d9972,
0xbf850002, 0x8f788978,
0xbf820001, 0x8f788a78,
@@ -1232,16 +1232,16 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x0100, 0xbefc03ff,
0x006c, 0x80f89078,
0xf429003a, 0xf000,
-   0xbf8cc07f, 0x80fc847c,
+   0xbf8c, 0x80fc847c,
0xbf80, 0xbe803100,
0xbe823102, 0x80f8a078,
0xf42d003a, 0xf000,
-   0xbf8cc07f, 0x80fc887c,
+   0xbf8c, 0x80fc887c,
0xbf80, 0xbe803100,
0xbe823102, 0xbe843104,
0xbe863106, 0x80f8c078,
0xf431003a, 0xf000,
-   0xbf8cc07f, 0x80fc907c,
+   0xbf8c, 0x80fc907c,
0xbf80, 0xbe803100,
0xbe823102, 0xbe843104,
0xbe863106, 0xbe883108,
@@ -1271,9 +1271,9 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0xf4211cfa, 0xf000,
0x80788478, 0xf4211bba,
0xf000, 0x80788478,
-   0xbf8cc07f, 0xb9eef814,
+   0xbf8c, 0xb9eef814,
0xf4211bba, 0xf000,
-   0x80788478, 0xbf8cc07f,
+   0x80788478, 0xbf8c,
0xb9eef815, 0xbefc036f,
0xbefe0370, 0xbeff0371,
0xb9f9f816, 0xb9fbf803,
@@ -1288,7 +1288,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x, 0xf4091c37,
0xfa50, 0xf4091d37,
0xfa60, 0xf4011e77,
-   0xfa74, 0xbf8cc07f,
+   0xfa74, 0xbf8c,
0x906e8977, 0x876fff6e,
0x003f8000, 0x906e8677,
0x876eff6e, 0x0200,
@@ -2299,12 +2299,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xbf0d8f7b, 0xbf840002,
0x887bff7b, 0x,
0xf4011bbd, 0xfa10,
-   0xbf8cc07f, 0x8f6e976e,
+   0xbf8c, 0x8f6e976e,
0x8a77ff77, 0x0080,
0x88776e77, 0xf4051bbd,
-   0xfa00, 0xbf8cc07f,
+   0xfa00, 0xbf8c,
0xf4051ebd, 0xfa08,
-   0xbf8cc07f, 0x87ee6e6e,
+   0xbf8c, 0x87ee6e6e,
0xbf840001, 0xbe80206e,
0x876eff6d, 0x00ff,
0xbf850008, 0x876eff6d,
@@ -2319,7 +2319,7 @@ static

Re: [PATCH 1/3] drm/amdkfd: Sync trap handler binary with source

2024-05-23 Thread Lancelot SIX




On 23/05/2024 15:08, Jay Cornwall wrote:

Source and binary have become mismatched during branch activity.

Signed-off-by: Jay Cornwall 
Cc: Lancelot Six 


Thanks for doing this.

This matches what I have when rebuilding the trap handlers.

Reviewed-by: Lancelot Six 


---
  .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 57 ---
  1 file changed, 24 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 73d3772cdb76..11d076eb770c 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -718,12 +718,12 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0xf4051ebd, 0xfa08,
0xbf8cc07f, 0x87ee6e6e,
0xbf840001, 0xbe80206e,
-   0x876eff6d, 0x01ff,
-   0xbf850005, 0x8878ff78,
-   0x2000, 0x80ec886c,
-   0x82ed806d, 0xbf820005,
-   0x876eff6d, 0x0100,
-   0xbf850002, 0x806c846c,
+   0x876eff6d, 0x00ff,
+   0xbf850008, 0x876eff6d,
+   0x0100, 0xbf850007,
+   0x8878ff78, 0x2000,
+   0x80ec886c, 0x82ed806d,
+   0xbf820002, 0x806c846c,
0x826d806d, 0x876dff6d,
0x, 0x907a8977,
0x877bff7a, 0x003f8000,
@@ -1136,7 +1136,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0xe0704000, 0x705d,
0x807c817c, 0x8070ff70,
0x0080, 0xbf0a7b7c,
-   0xbf85fff8, 0xbf820144,
+   0xbf85fff8, 0xbf82013e,
0xbef4037e, 0x8775ff7f,
0x, 0x8875ff75,
0x0004, 0xbef60380,
@@ -1276,10 +1276,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
0x80788478, 0xbf8cc07f,
0xb9eef815, 0xbefc036f,
0xbefe0370, 0xbeff0371,
-   0x876f7bff, 0x03ff,
-   0xb9ef4803, 0xb9f9f816,
-   0x876f7bff, 0xf800,
-   0x906f8b6f, 0xb9efa2c3,
+   0xb9f9f816, 0xb9fbf803,
0xb9f3f801, 0xb96e3a05,
0x806e816e, 0xbf0d9972,
0xbf850002, 0x8f6e896e,
@@ -2309,12 +2306,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xf4051ebd, 0xfa08,
0xbf8cc07f, 0x87ee6e6e,
0xbf840001, 0xbe80206e,
-   0x876eff6d, 0x01ff,
-   0xbf850005, 0x8878ff78,
-   0x2000, 0x80ec886c,
-   0x82ed806d, 0xbf820005,
-   0x876eff6d, 0x0100,
-   0xbf850002, 0x806c846c,
+   0x876eff6d, 0x00ff,
+   0xbf850008, 0x876eff6d,
+   0x0100, 0xbf850007,
+   0x8878ff78, 0x2000,
+   0x80ec886c, 0x82ed806d,
+   0xbf820002, 0x806c846c,
0x826d806d, 0x876dff6d,
0x, 0x87fe7e7e,
0x87ea6a6a, 0xb9f8f802,
@@ -2549,7 +2546,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0x705d, 0x807c817c,
0x8070ff70, 0x0080,
0xbf0a7b7c, 0xbf85fff8,
-   0xbf82013b, 0xbef4037e,
+   0xbf820135, 0xbef4037e,
0x8775ff7f, 0x,
0x8875ff75, 0x0004,
0xbef60380, 0xbef703ff,
@@ -2688,10 +2685,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
0xf000, 0x80788478,
0xbf8cc07f, 0xb9eef815,
0xbefc036f, 0xbefe0370,
-   0xbeff0371, 0x876f7bff,
-   0x03ff, 0xb9ef4803,
-   0x876f7bff, 0xf800,
-   0x906f8b6f, 0xb9efa2c3,
+   0xbeff0371, 0xb9fbf803,
0xb9f3f801, 0xb96e3a05,
0x806e816e, 0xbf0d9972,
0xbf850002, 0x8f6e896e,
@@ -2749,11 +2743,11 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
0xf808, 0xbf89fc07,
0x8bee6e6e, 0xbfa10001,
0xbe80486e, 0x8b6eff6d,
-   0x01ff, 0xbfa20005,
-   0x8c78ff78, 0x2000,
-   0x80ec886c, 0x82ed806d,
-   0xbfa5, 0x8b6eff6d,
-   0x0100, 0xbfa20002,
+   0x00ff, 0xbfa20008,
+   0x8b6eff6d, 0x0100,
+   0xbfa20007, 0x8c78ff78,
+   0x2000, 0x80ec886c,
+   0x82ed806d, 0xbfa2,
0x806c846c, 0x826d806d,
0x8b6dff6d, 0x,
0x8bfe7e7e, 0x8bea6a6a,
@@ -2988,7 +2982,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
0x701d, 0x807d817d,
0x8070ff70, 0x0080,
0xbf0a7b7d, 0xbfa2fff8,
-   0xbfa00146, 0xbef4007e,
+   0xbfa00140, 0xbef4007e,
0x8b75ff7f, 0x,
0x8c75ff75, 0x0004,
0xbef60080, 0xbef700ff,
@@ -3130,10 +3124,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = {
0xf000, 0x80788478,
0xbf89fc07, 0xb96ef815,
0xbefd006f, 0xbefe0070,
-   0xbeff0071, 0x8b6f7bff,
-   0x03ff, 0xb96f4803,
-   0x8b6f7bff, 0xf800,
-   0x856f8b6f, 0xb96fa2c3,
+   0xbeff0071, 0xb97bf803,
0xb973f801, 0xb8ee3b05,
0x806e816e, 0xbf0d9972,
0xbfa20002, 0x846e896e,
@@ -4119,7 +4110,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
0x8b6dff6d, 0x,
0x8bfe7e7e, 0x8bea6a6a,
0xb97af804, 0xbe804a6c,
-   0xbfb0, 0xbf9f

Re: [PATCH 3/3] drm/amdkfd: gfx12 context save/restore trap handler fixes

2024-05-23 Thread Lancelot SIX

Hi Jay,

I have added a couple (minor) of comments below.

On 23/05/2024 15:08, Jay Cornwall wrote:

Fix LDS size interpretation: 512 bytes (>= gfx12) vs 256 (< gfx12).

Ensure STATE_PRIV.BARRIER_COMPLETE cannot change after reading or
before writing. Other waves in the threadgroup may cause this field
to assert if they complete the barrier.

Do not overwrite EXCP_FLAG_PRIV.{SAVE_CONTEXT,HOST_TRAP} when
restoring this register. Both of these fields can assert while the
wavefront is running the trap handler.

Signed-off-by: Jay Cornwall 
Cc: Lancelot Six 
---
  .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 1191 +
  .../amd/amdkfd/cwsr_trap_handler_gfx10.asm|   55 +-
  2 files changed, 639 insertions(+), 607 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index 77ae25b6753c..18e012e04493 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -75,17 +75,22 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK = 
0x2
  var SQ_WAVE_STATUS_TRAP_EN_SHIFT  = 6
  var SQ_WAVE_IB_STS2_WAVE64_SHIFT  = 11
  var SQ_WAVE_IB_STS2_WAVE64_SIZE   = 1
+var SQ_WAVE_LDS_ALLOC_GRANULARITY  = 8
  var S_STATUS_HWREG= HW_REG_STATUS
  var S_STATUS_ALWAYS_CLEAR_MASK= 
SQ_WAVE_STATUS_SPI_PRIO_MASK|SQ_WAVE_STATUS_ECC_ERR_MASK
  var S_STATUS_HALT_MASK= 
SQ_WAVE_STATUS_HALT_MASK
  var S_SAVE_PC_HI_TRAP_ID_MASK = 0x00FF
  var S_SAVE_PC_HI_HT_MASK  = 0x0100
  #else
+var SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK   = 0x4
+var SQ_WAVE_STATE_PRIV_SCC_SHIFT   = 9
  var SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK  = 0xC00
  var SQ_WAVE_STATE_PRIV_HALT_MASK  = 0x4000
  var SQ_WAVE_STATE_PRIV_POISON_ERR_MASK= 0x8000
+var SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT= 15
  var SQ_WAVE_STATUS_WAVE64_SHIFT   = 29
  var SQ_WAVE_STATUS_WAVE64_SIZE= 1
+var SQ_WAVE_LDS_ALLOC_GRANULARITY  = 9
  var S_STATUS_HWREG= HW_REG_WAVE_STATE_PRIV
  var S_STATUS_ALWAYS_CLEAR_MASK= 
SQ_WAVE_STATE_PRIV_SYS_PRIO_MASK|SQ_WAVE_STATE_PRIV_POISON_ERR_MASK
  var S_STATUS_HALT_MASK= 
SQ_WAVE_STATE_PRIV_HALT_MASK
@@ -149,8 +154,10 @@ var SQ_WAVE_EXCP_FLAG_PRIV_MEM_VIOL_MASK   = 0x10
  var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_SHIFT = 5
  var SQ_WAVE_EXCP_FLAG_PRIV_SAVE_CONTEXT_MASK  = 0x20
  var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_MASK  = 0x40
+var SQ_WAVE_EXCP_FLAG_PRIV_ILLEGAL_INST_SHIFT  = 6
  var SQ_WAVE_EXCP_FLAG_PRIV_HOST_TRAP_MASK = 0x80
  var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_MASK= 0x100
+var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_START_SHIFT= 8
  var SQ_WAVE_EXCP_FLAG_PRIV_WAVE_END_MASK  = 0x200
  var SQ_WAVE_EXCP_FLAG_PRIV_TRAP_AFTER_INST_MASK   = 0x800
  var SQ_WAVE_TRAP_CTRL_ADDR_WATCH_MASK = 0x80
@@ -430,7 +437,16 @@ L_EXIT_TRAP:
// Restore SQ_WAVE_STATUS.
s_and_b64   exec, exec, exec
// Restore STATUS.EXECZ, not writable by s_setreg_b32
s_and_b64   vcc, vcc, vcc   
// Restore STATUS.VCCZ, not writable by s_setreg_b32
+
+#if ASIC_FAMILY < CHIP_GFX12
s_setreg_b32hwreg(S_STATUS_HWREG), s_save_status
+#else
+   // STATE_PRIV.BARRIER_COMPLETE may have changed since we read it.
+   // Only restore fields which the trap handler changes.
+   s_lshr_b32  s_save_status, s_save_status, 
SQ_WAVE_STATE_PRIV_SCC_SHIFT
+   s_setreg_b32hwreg(S_STATUS_HWREG, SQ_WAVE_STATE_PRIV_SCC_SHIFT, \
+   SQ_WAVE_STATE_PRIV_POISON_ERR_SHIFT - 
SQ_WAVE_STATE_PRIV_SCC_SHIFT + 1), s_save_status
+#endif
  
  	s_rfe_b64	[ttmp0, ttmp1]
  
@@ -622,8 +638,15 @@ L_SAVE_HWREG:
  
  #if ASIC_FAMILY >= CHIP_GFX12

// Ensure no further changes to barrier or LDS state.
+   // STATE_PRIV.BARRIER_COMPLETE may change up to this point.
s_barrier_signal-2
s_barrier_wait  -2
+
+   // Re-read final state of BARRIER_COMPLETE field for save.
+   s_getreg_b32s_save_tmp, hwreg(S_STATUS_HWREG)
+   s_and_b32   s_save_tmp, s_save_tmp, 
SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK
+   s_andn2_b32 s_save_status, s_save_status, 
SQ_WAVE_STATE_PRIV_BARRIER_COMPLETE_MASK


Even if BARRIER_COMPLETE can be asserted while we are in the trap 
hadler, I do not think it can be cleared.  That being said, it might be 
easier to just replace the bit, making it clearer.



+   s_or_b32s_save_status, s_save_status, s_save_tmp
  #endif
  
  	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)

@@ -

[PATCH] drm/amdkfd: update buffer_{store, load}_* modifiers for gfx940

2024-04-29 Thread Lancelot SIX
Instruction modifiers of the untyped vector memory buffer instructions
(MUBUF encoded) changed in gfx940.  The slc, scc and glc modifiers have
been replaced with sc0, sc1 and nt.

The current CWSR trap handler is written using pre-gfx940 modifier
names, making the source incompatible with a strict gfx940 assembler.

This patch updates the cwsr_trap_handler_gfx9.s source file to be
compatible with all gfx9 variants of the ISA.  The binary assembled code
is unchanged (so the behaviour is unchanged as well), only the source
representation is updated.

Signed-off-by: Lancelot SIX 
---
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 24 ---
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index bb26338204f4..a2d597d7fb57 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -48,6 +48,12 @@ var ACK_SQC_STORE=   1   
//workaround for suspected SQC store bug causing
 var SAVE_AFTER_XNACK_ERROR =   1   //workaround for 
TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
 var SINGLE_STEP_MISSED_WORKAROUND   =  (ASIC_FAMILY <= CHIP_ALDEBARAN) 
//workaround for lost MODE.DEBUG_EN exception when SAVECTX raised
 
+#if ASIC_FAMILY < CHIP_GC_9_4_3
+#define VMEM_MODIFIERS slc:1 glc:1
+#else
+#define VMEM_MODIFIERS sc0:1 nt:1
+#endif
+
 /**/
 /* variables */
 /**/
@@ -581,7 +587,7 @@ end
 L_SAVE_LDS_LOOP_VECTOR:
   ds_read_b64 v[0:1], v2   //x =LDS[a], byte address
   s_waitcnt lgkmcnt(0)
-  buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset 
offen:1  glc:1  slc:1
+  buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset 
VMEM_MODIFIERS offen:1
 // s_waitcnt vmcnt(0)
 // v_add_u32 v2, vcc[0:1], v2, v3
   v_add_u32 v2, v2, v3
@@ -979,17 +985,17 @@ L_TCP_STORE_CHECK_DONE:
 end
 
 function write_4vgprs_to_mem(s_rsrc, s_mem_offset)
-   buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
-   buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1  offset:256
-   buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1  
offset:256*2
-   buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1  
offset:256*3
+   buffer_store_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS
+   buffer_store_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256
+   buffer_store_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*2
+   buffer_store_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*3
 end
 
 function read_4vgprs_from_mem(s_rsrc, s_mem_offset)
-   buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
-   buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256
-   buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*2
-   buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 offset:256*3
+   buffer_load_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS
+   buffer_load_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS offset:256
+   buffer_load_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*2
+   buffer_load_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*3
s_waitcnt vmcnt(0)
 end
 

base-commit: cf743996352e327f483dc7d66606c90276f57380
-- 
2.34.1



[PATCH] drm/amdkfd: Flush the process wq before creating a kfd_process

2024-04-26 Thread Lancelot SIX
There is a race condition when re-creating a kfd_process for a process.
This has been observed when a process under the debugger executes
exec(3).  In this scenario:
- The process executes exec.
 - This will eventually release the process's mm, which will cause the
   kfd_process object associated with the process to be freed
   (kfd_process_free_notifier decrements the reference count to the
   kfd_process to 0).  This causes kfd_process_ref_release to enqueue
   kfd_process_wq_release to the kfd_process_wq.
- The debugger receives the PTRACE_EVENT_EXEC notification, and tries to
  re-enable AMDGPU traps (KFD_IOC_DBG_TRAP_ENABLE).
 - When handling this request, KFD tries to re-create a kfd_process.
   This eventually calls kfd_create_process and kobject_init_and_add.

At this point the call to kobject_init_and_add can fail because the
old kfd_process.kobj has not been freed yet by kfd_process_wq_release.

This patch proposes to avoid this race by making sure to drain
kfd_process_wq before creating a new kfd_process object.  This way, we
know that any cleanup task is done executing when we reach
kobject_init_and_add.

Signed-off-by: Lancelot SIX 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 58c1fe542193..451bb058cc62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -829,6 +829,14 @@ struct kfd_process *kfd_create_process(struct task_struct 
*thread)
if (process) {
pr_debug("Process already found\n");
} else {
+   /* If the process just called exec(3), it is possible that the
+* cleanup of the kfd_process (following the release of the mm
+* of the old process image) is still in the cleanup work queue.
+* Make sure to drain any job before trying to recreate any
+* resource for this process.
+*/
+   flush_workqueue(kfd_process_wq);
+
process = create_process(thread);
if (IS_ERR(process))
goto out;

base-commit: cf743996352e327f483dc7d66606c90276f57380
-- 
2.34.1



[PATCH] drm/amdkfd: Enable SQ watchpoint for gfx10

2024-04-12 Thread Lancelot SIX
There are new control registers introduced in gfx10 used to configure
hardware watchpoints triggered by SMEM instructions:
SQ_WATCH{0,1,2,3}_{CNTL_ADDR_HI,ADDR_L}.

Those registers work in a similar way as the TCP_WATCH* registers
currently used for gfx9 and above.

This patch adds support to program the SQ_WATCH registers for gfx10.

The SQ_WATCH?_CNTL.MASK field has one bit more than
TCP_WATCH?_CNTL.MASK, so SQ watchpoints can have a finer granularity
than TCP_WATCH watchpoints.  In this patch, we keep the capabilities
advertised to the debugger unchanged
(HSA_DBG_WATCH_ADDR_MASK_*_BIT_GFX10) as this reflects what both TCP and
SQ watchpoints can do and both watchpoints are configured together.

Signed-off-by: Lancelot SIX 
---
 .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 71 +++
 1 file changed, 58 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 69810b3f1c63..3ab6c3aa0ad1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -881,6 +881,7 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct 
amdgpu_device *adev,
 }
 
 #define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
+#define SQ_WATCH_STRIDE (mmSQ_WATCH1_ADDR_H - mmSQ_WATCH0_ADDR_H)
 uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,
uint64_t watch_address,
uint32_t watch_address_mask,
@@ -889,55 +890,93 @@ uint32_t kgd_gfx_v10_set_address_watch(struct 
amdgpu_device *adev,
uint32_t debug_vmid,
uint32_t inst)
 {
+   /* SQ_WATCH?_ADDR_* and TCP_WATCH?_ADDR_* are programmed with the
+* same values.
+*/
uint32_t watch_address_high;
uint32_t watch_address_low;
-   uint32_t watch_address_cntl;
-
-   watch_address_cntl = 0;
+   uint32_t tcp_watch_address_cntl;
+   uint32_t sq_watch_address_cntl;
 
watch_address_low = lower_32_bits(watch_address);
watch_address_high = upper_32_bits(watch_address) & 0x;
 
-   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   tcp_watch_address_cntl = 0;
+   tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
VMID,
debug_vmid);
-   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
MODE,
watch_mode);
-   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
MASK,
watch_address_mask >> 7);
 
+   sq_watch_address_cntl = 0;
+   sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
+   SQ_WATCH0_CNTL,
+   VMID,
+   debug_vmid);
+   sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
+   SQ_WATCH0_CNTL,
+   MODE,
+   watch_mode);
+   sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
+   SQ_WATCH0_CNTL,
+   MASK,
+   watch_address_mask >> 6);
+
/* Turning off this watch point until we set all the registers */
-   watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
+   tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,
TCP_WATCH0_CNTL,
VALID,
0);
-
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
(watch_id * TCP_WATCH_STRIDE)),
-   watch_address_cntl);
+   tcp_watch_address_cntl);
+
+   sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl,
+   SQ_WATCH0_CNTL,
+   VALID,
+   0);
+   WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) +
+   (watch_id * SQ_WATCH_STRIDE)),
+   sq_watch_address_cntl);
 
+   /* Program {TCP,SQ}_WATCH?_ADDR* */
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_high);
-
WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
(watch_id * TCP_WATCH_STRIDE)),
watch_address_low);
 
+   WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_H) +
+   (watch_id * SQ_WATCH_STRIDE)),
+   wa

Re: [PATCH] drm/amdkfd: Clear the VALU exception state in the trap handler

2023-11-09 Thread Lancelot SIX

The trap handler could be entered with pending VALU exceptions, so
clear the exception state before issuing vector instructions.

Signed-off-by: Laurent Morichetti 


Reviewed-by: Jay Cornwall 


Hi,

FYI, I tested this and it fixes the issue.

Best,
Lancelot.

Tested-by: Lancelot Six