Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls

2022-11-24 Thread Felix Kuehling

Am 2022-11-24 um 09:58 schrieb Kim, Jonathan:

[AMD Official Use Only - General]


-Original Message-
From: Kuehling, Felix 
Sent: November 22, 2022 6:59 PM
To: Kim, Jonathan ; amd-
g...@lists.freedesktop.org
Subject: Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode
enable and disable calls


On 2022-10-31 12:23, Jonathan Kim wrote:

On GFX9.4.1, the implicit wait count instruction on s_barrier is
disabled by default in the driver during normal operation for
performance requirements.

There is a hardware bug in GFX9.4.1 where if the implicit wait count
instruction after an s_barrier instruction is disabled, any wave that
hits an exception may step over the s_barrier when returning from the
trap handler with the barrier logic having no ability to be
aware of this, thereby causing other waves to wait at the barrier
indefinitely resulting in a shader hang.  This bug has been corrected
for GFX9.4.2 and onward.

Since the debugger subscribes to hardware exceptions, in order to avoid
this bug, the debugger must enable implicit wait count on s_barrier
for a debug session and disable it on detach.

In order to change this setting in the device global SQ_CONFIG
register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
will either dispatch work through the compute ring buffers used for
image post processing or through the hardware scheduler by the KFD.

Have the KGD suspend and drain the compute ring buffer, then suspend

the

hardware scheduler and block any future KFD process job requests before
changing the implicit wait count setting.  Once set, resume all work.

Signed-off-by: Jonathan Kim 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   3 +
   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 105

+-

   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |   4 +-
   drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   2 +-
   4 files changed, 110 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 0e6ddf05c23c..9f2499f52d2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1034,6 +1034,9 @@ struct amdgpu_device {
 struct pci_saved_state  *pci_state;
 pci_channel_state_t pci_channel_state;

+   /* Track auto wait count on s_barrier settings */
+   boolbarrier_has_auto_waitcnt;
+
 struct amdgpu_reset_control *reset_cntl;
 uint32_t

ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c

b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c

index 4191af5a3f13..13f02a0aa828 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -26,6 +26,7 @@
   #include "amdgpu.h"
   #include "amdgpu_amdkfd.h"
   #include "amdgpu_amdkfd_arcturus.h"
+#include "amdgpu_reset.h"
   #include "sdma0/sdma0_4_2_2_offset.h"
   #include "sdma0/sdma0_4_2_2_sh_mask.h"
   #include "sdma1/sdma1_4_2_2_offset.h"
@@ -48,6 +49,8 @@
   #include "amdgpu_amdkfd_gfx_v9.h"
   #include "gfxhub_v1_0.h"
   #include "mmhub_v9_4.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"

   #define HQD_N_REGS 56
   #define DUMP_REG(addr) do {   \
@@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct

amdgpu_device *adev, void *mqd,

 return 0;
   }

+/*
+ * Helper used to suspend/resume gfx pipe for image post process work

to set

+ * barrier behaviour.
+ */
+static int suspend_resume_compute_scheduler(struct amdgpu_device

*adev, bool suspend)

+{
+   int i, r = 0;
+
+   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+   struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
+
+   if (!(ring && ring->sched.thread))
+   continue;
+
+   /* stop scheduler and drain ring. */
+   if (suspend) {
+   drm_sched_stop(&ring->sched, NULL);
+   r = amdgpu_fence_wait_empty(ring);
+   if (r)
+   goto out;
+   } else {
+   drm_sched_start(&ring->sched, false);
+   }
+   }
+
+out:
+   /* return on resume or failure to drain rings. */
+   if (!suspend || r)
+   return r;
+
+   return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
+}
+
+static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool

enable_waitcnt)

+{
+   uint32_t data;
+
+   WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
+
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return;
+
+   amdgpu_amdkfd_suspend(adev, false);
+
+   if (suspend_resume_compute_scheduler(adev, true))
+   goto out;
+
+   data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
+   data = REG_S

RE: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls

2022-11-24 Thread Kim, Jonathan
[AMD Official Use Only - General]

> -Original Message-
> From: Kuehling, Felix 
> Sent: November 22, 2022 6:59 PM
> To: Kim, Jonathan ; amd-
> g...@lists.freedesktop.org
> Subject: Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode
> enable and disable calls
>
>
> On 2022-10-31 12:23, Jonathan Kim wrote:
> > On GFX9.4.1, the implicit wait count instruction on s_barrier is
> > disabled by default in the driver during normal operation for
> > performance requirements.
> >
> > There is a hardware bug in GFX9.4.1 where if the implicit wait count
> > instruction after an s_barrier instruction is disabled, any wave that
> > hits an exception may step over the s_barrier when returning from the
> > trap handler with the barrier logic having no ability to be
> > aware of this, thereby causing other waves to wait at the barrier
> > indefinitely resulting in a shader hang.  This bug has been corrected
> > for GFX9.4.2 and onward.
> >
> > Since the debugger subscribes to hardware exceptions, in order to avoid
> > this bug, the debugger must enable implicit wait count on s_barrier
> > for a debug session and disable it on detach.
> >
> > In order to change this setting in the device global SQ_CONFIG
> > register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
> > will either dispatch work through the compute ring buffers used for
> > image post processing or through the hardware scheduler by the KFD.
> >
> > Have the KGD suspend and drain the compute ring buffer, then suspend
> the
> > hardware scheduler and block any future KFD process job requests before
> > changing the implicit wait count setting.  Once set, resume all work.
> >
> > Signed-off-by: Jonathan Kim 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   3 +
> >   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 105
> +-
> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |   4 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   2 +-
> >   4 files changed, 110 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 0e6ddf05c23c..9f2499f52d2c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1034,6 +1034,9 @@ struct amdgpu_device {
> > struct pci_saved_state  *pci_state;
> > pci_channel_state_t pci_channel_state;
> >
> > +   /* Track auto wait count on s_barrier settings */
> > +   boolbarrier_has_auto_waitcnt;
> > +
> > struct amdgpu_reset_control *reset_cntl;
> > uint32_t
> ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > index 4191af5a3f13..13f02a0aa828 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > @@ -26,6 +26,7 @@
> >   #include "amdgpu.h"
> >   #include "amdgpu_amdkfd.h"
> >   #include "amdgpu_amdkfd_arcturus.h"
> > +#include "amdgpu_reset.h"
> >   #include "sdma0/sdma0_4_2_2_offset.h"
> >   #include "sdma0/sdma0_4_2_2_sh_mask.h"
> >   #include "sdma1/sdma1_4_2_2_offset.h"
> > @@ -48,6 +49,8 @@
> >   #include "amdgpu_amdkfd_gfx_v9.h"
> >   #include "gfxhub_v1_0.h"
> >   #include "mmhub_v9_4.h"
> > +#include "gc/gc_9_0_offset.h"
> > +#include "gc/gc_9_0_sh_mask.h"
> >
> >   #define HQD_N_REGS 56
> >   #define DUMP_REG(addr) do {   \
> > @@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct
> amdgpu_device *adev, void *mqd,
> > return 0;
> >   }
> >
> > +/*
> > + * Helper used to suspend/resume gfx pipe for image post process work
> to set
> > + * barrier behaviour.
> > + */
> > +static int suspend_resume_compute_scheduler(struct amdgpu_device
> *adev, bool suspend)
> > +{
> > +   int i, r = 0;
> > +
> > +   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> > +   struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
> > +
> > +   if (!(ring && ring->sched.thread))
> > +   continue;
> > +
> > +   /* stop scheduler and drain ring. */
> > +   if (suspend) {
> > +   

Re: [PATCH 07/29] drm/amdgpu: add gfx9.4.1 hw debug mode enable and disable calls

2022-11-22 Thread Felix Kuehling



On 2022-10-31 12:23, Jonathan Kim wrote:

On GFX9.4.1, the implicit wait count instruction on s_barrier is
disabled by default in the driver during normal operation for
performance requirements.

There is a hardware bug in GFX9.4.1 where if the implicit wait count
instruction after an s_barrier instruction is disabled, any wave that
hits an exception may step over the s_barrier when returning from the
trap handler with the barrier logic having no ability to be
aware of this, thereby causing other waves to wait at the barrier
indefinitely resulting in a shader hang.  This bug has been corrected
for GFX9.4.2 and onward.

Since the debugger subscribes to hardware exceptions, in order to avoid
this bug, the debugger must enable implicit wait count on s_barrier
for a debug session and disable it on detach.

In order to change this setting in the device global SQ_CONFIG
register, the GFX pipeline must be idle.  GFX9.4.1 as a compute device
will either dispatch work through the compute ring buffers used for
image post processing or through the hardware scheduler by the KFD.

Have the KGD suspend and drain the compute ring buffer, then suspend the
hardware scheduler and block any future KFD process job requests before
changing the implicit wait count setting.  Once set, resume all work.

Signed-off-by: Jonathan Kim 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   3 +
  .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   | 105 +-
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |   4 +-
  drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   2 +-
  4 files changed, 110 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0e6ddf05c23c..9f2499f52d2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1034,6 +1034,9 @@ struct amdgpu_device {
struct pci_saved_state  *pci_state;
pci_channel_state_t pci_channel_state;
  
+	/* Track auto wait count on s_barrier settings */

+   boolbarrier_has_auto_waitcnt;
+
struct amdgpu_reset_control *reset_cntl;
uint32_t
ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c

index 4191af5a3f13..13f02a0aa828 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -26,6 +26,7 @@
  #include "amdgpu.h"
  #include "amdgpu_amdkfd.h"
  #include "amdgpu_amdkfd_arcturus.h"
+#include "amdgpu_reset.h"
  #include "sdma0/sdma0_4_2_2_offset.h"
  #include "sdma0/sdma0_4_2_2_sh_mask.h"
  #include "sdma1/sdma1_4_2_2_offset.h"
@@ -48,6 +49,8 @@
  #include "amdgpu_amdkfd_gfx_v9.h"
  #include "gfxhub_v1_0.h"
  #include "mmhub_v9_4.h"
+#include "gc/gc_9_0_offset.h"
+#include "gc/gc_9_0_sh_mask.h"
  
  #define HQD_N_REGS 56

  #define DUMP_REG(addr) do {   \
@@ -276,6 +279,104 @@ int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device 
*adev, void *mqd,
return 0;
  }
  
+/*

+ * Helper used to suspend/resume gfx pipe for image post process work to set
+ * barrier behaviour.
+ */
+static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool 
suspend)
+{
+   int i, r = 0;
+
+   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+   struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
+
+   if (!(ring && ring->sched.thread))
+   continue;
+
+   /* stop scheduler and drain ring. */
+   if (suspend) {
+   drm_sched_stop(&ring->sched, NULL);
+   r = amdgpu_fence_wait_empty(ring);
+   if (r)
+   goto out;
+   } else {
+   drm_sched_start(&ring->sched, false);
+   }
+   }
+
+out:
+   /* return on resume or failure to drain rings. */
+   if (!suspend || r)
+   return r;
+
+   return amdgpu_device_ip_wait_for_idle(adev, GC_HWIP);
+}
+
+static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool 
enable_waitcnt)
+{
+   uint32_t data;
+
+   WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt);
+
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return;
+
+   amdgpu_amdkfd_suspend(adev, false);
+
+   if (suspend_resume_compute_scheduler(adev, true))
+   goto out;
+
+   data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG));
+   data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT,
+   enable_waitcnt ? 0 : 1);
+   WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data);
+
+out:
+   suspend_resume_compute_scheduler(adev, false);
+
+   amdgpu_amdkfd_resume(adev, false);
+
+   up_read(&adev->reset_domain->sem);
+}