Am 2023-07-07 um 10:14 schrieb Philip Yang:
Retry faults are delegated to IH soft ring and then processed by
deferred worker. Current IH soft ring size PAGE_SIZE can store 128
entries, which may overflow and drop retry faults, causes HW stucks
because the retry fault is not recovered.

Increase IH soft ring size to the same size as IH ring, define macro
IH_RING_SIZE to remove duplicate constant.

As discussed offline, dropping retry fault interrupts is only a problem when the CAM is enabled. You only need as many entries in the soft IH ring as there are entries in the CAM.

Regards,
  Felix



Show warning message if IH soft ring overflows because this should not
happen any more.

Signed-off-by: Philip Yang <philip.y...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 8 ++++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +-
  drivers/gpu/drm/amd/amdgpu/ih_v6_0.c    | 5 +++--
  drivers/gpu/drm/amd/amdgpu/navi10_ih.c  | 5 +++--
  drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 5 +++--
  drivers/gpu/drm/amd/amdgpu/vega20_ih.c  | 5 +++--
  7 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index fceb3b384955..51a0dbd2358a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -138,6 +138,7 @@ void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct 
amdgpu_ih_ring *ih)
  /**
   * amdgpu_ih_ring_write - write IV to the ring buffer
   *
+ * @adev: amdgpu_device pointer
   * @ih: ih ring to write to
   * @iv: the iv to write
   * @num_dw: size of the iv in dw
@@ -145,8 +146,8 @@ void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct 
amdgpu_ih_ring *ih)
   * Writes an IV to the ring buffer using the CPU and increment the wptr.
   * Used for testing and delegating IVs to a software ring.
   */
-void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
-                         unsigned int num_dw)
+void amdgpu_ih_ring_write(struct amdgpu_device *adev, struct amdgpu_ih_ring 
*ih,
+                         const uint32_t *iv, unsigned int num_dw)
  {
        uint32_t wptr = le32_to_cpu(*ih->wptr_cpu) >> 2;
        unsigned int i;
@@ -161,6 +162,9 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const 
uint32_t *iv,
        if (wptr != READ_ONCE(ih->rptr)) {
                wmb();
                WRITE_ONCE(*ih->wptr_cpu, cpu_to_le32(wptr));
+       } else {
+               dev_warn(adev->dev, "IH soft ring buffer overflow 0x%X, 0x%X\n",
+                        wptr, ih->rptr);
        }
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
index dd1c2eded6b9..a8cf67f1f011 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
@@ -97,8 +97,8 @@ struct amdgpu_ih_funcs {
  int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
                        unsigned ring_size, bool use_bus_addr);
  void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring 
*ih);
-void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
-                         unsigned int num_dw);
+void amdgpu_ih_ring_write(struct amdgpu_device *adev, struct amdgpu_ih_ring 
*ih,
+                         const uint32_t *iv, unsigned int num_dw);
  int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
                                            struct amdgpu_ih_ring *ih);
  int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 5273decc5753..fa6d0adcec20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -493,7 +493,7 @@ void amdgpu_irq_delegate(struct amdgpu_device *adev,
                         struct amdgpu_iv_entry *entry,
                         unsigned int num_dw)
  {
-       amdgpu_ih_ring_write(&adev->irq.ih_soft, entry->iv_entry, num_dw);
+       amdgpu_ih_ring_write(adev, &adev->irq.ih_soft, entry->iv_entry, num_dw);
        schedule_work(&adev->irq.ih_soft_work);
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
index b02e1cef78a7..21d2e57cffe7 100644
--- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
@@ -32,6 +32,7 @@
  #include "soc15_common.h"
  #include "ih_v6_0.h"
+#define IH_RING_SIZE (256 * 1024)
  #define MAX_REARM_RETRY 10
static void ih_v6_0_set_interrupt_funcs(struct amdgpu_device *adev);
@@ -535,7 +536,7 @@ static int ih_v6_0_sw_init(void *handle)
         * use bus address for ih ring by psp bl */
        use_bus_addr =
                (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) ? false : true;
-       r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 * 1024, use_bus_addr);
+       r = amdgpu_ih_ring_init(adev, &adev->irq.ih, IH_RING_SIZE, 
use_bus_addr);
        if (r)
                return r;
@@ -548,7 +549,7 @@ static int ih_v6_0_sw_init(void *handle)
        /* initialize ih control register offset */
        ih_v6_0_init_register_offset(adev);
- r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, PAGE_SIZE, true);
+       r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_RING_SIZE, true);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
index eec13cb5bf75..df33db6fd07b 100644
--- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
@@ -32,6 +32,7 @@
  #include "soc15_common.h"
  #include "navi10_ih.h"
+#define IH_RING_SIZE (256 * 1024)
  #define MAX_REARM_RETRY 10
#define mmIH_CHICKEN_Sienna_Cichlid 0x018d
@@ -565,7 +566,7 @@ static int navi10_ih_sw_init(void *handle)
                use_bus_addr = false;
        else
                use_bus_addr = true;
-       r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 * 1024, use_bus_addr);
+       r = amdgpu_ih_ring_init(adev, &adev->irq.ih, IH_RING_SIZE, 
use_bus_addr);
        if (r)
                return r;
@@ -578,7 +579,7 @@ static int navi10_ih_sw_init(void *handle)
        /* initialize ih control registers offset */
        navi10_ih_init_register_offset(adev);
- r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, PAGE_SIZE, true);
+       r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_RING_SIZE, true);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index 1e83db0c5438..c9b37983a18d 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -33,6 +33,7 @@
  #include "soc15_common.h"
  #include "vega10_ih.h"
+#define IH_RING_SIZE (256 * 1024)
  #define MAX_REARM_RETRY 10
static void vega10_ih_set_interrupt_funcs(struct amdgpu_device *adev);
@@ -485,7 +486,7 @@ static int vega10_ih_sw_init(void *handle)
        if (r)
                return r;
- r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 * 1024, true);
+       r = amdgpu_ih_ring_init(adev, &adev->irq.ih, IH_RING_SIZE, true);
        if (r)
                return r;
@@ -510,7 +511,7 @@ static int vega10_ih_sw_init(void *handle)
        /* initialize ih control registers offset */
        vega10_ih_init_register_offset(adev);
- r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, PAGE_SIZE, true);
+       r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_RING_SIZE, true);
        if (r)
                return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
index 4d719df376a7..06d4176e4c68 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
@@ -33,6 +33,7 @@
  #include "soc15_common.h"
  #include "vega20_ih.h"
+#define IH_RING_SIZE (256 * 1024)
  #define MAX_REARM_RETRY 10
#define mmIH_CHICKEN_ALDEBARAN 0x18d
@@ -539,7 +540,7 @@ static int vega20_ih_sw_init(void *handle)
            (adev->ip_versions[OSSSYS_HWIP][0] == IP_VERSION(4, 4, 2)))
                use_bus_addr = false;
- r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 * 1024, use_bus_addr);
+       r = amdgpu_ih_ring_init(adev, &adev->irq.ih, IH_RING_SIZE, 
use_bus_addr);
        if (r)
                return r;
@@ -565,7 +566,7 @@ static int vega20_ih_sw_init(void *handle)
        /* initialize ih control registers offset */
        vega20_ih_init_register_offset(adev);
- r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, PAGE_SIZE, use_bus_addr);
+       r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft, IH_RING_SIZE, 
use_bus_addr);
        if (r)
                return r;

Reply via email to