Re: [RFC v4 08/11] drm/amdgpu: Move reset sem into reset_domain

2022-02-08 Thread Christian König

Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:

We want a single instance of the reset semaphore shared across all
reset clients because, in the XGMI case, we must stop cross-device
MMIO access, since any of the devices could be in the middle of a
reset at that moment.

Signed-off-by: Andrey Grodzovsky 


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   | 10 
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 23 +--
  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 18 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  6 +++--
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 ++-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  4 ++--
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  4 ++--
  10 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cb9764513df8..ddfbcc8fd3d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1058,7 +1058,6 @@ struct amdgpu_device {
  
  	atomic_t 			in_gpu_reset;

enum pp_mp1_state   mp1_state;
-   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
  
  	struct mutex			notifier_lock;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 25e2e5bf90eb..c3728061d65a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -37,6 +37,8 @@
  #include "amdgpu_fw_attestation.h"
  #include "amdgpu_umr.h"
  
+#include "amdgpu_reset.h"

+
  #if defined(CONFIG_DEBUG_FS)
  
  /**

@@ -1279,7 +1281,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_write_killable(&adev->reset_sem);
+   r = down_write_killable(&adev->reset_domain->sem);
if (r)
return r;
  
@@ -1308,7 +1310,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)

kthread_unpark(ring->sched.thread);
}
  
-	up_write(&adev->reset_sem);

+   up_write(&adev->reset_domain->sem);
  
  	pm_runtime_mark_last_busy(dev->dev);

pm_runtime_put_autosuspend(dev->dev);
@@ -1517,7 +1519,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_read_killable(&adev->reset_sem);
+   r = down_read_killable(&adev->reset_domain->sem);
if (r)
goto pro_end;
  
@@ -1560,7 +1562,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)

/* restart the scheduler */
kthread_unpark(ring->sched.thread);
  
-	up_read(&adev->reset_sem);

+   up_read(&adev->reset_domain->sem);
  
  	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index d61bc0a0457c..dcbb175d336f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -424,10 +424,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device 
*adev)
 * the lock.
 */
if (in_task()) {
-   if (down_read_trylock(&adev->reset_sem))
-   up_read(&adev->reset_sem);
+   if (down_read_trylock(&adev->reset_domain->sem))
+   up_read(&adev->reset_domain->sem);
else
-   lockdep_assert_held(&adev->reset_sem);
+   lockdep_assert_held(&adev->reset_domain->sem);
}
  #endif
return false;
@@ -453,9 +453,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(&adev->reset_sem)) {
+   down_read_trylock(&adev->reset_domain->sem)) {
ret = amdgpu_kiq_rreg(adev, reg);
-   up_read(&adev->reset_sem);
+   up_read(&adev->reset_domain->sem);
} else {
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -538,9 +538,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(&adev->reset_sem)) {
+   down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_kiq_wreg(adev, reg, v);
-   up_read(&adev->reset_sem);
+   up_read(&adev->reset_domain->sem);
} 

[RFC v4 08/11] drm/amdgpu: Move reset sem into reset_domain

2022-02-08 Thread Andrey Grodzovsky
We want a single instance of the reset semaphore shared across all
reset clients because, in the XGMI case, we must stop cross-device
MMIO access, since any of the devices could be in the middle of a
reset at that moment.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c   | 10 
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 23 +--
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 18 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  6 +++--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 ++-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c |  4 ++--
 10 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cb9764513df8..ddfbcc8fd3d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1058,7 +1058,6 @@ struct amdgpu_device {
 
atomic_t in_gpu_reset;
enum pp_mp1_state   mp1_state;
-   struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
 
struct mutex notifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 25e2e5bf90eb..c3728061d65a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -37,6 +37,8 @@
 #include "amdgpu_fw_attestation.h"
 #include "amdgpu_umr.h"
 
+#include "amdgpu_reset.h"
+
 #if defined(CONFIG_DEBUG_FS)
 
 /**
@@ -1279,7 +1281,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   r = down_write_killable(&adev->reset_sem);
+   r = down_write_killable(&adev->reset_domain->sem);
if (r)
return r;
 
@@ -1308,7 +1310,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
kthread_unpark(ring->sched.thread);
}
 
-   up_write(&adev->reset_sem);
+   up_write(&adev->reset_domain->sem);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1517,7 +1519,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
 
/* Avoid accidently unparking the sched thread during GPU reset */
-   r = down_read_killable(&adev->reset_sem);
+   r = down_read_killable(&adev->reset_domain->sem);
if (r)
goto pro_end;
 
@@ -1560,7 +1562,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
 
-   up_read(&adev->reset_sem);
+   up_read(&adev->reset_domain->sem);
 
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d61bc0a0457c..dcbb175d336f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -424,10 +424,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device 
*adev)
 * the lock.
 */
if (in_task()) {
-   if (down_read_trylock(&adev->reset_sem))
-   up_read(&adev->reset_sem);
+   if (down_read_trylock(&adev->reset_domain->sem))
+   up_read(&adev->reset_domain->sem);
else
-   lockdep_assert_held(&adev->reset_sem);
+   lockdep_assert_held(&adev->reset_domain->sem);
}
 #endif
return false;
@@ -453,9 +453,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(&adev->reset_sem)) {
+   down_read_trylock(&adev->reset_domain->sem)) {
ret = amdgpu_kiq_rreg(adev, reg);
-   up_read(&adev->reset_sem);
+   up_read(&adev->reset_domain->sem);
} else {
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -538,9 +538,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
-   down_read_trylock(&adev->reset_sem)) {
+   down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_kiq_wreg(adev, reg, v);
-   up_read(&adev->reset_sem);
+   up_read(&adev->reset_domain->sem);
} else {
writel(v, ((void