Re: [RFC v4 10/11] drm/amdgpu: Rework amdgpu_device_lock_adev

2022-02-09 Thread Christian König

Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:

This functions needs to be split into 2 parts where
one is called only once for locking single instance of
reset_domain's sem and reset flag and the other part
which handles MP1 states should still be called for
each device in XGMI hive.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 48 --
  1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e05d7cbefd2c..aaecf0797484 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4825,16 +4825,20 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
return r;
  }
  
-static void amdgpu_device_lock_adev(struct amdgpu_device *adev,

-   struct amdgpu_hive_info *hive)
+static void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
*reset_domain,
+   struct amdgpu_hive_info *hive)
  {
-   atomic_set(>reset_domain->in_gpu_reset, 1);
+   atomic_set(_domain->in_gpu_reset, 1);
  
  	if (hive) {

-   down_write_nest_lock(>reset_domain->sem, 
>hive_lock);
+   down_write_nest_lock(_domain->sem, >hive_lock);
} else {
-   down_write(>reset_domain->sem);
+   down_write(_domain->sem);
}
+}
+
+static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
+{
  
  	switch (amdgpu_asic_reset_method(adev)) {

case AMD_RESET_METHOD_MODE1:
@@ -4849,14 +4853,19 @@ static void amdgpu_device_lock_adev(struct 
amdgpu_device *adev,
}
  }
  
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)

+static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
  {
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
-   atomic_set(>reset_domain->in_gpu_reset, 0);
-   up_write(>reset_domain->sem);
  }
  
+static void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)

+{
+   atomic_set(_domain->in_gpu_reset, 0);
+   up_write(_domain->sem);
+}


I would move this into amdgpu_reset.c and call it 
amdgpu_reset_domain_unlock().


Same of course for amdgpu_reset_domain_lock()...

Apart from that looks good to me,
Christian.


+
+
  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
  {
struct pci_dev *p = NULL;
@@ -5060,10 +5069,15 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
device_list_handle = _list;
}
  
+	/* We need to lock reset domain only once both for XGMI and single device */

+   tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+   reset_list);
+   amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive);
+
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
  
-		amdgpu_device_lock_adev(tmp_adev, hive);

+   amdgpu_device_set_mp1_state(tmp_adev);
  
  		/*

 * Try to put the audio codec into suspend state
@@ -5213,9 +5227,14 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
  
  		if (audio_suspended)

amdgpu_device_resume_display_audio(tmp_adev);
-   amdgpu_device_unlock_adev(tmp_adev);
+
+   amdgpu_device_unset_mp1_state(tmp_adev);
}
  
+	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,

+   reset_list);
+   amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
if (hive) {
mutex_unlock(>hive_lock);
amdgpu_put_xgmi_hive(hive);
@@ -5477,7 +5496,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev 
*pdev, pci_channel_sta
 * Locking adev->reset_domain->sem will prevent any external 
access
 * to GPU during PCI error recovery
 */
-   amdgpu_device_lock_adev(adev, NULL);
+   amdgpu_device_lock_reset_domain(adev->reset_domain, NULL);
+   amdgpu_device_set_mp1_state(adev);
  
  		/*

 * Block any work scheduling as we do for regular GPU reset
@@ -5584,7 +5604,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev 
*pdev)
DRM_INFO("PCIe error recovery succeeded\n");
} else {
DRM_ERROR("PCIe error recovery failed, err:%d", r);
-   amdgpu_device_unlock_adev(adev);
+   amdgpu_device_unset_mp1_state(adev);
+   amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
  
  	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;

@@ -5621,7 +5642,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev)

[RFC v4 10/11] drm/amdgpu: Rework amdgpu_device_lock_adev

2022-02-08 Thread Andrey Grodzovsky
This functions needs to be split into 2 parts where
one is called only once for locking single instance of
reset_domain's sem and reset flag and the other part
which handles MP1 states should still be called for
each device in XGMI hive.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 48 --
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e05d7cbefd2c..aaecf0797484 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4825,16 +4825,20 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
return r;
 }
 
-static void amdgpu_device_lock_adev(struct amdgpu_device *adev,
-   struct amdgpu_hive_info *hive)
+static void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain 
*reset_domain,
+   struct amdgpu_hive_info *hive)
 {
-   atomic_set(>reset_domain->in_gpu_reset, 1);
+   atomic_set(_domain->in_gpu_reset, 1);
 
if (hive) {
-   down_write_nest_lock(>reset_domain->sem, 
>hive_lock);
+   down_write_nest_lock(_domain->sem, >hive_lock);
} else {
-   down_write(>reset_domain->sem);
+   down_write(_domain->sem);
}
+}
+
+static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
+{
 
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
@@ -4849,14 +4853,19 @@ static void amdgpu_device_lock_adev(struct 
amdgpu_device *adev,
}
 }
 
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
 {
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
-   atomic_set(>reset_domain->in_gpu_reset, 0);
-   up_write(>reset_domain->sem);
 }
 
+static void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain 
*reset_domain)
+{
+   atomic_set(_domain->in_gpu_reset, 0);
+   up_write(_domain->sem);
+}
+
+
 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
 {
struct pci_dev *p = NULL;
@@ -5060,10 +5069,15 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
device_list_handle = _list;
}
 
+   /* We need to lock reset domain only once both for XGMI and single 
device */
+   tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+   reset_list);
+   amdgpu_device_lock_reset_domain(tmp_adev->reset_domain, hive);
+
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 
-   amdgpu_device_lock_adev(tmp_adev, hive);
+   amdgpu_device_set_mp1_state(tmp_adev);
 
/*
 * Try to put the audio codec into suspend state
@@ -5213,9 +5227,14 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
 
if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev);
-   amdgpu_device_unlock_adev(tmp_adev);
+
+   amdgpu_device_unset_mp1_state(tmp_adev);
}
 
+   tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+   reset_list);
+   amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
if (hive) {
mutex_unlock(>hive_lock);
amdgpu_put_xgmi_hive(hive);
@@ -5477,7 +5496,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev 
*pdev, pci_channel_sta
 * Locking adev->reset_domain->sem will prevent any external 
access
 * to GPU during PCI error recovery
 */
-   amdgpu_device_lock_adev(adev, NULL);
+   amdgpu_device_lock_reset_domain(adev->reset_domain, NULL);
+   amdgpu_device_set_mp1_state(adev);
 
/*
 * Block any work scheduling as we do for regular GPU reset
@@ -5584,7 +5604,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev 
*pdev)
DRM_INFO("PCIe error recovery succeeded\n");
} else {
DRM_ERROR("PCIe error recovery failed, err:%d", r);
-   amdgpu_device_unlock_adev(adev);
+   amdgpu_device_unset_mp1_state(adev);
+   amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
 
return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
@@ -5621,7 +5642,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
drm_sched_start(>sched, true);
}
 
-   amdgpu_device_unlock_adev(adev);
+   amdgpu_device_unset_mp1_state(adev);
+   amdgpu_device_unlock_reset_domain(adev->reset_domain);
 }