[Why]
When running a kdump test on a machine with an R7340 card, the capture
kernel hangs because 'amdgpu_device_ip_init()' fails. The error messages
are as follows:

  '[drm:amdgpu_device_ip_init [amdgpu]] *ERROR* hw_init of IP block <si_dpm> failed -22'
  '[drm:uvd_v3_1_hw_init [amdgpu]] *ERROR* amdgpu: UVD Firmware validate fail (-22).'
  '[drm:amdgpu_device_ip_init [amdgpu]] *ERROR* hw_init of IP block <uvd_v3_1> failed -22'
  'amdgpu 0000:01:00.0: amdgpu: amdgpu_device_ip_init failed'
  'amdgpu 0000:01:00.0: amdgpu: Fatal error during GPU init'

This is because the machine is not powered off before the capture kernel
starts, so the hardware state is not reset.

[How]
Add an 'is_kdump_kernel()' check.
For the 'si_dpm' block, disable DPM first and then enable it.
For the 'uvd_v3_1' block, skip adding it during initialization.

Signed-off-by: Lu Yao <ya...@kylinos.cn>
---
During testing, I first modified 'amdgpu_device_ip_hw_init_phase*' so that
it does not return immediately when a block's hw_init fails.

Analysis shows that the 'si_dpm' block fails in 'si_dpm_enable()' at the
'amdgpu_si_is_smc_running()' check; calling 'si_dpm_disable()' beforehand
resolves this. The 'uvd_v3_1' block fails at
'uvd_v3_1_hw_init()->uvd_v3_1_fw_validate()', where the value read from
mmUVD_FW_STATUS is 0x27220102; I have not found out why. However, UVD is
not required in the capture kernel, so this block is simply not added.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
 drivers/gpu/drm/amd/amdgpu/si.c            | 6 ++++--
 drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 6 ++++++
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 137a88b8de45..52ebc24561c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -50,6 +50,7 @@
 #include <linux/hashtable.h>
 #include <linux/dma-fence.h>
 #include <linux/pci.h>
+#include <linux/crash_dump.h>
 
 #include <drm/ttm/ttm_bo.h>
 #include <drm/ttm/ttm_placement.h>
diff --git a/drivers/gpu/drm/amd/amdgpu/si.c b/drivers/gpu/drm/amd/amdgpu/si.c
index 85235470e872..fc0daed1b829 100644
--- a/drivers/gpu/drm/amd/amdgpu/si.c
+++ b/drivers/gpu/drm/amd/amdgpu/si.c
@@ -2739,7 +2739,8 @@ int si_set_ip_blocks(struct amdgpu_device *adev)
 #endif
                else
                        amdgpu_device_ip_block_add(adev, &dce_v6_0_ip_block);
-               amdgpu_device_ip_block_add(adev, &uvd_v3_1_ip_block);
+               if (!is_kdump_kernel())
+                       amdgpu_device_ip_block_add(adev, &uvd_v3_1_ip_block);
                /* amdgpu_device_ip_block_add(adev, &vce_v1_0_ip_block); */
                break;
        case CHIP_OLAND:
@@ -2757,7 +2758,8 @@ int si_set_ip_blocks(struct amdgpu_device *adev)
 #endif
                else
                        amdgpu_device_ip_block_add(adev, &dce_v6_4_ip_block);
-               amdgpu_device_ip_block_add(adev, &uvd_v3_1_ip_block);
+               if (!is_kdump_kernel())
+                       amdgpu_device_ip_block_add(adev, &uvd_v3_1_ip_block);
                /* amdgpu_device_ip_block_add(adev, &vce_v1_0_ip_block); */
                break;
        case CHIP_HAINAN:
diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
index a1baa13ab2c2..8700a22ba809 100644
--- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
+++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
@@ -1848,6 +1848,7 @@ static int si_calculate_sclk_params(struct amdgpu_device *adev,
 static void si_thermal_start_smc_fan_control(struct amdgpu_device *adev);
 static void si_fan_ctrl_set_default_mode(struct amdgpu_device *adev);
 static void si_dpm_set_irq_funcs(struct amdgpu_device *adev);
+static void si_dpm_disable(struct amdgpu_device *adev);
 
 static struct si_power_info *si_get_pi(struct amdgpu_device *adev)
 {
@@ -6811,6 +6812,11 @@ static int si_dpm_enable(struct amdgpu_device *adev)
        struct amdgpu_ps *boot_ps = adev->pm.dpm.boot_ps;
        int ret;
 
+       if (is_kdump_kernel()) {
+               si_dpm_disable(adev);
+               udelay(50);
+       }
+
        if (amdgpu_si_is_smc_running(adev))
                return -EINVAL;
        if (pi->voltage_control || si_pi->voltage_control_svi2)
-- 
2.25.1

Reply via email to