Support UMC/GFX/SDMA RAS on the SR-IOV guest side

Changes since v1:
    Move the SR-IOV check into amdgpu_ras_interrupt_fatal_error_handler().

Change-Id: Ic7dda45d8f8cf2d5f1abc7705abc153d558da8a1
Signed-off-by: Stanley.Yang <stanley.y...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 42 ++++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   |  4 +++
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c     |  9 +++--
 4 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b583026dc893..ba7990d0dc0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5218,6 +5218,10 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device 
*adev,
                r = amdgpu_device_reset_sriov(adev, job ? false : true);
                if (r)
                        adev->asic_reset_res = r;
+
+               /* Aldebaran supports ras in SRIOV, so need resume ras during 
reset */
+               if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+                       amdgpu_ras_resume(adev);
        } else {
                r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
                if (r && r == -EAGAIN)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a653cf3b3d13..2b28210c4994 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -726,7 +726,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
        /* Do not enable if it is not allowed. */
        WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
 
-       if (!amdgpu_ras_intr_triggered()) {
+       /* Only enable ras feature operation handle on host side */
+       if (!amdgpu_sriov_vf(adev) &&
+               !amdgpu_ras_intr_triggered()) {
                ret = psp_ras_enable_features(&adev->psp, info, enable);
                if (ret) {
                        dev_err(adev->dev, "ras %s %s failed poison:%d 
ret:%d\n",
@@ -1523,6 +1525,10 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
  */
 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
 {
+       /* Fatal error events are handled on host side */
+       if (amdgpu_sriov_vf(adev))
+               return;
+
        if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
                return;
 
@@ -2270,10 +2276,14 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
 {
        adev->ras_hw_enabled = adev->ras_enabled = 0;
 
-       if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
+       if (!adev->is_atom_fw ||
            !amdgpu_ras_asic_supported(adev))
                return;
 
+       if (!(amdgpu_sriov_vf(adev) &&
+               (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))))
+               return;
+
        if (!adev->gmc.xgmi.connected_to_cpu) {
                if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
                        dev_info(adev->dev, "MEM ECC is active.\n");
@@ -2285,15 +2295,21 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
 
                if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
                        dev_info(adev->dev, "SRAM ECC is active.\n");
-                       adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
-                                                   1 << AMDGPU_RAS_BLOCK__DF);
-
-                       if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 
0))
-                               adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__VCN |
-                                               1 << AMDGPU_RAS_BLOCK__JPEG);
-                       else
-                               adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__VCN |
-                                               1 << AMDGPU_RAS_BLOCK__JPEG);
+                       if (!amdgpu_sriov_vf(adev)) {
+                               adev->ras_hw_enabled |= ~(1 << 
AMDGPU_RAS_BLOCK__UMC |
+                                                           1 << 
AMDGPU_RAS_BLOCK__DF);
+
+                               if (adev->ip_versions[VCN_HWIP][0] == 
IP_VERSION(2, 6, 0))
+                                       adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__VCN |
+                                                       1 << 
AMDGPU_RAS_BLOCK__JPEG);
+                               else
+                                       adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__VCN |
+                                                       1 << 
AMDGPU_RAS_BLOCK__JPEG);
+                       } else {
+                               adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+                                                               1 << 
AMDGPU_RAS_BLOCK__SDMA |
+                                                               1 << 
AMDGPU_RAS_BLOCK__GFX);
+                       }
                } else {
                        dev_info(adev->dev, "SRAM ECC is not presented.\n");
                }
@@ -2637,6 +2653,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
        struct amdgpu_ras_block_object *obj;
        int r;
 
+       /* Guest side doesn't need init ras feature */
+       if (amdgpu_sriov_vf(adev))
+               return 0;
+
        list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
                if (!node->ras_obj) {
                        dev_warn(adev->dev, "Warning: abnormal ras list 
node.\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 8e221a1ba937..42c1f050542f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -124,6 +124,10 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device 
*adev,
                struct amdgpu_iv_entry *entry)
 {
        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
+       if (amdgpu_sriov_vf(adev))
+               return AMDGPU_RAS_SUCCESS;
+
        amdgpu_ras_reset_gpu(adev);
 
        return AMDGPU_RAS_SUCCESS;
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index d6d79e97def9..18014ed0e853 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -85,9 +85,12 @@ static int psp_v13_0_init_microcode(struct psp_context *psp)
                err = psp_init_sos_microcode(psp, chip_name);
                if (err)
                        return err;
-               err = psp_init_ta_microcode(&adev->psp, chip_name);
-               if (err)
-                       return err;
+               /* It's not necessary to load ras ta on Guest side */
+               if (!amdgpu_sriov_vf(adev)) {
+                       err = psp_init_ta_microcode(&adev->psp, chip_name);
+                       if (err)
+                               return err;
+               }
                break;
        case IP_VERSION(13, 0, 1):
        case IP_VERSION(13, 0, 3):
-- 
2.17.1

Reply via email to