Re: [PATCH] drm/amdgpu: Use per device reset_domain for XGMI on sriov configuration

2022-09-07 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
From: amd-gfx  on behalf of Liu, Shaoyun 

Date: Thursday, September 8, 2022 at 08:05
To: amd-gfx@lists.freedesktop.org 
Subject: RE: [PATCH] drm/amdgpu: Use per device reset_domain for XGMI on sriov 
configuration
[AMD Official Use Only - General]

[AMD Official Use Only - General]

ping

-----Original Message-----
From: Liu, Shaoyun 
Sent: Wednesday, September 7, 2022 11:38 AM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Shaoyun 
Subject: [PATCH] drm/amdgpu: Use per device reset_domain for XGMI on sriov 
configuration

For an SRIOV configuration, the host driver controls the reset method (either
FLR or a heavier chain reset). The host notifies each guest individually with
an FLR message if an individual GPU within the hive needs to be reset. So on
the guest side, there is no need to use hive->reset_domain to replace the
original per-device reset_domain.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 +-
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62b26f0e37b0..a5533e0d9d6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2453,17 +2453,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (amdgpu_xgmi_add_device(adev) == 0) {
struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);

-   if (!hive->reset_domain ||
-   !amdgpu_reset_get_reset_domain(hive->reset_domain)) 
{
-   r = -ENOENT;
+   if(!amdgpu_sriov_vf(adev)) {
+   if (!hive->reset_domain ||
+   
!amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+   r = -ENOENT;
+   amdgpu_put_xgmi_hive(hive);
+   goto init_failed;
+   }
+
+   /* Drop the early temporary reset domain we 
created for device */
+   
amdgpu_reset_put_reset_domain(adev->reset_domain);
+   adev->reset_domain = hive->reset_domain;
amdgpu_put_xgmi_hive(hive);
-   goto init_failed;
}
-
-   /* Drop the early temporary reset domain we created for 
device */
-   amdgpu_reset_put_reset_domain(adev->reset_domain);
-   adev->reset_domain = hive->reset_domain;
-   amdgpu_put_xgmi_hive(hive);
}
}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index d3b483aa81f8..a78b589e4f4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -391,24 +391,32 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
goto pro_end;
}

+   /**
+* Only init hive->reset_domain for none SRIOV configuration. For SRIOV,
+* Host driver decide how to reset the GPU either through FLR or chain 
reset.
+* Guest side will get individual notifications from the host for the 
FLR
+* if necessary.
+*/
+   if (!amdgpu_sriov_vf(adev)) {
/**
 * Avoid recreating reset domain when hive is reconstructed for the case
-* of reset the devices in the XGMI hive during probe for SRIOV
+* of reset the devices in the XGMI hive during probe for passthrough GPU
 * See https://www.spinics.net/lists/amd-gfx/msg58836.html
 */
-   if (adev->reset_domain->type != XGMI_HIVE) {
-   hive->reset_domain = 
amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
-   if (!hive->reset_domain) {
-   dev_err(adev->dev, "XGMI: failed initializing 
reset domain for xgmi hive\n");
-   ret = -ENOMEM;
-   kobject_put(&hive->kobj);
-   kfree(hive);
-   hive = NULL;
-   goto pro_end;
-

RE: [PATCH] drm/amdgpu: Use per device reset_domain for XGMI on sriov configuration

2022-09-07 Thread Liu, Shaoyun
[AMD Official Use Only - General]

ping

-----Original Message-----
From: Liu, Shaoyun 
Sent: Wednesday, September 7, 2022 11:38 AM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Shaoyun 
Subject: [PATCH] drm/amdgpu: Use per device reset_domain for XGMI on sriov 
configuration

For an SRIOV configuration, the host driver controls the reset method (either
FLR or a heavier chain reset). The host notifies each guest individually with
an FLR message if an individual GPU within the hive needs to be reset. So on
the guest side, there is no need to use hive->reset_domain to replace the
original per-device reset_domain.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 +-
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62b26f0e37b0..a5533e0d9d6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2453,17 +2453,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (amdgpu_xgmi_add_device(adev) == 0) {
struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);

-   if (!hive->reset_domain ||
-   !amdgpu_reset_get_reset_domain(hive->reset_domain)) 
{
-   r = -ENOENT;
+   if(!amdgpu_sriov_vf(adev)) {
+   if (!hive->reset_domain ||
+   
!amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+   r = -ENOENT;
+   amdgpu_put_xgmi_hive(hive);
+   goto init_failed;
+   }
+
+   /* Drop the early temporary reset domain we 
created for device */
+   
amdgpu_reset_put_reset_domain(adev->reset_domain);
+   adev->reset_domain = hive->reset_domain;
amdgpu_put_xgmi_hive(hive);
-   goto init_failed;
}
-
-   /* Drop the early temporary reset domain we created for 
device */
-   amdgpu_reset_put_reset_domain(adev->reset_domain);
-   adev->reset_domain = hive->reset_domain;
-   amdgpu_put_xgmi_hive(hive);
}
}

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index d3b483aa81f8..a78b589e4f4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -391,24 +391,32 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
goto pro_end;
}

+   /**
+* Only init hive->reset_domain for none SRIOV configuration. For SRIOV,
+* Host driver decide how to reset the GPU either through FLR or chain 
reset.
+* Guest side will get individual notifications from the host for the 
FLR
+* if necessary.
+*/
+   if (!amdgpu_sriov_vf(adev)) {
/**
 * Avoid recreating reset domain when hive is reconstructed for the case
-* of reset the devices in the XGMI hive during probe for SRIOV
+* of reset the devices in the XGMI hive during probe for passthrough GPU
 * See https://www.spinics.net/lists/amd-gfx/msg58836.html
 */
-   if (adev->reset_domain->type != XGMI_HIVE) {
-   hive->reset_domain = 
amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
-   if (!hive->reset_domain) {
-   dev_err(adev->dev, "XGMI: failed initializing 
reset domain for xgmi hive\n");
-   ret = -ENOMEM;
-   kobject_put(&hive->kobj);
-   kfree(hive);
-   hive = NULL;
-   goto pro_end;
-   }
-   } else {
-   amdgpu_reset_get_reset_domain(adev->reset_domain);
-   hive->reset_domain = adev->reset_domain;
+   if (adev->reset_domain->type != XGMI_HIVE) {
+   hive->reset_domain = 
amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
+   if (!hive->reset_domain) {
+   dev_err(adev->dev, "XGMI: failed 
initializing reset domain for xgmi hive\n");
+   ret = -ENOMEM;
+   kobject_put(&hive->kobj);
+

[PATCH] drm/amdgpu: Use per device reset_domain for XGMI on sriov configuration

2022-09-07 Thread shaoyunl
For an SRIOV configuration, the host driver controls the reset method (either
FLR or a heavier chain reset). The host notifies each guest individually with
an FLR message if an individual GPU within the hive needs to be reset. So on
the guest side, there is no need to use hive->reset_domain to replace the
original per-device reset_domain.

Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 +-
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62b26f0e37b0..a5533e0d9d6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2453,17 +2453,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (amdgpu_xgmi_add_device(adev) == 0) {
struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);
 
-   if (!hive->reset_domain ||
-   !amdgpu_reset_get_reset_domain(hive->reset_domain)) 
{
-   r = -ENOENT;
+   if(!amdgpu_sriov_vf(adev)) {
+   if (!hive->reset_domain ||
+   
!amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+   r = -ENOENT;
+   amdgpu_put_xgmi_hive(hive);
+   goto init_failed;
+   }
+
+   /* Drop the early temporary reset domain we 
created for device */
+   
amdgpu_reset_put_reset_domain(adev->reset_domain);
+   adev->reset_domain = hive->reset_domain;
amdgpu_put_xgmi_hive(hive);
-   goto init_failed;
}
-
-   /* Drop the early temporary reset domain we created for 
device */
-   amdgpu_reset_put_reset_domain(adev->reset_domain);
-   adev->reset_domain = hive->reset_domain;
-   amdgpu_put_xgmi_hive(hive);
}
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index d3b483aa81f8..a78b589e4f4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -391,24 +391,32 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct 
amdgpu_device *adev)
goto pro_end;
}
 
+   /**
+* Only init hive->reset_domain for none SRIOV configuration. For SRIOV,
+* Host driver decide how to reset the GPU either through FLR or chain 
reset.
+* Guest side will get individual notifications from the host for the 
FLR
+* if necessary.
+*/
+   if (!amdgpu_sriov_vf(adev)) {
/**
 * Avoid recreating reset domain when hive is reconstructed for the case
-* of reset the devices in the XGMI hive during probe for SRIOV
+* of reset the devices in the XGMI hive during probe for passthrough 
GPU
 * See https://www.spinics.net/lists/amd-gfx/msg58836.html
 */
-   if (adev->reset_domain->type != XGMI_HIVE) {
-   hive->reset_domain = 
amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
-   if (!hive->reset_domain) {
-   dev_err(adev->dev, "XGMI: failed initializing 
reset domain for xgmi hive\n");
-   ret = -ENOMEM;
-   kobject_put(&hive->kobj);
-   kfree(hive);
-   hive = NULL;
-   goto pro_end;
-   }
-   } else {
-   amdgpu_reset_get_reset_domain(adev->reset_domain);
-   hive->reset_domain = adev->reset_domain;
+   if (adev->reset_domain->type != XGMI_HIVE) {
+   hive->reset_domain = 
amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
+   if (!hive->reset_domain) {
+   dev_err(adev->dev, "XGMI: failed 
initializing reset domain for xgmi hive\n");
+   ret = -ENOMEM;
+   kobject_put(&hive->kobj);
+   kfree(hive);
+   hive = NULL;
+   goto pro_end;
+   }
+   } else {
+   amdgpu_reset_get_reset_domain(adev->reset_domain);
+   hive->reset_domain = adev->reset_domain;
+   }
}
 
hive->hive_id = adev->gmc.xgmi.hive_id;
-- 
2.17.1