Re: [RFC v4] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-08 Thread Christian König




Am 08.02.22 um 17:19 schrieb Andrey Grodzovsky:


On 2022-02-08 06:25, Lazar, Lijo wrote:



On 2/2/2022 10:56 PM, Andrey Grodzovsky wrote:

The reset domain contains register access semaphor
now and so needs to be present as long as each device
in a hive needs it and so it cannot be binded to XGMI
hive life cycle.
Adress this by making reset domain refcounted and pointed
by each member of the hive and the hive itself.

v4:
Fix crash on boot with XGMI hive by adding type to reset_domain.
XGMI will only create a new reset_domain if prevoius was of single
device type meaning it's first boot. Otherwsie it will take a
refocunt to exsiting reset_domain from the amdgou device.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  6 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 
+-

  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 38 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 18 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  4 +-
  9 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 8e96b9a14452..f2ba460bfd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  -struct amdgpu_reset_domain {
-    struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
    struct amdgpu_device {
  struct device    *dev;
@@ -1102,7 +1100,7 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
  -    struct amdgpu_reset_domain    reset_domain;
+    struct amdgpu_reset_domain    *reset_domain;
  };
    static inline struct amdgpu_device *drm_to_adev(struct 
drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index fef952ca8db5..cd1b7af69c35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2313,7 +2313,7 @@ static int 
amdgpu_device_init_schedulers(struct amdgpu_device *adev)

    r = drm_sched_init(>sched, _sched_ops,
 ring->num_hw_submission, amdgpu_job_hang_limit,
-   timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+   timeout, adev->reset_domain->wq, 
ring->sched_score, ring->name);

  if (r) {
  DRM_ERROR("Failed to create scheduler on ring %s.\n",
    ring->name);
@@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct 
amdgpu_device *adev)

  if (r)
  goto init_failed;
  +    /**
+ * In case of XGMI grab extra reference for reset domain for 
this device

+ */
  if (adev->gmc.xgmi.num_physical_nodes > 1) {
-    struct amdgpu_hive_info *hive;
-
-    amdgpu_xgmi_add_device(adev);
+    if (amdgpu_xgmi_add_device(adev) == 0) {
+    struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);

  -    hive = amdgpu_get_xgmi_hive(adev);
-    if (!hive || !hive->reset_domain.wq) {
-    DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);

-    r = -EINVAL;
-    goto init_failed;
-    }
+    if (!hive->reset_domain ||
+ !kref_get_unless_zero(>reset_domain->refcount)) {
+    r = -ENOENT;
+    goto init_failed;
+    }
  -    adev->reset_domain.wq = hive->reset_domain.wq;
-    } else {
-    adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);

-    if (!adev->reset_domain.wq) {
-    r = -ENOMEM;
-    goto init_failed;
+    /* Drop the early temporary reset domain we created for 
device */
+    kref_put(>reset_domain->refcount, 
amdgpu_reset_destroy_reset_domain);

+    adev->reset_domain = hive->reset_domain;
  }
  }
  @@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device 
*adev,

  return r;
  }
  +    /*
+ * Reset domain needs to be present early, before XGMI hive 
discovered
+ * (if any) and intitialized to use reset sem and in_gpu reset 
flag

+ * early on during init.
+ */
+    adev->reset_domain = 
amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev");

+    if (!adev->reset_domain)
+    return -ENOMEM;
+
  /* early init functions */
  r = amdgpu_device_ip_early_init(adev);
  if (r)
@@ -3949,6 +3956,9 @@ void amdgpu_device_fini_sw(struct 
amdgpu_device *adev)

  if (adev->mman.discovery_bin)
  

Re: [RFC v4] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-08 Thread Andrey Grodzovsky



On 2022-02-08 06:25, Lazar, Lijo wrote:



On 2/2/2022 10:56 PM, Andrey Grodzovsky wrote:

The reset domain contains register access semaphor
now and so needs to be present as long as each device
in a hive needs it and so it cannot be binded to XGMI
hive life cycle.
Adress this by making reset domain refcounted and pointed
by each member of the hive and the hive itself.

v4:
Fix crash on boot with XGMI hive by adding type to reset_domain.
XGMI will only create a new reset_domain if prevoius was of single
device type meaning it's first boot. Otherwsie it will take a
refocunt to exsiting reset_domain from the amdgou device.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  6 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 38 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 18 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  4 +-
  9 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 8e96b9a14452..f2ba460bfd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  -struct amdgpu_reset_domain {
-    struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
    struct amdgpu_device {
  struct device    *dev;
@@ -1102,7 +1100,7 @@ struct amdgpu_device {
  struct amdgpu_reset_control *reset_cntl;
  uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
  -    struct amdgpu_reset_domain    reset_domain;
+    struct amdgpu_reset_domain    *reset_domain;
  };
    static inline struct amdgpu_device *drm_to_adev(struct drm_device 
*ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index fef952ca8db5..cd1b7af69c35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2313,7 +2313,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)

    r = drm_sched_init(>sched, _sched_ops,
 ring->num_hw_submission, amdgpu_job_hang_limit,
-   timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+   timeout, adev->reset_domain->wq, 
ring->sched_score, ring->name);

  if (r) {
  DRM_ERROR("Failed to create scheduler on ring %s.\n",
    ring->name);
@@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct 
amdgpu_device *adev)

  if (r)
  goto init_failed;
  +    /**
+ * In case of XGMI grab extra reference for reset domain for 
this device

+ */
  if (adev->gmc.xgmi.num_physical_nodes > 1) {
-    struct amdgpu_hive_info *hive;
-
-    amdgpu_xgmi_add_device(adev);
+    if (amdgpu_xgmi_add_device(adev) == 0) {
+    struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
  -    hive = amdgpu_get_xgmi_hive(adev);
-    if (!hive || !hive->reset_domain.wq) {
-    DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);

-    r = -EINVAL;
-    goto init_failed;
-    }
+    if (!hive->reset_domain ||
+ !kref_get_unless_zero(>reset_domain->refcount)) {
+    r = -ENOENT;
+    goto init_failed;
+    }
  -    adev->reset_domain.wq = hive->reset_domain.wq;
-    } else {
-    adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);

-    if (!adev->reset_domain.wq) {
-    r = -ENOMEM;
-    goto init_failed;
+    /* Drop the early temporary reset domain we created for 
device */
+    kref_put(>reset_domain->refcount, 
amdgpu_reset_destroy_reset_domain);

+    adev->reset_domain = hive->reset_domain;
  }
  }
  @@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device 
*adev,

  return r;
  }
  +    /*
+ * Reset domain needs to be present early, before XGMI hive 
discovered

+ * (if any) and intitialized to use reset sem and in_gpu reset flag
+ * early on during init.
+ */
+    adev->reset_domain = 
amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev");

+    if (!adev->reset_domain)
+    return -ENOMEM;
+
  /* early init functions */
  r = amdgpu_device_ip_early_init(adev);
  if (r)
@@ -3949,6 +3956,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device 
*adev)

  if (adev->mman.discovery_bin)
  amdgpu_discovery_fini(adev);
  +    kref_put(>reset_domain->refcount, 

Re: [RFC v4] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-08 Thread Lazar, Lijo




On 2/2/2022 10:56 PM, Andrey Grodzovsky wrote:

The reset domain contains register access semaphor
now and so needs to be present as long as each device
in a hive needs it and so it cannot be binded to XGMI
hive life cycle.
Adress this by making reset domain refcounted and pointed
by each member of the hive and the hive itself.

v4:
Fix crash on boot with XGMI hive by adding type to reset_domain.
XGMI will only create a new reset_domain if prevoius was of single
device type meaning it's first boot. Otherwsie it will take a
refocunt to exsiting reset_domain from the amdgou device.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  6 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 38 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 18 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 +-
  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  4 +-
  9 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8e96b9a14452..f2ba460bfd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  
-struct amdgpu_reset_domain {

-   struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
  
  struct amdgpu_device {

struct device   *dev;
@@ -1102,7 +1100,7 @@ struct amdgpu_device {
struct amdgpu_reset_control *reset_cntl;
uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
  
-	struct amdgpu_reset_domain	reset_domain;

+   struct amdgpu_reset_domain  *reset_domain;
  };
  
  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fef952ca8db5..cd1b7af69c35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2313,7 +2313,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)
  
  		r = drm_sched_init(>sched, _sched_ops,

   ring->num_hw_submission, 
amdgpu_job_hang_limit,
-  timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+  timeout, adev->reset_domain->wq, 
ring->sched_score, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
  ring->name);
@@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
  
+	/**

+* In case of XGMI grab extra reference for reset domain for this device
+*/
if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   struct amdgpu_hive_info *hive;
-
-   amdgpu_xgmi_add_device(adev);
+   if (amdgpu_xgmi_add_device(adev) == 0) {
+   struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);
  
-		hive = amdgpu_get_xgmi_hive(adev);

-   if (!hive || !hive->reset_domain.wq) {
-   DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);
-   r = -EINVAL;
-   goto init_failed;
-   }
+   if (!hive->reset_domain ||
+   
!kref_get_unless_zero(>reset_domain->refcount)) {
+   r = -ENOENT;
+   goto init_failed;
+   }
  
-		adev->reset_domain.wq = hive->reset_domain.wq;

-   } else {
-   adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);
-   if (!adev->reset_domain.wq) {
-   r = -ENOMEM;
-   goto init_failed;
+   /* Drop the early temporary reset domain we created for 
device */
+   kref_put(>reset_domain->refcount, 
amdgpu_reset_destroy_reset_domain);
+   adev->reset_domain = hive->reset_domain;
}
}
  
@@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,

return r;
}
  
+	/*

+* Reset domain needs to be present early, before XGMI hive discovered
+* (if any) and intitialized to use reset sem and in_gpu reset flag
+* early on during init.
+*/
+   adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE 
,"amdgpu-reset-dev");
+   if 

[RFC v4] drm/amdgpu: Rework reset domain to be refcounted.

2022-02-02 Thread Andrey Grodzovsky
The reset domain contains register access semaphor
now and so needs to be present as long as each device
in a hive needs it and so it cannot be binded to XGMI
hive life cycle.
Adress this by making reset domain refcounted and pointed
by each member of the hive and the hive itself.

v4:
Fix crash on boot with XGMI hive by adding type to reset_domain.
XGMI will only create a new reset_domain if prevoius was of single
device type meaning it's first boot. Otherwsie it will take a
refocunt to exsiting reset_domain from the amdgou device.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 38 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 18 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 29 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  4 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c  |  4 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c  |  4 +-
 9 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 8e96b9a14452..f2ba460bfd59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -813,9 +813,7 @@ struct amd_powerplay {
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 
-struct amdgpu_reset_domain {
-   struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
 
 struct amdgpu_device {
struct device   *dev;
@@ -1102,7 +1100,7 @@ struct amdgpu_device {
struct amdgpu_reset_control *reset_cntl;
uint32_t
ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE];
 
-   struct amdgpu_reset_domain  reset_domain;
+   struct amdgpu_reset_domain  *reset_domain;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fef952ca8db5..cd1b7af69c35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2313,7 +2313,7 @@ static int amdgpu_device_init_schedulers(struct 
amdgpu_device *adev)
 
r = drm_sched_init(>sched, _sched_ops,
   ring->num_hw_submission, 
amdgpu_job_hang_limit,
-  timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+  timeout, adev->reset_domain->wq, 
ring->sched_score, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
  ring->name);
@@ -2432,24 +2432,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
if (r)
goto init_failed;
 
+   /**
+* In case of XGMI grab extra reference for reset domain for this device
+*/
if (adev->gmc.xgmi.num_physical_nodes > 1) {
-   struct amdgpu_hive_info *hive;
-
-   amdgpu_xgmi_add_device(adev);
+   if (amdgpu_xgmi_add_device(adev) == 0) {
+   struct amdgpu_hive_info *hive = 
amdgpu_get_xgmi_hive(adev);
 
-   hive = amdgpu_get_xgmi_hive(adev);
-   if (!hive || !hive->reset_domain.wq) {
-   DRM_ERROR("Failed to obtain reset domain info for XGMI 
hive:%llx", hive->hive_id);
-   r = -EINVAL;
-   goto init_failed;
-   }
+   if (!hive->reset_domain ||
+   
!kref_get_unless_zero(>reset_domain->refcount)) {
+   r = -ENOENT;
+   goto init_failed;
+   }
 
-   adev->reset_domain.wq = hive->reset_domain.wq;
-   } else {
-   adev->reset_domain.wq = 
alloc_ordered_workqueue("amdgpu-reset-dev", 0);
-   if (!adev->reset_domain.wq) {
-   r = -ENOMEM;
-   goto init_failed;
+   /* Drop the early temporary reset domain we created for 
device */
+   kref_put(>reset_domain->refcount, 
amdgpu_reset_destroy_reset_domain);
+   adev->reset_domain = hive->reset_domain;
}
}
 
@@ -3599,6 +3597,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return r;
}
 
+   /*
+* Reset domain needs to be present early, before XGMI hive discovered
+* (if any) and intitialized to use reset sem and in_gpu reset flag
+* early on during init.
+*/
+   adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE 
,"amdgpu-reset-dev");
+   if (!adev->reset_domain)
+