Re: [RFC 2/6] drm/amdgpu: Move scheduler init to after XGMI is ready

2021-12-20 Thread Christian König

Am 20.12.21 um 22:51 schrieb Andrey Grodzovsky:


On 2021-12-20 2:16 a.m., Christian König wrote:



Am 17.12.21 um 23:27 schrieb Andrey Grodzovsky:

Before we initialize schedulers we must know which reset
domain we are in - for a single device there is a single
domain per device and so a single wq per device. For XGMI
the reset domain spans the entire XGMI hive and so the
reset wq is per hive.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 
++

  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
  3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 5f13195d23d1..b595e6d699b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct 
amdgpu_device *adev)

  return r;
  }
  +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+    long timeout;
+    int r, i;
+
+    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+    struct amdgpu_ring *ring = adev->rings[i];
+
+    /* No need to setup the GPU scheduler for rings that don't 
need it */

+    if (!ring || ring->no_scheduler)
+    continue;
+
+    switch (ring->funcs->type) {
+    case AMDGPU_RING_TYPE_GFX:
+    timeout = adev->gfx_timeout;
+    break;
+    case AMDGPU_RING_TYPE_COMPUTE:
+    timeout = adev->compute_timeout;
+    break;
+    case AMDGPU_RING_TYPE_SDMA:
+    timeout = adev->sdma_timeout;
+    break;
+    default:
+    timeout = adev->video_timeout;
+    break;
+    }
+




+    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+   ring->num_hw_submission, amdgpu_job_hang_limit,
+   timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);

+    if (r) {
+    DRM_ERROR("Failed to create scheduler on ring %s.\n",
+  ring->name);
+    return r;
+    }


Maybe better put that into amdgpu_ring.c. But not really a hard 
requirement, more a gut feeling.



+    }
+
+    return 0;
+}
+
+
  /**
   * amdgpu_device_ip_init - run init for hardware IPs
   *
@@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct 
amdgpu_device *adev)

  }
  }
  +    r = amdgpu_device_init_schedulers(adev);
+    if (r)
+    goto init_failed;
+
  /* Don't init kfd if whole hive need to be reset during init */
  if (!adev->gmc.xgmi.pending_reset)
  amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c

index 3b7e86ea7167..5527c68c51de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct 
amdgpu_ring *ring,

    atomic_t *sched_score)
  {
  struct amdgpu_device *adev = ring->adev;
-    long timeout;
-    int r;
    if (!adev)
  return -EINVAL;
@@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct 
amdgpu_ring *ring,

  spin_lock_init(&ring->fence_drv.lock);
  ring->fence_drv.fences = kcalloc(num_hw_submission * 2, 
sizeof(void *),

   GFP_KERNEL);
-    if (!ring->fence_drv.fences)
-    return -ENOMEM;
  -    /* No need to setup the GPU scheduler for rings that don't 
need it */

-    if (ring->no_scheduler)
-    return 0;
+    ring->num_hw_submission = num_hw_submission;
+    ring->sched_score = sched_score;


Probably better to set that in the caller and drop the parameters 
from the amdgpu_fence_driver_init_ring() function completely.


Christian.



I noticed that at least num_hw_submission is validated within the 
function so not sure we should then discard the parameters.


Good point. It also doesn't make sense to move this check up because the 
power of two requirement comes from the fences, doesn't it?


Ok in this case just keep it like it is.

Christian.



Andrey





  -    switch (ring->funcs->type) {
-    case AMDGPU_RING_TYPE_GFX:
-    timeout = adev->gfx_timeout;
-    break;
-    case AMDGPU_RING_TYPE_COMPUTE:
-    timeout = adev->compute_timeout;
-    break;
-    case AMDGPU_RING_TYPE_SDMA:
-    timeout = adev->sdma_timeout;
-    break;
-    default:
-    timeout = adev->video_timeout;
-    break;
-    }
-
-    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-   num_hw_submission, amdgpu_job_hang_limit,
-   timeout, NULL, sched_score, ring->name);
-    if (r) {
-    DRM_ERROR("Failed to create scheduler on ring %s.\n",
-  ring->name);
-    return r;
-    }
+    if (!ring->fence_drv.fences)
+    return -ENOMEM;
    return 0;
  }
diff --git 

Re: [RFC 2/6] drm/amdgpu: Move scheduler init to after XGMI is ready

2021-12-20 Thread Andrey Grodzovsky



On 2021-12-20 2:16 a.m., Christian König wrote:



Am 17.12.21 um 23:27 schrieb Andrey Grodzovsky:

Before we initialize schedulers we must know which reset
domain we are in - for a single device there is a single
domain per device and so a single wq per device. For XGMI
the reset domain spans the entire XGMI hive and so the
reset wq is per hive.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
  3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 5f13195d23d1..b595e6d699b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct 
amdgpu_device *adev)

  return r;
  }
  +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+    long timeout;
+    int r, i;
+
+    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+    struct amdgpu_ring *ring = adev->rings[i];
+
+    /* No need to setup the GPU scheduler for rings that don't 
need it */

+    if (!ring || ring->no_scheduler)
+    continue;
+
+    switch (ring->funcs->type) {
+    case AMDGPU_RING_TYPE_GFX:
+    timeout = adev->gfx_timeout;
+    break;
+    case AMDGPU_RING_TYPE_COMPUTE:
+    timeout = adev->compute_timeout;
+    break;
+    case AMDGPU_RING_TYPE_SDMA:
+    timeout = adev->sdma_timeout;
+    break;
+    default:
+    timeout = adev->video_timeout;
+    break;
+    }
+




+    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+   ring->num_hw_submission, amdgpu_job_hang_limit,
+   timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);

+    if (r) {
+    DRM_ERROR("Failed to create scheduler on ring %s.\n",
+  ring->name);
+    return r;
+    }


Maybe better put that into amdgpu_ring.c. But not really a hard 
requirement, more a gut feeling.



+    }
+
+    return 0;
+}
+
+
  /**
   * amdgpu_device_ip_init - run init for hardware IPs
   *
@@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct 
amdgpu_device *adev)

  }
  }
  +    r = amdgpu_device_init_schedulers(adev);
+    if (r)
+    goto init_failed;
+
  /* Don't init kfd if whole hive need to be reset during init */
  if (!adev->gmc.xgmi.pending_reset)
  amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c

index 3b7e86ea7167..5527c68c51de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct 
amdgpu_ring *ring,

    atomic_t *sched_score)
  {
  struct amdgpu_device *adev = ring->adev;
-    long timeout;
-    int r;
    if (!adev)
  return -EINVAL;
@@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct 
amdgpu_ring *ring,

  spin_lock_init(&ring->fence_drv.lock);
  ring->fence_drv.fences = kcalloc(num_hw_submission * 2, 
sizeof(void *),

   GFP_KERNEL);
-    if (!ring->fence_drv.fences)
-    return -ENOMEM;
  -    /* No need to setup the GPU scheduler for rings that don't 
need it */

-    if (ring->no_scheduler)
-    return 0;
+    ring->num_hw_submission = num_hw_submission;
+    ring->sched_score = sched_score;


Probably better to set that in the caller and drop the parameters from 
the amdgpu_fence_driver_init_ring() function completely.


Christian.



I noticed that at least num_hw_submission is validated within the 
function so not sure we should then discard the parameters.


Andrey





  -    switch (ring->funcs->type) {
-    case AMDGPU_RING_TYPE_GFX:
-    timeout = adev->gfx_timeout;
-    break;
-    case AMDGPU_RING_TYPE_COMPUTE:
-    timeout = adev->compute_timeout;
-    break;
-    case AMDGPU_RING_TYPE_SDMA:
-    timeout = adev->sdma_timeout;
-    break;
-    default:
-    timeout = adev->video_timeout;
-    break;
-    }
-
-    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-   num_hw_submission, amdgpu_job_hang_limit,
-   timeout, NULL, sched_score, ring->name);
-    if (r) {
-    DRM_ERROR("Failed to create scheduler on ring %s.\n",
-  ring->name);
-    return r;
-    }
+    if (!ring->fence_drv.fences)
+    return -ENOMEM;
    return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index 4d380e79752c..a4b8279e3011 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,6 +253,8 @@ struct 

Re: [RFC 2/6] drm/amdgpu: Move scheduler init to after XGMI is ready

2021-12-19 Thread Christian König




Am 17.12.21 um 23:27 schrieb Andrey Grodzovsky:

Before we initialize schedulers we must know which reset
domain we are in - for a single device there is a single
domain per device and so a single wq per device. For XGMI
the reset domain spans the entire XGMI hive and so the
reset wq is per hive.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
  3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5f13195d23d1..b595e6d699b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
return r;
  }
  
+static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)

+{
+   long timeout;
+   int r, i;
+
+   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+   struct amdgpu_ring *ring = adev->rings[i];
+
+   /* No need to setup the GPU scheduler for rings that don't need 
it */
+   if (!ring || ring->no_scheduler)
+   continue;
+
+   switch (ring->funcs->type) {
+   case AMDGPU_RING_TYPE_GFX:
+   timeout = adev->gfx_timeout;
+   break;
+   case AMDGPU_RING_TYPE_COMPUTE:
+   timeout = adev->compute_timeout;
+   break;
+   case AMDGPU_RING_TYPE_SDMA:
+   timeout = adev->sdma_timeout;
+   break;
+   default:
+   timeout = adev->video_timeout;
+   break;
+   }
+




+   r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+  ring->num_hw_submission, 
amdgpu_job_hang_limit,
+  timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+   if (r) {
+   DRM_ERROR("Failed to create scheduler on ring %s.\n",
+ ring->name);
+   return r;
+   }


Maybe better put that into amdgpu_ring.c. But not really a hard 
requirement, more a gut feeling.



+   }
+
+   return 0;
+}
+
+
  /**
   * amdgpu_device_ip_init - run init for hardware IPs
   *
@@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
}
}
  
+	r = amdgpu_device_init_schedulers(adev);

+   if (r)
+   goto init_failed;
+
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset)
amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 3b7e86ea7167..5527c68c51de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
  atomic_t *sched_score)
  {
struct amdgpu_device *adev = ring->adev;
-   long timeout;
-   int r;
  
  	if (!adev)

return -EINVAL;
@@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring 
*ring,
spin_lock_init(&ring->fence_drv.lock);
ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
 GFP_KERNEL);
-   if (!ring->fence_drv.fences)
-   return -ENOMEM;
  
-	/* No need to setup the GPU scheduler for rings that don't need it */

-   if (ring->no_scheduler)
-   return 0;
+   ring->num_hw_submission = num_hw_submission;
+   ring->sched_score = sched_score;


Probably better to set that in the caller and drop the parameters from 
the amdgpu_fence_driver_init_ring() function completely.


Christian.

  
-	switch (ring->funcs->type) {

-   case AMDGPU_RING_TYPE_GFX:
-   timeout = adev->gfx_timeout;
-   break;
-   case AMDGPU_RING_TYPE_COMPUTE:
-   timeout = adev->compute_timeout;
-   break;
-   case AMDGPU_RING_TYPE_SDMA:
-   timeout = adev->sdma_timeout;
-   break;
-   default:
-   timeout = adev->video_timeout;
-   break;
-   }
-
-   r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-  num_hw_submission, amdgpu_job_hang_limit,
-  timeout, NULL, sched_score, ring->name);
-   if (r) {
-   DRM_ERROR("Failed to create scheduler on ring %s.\n",
- ring->name);
-   return r;
-   }
+   if (!ring->fence_drv.fences)
+   

[RFC 2/6] drm/amdgpu: Move scheduler init to after XGMI is ready

2021-12-17 Thread Andrey Grodzovsky
Before we initialize schedulers we must know which reset
domain we are in - for a single device there is a single
domain per device and so a single wq per device. For XGMI
the reset domain spans the entire XGMI hive and so the
reset wq is per hive.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
 3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5f13195d23d1..b595e6d699b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
return r;
 }
 
+static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+   long timeout;
+   int r, i;
+
+   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+   struct amdgpu_ring *ring = adev->rings[i];
+
+   /* No need to setup the GPU scheduler for rings that don't need 
it */
+   if (!ring || ring->no_scheduler)
+   continue;
+
+   switch (ring->funcs->type) {
+   case AMDGPU_RING_TYPE_GFX:
+   timeout = adev->gfx_timeout;
+   break;
+   case AMDGPU_RING_TYPE_COMPUTE:
+   timeout = adev->compute_timeout;
+   break;
+   case AMDGPU_RING_TYPE_SDMA:
+   timeout = adev->sdma_timeout;
+   break;
+   default:
+   timeout = adev->video_timeout;
+   break;
+   }
+
+   r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+  ring->num_hw_submission, 
amdgpu_job_hang_limit,
+  timeout, adev->reset_domain.wq, 
ring->sched_score, ring->name);
+   if (r) {
+   DRM_ERROR("Failed to create scheduler on ring %s.\n",
+ ring->name);
+   return r;
+   }
+   }
+
+   return 0;
+}
+
+
 /**
  * amdgpu_device_ip_init - run init for hardware IPs
  *
@@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
}
}
 
+   r = amdgpu_device_init_schedulers(adev);
+   if (r)
+   goto init_failed;
+
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset)
amdgpu_amdkfd_device_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 3b7e86ea7167..5527c68c51de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
  atomic_t *sched_score)
 {
struct amdgpu_device *adev = ring->adev;
-   long timeout;
-   int r;
 
if (!adev)
return -EINVAL;
@@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring 
*ring,
spin_lock_init(&ring->fence_drv.lock);
ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
 GFP_KERNEL);
-   if (!ring->fence_drv.fences)
-   return -ENOMEM;
 
-   /* No need to setup the GPU scheduler for rings that don't need it */
-   if (ring->no_scheduler)
-   return 0;
+   ring->num_hw_submission = num_hw_submission;
+   ring->sched_score = sched_score;
 
-   switch (ring->funcs->type) {
-   case AMDGPU_RING_TYPE_GFX:
-   timeout = adev->gfx_timeout;
-   break;
-   case AMDGPU_RING_TYPE_COMPUTE:
-   timeout = adev->compute_timeout;
-   break;
-   case AMDGPU_RING_TYPE_SDMA:
-   timeout = adev->sdma_timeout;
-   break;
-   default:
-   timeout = adev->video_timeout;
-   break;
-   }
-
-   r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
-  num_hw_submission, amdgpu_job_hang_limit,
-  timeout, NULL, sched_score, ring->name);
-   if (r) {
-   DRM_ERROR("Failed to create scheduler on ring %s.\n",
- ring->name);
-   return r;
-   }
+   if (!ring->fence_drv.fences)
+   return -ENOMEM;
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 4d380e79752c..a4b8279e3011 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,6 +253,8 @@ struct