On 1/16/26 17:20, Alex Deucher wrote:
> If we need to allocate a job during GPU reset, use
> GFP_ATOMIC rather than GFP_KERNEL.
>
> Signed-off-by: Alex Deucher <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 ++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 3 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c | 6 ++++--
> 4 files changed, 13 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 72ec455fa932c..136e50de712a0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -68,7 +68,7 @@ int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> int r;
>
> if (size) {
> - r = amdgpu_sa_bo_new(&adev->ib_pools[pool_type],
> + r = amdgpu_sa_bo_new(adev, &adev->ib_pools[pool_type],
> &ib->sa_bo, size);
> if (r) {
> dev_err(adev->dev, "failed to get a new IB (%d)\n", r);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 1daa9145b217e..c7e4d79b9f61d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -192,18 +192,21 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> if (num_ibs == 0)
> return -EINVAL;
>
> - *job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
> + *job = kzalloc(struct_size(*job, ibs, num_ibs),
> + amdgpu_in_reset(adev) ? GFP_ATOMIC : GFP_KERNEL);

That's an extremely bad idea: amdgpu_in_reset() returns true even outside of
the reset thread.

We really need to look at the pool type instead.

Regards,
Christian.

> if (!*job)
> return -ENOMEM;
>
> - af = kzalloc(sizeof(struct amdgpu_fence), GFP_KERNEL);
> + af = kzalloc(sizeof(struct amdgpu_fence),
> + amdgpu_in_reset(adev) ? GFP_ATOMIC : GFP_KERNEL);
> if (!af) {
> r = -ENOMEM;
> goto err_job;
> }
> (*job)->hw_fence = af;
>
> - af = kzalloc(sizeof(struct amdgpu_fence), GFP_KERNEL);
> + af = kzalloc(sizeof(struct amdgpu_fence),
> + amdgpu_in_reset(adev) ? GFP_ATOMIC : GFP_KERNEL);
> if (!af) {
> r = -ENOMEM;
> goto err_fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 912c9afaf9e11..7ee0cc46b4608 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -339,7 +339,8 @@ void amdgpu_sa_bo_manager_fini(struct amdgpu_device *adev,
> struct amdgpu_sa_manager *sa_manager);
> int amdgpu_sa_bo_manager_start(struct amdgpu_device *adev,
> struct amdgpu_sa_manager *sa_manager);
> -int amdgpu_sa_bo_new(struct amdgpu_sa_manager *sa_manager,
> +int amdgpu_sa_bo_new(struct amdgpu_device *adev,
> + struct amdgpu_sa_manager *sa_manager,
> struct drm_suballoc **sa_bo,
> unsigned int size);
> void amdgpu_sa_bo_free(struct drm_suballoc **sa_bo,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c
> index 39070b2a4c04f..fc13969f8ef49 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c
> @@ -76,12 +76,14 @@ void amdgpu_sa_bo_manager_fini(struct amdgpu_device *adev,
> amdgpu_bo_free_kernel(&sa_manager->bo, &sa_manager->gpu_addr,
> &sa_manager->cpu_ptr);
> }
>
> -int amdgpu_sa_bo_new(struct amdgpu_sa_manager *sa_manager,
> +int amdgpu_sa_bo_new(struct amdgpu_device *adev,
> + struct amdgpu_sa_manager *sa_manager,
> struct drm_suballoc **sa_bo,
> unsigned int size)
> {
> struct drm_suballoc *sa = drm_suballoc_new(&sa_manager->base, size,
> - GFP_KERNEL, false, 0);
> + amdgpu_in_reset(adev) ? GFP_ATOMIC : GFP_KERNEL,
> + false, 0);
>
> if (IS_ERR(sa)) {
> *sa_bo = NULL;