On Thu, Oct 9, 2025 at 2:50 PM Jonathan Kim <[email protected]> wrote:
>
> Suspending/resuming all gangs should be done while the device lock is held.
>
> Signed-off-by: Jonathan Kim <[email protected]>

Acked-by: Alex Deucher <[email protected]>

> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 73 ++++++-------------
>  1 file changed, 21 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 6c5c7c1bf5ed..6e7bc983fc0b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1209,6 +1209,15 @@ static int evict_process_queues_cpsch(struct 
> device_queue_manager *dqm,
>         pr_debug_ratelimited("Evicting process pid %d queues\n",
>                             pdd->process->lead_thread->pid);
>
> +       if (dqm->dev->kfd->shared_resources.enable_mes) {
> +               pdd->last_evict_timestamp = get_jiffies_64();
> +               retval = suspend_all_queues_mes(dqm);
> +               if (retval) {
> +                       dev_err(dev, "Suspending all queues failed");
> +                       goto out;
> +               }
> +       }
> +
>         /* Mark all queues as evicted. Deactivate all active queues on
>          * the qpd.
>          */
> @@ -1221,23 +1230,27 @@ static int evict_process_queues_cpsch(struct 
> device_queue_manager *dqm,
>                 decrement_queue_count(dqm, qpd, q);
>
>                 if (dqm->dev->kfd->shared_resources.enable_mes) {
> -                       int err;
> -
> -                       err = remove_queue_mes(dqm, q, qpd);
> -                       if (err) {
> +                       retval = remove_queue_mes(dqm, q, qpd);
> +                       if (retval) {
>                                 dev_err(dev, "Failed to evict queue %d\n",
>                                         q->properties.queue_id);
> -                               retval = err;
> +                               goto out;
>                         }
>                 }
>         }
> -       pdd->last_evict_timestamp = get_jiffies_64();
> -       if (!dqm->dev->kfd->shared_resources.enable_mes)
> +
> +       if (!dqm->dev->kfd->shared_resources.enable_mes) {
> +               pdd->last_evict_timestamp = get_jiffies_64();
>                 retval = execute_queues_cpsch(dqm,
>                                               qpd->is_debug ?
>                                               
> KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
>                                               
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
>                                               USE_DEFAULT_GRACE_PERIOD);
> +       } else {
> +               retval = resume_all_queues_mes(dqm);
> +               if (retval)
> +                       dev_err(dev, "Resuming all queues failed");
> +       }
>
>  out:
>         dqm_unlock(dqm);
> @@ -3098,61 +3111,17 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node 
> *knode, u32 pasid, u32 doorbel
>         return ret;
>  }
>
> -static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
> -                                  struct qcm_process_device *qpd)
> -{
> -       struct device *dev = dqm->dev->adev->dev;
> -       int ret = 0;
> -
> -       /* Check if process is already evicted */
> -       dqm_lock(dqm);
> -       if (qpd->evicted) {
> -               /* Increment the evicted count to make sure the
> -                * process stays evicted before its terminated.
> -                */
> -               qpd->evicted++;
> -               dqm_unlock(dqm);
> -               goto out;
> -       }
> -       dqm_unlock(dqm);
> -
> -       ret = suspend_all_queues_mes(dqm);
> -       if (ret) {
> -               dev_err(dev, "Suspending all queues failed");
> -               goto out;
> -       }
> -
> -       ret = dqm->ops.evict_process_queues(dqm, qpd);
> -       if (ret) {
> -               dev_err(dev, "Evicting process queues failed");
> -               goto out;
> -       }
> -
> -       ret = resume_all_queues_mes(dqm);
> -       if (ret)
> -               dev_err(dev, "Resuming all queues failed");
> -
> -out:
> -       return ret;
> -}
> -
>  int kfd_evict_process_device(struct kfd_process_device *pdd)
>  {
>         struct device_queue_manager *dqm;
>         struct kfd_process *p;
> -       int ret = 0;
>
>         p = pdd->process;
>         dqm = pdd->dev->dqm;
>
>         WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>
> -       if (dqm->dev->kfd->shared_resources.enable_mes)
> -               ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
> -       else
> -               ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> -
> -       return ret;
> +       return dqm->ops.evict_process_queues(dqm, &pdd->qpd);
>  }
>
>  int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
> --
> 2.34.1
>

Reply via email to