[PATCH 02/11] drm/amdkfd: Fix suspend/resume issue on Carrizo

2017-09-15 Thread Felix Kuehling
From: Yong Zhao 

When we do suspend/resume through "sudo pm-suspend" while there is
HSA activity running, upon resume we encounter an HWS hang, which
is caused by memory read/write failures. The root cause is that on
suspend, we neglected to unbind the pasid from the kfd device.

Another major change is that binding/unbinding is now performed on
a per-process basis, instead of depending on whether there are queues
in the dqm.

Signed-off-by: Yong Zhao 
Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c| 22 --
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h  | 15 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 89 ++
 4 files changed, 101 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index cc8af11..ff3f97c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -191,7 +191,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev 
*pdev, int pasid)
struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
 
if (dev)
-   kfd_unbind_process_from_device(dev, pasid);
+   kfd_process_iommu_unbind_callback(dev, pasid);
 }
 
 /*
@@ -339,12 +339,16 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 
 void kgd2kfd_suspend(struct kfd_dev *kfd)
 {
-   if (kfd->init_complete) {
-   kfd->dqm->ops.stop(kfd->dqm);
-   amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
-   amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
-   amd_iommu_free_device(kfd->pdev);
-   }
+   if (!kfd->init_complete)
+   return;
+
+   kfd->dqm->ops.stop(kfd->dqm);
+
+   kfd_unbind_processes_from_device(kfd);
+
+   amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+   amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
+   amd_iommu_free_device(kfd->pdev);
 }
 
 int kgd2kfd_resume(struct kfd_dev *kfd)
@@ -369,6 +373,10 @@ static int kfd_resume(struct kfd_dev *kfd)
amd_iommu_set_invalid_ppr_cb(kfd->pdev,
 iommu_invalid_ppr_cb);
 
+   err = kfd_bind_processes_to_device(kfd);
+   if (err)
+   return -ENXIO;
+
err = kfd->dqm->ops.start(kfd->dqm);
if (err) {
dev_err(kfd_device,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 53a66e8..5db82b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager 
*dqm)
 
 static int start_cpsch(struct device_queue_manager *dqm)
 {
-   struct device_process_node *node;
int retval;
 
retval = 0;
@@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
init_interrupts(dqm);
 
-   list_for_each_entry(node, &dqm->queues, list)
-   if (node->qpd->pqm->process && dqm->dev)
-   kfd_bind_process_to_device(dqm->dev,
-   node->qpd->pqm->process);
-
execute_queues_cpsch(dqm, true);
 
return 0;
@@ -714,15 +708,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 static int stop_cpsch(struct device_queue_manager *dqm)
 {
-   struct device_process_node *node;
-   struct kfd_process_device *pdd;
-
destroy_queues_cpsch(dqm, true, true);
 
-   list_for_each_entry(node, &dqm->queues, list) {
-   pdd = qpd_to_pdd(node->qpd);
-   pdd->bound = false;
-   }
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
pm_uninit(&dqm->packets);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index b397ec7..ef582cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -435,6 +435,13 @@ struct qcm_process_device {
uint32_t sh_hidden_private_base;
 };
 
+
+enum kfd_pdd_bound {
+   PDD_UNBOUND = 0,
+   PDD_BOUND,
+   PDD_BOUND_SUSPENDED,
+};
+
 /* Data that is per-process-per device. */
 struct kfd_process_device {
/*
@@ -459,7 +466,7 @@ struct kfd_process_device {
uint64_t scratch_limit;
 
/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
-   bool bound;
+   enum kfd_pdd_bound bound;
 
/* This flag tells if we should reset all
 * wavefronts on process termination
@@ -548,8 +555,10 @@ struct kfd_process *kfd_get_process(const struct 
task_struct *);
 struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
 
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
-   struct kfd_process *p);
-void 

Re: [PATCH 02/11] drm/amdkfd: Fix suspend/resume issue on Carrizo

2017-09-17 Thread Oded Gabbay
On Sat, Sep 16, 2017 at 2:42 AM, Felix Kuehling  wrote:
> From: Yong Zhao 
>
> When we do suspend/resume through "sudo pm-suspend" while there is
> HSA activity running, upon resume we will encounter HWS hanging, which
> is caused by memory read/write failures. The root cause is that when
> suspend, we neglected to unbind pasid from kfd device.
>
> Another major change is that the bind/unbinding is changed to be
> performed on a per process basis, instead of whether there are queues
> in dqm.
>
> Signed-off-by: Yong Zhao 
> Signed-off-by: Felix Kuehling 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c| 22 --
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 13 
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h  | 15 +++-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 89 
> ++
>  4 files changed, 101 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index cc8af11..ff3f97c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -191,7 +191,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev 
> *pdev, int pasid)
> struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
>
> if (dev)
> -   kfd_unbind_process_from_device(dev, pasid);
> +   kfd_process_iommu_unbind_callback(dev, pasid);
>  }
>
>  /*
> @@ -339,12 +339,16 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
>
>  void kgd2kfd_suspend(struct kfd_dev *kfd)
>  {
> -   if (kfd->init_complete) {
> -   kfd->dqm->ops.stop(kfd->dqm);
> -   amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
> -   amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
> -   amd_iommu_free_device(kfd->pdev);
> -   }
> +   if (!kfd->init_complete)
> +   return;
> +
> +   kfd->dqm->ops.stop(kfd->dqm);
> +
> +   kfd_unbind_processes_from_device(kfd);
> +
> +   amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
> +   amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
> +   amd_iommu_free_device(kfd->pdev);
>  }
>
>  int kgd2kfd_resume(struct kfd_dev *kfd)
> @@ -369,6 +373,10 @@ static int kfd_resume(struct kfd_dev *kfd)
> amd_iommu_set_invalid_ppr_cb(kfd->pdev,
>  iommu_invalid_ppr_cb);
>
> +   err = kfd_bind_processes_to_device(kfd);
> +   if (err)
> +   return -ENXIO;

You need to undo previous initialization in case
kfd_bind_processes_to_device fails, i.e. call amd_iommu_free_device()

> +
> err = kfd->dqm->ops.start(kfd->dqm);
> if (err) {
> dev_err(kfd_device,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 53a66e8..5db82b8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -670,7 +670,6 @@ static int initialize_cpsch(struct device_queue_manager 
> *dqm)
>
>  static int start_cpsch(struct device_queue_manager *dqm)
>  {
> -   struct device_process_node *node;
> int retval;
>
> retval = 0;
> @@ -697,11 +696,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
>
> init_interrupts(dqm);
>
> -   list_for_each_entry(node, &dqm->queues, list)
> -   if (node->qpd->pqm->process && dqm->dev)
> -   kfd_bind_process_to_device(dqm->dev,
> -   node->qpd->pqm->process);
> -
> execute_queues_cpsch(dqm, true);
>
> return 0;
> @@ -714,15 +708,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
>
>  static int stop_cpsch(struct device_queue_manager *dqm)
>  {
> -   struct device_process_node *node;
> -   struct kfd_process_device *pdd;
> -
> destroy_queues_cpsch(dqm, true, true);
>
> -   list_for_each_entry(node, &dqm->queues, list) {
> -   pdd = qpd_to_pdd(node->qpd);
> -   pdd->bound = false;
> -   }
> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> pm_uninit(&dqm->packets);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index b397ec7..ef582cc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -435,6 +435,13 @@ struct qcm_process_device {
> uint32_t sh_hidden_private_base;
>  };
>
> +
> +enum kfd_pdd_bound {
> +   PDD_UNBOUND = 0,
> +   PDD_BOUND,
> +   PDD_BOUND_SUSPENDED,
> +};
> +
>  /* Data that is per-process-per device. */
>  struct kfd_process_device {
> /*
> @@ -459,7 +466,7 @@ struct kfd_process_device {
> uint64_t scratch_limit;
>
> /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) 
> */
> -   bool bound;
> +   enum kfd_pdd_bound boun