Re: [Patch v4 13/24] drm/amdkfd: CRIU checkpoint and restore queue mqds

2022-01-10 Thread Felix Kuehling

On 2021-12-22 7:37 p.m., Rajneesh Bhardwaj wrote:

From: David Yat Sin 

Checkpoint contents of queue MQD's on CRIU dump and restore them during
CRIU restore.

Signed-off-by: David Yat Sin 


David has an update for this patch to fix up the doorbell offset in the 
restored SDMA MQD.


Regards,
  Felix




---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c   |   2 +-
  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  72 +++-
  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  14 +-
  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |   7 +
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  67 
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  68 
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  68 
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  69 
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +
  .../amd/amdkfd/kfd_process_queue_manager.c| 158 --
  11 files changed, 506 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 3fb155f756fd..146879cd3f2b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -312,7 +312,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
p->pasid,
dev->id);
  
-	err = pqm_create_queue(>pqm, dev, filep, _properties, _id, NULL,

+   err = pqm_create_queue(>pqm, dev, filep, _properties, _id, 
NULL, NULL,
_offset_in_process);
if (err != 0)
goto err_create_queue;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index 0c50e67e2b51..3a5303ebcabf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -185,7 +185,7 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev)
properties.type = KFD_QUEUE_TYPE_DIQ;
  
  	status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL,

-   , , NULL, NULL);
+   , , NULL, NULL, NULL);
  
  	if (status) {

pr_err("Failed to create DIQ\n");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a0f5b8533a03..a92274f9f1f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -331,7 +331,8 @@ static void deallocate_vmid(struct device_queue_manager 
*dqm,
  static int create_queue_nocpsch(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd,
-   const struct kfd_criu_queue_priv_data *qd)
+   const struct kfd_criu_queue_priv_data *qd,
+   const void *restore_mqd)
  {
struct mqd_manager *mqd_mgr;
int retval;
@@ -390,8 +391,14 @@ static int create_queue_nocpsch(struct 
device_queue_manager *dqm,
retval = -ENOMEM;
goto out_deallocate_doorbell;
}
-   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
-   >gart_mqd_addr, >properties);
+
+   if (qd)
+   mqd_mgr->restore_mqd(mqd_mgr, >mqd, q->mqd_mem_obj, 
>gart_mqd_addr,
+>properties, restore_mqd);
+   else
+   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
+   >gart_mqd_addr, >properties);
+
if (q->properties.is_active) {
if (!dqm->sched_running) {
WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
@@ -1339,7 +1346,8 @@ static void destroy_kernel_queue_cpsch(struct 
device_queue_manager *dqm,
  
  static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,

struct qcm_process_device *qpd,
-   const struct kfd_criu_queue_priv_data *qd)
+   const struct kfd_criu_queue_priv_data *qd,
+   const void *restore_mqd)
  {
int retval;
struct mqd_manager *mqd_mgr;
@@ -1385,8 +1393,12 @@ static int create_queue_cpsch(struct 
device_queue_manager *dqm, struct queue *q,
 * updates the is_evicted flag but is a no-op otherwise.
 */
q->properties.is_evicted = !!qpd->evicted;
-   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
-   >gart_mqd_addr, >properties);
+   if (qd)
+   mqd_mgr->restore_mqd(mqd_mgr, >mqd, q->mqd_mem_obj, 
>gart_mqd_addr,
+>properties, restore_mqd);
+   else
+   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
+   

[Patch v4 13/24] drm/amdkfd: CRIU checkpoint and restore queue mqds

2021-12-22 Thread Rajneesh Bhardwaj
From: David Yat Sin 

Checkpoint contents of queue MQD's on CRIU dump and restore them during
CRIU restore.

Signed-off-by: David Yat Sin 

---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c   |   2 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  72 +++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  14 +-
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |   7 +
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  67 
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  68 
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  68 
 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  69 
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +
 .../amd/amdkfd/kfd_process_queue_manager.c| 158 --
 11 files changed, 506 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 3fb155f756fd..146879cd3f2b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -312,7 +312,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
p->pasid,
dev->id);
 
-   err = pqm_create_queue(>pqm, dev, filep, _properties, _id, 
NULL,
+   err = pqm_create_queue(>pqm, dev, filep, _properties, _id, 
NULL, NULL,
_offset_in_process);
if (err != 0)
goto err_create_queue;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index 0c50e67e2b51..3a5303ebcabf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -185,7 +185,7 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev)
properties.type = KFD_QUEUE_TYPE_DIQ;
 
status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL,
-   , , NULL, NULL);
+   , , NULL, NULL, NULL);
 
if (status) {
pr_err("Failed to create DIQ\n");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a0f5b8533a03..a92274f9f1f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -331,7 +331,8 @@ static void deallocate_vmid(struct device_queue_manager 
*dqm,
 static int create_queue_nocpsch(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd,
-   const struct kfd_criu_queue_priv_data *qd)
+   const struct kfd_criu_queue_priv_data *qd,
+   const void *restore_mqd)
 {
struct mqd_manager *mqd_mgr;
int retval;
@@ -390,8 +391,14 @@ static int create_queue_nocpsch(struct 
device_queue_manager *dqm,
retval = -ENOMEM;
goto out_deallocate_doorbell;
}
-   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
-   >gart_mqd_addr, >properties);
+
+   if (qd)
+   mqd_mgr->restore_mqd(mqd_mgr, >mqd, q->mqd_mem_obj, 
>gart_mqd_addr,
+>properties, restore_mqd);
+   else
+   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
+   >gart_mqd_addr, >properties);
+
if (q->properties.is_active) {
if (!dqm->sched_running) {
WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
@@ -1339,7 +1346,8 @@ static void destroy_kernel_queue_cpsch(struct 
device_queue_manager *dqm,
 
 static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue 
*q,
struct qcm_process_device *qpd,
-   const struct kfd_criu_queue_priv_data *qd)
+   const struct kfd_criu_queue_priv_data *qd,
+   const void *restore_mqd)
 {
int retval;
struct mqd_manager *mqd_mgr;
@@ -1385,8 +1393,12 @@ static int create_queue_cpsch(struct 
device_queue_manager *dqm, struct queue *q,
 * updates the is_evicted flag but is a no-op otherwise.
 */
q->properties.is_evicted = !!qpd->evicted;
-   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
-   >gart_mqd_addr, >properties);
+   if (qd)
+   mqd_mgr->restore_mqd(mqd_mgr, >mqd, q->mqd_mem_obj, 
>gart_mqd_addr,
+>properties, restore_mqd);
+   else
+   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
+   >gart_mqd_addr, >properties);
 
list_add(>list, >queues_list);
qpd->queue_count++;
@@ -1774,6 +1786,50 @@ static int get_wave_state(struct device_queue_manager 
*dqm,