Re: [Patch v4 13/24] drm/amdkfd: CRIU checkpoint and restore queue mqds
On 2021-12-22 7:37 p.m., Rajneesh Bhardwaj wrote: From: David Yat Sin Checkpoint contents of queue MQD's on CRIU dump and restore them during CRIU restore. Signed-off-by: David Yat Sin David has an update for this patch to fix up the doorbell offset in the restored SDMA MQD. Regards, Felix --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 2 +- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 72 +++- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 14 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 7 + .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 67 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 68 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 68 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 69 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 + .../amd/amdkfd/kfd_process_queue_manager.c| 158 -- 11 files changed, 506 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 3fb155f756fd..146879cd3f2b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -312,7 +312,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, p->pasid, dev->id); - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, NULL, &doorbell_offset_in_process); if (err != 0) goto err_create_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index 0c50e67e2b51..3a5303ebcabf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -185,7 +185,7 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) properties.type = KFD_QUEUE_TYPE_DIQ; status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, - &properties, &qid, NULL, NULL); + &properties, &qid, NULL, NULL, NULL); if (status) { pr_err("Failed to create DIQ\n"); diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index a0f5b8533a03..a92274f9f1f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -331,7 +331,8 @@ static void deallocate_vmid(struct device_queue_manager *dqm, static int create_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd, - const struct kfd_criu_queue_priv_data *qd) + const struct kfd_criu_queue_priv_data *qd, + const void *restore_mqd) { struct mqd_manager *mqd_mgr; int retval; @@ -390,8 +391,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, retval = -ENOMEM; goto out_deallocate_doorbell; } - mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); + + if (qd) + mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr, + &q->properties, restore_mqd); + else + mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (q->properties.is_active) { if (!dqm->sched_running) { WARN_ONCE(1, "Load non-HWS mqd while stopped\n"); @@ -1339,7 +1346,8 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd, - const struct kfd_criu_queue_priv_data *qd) + const struct kfd_criu_queue_priv_data *qd, + const void *restore_mqd) { int retval; struct mqd_manager *mqd_mgr; @@ -1385,8 +1393,12 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, * updates the is_evicted flag but is a no-op otherwise. */ q->properties.is_evicted = !!qpd->evicted; - mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); + if (qd) + mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr, + &q->properties, restore_mqd); + else + mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, +
[Patch v4 13/24] drm/amdkfd: CRIU checkpoint and restore queue mqds
From: David Yat Sin Checkpoint contents of queue MQD's on CRIU dump and restore them during CRIU restore. Signed-off-by: David Yat Sin --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 2 +- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 72 +++- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 14 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 7 + .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 67 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 68 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 68 .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 69 drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 + .../amd/amdkfd/kfd_process_queue_manager.c| 158 -- 11 files changed, 506 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 3fb155f756fd..146879cd3f2b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -312,7 +312,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, p->pasid, dev->id); - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, NULL, &doorbell_offset_in_process); if (err != 0) goto err_create_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index 0c50e67e2b51..3a5303ebcabf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -185,7 +185,7 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) properties.type = KFD_QUEUE_TYPE_DIQ; status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, - &properties, &qid, NULL, NULL); + &properties, &qid, NULL, NULL, NULL); if (status) { pr_err("Failed to create DIQ\n"); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index a0f5b8533a03..a92274f9f1f7 100644 --- 
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -331,7 +331,8 @@ static void deallocate_vmid(struct device_queue_manager *dqm, static int create_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd, - const struct kfd_criu_queue_priv_data *qd) + const struct kfd_criu_queue_priv_data *qd, + const void *restore_mqd) { struct mqd_manager *mqd_mgr; int retval; @@ -390,8 +391,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, retval = -ENOMEM; goto out_deallocate_doorbell; } - mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); + + if (qd) + mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr, + &q->properties, restore_mqd); + else + mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (q->properties.is_active) { if (!dqm->sched_running) { WARN_ONCE(1, "Load non-HWS mqd while stopped\n"); @@ -1339,7 +1346,8 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd, - const struct kfd_criu_queue_priv_data *qd) + const struct kfd_criu_queue_priv_data *qd, + const void *restore_mqd) { int retval; struct mqd_manager *mqd_mgr; @@ -1385,8 +1393,12 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, * updates the is_evicted flag but is a no-op otherwise. */ q->properties.is_evicted = !!qpd->evicted; - mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); + if (qd) + mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr, + &q->properties, restore_mqd); + else + mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); list_add(&q->list, &qpd->queues_list); qpd->queue_count++; @@ -1774,6 +1786,50 @@ static int get_wave_state(struct device_queue_manager *dqm,