Re: [PATCH 5/5] drm/amdkfd: Fix a concurrency issue during kfd recovery
Patches 3 and 5 are Reviewed-by: Felix Kuehling Am 2021-07-16 um 11:36 a.m. schrieb Oak Zeng: > start_cpsch and stop_cpsch can be called during kfd device > initialization or during gpu reset/recovery. So they can > run concurrently. Currently in start_cpsch and stop_cpsch, > pm_init and pm_uninit is not protected by the dpm lock. > Imagine such a case that user use packet manager's function > to submit a pm4 packet to hang hws (ie through command > cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee > /sys/kernel/debug/kfd/hang_hws), while kfd device is under > device reset/recovery so packet manager can be not initialized. > There will be unpredictable protection fault in such case. > > This patch moves pm_init/uninit inside the dpm lock and check > packet manager is initialized before using packet manager > function. > > Signed-off-by: Oak Zeng > Acked-by: Christian Konig > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +--- > drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 +--- > drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- > 4 files changed, 14 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index 03875d2..56b5010 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, > uint32_t throttle_bitmask) > */ > int kfd_debugfs_hang_hws(struct kfd_dev *dev) > { > - int r = 0; > - > if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { > pr_err("HWS is not enabled"); > return -EINVAL; > } > > - r = pm_debugfs_hang_hws(&dev->dqm->packet_mgr); > - if (!r) > - r = dqm_debugfs_execute_queues(dev->dqm); > - > - return r; > + return dqm_debugfs_hang_hws(dev->dqm); > } > > #endif > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 6b2f594..6b89ca6 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) > > retval = 0; > > + dqm_lock(dqm); > retval = pm_init(&dqm->packet_mgr, dqm); > if (retval) > goto fail_packet_manager_init; > @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) > > init_interrupts(dqm); > > - dqm_lock(dqm); > /* clear hang status when driver try to start the hw scheduler */ > dqm->is_hws_hang = false; > dqm->is_resetting = false; > @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) > fail_set_sched_resources: > pm_uninit(&dqm->packet_mgr, false); > fail_packet_manager_init: > + dqm_unlock(dqm); > return retval; > } > > @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager > *dqm) > unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > hanging = dqm->is_hws_hang || dqm->is_resetting; > dqm->sched_running = false; > - dqm_unlock(dqm); > > pm_release_ib(&dqm->packet_mgr); > > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > pm_uninit(&dqm->packet_mgr, hanging); > + dqm_unlock(dqm); > > return 0; > } > @@ -2099,11 +2100,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) > return r; > } > > -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) > +int dqm_debugfs_hang_hws(struct device_queue_manager *dqm) > { > int r = 0; > > dqm_lock(dqm); > + r = pm_debugfs_hang_hws(&dqm->packet_mgr); > + if (r) { > + dqm_unlock(dqm); > + return r; > + } > dqm->active_runlist = true; > r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > dqm_unlock(dqm); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > index b130cc0..b33ebe8 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) > uint32_t *buffer, size; > int r = 0; > > + if (!pm->priv_queue) > + return -EAGAIN; > + > size = pm->pmf->query_status_size; > mutex_lock(&pm->lock); > kq_acquire_packet_buffer(pm->priv_queue, > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 3426743..8a5dfda 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -1194,7 +1194,7 @@ int pm_debugfs_runlist(struct seq_file *m, voi
[PATCH 5/5] drm/amdkfd: Fix a concurrency issue during kfd recovery
start_cpsch and stop_cpsch can be called during kfd device initialization or during gpu reset/recovery. So they can run concurrently. Currently in start_cpsch and stop_cpsch, pm_init and pm_uninit is not protected by the dpm lock. Imagine such a case that user use packet manager's function to submit a pm4 packet to hang hws (ie through command cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee /sys/kernel/debug/kfd/hang_hws), while kfd device is under device reset/recovery so packet manager can be not initialized. There will be unpredictable protection fault in such case. This patch moves pm_init/uninit inside the dpm lock and check packet manager is initialized before using packet manager function. Signed-off-by: Oak Zeng Acked-by: Christian Konig --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +--- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 +--- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 03875d2..56b5010 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) */ int kfd_debugfs_hang_hws(struct kfd_dev *dev) { - int r = 0; - if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { pr_err("HWS is not enabled"); return -EINVAL; } - r = pm_debugfs_hang_hws(&dev->dqm->packet_mgr); - if (!r) - r = dqm_debugfs_execute_queues(dev->dqm); - - return r; + return dqm_debugfs_hang_hws(dev->dqm); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 6b2f594..6b89ca6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; + dqm_lock(dqm); retval = pm_init(&dqm->packet_mgr, dqm); if (retval) goto fail_packet_manager_init; @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; dqm->is_resetting = false; @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) fail_set_sched_resources: pm_uninit(&dqm->packet_mgr, false); fail_packet_manager_init: + dqm_unlock(dqm); return retval; } @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; - dqm_unlock(dqm); pm_release_ib(&dqm->packet_mgr); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->packet_mgr, hanging); + dqm_unlock(dqm); return 0; } @@ -2099,11 +2100,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) return r; } -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) +int dqm_debugfs_hang_hws(struct device_queue_manager *dqm) { int r = 0; dqm_lock(dqm); + r = pm_debugfs_hang_hws(&dqm->packet_mgr); + if (r) { + dqm_unlock(dqm); + return r; + } dqm->active_runlist = true; r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); dqm_unlock(dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index b130cc0..b33ebe8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) uint32_t *buffer, size; int r = 0; + if (!pm->priv_queue) + return -EAGAIN; + size = pm->pmf->query_status_size; mutex_lock(&pm->lock); kq_acquire_packet_buffer(pm->priv_queue, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 3426743..8a5dfda 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1194,7 +1194,7 @@ int pm_debugfs_runlist(struct seq_file *m, void *data); int kfd_debugfs_hang_hws(struct kfd_dev *dev); int pm_debugfs_hang_hws(struct packet_manager *pm); -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm); +int dqm_debugfs_hang_hws(struct device_queue_manager *dqm); #else -- 2.7.4
Re: [PATCH 5/5] drm/amdkfd: Fix a concurrency issue during kfd recovery
Am 2021-07-15 um 9:34 p.m. schrieb Oak Zeng: > start_cpsch and stop_cpsch can be called during kfd device > initialization or during gpu reset/recovery. So they can > run concurrently. Currently in start_cpsch and stop_cpsch, > pm_init and pm_uninit is not protected by the dpm lock. > Imagine such a case that user use packet manager's function > to submit a pm4 packet to hang hws (ie through command > cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee > /sys/kernel/debug/kfd/hang_hws), while kfd device is under > device reset/recovery so packet manager can be not initialized. > There will be unpredictable protection fault in such case. > > This patch moves pm_init/uninit inside the dpm lock and check > packet manager is initialized before using packet manager > function. > > Signed-off-by: Oak Zeng > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +--- > drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 -- > drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ > 3 files changed, 12 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index c51402b..adc2342 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, > uint32_t throttle_bitmask) > */ > int kfd_debugfs_hang_hws(struct kfd_dev *dev) > { > - int r = 0; > - > if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { > pr_err("HWS is not enabled"); > return -EINVAL; > } > > - r = pm_debugfs_hang_hws(&dev->dqm->dpm); > - if (!r) > - r = dqm_debugfs_execute_queues(dev->dqm); > - > - return r; > + return dqm_debugfs_execute_queues(dev->dqm); This function should now probably be renamed to something like dqm_debugfs_hang_hws. Other than that, this patch looks good to me. Regards, Felix > } > > #endif > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index f2984d3..44136d2 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) > > retval = 0; > > + dqm_lock(dqm); > retval = pm_init(&dqm->dpm, dqm); > if (retval) > goto fail_packet_manager_init; > @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) > > init_interrupts(dqm); > > - dqm_lock(dqm); > /* clear hang status when driver try to start the hw scheduler */ > dqm->is_hws_hang = false; > dqm->is_resetting = false; > @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) > fail_set_sched_resources: > pm_uninit(&dqm->dpm, false); > fail_packet_manager_init: > + dqm_unlock(dqm); > return retval; > } > > @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager > *dqm) > unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > hanging = dqm->is_hws_hang || dqm->is_resetting; > dqm->sched_running = false; > - dqm_unlock(dqm); > > pm_release_ib(&dqm->dpm); > > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > pm_uninit(&dqm->dpm, hanging); > + dqm_unlock(dqm); > > return 0; > } > @@ -2104,6 +2105,11 @@ int dqm_debugfs_execute_queues(struct > device_queue_manager *dqm) > int r = 0; > > dqm_lock(dqm); > + r = pm_debugfs_hang_hws(&dqm->dpm); > + if (r) { > + dqm_unlock(dqm); > + return r; > + } > dqm->active_runlist = true; > r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > dqm_unlock(dqm); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > index b130cc0..0123e64 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) > uint32_t *buffer, size; > int r = 0; > > + if (!pm->priv_queue) > + return -EBUSY; > + > size = pm->pmf->query_status_size; > mutex_lock(&pm->lock); > kq_acquire_packet_buffer(pm->priv_queue, ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 5/5] drm/amdkfd: Fix a concurrency issue during kfd recovery
On 7/16/2021 7:04 AM, Oak Zeng wrote: start_cpsch and stop_cpsch can be called during kfd device initialization or during gpu reset/recovery. So they can run concurrently. Currently in start_cpsch and stop_cpsch, pm_init and pm_uninit is not protected by the dpm lock. Imagine such a case that user use packet manager's function to submit a pm4 packet to hang hws (ie through command cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee /sys/kernel/debug/kfd/hang_hws), while kfd device is under device reset/recovery so packet manager can be not initialized. There will be unpredictable protection fault in such case. This patch moves pm_init/uninit inside the dpm lock and check packet manager is initialized before using packet manager function. Signed-off-by: Oak Zeng --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +--- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 -- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c51402b..adc2342 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) */ int kfd_debugfs_hang_hws(struct kfd_dev *dev) { - int r = 0; - if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { pr_err("HWS is not enabled"); return -EINVAL; } - r = pm_debugfs_hang_hws(&dev->dqm->dpm); - if (!r) - r = dqm_debugfs_execute_queues(dev->dqm); - - return r; + return dqm_debugfs_execute_queues(dev->dqm); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f2984d3..44136d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; + dqm_lock(dqm); retval = pm_init(&dqm->dpm, dqm); if (retval) goto fail_packet_manager_init; @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; dqm->is_resetting = false; @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) fail_set_sched_resources: pm_uninit(&dqm->dpm, false); fail_packet_manager_init: + dqm_unlock(dqm); return retval; } @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; - dqm_unlock(dqm); pm_release_ib(&dqm->dpm); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->dpm, hanging); + dqm_unlock(dqm); return 0; } @@ -2104,6 +2105,11 @@ int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) int r = 0; dqm_lock(dqm); + r = pm_debugfs_hang_hws(&dqm->dpm); execute_queues look like a misnomer now as it combines the logic to hang HWS. + if (r) { + dqm_unlock(dqm); + return r; + } dqm->active_runlist = true; r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); dqm_unlock(dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index b130cc0..0123e64 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) uint32_t *buffer, size; int r = 0; + if (!pm->priv_queue) + return -EBUSY; + EAGAIN may be more appropriate Thanks, Lijo size = pm->pmf->query_status_size; mutex_lock(&pm->lock); kq_acquire_packet_buffer(pm->priv_queue, ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 5/5] drm/amdkfd: Fix a concurrency issue during kfd recovery
start_cpsch and stop_cpsch can be called during kfd device initialization or during gpu reset/recovery. So they can run concurrently. Currently in start_cpsch and stop_cpsch, pm_init and pm_uninit is not protected by the dpm lock. Imagine such a case that user use packet manager's function to submit a pm4 packet to hang hws (ie through command cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee /sys/kernel/debug/kfd/hang_hws), while kfd device is under device reset/recovery so packet manager can be not initialized. There will be unpredictable protection fault in such case. This patch moves pm_init/uninit inside the dpm lock and check packet manager is initialized before using packet manager function. Signed-off-by: Oak Zeng --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +--- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 -- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c51402b..adc2342 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) */ int kfd_debugfs_hang_hws(struct kfd_dev *dev) { - int r = 0; - if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { pr_err("HWS is not enabled"); return -EINVAL; } - r = pm_debugfs_hang_hws(&dev->dqm->dpm); - if (!r) - r = dqm_debugfs_execute_queues(dev->dqm); - - return r; + return dqm_debugfs_execute_queues(dev->dqm); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f2984d3..44136d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; + dqm_lock(dqm); retval = pm_init(&dqm->dpm, dqm); if (retval) goto fail_packet_manager_init; @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; dqm->is_resetting = false; @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) fail_set_sched_resources: pm_uninit(&dqm->dpm, false); fail_packet_manager_init: + dqm_unlock(dqm); return retval; } @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; - dqm_unlock(dqm); pm_release_ib(&dqm->dpm); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->dpm, hanging); + dqm_unlock(dqm); return 0; } @@ -2104,6 +2105,11 @@ int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) int r = 0; dqm_lock(dqm); + r = pm_debugfs_hang_hws(&dqm->dpm); + if (r) { + dqm_unlock(dqm); + return r; + } dqm->active_runlist = true; r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); dqm_unlock(dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index b130cc0..0123e64 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) uint32_t *buffer, size; int r = 0; + if (!pm->priv_queue) + return -EBUSY; + size = pm->pmf->query_status_size; mutex_lock(&pm->lock); kq_acquire_packet_buffer(pm->priv_queue, -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 5/5] drm/amdkfd: Fix a concurrency issue during kfd recovery
start_cpsch and stop_cpsch can be called during kfd device initialization or during gpu reset/recovery. So they can run concurrently. Currently in start_cpsch and stop_cpsch, pm_init and pm_uninit is not protected by the dpm lock. Imagine such a case that user use packet manager's function to submit a pm4 packet to hang hws (ie through command cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee /sys/kernel/debug/kfd/hang_hws), while kfd device is under device reset/recovery so packet manager can be not initialized. There will be unpredictable protection fault in such case. This patch moves pm_init/uninit inside the dpm lock and check packet manager is initialized before using packet manager function. Signed-off-by: Oak Zeng --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 +--- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 -- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 3 +++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c51402b..adc2342 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1383,18 +1383,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) */ int kfd_debugfs_hang_hws(struct kfd_dev *dev) { - int r = 0; - if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { pr_err("HWS is not enabled"); return -EINVAL; } - r = pm_debugfs_hang_hws(&dev->dqm->dpm); - if (!r) - r = dqm_debugfs_execute_queues(dev->dqm); - - return r; + return dqm_debugfs_execute_queues(dev->dqm); } #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f2984d3..44136d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; + dqm_lock(dqm); retval = pm_init(&dqm->dpm, dqm); if (retval) goto fail_packet_manager_init; @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; dqm->is_resetting = false; @@ -1199,6 +1199,7 @@ static int start_cpsch(struct device_queue_manager *dqm) fail_set_sched_resources: pm_uninit(&dqm->dpm, false); fail_packet_manager_init: + dqm_unlock(dqm); return retval; } @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; - dqm_unlock(dqm); pm_release_ib(&dqm->dpm); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->dpm, hanging); + dqm_unlock(dqm); return 0; } @@ -2104,6 +2105,11 @@ int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) int r = 0; dqm_lock(dqm); + r = pm_debugfs_hang_hws(&dqm->dpm); + if (r) { + dqm_unlock(dqm); + return r; + } dqm->active_runlist = true; r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); dqm_unlock(dqm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index b130cc0..0123e64 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -448,6 +448,9 @@ int pm_debugfs_hang_hws(struct packet_manager *pm) uint32_t *buffer, size; int r = 0; + if (!pm->priv_queue) + return -EBUSY; + size = pm->pmf->query_status_size; mutex_lock(&pm->lock); kq_acquire_packet_buffer(pm->priv_queue, -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx