Update the way drm_coredump_printer is used based on its documentation and Xe's code: the main idea is to generate the final version in one go and then use memcpy to return the chunks requested by the caller of amdgpu_devcoredump_read.
The generation is moved to a separate worker thread. This cuts the time to copy the dump from 40s to ~0s on my machine. --- v3: - removed adev->coredump_in_progress and instead use work as the synchronisation mechanism - use kvfree instead of kfree --- Signed-off-by: Pierre-Eric Pelloux-Prayer <[email protected]> Acked-by: Alex Deucher <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 ++ .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 83 +++++++++++++++++-- .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h | 7 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + 4 files changed, 91 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 057c8bd2ad89..e31dac2421b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -328,6 +328,7 @@ struct kfd_vm_fault_info; struct amdgpu_hive_info; struct amdgpu_reset_context; struct amdgpu_reset_control; +struct amdgpu_coredump_info; enum amdgpu_cp_irq { AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP = 0, @@ -1200,6 +1201,11 @@ struct amdgpu_device { struct amdgpu_reset_domain *reset_domain; +#ifdef CONFIG_DEV_COREDUMP + struct amdgpu_coredump_info *coredump; + struct work_struct coredump_work; +#endif + struct mutex benchmark_mutex; bool scpm_enabled; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c index 42a969512dcc..0c7fc3800f17 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -32,8 +32,13 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, bool vram_lost, struct amdgpu_job *job) { } +void amdgpu_coredump_init(struct amdgpu_device *adev) +{ +} #else +#define AMDGPU_CORE_DUMP_SIZE_MAX (256 * 1024 * 1024) + const char *hw_ip_names[MAX_HWIP] = { [GC_HWIP] = "GC", [HDP_HWIP] = "HDP", @@ -196,11 +201,9 @@ static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, } static ssize_t -amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, - void *data, size_t datalen) +amdgpu_devcoredump_format(char *buffer, size_t count, struct amdgpu_coredump_info *coredump) { struct drm_printer p; - struct amdgpu_coredump_info *coredump = data; struct drm_print_iterator iter; struct amdgpu_vm_fault_info *fault_info; struct amdgpu_ip_block *ip_block; @@ -208,7 +211,6 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, iter.data = buffer; iter.offset = 0; - iter.start = offset; iter.remain = count; p = drm_coredump_printer(&iter); @@ -323,9 +325,63 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, return count - iter.remain; } +static ssize_t +amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, + void *data, size_t datalen) +{ + struct amdgpu_coredump_info *coredump = data; + ssize_t byte_copied; + + if (!coredump) + return -ENODEV; + + if (!coredump->formatted) + return -ENODEV; + + if (offset >= coredump->formatted_size) + return 0; + + byte_copied = count < coredump->formatted_size - offset ? count : + coredump->formatted_size - offset; + memcpy(buffer, coredump->formatted + offset, byte_copied); + + return byte_copied; +} + static void amdgpu_devcoredump_free(void *data) { - kfree(data); + struct amdgpu_coredump_info *coredump = data; + + kvfree(coredump->formatted); + kvfree(data); +} + +static void amdgpu_devcoredump_deferred_work(struct work_struct *work) +{ + struct amdgpu_device *adev = container_of(work, typeof(*adev), coredump_work); + struct amdgpu_coredump_info *coredump = adev->coredump; + + /* Do a one-time preparation of the coredump output because + * repeatingly calling drm_coredump_printer is very slow. + */ + coredump->formatted_size = amdgpu_devcoredump_format( + NULL, AMDGPU_CORE_DUMP_SIZE_MAX, coredump); + coredump->formatted = kvzalloc(coredump->formatted_size, GFP_KERNEL); + if (!coredump->formatted) { + amdgpu_devcoredump_free(coredump); + goto end; + } + + amdgpu_devcoredump_format(coredump->formatted, coredump->formatted_size, coredump); + + /* If there's an existing coredump for this device, the free function will be + * called immediately so coredump might be invalid after the call to dev_coredumpm. + */ + dev_coredumpm(coredump->adev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, + amdgpu_devcoredump_read, amdgpu_devcoredump_free); + +end: + adev->coredump = NULL; } void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, @@ -335,6 +391,10 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, struct amdgpu_coredump_info *coredump; struct drm_sched_job *s_job; + /* No need to generate a new coredump if there's one in progress already. */ + if (work_pending(&adev->coredump_work)) + return; + coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); if (!coredump) return; @@ -361,11 +421,20 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, ktime_get_ts64(&coredump->reset_time); - dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, - amdgpu_devcoredump_read, amdgpu_devcoredump_free); + /* Update the current coredump pointer (no lock needed, this function can only be called + * from a single thread) + */ + adev->coredump = coredump; + /* Kick off coredump formatting to a worker thread. */ + queue_work(system_unbound_wq, &adev->coredump_work); drm_info(dev, "AMDGPU device coredump file has been created\n"); drm_info(dev, "Check your /sys/class/drm/card%d/device/devcoredump/data\n", dev->primary->index); } + +void amdgpu_coredump_init(struct amdgpu_device *adev) +{ + INIT_WORK(&adev->coredump_work, amdgpu_devcoredump_deferred_work); +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h index ef9772c6bcc9..b3582d0b4ca4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h @@ -35,12 +35,19 @@ struct amdgpu_coredump_info { struct amdgpu_device *adev; struct amdgpu_task_info reset_task_info; struct timespec64 reset_time; + bool skip_vram_check; bool reset_vram_lost; struct amdgpu_ring *ring; + /* Readable form of coredevdump, generate once to speed up + * reading it (see drm_coredump_printer's documentation). + */ + ssize_t formatted_size; + char *formatted; }; #endif void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, bool vram_lost, struct amdgpu_job *job); +void amdgpu_coredump_init(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 48540300b10a..1cb88955f651 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4503,6 +4503,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); + amdgpu_coredump_init(adev); + adev->gfx.gfx_off_req_count = 1; adev->gfx.gfx_off_residency = 0; adev->gfx.gfx_off_entrycount = 0; -- 2.43.0
