This patch implements full Streaming Performance Monitor (SPM) support for AMD GPUs, enabling real-time performance counter data collection directly to userspace buffers.
The implementation provides three key operations:
- AMDGPU_SPM_OP_ACQUIRE: Acquire exclusive access to SPM hardware resources
- AMDGPU_SPM_OP_RELEASE: Release SPM hardware for use by other processes
- AMDGPU_SPM_OP_SET_DEST_BUF: Configure destination buffer and manage counter
  data streaming

Key features:
- Flexible destination buffer management with configurable timeout behavior
- Automatic detection and reporting of data loss due to ring buffer overflow
- Support for partial buffer fills with explicit data size reporting
- Detailed kernel API documentation with operation semantics

The amdgpu_spm_setdestbuff() function allows profiling tools to efficiently
collect performance data from the GPU. The timeout mechanism enables waiting
for a buffer to fill completely before switching to a new one, or immediate
switching with partial data preservation.

This enables performance analysis tools and profiling frameworks to access
real-time GPU performance metrics without kernel-mode overhead.
Signed-off-by: James Zhu <[email protected]>
---
 amdgpu/amdgpu-symbols.txt |  3 ++
 amdgpu/amdgpu.h           | 38 ++++++++++++++++
 amdgpu/amdgpu_profiler.c  | 68 +++++++++++++++++++++++++++++
 include/drm/amdgpu_drm.h  | 92 ++++++++++++++++++++++++++++++++++++++-
 4 files changed, 200 insertions(+), 1 deletion(-)

diff --git a/amdgpu/amdgpu-symbols.txt b/amdgpu/amdgpu-symbols.txt
index 8cd5559c..b33958ef 100644
--- a/amdgpu/amdgpu-symbols.txt
+++ b/amdgpu/amdgpu-symbols.txt
@@ -88,3 +88,6 @@ amdgpu_create_userqueue
 amdgpu_free_userqueue
 amdgpu_userq_signal
 amdgpu_userq_wait
+amdgpu_spm_acquire
+amdgpu_spm_release
+amdgpu_spm_setdestbuff
diff --git a/amdgpu/amdgpu.h b/amdgpu/amdgpu.h
index 4ec1f6b6..8698f26a 100644
--- a/amdgpu/amdgpu.h
+++ b/amdgpu/amdgpu.h
@@ -2129,7 +2129,45 @@ int amdgpu_cwsr_set_l2_trap_handler(amdgpu_device_handle dev,
  */
 int amdgpu_profiler_version(amdgpu_device_handle dev);
 
+/**
+ * Request exclusive use of SPM
+ * \param dev - \c [in] device handle
+ *
+ * \return 0 on success otherwise POSIX Error code
+ */
+int amdgpu_spm_acquire(amdgpu_device_handle dev);
+
+/**
+ * Release exclusive use of SPM
+ * \param dev - \c [in] device handle
+ *
+ * \return 0 on success otherwise POSIX Error code
+ */
+int amdgpu_spm_release(amdgpu_device_handle dev);
+
+/**
+ * Set up the destination user mode buffer for stream performance
+ * counter data.
+ * \param dev - \c [in] device handle
+ * \param size_in_bytes - \c [in] size of the buffer
+ * \param timeout - \c [in/out] timeout in milliseconds; must not be NULL
+ * \param size_copied - \c [out] number of bytes copied; must not be NULL
+ * \param dest_mem_addr - \c [in] destination address. Set to NULL
+ *                        to stop copy on previous buffer
+ * \param is_spm_data_loss - \c [out] true if data was lost; must not be NULL
+ *
+ * \return 0 on success otherwise POSIX Error code
+ */
+int amdgpu_spm_setdestbuff(
+	amdgpu_device_handle dev,
+	uint32_t size_in_bytes,
+	uint32_t *timeout,
+	uint32_t *size_copied,
+	void *dest_mem_addr,
+	bool *is_spm_data_loss
+	);
+
 #ifdef __cplusplus
 }
 #endif
 #endif /* #ifdef _AMDGPU_H_ */
diff --git a/amdgpu/amdgpu_profiler.c b/amdgpu/amdgpu_profiler.c
index 8d4dffe4..e9d30fb6 100644
--- a/amdgpu/amdgpu_profiler.c
+++ b/amdgpu/amdgpu_profiler.c
@@ -44,3 +44,71 @@ amdgpu_profiler_version(amdgpu_device_handle dev)
 
 	return ret;
 }
+
+drm_public int
+amdgpu_spm_acquire(amdgpu_device_handle dev)
+{
+	int ret;
+	struct drm_amdgpu_profiler_args user_arg;
+
+	if (!dev)
+		return -EINVAL;
+
+	memset(&user_arg, 0, sizeof(user_arg));
+	user_arg.op = AMDGPU_PROFILER_SPM;
+	user_arg.spm.op = AMDGPU_SPM_OP_ACQUIRE;
+
+	ret = drmCommandWriteRead(dev->fd, DRM_AMDGPU_PROFILER,
+				  &user_arg, sizeof(user_arg));
+
+	return ret;
+}
+
+drm_public int
+amdgpu_spm_release(amdgpu_device_handle dev)
+{
+	struct drm_amdgpu_profiler_args user_arg;
+
+	if (!dev)
+		return -EINVAL;
+
+	memset(&user_arg, 0, sizeof(user_arg));
+	user_arg.op = AMDGPU_PROFILER_SPM;
+	user_arg.spm.op = AMDGPU_SPM_OP_RELEASE;
+
+	return drmCommandWriteRead(dev->fd, DRM_AMDGPU_PROFILER,
+				   &user_arg, sizeof(user_arg));
+}
+
+drm_public int
+amdgpu_spm_setdestbuff(
+	amdgpu_device_handle dev,
+	uint32_t size_in_bytes,
+	uint32_t *timeout,
+	uint32_t *size_copied,
+	void *dest_mem_addr,
+	bool *is_spm_data_loss
+	)
+{
+	int ret;
+	struct drm_amdgpu_profiler_args user_arg;
+
+	if (!dev || !timeout || !size_copied || !is_spm_data_loss)
+		return -EINVAL;
+
+	memset(&user_arg, 0, sizeof(user_arg));
+	user_arg.op = AMDGPU_PROFILER_SPM;
+	user_arg.spm.op = AMDGPU_SPM_OP_SET_DEST_BUF;
+	user_arg.spm.timeout = *timeout;
+	user_arg.spm.dest_buf = (uint64_t)(uintptr_t)dest_mem_addr;
+	user_arg.spm.buf_size = size_in_bytes;
+
+	ret = drmCommandWriteRead(dev->fd, DRM_AMDGPU_PROFILER,
+				  &user_arg, sizeof(user_arg));
+
+	*size_copied = user_arg.spm.bytes_copied;
+	*is_spm_data_loss = user_arg.spm.has_data_loss;
+	*timeout = user_arg.spm.timeout;
+
+	return ret;
+}
diff --git a/include/drm/amdgpu_drm.h b/include/drm/amdgpu_drm.h
index 307242ac..60c73233 100644
--- a/include/drm/amdgpu_drm.h
+++ b/include/drm/amdgpu_drm.h
@@ -1698,10 +1698,99 @@ struct drm_amdgpu_info_gpuvm_fault {
 #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */
 
 /*
- * Supported Profiler Operations
+ * Supported SPM (Stream Performance Monitor) Operations
  */
+/**
+ * drm_amdgpu_spm_op - SPM ioctl operations
+ *
+ * @AMDGPU_SPM_OP_ACQUIRE: acquire exclusive access to SPM
+ * @AMDGPU_SPM_OP_RELEASE: release exclusive access to SPM
+ * @AMDGPU_SPM_OP_SET_DEST_BUF: set or unset destination buffer for SPM streaming
+ */
+enum drm_amdgpu_spm_op {
+	AMDGPU_SPM_OP_ACQUIRE,
+	AMDGPU_SPM_OP_RELEASE,
+	AMDGPU_SPM_OP_SET_DEST_BUF
+};
+
+/**
+ * drm_amdgpu_spm_args - Arguments for SPM ioctl
+ *
+ * @op[in]: specifies the operation to perform
+ * @dest_buf[in]: used for the address of the destination buffer
+ *                in @AMDGPU_SPM_OP_SET_DEST_BUF
+ * @buf_size[in]: size of the destination buffer
+ * @timeout[in/out]: [in]: timeout in milliseconds, [out]: amount of time left
+ *                   in the timeout window
+ * @bytes_copied[out]: total amount of data that was copied to the previous dest_buf
+ * @has_data_loss[out]: number of sub-blocks that lost data
+ *
+ * This ioctl performs different functions depending on the @op parameter.
+ *
+ * AMDGPU_SPM_OP_ACQUIRE
+ * ------------------------
+ *
+ * Acquires exclusive access to SPM on the specified device for the calling process.
+ * This must be called before using AMDGPU_SPM_OP_SET_DEST_BUF.
+ *
+ * AMDGPU_SPM_OP_RELEASE
+ * ------------------------
+ *
+ * Releases exclusive access to SPM on the specified device for the calling process,
+ * which allows another process to acquire it in the future.
+ *
+ * AMDGPU_SPM_OP_SET_DEST_BUF
+ * -----------------------------
+ *
+ * If @dest_buf is NULL, the destination buffer address is unset and copying of counters
+ * is stopped.
+ *
+ * If @dest_buf is not NULL, it specifies the pointer to a new destination buffer.
+ * @buf_size specifies the size of the buffer.
+ *
+ * If @timeout is non-0, the call will wait for up to @timeout ms for the previous
+ * buffer to be filled. If the previous buffer is filled before the timeout expires,
+ * @timeout is updated with the time remaining. If the timeout is exceeded, the function
+ * copies any partial data available into the previous user buffer and returns success.
+ * The amount of valid data in the previous user buffer is indicated by @bytes_copied.
+ *
+ * If @timeout is 0, the function immediately replaces the previous destination buffer
+ * without waiting for the previous buffer to be filled. That means the previous buffer
+ * may only be partially filled, and @bytes_copied will indicate how much data has been
+ * copied to it.
+ *
+ * If data was lost, e.g. due to a ring buffer overflow, @has_data_loss will be non-0.
+ *
+ * Returns negative error code on failure, 0 on success.
+ */
+struct drm_amdgpu_spm_args {
+	__u64 dest_buf;
+	__u32 buf_size;
+	__u32 op;
+	__u32 timeout;
+	__u32 bytes_copied;
+	__u32 has_data_loss;
+	__u32 pad;
+};
+
+/**
+ * drm_amdgpu_spm_buffer_header - SPM Buffer header for drm_amdgpu_spm_args->dest_buf
+ *
+ * @version [out]: spm version
+ * @bytes_copied [out]: amount of data for each sub-block
+ * @has_data_loss [out]: boolean indicating whether data was lost for each sub-block
+ *                       (e.g. due to a ring-buffer overflow)
+ */
+struct drm_amdgpu_spm_buffer_header {
+	__u32 version; /* 0-23: minor 24-31: major */
+	__u32 bytes_copied;
+	__u32 has_data_loss;
+	__u32 reserved[5];
+};
+
 enum drm_amdgpu_profiler_ops {
 	AMDGPU_PROFILER_VERSION = 0,
+	AMDGPU_PROFILER_SPM,
 };
 
 struct drm_amdgpu_profiler_args {
@@ -1711,6 +1800,7 @@ struct drm_amdgpu_profiler_args {
 		 * lower 16 bit: minor
 		 * higher 16 bit: major
 		 */
+		struct drm_amdgpu_spm_args spm;
 	};
 };
 
-- 
2.34.1
