On 4/6/26 18:25, Tom St Denis wrote:
> The gpu_metrics sysfs file carries a binary blob but was implemented as
> a text device_attribute, which imposes a hard PAGE_SIZE-1 cap and makes
> it impossible for userspace to distinguish a successful short read from
> a truncated one (stat reports 4 KiB regardless of actual payload size).
>
> Convert gpu_metrics to a bin_attribute. The declared file size is an
> upper bound (128 KiB); the real payload length is the byte count
> returned before EOF.
>
> To guarantee that multi-chunk reads (which kernfs splits at PAGE_SIZE
> boundaries) return a coherent snapshot, each reader (identified by its
> struct file pointer) gets its own cached metrics buffer via an xarray.
> When offset is 0, PMFW is sampled and stored in that reader's entry;
> subsequent offsets are served from the same snapshot. The entry is
> freed on EOF. A mutex serialises xarray mutations and PMFW access.
>
> Stale entries from readers that close without reaching EOF (e.g.,
> killed processes) are lazily evicted after 30 seconds whenever any
> new reader starts.
>
> This per-reader approach avoids the cross-contamination problem of
> a single shared cache: concurrent readers each see their own coherent
> PMFW snapshot rather than risking one reader's off=0 overwriting the
> buffer while another reader is mid-way through a multi-chunk read.
>
> V3: Per-reader snapshot state, so each open fd gets its own cached
> gpu_metrics buffer and all chunks for that reader come from the
> same sample (Vitaly)
>
> V4: Remove the residual PAGE_SIZE cap that v3 still carried over from
> v1. The whole point of the bin_attribute conversion is to
> lift the old PAGE_SIZE-1 limit, but v3 still had:
> - WARN_ON_ONCE(len > PAGE_SIZE) with silent truncation to 4K
> - kzalloc(PAGE_SIZE) for the snapshot buffer (fixed allocation)
> - a separate memcpy from metrics into that fixed buffer
> In v4 the snapshot buffer is allocated with kmemdup() sized to the
> actual payload returned by amdgpu_dpm_get_gpu_metrics(), so payloads
> larger than PAGE_SIZE work correctly. The only remaining upper bound
> is AMDGPU_GPU_METRICS_BIN_ATTR_MAX_SZ (128 KiB), which is the
> declared bin_attribute file size and serves as a sanity check —
> exceeding it now returns -EOVERFLOW instead of silently truncating.
> (Tom)
>
> Signed-off-by: Tom St Denis <[email protected]>
> Acked-by: Vitaly Prosyak <[email protected]>
> Change-Id: Ib4fa233d9a25a396f1cc7d5fcf74f5f6578329f5
> ---
> drivers/gpu/drm/amd/pm/amdgpu_pm.c | 208 +++++++++++++++++++++---
> drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 4 +
> drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h | 1 -
> 3 files changed, 188 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> index a4d8e667eafb..303a2d643b10 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> @@ -34,8 +34,19 @@
> #include <linux/nospec.h>
> #include <linux/pm_runtime.h>
> #include <linux/string_choices.h>
> +#include <linux/sysfs.h>
> +#include <linux/sizes.h>
> +#include <linux/xarray.h>
> #include <asm/processor.h>
>
> +/*
> + * Sysfs reports this as the file size (stat/ls); kernfs also uses it to cap
> + * read offsets. Actual payload length is the return value of
> + * amdgpu_dpm_get_gpu_metrics() and must not exceed this.
> + */
> +#define AMDGPU_GPU_METRICS_BIN_ATTR_MAX_SZ SZ_128K
> +
> +
> #define MAX_NUM_OF_FEATURES_PER_SUBSET 8
> #define MAX_NUM_OF_SUBSETS 8
>
> @@ -1734,43 +1745,170 @@ static ssize_t amdgpu_get_pm_metrics(struct device
> *dev,
> * DOC: gpu_metrics
> *
> * The amdgpu driver provides a sysfs API for retrieving current gpu
> - * metrics data. The file gpu_metrics is used for this. Reading the
> - * file will dump all the current gpu metrics data.
> + * metrics data. The binary sysfs file gpu_metrics is used for this.
> + * Reading the file returns the raw metrics blob. The sysfs file size
> + * is an upper bound for inode metadata; the real length is the amount
> + * returned before EOF.
> + *
> + * Metrics are sampled atomically per reader: the first read at offset 0
> + * captures a snapshot into a per-fd cache; subsequent reads at higher
> + * offsets (for payloads that span multiple pages) are served from that
> + * same snapshot. Concurrent readers each get their own cache.
> *
> * These data include temperature, frequency, engines utilization,
> * power consume, throttler status, fan speed and cpu core statistics(
> * available for APU only). That's it will give a snapshot of all sensors
> * at the same time.
> */
> -static ssize_t amdgpu_get_gpu_metrics(struct device *dev,
> - struct device_attribute *attr,
> - char *buf)
> +
> +/* Per-reader snapshot entry, keyed by struct file pointer in the xarray */
> +struct gpu_metrics_snap_entry {
> + void *data;
> + size_t size;
> + ktime_t timestamp;
> +};
> +
> +/* Evict stale entries from readers that closed without reaching EOF */
> +#define GPU_METRICS_SNAP_STALE_NS (30ULL * NSEC_PER_SEC)
> +
> +static void gpu_metrics_evict_stale_locked(struct xarray *xa,
> + unsigned long skip_key)
> +{
> + struct gpu_metrics_snap_entry *entry;
> + unsigned long idx;
> + ktime_t cutoff;
> +
> + cutoff = ktime_sub(ktime_get(), ns_to_ktime(GPU_METRICS_SNAP_STALE_NS));
> +
> + xa_for_each(xa, idx, entry) {
> + if (idx != skip_key && ktime_before(entry->timestamp, cutoff)) {
> + xa_erase(xa, idx);
> + kfree(entry->data);
> + kfree(entry);
> + }
> + }
> +}
> +
> +static void gpu_metrics_free_all(struct xarray *xa)
> {
> + struct gpu_metrics_snap_entry *entry;
> + unsigned long idx;
> +
> + xa_for_each(xa, idx, entry) {
> + xa_erase(xa, idx);
> + kfree(entry->data);
> + kfree(entry);
> + }
> + xa_destroy(xa);
> +}
> +
> +static bool amdgpu_pm_gpu_metrics_bin_visible(struct amdgpu_device *adev,
> + uint32_t mask)
> +{
> + uint32_t gc_ver = amdgpu_ip_version(adev, GC_HWIP, 0);
> +
> + if (!((ATTR_FLAG_BASIC | ATTR_FLAG_ONEVF) & mask))
> + return false;
> +
> + return gc_ver >= IP_VERSION(9, 1, 0);
> +}
> +
> +static ssize_t amdgpu_sysfs_gpu_metrics_read(struct file *f,
> + struct kobject *kobj,
> + const struct bin_attribute *attr,
> + char *buf, loff_t off,
> + size_t count)
> +{
> + struct device *dev = kobj_to_dev(kobj);
> struct drm_device *ddev = dev_get_drvdata(dev);
> struct amdgpu_device *adev = drm_to_adev(ddev);
> - void *gpu_metrics;
> - ssize_t size = 0;
> - int ret;
> + unsigned long key = (unsigned long)f;
> + struct gpu_metrics_snap_entry *entry;
> + ssize_t ret;
>
> - ret = amdgpu_pm_get_access_if_active(adev);
> - if (ret)
> - return ret;
> + mutex_lock(&adev->pm.gpu_metrics_lock);
>
> - size = amdgpu_dpm_get_gpu_metrics(adev, &gpu_metrics);
> - if (size <= 0)
> - goto out;
> + if (off == 0) {
> + void *metrics;
> + void *old;
> + int len;
>
> - if (size >= PAGE_SIZE)
> - size = PAGE_SIZE - 1;
> + /* Evict stale entries from readers that never hit EOF */
> + gpu_metrics_evict_stale_locked(&adev->pm.gpu_metrics_readers,
> + key);
>
> - memcpy(buf, gpu_metrics, size);
> + ret = amdgpu_pm_get_access(adev);
> + if (ret)
> + goto out_unlock;
>
> -out:
> - amdgpu_pm_put_access(adev);
> + len = amdgpu_dpm_get_gpu_metrics(adev, &metrics);
> + amdgpu_pm_put_access(adev);
>
> - return size;
> + if (len <= 0) {
> + ret = len;
> + goto out_unlock;
> + }
> +
> + if (WARN_ON_ONCE(len > AMDGPU_GPU_METRICS_BIN_ATTR_MAX_SZ)) {
> + ret = -EOVERFLOW;
> + goto out_unlock;
> + }
> +
> + entry = xa_load(&adev->pm.gpu_metrics_readers, key);
> + if (!entry) {
> + entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> + if (!entry) {
> + ret = -ENOMEM;
> + goto out_unlock;
> + }
> + old = xa_store(&adev->pm.gpu_metrics_readers, key,
> + entry, GFP_KERNEL);
> + if (xa_is_err(old)) {
> + kfree(entry);
> + ret = xa_err(old);
> + goto out_unlock;
> + }
> + }
> +
> + /* (Re-)allocate snapshot buffer sized to actual payload */
> + kfree(entry->data);
> + entry->data = kmemdup(metrics, len, GFP_KERNEL);
> + if (!entry->data) {
> + xa_erase(&adev->pm.gpu_metrics_readers, key);
> + kfree(entry);
> + ret = -ENOMEM;
> + goto out_unlock;
> + }
> + entry->size = len;
> + entry->timestamp = ktime_get();
> + } else {
> + entry = xa_load(&adev->pm.gpu_metrics_readers, key);
> + if (!entry) {
> + ret = -EIO;
> + goto out_unlock;
> + }
> + }
> +
> + if (off >= entry->size) {
> + xa_erase(&adev->pm.gpu_metrics_readers, key);
> + kfree(entry->data);
> + kfree(entry);
> + ret = 0;
> + goto out_unlock;
> + }
> +
> + count = min_t(size_t, count, entry->size - off);
> + memcpy(buf, (u8 *)entry->data + off, count);
> + ret = count;
> +
> +out_unlock:
> + mutex_unlock(&adev->pm.gpu_metrics_lock);
> + return ret;
> }
>
> +static const BIN_ATTR(gpu_metrics, 0444, amdgpu_sysfs_gpu_metrics_read, NULL,
> + AMDGPU_GPU_METRICS_BIN_ATTR_MAX_SZ);
> +
> static int amdgpu_show_powershift_percent(struct device *dev,
> char *buf, enum amd_pp_sensors sensor)
> {
> @@ -2579,7 +2717,6 @@ static struct amdgpu_device_attr amdgpu_device_attrs[]
> = {
> AMDGPU_DEVICE_ATTR_RO(unique_id,
> ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
> AMDGPU_DEVICE_ATTR_RW(thermal_throttling_logging,
> ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
> AMDGPU_DEVICE_ATTR_RW(apu_thermal_cap,
> ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
> - AMDGPU_DEVICE_ATTR_RO(gpu_metrics,
> ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
> AMDGPU_DEVICE_ATTR_RO(smartshift_apu_power,
> ATTR_FLAG_BASIC,
> .attr_update = ss_power_attr_update),
> AMDGPU_DEVICE_ATTR_RO(smartshift_dgpu_power,
> ATTR_FLAG_BASIC,
> @@ -2657,9 +2794,6 @@ static int default_attr_update(struct amdgpu_device
> *adev, struct amdgpu_device_
> gc_ver != IP_VERSION(9, 4, 3)) ||
> gc_ver < IP_VERSION(9, 0, 0))
> *states = ATTR_STATE_UNSUPPORTED;
> - } else if (DEVICE_ATTR_IS(gpu_metrics)) {
> - if (gc_ver < IP_VERSION(9, 1, 0))
> - *states = ATTR_STATE_UNSUPPORTED;
> } else if (DEVICE_ATTR_IS(pp_power_profile_mode)) {
> if (amdgpu_dpm_get_power_profile_mode(adev, NULL) ==
> -EOPNOTSUPP)
> *states = ATTR_STATE_UNSUPPORTED;
> @@ -4755,6 +4889,19 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
> if (ret)
> goto err_out0;
>
> + if (amdgpu_pm_gpu_metrics_bin_visible(adev, mask)) {
> + mutex_init(&adev->pm.gpu_metrics_lock);
> + xa_init(&adev->pm.gpu_metrics_readers);
> + ret = sysfs_create_bin_file(&adev->dev->kobj,
> + &bin_attr_gpu_metrics);
> + if (ret) {
> + xa_destroy(&adev->pm.gpu_metrics_readers);
> + mutex_destroy(&adev->pm.gpu_metrics_lock);
> + goto err_out1;
> + }
> + adev->pm.gpu_metrics_bin_registered = true;
> + }
> +
> if (amdgpu_dpm_is_overdrive_supported(adev)) {
> ret = amdgpu_od_set_init(adev);
> if (ret)
> @@ -4806,6 +4953,12 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
> return 0;
>
> err_out1:
> + if (adev->pm.gpu_metrics_bin_registered) {
> + sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_gpu_metrics);
> + gpu_metrics_free_all(&adev->pm.gpu_metrics_readers);
> + mutex_destroy(&adev->pm.gpu_metrics_lock);
> + adev->pm.gpu_metrics_bin_registered = false;
> + }
> amdgpu_device_attr_remove_groups(adev, &adev->pm.pm_attr_list);
> err_out0:
> if (adev->pm.int_hwmon_dev)
> @@ -4821,6 +4974,13 @@ void amdgpu_pm_sysfs_fini(struct amdgpu_device *adev)
> if (adev->pm.int_hwmon_dev)
> hwmon_device_unregister(adev->pm.int_hwmon_dev);
>
> + if (adev->pm.gpu_metrics_bin_registered) {
> + sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_gpu_metrics);
> + gpu_metrics_free_all(&adev->pm.gpu_metrics_readers);
> + mutex_destroy(&adev->pm.gpu_metrics_lock);
> + adev->pm.gpu_metrics_bin_registered = false;
> + }
> +
> amdgpu_device_attr_remove_groups(adev, &adev->pm.pm_attr_list);
> }
>
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> index aa3f427819a0..3677a4f543fb 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> @@ -349,6 +349,10 @@ struct amdgpu_pm {
> /* dpm */
> bool dpm_enabled;
> bool sysfs_initialized;
> + bool gpu_metrics_bin_registered;
> + struct mutex gpu_metrics_lock;
> + /* per-reader snapshot entries, keyed by (unsigned long)struct file * */
> + struct xarray gpu_metrics_readers;
Pretty clear NAK on that approach. What the heck are you doing here?
Regards,
Christian.
> struct amdgpu_dpm dpm;
> const struct firmware *fw; /* SMC firmware */
> uint32_t fw_version;
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h
> b/drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h
> index c12ced32f780..dc6875871f1d 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_pm.h
> @@ -73,7 +73,6 @@ enum amdgpu_device_attr_id {
> device_attr_id__unique_id,
> device_attr_id__thermal_throttling_logging,
> device_attr_id__apu_thermal_cap,
> - device_attr_id__gpu_metrics,
> device_attr_id__smartshift_apu_power,
> device_attr_id__smartshift_dgpu_power,
> device_attr_id__smartshift_bias,