On Thu, May 09, 2019 at 10:31:05AM +0000, Pan, Xinhui wrote: > add badpages node. > it will output badpages list in format > gpu pfn : gpu page size : flags > > example > 0x00000000 : 0x00001000 : R > 0x00000001 : 0x00001000 : R > 0x00000002 : 0x00001000 : R > 0x00000003 : 0x00001000 : R > 0x00000004 : 0x00001000 : R > 0x00000005 : 0x00001000 : R > 0x00000006 : 0x00001000 : R > 0x00000007 : 0x00001000 : P > 0x00000008 : 0x00001000 : P > 0x00000009 : 0x00001000 : P > > flags can be one of below characters > R: reserved. > P: pending for reserve. > F: failed to reserve for some reasons. > > Signed-off-by: xinhui pan <xinhui....@amd.com> > Reviewed-by: Alex Deucher <alexander.deuc...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 146 ++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + > 2 files changed, 147 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index c60d5f813801..c9e24f60938e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -90,6 +90,12 @@ struct ras_manager { > struct ras_err_data err_data; > }; > > +struct ras_badpage { > + unsigned int bp; > + unsigned int size; > + unsigned int flags; > +}; > + > const char *ras_error_string[] = { > "none", > "parity", > @@ -710,6 +716,77 @@ int amdgpu_ras_query_error_count(struct amdgpu_device > *adev, > > /* sysfs begin */ > > +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, > + struct ras_badpage **bps, unsigned int *count); > + > +static char *amdgpu_ras_badpage_flags_str(unsigned int flags) > +{ > + switch (flags) { > + case 0: > + return "R"; > + case 1: > + return "P"; > + case 2: > + default: > + return "F"; > + }; > +} > + > +/* > + * DOC: ras sysfs gpu_vram_bad_pages interface > + * > + * It allows user to read the bad pages of vram on the gpu through > + * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages > + * > + * It 
outputs multiple lines, and each line stands for one gpu page. > + * > + * The format of one line is below, > + * gpu pfn : gpu page size : flags > + * > + * gpu pfn and gpu page size are printed in hex format. > + * flags can be one of below character, > + * R: reserved, this gpu page is reserved and not able to use. > + * P: pending for reserve, this gpu page is marked as bad, will be reserved > + * in next window of page_reserve. > + * F: unable to reserve. this gpu page can't be reserved due to some reasons. > + * > + * examples: > + * 0x00000001 : 0x00001000 : R > + * 0x00000002 : 0x00001000 : P > + */ > + > +static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, > + struct kobject *kobj, struct bin_attribute *attr, > + char *buf, loff_t ppos, size_t count) > +{ > + struct amdgpu_ras *con = > + container_of(attr, struct amdgpu_ras, badpages_attr); > + struct amdgpu_device *adev = con->adev; > + const unsigned int element_size = > + sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; > + unsigned int start = (ppos + element_size - 1) / element_size; > + unsigned int end = (ppos + count - 1) / element_size;
I believe these two lines cause a link-time error with arm32 defconfig + CONFIG_DRM_AMDGPU (filtered down from allyesconfig): arm-linux-gnueabi-ld: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.o: in function `amdgpu_ras_sysfs_badpages_read': amdgpu_ras.c:(.text+0x804): undefined reference to `__aeabi_ldivmod' arm-linux-gnueabi-ld: amdgpu_ras.c:(.text+0x830): undefined reference to `__aeabi_ldivmod' The assignments to start and end involve a 64-bit dividend because loff_t is defined as long long; on 32-bit ARM the compiler lowers such a division to a call to the libgcc helper __aeabi_ldivmod, which the kernel does not provide, so one of the 64-bit division functions from include/linux/math64.h should be used instead. I am not sure which one is appropriate here, otherwise I would have sent a patch :) Cheers, Nathan > + ssize_t s = 0; > + struct ras_badpage *bps = NULL; > + unsigned int bps_count = 0; > + > + memset(buf, 0, count); > + > + if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) > + return 0; > + > + for (; start < end && start < bps_count; start++) > + s += scnprintf(&buf[s], element_size + 1, > + "0x%08x : 0x%08x : %1s\n", > + bps[start].bp, > + bps[start].size, > + amdgpu_ras_badpage_flags_str(bps[start].flags)); > + > + kfree(bps); > + > + return s; > +} > + > static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, > struct device_attribute *attr, char *buf) > { > @@ -750,9 +827,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct > amdgpu_device *adev) > &con->features_attr.attr, > NULL > }; > + struct bin_attribute *bin_attrs[] = { > + &con->badpages_attr, > + NULL > + }; > struct attribute_group group = { > .name = "ras", > .attrs = attrs, > + .bin_attrs = bin_attrs, > }; > > con->features_attr = (struct device_attribute) { > @@ -762,7 +844,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct > amdgpu_device *adev) > }, > .show = amdgpu_ras_sysfs_features_read, > }; > + > + con->badpages_attr = (struct bin_attribute) { > + .attr = { > + .name = "gpu_vram_bad_pages", > + .mode = S_IRUGO, > + }, > + .size = 0, > + .private = NULL, > + .read = amdgpu_ras_sysfs_badpages_read, > + }; 
> + > sysfs_attr_init(attrs[0]); > + sysfs_bin_attr_init(bin_attrs[0]); > > return sysfs_create_group(&adev->dev->kobj, &group); > } > @@ -774,9 +868,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct > amdgpu_device *adev) > &con->features_attr.attr, > NULL > }; > + struct bin_attribute *bin_attrs[] = { > + &con->badpages_attr, > + NULL > + }; > struct attribute_group group = { > .name = "ras", > .attrs = attrs, > + .bin_attrs = bin_attrs, > }; > > sysfs_remove_group(&adev->dev->kobj, &group); > @@ -1108,6 +1207,53 @@ static int amdgpu_ras_interrupt_remove_all(struct > amdgpu_device *adev) > /* ih end */ > > /* recovery begin */ > + > +/* return 0 on success. > + * caller need free bps. > + */ > +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, > + struct ras_badpage **bps, unsigned int *count) > +{ > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + struct ras_err_handler_data *data; > + int i = 0; > + int ret = 0; > + > + if (!con || !con->eh_data || !bps || !count) > + return -EINVAL; > + > + mutex_lock(&con->recovery_lock); > + data = con->eh_data; > + if (!data || data->count == 0) { > + *bps = NULL; > + goto out; > + } > + > + *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); > + if (!*bps) { > + ret = -ENOMEM; > + goto out; > + } > + > + for (; i < data->count; i++) { > + (*bps)[i] = (struct ras_badpage){ > + .bp = data->bps[i].bp, > + .size = AMDGPU_GPU_PAGE_SIZE, > + .flags = 0, > + }; > + > + if (data->last_reserved <= i) > + (*bps)[i].flags = 1; > + else if (data->bps[i].bo == NULL) > + (*bps)[i].flags = 2; > + } > + > + *count = data->count; > +out: > + mutex_unlock(&con->recovery_lock); > + return ret; > +} > + > static void amdgpu_ras_do_recovery(struct work_struct *work) > { > struct amdgpu_ras *ras = > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 065c66baf947..e2dff00b8d1c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -93,6 +93,7 @@ struct amdgpu_ras { > struct dentry *ent; > /* sysfs */ > struct device_attribute features_attr; > + struct bin_attribute badpages_attr; > /* block array */ > struct ras_manager *objs; > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx