amdgpu: change ras bps type to eeprom table record structure

Zhou1, Tao Sun, 01 Sep 2019 20:14:56 -0700


> -----Original Message-----
> From: Chen, Guchun <guchun.c...@amd.com>
> Sent: 2019年9月2日 10:13
> To: Zhou1, Tao <tao.zh...@amd.com>; amd-gfx@lists.freedesktop.org;
> Grodzovsky, Andrey <andrey.grodzov...@amd.com>; Li, Dennis
> <dennis...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>
> Cc: Zhou1, Tao <tao.zh...@amd.com>
> Subject: RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table
> record structure
> 
> 
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Friday, August 30, 2019 8:25 PM
> To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
> <andrey.grodzov...@amd.com>; Chen, Guchun <guchun.c...@amd.com>;
> Li, Dennis <dennis...@amd.com>; Zhang, Hawking
> <hawking.zh...@amd.com>
> Cc: Zhou1, Tao <tao.zh...@amd.com>
> Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table
> record structure
> 
> change bps type from retired page to eeprom table record, prepare for
> saving error records to eeprom
> 
> Signed-off-by: Tao Zhou <tao.zh...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++-------
> --  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
>  2 files changed, 43 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2ca3997d4b3a..24663ec41248 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct
> amdgpu_device *adev,
> 
>       for (; i < data->count; i++) {
>               (*bps)[i] = (struct ras_badpage){
> -                     .bp = data->bps[i].bp,
> +                     .bp = data->bps[i].retired_page,
>                       .size = AMDGPU_GPU_PAGE_SIZE,
>                       .flags = 0,
>               };
> 
>               if (data->last_reserved <= i)
>                       (*bps)[i].flags = 1;
> -             else if (data->bps[i].bo == NULL)
> +             else if (data->bps_bo[i] == NULL)
>                       (*bps)[i].flags = 2;
>       }
> 
> @@ -1288,30 +1288,40 @@ static int
> amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,  {
>       unsigned int old_space = data->count + data->space_left;
>       unsigned int new_space = old_space + pages;
> -     unsigned int align_space = ALIGN(new_space, 1024);
> -     void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
> -
> -     if (!tmp)
> +     unsigned int align_space = ALIGN(new_space, 512);
> [Guchun] Any special reason to change alignment from 1024 to 512?


[Tao] The old "data->bps" is 16 bytes and the new "struct eeprom_table_record bps"
is 31 bytes on a 64-bit system, so I'd like to lower the pressure on the memory
system. The value can be adjusted according to feedback in the future.

> 
> +     void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
> +     struct amdgpu_bo **bps_bo =
> +                     kmalloc(align_space * sizeof(*data->bps_bo),
> GFP_KERNEL);
> +
> +     if (!bps || !bps_bo) {
> +             kfree(bps);
> +             kfree(bps_bo);
>               return -ENOMEM;
> +     }
> 
>       if (data->bps) {
> -             memcpy(tmp, data->bps,
> +             memcpy(bps, data->bps,
>                               data->count * sizeof(*data->bps));
>               kfree(data->bps);
>       }
> +     if (data->bps_bo) {
> +             memcpy(bps_bo, data->bps_bo,
> +                             data->count * sizeof(*data->bps_bo));
> +             kfree(data->bps_bo);
> +     }
> 
> -     data->bps = tmp;
> +     data->bps = bps;
> +     data->bps_bo = bps_bo;
>       data->space_left += align_space - old_space;
>       return 0;
>  }
> 
>  /* it deal with vram only. */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> -             unsigned long *bps, int pages)
> +             struct eeprom_table_record *bps, int pages)
>  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>       struct ras_err_handler_data *data;
> -     int i = pages;
>       int ret = 0;
> 
>       if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10
> +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>                       goto out;
>               }
> 
> -     while (i--)
> -             data->bps[data->count++].bp = bps[i];
> -
> +     memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
> +     data->count += pages;
>       data->space_left -= pages;
> +
>  out:
>       mutex_unlock(&con->recovery_lock);
> 
> @@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>               goto out;
>       /* reserve vram at driver post stage. */
>       for (i = data->last_reserved; i < data->count; i++) {
> -             bp = data->bps[i].bp;
> +             bp = data->bps[i].retired_page;
> 
>               if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
>                                       PAGE_SIZE, &bo))
>                       DRM_ERROR("RAS ERROR: reserve vram %llx fail\n",
> bp);
> 
> -             data->bps[i].bo = bo;
> +             data->bps_bo[i] = bo;
>               data->last_reserved = i + 1;
>       }
>  out:
> @@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct
> amdgpu_device *adev)
>               goto out;
> 
>       for (i = data->last_reserved - 1; i >= 0; i--) {
> -             bo = data->bps[i].bo;
> +             bo = data->bps_bo[i];
> 
>               amdgpu_ras_release_vram(adev, &bo);
> 
> -             data->bps[i].bo = bo;
> +             data->bps_bo[i] = bo;
>               data->last_reserved = i;
>       }
>  out:
> @@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct
> amdgpu_device *adev)
>       return 0;
>  }
> 
> +/*
> + * read error record array in eeprom and reserve enough space for
> + * storing new bad pages
> + */
>  static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)  {
> -     /* TODO
> -      * read the array to eeprom when SMU disabled.
> -      */
> -     return 0;
> +     struct eeprom_table_record *bps = NULL;
> +     int ret;
> +
> +     ret = amdgpu_ras_add_bad_pages(adev, bps,
> +                             adev->umc.max_ras_err_cnt_per_query);
> +
> +     return ret;
>  }
> 
>  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 66b71525446e..b6bac873c588 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -351,11 +351,10 @@ struct ras_err_data {  };
> 
>  struct ras_err_handler_data {
> -     /* point to bad pages array */
> -     struct {
> -             unsigned long bp;
> -             struct amdgpu_bo *bo;
> -     } *bps;
> +     /* point to bad page records array */
> +     struct eeprom_table_record *bps;
> +     /* point to reserved bo array */
> +     struct amdgpu_bo **bps_bo;
>       /* the count of entries */
>       int count;
>       /* the space can place new entries */
> @@ -492,7 +491,7 @@ unsigned long
> amdgpu_ras_query_error_count(struct amdgpu_device *adev,
> 
>  /* error handling functions */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> -             unsigned long *bps, int pages);
> +             struct eeprom_table_record *bps, int pages);
> 
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
> 
> --
> 2.17.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure

Reply via email to