amdgpu: Use delayed work to collect RAS error counters

Luben Tuikov Wed, 26 May 2021 07:50:52 -0700

On 2021-05-26 6:15 a.m., Christian König wrote:
> Am 26.05.21 um 02:40 schrieb Luben Tuikov:
>> On Context Query2 IOCTL return the correctable and
>> uncorrectable errors in O(1) fashion, from cached
>> values, and schedule a delayed work function to
>> calculate and cache them for the next such IOCTL.
>>
>> v2: Cancel pending delayed work at ras_fini().
>>
>> Cc: Alexander Deucher <alexander.deuc...@amd.com>
>> Cc: Christian König <christian.koe...@amd.com>
>> Cc: John Clements <john.cleme...@amd.com>
>> Cc: Hawking Zhang <hawking.zh...@amd.com>
>> Signed-off-by: Luben Tuikov <luben.tui...@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 32 +++++++++++++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 41 +++++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +++
>>   3 files changed, 76 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index bb0cfe871aba..4e95d255960b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -331,10 +331,13 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
>>      return 0;
>>   }
>>   
>> +#define AMDGPU_RAS_COUNTE_DELAY_MS 3000
>> +
>>   static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>> -    struct amdgpu_fpriv *fpriv, uint32_t id,
>> -    union drm_amdgpu_ctx_out *out)
>> +                         struct amdgpu_fpriv *fpriv, uint32_t id,
>> +                         union drm_amdgpu_ctx_out *out)
>>   {
>> +    struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>>      struct amdgpu_ctx *ctx;
>>      struct amdgpu_ctx_mgr *mgr;
>>   
>> @@ -361,6 +364,31 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>>      if (atomic_read(&ctx->guilty))
>>              out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
>>   
>> +    if (adev->ras_enabled && con) {
>> +            /* Return the cached values in O(1),
>> +             * and schedule delayed work to cache
>> +             * new vaues.
>> +             */
>> +            int ce_count, ue_count;
>> +
>> +            ce_count = atomic_read(&con->ras_ce_count);
>> +            ue_count = atomic_read(&con->ras_ue_count);
>> +
>> +            if (ce_count != ctx->ras_counter_ce) {
>> +                    ctx->ras_counter_ce = ce_count;
>> +                    out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
>> +            }
>> +
>> +            if (ue_count != ctx->ras_counter_ue) {
>> +                    ctx->ras_counter_ue = ue_count;
>> +                    out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
>> +            }
>> +
>> +            if (!delayed_work_pending(&con->ras_counte_delay_work))
>> +                    schedule_delayed_work(&con->ras_counte_delay_work,
>> +                              msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS));
> You can skip the delayed_work_pending() check here, if I'm not totally 
> mistaken that is already integrated in the schedule_delayed_work() logic.


I'll take a look and if so I'll remove the check. I've remembered this check 
from many
years ago--habit.

>
>> +    }
>> +
>>      mutex_unlock(&mgr->lock);
>>      return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index ed3c43e8b0b5..01114529040a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -27,6 +27,7 @@
>>   #include <linux/uaccess.h>
>>   #include <linux/reboot.h>
>>   #include <linux/syscalls.h>
>> +#include <linux/pm_runtime.h>
>>   
>>   #include "amdgpu.h"
>>   #include "amdgpu_ras.h"
>> @@ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct 
>> amdgpu_device *adev)
>>              adev->ras_hw_enabled & amdgpu_ras_mask;
>>   }
>>   
>> +static void amdgpu_ras_counte_dw(struct work_struct *work)
>> +{
>> +    struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
>> +                                          ras_counte_delay_work.work);
>> +    struct amdgpu_device *adev = con->adev;
>> +    struct drm_device *dev = &adev->ddev;
>> +    unsigned long ce_count, ue_count;
>> +    int res;
>> +
>> +    res = pm_runtime_get_sync(dev->dev);
>> +    if (res < 0)
>> +            goto Out;
>> +
>> +    /* Cache new values.
>> +     */
>> +    amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
>> +    atomic_set(&con->ras_ce_count, ce_count);
>> +    atomic_set(&con->ras_ue_count, ue_count);
>> +
>> +    pm_runtime_mark_last_busy(dev->dev);
>> +Out:
>> +    pm_runtime_put_autosuspend(dev->dev);
>> +}
>> +
>>   int amdgpu_ras_init(struct amdgpu_device *adev)
>>   {
>>      struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> @@ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>>      if (!con)
>>              return -ENOMEM;
>>   
>> +    con->adev = adev;
>> +    INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
>> +    atomic_set(&con->ras_ce_count, 0);
>> +    atomic_set(&con->ras_ue_count, 0);
>> +
>>      con->objs = (struct ras_manager *)(con + 1);
>>   
>>      amdgpu_ras_set_context(adev, con);
>> @@ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
>>                       struct ras_fs_if *fs_info,
>>                       struct ras_ih_if *ih_info)
>>   {
>> +    struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> +    unsigned long ue_count, ce_count;
>>      int r;
>>   
>>      /* disable RAS feature per IP block if it is not supported */
>> @@ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
>>      if (r)
>>              goto sysfs;
>>   
>> +    /* Those are the cached values at init.
>> +     */
>> +    amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
>> +    atomic_set(&con->ras_ce_count, ce_count);
>> +    atomic_set(&con->ras_ue_count, ue_count);
>> +
>>      return 0;
>>   cleanup:
>>      amdgpu_ras_sysfs_remove(adev, ras_block);
>> @@ -2390,6 +2428,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>>      if (con->features)
>>              amdgpu_ras_disable_all_features(adev, 1);
>>   
>> +    if (delayed_work_pending(&con->ras_counte_delay_work))
>> +            cancel_delayed_work_sync(&con->ras_counte_delay_work);
>> +
> Using delayed_work_pending() here causes a race problem, because the 
> work might be running at the moment.
>
> Just calling cancel_delayed_work_sync() should fix that because the 
> *_sync() variant also waits for running delayed work items to finish.

Yes, that's why I used the *_sync() version--we can't go on until the delayed 
work has
finished.

Yes, I agree that it is indeed a race, and I saw that, and I wonder why that 
interface exists
in the kernel in the first place.

Regards,
Luben

>
> Apart from that the patch looks good to me.
>
> Regards,
> Christian.
>
>>      amdgpu_ras_set_context(adev, NULL);
>>      kfree(con);
>>   
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 10fca0393106..256cea5d34f2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -340,6 +340,11 @@ struct amdgpu_ras {
>>   
>>      /* disable ras error count harvest in recovery */
>>      bool disable_ras_err_cnt_harvest;
>> +
>> +    /* RAS count errors delayed work */
>> +    struct delayed_work ras_counte_delay_work;
>> +    atomic_t ras_ue_count;
>> +    atomic_t ras_ce_count;
>>   };
>>   
>>   struct ras_fs_data {

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 3/3] drm/amdgpu: Use delayed work to collect RAS error counters

Reply via email to