On 16/05/2023 12:30, Oded Gabbay wrote:
> If a workload gets stuck, we print an error about it to the kernel log.
> Add the configured max timeout value to that print, as the value is
> not fixed across ASICs and, in addition, it can be configured using
> a kernel module parameter.
>
> Signed-off-by: Oded Gabbay <ogab...@kernel.org>
> ---
>   .../habanalabs/common/command_submission.c    | 26 +++++++++++--------
>   1 file changed, 15 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c
> index ccf68f482948..4ec28af3ed78 100644
> --- a/drivers/accel/habanalabs/common/command_submission.c
> +++ b/drivers/accel/habanalabs/common/command_submission.c
> @@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref)
>   
>   static void cs_timedout(struct work_struct *work)
>   {
> +     struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
> +     bool skip_reset_on_timeout, device_reset = false;
>       struct hl_device *hdev;
>       u64 event_mask = 0x0;
> +     uint timeout_sec;
>       int rc;
> -     struct hl_cs *cs = container_of(work, struct hl_cs,
> -                                              work_tdr.work);
> -     bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false;
> +
> +     skip_reset_on_timeout = cs->skip_reset_on_timeout;
>   
>       rc = cs_get_unless_zero(cs);
>       if (!rc)
> @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work)
>               event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
>       }
>   
> +     timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;
> +
>       switch (cs->type) {
>       case CS_TYPE_SIGNAL:
>               dev_err(hdev->dev,
> -                     "Signal command submission %llu has not finished in time!\n",
> -                     cs->sequence);
> +                     "Signal command submission %llu has not finished in %u seconds!\n",
> +                     cs->sequence, timeout_sec);
>               break;
>   
>       case CS_TYPE_WAIT:
>               dev_err(hdev->dev,
> -                     "Wait command submission %llu has not finished in time!\n",
> -                     cs->sequence);
> +                     "Wait command submission %llu has not finished in %u seconds!\n",
> +                     cs->sequence, timeout_sec);
>               break;
>   
>       case CS_TYPE_COLLECTIVE_WAIT:
>               dev_err(hdev->dev,
> -                     "Collective Wait command submission %llu has not finished in time!\n",
> -                     cs->sequence);
> +                     "Collective Wait command submission %llu has not finished in %u seconds!\n",
> +                     cs->sequence, timeout_sec);
>               break;
>   
>       default:
>               dev_err(hdev->dev,
> -                     "Command submission %llu has not finished in time!\n",
> -                     cs->sequence);
> +                     "Command submission %llu has not finished in %u seconds!\n",
> +                     cs->sequence, timeout_sec);
>               break;
>       }
>   

Reviewed-by: Ofir Bitton <obit...@habana.ai>

Reply via email to