Re: [Intel-gfx] [PATCH 26/37] drm/i915/dg1: Handle GRF/IC ECC error irq

2020-05-21 Thread Chris Wilson
Quoting Lucas De Marchi (2020-05-21 01:37:52)
> From: Fernando Pacheco 
> 
> The error detection and correction capability
> for GRF and instruction cache (IC) will utilize
> the new interrupt and error handling infrastructure
> for dgfx products. The GFX device can generate
> a number of classes of error under the new
> infrastructure: correctable, non-fatal, and
> fatal errors.
> 
> The non-fatal and fatal error classes distinguish
> between levels of severity for uncorrectable errors.
> All ECC uncorrectable errors will be reported as
> fatal to produce the desired system response. Fatal
> errors are expected to route as PCIe error messages
> which should result in OS issuing a GFX device FLR.
> But the option exists to route fatal errors as
> interrupts.
> 
> Driver will only handle logging of errors. Anything
> more will be handled at system level.
> 
> For errors that will route as interrupts, three
> bits in the Master Interrupt Register will be used
> to convey the class of error.
> 
> For each class of error:
> 1. Determine source of error (IP block) by reading
>    the Device Error Source Register (RW1C) that
>    corresponds to the class of error being serviced.
> 2. If the generating IP block is GT, read and log the
>    GT Error Register (RW1C) that corresponds to the
>    class of error being serviced. Non-GT errors will
>    be logged in aggregate for now.
> 
> Bspec: 50875
> 
> Cc: Paulo Zanoni 
> Cc: Daniele Ceraolo Spurio 
> Cc: Fernando Pacheco 
> Cc: Radhakrishna Sripada 
> Signed-off-by: Fernando Pacheco 
> Signed-off-by: Lucas De Marchi 
> ---
>  drivers/gpu/drm/i915/i915_irq.c | 121 
>  drivers/gpu/drm/i915/i915_reg.h |  28 
>  2 files changed, 149 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ebc80e8b1599..17e679b910da 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2515,6 +2515,124 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
> return IRQ_HANDLED;
>  }
>  
> +static const char *
> +hardware_error_type_to_str(const enum hardware_error hw_err)
> +{
> +   switch (hw_err) {
> +   case HARDWARE_ERROR_CORRECTABLE:
> +   return "CORRECTABLE";
> +   case HARDWARE_ERROR_NONFATAL:
> +   return "NONFATAL";
> +   case HARDWARE_ERROR_FATAL:
> +   return "FATAL";
> +   default:
> +   return "UNKNOWN";
> +   }
> +}
> +
> +static void
> +gen12_gt_hw_error_handler(struct drm_i915_private * const i915,
> + const enum hardware_error hw_err)
> +{
> +   void __iomem * const regs = i915->uncore.regs;
> +   const char *hw_err_str = hardware_error_type_to_str(hw_err);
> +   u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
> +   u32 errstat;
> +
> +   lockdep_assert_held(&i915->irq_lock);

Wrong place and wrong locks.
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH 26/37] drm/i915/dg1: Handle GRF/IC ECC error irq

2020-05-20 Thread Lucas De Marchi
From: Fernando Pacheco 

The error detection and correction capability
for GRF and instruction cache (IC) will utilize
the new interrupt and error handling infrastructure
for dgfx products. The GFX device can generate
a number of classes of error under the new
infrastructure: correctable, non-fatal, and
fatal errors.

The non-fatal and fatal error classes distinguish
between levels of severity for uncorrectable errors.
All ECC uncorrectable errors will be reported as
fatal to produce the desired system response. Fatal
errors are expected to route as PCIe error messages
which should result in OS issuing a GFX device FLR.
But the option exists to route fatal errors as
interrupts.

Driver will only handle logging of errors. Anything
more will be handled at system level.

For errors that will route as interrupts, three
bits in the Master Interrupt Register will be used
to convey the class of error.

For each class of error:
1. Determine source of error (IP block) by reading
   the Device Error Source Register (RW1C) that
   corresponds to the class of error being serviced.
2. If the generating IP block is GT, read and log the
   GT Error Register (RW1C) that corresponds to the
   class of error being serviced. Non-GT errors will
   be logged in aggregate for now.

Bspec: 50875

Cc: Paulo Zanoni 
Cc: Daniele Ceraolo Spurio 
Cc: Fernando Pacheco 
Cc: Radhakrishna Sripada 
Signed-off-by: Fernando Pacheco 
Signed-off-by: Lucas De Marchi 
---
 drivers/gpu/drm/i915/i915_irq.c | 121 
 drivers/gpu/drm/i915/i915_reg.h |  28 
 2 files changed, 149 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ebc80e8b1599..17e679b910da 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2515,6 +2515,124 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
return IRQ_HANDLED;
 }
 
+static const char *
+hardware_error_type_to_str(const enum hardware_error hw_err)
+{
+   switch (hw_err) {
+   case HARDWARE_ERROR_CORRECTABLE:
+   return "CORRECTABLE";
+   case HARDWARE_ERROR_NONFATAL:
+   return "NONFATAL";
+   case HARDWARE_ERROR_FATAL:
+   return "FATAL";
+   default:
+   return "UNKNOWN";
+   }
+}
+
+static void
+gen12_gt_hw_error_handler(struct drm_i915_private * const i915,
+ const enum hardware_error hw_err)
+{
+   void __iomem * const regs = i915->uncore.regs;
+   const char *hw_err_str = hardware_error_type_to_str(hw_err);
+   u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
+   u32 errstat;
+
+   lockdep_assert_held(&i915->irq_lock);
+
+   errstat = raw_reg_read(regs, ERR_STAT_GT_REG(hw_err));
+
+   if (unlikely(!errstat)) {
+   DRM_ERROR("ERR_STAT_GT_REG_%s blank!\n", hw_err_str);
+   return;
+   }
+
+   /*
+* TODO: The GT Non Fatal Error Status Register
+* only has reserved bitfields defined.
+* Remove once there is something to service.
+*/
+   if (hw_err == HARDWARE_ERROR_NONFATAL) {
+   DRM_ERROR("detected Non-Fatal hardware error\n");
+   raw_reg_write(regs, ERR_STAT_GT_REG(hw_err), errstat);
+   return;
+   }
+
+   if (errstat & EU_GRF_ERROR)
+   DRM_ERROR("detected EU GRF %s hardware error\n", hw_err_str);
+
+   if (errstat & EU_IC_ERROR)
+   DRM_ERROR("detected EU IC %s hardware error\n", hw_err_str);
+
+   /*
+* TODO: The remaining GT errors don't have a
+* need for targeted logging at the moment. We
+* still want to log detection of these errors, but
+* let's aggregate them until someone has a need for them.
+*/
+   if (errstat & other_errors)
+   DRM_ERROR("detected hardware error(s) in ERR_STAT_GT_REG_%s: 0x%08x\n",
+ hw_err_str, errstat & other_errors);
+
+   raw_reg_write(regs, ERR_STAT_GT_REG(hw_err), errstat);
+}
+
+static void
+gen12_hw_error_source_handler(struct drm_i915_private * const i915,
+ const enum hardware_error hw_err)
+{
+   void __iomem * const regs = i915->uncore.regs;
+   const char *hw_err_str = hardware_error_type_to_str(hw_err);
+   u32 errsrc;
+
+   spin_lock(&i915->irq_lock);
+   errsrc = raw_reg_read(regs, DEV_ERR_STAT_REG(hw_err));
+
+   if (unlikely(!errsrc)) {
+   DRM_ERROR("DEV_ERR_STAT_REG_%s blank!\n", hw_err_str);
+   goto out_unlock;
+   }
+
+   if (errsrc & DEV_ERR_STAT_GT_ERROR)
+   gen12_gt_hw_error_handler(i915, hw_err);
+
+   if (errsrc & ~DEV_ERR_STAT_GT_ERROR)
+   DRM_ERROR("non-GT hardware error(s) in DEV_ERR_STAT_REG_%s: 0x%08x\n",
+ hw_err_str, errsrc & ~DEV_ERR_STAT_GT_ERROR);
+
+   raw_reg_write(regs, DEV_ERR_STAT_REG(hw_err), errsrc);
+
+out_unlock:
+