Re: [Intel-gfx] [PATCH v5] drm/i915/guc: capture GuC logs if FW fails to load

2017-05-18 Thread Daniele Ceraolo Spurio



On 18/05/17 10:08, Michal Wajdeczko wrote:

On Tue, May 16, 2017 at 02:52:53PM -0700, Daniele Ceraolo Spurio wrote:

We're currently deleting the GuC logs if the FW fails to load, but those
are still useful to understand why the loading failed. Keeping the
object around allows us to access them after driver load is completed.

v2: keep the object around instead of using kernel memory (chris)
don't store the logs in the gpu_error struct (Chris)
add a check on guc_log_level to avoid snapshotting empty logs

v3: use separate debugfs for error log (Chris)

v4: rebased

v5: clean up obj selection, move err_load inside guc_log, move err_load
cleanup, rename functions (Michal)

Cc: Chris Wilson 
Cc: Oscar Mateo 
Cc: Michal Wajdeczko 
Signed-off-by: Daniele Ceraolo Spurio 
---
 drivers/gpu/drm/i915/i915_debugfs.c  | 38 +++-
 drivers/gpu/drm/i915/intel_guc_log.c | 17 
 drivers/gpu/drm/i915/intel_uc.c  | 14 +++--
 drivers/gpu/drm/i915/intel_uc.h  |  5 +
 4 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index bd9abef..740395c 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2607,27 +2607,36 @@ static int i915_guc_stage_pool(struct seq_file *m, void 
*data)

 static int i915_guc_log_dump(struct seq_file *m, void *data)
 {
-   struct drm_i915_private *dev_priv = node_to_i915(m->private);
-   struct drm_i915_gem_object *obj;
-   int i = 0, pg;
-
-   if (!dev_priv->guc.log.vma)
-   return 0;
+   struct drm_info_node *node = m->private;
+   struct drm_i915_private *dev_priv = node_to_i915(node);
+   bool dump_load_err = !!node->info_ent->data;
+   struct drm_i915_gem_object *obj = NULL;
+   u32 *log;
+   int i = 0;

-   obj = dev_priv->guc.log.vma->obj;
-   for (pg = 0; pg < obj->base.size / PAGE_SIZE; pg++) {
-   u32 *log = kmap_atomic(i915_gem_object_get_page(obj, pg));
+   if (dump_load_err)
+   obj = dev_priv->guc.log.load_err;
+   else if (dev_priv->guc.log.vma)
+   obj = dev_priv->guc.log.vma->obj;

-   for (i = 0; i < PAGE_SIZE / sizeof(u32); i += 4)
-   seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
-  *(log + i), *(log + i + 1),
-  *(log + i + 2), *(log + i + 3));
+   if (!obj)
+   return 0;

-   kunmap_atomic(log);
+   log = i915_gem_object_pin_map(obj, I915_MAP_WC);
+   if (IS_ERR(log)) {
+   seq_puts(m, "Failed to pin object\n");


Hmm, quite unusual to report internal message. Maybe:

DRM_DEBUG("Failed to pin object\n");
seq_puts(m, "(log data unaccessible)\n");



+   return PTR_ERR(log);
}

+   for (i = 0; i < obj->base.size / sizeof(u32); i += 4)
+   seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
+  *(log + i), *(log + i + 1),
+  *(log + i + 2), *(log + i + 3));
+
seq_putc(m, '\n');

+   i915_gem_object_unpin_map(obj);
+
return 0;
 }

@@ -4811,6 +4820,7 @@ static int i915_hpd_storm_ctl_open(struct inode *inode, 
struct file *file)
{"i915_guc_info", i915_guc_info, 0},
{"i915_guc_load_status", i915_guc_load_status_info, 0},
{"i915_guc_log_dump", i915_guc_log_dump, 0},
+   {"i915_guc_load_err_log_dump", i915_guc_log_dump, 0, (void *)1},


bikeshed: quite long name, maybe "i915_guc_load_log_dump" ?


I'd prefer to keep it as it is to make it explicit that it is an error log.




{"i915_guc_stage_pool", i915_guc_stage_pool, 0},
{"i915_huc_load_status", i915_huc_load_status_info, 0},
{"i915_frequency_info", i915_frequency_info, 0},
diff --git a/drivers/gpu/drm/i915/intel_guc_log.c 
b/drivers/gpu/drm/i915/intel_guc_log.c
index 16d3b87..31e7bec 100644
--- a/drivers/gpu/drm/i915/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/intel_guc_log.c
@@ -660,3 +660,20 @@ void i915_guc_log_unregister(struct drm_i915_private 
*dev_priv)
guc_log_runtime_destroy(&dev_priv->guc);
mutex_unlock(&dev_priv->drm.struct_mutex);
 }
+
+void intel_guc_log_capture_load_err(struct intel_guc *guc)
+{
+   if (!guc->log.vma || i915.guc_log_level < 0)
+   return;
+
+   if (!guc->log.load_err)
+   guc->log.load_err = i915_gem_object_get(guc->log.vma->obj);
+
+   return;
+}
+
+void intel_guc_log_free_load_err(struct intel_guc *guc)
+{
+   if (guc->log.load_err)
+   i915_gem_object_put(guc->log.load_err);
+}


Based on the discussion from IM, it looks that it would be better to keep
guc->load_err object directly in struct intel_guc, and at the same time
move both capture/free functions to guc_uc.c as static:

void guc_capture_load_err_log(struct int

Re: [Intel-gfx] [PATCH v5] drm/i915/guc: capture GuC logs if FW fails to load

2017-05-18 Thread Michal Wajdeczko
On Tue, May 16, 2017 at 02:52:53PM -0700, Daniele Ceraolo Spurio wrote:
> We're currently deleting the GuC logs if the FW fails to load, but those
> are still useful to understand why the loading failed. Keeping the
> object around allows us to access them after driver load is completed.
> 
> v2: keep the object around instead of using kernel memory (chris)
> don't store the logs in the gpu_error struct (Chris)
> add a check on guc_log_level to avoid snapshotting empty logs
> 
> v3: use separate debugfs for error log (Chris)
> 
> v4: rebased
> 
> v5: clean up obj selection, move err_load inside guc_log, move err_load
> cleanup, rename functions (Michal)
> 
> Cc: Chris Wilson 
> Cc: Oscar Mateo 
> Cc: Michal Wajdeczko 
> Signed-off-by: Daniele Ceraolo Spurio 
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c  | 38 
> +++-
>  drivers/gpu/drm/i915/intel_guc_log.c | 17 
>  drivers/gpu/drm/i915/intel_uc.c  | 14 +++--
>  drivers/gpu/drm/i915/intel_uc.h  |  5 +
>  4 files changed, 58 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index bd9abef..740395c 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2607,27 +2607,36 @@ static int i915_guc_stage_pool(struct seq_file *m, 
> void *data)
>  
>  static int i915_guc_log_dump(struct seq_file *m, void *data)
>  {
> - struct drm_i915_private *dev_priv = node_to_i915(m->private);
> - struct drm_i915_gem_object *obj;
> - int i = 0, pg;
> -
> - if (!dev_priv->guc.log.vma)
> - return 0;
> + struct drm_info_node *node = m->private;
> + struct drm_i915_private *dev_priv = node_to_i915(node);
> + bool dump_load_err = !!node->info_ent->data;
> + struct drm_i915_gem_object *obj = NULL;
> + u32 *log;
> + int i = 0;
>  
> - obj = dev_priv->guc.log.vma->obj;
> - for (pg = 0; pg < obj->base.size / PAGE_SIZE; pg++) {
> - u32 *log = kmap_atomic(i915_gem_object_get_page(obj, pg));
> + if (dump_load_err)
> + obj = dev_priv->guc.log.load_err;
> + else if (dev_priv->guc.log.vma)
> + obj = dev_priv->guc.log.vma->obj;
>  
> - for (i = 0; i < PAGE_SIZE / sizeof(u32); i += 4)
> - seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
> -*(log + i), *(log + i + 1),
> -*(log + i + 2), *(log + i + 3));
> + if (!obj)
> + return 0;
>  
> - kunmap_atomic(log);
> + log = i915_gem_object_pin_map(obj, I915_MAP_WC);
> + if (IS_ERR(log)) {
> + seq_puts(m, "Failed to pin object\n");

Hmm, quite unusual to report internal message. Maybe:

DRM_DEBUG("Failed to pin object\n");
seq_puts(m, "(log data unaccessible)\n");


> + return PTR_ERR(log);
>   }
>  
> + for (i = 0; i < obj->base.size / sizeof(u32); i += 4)
> + seq_printf(m, "0x%08x 0x%08x 0x%08x 0x%08x\n",
> +*(log + i), *(log + i + 1),
> +*(log + i + 2), *(log + i + 3));
> +
>   seq_putc(m, '\n');
>  
> + i915_gem_object_unpin_map(obj);
> +
>   return 0;
>  }
>  
> @@ -4811,6 +4820,7 @@ static int i915_hpd_storm_ctl_open(struct inode *inode, 
> struct file *file)
>   {"i915_guc_info", i915_guc_info, 0},
>   {"i915_guc_load_status", i915_guc_load_status_info, 0},
>   {"i915_guc_log_dump", i915_guc_log_dump, 0},
> + {"i915_guc_load_err_log_dump", i915_guc_log_dump, 0, (void *)1},

bikeshed: quite long name, maybe "i915_guc_load_log_dump" ?

>   {"i915_guc_stage_pool", i915_guc_stage_pool, 0},
>   {"i915_huc_load_status", i915_huc_load_status_info, 0},
>   {"i915_frequency_info", i915_frequency_info, 0},
> diff --git a/drivers/gpu/drm/i915/intel_guc_log.c 
> b/drivers/gpu/drm/i915/intel_guc_log.c
> index 16d3b87..31e7bec 100644
> --- a/drivers/gpu/drm/i915/intel_guc_log.c
> +++ b/drivers/gpu/drm/i915/intel_guc_log.c
> @@ -660,3 +660,20 @@ void i915_guc_log_unregister(struct drm_i915_private 
> *dev_priv)
>   guc_log_runtime_destroy(&dev_priv->guc);
>   mutex_unlock(&dev_priv->drm.struct_mutex);
>  }
> +
> +void intel_guc_log_capture_load_err(struct intel_guc *guc)
> +{
> + if (!guc->log.vma || i915.guc_log_level < 0)
> + return;
> +
> + if (!guc->log.load_err)
> + guc->log.load_err = i915_gem_object_get(guc->log.vma->obj);
> +
> + return;
> +}
> +
> +void intel_guc_log_free_load_err(struct intel_guc *guc)
> +{
> + if (guc->log.load_err)
> + i915_gem_object_put(guc->log.load_err);
> +}

Based on the discussion from IM, it looks that it would be better to keep
guc->load_err object directly in struct intel_guc, and at the same time
move both capture/free functions to guc_uc.c as static:

void guc_capture_load_err_log(s