Create a devcoredump on any faulty or fatal event. The coredump data is
in YAML format for readability and flexibility.

Only panthor_group state is captured for now.

Signed-off-by: Chia-I Wu <olva...@gmail.com>
---
 drivers/gpu/drm/panthor/Makefile           |   2 +
 drivers/gpu/drm/panthor/panthor_coredump.c | 225 +++++++++++++++++++++
 drivers/gpu/drm/panthor/panthor_coredump.h |  68 +++++++
 drivers/gpu/drm/panthor/panthor_device.h   |   6 +
 drivers/gpu/drm/panthor/panthor_sched.c    |  69 +++++++
 drivers/gpu/drm/panthor/panthor_sched.h    |   5 +
 6 files changed, 375 insertions(+)
 create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.c
 create mode 100644 drivers/gpu/drm/panthor/panthor_coredump.h

diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile
index 15294719b09c..9fd1e74af1df 100644
--- a/drivers/gpu/drm/panthor/Makefile
+++ b/drivers/gpu/drm/panthor/Makefile
@@ -11,4 +11,6 @@ panthor-y := \
        panthor_mmu.o \
        panthor_sched.o
 
+panthor-$(CONFIG_DEV_COREDUMP) += panthor_coredump.o
+
 obj-$(CONFIG_DRM_PANTHOR) += panthor.o
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.c b/drivers/gpu/drm/panthor/panthor_coredump.c
new file mode 100644
index 000000000000..767f3327e3e8
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_coredump.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/* Copyright 2025 Google LLC */
+
+#include <drm/drm_drv.h>
+#include <drm/drm_print.h>
+#include <drm/drm_managed.h>
+#include <generated/utsrelease.h>
+#include <linux/devcoredump.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+#include "panthor_coredump.h"
+#include "panthor_device.h"
+#include "panthor_sched.h"
+
+/**
+ * enum panthor_coredump_mask - Flag bits recording which states were captured
+ */
+enum panthor_coredump_mask {
+       PANTHOR_COREDUMP_GROUP = BIT(0),
+};
+
+/**
+ * struct panthor_coredump_header - Coredump reason and real-time timestamp
+ */
+struct panthor_coredump_header {
+       enum panthor_coredump_reason reason;
+       ktime_t timestamp;
+};
+
+/**
+ * struct panthor_coredump - A single coredump: captured states and serialized data
+ */
+struct panthor_coredump {
+       /** @ptdev: Device. */
+       struct panthor_device *ptdev;
+
+       /** @work: Bottom half of panthor_coredump_capture. */
+       struct work_struct work;
+
+       /** @header: Header. */
+       struct panthor_coredump_header header;
+
+       /** @mask: Bitmask of captured states. */
+       u32 mask;
+
+       struct panthor_coredump_group_state group;
+
+       /** @data: Serialized coredump data. */
+       void *data;
+
+       /** @size: Serialized coredump size. */
+       size_t size;
+};
+
+static const char *reason_str(enum panthor_coredump_reason reason)
+{
+       switch (reason) {
+       case PANTHOR_COREDUMP_REASON_MMU_FAULT:
+               return "MMU_FAULT";
+       case PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT:
+               return "CSG_REQ_TIMEOUT";
+       case PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE:
+               return "CSG_UNKNOWN_STATE";
+       case PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT:
+               return "CSG_PROGRESS_TIMEOUT";
+       case PANTHOR_COREDUMP_REASON_CS_FATAL:
+               return "CS_FATAL";
+       case PANTHOR_COREDUMP_REASON_CS_FAULT:
+               return "CS_FAULT";
+       case PANTHOR_COREDUMP_REASON_CS_TILER_OOM:
+               return "CS_TILER_OOM";
+       case PANTHOR_COREDUMP_REASON_JOB_TIMEOUT:
+               return "JOB_TIMEOUT";
+       default:
+               return "UNKNOWN";
+       }
+}
+
+static void print_group(struct drm_printer *p,
+                       const struct panthor_coredump_group_state *group)
+{
+       drm_puts(p, "group:\n");
+       drm_printf(p, "  priority: %d\n", group->priority);
+       drm_printf(p, "  queue_count: %u\n", group->queue_count);
+       drm_printf(p, "  pid: %d\n", group->pid);
+       drm_printf(p, "  comm: %s\n", group->comm);
+       drm_printf(p, "  destroyed: %d\n", group->destroyed);
+       drm_printf(p, "  csg_id: %d\n", group->csg_id);
+}
+
+static void print_header(struct drm_printer *p,
+                        const struct panthor_coredump_header *header,
+                        const struct drm_driver *drv)
+{
+       drm_puts(p, "header:\n");
+       drm_puts(p, "  kernel: " UTS_RELEASE "\n");
+       drm_puts(p, "  module: " KBUILD_MODNAME "\n");
+       drm_printf(p, "  driver_version: %d.%d\n", drv->major, drv->minor);
+
+       drm_printf(p, "  reason: %s\n", reason_str(header->reason));
+       drm_printf(p, "  timestamp: %lld\n", ktime_to_ns(header->timestamp));
+}
+
+static void print_cd(struct drm_printer *p, const struct panthor_coredump *cd)
+{
+       /* in YAML format */
+       drm_puts(p, "---\n");
+       print_header(p, &cd->header, cd->ptdev->base.driver);
+
+       if (cd->mask & PANTHOR_COREDUMP_GROUP)
+               print_group(p, &cd->group);
+}
+
+static void process_cd(struct panthor_device *ptdev,
+                      struct panthor_coredump *cd)
+{
+       struct drm_print_iterator iter = {
+               .remain = SSIZE_MAX,
+       };
+       struct drm_printer p = drm_coredump_printer(&iter);
+
+       print_cd(&p, cd);
+
+       iter.remain = SSIZE_MAX - iter.remain;
+       iter.data = kvmalloc(iter.remain, GFP_USER);
+       if (!iter.data)
+               return;
+
+       cd->data = iter.data;
+       cd->size = iter.remain;
+
+       drm_info(&ptdev->base, "generating coredump of size %zu\n", cd->size);
+
+       p = drm_coredump_printer(&iter);
+       print_cd(&p, cd);
+}
+
+static void capture_cd(struct panthor_device *ptdev,
+                      struct panthor_coredump *cd, struct panthor_group *group)
+{
+       drm_info(&ptdev->base, "capturing coredump states\n");
+
+       if (group) {
+               panthor_group_capture_coredump(group, &cd->group);
+               cd->mask |= PANTHOR_COREDUMP_GROUP;
+       }
+}
+
+static void panthor_coredump_free(void *data)
+{
+       struct panthor_coredump *cd = data;
+       struct panthor_device *ptdev = cd->ptdev;
+
+       kvfree(cd->data);
+       kfree(cd);
+
+       atomic_set(&ptdev->coredump.pending, 0);
+}
+
+static ssize_t panthor_coredump_read(char *buffer, loff_t offset, size_t count,
+                                    void *data, size_t datalen)
+{
+       const struct panthor_coredump *cd = data;
+
+       if (offset >= cd->size)
+               return 0;
+
+       if (count > cd->size - offset)
+               count = cd->size - offset;
+
+       memcpy(buffer, cd->data + offset, count);
+
+       return count;
+}
+
+static void panthor_coredump_process_work(struct work_struct *work)
+{
+       struct panthor_coredump *cd =
+               container_of(work, struct panthor_coredump, work);
+       struct panthor_device *ptdev = cd->ptdev;
+
+       process_cd(ptdev, cd);
+
+       dev_coredumpm(ptdev->base.dev, THIS_MODULE, cd, 0, GFP_KERNEL,
+                     panthor_coredump_read, panthor_coredump_free);
+}
+
+void panthor_coredump_capture(struct panthor_coredump *cd,
+                             struct panthor_group *group)
+{
+       struct panthor_device *ptdev = cd->ptdev;
+
+       capture_cd(ptdev, cd, group);
+
+       queue_work(system_unbound_wq, &cd->work);
+}
+
+struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+                      enum panthor_coredump_reason reason, gfp_t gfp)
+{
+       struct panthor_coredump *cd;
+
+       /* reject all but the first coredump until it is handled */
+       if (atomic_cmpxchg(&ptdev->coredump.pending, 0, 1)) {
+               drm_dbg(&ptdev->base, "skip subsequent coredump\n");
+               return NULL;
+       }
+
+       cd = kzalloc(sizeof(*cd), gfp);
+       if (!cd) {
+               atomic_set(&ptdev->coredump.pending, 0);
+               return NULL;
+       }
+
+       cd->ptdev = ptdev;
+       INIT_WORK(&cd->work, panthor_coredump_process_work);
+
+       cd->header.reason = reason;
+       cd->header.timestamp = ktime_get_real();
+
+       return cd;
+}
diff --git a/drivers/gpu/drm/panthor/panthor_coredump.h b/drivers/gpu/drm/panthor/panthor_coredump.h
new file mode 100644
index 000000000000..dd1fe1c2e175
--- /dev/null
+++ b/drivers/gpu/drm/panthor/panthor_coredump.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+/* Copyright 2019 Collabora ltd. */
+
+#ifndef __PANTHOR_COREDUMP_H__
+#define __PANTHOR_COREDUMP_H__
+
+#include <drm/panthor_drm.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+struct panthor_coredump;
+struct panthor_device;
+struct panthor_group;
+
+/**
+ * enum panthor_coredump_reason - Event that caused a coredump to be requested
+ */
+enum panthor_coredump_reason {
+       PANTHOR_COREDUMP_REASON_MMU_FAULT,
+       PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
+       PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
+       PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT,
+       PANTHOR_COREDUMP_REASON_CS_FATAL,
+       PANTHOR_COREDUMP_REASON_CS_FAULT,
+       PANTHOR_COREDUMP_REASON_CS_TILER_OOM,
+       PANTHOR_COREDUMP_REASON_JOB_TIMEOUT,
+};
+
+/**
+ * struct panthor_coredump_group_state - Coredump group state
+ *
+ * Copy of selected panthor_group fields; pid and comm are not filled in yet.
+ */
+struct panthor_coredump_group_state {
+       enum drm_panthor_group_priority priority;
+       u32 queue_count;
+       pid_t pid;
+       char comm[TASK_COMM_LEN];
+       bool destroyed;
+       int csg_id;
+};
+
+#ifdef CONFIG_DEV_COREDUMP
+
+struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+                      enum panthor_coredump_reason reason, gfp_t gfp);
+
+void panthor_coredump_capture(struct panthor_coredump *cd,
+                             struct panthor_group *group);
+
+#else /* CONFIG_DEV_COREDUMP */
+
+static inline struct panthor_coredump *
+panthor_coredump_alloc(struct panthor_device *ptdev,
+                      enum panthor_coredump_reason reason, gfp_t gfp)
+{
+       return NULL;
+}
+
+static inline void panthor_coredump_capture(struct panthor_coredump *cd,
+                                           struct panthor_group *group)
+{
+}
+
+#endif /* CONFIG_DEV_COREDUMP */
+
+#endif /* __PANTHOR_COREDUMP_H__ */
diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h
index 4fc7cf2aeed5..766e53c25cfa 100644
--- a/drivers/gpu/drm/panthor/panthor_device.h
+++ b/drivers/gpu/drm/panthor/panthor_device.h
@@ -197,6 +197,12 @@ struct panthor_device {
                atomic_t recovery_needed;
        } pm;
 
+       /** @coredump: Coredump-related data. */
+       struct {
+               /** @pending: True if there is a pending coredump. */
+               atomic_t pending;
+       } coredump;
+
        /** @profile_mask: User-set profiling flags for job accounting. */
        u32 profile_mask;
 
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index a2248f692a03..eb45b5ad9774 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -23,6 +23,7 @@
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 
+#include "panthor_coredump.h"
 #include "panthor_devfreq.h"
 #include "panthor_device.h"
 #include "panthor_fw.h"
@@ -1031,6 +1032,10 @@ group_unbind_locked(struct panthor_group *group)
        return 0;
 }
 
+static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
+                                         enum panthor_coredump_reason reason,
+                                         struct panthor_group *group);
+
 /**
  * cs_slot_prog_locked() - Program a queue slot
  * @ptdev: Device.
@@ -1249,6 +1254,10 @@ csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id)
                drm_err(&ptdev->base, "Invalid state on CSG %d (state=%d)",
                        csg_id, csg_state);
                new_state = PANTHOR_CS_GROUP_UNKNOWN_STATE;
+
+               panthor_sched_coredump_locked(
+                       ptdev, PANTHOR_COREDUMP_REASON_CSG_UNKNOWN_STATE,
+                       group);
                break;
        }
 
@@ -1378,6 +1387,9 @@ cs_slot_process_fatal_event_locked(struct panthor_device *ptdev,
                 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)),
                 (unsigned int)CS_EXCEPTION_DATA(fatal),
                 info);
+
+       panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FATAL,
+                                     group);
 }
 
 static void
@@ -1426,6 +1438,9 @@ cs_slot_process_fault_event_locked(struct panthor_device *ptdev,
                 panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)),
                 (unsigned int)CS_EXCEPTION_DATA(fault),
                 info);
+
+       panthor_sched_coredump_locked(ptdev, PANTHOR_COREDUMP_REASON_CS_FAULT,
+                                     group);
 }
 
 static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
@@ -1480,6 +1495,10 @@ static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id)
                drm_warn(&ptdev->base, "Failed to extend the tiler heap\n");
                group->fatal_queues |= BIT(cs_id);
                sched_queue_delayed_work(sched, tick, 0);
+
+               panthor_sched_coredump_locked(
+                       ptdev, PANTHOR_COREDUMP_REASON_CS_TILER_OOM, group);
+
                goto out_put_heap_pool;
        }
 
@@ -1639,6 +1658,9 @@ csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 c
                group->timedout = true;
 
        sched_queue_delayed_work(sched, tick, 0);
+
+       panthor_sched_coredump_locked(
+               ptdev, PANTHOR_COREDUMP_REASON_CSG_PROGRESS_TIMEOUT, group);
 }
 
 static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id)
@@ -1858,8 +1880,16 @@ static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev,
 
                if (ret && acked != req_mask &&
                    ((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) {
+                       struct panthor_csg_slot *csg_slot =
+                               &sched->csg_slots[csg_id];
+                       struct panthor_group *group = csg_slot->group;
+
                        drm_err(&ptdev->base, "CSG %d update request timedout", csg_id);
                        ctx->timedout_mask |= BIT(csg_id);
+
+                       panthor_sched_coredump_locked(
+                               ptdev, PANTHOR_COREDUMP_REASON_CSG_REQ_TIMEOUT,
+                               group);
                }
        }
 
@@ -2027,6 +2057,10 @@ tick_ctx_init(struct panthor_scheduler *sched,
                 * CSG IRQs, so we can flag the faulty queue.
                 */
                if (panthor_vm_has_unhandled_faults(group->vm)) {
+                       panthor_sched_coredump_locked(
+                               ptdev, PANTHOR_COREDUMP_REASON_MMU_FAULT,
+                               group);
+
                        sched_process_csg_irq_locked(ptdev, i);
 
                        /* No fatal fault reported, flag all queues as faulty. */
@@ -3237,6 +3271,10 @@ queue_timedout_job(struct drm_sched_job *sched_job)
 
                group_queue_work(group, term);
        }
+
+       panthor_sched_coredump_locked(
+               ptdev, PANTHOR_COREDUMP_REASON_JOB_TIMEOUT, group);
+
        mutex_unlock(&sched->lock);
 
        queue_start(queue);
@@ -3627,6 +3665,37 @@ int panthor_group_get_state(struct panthor_file *pfile,
        return 0;
 }
 
+static void panthor_sched_coredump_locked(struct panthor_device *ptdev,
+                                         enum panthor_coredump_reason reason,
+                                         struct panthor_group *group)
+{
+       struct panthor_coredump *cd;
+
+       lockdep_assert_held(&ptdev->scheduler->lock);
+
+       /* GFP_NOWAIT because this may be called from fence signaling path */
+       cd = panthor_coredump_alloc(ptdev, reason, GFP_NOWAIT);
+       if (!cd)
+               return;
+
+       panthor_coredump_capture(cd, group);
+}
+
+void panthor_group_capture_coredump(const struct panthor_group *group,
+                                   struct panthor_coredump_group_state *state)
+{
+       const struct panthor_device *ptdev = group->ptdev;
+
+       /* this is called from panthor_coredump_capture */
+       lockdep_assert_held(&ptdev->scheduler->lock);
+
+       state->priority = group->priority;
+       state->queue_count = group->queue_count;
+       /* TODO state->pid and state->comm */
+       state->destroyed = group->destroyed;
+       state->csg_id = group->csg_id;
+}
+
 int panthor_group_pool_create(struct panthor_file *pfile)
 {
        struct panthor_group_pool *gpool;
diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h
index 742b0b4ff3a3..6c564153133e 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.h
+++ b/drivers/gpu/drm/panthor/panthor_sched.h
@@ -14,8 +14,10 @@ struct drm_panthor_group_create;
 struct drm_panthor_queue_create;
 struct drm_panthor_group_get_state;
 struct drm_panthor_queue_submit;
+struct panthor_coredump_group_state;
 struct panthor_device;
 struct panthor_file;
+struct panthor_group;
 struct panthor_group_pool;
 struct panthor_job;
 
@@ -26,6 +28,9 @@ int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle);
 int panthor_group_get_state(struct panthor_file *pfile,
                            struct drm_panthor_group_get_state *get_state);
 
+void panthor_group_capture_coredump(const struct panthor_group *group,
+                                   struct panthor_coredump_group_state *state);
+
 struct drm_sched_job *
 panthor_job_create(struct panthor_file *pfile,
                   u16 group_handle,
-- 
2.50.0.727.gbf7dc18ff4-goog

Reply via email to