Re: [ndctl PATCH v3 2/7] libcxl: Add CXL protocol errors

Dave Jiang Thu, 23 Oct 2025 15:50:36 -0700

On 10/23/25 1:15 PM, Cheatham, Benjamin wrote:
> On 10/21/2025 6:15 PM, Dave Jiang wrote:
>>
>>
>> On 10/21/25 11:31 AM, Ben Cheatham wrote:
>>> The v6.11 Linux kernel adds CXL protocol (CXL.cache & CXL.mem) error
>>> injection for platforms that implement the error types according to
>>> the v6.5+ ACPI specification. The interface for injecting these errors
>>> is provided by the kernel under the CXL debugfs. The relevant files in
>>> the interface are the einj_types file, which provides the available CXL
>>> error types for injection, and the einj_inject file, which injects the
>>> error into a CXL VH root port or CXL RCH downstream port.
>>>
>>> Add a library API to retrieve the CXL error types and inject them. This
>>> API will be used in a later commit by the 'cxl-inject-error' and
>>> 'cxl-list' commands.
>>>
>>> Signed-off-by: Ben Cheatham <[email protected]>
>>> ---
>>>  cxl/lib/libcxl.c   | 174 +++++++++++++++++++++++++++++++++++++++++++++
>>>  cxl/lib/libcxl.sym |   5 ++
>>>  cxl/lib/private.h  |  14 ++++
>>>  cxl/libcxl.h       |  13 ++++
>>>  4 files changed, 206 insertions(+)
>>>
>>> diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c
>>> index ea5831f..9486b0f 100644
>>> --- a/cxl/lib/libcxl.c
>>> +++ b/cxl/lib/libcxl.c
>>> @@ -46,11 +46,13 @@ struct cxl_ctx {
>>>     void *userdata;
>>>     int memdevs_init;
>>>     int buses_init;
>>> +   int perrors_init;
>>>     unsigned long timeout;
>>>     struct udev *udev;
>>>     struct udev_queue *udev_queue;
>>>     struct list_head memdevs;
>>>     struct list_head buses;
>>> +   struct list_head perrors;
>>>     struct kmod_ctx *kmod_ctx;
>>>     struct daxctl_ctx *daxctl_ctx;
>>>     void *private_data;
>>> @@ -205,6 +207,14 @@ static void free_bus(struct cxl_bus *bus, struct list_head *head)
>>>     free(bus);
>>>  }
>>>  
>>> +static void free_protocol_error(struct cxl_protocol_error *perror,
>>> +                           struct list_head *head)
>>> +{
>>> +   if (head)
>>> +           list_del_from(head, &perror->list);
>>
>> I would go if (!head) return;
>>
> 
> Would that work? I think I would still need to free perror below.

Ah right you need to free that. nm

DJ
> 
>>> +   free(perror);
>>> +}
>>> +
>>>  /**
>>>   * cxl_get_userdata - retrieve stored data pointer from library context
>>>   * @ctx: cxl library context
>>> @@ -328,6 +338,7 @@ CXL_EXPORT int cxl_new(struct cxl_ctx **ctx)
>>>     *ctx = c;
>>>     list_head_init(&c->memdevs);
>>>     list_head_init(&c->buses);
>>> +   list_head_init(&c->perrors);
>>>     c->kmod_ctx = kmod_ctx;
>>>     c->daxctl_ctx = daxctl_ctx;
>>>     c->udev = udev;
>>> @@ -369,6 +380,7 @@ CXL_EXPORT struct cxl_ctx *cxl_ref(struct cxl_ctx *ctx)
>>>   */
>>>  CXL_EXPORT void cxl_unref(struct cxl_ctx *ctx)
>>>  {
>>> +   struct cxl_protocol_error *perror, *_p;
>>>     struct cxl_memdev *memdev, *_d;
>>>     struct cxl_bus *bus, *_b;
>>>  
>>> @@ -384,6 +396,9 @@ CXL_EXPORT void cxl_unref(struct cxl_ctx *ctx)
>>>     list_for_each_safe(&ctx->buses, bus, _b, port.list)
>>>             free_bus(bus, &ctx->buses);
>>>  
>>> +   list_for_each_safe(&ctx->perrors, perror, _p, list)
>>> +           free_protocol_error(perror, &ctx->perrors);
>>> +
>>>     udev_queue_unref(ctx->udev_queue);
>>>     udev_unref(ctx->udev);
>>>     kmod_unref(ctx->kmod_ctx);
>>> @@ -3416,6 +3431,165 @@ CXL_EXPORT int cxl_port_decoders_committed(struct cxl_port *port)
>>>     return port->decoders_committed;
>>>  }
>>>  
>>> +const struct cxl_protocol_error cxl_protocol_errors[] = {
>>> +   CXL_PROTOCOL_ERROR(12, "cache-correctable"),
>>> +   CXL_PROTOCOL_ERROR(13, "cache-uncorrectable"),
>>> +   CXL_PROTOCOL_ERROR(14, "cache-fatal"),
>>> +   CXL_PROTOCOL_ERROR(15, "mem-correctable"),
>>> +   CXL_PROTOCOL_ERROR(16, "mem-uncorrectable"),
>>> +   CXL_PROTOCOL_ERROR(17, "mem-fatal")
>>> +};
>>> +
>>> +static struct cxl_protocol_error *create_cxl_protocol_error(struct cxl_ctx *ctx,
>>> +                                                       unsigned long n)
>>
>> why unsigned long instead of int? are there that many errors?
>>
> 
> No there aren't. I'll change it over to unsigned int instead.
> 
>>> +{
>>> +   struct cxl_protocol_error *perror;
>>> +
>>> +   for (unsigned long i = 0; i < ARRAY_SIZE(cxl_protocol_errors); i++) {
>>> +           if (n != BIT(cxl_protocol_errors[i].num))
>>> +                   continue;
>>> +
>>> +           perror = calloc(1, sizeof(*perror));
>>> +           if (!perror)
>>> +                   return NULL;
>>> +
>>> +           *perror = cxl_protocol_errors[i];
>>> +           perror->ctx = ctx;
>>> +           return perror;
>>> +   }
>>> +
>>> +   return NULL;
>>> +}
>>> +
>>> +static void cxl_add_protocol_errors(struct cxl_ctx *ctx)
>>> +{
>>> +   struct cxl_protocol_error *perror;
>>> +   char *path, *num, *save;
>>> +   unsigned long n;
>>> +   size_t path_len;
>>> +   char buf[512];
>>
>> Use SYSFS_ATTR_SIZE rather than 512
> 
> Wasn't aware of that, will do!
> 
>>
>>> +   int rc = 0;
>>> +
>>> +   if (!ctx->debugfs)
>>> +           return;
>>> +
>>> +   path_len = strlen(ctx->debugfs) + 100;
>>> +   path = calloc(1, path_len);
>>> +   if (!path)
>>> +           return;
>>> +
>>> +   snprintf(path, path_len, "%s/cxl/einj_types", ctx->debugfs);
>>> +   rc = access(path, F_OK);
>>> +   if (rc) {
>>> +           err(ctx, "failed to access %s: %s\n", path, strerror(-rc));
>> strerror(errno)? access() returns -1 and the actual error is in errno.
> 
> My bad, will update it (and elsewhere).
> 
>>> +           goto err;
>>> +   }
>>> +
>>> +   rc = sysfs_read_attr(ctx, path, buf);
>>> +   if (rc) {
>>> +           err(ctx, "failed to read %s: %s\n", path, strerror(-rc));
>>> +           goto err;
>>> +   }
>>> +
>>> +   /*
>>> +    * The format of the output of the einj_types attr is:
>>> +    * <Error number in hex 1> <Error name 1>
>>> +    * <Error number in hex 2> <Error name 2>
>>> +    * ...
>>> +    *
>>> +    * We only need the number, so parse that and skip the rest of
>>> +    * the line.
>>> +    */
>>> +   num = strtok_r(buf, " \n", &save);
>>> +   while (num) {
>>> +           n = strtoul(num, NULL, 16);
>>> +           perror = create_cxl_protocol_error(ctx, n);
>>> +           if (perror)
>>> +                   list_add(&ctx->perrors, &perror->list);
>>> +
>>> +           num = strtok_r(NULL, "\n", &save);
>>> +           if (!num)
>>> +                   break;
>>> +
>>> +           num = strtok_r(NULL, " \n", &save);
>>> +   }
>>> +
>>> +err:
>>> +   free(path);
>>> +}
>>> +
>>> +static void cxl_protocol_errors_init(struct cxl_ctx *ctx)
>>> +{
>>> +   if (ctx->perrors_init)
>>> +           return;
>>> +
>>> +   ctx->perrors_init = 1;
>>> +   cxl_add_protocol_errors(ctx);
>>> +}
>>> +
>>> +CXL_EXPORT struct cxl_protocol_error *
>>> +cxl_protocol_error_get_first(struct cxl_ctx *ctx)
>>> +{
>>> +   cxl_protocol_errors_init(ctx);
>>> +
>>> +   return list_top(&ctx->perrors, struct cxl_protocol_error, list);
>>> +}
>>> +
>>> +CXL_EXPORT struct cxl_protocol_error *
>>> +cxl_protocol_error_get_next(struct cxl_protocol_error *perror)
>>> +{
>>> +   struct cxl_ctx *ctx = perror->ctx;
>>> +
>>> +   return list_next(&ctx->perrors, perror, list);
>>> +}
>>> +
>>> +CXL_EXPORT unsigned long
>>> +cxl_protocol_error_get_num(struct cxl_protocol_error *perror)
>>> +{
>>> +   return perror->num;
>>> +}
>>> +
>>> +CXL_EXPORT const char *
>>> +cxl_protocol_error_get_str(struct cxl_protocol_error *perror)
>>> +{
>>> +   return perror->string;
>>> +}
>>> +
>>> +CXL_EXPORT int cxl_dport_protocol_error_inject(struct cxl_dport *dport,
>>> +                                          unsigned long error)
>>> +{
>>> +   struct cxl_ctx *ctx = dport->port->ctx;
>>> +   unsigned long path_len;
>>> +   char buf[32] = { 0 };
>>> +   char *path;
>>> +   int rc;
>>> +
>>> +   if (!ctx->debugfs)
>>> +           return -ENOENT;
>>> +
>>> +   path_len = strlen(ctx->debugfs) + 100;
>>> +   path = calloc(path_len, sizeof(char));
>>> +   if (!path)
>>> +           return -ENOMEM;
>>> +
>>> +   snprintf(path, path_len, "%s/cxl/%s/einj_inject", ctx->debugfs,
>>> +            cxl_dport_get_devname(dport));
>>
>> check return value
> 
> Yep, will do (elsewhere as well).
> 
>>
>>> +   rc = access(path, F_OK);
>>> +   if (rc) {
>>> +           err(ctx, "failed to access %s: %s\n", path, strerror(-rc));
>>
>> errno
>>
>>> +           free(path);
>>> +           return rc;
>> -errno instead of rc
>>
>>> +   }
>>> +
>>> +   snprintf(buf, sizeof(buf), "0x%lx\n", error);
>>
>> check return value?
>>
>> DJ
>>
Re: [ndctl PATCH v3 2/7] libcxl: Add CXL protocol errors

Reply via email to