Re: [ndctl PATCH v3 2/7] libcxl: Add CXL protocol errors

Cheatham, Benjamin Thu, 23 Oct 2025 13:15:38 -0700

On 10/21/2025 6:15 PM, Dave Jiang wrote:
> 
> 
> On 10/21/25 11:31 AM, Ben Cheatham wrote:
>> The v6.11 Linux kernel adds CXL protocl (CXL.cache & CXL.mem) error
>> injection for platforms that implement the error types as according to
>> the v6.5+ ACPI specification. The interface for injecting these errors
>> are provided by the kernel under the CXL debugfs. The relevant files in
>> the interface are the einj_types file, which provides the available CXL
>> error types for injection, and the einj_inject file, which injects the
>> error into a CXL VH root port or CXL RCH downstream port.
>>
>> Add a library API to retrieve the CXL error types and inject them. This
>> API will be used in a later commit by the 'cxl-inject-error' and
>> 'cxl-list' commands.
>>
>> Signed-off-by: Ben Cheatham <[email protected]>
>> ---
>>  cxl/lib/libcxl.c   | 174 +++++++++++++++++++++++++++++++++++++++++++++
>>  cxl/lib/libcxl.sym |   5 ++
>>  cxl/lib/private.h  |  14 ++++
>>  cxl/libcxl.h       |  13 ++++
>>  4 files changed, 206 insertions(+)
>>
>> diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c
>> index ea5831f..9486b0f 100644
>> --- a/cxl/lib/libcxl.c
>> +++ b/cxl/lib/libcxl.c
>> @@ -46,11 +46,13 @@ struct cxl_ctx {
>>      void *userdata;
>>      int memdevs_init;
>>      int buses_init;
>> +    int perrors_init;
>>      unsigned long timeout;
>>      struct udev *udev;
>>      struct udev_queue *udev_queue;
>>      struct list_head memdevs;
>>      struct list_head buses;
>> +    struct list_head perrors;
>>      struct kmod_ctx *kmod_ctx;
>>      struct daxctl_ctx *daxctl_ctx;
>>      void *private_data;
>> @@ -205,6 +207,14 @@ static void free_bus(struct cxl_bus *bus, struct 
>> list_head *head)
>>      free(bus);
>>  }
>>  
>> +static void free_protocol_error(struct cxl_protocol_error *perror,
>> +                            struct list_head *head)
>> +{
>> +    if (head)
>> +            list_del_from(head, &perror->list);
> 
> I would go if (!head) return;
>


Would that work? I think I would still need to free perror below.

>> +    free(perror);
>> +}
>> +
>>  /**
>>   * cxl_get_userdata - retrieve stored data pointer from library context
>>   * @ctx: cxl library context
>> @@ -328,6 +338,7 @@ CXL_EXPORT int cxl_new(struct cxl_ctx **ctx)
>>      *ctx = c;
>>      list_head_init(&c->memdevs);
>>      list_head_init(&c->buses);
>> +    list_head_init(&c->perrors);
>>      c->kmod_ctx = kmod_ctx;
>>      c->daxctl_ctx = daxctl_ctx;
>>      c->udev = udev;
>> @@ -369,6 +380,7 @@ CXL_EXPORT struct cxl_ctx *cxl_ref(struct cxl_ctx *ctx)
>>   */
>>  CXL_EXPORT void cxl_unref(struct cxl_ctx *ctx)
>>  {
>> +    struct cxl_protocol_error *perror, *_p;
>>      struct cxl_memdev *memdev, *_d;
>>      struct cxl_bus *bus, *_b;
>>  
>> @@ -384,6 +396,9 @@ CXL_EXPORT void cxl_unref(struct cxl_ctx *ctx)
>>      list_for_each_safe(&ctx->buses, bus, _b, port.list)
>>              free_bus(bus, &ctx->buses);
>>  
>> +    list_for_each_safe(&ctx->perrors, perror, _p, list)
>> +            free_protocol_error(perror, &ctx->perrors);
>> +
>>      udev_queue_unref(ctx->udev_queue);
>>      udev_unref(ctx->udev);
>>      kmod_unref(ctx->kmod_ctx);
>> @@ -3416,6 +3431,165 @@ CXL_EXPORT int cxl_port_decoders_committed(struct 
>> cxl_port *port)
>>      return port->decoders_committed;
>>  }
>>  
>> +const struct cxl_protocol_error cxl_protocol_errors[] = {
>> +    CXL_PROTOCOL_ERROR(12, "cache-correctable"),
>> +    CXL_PROTOCOL_ERROR(13, "cache-uncorrectable"),
>> +    CXL_PROTOCOL_ERROR(14, "cache-fatal"),
>> +    CXL_PROTOCOL_ERROR(15, "mem-correctable"),
>> +    CXL_PROTOCOL_ERROR(16, "mem-uncorrectable"),
>> +    CXL_PROTOCOL_ERROR(17, "mem-fatal")
>> +};
>> +
>> +static struct cxl_protocol_error *create_cxl_protocol_error(struct cxl_ctx 
>> *ctx,
>> +                                                        unsigned long n)
> 
> why unsigned long instead of int? are there that many errors?
>

No there aren't. I'll change it over to unsigned int instead.

>> +{
>> +    struct cxl_protocol_error *perror;
>> +
>> +    for (unsigned long i = 0; i < ARRAY_SIZE(cxl_protocol_errors); i++) {
>> +            if (n != BIT(cxl_protocol_errors[i].num))
>> +                    continue;
>> +
>> +            perror = calloc(1, sizeof(*perror));
>> +            if (!perror)
>> +                    return NULL;
>> +
>> +            *perror = cxl_protocol_errors[i];
>> +            perror->ctx = ctx;
>> +            return perror;
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +static void cxl_add_protocol_errors(struct cxl_ctx *ctx)
>> +{
>> +    struct cxl_protocol_error *perror;
>> +    char *path, *num, *save;
>> +    unsigned long n;
>> +    size_t path_len;
>> +    char buf[512];
> 
> Use SYSFS_ATTR_SIZE rather than 512

Wasn't aware of that, will do!

> 
>> +    int rc = 0;
>> +
>> +    if (!ctx->debugfs)
>> +            return;
>> +
>> +    path_len = strlen(ctx->debugfs) + 100;
>> +    path = calloc(1, path_len);
>> +    if (!path)
>> +            return;
>> +
>> +    snprintf(path, path_len, "%s/cxl/einj_types", ctx->debugfs);
>> +    rc = access(path, F_OK);
>> +    if (rc) {
>> +            err(ctx, "failed to access %s: %s\n", path, strerror(-rc));
> strerror(errno)? access() returns -1 and the actual error is in errno.

My bad, will update it (and elsewhere).

>> +            goto err;
>> +    }
>> +
>> +    rc = sysfs_read_attr(ctx, path, buf);
>> +    if (rc) {
>> +            err(ctx, "failed to read %s: %s\n", path, strerror(-rc));
>> +            goto err;
>> +    }
>> +
>> +    /*
>> +     * The format of the output of the einj_types attr is:
>> +     * <Error number in hex 1> <Error name 1>
>> +     * <Error number in hex 2> <Error name 2>
>> +     * ...
>> +     *
>> +     * We only need the number, so parse that and skip the rest of
>> +     * the line.
>> +     */
>> +    num = strtok_r(buf, " \n", &save);
>> +    while (num) {
>> +            n = strtoul(num, NULL, 16);
>> +            perror = create_cxl_protocol_error(ctx, n);
>> +            if (perror)
>> +                    list_add(&ctx->perrors, &perror->list);
>> +
>> +            num = strtok_r(NULL, "\n", &save);
>> +            if (!num)
>> +                    break;
>> +
>> +            num = strtok_r(NULL, " \n", &save);
>> +    }
>> +
>> +err:
>> +    free(path);
>> +}
>> +
>> +static void cxl_protocol_errors_init(struct cxl_ctx *ctx)
>> +{
>> +    if (ctx->perrors_init)
>> +            return;
>> +
>> +    ctx->perrors_init = 1;
>> +    cxl_add_protocol_errors(ctx);
>> +}
>> +
>> +CXL_EXPORT struct cxl_protocol_error *
>> +cxl_protocol_error_get_first(struct cxl_ctx *ctx)
>> +{
>> +    cxl_protocol_errors_init(ctx);
>> +
>> +    return list_top(&ctx->perrors, struct cxl_protocol_error, list);
>> +}
>> +
>> +CXL_EXPORT struct cxl_protocol_error *
>> +cxl_protocol_error_get_next(struct cxl_protocol_error *perror)
>> +{
>> +    struct cxl_ctx *ctx = perror->ctx;
>> +
>> +    return list_next(&ctx->perrors, perror, list);
>> +}
>> +
>> +CXL_EXPORT unsigned long
>> +cxl_protocol_error_get_num(struct cxl_protocol_error *perror)
>> +{
>> +    return perror->num;
>> +}
>> +
>> +CXL_EXPORT const char *
>> +cxl_protocol_error_get_str(struct cxl_protocol_error *perror)
>> +{
>> +    return perror->string;
>> +}
>> +
>> +CXL_EXPORT int cxl_dport_protocol_error_inject(struct cxl_dport *dport,
>> +                                           unsigned long error)
>> +{
>> +    struct cxl_ctx *ctx = dport->port->ctx;
>> +    unsigned long path_len;
>> +    char buf[32] = { 0 };
>> +    char *path;
>> +    int rc;
>> +
>> +    if (!ctx->debugfs)
>> +            return -ENOENT;
>> +
>> +    path_len = strlen(ctx->debugfs) + 100;
>> +    path = calloc(path_len, sizeof(char));
>> +    if (!path)
>> +            return -ENOMEM;
>> +
>> +    snprintf(path, path_len, "%s/cxl/%s/einj_inject", ctx->debugfs,
>> +             cxl_dport_get_devname(dport));
> 
> check return value

Yep, will do (elsewhere as well).

> 
>> +    rc = access(path, F_OK);
>> +    if (rc) {
>> +            err(ctx, "failed to access %s: %s\n", path, strerror(-rc));
> 
> errno
> 
>> +            free(path);
>> +            return rc;
> -errno instead of rc
> 
>> +    }
>> +
>> +    snprintf(buf, sizeof(buf), "0x%lx\n", error);
> 
> check return value?
> 
> DJ
>

Re: [ndctl PATCH v3 2/7] libcxl: Add CXL protocol errors

Reply via email to