On 10/21/2025 6:15 PM, Dave Jiang wrote:
>
>
> On 10/21/25 11:31 AM, Ben Cheatham wrote:
>> The v6.11 Linux kernel adds CXL protocl (CXL.cache & CXL.mem) error
>> injection for platforms that implement the error types as according to
>> the v6.5+ ACPI specification. The interface for injecting these errors
>> are provided by the kernel under the CXL debugfs. The relevant files in
>> the interface are the einj_types file, which provides the available CXL
>> error types for injection, and the einj_inject file, which injects the
>> error into a CXL VH root port or CXL RCH downstream port.
>>
>> Add a library API to retrieve the CXL error types and inject them. This
>> API will be used in a later commit by the 'cxl-inject-error' and
>> 'cxl-list' commands.
>>
>> Signed-off-by: Ben Cheatham <[email protected]>
>> ---
>> cxl/lib/libcxl.c | 174 +++++++++++++++++++++++++++++++++++++++++++++
>> cxl/lib/libcxl.sym | 5 ++
>> cxl/lib/private.h | 14 ++++
>> cxl/libcxl.h | 13 ++++
>> 4 files changed, 206 insertions(+)
>>
>> diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c
>> index ea5831f..9486b0f 100644
>> --- a/cxl/lib/libcxl.c
>> +++ b/cxl/lib/libcxl.c
>> @@ -46,11 +46,13 @@ struct cxl_ctx {
>> void *userdata;
>> int memdevs_init;
>> int buses_init;
>> + int perrors_init;
>> unsigned long timeout;
>> struct udev *udev;
>> struct udev_queue *udev_queue;
>> struct list_head memdevs;
>> struct list_head buses;
>> + struct list_head perrors;
>> struct kmod_ctx *kmod_ctx;
>> struct daxctl_ctx *daxctl_ctx;
>> void *private_data;
>> @@ -205,6 +207,14 @@ static void free_bus(struct cxl_bus *bus, struct
>> list_head *head)
>> free(bus);
>> }
>>
>> +static void free_protocol_error(struct cxl_protocol_error *perror,
>> + struct list_head *head)
>> +{
>> + if (head)
>> + list_del_from(head, &perror->list);
>
> I would go if (!head) return;
>
Would that work? I think I would still need to free perror below.
>> + free(perror);
>> +}
>> +
>> /**
>> * cxl_get_userdata - retrieve stored data pointer from library context
>> * @ctx: cxl library context
>> @@ -328,6 +338,7 @@ CXL_EXPORT int cxl_new(struct cxl_ctx **ctx)
>> *ctx = c;
>> list_head_init(&c->memdevs);
>> list_head_init(&c->buses);
>> + list_head_init(&c->perrors);
>> c->kmod_ctx = kmod_ctx;
>> c->daxctl_ctx = daxctl_ctx;
>> c->udev = udev;
>> @@ -369,6 +380,7 @@ CXL_EXPORT struct cxl_ctx *cxl_ref(struct cxl_ctx *ctx)
>> */
>> CXL_EXPORT void cxl_unref(struct cxl_ctx *ctx)
>> {
>> + struct cxl_protocol_error *perror, *_p;
>> struct cxl_memdev *memdev, *_d;
>> struct cxl_bus *bus, *_b;
>>
>> @@ -384,6 +396,9 @@ CXL_EXPORT void cxl_unref(struct cxl_ctx *ctx)
>> list_for_each_safe(&ctx->buses, bus, _b, port.list)
>> free_bus(bus, &ctx->buses);
>>
>> + list_for_each_safe(&ctx->perrors, perror, _p, list)
>> + free_protocol_error(perror, &ctx->perrors);
>> +
>> udev_queue_unref(ctx->udev_queue);
>> udev_unref(ctx->udev);
>> kmod_unref(ctx->kmod_ctx);
>> @@ -3416,6 +3431,165 @@ CXL_EXPORT int cxl_port_decoders_committed(struct
>> cxl_port *port)
>> return port->decoders_committed;
>> }
>>
>> +const struct cxl_protocol_error cxl_protocol_errors[] = {
>> + CXL_PROTOCOL_ERROR(12, "cache-correctable"),
>> + CXL_PROTOCOL_ERROR(13, "cache-uncorrectable"),
>> + CXL_PROTOCOL_ERROR(14, "cache-fatal"),
>> + CXL_PROTOCOL_ERROR(15, "mem-correctable"),
>> + CXL_PROTOCOL_ERROR(16, "mem-uncorrectable"),
>> + CXL_PROTOCOL_ERROR(17, "mem-fatal")
>> +};
>> +
>> +static struct cxl_protocol_error *create_cxl_protocol_error(struct cxl_ctx
>> *ctx,
>> + unsigned long n)
>
> why unsigned long instead of int? are there that many errors?
>
No there aren't. I'll change it over to unsigned int instead.
>> +{
>> + struct cxl_protocol_error *perror;
>> +
>> + for (unsigned long i = 0; i < ARRAY_SIZE(cxl_protocol_errors); i++) {
>> + if (n != BIT(cxl_protocol_errors[i].num))
>> + continue;
>> +
>> + perror = calloc(1, sizeof(*perror));
>> + if (!perror)
>> + return NULL;
>> +
>> + *perror = cxl_protocol_errors[i];
>> + perror->ctx = ctx;
>> + return perror;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static void cxl_add_protocol_errors(struct cxl_ctx *ctx)
>> +{
>> + struct cxl_protocol_error *perror;
>> + char *path, *num, *save;
>> + unsigned long n;
>> + size_t path_len;
>> + char buf[512];
>
> Use SYSFS_ATTR_SIZE rather than 512
Wasn't aware of that, will do!
>
>> + int rc = 0;
>> +
>> + if (!ctx->debugfs)
>> + return;
>> +
>> + path_len = strlen(ctx->debugfs) + 100;
>> + path = calloc(1, path_len);
>> + if (!path)
>> + return;
>> +
>> + snprintf(path, path_len, "%s/cxl/einj_types", ctx->debugfs);
>> + rc = access(path, F_OK);
>> + if (rc) {
>> + err(ctx, "failed to access %s: %s\n", path, strerror(-rc));
> strerror(errno)? access() returns -1 and the actual error is in errno.
My bad, will update it (and elsewhere).
>> + goto err;
>> + }
>> +
>> + rc = sysfs_read_attr(ctx, path, buf);
>> + if (rc) {
>> + err(ctx, "failed to read %s: %s\n", path, strerror(-rc));
>> + goto err;
>> + }
>> +
>> + /*
>> + * The format of the output of the einj_types attr is:
>> + * <Error number in hex 1> <Error name 1>
>> + * <Error number in hex 2> <Error name 2>
>> + * ...
>> + *
>> + * We only need the number, so parse that and skip the rest of
>> + * the line.
>> + */
>> + num = strtok_r(buf, " \n", &save);
>> + while (num) {
>> + n = strtoul(num, NULL, 16);
>> + perror = create_cxl_protocol_error(ctx, n);
>> + if (perror)
>> + list_add(&ctx->perrors, &perror->list);
>> +
>> + num = strtok_r(NULL, "\n", &save);
>> + if (!num)
>> + break;
>> +
>> + num = strtok_r(NULL, " \n", &save);
>> + }
>> +
>> +err:
>> + free(path);
>> +}
>> +
>> +static void cxl_protocol_errors_init(struct cxl_ctx *ctx)
>> +{
>> + if (ctx->perrors_init)
>> + return;
>> +
>> + ctx->perrors_init = 1;
>> + cxl_add_protocol_errors(ctx);
>> +}
>> +
>> +CXL_EXPORT struct cxl_protocol_error *
>> +cxl_protocol_error_get_first(struct cxl_ctx *ctx)
>> +{
>> + cxl_protocol_errors_init(ctx);
>> +
>> + return list_top(&ctx->perrors, struct cxl_protocol_error, list);
>> +}
>> +
>> +CXL_EXPORT struct cxl_protocol_error *
>> +cxl_protocol_error_get_next(struct cxl_protocol_error *perror)
>> +{
>> + struct cxl_ctx *ctx = perror->ctx;
>> +
>> + return list_next(&ctx->perrors, perror, list);
>> +}
>> +
>> +CXL_EXPORT unsigned long
>> +cxl_protocol_error_get_num(struct cxl_protocol_error *perror)
>> +{
>> + return perror->num;
>> +}
>> +
>> +CXL_EXPORT const char *
>> +cxl_protocol_error_get_str(struct cxl_protocol_error *perror)
>> +{
>> + return perror->string;
>> +}
>> +
>> +CXL_EXPORT int cxl_dport_protocol_error_inject(struct cxl_dport *dport,
>> + unsigned long error)
>> +{
>> + struct cxl_ctx *ctx = dport->port->ctx;
>> + unsigned long path_len;
>> + char buf[32] = { 0 };
>> + char *path;
>> + int rc;
>> +
>> + if (!ctx->debugfs)
>> + return -ENOENT;
>> +
>> + path_len = strlen(ctx->debugfs) + 100;
>> + path = calloc(path_len, sizeof(char));
>> + if (!path)
>> + return -ENOMEM;
>> +
>> + snprintf(path, path_len, "%s/cxl/%s/einj_inject", ctx->debugfs,
>> + cxl_dport_get_devname(dport));
>
> check return value
Yep, will do (elsewhere as well).
>
>> + rc = access(path, F_OK);
>> + if (rc) {
>> + err(ctx, "failed to access %s: %s\n", path, strerror(-rc));
>
> errno
>
>> + free(path);
>> + return rc;
> -errno instead of rc
>
>> + }
>> +
>> + snprintf(buf, sizeof(buf), "0x%lx\n", error);
>
> check return value?
>
> DJ
>