On Sun, 2023-10-01 at 15:31 -0700, [email protected] wrote:
> From: Alison Schofield <[email protected]>
>
> Poison list records are logged as events in the kernel tracing
> subsystem. To prepare the poison list for cxl list, enable tracing,
> trigger the poison list read, and parse the generated cxl_poison
> events into a json representation.
>
> Signed-off-by: Alison Schofield <[email protected]>
> ---
> cxl/json.c | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> util/json.h | 1 +
> 2 files changed, 209 insertions(+)
>
> diff --git a/cxl/json.c b/cxl/json.c
> index 7678d02020b6..36db73de4f8f 100644
> --- a/cxl/json.c
> +++ b/cxl/json.c
> @@ -2,15 +2,19 @@
> // Copyright (C) 2015-2021 Intel Corporation. All rights reserved.
> #include <limits.h>
> #include <util/json.h>
> +#include <util/bitmap.h>
> #include <uuid/uuid.h>
> #include <cxl/libcxl.h>
> #include <json-c/json.h>
> #include <json-c/printbuf.h>
> #include <ccan/short_types/short_types.h>
> +#include <traceevent/event-parse.h>
> +#include <tracefs/tracefs.h>
>
> #include "filter.h"
> #include "json.h"
> #include "../daxctl/json.h"
> +#include "event_trace.h"
>
> #define CXL_FW_VERSION_STR_LEN 16
> #define CXL_FW_MAX_SLOTS 4
> @@ -571,6 +575,190 @@ err_jobj:
> return NULL;
> }
>
> +/* CXL 8.2.9.5.4.1 Get Poison List: Poison Source */
These usually have a spec version too - "CXL 3.0 8.2.9... "
> +#define CXL_POISON_SOURCE_UNKNOWN 0
> +#define CXL_POISON_SOURCE_EXTERNAL 1
> +#define CXL_POISON_SOURCE_INTERNAL 2
> +#define CXL_POISON_SOURCE_INJECTED 3
> +#define CXL_POISON_SOURCE_VENDOR 7
> +
> +/* CXL 8.2.9.5.4.1 Get Poison List: Payload out flags */
Same thing here.
> +#define CXL_POISON_FLAG_MORE BIT(0)
> +#define CXL_POISON_FLAG_OVERFLOW BIT(1)
> +#define CXL_POISON_FLAG_SCANNING BIT(2)
> +
> +static struct json_object *
> +util_cxl_poison_events_to_json(struct tracefs_instance *inst, bool is_region,
> + unsigned long flags)
> +{
> + struct json_object *jerrors, *jmedia, *jobj = NULL;
Since everything else is now 'poison', might be good to also
s/jmedia/jpoison/ everywhere.
> + struct jlist_node *jnode, *next;
> + struct event_ctx ectx = {
> + .event_name = "cxl_poison",
> + .event_pid = getpid(),
> + .system = "cxl",
> + };
> + int rc, count = 0;
> +
> + list_head_init(&ectx.jlist_head);
> + rc = cxl_parse_events(inst, &ectx);
> + if (rc < 0) {
> + fprintf(stderr, "Failed to parse events: %d\n", rc);
> + return NULL;
> + }
> + /* Add nr_poison_records:0 to json */
> + if (list_empty(&ectx.jlist_head))
> + goto out;
> +
> + jerrors = json_object_new_array();
> + if (!jerrors)
> + return NULL;
> +
> + list_for_each_safe(&ectx.jlist_head, jnode, next, list) {
> + struct json_object *jval = NULL;
> + struct json_object *jp = NULL;
Are the NULL assignments needed? At least for @jp, it is
unconditionally assigned below, and isn't used before that. I suspect
json-c probably doesn't care about what's in @jval either before
writing it.
> + int source, pflags;
> + u64 addr, len;
> +
> + jp = json_object_new_object();
> + if (!jp)
> + return NULL;
> +
> + if (is_region) {
> + /* Add the memdev name in a by region list */
> + if (json_object_object_get_ex(jnode->jobj, "memdev",
> + &jval))
> + json_object_object_add(jp, "memdev", jval);
> + }
> +
> + /*
> + * When listing is by memdev, region names and valid HPAs
> + * will appear if the poison address is part of a region.
> + * Pick up those valid region names and HPAs but ignore the
> + * empties and invalids.
> + */
> +
> + /* Only add non NULL region names */
> + if (json_object_object_get_ex(jnode->jobj, "region", &jval)) {
> + if (strlen(json_object_get_string(jval)) != 0)
> + json_object_object_add(jp, "region", jval);
> + }
> + /* Only display valid HPAs */
> + if (json_object_object_get_ex(jnode->jobj, "hpa", &jval)) {
> + addr = json_object_get_uint64(jval);
> + if (addr != ULLONG_MAX) {
> + jobj = util_json_object_hex(addr, flags);
> + json_object_object_add(jp, "hpa", jobj);
> + }
> + }
> + if (json_object_object_get_ex(jnode->jobj, "dpa", &jval)) {
> + addr = json_object_get_int64(jval);
> + jobj = util_json_object_hex(addr, flags);
> + json_object_object_add(jp, "dpa", jobj);
> + }
> + if (json_object_object_get_ex(jnode->jobj, "dpa_length",
> &jval)) {
> + len = json_object_get_int64(jval);
> + jobj = util_json_object_size(len, flags);
> + json_object_object_add(jp, "dpa_length", jobj);
> + }
> + if (json_object_object_get_ex(jnode->jobj, "source", &jval)) {
> + source = json_object_get_int(jval);
> + if (source == CXL_POISON_SOURCE_UNKNOWN)
> + jobj = json_object_new_string("Unknown");
> + else if (source == CXL_POISON_SOURCE_EXTERNAL)
> + jobj = json_object_new_string("External");
> + else if (source == CXL_POISON_SOURCE_INTERNAL)
> + jobj = json_object_new_string("Internal");
> + else if (source == CXL_POISON_SOURCE_INJECTED)
> + jobj = json_object_new_string("Injected");
> + else if (source == CXL_POISON_SOURCE_VENDOR)
> + jobj = json_object_new_string("Vendor");
> + else
> + jobj = json_object_new_string("Reserved");
Minor nit, but maybe 'switch (source) ...' would look a bit cleaner?
> + json_object_object_add(jp, "source", jobj);
> + }
> + if (json_object_object_get_ex(jnode->jobj, "flags", &jval)) {
> + char flag_str[32] = { '\0' };
> +
> + pflags = json_object_get_int(jval);
> + if (pflags & CXL_POISON_FLAG_MORE)
> + strcat(flag_str, "More,");
> + if (pflags & CXL_POISON_FLAG_OVERFLOW)
> + strcat(flag_str, "Overflow,");
> + if (pflags & CXL_POISON_FLAG_SCANNING)
> + strcat(flag_str, "Scanning,");
> + jobj = json_object_new_string(flag_str);
> + if (jobj)
> + json_object_object_add(jp, "flags", jobj);
> + }
> + if (json_object_object_get_ex(jnode->jobj, "overflow_t",
> &jval))
> + json_object_object_add(jp, "overflow_time", jval);
> +
> + json_object_array_add(jerrors, jp);
> + count++;
> + } /* list_for_each_safe */
> +
> +out:
> + jmedia = json_object_new_object();
> + if (!jmedia)
> + return NULL;
> +
> + /* Always include the count. If count is zero, no records follow. */
> + jobj = json_object_new_int(count);
> + if (jobj)
> + json_object_object_add(jmedia, "nr_poison_records", jobj);
> + if (count)
> + json_object_object_add(jmedia, "poison_records", jerrors);
Since these are already nested under a 'poison' JSON object, I'm
tempted to say these can just be 'nr_records' and 'records'
respectively.
> +
> + return jmedia;
> +}
> +
> +struct cxl_poison_ctx {
> + void *dev;
> + bool is_region;
> +};
This structure is a bit awkward - what do you think about creating
different wrappers for the memdev and region case -
util_cxl_memdev_poison_list_to_json(), and
util_cxl_region_poison_list_to_json() that are called respectively by
util_cxl_{memdev,region}_to_json(), and internally they can call:
util_cxl_poison_list_to_json(NULL, memdev, flags), or
util_cxl_poison_list_to_json(region, NULL, flags)
For the next level down, i.e. poison_events_to_json, the @is_region
bool passed in directly is fine as it doesn't need the memdev or region
objects passed in via void *.
> +
> +static struct json_object *
> +util_cxl_poison_list_to_json(struct cxl_poison_ctx *pctx,
> + unsigned long flags)
> +{
> + struct json_object *jmedia = NULL;
> + struct tracefs_instance *inst;
> + int rc;
> +
> + inst = tracefs_instance_create("cxl list");
> + if (!inst) {
> + fprintf(stderr, "tracefs_instance_create() failed\n");
> + return NULL;
> + }
> +
> + rc = cxl_event_tracing_enable(inst, "cxl", "cxl_poison");
> + if (rc < 0) {
> + fprintf(stderr, "Failed to enable trace: %d\n", rc);
> + goto err_free;
> + }
> +
> + if (pctx->is_region)
> + rc = cxl_region_trigger_poison_list(pctx->dev);
> + else
> + rc = cxl_memdev_trigger_poison_list(pctx->dev);
> + if (rc) {
> + fprintf(stderr, "Failed write of sysfs attribute: %d\n", rc);
This would be incorrect if the memdev trigger reported an ENOMEM, and
then this reported a sysfs write failure.
It should at least be something like 'failed to trigger poison' - but
since the memdev trigger helper has prints for every failure case,
maybe this can just be omitted?
> + goto err_free;
> + }
> +
> + rc = cxl_event_tracing_disable(inst);
> + if (rc < 0) {
> + fprintf(stderr, "Failed to disable trace: %d\n", rc);
> + goto err_free;
> + }
> +
> + jmedia = util_cxl_poison_events_to_json(inst, pctx->is_region, flags);
> +err_free:
> + tracefs_instance_free(inst);
> + return jmedia;
> +}
> +
> struct json_object *util_cxl_memdev_to_json(struct cxl_memdev *memdev,
> unsigned long flags)
> {
> @@ -649,6 +837,16 @@ struct json_object *util_cxl_memdev_to_json(struct
> cxl_memdev *memdev,
> json_object_object_add(jdev, "firmware", jobj);
> }
>
> + if (flags & UTIL_JSON_POISON_LIST) {
> + struct cxl_poison_ctx pctx = {
> + .dev = memdev,
> + .is_region = false,
> + };
> + jobj = util_cxl_poison_list_to_json(&pctx, flags);
> + if (jobj)
> + json_object_object_add(jdev, "poison", jobj);
> + }
> +
> json_object_set_userdata(jdev, memdev, NULL);
> return jdev;
> }
> @@ -987,6 +1185,16 @@ struct json_object *util_cxl_region_to_json(struct
> cxl_region *region,
> json_object_object_add(jregion, "state", jobj);
> }
>
> + if (flags & UTIL_JSON_POISON_LIST) {
> + struct cxl_poison_ctx pectx = {
> + .dev = region,
> + .is_region = true,
> + };
> + jobj = util_cxl_poison_list_to_json(&pectx, flags);
> + if (jobj)
> + json_object_object_add(jregion, "poison", jobj);
> + }
> +
> util_cxl_mappings_append_json(jregion, region, flags);
>
> if (flags & UTIL_JSON_DAX) {
> diff --git a/util/json.h b/util/json.h
> index ea370df4d1b7..3ae4074a95c3 100644
> --- a/util/json.h
> +++ b/util/json.h
> @@ -21,6 +21,7 @@ enum util_json_flags {
> UTIL_JSON_TARGETS = (1 << 11),
> UTIL_JSON_PARTITION = (1 << 12),
> UTIL_JSON_ALERT_CONFIG = (1 << 13),
> + UTIL_JSON_POISON_LIST = (1 << 14),
There's already a UTIL_JSON_MEDIA_ERRORS - can we just reuse that (in
spite of the name :))?
> };
>
> void util_display_json_array(FILE *f_out, struct json_object *jarray,