Add support for receiving Dynamic Capacity (DC) event records, but defer
the real add/release logic to subsequent commits. For now, refuse all
extents offered by DC_ADD events and ack all DC_RELEASE events. Forced
release is currently unsupported.
In order, this commit adds the following:
1. Learn about DC Event Records and how to respond to them
* cxl_mem_get_event_records() learns about the DC Event record.
Records of that type are routed to cxl_handle_dcd_event_records().
* cxl_handle_dcd_event_records() switches on event_type:
- DCD_ADD_CAPACITY -> handle_add_event()
- DCD_RELEASE_CAPACITY -> cxl_rm_extent()
- DCD_FORCED_CAPACITY_RELEASE is logged and ignored (FM/device-only).
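* cxl_send_dc_response() sends the reply mailbox commands
ADD_DC_RESPONSE / RELEASE_DC, splitting the extent list across several
commands (chained with the More flag) when it exceeds the payload size.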
2. Add stubs for DC_ADD and DC_RELEASE logic
* handle_add_event() stages incoming extents onto
mds->add_ctx.pending_extents and, when More=0 closes the chain,
replies with an empty ADD_DC_RESPONSE — refusing all extents for now
* cxl_rm_extent() acks the release via memdev_release_extent() so the
device's view stays consistent; we can ack all releases because
we currently don't accept/use any extents offered.
3. Structural setup for later commits:
* struct dc_extent, struct cxl_dc_tag_group, and struct pending_add_ctx
set the stage for the real DC_ADD path, which will enforce
tag/grouping semantics.
Based on an original patch by Navneet Singh.
Signed-off-by: Ira Weiny <[email protected]>
Signed-off-by: Anisa Su <[email protected]>
---
Changes:
[anisa: restructured from the original "Process dynamic partition
events" monolith; this commit lands only the wire-level intake and
dispatches the add/release logic to stubbed handlers. The handlers
are fleshed out in subsequent commits.]
---
drivers/cxl/core/mbox.c | 246 +++++++++++++++++++++++++++++++++++++++-
drivers/cxl/cxl.h | 73 +++++++++++-
drivers/cxl/cxlmem.h | 45 ++++++++
include/cxl/event.h | 38 +++++++
4 files changed, 400 insertions(+), 2 deletions(-)
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 01b1a318f34f..1b38f34538f3 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -5,6 +5,7 @@
#include <linux/ktime.h>
#include <linux/mutex.h>
#include <linux/unaligned.h>
+#include <linux/list.h>
#include <cxlpci.h>
#include <cxlmem.h>
#include <cxl.h>
@@ -1102,6 +1103,238 @@ static int cxl_clear_event_record(struct
cxl_memdev_state *mds,
return rc;
}
+/*
+ * Stamp the response header (extent count and flags) and issue a single
+ * @opcode mailbox command whose input payload is @response, sized for
+ * @extent_list_size extent entries.
+ *
+ * Returns the result of cxl_internal_send_cmd().
+ */
+static int send_one_response(struct cxl_mailbox *cxl_mbox,
+			     struct cxl_mbox_dc_response *response,
+			     int opcode, u32 extent_list_size, u8 flags)
+{
+	struct cxl_mbox_cmd mbox_cmd;
+
+	response->flags = flags;
+	response->extent_list_size = cpu_to_le32(extent_list_size);
+
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = opcode,
+		.size_in = struct_size(response, extent_list, extent_list_size),
+		.payload_in = response,
+	};
+
+	return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+
+/*
+ * Report the extents on @extent_list to the device using one or more
+ * @opcode (ADD_DC_RESPONSE / RELEASE_DC) mailbox commands.
+ *
+ * @cnt is the number of extents to report.  If they do not all fit in one
+ * mailbox payload the list is split across several commands; every command
+ * except the last carries CXL_DCD_EVENT_MORE.  A @cnt of 0 sends a single
+ * empty response and @extent_list is not walked.  At most @cnt entries of
+ * @extent_list are consumed; the list itself is not modified.
+ *
+ * Returns 0 on success, -EINVAL if @cnt is negative or the mailbox payload
+ * cannot hold even one extent, -ENOMEM on allocation failure, or the error
+ * returned by the failing mailbox command.
+ */
+static int cxl_send_dc_response(struct cxl_memdev_state *mds, int opcode,
+				struct list_head *extent_list, int cnt)
+{
+	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+	struct cxl_mbox_dc_response *p;
+	struct cxl_extent_list_node *pos;
+	struct cxl_extent *extent;
+	size_t pl_size;
+	u32 max_extents;
+	u32 pl_index = 0;
+
+	if (cnt < 0)
+		return -EINVAL;
+
+	pl_size = struct_size(p, extent_list, cnt);
+	max_extents = cnt;
+
+	/* Too many extents for one payload: split and chain with the More bit. */
+	if (pl_size > cxl_mbox->payload_size) {
+		/* Guard the subtraction below and reject a zero batch size. */
+		if (cxl_mbox->payload_size < struct_size(p, extent_list, 1))
+			return -EINVAL;
+
+		max_extents = (cxl_mbox->payload_size - sizeof(*p)) /
+			      sizeof(struct updated_extent_list);
+		pl_size = struct_size(p, extent_list, max_extents);
+	}
+
+	struct cxl_mbox_dc_response *response __free(kfree) =
+		kzalloc(pl_size, GFP_KERNEL);
+	if (!response)
+		return -ENOMEM;
+
+	if (cnt == 0)
+		return send_one_response(cxl_mbox, response, opcode, 0, 0);
+
+	/*
+	 * extent_list[] is bounded by extent_list_size (__counted_by), so the
+	 * counter has to cover an element before that element is written.
+	 * send_one_response() rewrites it with the exact count actually sent.
+	 */
+	response->extent_list_size = cpu_to_le32(max_extents);
+
+	list_for_each_entry(pos, extent_list, list) {
+		u8 flags = 0;
+		int rc;
+
+		extent = pos->extent;
+		response->extent_list[pl_index].dpa_start = extent->start_dpa;
+		response->extent_list[pl_index].length = extent->length;
+		pl_index++;
+
+		if (pl_index < max_extents)
+			continue;
+
+		/* Batch is full: flush it, flagging More if extents remain. */
+		if (pl_index < cnt)
+			flags |= CXL_DCD_EVENT_MORE;
+		rc = send_one_response(cxl_mbox, response, opcode, pl_index,
+				       flags);
+		if (rc)
+			return rc;
+
+		cnt -= pl_index;
+		if (!cnt)	/* all @cnt extents have been reported */
+			return 0;
+		if (cnt < max_extents)
+			max_extents = cnt;
+		response->extent_list_size = cpu_to_le32(max_extents);
+		pl_index = 0;
+	}
+
+	/* The list ran out before the current batch filled up. */
+	if (!pl_index)
+		return 0;
+	return send_one_response(cxl_mbox, response, opcode, pl_index, 0);
+}
+
+/* Unlink @node from the list it is on, then free its extent and the node. */
+static void delete_extent_node(struct cxl_extent_list_node *node)
+{
+	struct cxl_extent *extent = node->extent;
+
+	list_del(&node->list);
+	kfree(extent);
+	kfree(node);
+}
+
+/*
+ * Tell the device, with a single-extent RELEASE_DC command, that the host
+ * is not using @range.  Best effort: a failed command is only logged.
+ *
+ * The one-entry extent list lives on the stack.  cxl_send_dc_response()
+ * only reads the list, so no allocation is needed here and there is no
+ * out-of-memory path on which the release would be silently dropped.
+ */
+static void memdev_release_extent(struct cxl_memdev_state *mds,
+				  struct range *range)
+{
+	struct device *dev = mds->cxlds.dev;
+	struct cxl_extent extent = {
+		.start_dpa = cpu_to_le64(range->start),
+		.length = cpu_to_le64(range_len(range)),
+	};
+	struct cxl_extent_list_node node = {
+		.extent = &extent,
+	};
+	LIST_HEAD(extent_list);
+
+	dev_dbg(dev, "Release response dpa %pra\n", range);
+
+	list_add_tail(&node.list, &extent_list);
+
+	if (cxl_send_dc_response(mds, CXL_MBOX_OP_RELEASE_DC, &extent_list, 1))
+		dev_dbg(dev, "Failed to release %pra\n", range);
+}
+
+/*
+ * Drop every extent staged for an in-progress add chain and forget the tag
+ * group being assembled.  Takes a void pointer so it can be registered as a
+ * devm action; @_mds is the struct cxl_memdev_state.
+ */
+static void clear_pending_extents(void *_mds)
+{
+	struct cxl_memdev_state *mds = _mds;
+	struct list_head *pending = &mds->add_ctx.pending_extents;
+
+	while (!list_empty(pending))
+		delete_extent_node(list_first_entry(pending,
+						    struct cxl_extent_list_node,
+						    list));
+
+	mds->add_ctx.group = NULL;
+}
+
+/*
+ * Append a private copy of @to_add to the tail of @pending_list.
+ *
+ * Returns 0 on success or -ENOMEM.  On failure nothing is added to the list
+ * and nothing is leaked.
+ */
+static int add_to_pending_list(struct list_head *pending_list,
+			       struct cxl_extent *to_add)
+{
+	struct cxl_extent_list_node *node;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	node->extent = kmemdup(to_add, sizeof(*to_add), GFP_KERNEL);
+	if (!node->extent) {
+		/* Do not leak the list node when the extent copy fails. */
+		kfree(node);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&node->list, pending_list);
+	return 0;
+}
+
+/*
+ * Stub for DCD_ADD_CAPACITY: every record's extent is staged on the pending
+ * list.  While the device keeps More set nothing else happens; the record
+ * that clears More closes the chain, which is answered with an empty
+ * ADD_DC_RESPONSE (i.e. all extents refused) and the staged extents are
+ * dropped.  A later commit replaces the no-op tail with the real Add
+ * pipeline that surfaces a dax device per accepted extent.
+ *
+ * Returns 0, the staging error, or the error from sending the response.
+ */
+static int handle_add_event(struct cxl_memdev_state *mds,
+			    struct cxl_event_dcd *event)
+{
+	struct list_head *pending = &mds->add_ctx.pending_extents;
+	bool more = event->flags & CXL_DCD_EVENT_MORE;
+	int rc;
+
+	rc = add_to_pending_list(pending, &event->extent);
+	if (rc)
+		return rc;
+
+	if (!more) {
+		/* Chain closed: cnt == 0 means "accept nothing". */
+		rc = cxl_send_dc_response(mds, CXL_MBOX_OP_ADD_DC_RESPONSE,
+					  pending, 0);
+		clear_pending_extents(mds);
+		return rc;
+	}
+
+	dev_dbg(mds->cxlds.dev,
+		"more bit set; delay the surfacing of extent\n");
+	return 0;
+}
+
+/*
+ * Stub for DCD_RELEASE_CAPACITY: convert the little-endian extent into an
+ * inclusive DPA range and ack the release back to the device so it knows
+ * the host is not using that range.  A later commit replaces this with the
+ * real teardown that walks the region's tag group and tears down the member
+ * dc_extent devices.
+ *
+ * Always returns 0.
+ */
+static int cxl_rm_extent(struct cxl_memdev_state *mds,
+			 struct cxl_extent *extent)
+{
+	struct range dpa_range;
+
+	dpa_range.start = le64_to_cpu(extent->start_dpa);
+	dpa_range.end = dpa_range.start + le64_to_cpu(extent->length) - 1;
+
+	memdev_release_extent(mds, &dpa_range);
+	return 0;
+}
+
+/*
+ * Map a DCD event type (enum dc_event value) to a short name for log
+ * messages.  The returned string is a literal and must not be modified,
+ * hence the const return type.  Unrecognised types yield "<unknown>".
+ */
+static const char *cxl_dcd_evt_type_str(u8 type)
+{
+	switch (type) {
+	case DCD_ADD_CAPACITY:
+		return "add";
+	case DCD_RELEASE_CAPACITY:
+		return "release";
+	case DCD_FORCED_CAPACITY_RELEASE:
+		return "force release";
+	default:
+		return "<unknown>";
+	}
+}
+
+/*
+ * Dispatch one raw event record read from the DCD event log.
+ *
+ * Records whose UUID is not the Dynamic Capacity Event Record UUID are
+ * ignored.  Add and release events go to their handlers, forced release is
+ * logged and ignored, and any other event type is treated as -EINVAL.
+ * Handler failures are only logged (rate limited); nothing is returned.
+ */
+static void cxl_handle_dcd_event_records(struct cxl_memdev_state *mds,
+					 struct cxl_event_record_raw *raw_rec)
+{
+	struct cxl_event_dcd *event = &raw_rec->event.dcd;
+	struct cxl_extent *extent = &event->extent;
+	struct device *dev = mds->cxlds.dev;
+	int rc = 0;
+
+	if (!uuid_equal(&raw_rec->id, &CXL_EVENT_DC_EVENT_UUID))
+		return;
+
+	dev_dbg(dev, "DCD event %s : DPA:%#llx LEN:%#llx\n",
+		cxl_dcd_evt_type_str(event->event_type),
+		le64_to_cpu(extent->start_dpa), le64_to_cpu(extent->length));
+
+	switch (event->event_type) {
+	case DCD_ADD_CAPACITY:
+		rc = handle_add_event(mds, event);
+		break;
+	case DCD_RELEASE_CAPACITY:
+		rc = cxl_rm_extent(mds, extent);
+		break;
+	case DCD_FORCED_CAPACITY_RELEASE:
+		dev_err_ratelimited(dev, "Forced release event ignored.\n");
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	if (!rc)
+		return;
+
+	dev_err_ratelimited(dev, "dcd event failed: %d\n", rc);
+}
+
static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
enum cxl_event_log_type type)
{
@@ -1138,9 +1371,13 @@ static void cxl_mem_get_records_log(struct
cxl_memdev_state *mds,
if (!nr_rec)
break;
- for (i = 0; i < nr_rec; i++)
+ for (i = 0; i < nr_rec; i++) {
__cxl_event_trace_record(cxlmd, type,
&payload->records[i]);
+ if (type == CXL_EVENT_TYPE_DCD)
+ cxl_handle_dcd_event_records(mds,
+ &payload->records[i]);
+ }
if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
trace_cxl_overflow(cxlmd, type, payload);
@@ -1172,6 +1409,8 @@ void cxl_mem_get_event_records(struct cxl_memdev_state
*mds, u32 status)
{
dev_dbg(mds->cxlds.dev, "Reading event logs: %x\n", status);
+ if (cxl_dcd_supported(mds) && (status & CXLDEV_EVENT_STATUS_DCD))
+ cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_DCD);
if (status & CXLDEV_EVENT_STATUS_FATAL)
cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_FATAL);
if (status & CXLDEV_EVENT_STATUS_FAIL)
@@ -1769,6 +2008,11 @@ struct cxl_memdev_state *cxl_memdev_state_create(struct
device *dev, u64 serial,
}
mutex_init(&mds->event.log_lock);
+ INIT_LIST_HEAD(&mds->add_ctx.pending_extents);
+
+ rc = devm_add_action_or_reset(dev, clear_pending_extents, mds);
+ if (rc)
+ return ERR_PTR(rc);
rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier);
if (rc == -EOPNOTSUPP)
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 1297594beaec..5ef2cf4d005b 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -12,6 +12,7 @@
#include <linux/node.h>
#include <linux/io.h>
#include <linux/range.h>
+#include <linux/xarray.h>
#include <cxl/cxl.h>
extern const struct nvdimm_security_ops *cxl_security_ops;
@@ -180,11 +181,13 @@ static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
#define CXLDEV_EVENT_STATUS_WARN BIT(1)
#define CXLDEV_EVENT_STATUS_FAIL BIT(2)
#define CXLDEV_EVENT_STATUS_FATAL BIT(3)
+#define CXLDEV_EVENT_STATUS_DCD BIT(4)
#define CXLDEV_EVENT_STATUS_ALL (CXLDEV_EVENT_STATUS_INFO | \
CXLDEV_EVENT_STATUS_WARN | \
CXLDEV_EVENT_STATUS_FAIL | \
- CXLDEV_EVENT_STATUS_FATAL)
+ CXLDEV_EVENT_STATUS_FATAL | \
+ CXLDEV_EVENT_STATUS_DCD)
/* CXL rev 3.0 section 8.2.9.2.4; Table 8-52 */
#define CXLDEV_EVENT_INT_MODE_MASK GENMASK(1, 0)
@@ -306,6 +309,41 @@ enum cxl_decoder_state {
CXL_DECODER_STATE_AUTO_STAGED,
};
+struct cxl_dc_tag_group;
+
+/**
+ * struct dc_extent - A single dynamic-capacity extent surfaced to the host.
+ * @dev: device representing this extent; child of cxlr_dax->dev.
+ * @group: containing tag group (allocation); shared across siblings.
+ * @cxled: endpoint decoder backing the DPA range.
+ * @dpa_range: DPA range this extent covers within @cxled.
+ * @hpa_range: HPA range that @dpa_range decodes to, relative to
+ * cxlr_dax->hpa_range.start.
+ * @uuid: tag uuid (matches @group->uuid; kept for the release-path log).
+ * @seq_num: 1..n assembly-order index within the tag group. For extents
+ * from a sharable partition this equals the device-stamped
+ * shared_extn_seq (CXL 3.1 Table 8-51). For extents from a
+ * non-sharable partition the device leaves shared_extn_seq == 0
+ * and the host assigns @seq_num in event arrival order at
+ * cxl_add_pending() time. Used by the dax layer to assemble
+ * ranges in the right order regardless of source.
+ *
+ * One per device-stamped extent. Multiple dc_extents that share a tag
+ * (see &struct cxl_dc_tag_group) form a single logical allocation, but
+ * each dc_extent has its own HPA range and is the unit that the DAX
+ * layer sees as a backing dax_resource.
+ *
+ * NOTE(review): nothing in this patch instantiates this structure, and
+ * cxl_add_pending() is not defined here; both are expected to arrive with
+ * the real DC_ADD path in a later commit.
+ */
+struct dc_extent {
+ struct device dev;
+ struct cxl_dc_tag_group *group;
+ struct cxl_endpoint_decoder *cxled;
+ struct range dpa_range;
+ struct range hpa_range;
+ uuid_t uuid;
+ u16 seq_num;
+};
+
/**
* struct cxl_endpoint_decoder - Endpoint / SPA to DPA decoder
* @cxld: base cxl_decoder_object
@@ -518,12 +556,45 @@ struct cxl_pmem_region {
struct cxl_pmem_region_mapping mapping[];
};
+/*
+ * Dynamic Capacity event types; see CXL 3.1 8.2.9.2.1.6.
+ *
+ * These enumerators are compared directly against the event_type byte of a
+ * DC event record (struct cxl_event_dcd), so their implicit values
+ * (0, 1, 2, 3) must not be reordered.
+ */
+enum dc_event {
+ DCD_ADD_CAPACITY,
+ DCD_RELEASE_CAPACITY,
+ DCD_FORCED_CAPACITY_RELEASE,
+ DCD_REGION_CONFIGURATION_UPDATED,
+};
+
struct cxl_dax_region {
struct device dev;
struct cxl_region *cxlr;
struct range hpa_range;
};
+/**
+ * struct cxl_dc_tag_group - A tagged dynamic-capacity allocation.
+ * @cxlr_dax: back reference to parent region device.
+ * @uuid: tag identifying this allocation; same across all member dc_extents.
+ * @dc_extents: xarray of &struct dc_extent in this group, indexed by the
+ * dc_extent's @seq_num (1..n, dense). See &struct dc_extent
+ * for how seq_num is sourced for sharable vs non-sharable
+ * allocations.
+ * @nr_extents: live count of dc_extents in the group; the group is freed
+ * when the last dc_extent device is released.
+ *
+ * Container for the &struct dc_extent siblings that share a tag. The
+ * group has no sysfs identity; userspace sees the individual dc_extents
+ * directly under the parent dax_region device. The group exists to
+ * keep tag-scoped invariants (atomic add, atomic release, ordered carve
+ * by seq_num) in one place.
+ *
+ * NOTE(review): this patch only declares the structure; the code that
+ * populates @dc_extents and frees the group is expected in later commits.
+ */
+struct cxl_dc_tag_group {
+ struct cxl_dax_region *cxlr_dax;
+ uuid_t uuid;
+ struct xarray dc_extents;
+ unsigned int nr_extents;
+};
+
/**
* struct cxl_port - logical collection of upstream port devices and
* downstream port devices to construct a CXL memory
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 65c009b02da6..592c8e3b611c 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -7,6 +7,7 @@
#include <linux/cdev.h>
#include <linux/uuid.h>
#include <linux/node.h>
+#include <linux/list.h>
#include <cxl/event.h>
#include <cxl/mailbox.h>
#include "cxl.h"
@@ -399,6 +400,23 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct
cxl_mailbox *cxl_mbox)
return dev_get_drvdata(cxl_mbox->host);
}
+/**
+ * struct pending_add_ctx - Staging state for an in-progress
+ * DCD_ADD_CAPACITY event chain
+ * @pending_extents: list of &struct cxl_extent_list_node holding the
+ * extents received so far in the chain; flushed when
+ * the chain closes (More=0)
+ * @group: tag group being assembled from the chain; reset to NULL whenever
+ * the pending list is cleared
+ *
+ * A DCD_ADD_CAPACITY notification can span multiple event records
+ * stitched together by the CXL_DCD_EVENT_MORE flag. Records are staged
+ * here until the device clears More, at which point the staged batch is
+ * processed and responded to as a single Add_DC_Response.
+ */
+struct pending_add_ctx {
+ struct list_head pending_extents;
+ struct cxl_dc_tag_group *group;
+};
+
/**
* struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data
*
@@ -417,6 +435,8 @@ static inline struct cxl_dev_state *mbox_to_cxlds(struct
cxl_mailbox *cxl_mbox)
* @active_volatile_bytes: sum of hard + soft volatile
* @active_persistent_bytes: sum of hard + soft persistent
* @dcd_supported: all DCD commands are supported
+ * @add_ctx: state for an in-progress DCD_ADD_CAPACITY chain
+ * (see &struct pending_add_ctx)
* @event: event log driver state
* @poison: poison driver state info
* @security: security driver state info
@@ -437,6 +457,7 @@ struct cxl_memdev_state {
u64 active_volatile_bytes;
u64 active_persistent_bytes;
bool dcd_supported;
+ struct pending_add_ctx add_ctx;
struct cxl_event_state event;
struct cxl_poison_state poison;
@@ -513,6 +534,21 @@ enum cxl_opcode {
UUID_INIT(0x5e1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19,
\
0x40, 0x3d, 0x86)
+/*
+ * Add Dynamic Capacity Response
+ * CXL rev 3.1 section 8.2.9.9.9.3; Table 8-168 & Table 8-169
+ *
+ * Input payload used for both the ADD_DC_RESPONSE and RELEASE_DC commands.
+ *
+ * extent_list_size: number of entries in extent_list[] (little-endian)
+ * flags: CXL_DCD_EVENT_MORE when further commands continue the list
+ * extent_list: updated extents; DPA start and length are little-endian
+ *
+ * The counter is a __le32, so the bounds annotation must be the
+ * endian-aware __counted_by_le(): a plain __counted_by() would make the
+ * compiler read a byte-swapped count on big-endian hosts.  As with any
+ * counted flexible array, extent_list_size must be set to cover an entry
+ * before that entry is written.
+ */
+struct cxl_mbox_dc_response {
+	__le32 extent_list_size;
+	u8 flags;
+	u8 reserved[3];
+	struct updated_extent_list {
+		__le64 dpa_start;
+		__le64 length;
+		u8 reserved[8];
+	} __packed extent_list[] __counted_by_le(extent_list_size);
+} __packed;
+
struct cxl_mbox_get_supported_logs {
__le16 entries;
u8 rsvd[6];
@@ -583,6 +619,14 @@ struct cxl_mbox_identify {
UUID_INIT(0xe71f3a40, 0x2d29, 0x4092, 0x8a, 0x39, 0x4d, 0x1c, 0x96, \
0x6c, 0x7c, 0x65)
+/*
+ * Dynamic Capacity Event Record
+ * CXL rev 3.1 section 8.2.9.2.1; Table 8-43
+ *
+ * UUID identifying a DC event record. The DCD event handler compares it
+ * against the id of each raw record and ignores records that do not match.
+ */
+#define CXL_EVENT_DC_EVENT_UUID \
+ UUID_INIT(0xca95afa7, 0xf183, 0x4018, 0x8c, 0x2f, 0x95, 0x26, 0x8e, \
+ 0x10, 0x1a, 0x2a)
+
/*
* Get Event Records output payload
* CXL rev 3.0 section 8.2.9.2.2; Table 8-50
@@ -608,6 +652,7 @@ enum cxl_event_log_type {
CXL_EVENT_TYPE_WARN,
CXL_EVENT_TYPE_FAIL,
CXL_EVENT_TYPE_FATAL,
+ CXL_EVENT_TYPE_DCD,
CXL_EVENT_TYPE_MAX
};
diff --git a/include/cxl/event.h b/include/cxl/event.h
index ff97fea718d2..fa3cd895f656 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -6,6 +6,7 @@
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/workqueue_types.h>
+#include <linux/list.h>
/*
* Common Event Record Format
@@ -141,12 +142,49 @@ struct cxl_event_mem_sparing {
u8 reserved2[0x25];
} __packed;
+/*
+ * Dynamic Capacity extent as embedded in a DC event record
+ * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-51
+ *
+ * start_dpa, length: little-endian start DPA and length of the extent
+ * uuid: raw UUID bytes; presumably the allocation tag - TODO confirm
+ * shared_extn_seq: little-endian sequence number stamped by the device
+ * (not read by the code in this patch)
+ */
+struct cxl_extent {
+ __le64 start_dpa;
+ __le64 length;
+ u8 uuid[UUID_SIZE];
+ __le16 shared_extn_seq;
+ u8 reserved[0x6];
+} __packed;
+
+/*
+ * List wrapper around a struct cxl_extent, used to queue extents for a
+ * DC response and to stage the extents of an in-progress add chain.
+ *
+ * extent: the wrapped extent; delete_extent_node() kfree()s it together
+ * with the node
+ * list: link into the owning list
+ * rid: not referenced by the code in this patch - NOTE(review): purpose
+ * to be confirmed against later commits
+ */
+struct cxl_extent_list_node {
+ struct cxl_extent *extent;
+ struct list_head list;
+ int rid;
+};
+
+/*
+ * Dynamic Capacity Event Record
+ * CXL rev 3.1 section 8.2.9.2.1.6; Table 8-50
+ *
+ * event_type: one of enum dc_event; selects the add / release handling
+ * flags: CXL_DCD_EVENT_MORE is set while further records of the same
+ * chain follow; the record that clears it closes the chain
+ * extent: the extent this record is about
+ *
+ * The remaining fields are not interpreted by the code in this patch.
+ */
+/* Bit in cxl_event_dcd.flags and in the DC response flags byte */
+#define CXL_DCD_EVENT_MORE BIT(0)
+struct cxl_event_dcd {
+ struct cxl_event_record_hdr hdr;
+ u8 event_type;
+ u8 validity_flags;
+ __le16 host_id;
+ u8 partition_index;
+ u8 flags;
+ u8 reserved1[0x2];
+ struct cxl_extent extent;
+ u8 reserved2[0x18];
+ __le32 num_avail_extents;
+ __le32 num_avail_tags;
+} __packed;
+
union cxl_event {
struct cxl_event_generic generic;
struct cxl_event_gen_media gen_media;
struct cxl_event_dram dram;
struct cxl_event_mem_module mem_module;
struct cxl_event_mem_sparing mem_sparing;
+ struct cxl_event_dcd dcd;
/* dram & gen_media event header */
struct cxl_event_media_hdr media_hdr;
} __packed;
--
2.43.0