Jonathan Cameron wrote:
> On Fri, 16 Aug 2024 09:44:26 -0500
> ira.we...@intel.com wrote:
> 
> > From: Navneet Singh <navneet.si...@intel.com>
> > 

[snip]

> > +static int match_contains(struct device *dev, void *data)
> > +{
> > +   struct region_extent *region_extent = to_region_extent(dev);
> > +   struct match_data *md = data;
> > +   struct cxled_extent *entry;
> > +   unsigned long index;
> > +
> > +   if (!region_extent)
> > +           return 0;
> > +
> > +   xa_for_each(&region_extent->decoder_extents, index, entry) {
> > +           if (md->cxled == entry->cxled &&
> > +               range_contains(&entry->dpa_range, md->new_range))
> > +                   return true;
> As below, this returns int, so shouldn't be true or false.

Yep.  Thanks.

> 
> > +   }
> > +   return false;
> > +}
> 
> > +static int match_overlaps(struct device *dev, void *data)
> > +{
> > +   struct region_extent *region_extent = to_region_extent(dev);
> > +   struct match_data *md = data;
> > +   struct cxled_extent *entry;
> > +   unsigned long index;
> > +
> > +   if (!region_extent)
> > +           return 0;
> > +
> > +   xa_for_each(&region_extent->decoder_extents, index, entry) {
> > +           if (md->cxled == entry->cxled &&
> > +               range_overlaps(&entry->dpa_range, md->new_range))
> > +                   return true;
> 
> returns int, so returning true or false is odd.

Yep.

> 
> > +   }
> > +
> > +   return false;
> > +}
> 
> 
> > +int cxl_rm_extent(struct cxl_memdev_state *mds, struct cxl_extent *extent)
> > +{
> > +   u64 start_dpa = le64_to_cpu(extent->start_dpa);
> > +   struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
> > +   struct cxl_endpoint_decoder *cxled;
> > +   struct range hpa_range, dpa_range;
> > +   struct cxl_region *cxlr;
> > +
> > +   dpa_range = (struct range) {
> > +           .start = start_dpa,
> > +           .end = start_dpa + le64_to_cpu(extent->length) - 1,
> > +   };
> > +
> > +   guard(rwsem_read)(&cxl_region_rwsem);
> > +   cxlr = cxl_dpa_to_region(cxlmd, start_dpa, &cxled);
> > +   if (!cxlr) {
> > +           memdev_release_extent(mds, &dpa_range);
> 
> How does this condition happen?  Perhaps a comment needed.

Fair enough.  Proposed comment.

        /*
         * Having no region here can happen for a few reasons:
         *
         * 1) Extents were accepted and the host crashed/rebooted
         *    leaving them in an accepted state.  On reboot the host
         *    has not yet created a region to own them.
         *
         * 2) Region destruction won the race with the device releasing
         *    all the extents.  Here the release will be a duplicate of
         *    the one sent via region destruction.
         *
         * 3) The device is confused and releasing extents for which no
         *    region ever existed.
         *
         * In all these cases make sure the device knows we are not
         * using this extent.
         */

Item 2 is AFAICS ok with the spec.

> 
> > +           return -ENXIO;
> > +   }
> > +
> > +   calc_hpa_range(cxled, cxlr->cxlr_dax, &dpa_range, &hpa_range);
> > +
> > +   /* Remove region extents which overlap */
> > +   return device_for_each_child(&cxlr->cxlr_dax->dev, &hpa_range,
> > +                                cxlr_rm_extent);
> > +}
> > +
> > +static int cxlr_add_extent(struct cxl_dax_region *cxlr_dax,
> > +                      struct cxl_endpoint_decoder *cxled,
> > +                      struct cxled_extent *ed_extent)
> > +{
> > +   struct region_extent *region_extent;
> > +   struct range hpa_range;
> > +   int rc;
> > +
> > +   calc_hpa_range(cxled, cxlr_dax, &ed_extent->dpa_range, &hpa_range);
> > +
> > +   region_extent = alloc_region_extent(cxlr_dax, &hpa_range, 
> > ed_extent->tag);
> > +   if (IS_ERR(region_extent))
> > +           return PTR_ERR(region_extent);
> > +
> > +   rc = xa_insert(&region_extent->decoder_extents, (unsigned 
> > long)ed_extent, ed_extent,
> 
> I'd wrap that earlier to keep the line a bit shorter.

Done.

> 
> > +                  GFP_KERNEL);
> > +   if (rc) {
> > +           free_region_extent(region_extent);
> > +           return rc;
> > +   }
> > +
> > +   /* device model handles freeing region_extent */
> > +   return online_region_extent(region_extent);
> > +}
> > +
> > +/* Callers are expected to ensure cxled has been attached to a region */
> > +int cxl_add_extent(struct cxl_memdev_state *mds, struct cxl_extent *extent)
> > +{
> > +   u64 start_dpa = le64_to_cpu(extent->start_dpa);
> > +   struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
> > +   struct cxl_endpoint_decoder *cxled;
> > +   struct range ed_range, ext_range;
> > +   struct cxl_dax_region *cxlr_dax;
> > +   struct cxled_extent *ed_extent;
> > +   struct cxl_region *cxlr;
> > +   struct device *dev;
> > +
> > +   ext_range = (struct range) {
> > +           .start = start_dpa,
> > +           .end = start_dpa + le64_to_cpu(extent->length) - 1,
> > +   };
> > +
> > +   guard(rwsem_read)(&cxl_region_rwsem);
> > +   cxlr = cxl_dpa_to_region(cxlmd, start_dpa, &cxled);
> > +   if (!cxlr)
> > +           return -ENXIO;
> > +
> > +   cxlr_dax = cxled->cxld.region->cxlr_dax;
> > +   dev = &cxled->cxld.dev;
> > +   ed_range = (struct range) {
> > +           .start = cxled->dpa_res->start,
> > +           .end = cxled->dpa_res->end,
> > +   };
> > +
> > +   dev_dbg(&cxled->cxld.dev, "Checking ED (%pr) for extent %par\n",
> > +           cxled->dpa_res, &ext_range);
> > +
> > +   if (!range_contains(&ed_range, &ext_range)) {
> > +           dev_err_ratelimited(dev,
> > +                               "DC extent DPA %par (%*phC) is not fully in 
> > ED %par\n",
> > +                               &ext_range.start, CXL_EXTENT_TAG_LEN,
> > +                               extent->tag, &ed_range);
> > +           return -ENXIO;
> > +   }
> > +
> > +   if (extents_contain(cxlr_dax, cxled, &ext_range))
> 
> This case confuses me. If the extents are already there I think we should
> error out or at least print something as that's very wrong.

I thought we discussed this in one of the community meetings that it would be
ok to accept these.  We could certainly print a warning here.

In all honesty I'm wondering if these restrictions are really needed anymore.
But at the same time I really, really, really don't think anyone has a good use
case to have to support these cases.  So I'm keeping the code simple for now.

> 
> > +           return 0;
> > +
> > +   if (extents_overlap(cxlr_dax, cxled, &ext_range))
> > +           return -ENXIO;
> > +
> > +   ed_extent = kzalloc(sizeof(*ed_extent), GFP_KERNEL);
> > +   if (!ed_extent)
> > +           return -ENOMEM;
> > +
> > +   ed_extent->cxled = cxled;
> > +   ed_extent->dpa_range = ext_range;
> > +   memcpy(ed_extent->tag, extent->tag, CXL_EXTENT_TAG_LEN);
> > +
> > +   dev_dbg(dev, "Add extent %par (%*phC)\n", &ed_extent->dpa_range,
> > +           CXL_EXTENT_TAG_LEN, ed_extent->tag);
> > +
> > +   return cxlr_add_extent(cxlr_dax, cxled, ed_extent);
> > +}
> > diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> > index 01a447aaa1b1..f629ad7488ac 100644
> > --- a/drivers/cxl/core/mbox.c
> > +++ b/drivers/cxl/core/mbox.c
> > @@ -882,6 +882,48 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
> >  
> > +static int cxl_validate_extent(struct cxl_memdev_state *mds,
> > +                          struct cxl_extent *extent)
> > +{
> > +   u64 start = le64_to_cpu(extent->start_dpa);
> > +   u64 length = le64_to_cpu(extent->length);
> > +   struct device *dev = mds->cxlds.dev;
> > +
> > +   struct range ext_range = (struct range){
> > +           .start = start,
> > +           .end = start + length - 1,
> > +   };
> > +
> > +   if (le16_to_cpu(extent->shared_extn_seq) != 0) {
> 
> That's not the 'main' way to tell if an extent is shared because
> we could have a single extent (so seq == 0).
> Should verify it's not in a DCD region that
> is shareable to make this decision.

Ah...  :-/

> 
> I've lost track on the region handling so maybe you already do
> this by not including those regions at all?

I don't think so.

I'll add the region check.  I see now why I glossed over this though.  The
shared nature of a DCD partition is defined in the DSMAS.

Is that correct?  Or am I missing something in the spec?

> 
> > +           dev_err_ratelimited(dev,
> > +                               "DC extent DPA %par (%*phC) can not be 
> > shared\n",
> > +                               &ext_range.start, CXL_EXTENT_TAG_LEN,
> > +                               extent->tag);
> > +           return -ENXIO;
> > +   }
> > +
> > +   /* Extents must not cross DC region boundary's */
> > +   for (int i = 0; i < mds->nr_dc_region; i++) {
> > +           struct cxl_dc_region_info *dcr = &mds->dc_region[i];
> > +           struct range region_range = (struct range) {
> > +                   .start = dcr->base,
> > +                   .end = dcr->base + dcr->decode_len - 1,
> > +           };
> > +
> > +           if (range_contains(&region_range, &ext_range)) {
> > +                   dev_dbg(dev, "DC extent DPA %par 
> > (DCR:%d:%#llx)(%*phC)\n",
> > +                           &ext_range, i, start - dcr->base,
> > +                           CXL_EXTENT_TAG_LEN, extent->tag);
> > +                   return 0;
> > +           }
> > +   }
> > +
> > +   dev_err_ratelimited(dev,
> > +                       "DC extent DPA %par (%*phC) is not in any DC 
> > region\n",
> > +                       &ext_range, CXL_EXTENT_TAG_LEN, extent->tag);
> > +   return -ENXIO;
> > +}
> > +
> >  void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> >                         enum cxl_event_log_type type,
> >                         enum cxl_event_type event_type,
> > @@ -1009,6 +1051,207 @@ static int cxl_clear_event_record(struct 
> > cxl_memdev_state *mds,
> >     return rc;
> >  }
> >  
> > +static int cxl_send_dc_response(struct cxl_memdev_state *mds, int opcode,
> > +                           struct xarray *extent_array, int cnt)
> > +{
> > +   struct cxl_mbox_dc_response *p;
> > +   struct cxl_mbox_cmd mbox_cmd;
> > +   struct cxl_extent *extent;
> > +   unsigned long index;
> > +   u32 pl_index;
> > +   int rc = 0;
> > +
> > +   size_t pl_size = struct_size(p, extent_list, cnt);
> > +   u32 max_extents = cnt;
> > +
> What is cnt is zero? All extents rejected so none in the
> extent_array. Need to send a zero extent response to reject
> them all IIRC.

yes.  I missed that thanks.

> 
> > +   /* May have to use more bit on response. */
> > +   if (pl_size > mds->payload_size) {
> > +           max_extents = (mds->payload_size - sizeof(*p)) /
> > +                         sizeof(struct updated_extent_list);
> > +           pl_size = struct_size(p, extent_list, max_extents);
> > +   }
> > +
> > +   struct cxl_mbox_dc_response *response __free(kfree) =
> > +                                           kzalloc(pl_size, GFP_KERNEL);
> > +   if (!response)
> > +           return -ENOMEM;
> > +
> > +   pl_index = 0;
> > +   xa_for_each(extent_array, index, extent) {
> > +
> > +           response->extent_list[pl_index].dpa_start = extent->start_dpa;
> > +           response->extent_list[pl_index].length = extent->length;
> > +           pl_index++;
> > +           response->extent_list_size = cpu_to_le32(pl_index);
> > +
> > +           if (pl_index == max_extents) {
> > +                   mbox_cmd = (struct cxl_mbox_cmd) {
> > +                           .opcode = opcode,
> > +                           .size_in = struct_size(response, extent_list,
> > +                                                  pl_index),
> > +                           .payload_in = response,
> > +                   };
> > +
> > +                   response->flags = 0;
> > +                   if (pl_index < cnt)
> > +                           response->flags &= CXL_DCD_EVENT_MORE;
> > +
> > +                   rc = cxl_internal_send_cmd(mds, &mbox_cmd);
> > +                   if (rc)
> > +                           return rc;
> > +                   pl_index = 0;
> > +           }
> > +   }
> > +
> > +   if (pl_index) {
> || !cnt 
> 
> I think so we send a nothing accepted message.

Yep.

> 
> > +           mbox_cmd = (struct cxl_mbox_cmd) {
> > +                   .opcode = opcode,
> > +                   .size_in = struct_size(response, extent_list,
> > +                                          pl_index),
> > +                   .payload_in = response,
> > +           };
> > +
> > +           response->flags = 0;
> > +           rc = cxl_internal_send_cmd(mds, &mbox_cmd);
>               if (rc)
>                       return rc;
> > +   }
> > +
> 
> return 0;  So that reader doesn't have to check what rc was in !pl_index
> case and avoids assigning rc right at the top.

Ah thanks.  That might have been left over from something previous.

> 
> 
> > +   return rc;
> > +}
> 
> 
> > +static int cxl_add_pending(struct cxl_memdev_state *mds)
> > +{
> > +   struct device *dev = mds->cxlds.dev;
> > +   struct cxl_extent *extent;
> > +   unsigned long index;
> > +   unsigned long cnt = 0;
> > +   int rc;
> > +
> > +   xa_for_each(&mds->pending_extents, index, extent) {
> > +           if (validate_add_extent(mds, extent)) {
> 
> 
> Add a comment here that not accepting an extent but
> accepting some or none means this one was rejected (I'd forgotten how
> that bit worked)

Ok yeah that may not be clear without reading the spec closely.

        /*
         * Any extents which are to be rejected are omitted from
         * the response.  An empty response means all are
         * rejected.
         */

> 
> > +                   dev_dbg(dev, "unconsumed DC extent DPA:%#llx 
> > LEN:%#llx\n",
> > +                           le64_to_cpu(extent->start_dpa),
> > +                           le64_to_cpu(extent->length));
> > +                   xa_erase(&mds->pending_extents, index);
> > +                   kfree(extent);
> > +                   continue;
> > +           }
> > +           cnt++;
> > +   }
> > +   rc = cxl_send_dc_response(mds, CXL_MBOX_OP_ADD_DC_RESPONSE,
> > +                             &mds->pending_extents, cnt);
> > +   xa_for_each(&mds->pending_extents, index, extent) {
> > +           xa_erase(&mds->pending_extents, index);
> > +           kfree(extent);
> > +   }
> > +   return rc;
> > +}
> > +
> > +static int handle_add_event(struct cxl_memdev_state *mds,
> > +                       struct cxl_event_dcd *event)
> > +{
> > +   struct cxl_extent *tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
> > +   struct device *dev = mds->cxlds.dev;
> > +
> > +   if (!tmp)
> > +           return -ENOMEM;
> > +
> > +   memcpy(tmp, &event->extent, sizeof(*tmp));
> 
> kmemdup?

yep.

> 
> > +   if (xa_insert(&mds->pending_extents, (unsigned long)tmp, tmp,
> > +                 GFP_KERNEL)) {
> > +           kfree(tmp);
> > +           return -ENOMEM;
> > +   }
> > +
> > +   if (event->flags & CXL_DCD_EVENT_MORE) {
> > +           dev_dbg(dev, "more bit set; delay the surfacing of extent\n");
> > +           return 0;
> > +   }
> > +
> > +   /* extents are removed and free'ed in cxl_add_pending() */
> > +   return cxl_add_pending(mds);
> > +}
> 
> >  static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
> >                                 enum cxl_event_log_type type)
> >  {
> > @@ -1044,9 +1287,17 @@ static void cxl_mem_get_records_log(struct 
> > cxl_memdev_state *mds,
> >             if (!nr_rec)
> >                     break;
> >  
> > -           for (i = 0; i < nr_rec; i++)
> > +           for (i = 0; i < nr_rec; i++) {
> >                     __cxl_event_trace_record(cxlmd, type,
> >                                              &payload->records[i]);
> > +                   if (type == CXL_EVENT_TYPE_DCD) {
> Bit of a deep indent so maybe flip logic?
> 
> Logic wise it's a bit dubious as we might want to match other
> types in future though so up to you.

I was thinking more along these lines.  But the rc is unneeded.  That print
can be in the handle function.


Something like this:

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 88b823afe482..e86a483d80eb 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1231,16 +1231,17 @@ static char *cxl_dcd_evt_type_str(u8 type)
        return "<unknown>";
 }

-static int cxl_handle_dcd_event_records(struct cxl_memdev_state *mds,
+static void cxl_handle_dcd_event_records(struct cxl_memdev_state *mds,
                                        struct cxl_event_record_raw *raw_rec)
 {
        struct cxl_event_dcd *event = &raw_rec->event.dcd;
        struct cxl_extent *extent = &event->extent;
        struct device *dev = mds->cxlds.dev;
        uuid_t *id = &raw_rec->id;
+       int rc;

        if (!uuid_equal(id, &CXL_EVENT_DC_EVENT_UUID))
-               return -EINVAL;
+               return;

        dev_dbg(dev, "DCD event %s : DPA:%#llx LEN:%#llx\n",
                cxl_dcd_evt_type_str(event->event_type),
@@ -1248,15 +1249,22 @@ static int cxl_handle_dcd_event_records(struct 
cxl_memdev_state *mds,

        switch (event->event_type) {
        case DCD_ADD_CAPACITY:
-               return handle_add_event(mds, event);
+               rc = handle_add_event(mds, event);
+               break;
        case DCD_RELEASE_CAPACITY:
-               return cxl_rm_extent(mds, &event->extent);
+               rc = cxl_rm_extent(mds, &event->extent);
+               break;
        case DCD_FORCED_CAPACITY_RELEASE:
                dev_err_ratelimited(dev, "Forced release event ignored.\n");
-               return 0;
+               rc = 0;
+               break;
        default:
-               return -EINVAL;
+               rc = -EINVAL;
+               break;
        }
+
+       if (rc)
+               dev_err_ratelimited(dev, "dcd event failed: %d\n", rc);
 }

 static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
@@ -1297,13 +1305,9 @@ static void cxl_mem_get_records_log(struct 
cxl_memdev_state *mds,
                for (i = 0; i < nr_rec; i++) {
                        __cxl_event_trace_record(cxlmd, type,
                                                 &payload->records[i]);
-                       if (type == CXL_EVENT_TYPE_DCD) {
-                               rc = cxl_handle_dcd_event_records(mds,
-                                                                 
&payload->records[i]);
-                               if (rc)
-                                       dev_err_ratelimited(dev, "dcd event 
failed: %d\n",
-                                                           rc);
-                       }
+                       if (type == CXL_EVENT_TYPE_DCD)
+                               cxl_handle_dcd_event_records(mds,
+                                                       &payload->records[i]);
                }

                if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
<end diff>

> 
>                       if (type != CXL_EVENT_TYPE_DCD)
>                               continue;
> 
>                       rc = 
> 
> > +                           rc = cxl_handle_dcd_event_records(mds,
> > +                                                             
> > &payload->records[i]);
> > +                           if (rc)
> > +                                   dev_err_ratelimited(dev, "dcd event 
> > failed: %d\n",
> > +                                                       rc);
> > +                   }
> > +           }
> >  
> 
> >  struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
> >  {
> >     struct cxl_memdev_state *mds;
> > @@ -1628,6 +1892,8 @@ struct cxl_memdev_state 
> > *cxl_memdev_state_create(struct device *dev)
> >     mds->cxlds.type = CXL_DEVTYPE_CLASSMEM;
> >     mds->ram_perf.qos_class = CXL_QOS_CLASS_INVALID;
> >     mds->pmem_perf.qos_class = CXL_QOS_CLASS_INVALID;
> > +   xa_init(&mds->pending_extents);
> > +   devm_add_action_or_reset(dev, clear_pending_extents, mds);
> 
> Why don't you need to check if this failed? Definitely seems unlikely
> to leave things in a good state. Unlikely to fail of course, but you never 
> know.

yea good catch.

> 
> >  
> >     return mds;
> >  }
> 
> > @@ -3090,6 +3091,8 @@ static struct cxl_dax_region 
> > *cxl_dax_region_alloc(struct cxl_region *cxlr)
> >  
> >     dev = &cxlr_dax->dev;
> >     cxlr_dax->cxlr = cxlr;
> > +   cxlr->cxlr_dax = cxlr_dax;
> > +   ida_init(&cxlr_dax->extent_ida);
> >     device_initialize(dev);
> >     lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
> >     device_set_pm_not_required(dev);
> > @@ -3190,7 +3193,10 @@ static int devm_cxl_add_pmem_region(struct 
> > cxl_region *cxlr)
> >  static void cxlr_dax_unregister(void *_cxlr_dax)
> >  {
> >     struct cxl_dax_region *cxlr_dax = _cxlr_dax;
> > +   struct cxl_region *cxlr = cxlr_dax->cxlr;
> >  
> > +   cxlr->cxlr_dax = NULL;
> > +   cxlr_dax->cxlr = NULL;
> 
> cxlr_dax->cxlr was assigned before this patch. 
> 
> I'm not seeing any new checks on these being non null so why
> are the needed?  If there is a good reason for this then
> a comment would be useful.

I'm not sure anymore either.  Perhaps this was left over from an earlier
version.  Or was something I thought I would need that ended up getting
removed.  I'll test without this hunk and remove it if I can.

Thanks for the review,
Ira

[snip]

Reply via email to