Re: [PATCH v3 17/25] cxl/core: Return endpoint decoder information from region search

2024-08-19 Thread Dave Jiang



On 8/16/24 7:44 AM, Ira Weiny wrote:
> cxl_dpa_to_region() finds the region from a  tuple.
> The search involves finding the device endpoint decoder as well.
> 
> Dynamic capacity extent processing uses the endpoint decoder HPA
> information to calculate the HPA offset.  In addition, well behaved
> extents should be contained within an endpoint decoder.
> 
> Return the endpoint decoder found to be used in subsequent DCD code.
> 
> Signed-off-by: Ira Weiny 

Reviewed-by: Dave Jiang 
> ---
>  drivers/cxl/core/core.h   | 6 --
>  drivers/cxl/core/mbox.c   | 2 +-
>  drivers/cxl/core/memdev.c | 4 ++--
>  drivers/cxl/core/region.c | 8 +++-
>  4 files changed, 14 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 15b6cf1c19ef..76c4153a9b2c 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -39,7 +39,8 @@ void cxl_decoder_kill_region(struct cxl_endpoint_decoder 
> *cxled);
>  int cxl_region_init(void);
>  void cxl_region_exit(void);
>  int cxl_get_poison_by_endpoint(struct cxl_port *port);
> -struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 
> dpa);
> +struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa,
> +  struct cxl_endpoint_decoder **cxled);
>  u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
>  u64 dpa);
>  
> @@ -50,7 +51,8 @@ static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr,
>   return ULLONG_MAX;
>  }
>  static inline
> -struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa)
> +struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa,
> +  struct cxl_endpoint_decoder **cxled)
>  {
>   return NULL;
>  }
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 68c26c4be91a..01a447aaa1b1 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -909,7 +909,7 @@ void cxl_event_trace_record(const struct cxl_memdev 
> *cxlmd,
>   guard(rwsem_read)(&cxl_dpa_rwsem);
>  
>   dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK;
> - cxlr = cxl_dpa_to_region(cxlmd, dpa);
> + cxlr = cxl_dpa_to_region(cxlmd, dpa, NULL);
>   if (cxlr)
>   hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa);
>  
> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index 7da1f0f5711a..12fb07fb89a6 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -323,7 +323,7 @@ int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa)
>   if (rc)
>   goto out;
>  
> - cxlr = cxl_dpa_to_region(cxlmd, dpa);
> + cxlr = cxl_dpa_to_region(cxlmd, dpa, NULL);
>   if (cxlr)
>   dev_warn_once(mds->cxlds.dev,
> "poison inject dpa:%#llx region: %s\n", dpa,
> @@ -387,7 +387,7 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
>   if (rc)
>   goto out;
>  
> - cxlr = cxl_dpa_to_region(cxlmd, dpa);
> + cxlr = cxl_dpa_to_region(cxlmd, dpa, NULL);
>   if (cxlr)
>   dev_warn_once(mds->cxlds.dev,
> "poison clear dpa:%#llx region: %s\n", dpa,
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 35c4a1f4f9bd..8e0884b52f84 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -2828,6 +2828,7 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port)
>  struct cxl_dpa_to_region_context {
>   struct cxl_region *cxlr;
>   u64 dpa;
> + struct cxl_endpoint_decoder *cxled;
>  };
>  
>  static int __cxl_dpa_to_region(struct device *dev, void *arg)
> @@ -2861,11 +2862,13 @@ static int __cxl_dpa_to_region(struct device *dev, 
> void *arg)
>   dev_name(dev));
>  
>   ctx->cxlr = cxlr;
> + ctx->cxled = cxled;
>  
>   return 1;
>  }
>  
> -struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa)
> +struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa,
> +  struct cxl_endpoint_decoder **cxled)
>  {
>   struct cxl_dpa_to_region_context ctx;
>   struct cxl_port *port;
> @@ -2877,6 +2880,9 @@ struct cxl_region *cxl_dpa_to_region(const struct 
> cxl_memdev *cxlmd, u64 dpa)
>   if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
>   device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
>  
> + if (cxled)
> + *cxled = ctx.cxled;
> +
>   return ctx.cxlr;
>  }
>  
> 



Re: [PATCH v3 18/25] cxl/extent: Process DCD events and realize region extents

2024-08-19 Thread Dave Jiang



On 8/16/24 7:44 AM, ira.we...@intel.com wrote:
> From: Navneet Singh 
> 
> A dynamic capacity device (DCD) sends events to signal the host for
> changes in the availability of Dynamic Capacity (DC) memory.  These
> events contain extents describing a DPA range and meta data for memory
> to be added or removed.  Events may be sent from the device at any time.
> 
> Three types of events can be signaled, Add, Release, and Force Release.
> 
> On add, the host may accept or reject the memory being offered.  If no
> region exists, or the extent is invalid, the extent should be rejected.
> Add extent events may be grouped by a 'more' bit which indicates those
> extents should be processed as a group.
> 
> On remove, the host can delay the response until the host is safely not
> using the memory.  If no region exists the release can be sent
> immediately.  The host may also release extents (or partial extents) at
> any time.  Thus the 'more' bit grouping of release events is of less
> value and can be ignored in favor of sending multiple release capacity
> responses for groups of release events.
> 
> Force removal is intended as a mechanism between the FM and the device
> and intended only when the host is unresponsive, out of sync, or
> otherwise broken.  Purposely ignore force removal events.
> 
> Regions are made up of one or more devices which may be surfacing memory
> to the host.  Once all devices in a region have surfaced an extent the
> region can expose a corresponding extent for the user to consume.
> Without interleaving a device extent forms a 1:1 relationship with the
> region extent.  Immediately surface a region extent upon getting a
> device extent.
> 
> Per the specification the device is allowed to offer or remove extents
> at any time.  However, anticipated use cases can expect extents to be
> offered, accepted, and removed in well defined chunks.
> 
> Simplify extent tracking with the following restrictions.
> 
>   1) Flag for removal any extent which overlaps a requested
>  release range.
>   2) Refuse the offer of extents which overlap already accepted
>  memory ranges.
>   3) Accept again a range which has already been accepted by the
>  host.  (It is likely the device has an error because it
>  should already know that this range was accepted.  But from
>  the host point of view it is safe to acknowledge that
>  acceptance again.)
> 
> Management of the region extent devices must be synchronized with
> potential uses of the memory within the DAX layer.  Create region extent
> devices as children of the cxl_dax_region device such that the DAX
> region driver can co-drive them and synchronize with the DAX layer.
> Synchronization and management is handled in a subsequent patch.
> 
> Process DCD events and create region devices.
> 
> Signed-off-by: Navneet Singh 
> Co-developed-by: Ira Weiny 
> Signed-off-by: Ira Weiny 

A few nits below, but in general
Reviewed-by: Dave Jiang 

> 
> ---
> Changes:
> [iweiny: combine this with the extent surface patches to better show the
>  lifetime extent objects in review]
> [iweiny: clean up commit message.]
> [iweiny: move extent verification of the 'read extents on region
>  creation' to this patch]
> [iweiny: Provide for a common path for extent realization between an add
>event and adding existing extents.]
> [iweiny: Persist a check that an extent is within an endpoint decoder]
> [iweiny: reduce exported and non-static calls]
> [iweiny: use %par]
> 
>   
> 
> [Jonathan: implement the more bit with a simple algorithm which accepts
>  all extents it can.
>  Also include the response more bit to prevent payload
>  overflow]
> [Fan: Do not error if a contained extent is added.]
> [Jonathan: allocate ida after kzalloc]
> [iweiny: fix ida resource leak]
> [fan/djiang: remove unneeded memset]
> [djiang: fix indentation]
> [Jonathan: Fix indentation]
> [Jonathan/djbw: make tag a uuid]
> [djbw: create helper calc_hpa_range() straight away]
> [djbw: Allow for multiple cxled_extents per region_extent]
> [djbw: s/cxl_ed/cxled]
> [djbw: s/cxl_release_ed_extent/cxled_release_extent/]
> [djbw: s/reg_ext/region_extent/]
> [djbw: s/dc_extent/extent/]
> [Gregory/djbw: reject shared extents]
> [iweiny: predicate extent.c compile on CONFIG_CXL_REGION]
> ---
>  drivers/cxl/core/Makefile |   2 +-
>  drivers/cxl/core/core.h   |  13 ++
>  drivers/cxl/core/extent.c | 345 
> ++
>  drivers/cxl/core/mbox.c   | 268 ++-
>  drivers/cxl/core/region.c |   6 +
>  drivers/cxl/cxl.h |  52 ++-
>  drivers/cxl/cxlmem.h  |  26 
>  include/linux/cxl-event.h |  32 +
>  tools/testing/cxl/Kbuild  |   3 +-
>  9 files changed, 743 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
> index 9259bcc6773c..3b812515e725 100644
> --- a/drive

Re: [PATCH v3 19/25] cxl/region/extent: Expose region extent information in sysfs

2024-08-19 Thread Dave Jiang



On 8/16/24 7:44 AM, ira.we...@intel.com wrote:
> From: Navneet Singh 
> 
> Extent information can be helpful to the user to coordinate memory usage
> with the external orchestrator and FM.
> 
> Expose the details of region extents by creating the following
> sysfs entries.
> 
> /sys/bus/cxl/devices/dax_regionX/extentX.Y
> /sys/bus/cxl/devices/dax_regionX/extentX.Y/offset
> /sys/bus/cxl/devices/dax_regionX/extentX.Y/length
> /sys/bus/cxl/devices/dax_regionX/extentX.Y/tag
> 
> Signed-off-by: Navneet Singh 
> Co-developed-by: Ira Weiny 
> Signed-off-by: Ira Weiny 
> 
> ---
> Changes:
> [iweiny: split this out]
> [Jonathan: add documentation for extent sysfs]
> [Jonathan/djbw: s/label/tag]
> [Jonathan/djbw: treat tag as uuid]
> [djbw: use __ATTRIBUTE_GROUPS]
> [djbw: make tag invisible if it is empty]
> [djbw/iweiny: use conventional id names for extents; extentX.Y]
> ---
>  Documentation/ABI/testing/sysfs-bus-cxl | 13 
>  drivers/cxl/core/extent.c   | 58 
> +
>  2 files changed, 71 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl 
> b/Documentation/ABI/testing/sysfs-bus-cxl
> index 3a5ee88e551b..e97e6a73c960 100644
> --- a/Documentation/ABI/testing/sysfs-bus-cxl
> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> @@ -599,3 +599,16 @@ Description:
>   See Documentation/ABI/stable/sysfs-devices-node. access0 
> provides
>   the number to the closest initiator and access1 provides the
>   number to the closest CPU.
> +
> +What:/sys/bus/cxl/devices/dax_regionX/extentX.Y/offset
> + /sys/bus/cxl/devices/dax_regionX/extentX.Y/length
> + /sys/bus/cxl/devices/dax_regionX/extentX.Y/tag

I wonder consider an entry for each with their own descriptions, which seems to 
be the standard practice.

DJ

> +Date:October, 2024
> +KernelVersion:   v6.12
> +Contact: linux-...@vger.kernel.org
> +Description:
> + (RO) [For Dynamic Capacity regions only]  Extent offset and
> + length within the region.  Users can use the extent information
> + to create DAX devices on specific extents.  This is done by
> + creating and destroying DAX devices in specific sequences and
> + looking at the mappings created.
> diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
> index 34456594cdc3..d7d526a51e2b 100644
> --- a/drivers/cxl/core/extent.c
> +++ b/drivers/cxl/core/extent.c
> @@ -6,6 +6,63 @@
>  
>  #include "core.h"
>  
> +static ssize_t offset_show(struct device *dev, struct device_attribute *attr,
> +char *buf)
> +{
> + struct region_extent *region_extent = to_region_extent(dev);
> +
> + return sysfs_emit(buf, "%#llx\n", region_extent->hpa_range.start);
> +}
> +static DEVICE_ATTR_RO(offset);
> +
> +static ssize_t length_show(struct device *dev, struct device_attribute *attr,
> +char *buf)
> +{
> + struct region_extent *region_extent = to_region_extent(dev);
> + u64 length = range_len(®ion_extent->hpa_range);
> +
> + return sysfs_emit(buf, "%#llx\n", length);
> +}
> +static DEVICE_ATTR_RO(length);
> +
> +static ssize_t tag_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + struct region_extent *region_extent = to_region_extent(dev);
> +
> + return sysfs_emit(buf, "%pUb\n", ®ion_extent->tag);
> +}
> +static DEVICE_ATTR_RO(tag);
> +
> +static struct attribute *region_extent_attrs[] = {
> + &dev_attr_offset.attr,
> + &dev_attr_length.attr,
> + &dev_attr_tag.attr,
> + NULL,
> +};
> +
> +static uuid_t empty_tag = { 0 };
> +
> +static umode_t region_extent_visible(struct kobject *kobj,
> +  struct attribute *a, int n)
> +{
> + struct device *dev = kobj_to_dev(kobj);
> + struct region_extent *region_extent = to_region_extent(dev);
> +
> + if (a == &dev_attr_tag.attr &&
> + uuid_equal(®ion_extent->tag, &empty_tag))
> + return 0;
> +
> + return a->mode;
> +}
> +
> +static const struct attribute_group region_extent_attribute_group = {
> + .attrs = region_extent_attrs,
> + .is_visible = region_extent_visible,
> +};
> +
> +__ATTRIBUTE_GROUPS(region_extent_attribute);
> +
>  static void cxled_release_extent(struct cxl_endpoint_decoder *cxled,
>struct cxled_extent *ed_extent)
>  {
> @@ -44,6 +101,7 @@ static void region_extent_release(struct device *dev)
>  static const struct device_type region_extent_type = {
>   .name = "extent",
>   .release = region_extent_release,
> + .groups = region_extent_attribute_groups,
>  };
>  
>  bool is_region_extent(struct device *dev)
> 



Re: [PATCH v3 20/25] dax/bus: Factor out dev dax resize logic

2024-08-19 Thread Dave Jiang



On 8/16/24 7:44 AM, Ira Weiny wrote:
> Dynamic Capacity regions must limit dev dax resources to those areas
> which have extents backing real memory.  Such DAX regions are dubbed
> 'sparse' regions.  In order to manage where memory is available four
> alternatives were considered:
> 
> 1) Create a single region resource child on region creation which
>reserves the entire region.  Then as extents are added punch holes in
>this reservation.  This requires new resource manipulation to punch
>the holes and still requires an additional iteration over the extent
>areas which may already have existing dev dax resources used.
> 
> 2) Maintain an ordered xarray of extents which can be queried while
>processing the resize logic.  The issue is that existing region->res
>children may artificially limit the allocation size sent to
>alloc_dev_dax_range().  IE the resource children can't be directly
>used in the resize logic to find where space in the region is.  This
>also poses a problem of managing the available size in 2 places.
> 
> 3) Maintain a separate resource tree with extents.  This option is the
>same as 2) but with the different data structure.  Most ideally there
>should be a unified representation of the resource tree not two places
>to look for space.
> 
> 4) Create region resource children for each extent.  Manage the dax dev
>resize logic in the same way as before but use a region child
>(extent) resource as the parents to find space within each extent.
> 
> Option 4 can leverage the existing resize algorithm to find space within
> the extents.  It manages the available space in a singular resource tree
> which is less complicated for finding space.
> 
> In preparation for this change, factor out the dev_dax_resize logic.
> For static regions use dax_region->res as the parent to find space for
> the dax ranges.  Future patches will use the same algorithm with
> individual extent resources as the parent.
> 
> Signed-off-by: Ira Weiny 
Reviewed-by: Dave Jiang 

> ---
> Changes:
> [iweiny: Rebase on new DAX region locking]
> [iweiny: Reword commit message]
> [iweiny: Drop reviews]
> ---
>  drivers/dax/bus.c | 129 
> +-
>  1 file changed, 79 insertions(+), 50 deletions(-)
> 
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index d8cb5195a227..975860371d9f 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -844,11 +844,9 @@ static int devm_register_dax_mapping(struct dev_dax 
> *dev_dax, int range_id)
>   return 0;
>  }
>  
> -static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start,
> - resource_size_t size)
> +static int alloc_dev_dax_range(struct resource *parent, struct dev_dax 
> *dev_dax,
> +u64 start, resource_size_t size)
>  {
> - struct dax_region *dax_region = dev_dax->region;
> - struct resource *res = &dax_region->res;
>   struct device *dev = &dev_dax->dev;
>   struct dev_dax_range *ranges;
>   unsigned long pgoff = 0;
> @@ -866,14 +864,14 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, 
> u64 start,
>   return 0;
>   }
>  
> - alloc = __request_region(res, start, size, dev_name(dev), 0);
> + alloc = __request_region(parent, start, size, dev_name(dev), 0);
>   if (!alloc)
>   return -ENOMEM;
>  
>   ranges = krealloc(dev_dax->ranges, sizeof(*ranges)
>   * (dev_dax->nr_range + 1), GFP_KERNEL);
>   if (!ranges) {
> - __release_region(res, alloc->start, resource_size(alloc));
> + __release_region(parent, alloc->start, resource_size(alloc));
>   return -ENOMEM;
>   }
>  
> @@ -1026,50 +1024,45 @@ static bool adjust_ok(struct dev_dax *dev_dax, struct 
> resource *res)
>   return true;
>  }
>  
> -static ssize_t dev_dax_resize(struct dax_region *dax_region,
> - struct dev_dax *dev_dax, resource_size_t size)
> +/**
> + * dev_dax_resize_static - Expand the device into the unused portion of the
> + * region. This may involve adjusting the end of an existing resource, or
> + * allocating a new resource.
> + *
> + * @parent: parent resource to allocate this range in
> + * @dev_dax: DAX device to be expanded
> + * @to_alloc: amount of space to alloc; must be <= space available in @parent
> + *
> + * Return the amount of space allocated or -ERRNO on failure
> + */
> +static ssize_t dev_dax_resize_static(struct resource *parent,
> +  struct dev_dax *dev_dax,
> +  resource_size_t to_alloc)
>  {
> - resource_size_t avail = dax_region_avail_size(dax_region), to_alloc;
> - resource_size_t dev_size = dev_dax_size(dev_dax);
> - struct resource *region_res = &dax_region->res;
> - struct device *dev = &dev_dax->dev;
>   struct resource *res, *first;
> - resource_size_t alloc = 0;
>   int rc;
>  

Re: [PATCH v3 21/25] dax/region: Create resources on sparse DAX regions

2024-08-19 Thread Dave Jiang



On 8/16/24 7:44 AM, ira.we...@intel.com wrote:
> From: Navneet Singh 
> 
> DAX regions which map dynamic capacity partitions require that memory be
> allowed to come and go.  Recall sparse regions were created for this
> purpose.  Now that extents can be realized within DAX regions the DAX
> region driver can start tracking sub-resource information.
> 
> The tight relationship between DAX region operations and extent
> operations require memory changes to be controlled synchronously with
> the user of the region.  Synchronize through the dax_region_rwsem and by
> having the region driver drive both the region device as well as the
> extent sub-devices.
> 
> Recall requests to remove extents can happen at any time and that a host
> is not obligated to release the memory until it is not being used.  If
> an extent is not used allow a release response.
> 
> The DAX layer has no need for the details of the CXL memory extent
> devices.  Expose extents to the DAX layer as device children of the DAX
> region device.  A single callback from the driver aids the DAX layer to
> determine if the child device is an extent.  The DAX layer also
> registers a devres function to automatically clean up when the device is
> removed from the region.
> 
> There is a race between extents being surfaced and the dax_cxl driver
> being loaded.  The driver must therefore scan for any existing extents
> while still under the device lock.
> 
> Respond to extent notifications.  Manage the DAX region resource tree
> based on the extents lifetime.  Return the status of remove
> notifications to lower layers such that it can manage the hardware
> appropriately.
> 
> Signed-off-by: Navneet Singh 
> Co-developed-by: Ira Weiny 
> Signed-off-by: Ira Weiny 
> 
> ---
> Changes:
> [iweiny: patch reorder]
> [iweiny: move hunks from other patches to clarify code changes and
>  add/release flows WRT dax regions]
> [iweiny: use %par]
> [iweiny: clean up variable names]
> [iweiny: Simplify sparse_ops]
> [Fan: avoid open coding range_len()]
> [djbw: s/reg_ext/region_extent]
> ---
>  drivers/cxl/core/extent.c |  76 +--
>  drivers/cxl/cxl.h |   6 ++
>  drivers/dax/bus.c | 243 
> +-
>  drivers/dax/bus.h |   3 +-
>  drivers/dax/cxl.c |  63 +++-
>  drivers/dax/dax-private.h |  34 +++
>  drivers/dax/hmem/hmem.c   |   2 +-
>  drivers/dax/pmem.c|   2 +-
>  8 files changed, 391 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
> index d7d526a51e2b..103b0bec3a4a 100644
> --- a/drivers/cxl/core/extent.c
> +++ b/drivers/cxl/core/extent.c
> @@ -271,20 +271,67 @@ static void calc_hpa_range(struct cxl_endpoint_decoder 
> *cxled,
>   hpa_range->end = hpa_range->start + range_len(dpa_range) - 1;
>  }
>  
> +static int cxlr_notify_extent(struct cxl_region *cxlr, enum dc_event event,
> +   struct region_extent *region_extent)
> +{
> + struct cxl_dax_region *cxlr_dax;
> + struct device *dev;
> + int rc = 0;
> +
> + cxlr_dax = cxlr->cxlr_dax;
> + dev = &cxlr_dax->dev;
> + dev_dbg(dev, "Trying notify: type %d HPA %par\n",
> + event, ®ion_extent->hpa_range);
> +
> + /*
> +  * NOTE the lack of a driver indicates a notification has failed.  No
> +  * user space coordiantion was possible.
> +  */
> + device_lock(dev);
> + if (dev->driver) {
> + struct cxl_driver *driver = to_cxl_drv(dev->driver);
> + struct cxl_notify_data notify_data = (struct cxl_notify_data) {
> + .event = event,
> + .region_extent = region_extent,
> + };
> +
> + if (driver->notify) {
> + dev_dbg(dev, "Notify: type %d HPA %par\n",
> + event, ®ion_extent->hpa_range);
> + rc = driver->notify(dev, ¬ify_data);
> + }
> + }
> + device_unlock(dev);

Maybe a cleaner version:
guard(device)(dev);
if (!dev->driver || !dev->driver->notify)
return 0;

dev_dbg(...);
return driver->notify(dev, ¬ify_data);


> + return rc;
> +}
> +
> +struct rm_data {
> + struct cxl_region *cxlr;
> + struct range *range;
> +};
> +
>  static int cxlr_rm_extent(struct device *dev, void *data)
>  {
>   struct region_extent *region_extent = to_region_extent(dev);
> - struct range *region_hpa_range = data;
> + struct rm_data *rm_data = data;
> + int rc;
>  
>   if (!region_extent)
>   return 0;
>  
>   /*
> -  * Any extent which 'touches' the released range is removed.
> +  * Any extent which 'touches' the released range is attempted to be
> +  * removed.
>*/
> - if (range_overlaps(region_hpa_range, ®ion_extent->hpa_range)) {
> + if (range_overlaps(rm_data->range, ®ion_extent->hpa_range)) {
> +

Re: [PATCH v3 22/25] cxl/region: Read existing extents on region creation

2024-08-19 Thread Dave Jiang



On 8/16/24 7:44 AM, ira.we...@intel.com wrote:
> From: Navneet Singh 
> 
> Dynamic capacity device extents may be left in an accepted state on a
> device due to an unexpected host crash.  In this case it is expected
> that the creation of a new region on top of a DC partition can read
> those extents and surface them for continued use.
> 
> Once all endpoint decoders are part of a region and the region is being
> realized a read of the devices extent list can reveal these previously
> accepted extents.

Once all endpoint decoders are part of a region and the region is being
realized, a read of the 'devices extend list' can reveal these previously
accepted extents.

> 
> CXL r3.1 specifies the mailbox call Get Dynamic Capacity Extent List for
> this purpose.  The call returns all the extents for all dynamic capacity
> partitions.  If the fabric manager is adding extents to any DCD
> partition, the extent list for the recovered region may change.  In this
> case the query must retry.  Upon retry the query could encounter extents
> which were accepted on a previous list query.  Adding such extents is
> ignored without error because they are entirely within a previous
> accepted extent.
> 
> The scan for existing extents races with the dax_cxl driver.  This is
> synchronized through the region device lock.  Extents which are found
> after the driver has loaded will surface through the normal notification
> path while extents seen prior to the driver are read during driver load.
> 
> Signed-off-by: Navneet Singh 
> Co-developed-by: Ira Weiny 
> Signed-off-by: Ira Weiny 
> 
> ---
> Changes:
> [iweiny: Leverage the new add path from the event processing code such
>that the adding and surfacing of extents flows through the same
>code path for both event processing and existing extents.
>While this does validate existing extents again on start up
>this is an error recovery case / new boot scenario and should
>not cause any major issues while making the code more
>straight forward and maintainable.]
> 
> [iweiny: use %par]
> [iweiny: rebase]
> [iweiny: Move this patch later in the series such that the realization
>  of extents can go through the same path as an add event]
> [Fan: Issue a retry if the gen number changes]
> [djiang: s/uint64_t/u64/]
> [djiang: update function names]
> [Jørgen/djbw: read the generation and total count on first iteration of
>   the Get Extent List call]
> [djbw: s/cxl_mbox_get_dc_extent_in/cxl_mbox_get_extent_in/]
> [djbw: s/cxl_mbox_get_dc_extent_out/cxl_mbox_get_extent_out/]
> [djbw/iweiny: s/cxl_read_dc_extents/cxl_read_extent_list]
> ---
>  drivers/cxl/core/core.h   |   2 +
>  drivers/cxl/core/mbox.c   | 100 
> ++
>  drivers/cxl/core/region.c |  12 ++
>  drivers/cxl/cxlmem.h  |  21 ++
>  4 files changed, 135 insertions(+)
> 
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 8dfc97b2e0a4..9e54064a6f48 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -21,6 +21,8 @@ cxled_to_mds(struct cxl_endpoint_decoder *cxled)
>   return container_of(cxlds, struct cxl_memdev_state, cxlds);
>  }
>  
> +void cxl_read_extent_list(struct cxl_endpoint_decoder *cxled);
> +
>  #ifdef CONFIG_CXL_REGION
>  extern struct device_attribute dev_attr_create_pmem_region;
>  extern struct device_attribute dev_attr_create_ram_region;
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index f629ad7488ac..d43ac8eabf56 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -1670,6 +1670,106 @@ int cxl_dev_dynamic_capacity_identify(struct 
> cxl_memdev_state *mds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dev_dynamic_capacity_identify, CXL);
>  
> +/* Return -EAGAIN if the extent list changes while reading */
> +static int __cxl_read_extent_list(struct cxl_endpoint_decoder *cxled)
> +{
> + u32 current_index, total_read, total_expected, initial_gen_num;
> + struct cxl_memdev_state *mds = cxled_to_mds(cxled);
> + struct device *dev = mds->cxlds.dev;
> + struct cxl_mbox_cmd mbox_cmd;
> + u32 max_extent_count;
> + bool first = true;
> +
> + struct cxl_mbox_get_extent_out *extents __free(kfree) =
> + kvmalloc(mds->payload_size, GFP_KERNEL);
> + if (!extents)
> + return -ENOMEM;
> +
> + total_read = 0;
> + current_index = 0;
> + total_expected = 0;
> + max_extent_count = (mds->payload_size - sizeof(*extents)) /
> + sizeof(struct cxl_extent);
> + do {
> + struct cxl_mbox_get_extent_in get_extent;
> + u32 nr_returned, current_total, current_gen_num;
> + int rc;
> +
> + get_extent = (struct cxl_mbox_get_extent_in) {
> + .extent_cnt = max(max_extent_count,
> +   total_expected - current_index),
> +