On 5/23/26 2:43 AM, Anisa Su wrote: > DC DAX regions are populated with dax_resource children that each carry a > backing tag uuid and a per-allocation sequence number (seq_num). Add the > userspace claim semantics that resolve those tagged groups into DAX > devices. > > A DC region's seed dax device is created at 0-size on probe; userspace > populates it by writing to its 'uuid' attribute: > > * A non-null UUID claims every dax_resource on this region whose tag > matches, in seq_num order via uuid_claim_tagged(). The match set > must form a dense 1..n sequence (no gap, no duplicate); the CXL > side maintains this invariant for both sharable allocations (where > the device stamps shared_extn_seq) and non-sharable allocations > (where cxl_add_pending assigns arrival-order seq). The resulting > DAX device's size equals the sum of every member extent's size. > > * "0" claims a single untagged dax_resource via > uuid_claim_untagged(). Untagged extents are independent > allocations; collapsing several would aggregate unrelated capacity, > so each uuid="0" write consumes exactly one untagged resource. > > * A write that matches no dax_resource returns -ENOENT; the device > stays at size 0. > > uuid_show() reads back the backing tag uuid (or the null UUID for an > untagged claim). The attribute is read-only (0444) on non-DC dax > devices; writes to it on non-DC regions return -EOPNOTSUPP. > > dev_dax_visible() exposes the uuid attribute only on DC dax devices. > > Based on an original patch by Navneet Singh. > > Signed-off-by: Ira Weiny <[email protected]> > Signed-off-by: Anisa Su <[email protected]> > > --- > Changes: > [anisa: split out from the original "Surface dc_extents" commit; > userspace tag-claim semantics only.] 
> --- > drivers/dax/bus.c | 260 +++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 256 insertions(+), 4 deletions(-) > > diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c > index c030eb103ad0..1dccb3e5cd0f 100644 > --- a/drivers/dax/bus.c > +++ b/drivers/dax/bus.c > @@ -5,6 +5,7 @@ > #include <linux/mutex.h> > #include <linux/list.h> > #include <linux/slab.h> > +#include <linux/sort.h> > #include <linux/dax.h> > #include <linux/io.h> > #include "dax-private.h" > @@ -1316,6 +1317,89 @@ static ssize_t dev_dax_resize(struct dax_region > *dax_region, > return 0; > } > > +/* DC extents are all-or-nothing: an extent is either free or fully claimed. > */ > +static bool dax_resource_in_use(const struct dax_resource *dax_resource) > +{ > + return dax_resource->use_cnt > 0; > +} > + > +struct dax_uuid_match { > + const struct dax_region *dax_region; > + const uuid_t *uuid; > +}; > + > +static int find_uuid_extent(struct device *dev, const void *data) > +{ > + const struct dax_uuid_match *match = data; > + struct dax_resource *dax_resource; > + > + if (!match->dax_region->dc_ops->is_extent(dev)) > + return 0; > + > + dax_resource = dev_get_drvdata(dev); > + if (!dax_resource || dax_resource_in_use(dax_resource)) > + return 0; > + return uuid_equal(&dax_resource->uuid, match->uuid); > +} > + > +struct dax_tag_collect { > + const struct dax_region *dax_region; > + const uuid_t *uuid; > + struct dax_resource **arr; > + unsigned int count; > + unsigned int cap; > +}; > + > +static int collect_uuid_extent(struct device *dev, void *data) > +{ > + struct dax_tag_collect *c = data; > + struct dax_resource *dax_resource; > + > + if (!c->dax_region->dc_ops->is_extent(dev)) > + return 0; > + > + dax_resource = dev_get_drvdata(dev); > + if (!dax_resource || dax_resource_in_use(dax_resource)) > + return 0; > + if (!uuid_equal(&dax_resource->uuid, c->uuid)) > + return 0; > + > + if (c->count == c->cap) > + return -ENOSPC; > + c->arr[c->count++] = dax_resource; > + return 0; 
> +} > + > +static int count_uuid_extent(struct device *dev, void *data) > +{ > + struct dax_tag_collect *c = data; > + struct dax_resource *dax_resource; > + > + if (!c->dax_region->dc_ops->is_extent(dev)) > + return 0; > + > + dax_resource = dev_get_drvdata(dev); > + if (!dax_resource || dax_resource_in_use(dax_resource)) > + return 0; > + if (!uuid_equal(&dax_resource->uuid, c->uuid)) > + return 0; > + > + c->count++; > + return 0; > +} > + > +static int dax_resource_seq_cmp(const void *a, const void *b) > +{ > + const struct dax_resource * const *pa = a; > + const struct dax_resource * const *pb = b; > + > + if ((*pa)->seq_num < (*pb)->seq_num) > + return -1; > + if ((*pa)->seq_num > (*pb)->seq_num) > + return 1; > + return 0; > +} > + > static ssize_t size_store(struct device *dev, struct device_attribute *attr, > const char *buf, size_t len) > { > @@ -1548,13 +1632,177 @@ static DEVICE_ATTR_RO(numa_node); > static ssize_t uuid_show(struct device *dev, > struct device_attribute *attr, char *buf) > { > - return sysfs_emit(buf, "%d\n", 0); > + struct dev_dax *dev_dax = to_dev_dax(dev); > + int rc; > + > + rc = down_read_interruptible(&dax_dev_rwsem); Since we are here, we may as well convert these to ACQUIRE() and get rid of the gotos: ACQUIRE(rwsem_read_intr, rwsem)(&dax_dev_rwsem); if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) ... > + if (rc) > + return rc; > + > + for (int i = 0; i < dev_dax->nr_range; i++) { > + struct dax_resource *r = dev_dax->ranges[i].dax_resource; > + > + if (r && !uuid_is_null(&r->uuid)) { > + rc = sysfs_emit(buf, "%pUb\n", &r->uuid); > + goto out; > + } > + } > + rc = sysfs_emit(buf, "0\n"); As pointed out earlier, this should display the null UUID to be consistent. 
> +out: > + up_read(&dax_dev_rwsem); > + return rc; > +} > + > +static ssize_t uuid_claim_untagged(struct dax_region *dax_region, > + struct dev_dax *dev_dax) > +{ > + struct dax_uuid_match match = { > + .dax_region = dax_region, > + .uuid = &uuid_null, > + }; > + struct dax_resource *dax_resource; > + resource_size_t to_alloc; > + struct device *extent_dev; > + ssize_t alloc; > + > + extent_dev = device_find_child(dax_region->dev, &match, > + find_uuid_extent); > + if (!extent_dev) > + return -ENOENT; > + > + dax_resource = dev_get_drvdata(extent_dev); > + to_alloc = resource_size(dax_resource->res); > + alloc = __dev_dax_resize(dax_resource->res, dev_dax, to_alloc, > + dax_resource); > + put_device(extent_dev); > + if (alloc < 0) > + return alloc; > + if (alloc == 0) > + return -ENOENT; > + dax_resource->use_cnt++; > + return 0; > +} > + > +static ssize_t uuid_claim_tagged(struct dax_region *dax_region, > + struct dev_dax *dev_dax, const uuid_t *uuid) > +{ > + struct dax_tag_collect c = { > + .dax_region = dax_region, > + .uuid = uuid, > + }; > + unsigned int i; > + ssize_t rc; > + > + /* Two-pass: count, then collect into a sized array. */ > + device_for_each_child(dax_region->dev, &c, count_uuid_extent); > + if (!c.count) > + return -ENOENT; > + > + c.arr = kmalloc_array(c.count, sizeof(*c.arr), GFP_KERNEL); > + if (!c.arr) > + return -ENOMEM; > + c.cap = c.count; > + c.count = 0; > + > + rc = device_for_each_child(dax_region->dev, &c, collect_uuid_extent); > + if (rc) > + goto out; > + > + sort(c.arr, c.count, sizeof(*c.arr), dax_resource_seq_cmp, NULL); > + > + /* > + * Tagged groups carry a dense 1..n @seq_num regardless of source > + * (sharable: device-stamped; non-sharable: host-assigned in > + * arrival order — see &struct dax_resource). A gap or > + * out-of-range value here means an extent went missing on the > + * cxl side (e.g. 
a per-extent failure in cxl_add_pending) or a > + * cxl-side validation gap; in either case refuse the whole > + * group rather than carve a partial allocation. > + */ > + for (i = 0; i < c.count; i++) { > + if (c.arr[i]->seq_num != i + 1) { > + dev_WARN_ONCE(dax_region->dev, 1, > + "tag %pUb seq invariant violated at slot %u > (got %u)\n", > + uuid, i, c.arr[i]->seq_num); > + rc = -EINVAL; > + goto out; > + } > + } > + > + for (i = 0; i < c.count; i++) { > + resource_size_t to_alloc = resource_size(c.arr[i]->res); > + ssize_t alloc; > + > + alloc = __dev_dax_resize(c.arr[i]->res, dev_dax, to_alloc, > + c.arr[i]); > + if (alloc < 0) { > + rc = alloc; > + goto rollback; > + } > + if (alloc == 0) { > + rc = -ENOSPC; > + goto rollback; > + } > + c.arr[i]->use_cnt++; > + } > + rc = 0; > + goto out; > + > +rollback: > + /* > + * Partial failure: trim every range we added in this attempt. > + * trim_dev_dax_range pops the most-recently-appended range from > + * dev_dax->ranges[] and decrements its dax_resource->use_cnt, so > + * looping until we have undone @i additions restores both > + * dev_dax->ranges[] and the matched dax_resources' use_cnt. 
> + */ > + while (i-- > 0) > + trim_dev_dax_range(dev_dax); > +out: > + kfree(c.arr); > + return rc; > } > > static ssize_t uuid_store(struct device *dev, struct device_attribute *attr, > const char *buf, size_t len) > { > - return -EOPNOTSUPP; > + struct dev_dax *dev_dax = to_dev_dax(dev); > + struct dax_region *dax_region = dev_dax->region; > + uuid_t uuid; > + ssize_t rc; > + > + if (!is_dynamic(dax_region)) > + return -EOPNOTSUPP; > + > + if (sysfs_streq(buf, "0")) > + uuid_copy(&uuid, &uuid_null); > + else { > + rc = uuid_parse(buf, &uuid); > + if (rc) > + return rc; > + } > + > + rc = down_write_killable(&dax_region_rwsem); > + if (rc) > + return rc; > + if (!dax_region->dev->driver) { > + rc = -ENXIO; > + goto err_region; > + } > + rc = down_write_killable(&dax_dev_rwsem); Same comment about ACQUIRE() as above. > + if (rc) > + goto err_region; > + Does this need to check whether the device is already claimed before proceeding with the claim? What happens if the uuid is written twice to this sysfs file? DJ > + if (uuid_is_null(&uuid)) > + rc = uuid_claim_untagged(dax_region, dev_dax); > + else > + rc = uuid_claim_tagged(dax_region, dev_dax, &uuid); > + > + up_write(&dax_dev_rwsem); > +err_region: > + up_write(&dax_region_rwsem); > + > + return rc < 0 ? rc : len; > } > static DEVICE_ATTR_RW(uuid); > > @@ -1614,8 +1862,12 @@ static umode_t dev_dax_visible(struct kobject *kobj, > struct attribute *a, int n) > return 0; > if (a == &dev_attr_mapping.attr && is_dynamic(dax_region)) > return 0; > - if ((a == &dev_attr_align.attr || > - a == &dev_attr_size.attr) && is_static(dax_region)) > + if (a == &dev_attr_uuid.attr && !is_dynamic(dax_region)) > + return 0444; > + if (a == &dev_attr_align.attr && > + (is_static(dax_region) || is_dynamic(dax_region))) > + return 0444; > + if (a == &dev_attr_size.attr && is_static(dax_region)) > + return 0444; > return a->mode; > }

