Jonathan Cameron wrote:
> On Thu, 24 Nov 2022 10:34:52 -0800
> Dan Williams <[email protected]> wrote:
>
> > The three objects 'struct cxl_nvdimm_bridge', 'struct cxl_nvdimm', and
> > 'struct cxl_pmem_region' manage CXL persistent memory resources. The
> > bridge represents base platform resources, the nvdimm represents one or
> > more endpoints, and the region is a collection of nvdimms that
> > contribute to an assembled address range.
> >
> > Their relationship is such that a region is torn down if any component
> > endpoints are removed. All regions and endpoints are torn down if the
> > foundational bridge device goes down.
> >
> > A workqueue was deployed to manage these interdependencies, but it is
> > difficult to reason about, and fragile. A recent attempt to take the CXL
> > root device lock in the cxl_mem driver was reported by lockdep as
> > colliding with the flush_work() in the cxl_pmem flows.
> >
> > Instead of the workqueue, arrange for all pmem/nvdimm devices to be torn
> > down immediately and hierarchically. A similar change is made to both
> > the 'cxl_nvdimm' and 'cxl_pmem_region' objects. For bisect-ability both
> > changes are made in the same patch which unfortunately makes the patch
> > bigger than desired.
> >
> > Arrange for cxl_memdev and cxl_region to register a cxl_nvdimm and
> > cxl_pmem_region as a devres release action of the bridge device.
> > Additionally, include a devres release action of the cxl_memdev or
> > cxl_region device that triggers the bridge's release action if an endpoint
> > exits before the bridge. I.e. this allows either unplugging the bridge,
> > or unplugging and endpoint to result in the same cleanup actions.
> >
> > To keep the patch smaller the cleanup of the now defunct workqueue
> > infrastructure is saved for a follow-on patch.
> >
> > Signed-off-by: Dan Williams <[email protected]>
>
> Hi Dan,
>
> This is fiddly to follow, but then so was the original.
Fiddly is a charitable comment for the original. This patch is larger
than I would have liked, but I did not see a way to unwind the
complexity without causing a bisect break.
> A few minor comments inline.
Thanks for taking a look.
>
> Jonathan
>
> > ---
> > drivers/cxl/core/pmem.c | 70 ++++++++++++++++++++----
> > drivers/cxl/core/region.c | 54 ++++++++++++++++++-
> > drivers/cxl/cxl.h | 7 ++
> > drivers/cxl/cxlmem.h | 4 +
> > drivers/cxl/mem.c | 9 +++
> > drivers/cxl/pci.c | 3 -
> > drivers/cxl/pmem.c | 122
> > ++++++++++++------------------------------
> > tools/testing/cxl/test/mem.c | 3 -
> > 8 files changed, 164 insertions(+), 108 deletions(-)
> >
> > diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
> > index 1d12a8206444..647b3a30638e 100644
> > --- a/drivers/cxl/core/pmem.c
> > +++ b/drivers/cxl/core/pmem.c
> > @@ -219,7 +219,8 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, CXL);
> >
> > static struct lock_class_key cxl_nvdimm_key;
> >
> > -static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd)
> > +static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_nvdimm_bridge
> > *cxl_nvb,
> > + struct cxl_memdev *cxlmd)
> > {
> > struct cxl_nvdimm *cxl_nvd;
> > struct device *dev;
> > @@ -230,6 +231,7 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct
> > cxl_memdev *cxlmd)
> >
> > dev = &cxl_nvd->dev;
> > cxl_nvd->cxlmd = cxlmd;
> > + cxlmd->cxl_nvd = cxl_nvd;
> > device_initialize(dev);
> > lockdep_set_class(&dev->mutex, &cxl_nvdimm_key);
> > device_set_pm_not_required(dev);
> > @@ -240,27 +242,52 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct
> > cxl_memdev *cxlmd)
> > return cxl_nvd;
> > }
> >
> > -static void cxl_nvd_unregister(void *dev)
> > +static void cxl_nvd_unregister(void *_cxl_nvd)
> > {
> > - device_unregister(dev);
> > + struct cxl_nvdimm *cxl_nvd = _cxl_nvd;
> > + struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
> > +
> > + device_lock_assert(&cxlmd->cxl_nvb->dev);
>
> Locally it's not immediately obvious if that is always the same
> as
> device_lock_assert(&cxl_nvb->dev);
> If not, a comment, if it is maybe just change to that.
Sure, yes, it is always the latter.
>
> > + cxl_nvd->cxlmd = NULL;
> > + cxlmd->cxl_nvd = NULL;
> > + device_unregister(&cxl_nvd->dev);
> > +}
> > +
> > +static void cxlmd_release_nvdimm(void *_cxlmd)
> > +{
> > + struct cxl_memdev *cxlmd = _cxlmd;
> > + struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb;
> > +
> > + device_lock(&cxl_nvb->dev);
> > + if (cxlmd->cxl_nvd)
> > + devm_release_action(&cxl_nvb->dev, cxl_nvd_unregister,
> > + cxlmd->cxl_nvd);
> > + device_unlock(&cxl_nvb->dev);
> > + put_device(&cxl_nvb->dev);
> > }
> >
> > /**
> > * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm
> > - * @host: same host as @cxlmd
> > * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations
> > *
> > * Return: 0 on success negative error code on failure.
> > */
> > -int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd)
> > +int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd)
> > {
> > + struct cxl_nvdimm_bridge *cxl_nvb = cxl_find_nvdimm_bridge(&cxlmd->dev);
>
> Another cosmetic change, but I'd prefer the actual
> cxl_nvb = cxl_find_nvdimm_bridge();
>
> to be just above the error check rather than up here.
ok.
>
> > struct cxl_nvdimm *cxl_nvd;
> > struct device *dev;
> > int rc;
> >
> > - cxl_nvd = cxl_nvdimm_alloc(cxlmd);
> > - if (IS_ERR(cxl_nvd))
> > - return PTR_ERR(cxl_nvd);
> > + if (!cxl_nvb)
> > + return -ENODEV;
> > +
> > + cxl_nvd = cxl_nvdimm_alloc(cxl_nvb, cxlmd);
> > + if (IS_ERR(cxl_nvd)) {
> > + rc = PTR_ERR(cxl_nvd);
> > + goto err_alloc;
> > + }
> > + cxlmd->cxl_nvb = cxl_nvb;
> >
> > dev = &cxl_nvd->dev;
> > rc = dev_set_name(dev, "pmem%d", cxlmd->id);
> > @@ -271,13 +298,34 @@ int devm_cxl_add_nvdimm(struct device *host, struct
> > cxl_memdev *cxlmd)
> > if (rc)
> > goto err;
> >
> > - dev_dbg(host, "%s: register %s\n", dev_name(dev->parent),
> > - dev_name(dev));
> > + dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev));
> >
> > - return devm_add_action_or_reset(host, cxl_nvd_unregister, dev);
> > + /*
> > + * Remove this nvdimm connection if either the top-level PMEM
> > + * bridge goes down, or the endpoint device goes through
> > + * ->remove().
> > + */
>
> Perhaps move this comment down to inside the if (cxl_nvb->dev.driver)
> block as it only refers (I think) to the devm_add_action_or_reset(),
> not the surrounding driver binding checks.
It refers to the whole rest of this function because this action:
devm_add_action_or_reset(&cxl_nvb->dev, cxl_nvd_unregister, cxl_nvd)
...deletes the cxl_nvdimm when the bridge is removed, and this action:
devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, cxlmd)
...removes the nvdimm, and the above release action, if the endpoint is
removed *before* the bridge.
At a minimum the comment needs strengthening if that detail was not
conveyed.
>
> > + device_lock(&cxl_nvb->dev);
> > + if (cxl_nvb->dev.driver)
> > + rc = devm_add_action_or_reset(&cxl_nvb->dev, cxl_nvd_unregister,
> > + cxl_nvd);
> > + else
> > + rc = -ENXIO;
> > + device_unlock(&cxl_nvb->dev);
> > +
> > + if (rc)
> > + goto err_alloc;
> > +
> > + /* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */
> > + return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm,
> > cxlmd);
> >
> > err:
> > put_device(dev);
> > +err_alloc:
> > + put_device(&cxl_nvb->dev);
>
> Is this ordering necessary? It's not reverse of the setup above, so if we can
> reordering
> to be so, that is probably a good thing. (move these NULL setting above the
> put_device(&cxl_nvb->dev)).
Sure, they can move above.
>
> > + cxlmd->cxl_nvb = NULL;
> > + cxlmd->cxl_nvd = NULL;
> > +
> > return rc;
> > }
> > EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm, CXL);
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index f9ae5ad284ff..e73bec828032 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -1812,6 +1812,7 @@ static struct lock_class_key cxl_pmem_region_key;
> > static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region
> > *cxlr)
> > {
> > struct cxl_region_params *p = &cxlr->params;
> > + struct cxl_nvdimm_bridge *cxl_nvb;
> > struct cxl_pmem_region *cxlr_pmem;
> > struct device *dev;
> > int i;
> > @@ -1839,6 +1840,14 @@ static struct cxl_pmem_region
> > *cxl_pmem_region_alloc(struct cxl_region *cxlr)
> > struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> > struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
> >
> > + if (i == 0) {
>
> Whilst kind of obvious, maybe a comment in here that for end points in the
> region the
> cxl_nvb will be the same hence we just look it up for the first one?
ok.
>
> > + cxl_nvb = cxl_find_nvdimm_bridge(&cxlmd->dev);
> > + if (!cxl_nvb) {
> > + cxlr_pmem = ERR_PTR(-ENODEV);
> > + goto out;
> > + }
> > + cxlr->cxl_nvb = cxl_nvb;
> > + }
> > m->cxlmd = cxlmd;
> > get_device(&cxlmd->dev);
> > m->start = cxled->dpa_res->start;
> > @@ -1848,6 +1857,7 @@ static struct cxl_pmem_region
> > *cxl_pmem_region_alloc(struct cxl_region *cxlr)
> >
> > dev = &cxlr_pmem->dev;
> > cxlr_pmem->cxlr = cxlr;
> > + cxlr->cxlr_pmem = cxlr_pmem;
> > device_initialize(dev);
> > lockdep_set_class(&dev->mutex, &cxl_pmem_region_key);
> > device_set_pm_not_required(dev);
> > @@ -1860,9 +1870,30 @@ static struct cxl_pmem_region
> > *cxl_pmem_region_alloc(struct cxl_region *cxlr)
> > return cxlr_pmem;
> > }
> >
> > -static void cxlr_pmem_unregister(void *dev)
> > +static void cxlr_pmem_unregister(void *_cxlr_pmem)
> > +{
> > + struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem;
> > + struct cxl_region *cxlr = cxlr_pmem->cxlr;
> > + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
> > +
> > + device_lock_assert(&cxl_nvb->dev);
>
> This scheme is obvious in this patch, but probably less so when just
> looking at the resulting code. Perhaps worth a comment
> here on why we care about that particular lock?
ok, and I'll make a similar comment in cxl_nvd_unregister.
>
> > + cxlr->cxlr_pmem = NULL;
> > + cxlr_pmem->cxlr = NULL;
> > + device_unregister(&cxlr_pmem->dev);
> > +}
> > +
> > +static void cxlr_release_nvdimm(void *_cxlr)
> > {
> > - device_unregister(dev);
> > + struct cxl_region *cxlr = _cxlr;
> > + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
> > +
> > + device_lock(&cxl_nvb->dev);
> > + if (cxlr->cxlr_pmem)
> > + devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister,
> > + cxlr->cxlr_pmem);
> > + device_unlock(&cxl_nvb->dev);
> > + cxlr->cxl_nvb = NULL;
> > + put_device(&cxl_nvb->dev);
> > }
> >
> > /**
> > @@ -1874,12 +1905,14 @@ static void cxlr_pmem_unregister(void *dev)
> > static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
> > {
> > struct cxl_pmem_region *cxlr_pmem;
> > + struct cxl_nvdimm_bridge *cxl_nvb;
> > struct device *dev;
> > int rc;
> >
> > cxlr_pmem = cxl_pmem_region_alloc(cxlr);
> > if (IS_ERR(cxlr_pmem))
> > return PTR_ERR(cxlr_pmem);
> > + cxl_nvb = cxlr->cxl_nvb;
> >
> > dev = &cxlr_pmem->dev;
> > rc = dev_set_name(dev, "pmem_region%d", cxlr->id);
> > @@ -1893,10 +1926,25 @@ static int devm_cxl_add_pmem_region(struct
> > cxl_region *cxlr)
> > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
> > dev_name(dev));
> >
> > - return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev);
> > + device_lock(&cxl_nvb->dev);
> > + if (cxl_nvb->dev.driver)
> > + rc = devm_add_action_or_reset(&cxl_nvb->dev,
> > + cxlr_pmem_unregister, cxlr_pmem);
> > + else
> > + rc = -ENXIO;
> > + device_unlock(&cxl_nvb->dev);
> > +
> > + if (rc)
> > + goto err_bridge;
> > +
> > + /* @cxlr carries a reference on @cxl_nvb until cxlr_release_nvdimm */
> > + return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr);
> >
> > err:
> > put_device(dev);
> > +err_bridge:
> > + put_device(&cxl_nvb->dev);
> > + cxlr->cxl_nvb = NULL;
> > return rc;
> > }
> >
> > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > index 4ac7938eaf6c..9b5ba9626636 100644
> > --- a/drivers/cxl/cxl.h
> > +++ b/drivers/cxl/cxl.h
> > @@ -386,6 +386,8 @@ struct cxl_region_params {
> > * @id: This region's id. Id is globally unique across all regions
> > * @mode: Endpoint decoder allocation / access mode
> > * @type: Endpoint decoder target type
> > + * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem shutdown
>
> I was going to suggest just carrying the struct device around, but this
> comment isn't quite true. I think cxl_region->cxl_nvb is also used in
> cxl_pmem_region_probe() to get the nvdimm_buss for nvdimm_pmem_region_create()
Perhaps just make the comment less specific and say "nvdimm bridge for
coordinating @cxlr_pmem setup and shutdown".
>
>
> > + * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
> > * @params: active + config params for the region
> > */
> > struct cxl_region {
> > @@ -393,6 +395,8 @@ struct cxl_region {
> > int id;
> > enum cxl_decoder_mode mode;
> > enum cxl_decoder_type type;
> > + struct cxl_nvdimm_bridge *cxl_nvb;
> > + struct cxl_pmem_region *cxlr_pmem;
> > struct cxl_region_params params;
> > };
> >
> > @@ -438,7 +442,6 @@ struct cxl_pmem_region {
> > struct device dev;
> > struct cxl_region *cxlr;
> > struct nd_region *nd_region;
> > - struct cxl_nvdimm_bridge *bridge;
> > struct range hpa_range;
> > int nr_mappings;
> > struct cxl_pmem_region_mapping mapping[];
> > @@ -637,7 +640,7 @@ struct cxl_nvdimm_bridge
> > *devm_cxl_add_nvdimm_bridge(struct device *host,
> > struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
> > bool is_cxl_nvdimm(struct device *dev);
> > bool is_cxl_nvdimm_bridge(struct device *dev);
> > -int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd);
> > +int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd);
> > struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev);
> >
> > #ifdef CONFIG_CXL_REGION
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 88e3a8e54b6a..c1c9960ab05f 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -35,6 +35,8 @@
> > * @cdev: char dev core object for ioctl operations
> > * @cxlds: The device state backing this device
> > * @detach_work: active memdev lost a port in its ancestry
> > + * @cxl_nvb: coordinate removal of @cxl_nvd if present
> > + * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem
> > * @id: id number of this memdev instance.
> > */
> > struct cxl_memdev {
> > @@ -42,6 +44,8 @@ struct cxl_memdev {
> > struct cdev cdev;
> > struct cxl_dev_state *cxlds;
> > struct work_struct detach_work;
> > + struct cxl_nvdimm_bridge *cxl_nvb;
> > + struct cxl_nvdimm *cxl_nvd;
> > int id;
> > };
> >
>
> > diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
> > index 652f00fc68ca..73357d0c3f25 100644
> > --- a/drivers/cxl/pmem.c
> > +++ b/drivers/cxl/pmem.c
>
>
> > static struct cxl_driver cxl_nvdimm_driver = {
> > @@ -200,6 +182,16 @@ static int cxl_pmem_ctl(struct nvdimm_bus_descriptor
> > *nd_desc,
> > return cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len);
> > }
> >
> > +static void unregister_nvdimm_bus(void *_cxl_nvb)
> > +{
> > + struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb;
> > + struct nvdimm_bus *nvdimm_bus = cxl_nvb->nvdimm_bus;
> > +
> > + cxl_nvb->nvdimm_bus = NULL;
> > + nvdimm_bus_unregister(nvdimm_bus);
> > +}
> > +
>
> Single blank line.
>
> > +
> > static bool online_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb)
> > {
> > if (cxl_nvb->nvdimm_bus)
> > @@ -303,23 +295,21 @@ static int cxl_nvdimm_bridge_probe(struct device *dev)
> > {
> > struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev);
> >
> > - if (cxl_nvb->state == CXL_NVB_DEAD)
> > - return -ENXIO;
> > + cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor){
> ) {
> matches existing style in this file.
clang-format removes that space, will add it back.
>
> > + .provider_name = "CXL",
> > + .module = THIS_MODULE,
> > + .ndctl = cxl_pmem_ctl,
> > + };
> >
> > - if (cxl_nvb->state == CXL_NVB_NEW) {
> > - cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor) {
> > - .provider_name = "CXL",
> > - .module = THIS_MODULE,
> > - .ndctl = cxl_pmem_ctl,
> > - };
> > + cxl_nvb->nvdimm_bus =
> > + nvdimm_bus_register(&cxl_nvb->dev, &cxl_nvb->nd_desc);
> >
> > - INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state);
> > - }
> > + if (!cxl_nvb->nvdimm_bus)
> > + return -ENOMEM;
> >
> > - cxl_nvb->state = CXL_NVB_ONLINE;
> > - cxl_nvdimm_bridge_state_work(cxl_nvb);
> > + INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state);
> >
> > - return 0;
> > + return devm_add_action_or_reset(dev, unregister_nvdimm_bus, cxl_nvb);