On Wed, Jul 07, 2021 at 12:43:56PM +0200, Hannes Reinecke wrote: > On 7/7/21 11:53 AM, Klaus Jensen wrote: > > On Jul 7 09:49, Hannes Reinecke wrote: > > > On 7/6/21 11:33 AM, Klaus Jensen wrote: > > > > From: Klaus Jensen <k.jen...@samsung.com> > > > > > > > > Prior to this patch the nvme-ns devices are always children of the > > > > NvmeBus owned by the NvmeCtrl. This causes the namespaces to be > > > > unrealized when the parent device is removed. However, when subsystems > > > > are involved, this is not what we want since the namespaces may be > > > > attached to other controllers as well. > > > > > > > > This patch adds an additional NvmeBus on the subsystem device. When > > > > nvme-ns devices are realized, if the parent controller device is linked > > > > to a subsystem, the parent bus is set to the subsystem one instead. This > > > > makes sure that namespaces are kept alive and not unrealized. > > > > > > > > Signed-off-by: Klaus Jensen <k.jen...@samsung.com> > > > > --- > > > > hw/nvme/nvme.h | 18 ++++++++++-------- > > > > hw/nvme/ctrl.c | 8 +++++--- > > > > hw/nvme/ns.c | 32 +++++++++++++++++++++++++------- > > > > hw/nvme/subsys.c | 4 ++++ > > > > 4 files changed, 44 insertions(+), 18 deletions(-) > > > > > > > > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h > > > > index c4065467d877..9401e212f9f7 100644 > > > > --- a/hw/nvme/nvme.h > > > > +++ b/hw/nvme/nvme.h > > > > @@ -33,12 +33,21 @@ QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > > > > > NVME_NSID_BROADCAST - 1); > > > > typedef struct NvmeCtrl NvmeCtrl; > > > > typedef struct NvmeNamespace NvmeNamespace; > > > > +#define TYPE_NVME_BUS "nvme-bus" > > > > +OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS) > > > > + > > > > +typedef struct NvmeBus { > > > > + BusState parent_bus; > > > > + bool is_subsys; > > > > +} NvmeBus; > > > > + > > > > #define TYPE_NVME_SUBSYS "nvme-subsys" > > > > #define NVME_SUBSYS(obj) \ > > > > OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS) > > > > typedef struct NvmeSubsystem { > > > > DeviceState parent_obj; > > > > + NvmeBus bus; > > > > uint8_t subnqn[256]; > > > > NvmeCtrl *ctrls[NVME_MAX_CONTROLLERS]; > > > > @@ -365,13 +374,6 @@ typedef struct NvmeCQueue { > > > > QTAILQ_HEAD(, NvmeRequest) req_list; > > > > } NvmeCQueue; > > > > -#define TYPE_NVME_BUS "nvme-bus" > > > > -#define NVME_BUS(obj) OBJECT_CHECK(NvmeBus, (obj), TYPE_NVME_BUS) > > > > - > > > > -typedef struct NvmeBus { > > > > - BusState parent_bus; > > > > -} NvmeBus; > > > > - > > > > #define TYPE_NVME "nvme" > > > > #define NVME(obj) \ > > > > OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME) > > > > @@ -463,7 +465,7 @@ typedef struct NvmeCtrl { > > > > static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid) > > > > { > > > > - if (!nsid || nsid > NVME_MAX_NAMESPACES) { > > > > + if (!n || !nsid || nsid > NVME_MAX_NAMESPACES) { > > > > return NULL; > > > > } > > > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > > > > index 90e3ee2b70ee..7c8fca36d9a5 100644 > > > > --- a/hw/nvme/ctrl.c > > > > +++ b/hw/nvme/ctrl.c > > > > @@ -6516,11 +6516,13 @@ static void nvme_exit(PCIDevice *pci_dev) > > > > for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { > > > > ns = nvme_ns(n, i); > > > > - if (!ns) { > > > > - continue; > > > > + if (ns) { > > > > + ns->attached--; > > > > } > > > > + } > > > > - nvme_ns_cleanup(ns); > > > > + if (n->subsys) { > > > > + nvme_subsys_unregister_ctrl(n->subsys, n); > > > > } > > > > if (n->subsys) { > > > > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c > > > > index 3c4f5b8c714a..612a2786d75d 100644 > > > > --- a/hw/nvme/ns.c > > > > +++ b/hw/nvme/ns.c > > > > @@ -444,13 +444,29 @@ void nvme_ns_cleanup(NvmeNamespace *ns) > > > > static void nvme_ns_realize(DeviceState *dev, Error **errp) > > > > { > > > > NvmeNamespace *ns = NVME_NS(dev); > > > > - BusState *s = qdev_get_parent_bus(dev); > > > > - NvmeCtrl *n = NVME(s->parent); > > > > - NvmeSubsystem *subsys = n->subsys; > > > > + BusState *qbus = qdev_get_parent_bus(dev); > > > > + NvmeBus *bus = NVME_BUS(qbus); > > > > + NvmeCtrl *n = NULL; > > > > + NvmeSubsystem *subsys = NULL; > > > > uint32_t nsid = ns->params.nsid; > > > > int i; > > > > - if (!n->subsys) { > > > > + if (bus->is_subsys) { > > > > + subsys = NVME_SUBSYS(qbus->parent); > > > > + } else { > > > > + n = NVME(qbus->parent); > > > > + subsys = n->subsys; > > > > + } > > > > + > > > > + if (subsys) { > > > > + /* > > > > + * If this namespace belongs to a subsystem (through a > > > > link on the > > > > + * controller device), reparent the device. > > > > + */ > > > > + if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) { > > > > + return; > > > > + } > > > > + } else { > > > > if (ns->params.detached) { > > > > error_setg(errp, "detached requires that the nvme > > > > device is " > > > > "linked to an nvme-subsys device"); > > > > @@ -470,7 +486,7 @@ static void nvme_ns_realize(DeviceState > > > > *dev, Error **errp) > > > > if (!nsid) { > > > > for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { > > > > - if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) { > > > > + if (nvme_subsys_ns(subsys, i) || nvme_ns(n, i)) { > > > > continue; > > > > } > > > > @@ -483,7 +499,7 @@ static void nvme_ns_realize(DeviceState > > > > *dev, Error **errp) > > > > return; > > > > } > > > > } else { > > > > - if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) { > > > > + if (nvme_subsys_ns(subsys, nsid) || nvme_ns(n, nsid)) { > > > > error_setg(errp, "namespace id '%d' already > > > > allocated", nsid); > > > > return; > > > > } > > > > @@ -509,7 +525,9 @@ static void nvme_ns_realize(DeviceState > > > > *dev, Error **errp) > > > > } > > > > } > > > > - nvme_attach_ns(n, ns); > > > > + if (n) { > > > > + nvme_attach_ns(n, ns); > > > > + } > > > > } > > > > static Property nvme_ns_props[] = { > > > > diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c > > > > index 92caa604a280..fb7c3a7c55fc 100644 > > > > --- a/hw/nvme/subsys.c > > > > +++ b/hw/nvme/subsys.c > > > > @@ -50,6 +50,10 @@ static void nvme_subsys_realize(DeviceState > > > > *dev, Error **errp) > > > > { > > > > NvmeSubsystem *subsys = NVME_SUBSYS(dev); > > > > + qbus_create_inplace(&subsys->bus, sizeof(NvmeBus), > > > > TYPE_NVME_BUS, dev, > > > > + dev->id); > > > > + subsys->bus.is_subsys = true; > > > > + > > > > nvme_subsys_setup(subsys); > > > > } > > > > > > > Why not always create a subsystem, and distinguish between 'shared' > > > and 'non-shared' subsystems instead of the ->subsys pointer? > > > That way all namespaces can be children of the subsystem, we won't > > > need any reparenting, and the whole thing will be more in-line with > > > qdev, no? > > > > > > > Hi Hannes, > > > > Thanks for your reviews and comments! > > > > So, I have actually considered what you suggest. > > > > The problem is that the nvme-ns device currently expects to plug into a > > bus (an 'nvme-bus') and we cannot really get rid of that 'bus' parameter > > without breaking compatibility. I experimented with removing the bus > > from the nvme device and instead creating it in the nvme-subsys device. > > My idea here was to implicitly always create an nvme-subsys if not > > explicitly created by the user, but since nvme-subsys does not plug into > > any bus itself, it is not exposed in the qtree and thus not reachable > > (i.e., when realizing the nvme-ns device, it will complain that there > > are no 'nvme-bus' available to plug into). To make this work, I believe > > we'd have to have the nvme-subsys plug into the main system bus (or some > > other bus rooted at main-system-bus), and I'd prefer not to do that > > since this is already a flawed design and I think it would be more > > intrusive than what my patch does. > > > Sigh. > _Again_. > > I seem to trip over issues for which no patch can possibly be accepted as > first the infrastructure has to be reworked. > > ... there is a reason why I'm avoiding posting patches to qemu-devel :-( > > > I raised these issues on the mailinglist some time ago[1]. We didn't > > really find a good solution, but I think the conclusion is that the > > bus/device design of subsystems and namespaces is flawed. That's why I > > am working on an "objectification" of the two devices as suggested by > > Stefan[2] as a proper fix for this mess. > > > Actually, I _do_ think that it would be good to have an nvme-subsystem bus, > providing a list of all namespaces for that subsystem. > Creating a controller would then mean that one has to create a controller > and a namespace _separately_, as then the namespace would _not_ be a child > of the controller. > But that's arguably a good thing, as the namespaces _do_ have a separate > lifetime from controllers. > And the lifetime of the subsystem could be completely self-contained; the > controller would be looking up subsystems via the subsystem NQN if present; > I guess we'll need to create ad-hoc subsystems for PCIe. > But nothing insurmountable. > And certainly nothing which require a large-scale rework of qemu internals, > I would think.
I don't remember the reason to use --object instead of --device. Maybe the --device approach is viable and it makes some sense because this is a "device" (like a SCSI LUN) that exposed to the guest. It would be necessary to have a nvme-subsys device on the system bus to act as the bus for nvme-ns devices as Klaus described. > But _if_ you work on the rework, please ensure that not only NVMe is > represented here. SCSI multipathing has the same issue currently; there is a > strict host->lun->block->OS device relationship currently in SCSI, making it > impossible to represent a multipathed SCSI device in qemu. > The only sneaky way is to allow for a shared OS device (ie specifying the > filename more than once), but that completely sidesteps qemu internals, too. Do you want one SCSIDevice to be accessible via multiple SCSIBus instances for multipathing in QEMU? Stefan
signature.asc
Description: PGP signature