On Wed, Apr 02, 2025 at 06:42:34PM +0200, Martin Wilck wrote:
> On Mon, 2025-03-31 at 19:17 -0400, Benjamin Marzinski wrote:
> > When a new device is added by the multipath command, multipathd may
> > know
> > of other paths that cannot be added to the device because they are
> > currently offline. Instead of ignoring these paths, multipathd will
> > now
> > re-add them when they come back online. To do this, it multipathd
> > needs
> > a new path variable add_when_online, to track devices that could not
> > be
> > added to an existing multipath device because they were offline.
> > These
> > paths are handled along with the other uninitialized paths.
> >
> > Signed-off-by: Benjamin Marzinski <[email protected]>
> > ---
> > libmultipath/print.c | 5 +++-
> > libmultipath/structs.h | 1 +
> > libmultipath/structs_vec.c | 5 ++++
> > multipathd/main.c | 59
> > ++++++++++++++++++++++++++++++++++++--
> > multipathd/multipathd.8.in | 5 ++--
> > 5 files changed, 70 insertions(+), 5 deletions(-)
> >
> > diff --git a/libmultipath/print.c b/libmultipath/print.c
> > index 00c03ace..019ae56f 100644
> > --- a/libmultipath/print.c
> > +++ b/libmultipath/print.c
> > @@ -658,8 +658,11 @@ snprint_path_serial (struct strbuf *buff, const
> > struct path * pp)
> > static int
> > snprint_path_mpp (struct strbuf *buff, const struct path * pp)
> > {
> > - if (!pp->mpp)
> > + if (!pp->mpp) {
> > + if (pp->add_when_online)
> > + return append_strbuf_str(buff, "[offline]");
> > return append_strbuf_str(buff, "[orphan]");
> > + }
> > if (!pp->mpp->alias)
> > return append_strbuf_str(buff, "[unknown]");
> > return snprint_str(buff, pp->mpp->alias);
> > diff --git a/libmultipath/structs.h b/libmultipath/structs.h
> > index 28de9a7f..39d1c71c 100644
> > --- a/libmultipath/structs.h
> > +++ b/libmultipath/structs.h
> > @@ -413,6 +413,7 @@ struct path {
> > int eh_deadline;
> > enum check_path_states is_checked;
> > bool can_use_env_uid;
> > + bool add_when_online;
> > unsigned int checker_timeout;
> > /* configlet pointers */
> > vector hwe;
> > diff --git a/libmultipath/structs_vec.c b/libmultipath/structs_vec.c
> > index f6407e12..663c9053 100644
> > --- a/libmultipath/structs_vec.c
> > +++ b/libmultipath/structs_vec.c
> > @@ -389,6 +389,9 @@ static void orphan_paths(vector pathvec, struct
> > multipath *mpp, const char *reas
> > free_path(pp);
> > } else
> > orphan_path(pp, reason);
> > + } else if (pp->add_when_online &&
> > + strncmp(mpp->wwid, pp->wwid, WWID_SIZE)
> > == 0) {
> > + pp->add_when_online = false;
> > }
> > }
> > }
> > @@ -595,6 +598,8 @@ void sync_paths(struct multipath *mpp, vector
> > pathvec)
> > found = 0;
> > vector_foreach_slot(mpp->pg, pgp, j) {
> > if (find_slot(pgp->paths, (void *)pp) != -1)
> > {
> > + if (pp->add_when_online)
> > + pp->add_when_online = false;
> > found = 1;
> > break;
> > }
> > diff --git a/multipathd/main.c b/multipathd/main.c
> > index 7aaae773..9aa5a2fa 100644
> > --- a/multipathd/main.c
> > +++ b/multipathd/main.c
> > @@ -644,11 +644,45 @@ pr_register_active_paths(struct multipath *mpp)
> > }
> > }
> >
> > +static void
> > +save_offline_paths(const struct multipath *mpp, vector
> > offline_paths)
> > +{
> > + unsigned int i, j;
> > + struct path *pp;
> > + struct pathgroup *pgp;
> > +
> > + vector_foreach_slot (mpp->pg, pgp, i)
> > + vector_foreach_slot (pgp->paths, pp, j)
> > + if (pp->initialized == INIT_OK &&
> > + pp->sysfs_state == PATH_DOWN)
> > + /* ignore failures storing the
> > paths. */
> > + store_path(offline_paths, pp);
> > +}
> > +
> > +static void
> > +handle_orphaned_offline_paths(vector offline_paths)
> > +{
> > + unsigned int i;
> > + struct path *pp;
> > +
> > + vector_foreach_slot (offline_paths, pp, i)
> > + if (pp->mpp == NULL)
> > + pp->add_when_online = true;
> > +}
> > +
> > +static void
> > +cleanup_reset_vec(struct vector_s **v)
> > +{
> > + vector_reset(*v);
> > +}
> > +
> > static int
> > update_map (struct multipath *mpp, struct vectors *vecs, int
> > new_map)
> > {
> > int retries = 3;
> > char *params __attribute__((cleanup(cleanup_charp))) = NULL;
> > + struct vector_s offline_paths_vec = { .allocated = 0 };
> > + vector offline_paths
> > __attribute__((cleanup(cleanup_reset_vec))) = &offline_paths_vec;
> >
> > retry:
> > condlog(4, "%s: updating new map", mpp->alias);
> > @@ -685,6 +719,9 @@ fail:
> > return 1;
> > }
> >
> > + if (new_map && retries < 0)
> > + save_offline_paths(mpp, offline_paths);
> > +
> > if (setup_multipath(vecs, mpp))
> > return 1;
> >
> > @@ -695,6 +732,9 @@ fail:
> > if (mpp->prflag == PRFLAG_SET)
> > pr_register_active_paths(mpp);
> >
> > + if (VECTOR_SIZE(offline_paths) != 0)
> > + handle_orphaned_offline_paths(offline_paths);
> > +
> > if (retries < 0)
> > condlog(0, "%s: failed reload in new map update",
> > mpp->alias);
> > return 0;
> > @@ -2793,7 +2833,8 @@ check_uninitialized_path(struct path * pp,
> > unsigned int ticks)
> > struct config *conf;
> >
> > if (pp->initialized != INIT_NEW && pp->initialized !=
> > INIT_FAILED &&
> > - pp->initialized != INIT_MISSING_UDEV)
> > + pp->initialized != INIT_MISSING_UDEV &&
> > + !(pp->initialized == INIT_OK && pp->add_when_online))
> > return CHECK_PATH_SKIPPED;
> >
> > if (pp->tick)
> > @@ -2849,7 +2890,8 @@ update_uninitialized_path(struct vectors *
> > vecs, struct path * pp)
> > struct config *conf;
> >
> > if (pp->initialized != INIT_NEW && pp->initialized !=
> > INIT_FAILED &&
> > - pp->initialized != INIT_MISSING_UDEV)
> > + pp->initialized != INIT_MISSING_UDEV &&
> > + !(pp->initialized == INIT_OK && pp->add_when_online))
> > return CHECK_PATH_SKIPPED;
> >
> > newstate = get_new_state(pp);
> > @@ -2875,6 +2917,19 @@ update_uninitialized_path(struct vectors *
> > vecs, struct path * pp)
> > free_path(pp);
> > return CHECK_PATH_REMOVED;
> > }
> > + } else if (pp->initialized == INIT_OK && pp->add_when_online
> > &&
> > + (newstate == PATH_UP || newstate == PATH_GHOST))
> > {
> > + pp->initialized = INIT_OK;
> > + if (pp->recheck_wwid == RECHECK_WWID_ON &&
>
> I wonder if we should always check the WWID here. After all, this path
> has never been part of the map, and it was offline when the map was
> created (IOW, the WWID couldn't be checked at that point in time).
> Can we rely on the stored WWID?
That make sense. I also noticed a typo in my documentation changes. I'll
send a new patch.
-Ben
> Other than that, LGTM.
>
> Regards,
> Martin
>
> > + check_path_wwid_change(pp)) {
> > + condlog(0, "%s: path wwid change detected.
> > Removing",
> > + pp->dev);
> > + return handle_path_wwid_change(pp, vecs)?
> > + CHECK_PATH_REMOVED :
> > + CHECK_PATH_SKIPPED;
> > + }
> > + ev_add_path(pp, vecs, 1);
> > + pp->tick = 1;
> > }
> > return CHECK_PATH_CHECKED;
> > }
> > diff --git a/multipathd/multipathd.8.in b/multipathd/multipathd.8.in
> > index 43f87bf8..a91acff1 100644
> > --- a/multipathd/multipathd.8.in
> > +++ b/multipathd/multipathd.8.in
> > @@ -595,8 +595,9 @@ The device serial number.
> > The device marginal state, either \fImarginal\fR or \fInormal\fR.
> > .TP
> > .B %m
> > -The multipath device that this device is a path of, or
> > \fI[orphan]\fR if
> > -it is not part of any multipath device.
> > +The multipath device that this device is a path of, or
> > \fI[offline]\fR
> > +if this device could not be added to a device because is is offline
> > or
> > +\fI[orphan]\fR if it is not part of any multipath device.
> > .TP
> > .B %N
> > The host World Wide Node Name (WWNN) of the device, if any.