The branch main has been updated by glebius:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=6dd0803ffd31c60a84488d06928813353c6303d3

commit 6dd0803ffd31c60a84488d06928813353c6303d3
Author:     Gleb Smirnoff <gleb...@freebsd.org>
AuthorDate: 2025-07-16 18:15:12 +0000
Commit:     Gleb Smirnoff <gleb...@freebsd.org>
CommitDate: 2025-07-16 18:15:54 +0000

    libsa/zfs: prefer a vdev with newer txg to a stale one
    
    Before this change, the first probed member of a pool would initialize
    vdev tree for the pool.  Now, imagine a situation when a machine has a
    disk that has been removed from the pool, but the ZFS label was not
    erased.  That's a typical scenario - disk goes offline, it is replaced
    with a spare, no data changes written to the gone disk.  Then, disk
    appears back at boot time and it is the first one to be probed by the
    loader.  It has the same pool GUID as all other members and naive loader
    would not see a conflict.  Then the disk will be used as source of truth
    to read the bootenv.
    
    To fix that, provide vdev_free() that allows to rollback the already
    prebuilt vdev tree, so that a new one can be built from scratch.  Upon
    encountering a newer configuration for already known top level part of a
    pool, call vdev_free() and let vdev_probe() to build a new one.
    
    The change has been tested with loader_lua and userboot.so, but it should
    have same effect on the legacy boot1 loader.
    
    Reviewed by:            tsoome, mav, imp
    Differential Revision:  https://reviews.freebsd.org/D51219
---
 stand/libsa/zfs/zfsimpl.c   | 52 +++++++++++++++++++++++++++++++++++++--------
 sys/cddl/boot/zfs/zfsimpl.h |  2 +-
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/stand/libsa/zfs/zfsimpl.c b/stand/libsa/zfs/zfsimpl.c
index c7dc72561eff..971d71d098d3 100644
--- a/stand/libsa/zfs/zfsimpl.c
+++ b/stand/libsa/zfs/zfsimpl.c
@@ -1106,7 +1106,8 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
 }
 
 static int
-vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
+vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
+    const nvlist_t *nvlist)
 {
        vdev_t *top_vdev, *vdev;
        nvlist_t **kids = NULL;
@@ -1120,6 +1121,7 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const 
nvlist_t *nvlist)
                        return (rc);
                top_vdev->v_spa = spa;
                top_vdev->v_top = top_vdev;
+               top_vdev->v_txg = txg;
                vdev_insert(spa->spa_root_vdev, top_vdev);
        }
 
@@ -1163,7 +1165,7 @@ done:
 static int
 vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
 {
-       uint64_t pool_guid, top_guid;
+       uint64_t pool_guid, top_guid, txg;
        nvlist_t *vdevs;
        int rc;
 
@@ -1171,13 +1173,15 @@ vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
            NULL, &pool_guid, NULL) ||
            nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
            NULL, &top_guid, NULL) ||
+           nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
+           NULL, &txg, NULL) != 0 ||
            nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
            NULL, &vdevs, NULL)) {
                printf("ZFS: can't find vdev details\n");
                return (ENOENT);
        }
 
-       rc = vdev_from_nvlist(spa, top_guid, vdevs);
+       rc = vdev_from_nvlist(spa, top_guid, txg, vdevs);
        nvlist_destroy(vdevs);
        return (rc);
 }
@@ -1267,6 +1271,21 @@ vdev_update_from_nvlist(uint64_t top_guid, const 
nvlist_t *nvlist)
        return (rc);
 }
 
+/*
+ * Shall not be called on root vdev, that is not linked into zfs_vdevs.
+ * See comment in vdev_create().
+ */
+static void
+vdev_free(struct vdev *vdev)
+{
+       struct vdev *kid, *safe;
+
+       STAILQ_FOREACH_SAFE(kid, &vdev->v_children, v_childlink, safe)
+               vdev_free(kid);
+       STAILQ_REMOVE(&zfs_vdevs, vdev, vdev, v_alllink);
+       free(vdev);
+}
+
 static int
 vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
 {
@@ -1313,9 +1332,10 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
                vdev = vdev_find(guid);
                /*
                 * Top level vdev is missing, create it.
+                * XXXGL: how can this happen?
                 */
                if (vdev == NULL)
-                       rc = vdev_from_nvlist(spa, guid, kids[i]);
+                       rc = vdev_from_nvlist(spa, guid, 0, kids[i]);
                else
                        rc = vdev_update_from_nvlist(guid, kids[i]);
                if (rc != 0)
@@ -2006,8 +2026,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t 
*_write, void *priv,
        vdev_t *vdev;
        nvlist_t *nvl;
        uint64_t val;
-       uint64_t guid;
-       uint64_t pool_txg, pool_guid;
+       uint64_t guid, pool_guid, top_guid, txg;
        const char *pool_name;
        int rc, namelen;
 
@@ -2063,7 +2082,9 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t 
*_write, void *priv,
        }
 
        if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
-           NULL, &pool_txg, NULL) != 0 ||
+           NULL, &txg, NULL) != 0 ||
+           nvlist_find(nvl, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
+           NULL, &top_guid, NULL) != 0 ||
            nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
            NULL, &pool_guid, NULL) != 0 ||
            nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
@@ -2098,9 +2119,22 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t 
*_write, void *priv,
                        nvlist_destroy(nvl);
                        return (ENOMEM);
                }
+       } else {
+               struct vdev *kid;
+
+               STAILQ_FOREACH(kid, &spa->spa_root_vdev->v_children,
+                   v_childlink)
+                       if (kid->v_guid == top_guid && kid->v_txg < txg) {
+                               printf("ZFS: pool %s vdev %s ignoring stale "
+                                   "label from txg 0x%jx, using 0x%jx@0x%jx\n",
+                                   spa->spa_name, kid->v_name,
+                                   kid->v_txg, guid, txg);
+                               STAILQ_REMOVE(&spa->spa_root_vdev->v_children,
+                                   kid, vdev, v_childlink);
+                               vdev_free(kid);
+                               break;
+                       }
        }
-       if (pool_txg > spa->spa_txg)
-               spa->spa_txg = pool_txg;
 
        /*
         * Get the vdev tree and create our in-core copy of it.
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index 0ce38384abbf..83d964360343 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -2019,6 +2019,7 @@ typedef struct vdev {
        vdev_list_t     v_children;     /* children of this vdev */
        const char      *v_name;        /* vdev name */
        uint64_t        v_guid;         /* vdev guid */
+       uint64_t        v_txg;          /* most recent transaction */
        uint64_t        v_id;           /* index in parent */
        uint64_t        v_psize;        /* physical device capacity */
        int             v_ashift;       /* offset to block shift */
@@ -2048,7 +2049,6 @@ typedef struct spa {
        STAILQ_ENTRY(spa) spa_link;     /* link in global pool list */
        char            *spa_name;      /* pool name */
        uint64_t        spa_guid;       /* pool guid */
-       uint64_t        spa_txg;        /* most recent transaction */
        struct uberblock *spa_uberblock;        /* best uberblock so far */
        vdev_t          *spa_root_vdev; /* toplevel vdev container */
        objset_phys_t   *spa_mos;       /* MOS for this pool */

Reply via email to