The branch main has been updated by glebius: URL: https://cgit.FreeBSD.org/src/commit/?id=6dd0803ffd31c60a84488d06928813353c6303d3
commit 6dd0803ffd31c60a84488d06928813353c6303d3 Author: Gleb Smirnoff <gleb...@freebsd.org> AuthorDate: 2025-07-16 18:15:12 +0000 Commit: Gleb Smirnoff <gleb...@freebsd.org> CommitDate: 2025-07-16 18:15:54 +0000 libsa/zfs: prefer a vdev with newer txg to a stale one Before this change, the first probed member of a pool would initialize vdev tree for the pool. Now, imagine a situation when a machine has a disk that has been removed from the pool, but the ZFS label was not erased. That's a typical scenario - disk goes offline, it is replaced with a spare, no data changes written to the gone disk. Then, disk appears back at boot time and it is the first one to be probed by the loader. It has the same pool GUID as all other members and naive loader would not see a conflict. Then the disk will be used as source of truth to read the bootenv. To fix that, provide vdev_free() that allows to rollback the already prebuilt vdev tree, so that a new one can be built from scratch. Upon encountering a newer configuration for already known top level part of a pool, call vdev_free() and let vdev_probe() to build a new one. The change has been tested with loader_lua and userboot.so, but it should have same effect on the legacy boot1 loader. Reviewed by: tsoome, mav, imp Differential Revision: https://reviews.freebsd.org/D51219 --- stand/libsa/zfs/zfsimpl.c | 52 +++++++++++++++++++++++++++++++++++++-------- sys/cddl/boot/zfs/zfsimpl.h | 2 +- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/stand/libsa/zfs/zfsimpl.c b/stand/libsa/zfs/zfsimpl.c index c7dc72561eff..971d71d098d3 100644 --- a/stand/libsa/zfs/zfsimpl.c +++ b/stand/libsa/zfs/zfsimpl.c @@ -1106,7 +1106,8 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev) } static int -vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist) +vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg, + const nvlist_t *nvlist) { vdev_t *top_vdev, *vdev; nvlist_t **kids = NULL; @@ -1120,6 +1121,7 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist) return (rc); top_vdev->v_spa = spa; top_vdev->v_top = top_vdev; + top_vdev->v_txg = txg; vdev_insert(spa->spa_root_vdev, top_vdev); } @@ -1163,7 +1165,7 @@ done: static int vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist) { - uint64_t pool_guid, top_guid; + uint64_t pool_guid, top_guid, txg; nvlist_t *vdevs; int rc; @@ -1171,13 +1173,15 @@ vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist) NULL, &pool_guid, NULL) || nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64, NULL, &top_guid, NULL) || + nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, + NULL, &txg, NULL) != 0 || nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, NULL, &vdevs, NULL)) { printf("ZFS: can't find vdev details\n"); return (ENOENT); } - rc = vdev_from_nvlist(spa, top_guid, vdevs); + rc = vdev_from_nvlist(spa, top_guid, txg, vdevs); nvlist_destroy(vdevs); return (rc); } @@ -1267,6 +1271,21 @@ vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist) return (rc); } +/* + * Shall not be called on root vdev, that is not linked into zfs_vdevs. + * See comment in vdev_create(). + */ +static void +vdev_free(struct vdev *vdev) +{ + struct vdev *kid, *safe; + + STAILQ_FOREACH_SAFE(kid, &vdev->v_children, v_childlink, safe) + vdev_free(kid); + STAILQ_REMOVE(&zfs_vdevs, vdev, vdev, v_alllink); + free(vdev); +} + static int vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist) { @@ -1313,9 +1332,10 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist) vdev = vdev_find(guid); /* * Top level vdev is missing, create it. + * XXXGL: how can this happen? */ if (vdev == NULL) - rc = vdev_from_nvlist(spa, guid, kids[i]); + rc = vdev_from_nvlist(spa, guid, 0, kids[i]); else rc = vdev_update_from_nvlist(guid, kids[i]); if (rc != 0) @@ -2006,8 +2026,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv, vdev_t *vdev; nvlist_t *nvl; uint64_t val; - uint64_t guid; - uint64_t pool_txg, pool_guid; + uint64_t guid, pool_guid, top_guid, txg; const char *pool_name; int rc, namelen; @@ -2063,7 +2082,9 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv, } if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, - NULL, &pool_txg, NULL) != 0 || + NULL, &txg, NULL) != 0 || + nvlist_find(nvl, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64, + NULL, &top_guid, NULL) != 0 || nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, NULL, &pool_guid, NULL) != 0 || nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING, @@ -2098,9 +2119,22 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv, nvlist_destroy(nvl); return (ENOMEM); } + } else { + struct vdev *kid; + + STAILQ_FOREACH(kid, &spa->spa_root_vdev->v_children, + v_childlink) + if (kid->v_guid == top_guid && kid->v_txg < txg) { + printf("ZFS: pool %s vdev %s ignoring stale " + "label from txg 0x%jx, using 0x%jx@0x%jx\n", + spa->spa_name, kid->v_name, + kid->v_txg, guid, txg); + STAILQ_REMOVE(&spa->spa_root_vdev->v_children, + kid, vdev, v_childlink); + vdev_free(kid); + break; + } } - if (pool_txg > spa->spa_txg) - spa->spa_txg = pool_txg; /* * Get the vdev tree and create our in-core copy of it. diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h index 0ce38384abbf..83d964360343 100644 --- a/sys/cddl/boot/zfs/zfsimpl.h +++ b/sys/cddl/boot/zfs/zfsimpl.h @@ -2019,6 +2019,7 @@ typedef struct vdev { vdev_list_t v_children; /* children of this vdev */ const char *v_name; /* vdev name */ uint64_t v_guid; /* vdev guid */ + uint64_t v_txg; /* most recent transaction */ uint64_t v_id; /* index in parent */ uint64_t v_psize; /* physical device capacity */ int v_ashift; /* offset to block shift */ @@ -2048,7 +2049,6 @@ typedef struct spa { STAILQ_ENTRY(spa) spa_link; /* link in global pool list */ char *spa_name; /* pool name */ uint64_t spa_guid; /* pool guid */ - uint64_t spa_txg; /* most recent transaction */ struct uberblock *spa_uberblock; /* best uberblock so far */ vdev_t *spa_root_vdev; /* toplevel vdev container */ objset_phys_t *spa_mos; /* MOS for this pool */