Author: tsoome
Date: Sun Nov  3 21:19:52 2019
New Revision: 354323
URL: https://svnweb.freebsd.org/changeset/base/354323

Log:
  loader: factor out label and uberblock load from vdev_probe, add MMP checks
  
  Clean up the label read.

Modified:
  head/stand/libsa/zfs/zfsimpl.c
  head/sys/cddl/boot/zfs/zfsimpl.h

Modified: head/stand/libsa/zfs/zfsimpl.c
==============================================================================
--- head/stand/libsa/zfs/zfsimpl.c      Sun Nov  3 21:17:50 2019        (r354322)
+++ head/stand/libsa/zfs/zfsimpl.c      Sun Nov  3 21:19:52 2019        (r354323)
@@ -1549,71 +1549,104 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offs
 }
 
 static int
-vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
 {
-       vdev_t vtmp;
-       vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
-       vdev_phys_t *tmp_label;
-       spa_t *spa;
-       vdev_t *vdev, *top_vdev, *pool_vdev;
-       off_t off;
+       unsigned int seq1 = 0;
+       unsigned int seq2 = 0;
+       int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+
+       if (cmp != 0)
+               return (cmp);
+
+       cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+       if (cmp != 0)
+               return (cmp);
+
+       if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+               seq1 = MMP_SEQ(ub1);
+
+       if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+               seq2 = MMP_SEQ(ub2);
+
+       return (AVL_CMP(seq1, seq2));
+}
+
+static int
+uberblock_verify(uberblock_t *ub)
+{
+       if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
+               byteswap_uint64_array(ub, sizeof (uberblock_t));
+       }
+
+       if (ub->ub_magic != UBERBLOCK_MAGIC ||
+           !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
+               return (EINVAL);
+
+       return (0);
+}
+
+static int
+vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
+    size_t size)
+{
        blkptr_t bp;
-       const unsigned char *nvlist = NULL;
-       uint64_t val;
-       uint64_t guid;
-       uint64_t best_txg = 0;
-       uint64_t pool_txg, pool_guid;
-       const char *pool_name;
-       const unsigned char *vdevs;
-       const unsigned char *features;
-       int i, l, rc, is_newer;
-       char *upbuf;
-       const struct uberblock *up;
+       off_t off;
 
-       /*
-        * Load the vdev label and figure out which
-        * uberblock is most current.
-        */
-       memset(&vtmp, 0, sizeof(vtmp));
-       vtmp.v_phys_read = _read;
-       vtmp.v_read_priv = read_priv;
-       vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
-           (uint64_t)sizeof (vdev_label_t));
+       off = vdev_label_offset(vd->v_psize, l, offset);
 
-       /* Test for minimum pool size. */
-       if (vtmp.v_psize < SPA_MINDEVSIZE)
-               return (EIO);
+       BP_ZERO(&bp);
+       BP_SET_LSIZE(&bp, size);
+       BP_SET_PSIZE(&bp, size);
+       BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+       BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+       DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
+       ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
 
-       tmp_label = zfs_alloc(sizeof(vdev_phys_t));
+       return (vdev_read_phys(vd, &bp, buf, off, size));
+}
 
-       for (l = 0; l < VDEV_LABELS; l++) {
-               off = vdev_label_offset(vtmp.v_psize, l,
-                   offsetof(vdev_label_t, vl_vdev_phys));
+static unsigned char *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+       vdev_phys_t *label;
+       uint64_t best_txg = 0;
+       uint64_t label_txg = 0;
+       uint64_t asize;
+       unsigned char *nvl;
+       size_t nvl_size;
+       int error;
 
-               BP_ZERO(&bp);
-               BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
-               BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
-               BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
-               BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
-               DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
-               ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+       label = malloc(sizeof (vdev_phys_t));
+       if (label == NULL)
+               return (NULL);
 
-               if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
-                       continue;
+       nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
+       nvl = malloc(nvl_size);
+       if (nvl == NULL) {
+               free(label);
+               return (NULL);
+       }
 
-               if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
+       for (int l = 0; l < VDEV_LABELS; l++) {
+               const unsigned char *nvlist;
+
+               if (vdev_label_read(vd, l, label,
+                   offsetof(vdev_label_t, vl_vdev_phys),
+                   sizeof (vdev_phys_t)))
                        continue;
 
-               nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
-               if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
-                   DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
+               if (label->vp_nvlist[0] != NV_ENCODE_XDR)
                        continue;
 
-               if (best_txg <= pool_txg) {
-                       uint64_t asize;
+               nvlist = (const unsigned char *) label->vp_nvlist + 4;
+               error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
+                   DATA_TYPE_UINT64, NULL, &label_txg);
+               if (error != 0 || label_txg == 0)
+                       return (nvl);
 
-                       best_txg = pool_txg;
-                       memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
+               if (label_txg <= txg && label_txg > best_txg) {
+                       best_txg = label_txg;
+                       memcpy(nvl, nvlist, nvl_size);
 
                        /*
                         * Use asize from pool config. We need this
@@ -1621,30 +1654,87 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
                         */
                        if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
                            DATA_TYPE_UINT64, NULL, &asize) == 0) {
-                               vtmp.v_psize = asize +
+                               vd->v_psize = asize +
                                    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
                        }
                }
        }
 
-       zfs_free(tmp_label, sizeof (vdev_phys_t));
+       if (best_txg == 0) {
+               free(nvl);
+               nvl = NULL;
+       }
+       return (nvl);
+}
 
-       if (best_txg == 0)
+static void
+vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
+{
+       uberblock_t *buf;
+
+       buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
+       if (buf == NULL)
+               return;
+
+       for (int l = 0; l < VDEV_LABELS; l++) {
+               for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+                       if (vdev_label_read(vd, l, buf,
+                           VDEV_UBERBLOCK_OFFSET(vd, n),
+                           VDEV_UBERBLOCK_SIZE(vd)))
+                               continue;
+                       if (uberblock_verify(buf) != 0)
+                               continue;
+
+                       if (vdev_uberblock_compare(buf, ub) > 0)
+                               *ub = *buf;
+               }
+       }
+       free(buf);
+}
+
+static int
+vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
+{
+       vdev_t vtmp;
+       spa_t *spa;
+       vdev_t *vdev, *top_vdev, *pool_vdev;
+       unsigned char *nvlist;
+       uint64_t val;
+       uint64_t guid;
+       uint64_t pool_txg, pool_guid;
+       const char *pool_name;
+       const unsigned char *vdevs;
+       const unsigned char *features;
+       int rc, is_newer;
+
+       /*
+        * Load the vdev label and figure out which
+        * uberblock is most current.
+        */
+       memset(&vtmp, 0, sizeof(vtmp));
+       vtmp.v_phys_read = _read;
+       vtmp.v_read_priv = read_priv;
+       vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
+           (uint64_t)sizeof (vdev_label_t));
+
+       /* Test for minimum device size. */
+       if (vtmp.v_psize < SPA_MINDEVSIZE)
                return (EIO);
 
-       if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
+       nvlist = vdev_label_read_config(&vtmp, UINT64_MAX);
+       if (nvlist == NULL)
                return (EIO);
 
-       nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
-
        if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
            NULL, &val) != 0) {
+               free(nvlist);
                return (EIO);
        }
 
        if (!SPA_VERSION_IS_SUPPORTED(val)) {
                printf("ZFS: unsupported ZFS version %u (should be %u)\n",
                    (unsigned) val, (unsigned) SPA_VERSION);
+               free(nvlist);
                return (EIO);
        }
 
@@ -1652,16 +1742,19 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
        if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
            DATA_TYPE_NVLIST, NULL, &features) == 0 &&
            nvlist_check_features_for_read(features) != 0) {
+               free(nvlist);
                return (EIO);
        }
 
        if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
            NULL, &val) != 0) {
+               free(nvlist);
                return (EIO);
        }
 
        if (val == POOL_STATE_DESTROYED) {
                /* We don't boot only from destroyed pools. */
+               free(nvlist);
                return (EIO);
        }
 
@@ -1675,12 +1768,13 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
                 * Cache and spare devices end up here - just ignore
                 * them.
                 */
-               /*printf("ZFS: can't find pool details\n");*/
+               free(nvlist);
                return (EIO);
        }
 
        if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
            NULL, &val) == 0 && val != 0) {
+               free(nvlist);
                return (EIO);
        }
 
@@ -1690,8 +1784,10 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
        spa = spa_find_by_guid(pool_guid);
        if (spa == NULL) {
                spa = spa_create(pool_guid, pool_name);
-               if (spa == NULL)
+               if (spa == NULL) {
+                       free(nvlist);
                        return (ENOMEM);
+               }
        }
        if (pool_txg > spa->spa_txg) {
                spa->spa_txg = pool_txg;
@@ -1708,18 +1804,24 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
         */
        if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
            NULL, &guid) != 0) {
+               free(nvlist);
                return (EIO);
        }
        vdev = vdev_find(guid);
-       if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
+       /* Has this vdev already been inited? */
+       if (vdev && vdev->v_phys_read) {
+               free(nvlist);
                return (EIO);
+       }
 
        if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
            NULL, &vdevs)) {
+               free(nvlist);
                return (EIO);
        }
 
        rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
+       free(nvlist);
        if (rc != 0)
                return (rc);
 
@@ -1729,6 +1831,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
        STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
                if (top_vdev == pool_vdev)
                        break;
+
        if (!pool_vdev && top_vdev) {
                top_vdev->spa = spa;
                STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
@@ -1765,36 +1868,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
         * the best uberblock and then we can actually access
         * the contents of the pool.
         */
-       upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
-       up = (const struct uberblock *)upbuf;
-       for (l = 0; l < VDEV_LABELS; l++) {
-               for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
-                       off = vdev_label_offset(vdev->v_psize, l,
-                           VDEV_UBERBLOCK_OFFSET(vdev, i));
-                       BP_ZERO(&bp);
-                       DVA_SET_OFFSET(&bp.blk_dva[0], off);
-                       BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
-                       BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
-                       BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
-                       BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
-                       ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
-
-                       if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
-                               continue;
-
-                       if (up->ub_magic != UBERBLOCK_MAGIC)
-                               continue;
-                       if (up->ub_txg < spa->spa_txg)
-                               continue;
-                       if (up->ub_txg > spa->spa_uberblock.ub_txg ||
-                           (up->ub_txg == spa->spa_uberblock.ub_txg &&
-                           up->ub_timestamp >
-                           spa->spa_uberblock.ub_timestamp)) {
-                               spa->spa_uberblock = *up;
-                       }
-               }
-       }
-       zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
+       vdev_uberblock_load(vdev, &spa->spa_uberblock);
 
        vdev->spa = spa;
        if (spap != NULL)

Modified: head/sys/cddl/boot/zfs/zfsimpl.h
==============================================================================
--- head/sys/cddl/boot/zfs/zfsimpl.h    Sun Nov  3 21:17:50 2019        (r354322)
+++ head/sys/cddl/boot/zfs/zfsimpl.h    Sun Nov  3 21:19:52 2019        (r354323)
@@ -63,6 +63,14 @@
 
 #define _NOTE(s)
 
+/*
+ * AVL comparator helpers
+ */
+#define        AVL_ISIGN(a)    (((a) > 0) - ((a) < 0))
+#define        AVL_CMP(a, b)   (((a) > (b)) - ((a) < (b)))
+#define        AVL_PCMP(a, b)  \
+       (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
 typedef enum { B_FALSE, B_TRUE } boolean_t;
 
 /* CRC64 table */
@@ -490,8 +498,16 @@ typedef struct zio_gbh {
 #define        VDEV_PHYS_SIZE          (112 << 10)
 #define        VDEV_UBERBLOCK_RING     (128 << 10)
 
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define        MMP_BLOCKS_PER_LABEL    1
+
+/* The largest uberblock we support is 8k. */
+#define        MAX_UBERBLOCK_SHIFT     (13)
 #define        VDEV_UBERBLOCK_SHIFT(vd)        \
-       MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT)
+       MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT)
 #define        VDEV_UBERBLOCK_COUNT(vd)        \
        (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
 #define        VDEV_UBERBLOCK_OFFSET(vd, n)    \
@@ -841,14 +857,88 @@ typedef enum pool_state {
#define        UBERBLOCK_MAGIC         0x00bab10c              /* oo-ba-bloc!  */
#define        UBERBLOCK_SHIFT         10                      /* up to 1K     */
 
-struct uberblock {
+#define        MMP_MAGIC               0xa11cea11              /* all-see-all  */
+
+#define        MMP_INTERVAL_VALID_BIT  0x01
+#define        MMP_SEQ_VALID_BIT       0x02
+#define        MMP_FAIL_INT_VALID_BIT  0x04
+
+#define        MMP_VALID(ubp)          (ubp->ub_magic == UBERBLOCK_MAGIC && \
+                                   ubp->ub_mmp_magic == MMP_MAGIC)
+#define        MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+                                   MMP_INTERVAL_VALID_BIT))
+#define        MMP_SEQ_VALID(ubp)      (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+                                   MMP_SEQ_VALID_BIT))
+#define        MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+                                   MMP_FAIL_INT_VALID_BIT))
+
+#define        MMP_INTERVAL(ubp)       ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+                                   >> 8)
+#define        MMP_SEQ(ubp)            ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+                                   >> 32)
+#define        MMP_FAIL_INT(ubp)       ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+                                   >> 48)
+
+typedef struct uberblock {
        uint64_t        ub_magic;       /* UBERBLOCK_MAGIC              */
        uint64_t        ub_version;     /* SPA_VERSION                  */
        uint64_t        ub_txg;         /* txg of last sync             */
        uint64_t        ub_guid_sum;    /* sum of all vdev guids        */
        uint64_t        ub_timestamp;   /* UTC time of last sync        */
        blkptr_t        ub_rootbp;      /* MOS objset_phys_t            */
-};
+       /* highest SPA_VERSION supported by software that wrote this txg */
+       uint64_t        ub_software_version;
+       /* Maybe missing in uberblocks we read, but always written */
+       uint64_t        ub_mmp_magic;
+       /*
+        * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
+        * Otherwise, nanosec since last MMP write.
+        */
+       uint64_t        ub_mmp_delay;
+
+       /*
+        * The ub_mmp_config contains the multihost write interval, multihost
+        * fail intervals, sequence number for sub-second granularity, and
+        * valid bit mask.  This layout is as follows:
+        *
+        *   64      56      48      40      32      24      16      8       0
+        *   +-------+-------+-------+-------+-------+-------+-------+-------+
+        * 0 | Fail Intervals|      Seq      |   Write Interval (ms) | VALID |
+        *   +-------+-------+-------+-------+-------+-------+-------+-------+
+        *
+        * This allows a write_interval of (2^24/1000)s, over 4.5 hours
+        *
+        * VALID Bits:
+        * - 0x01 - Write Interval (ms)
+        * - 0x02 - Sequence number exists
+        * - 0x04 - Fail Intervals
+        * - 0xf8 - Reserved
+        */
+       uint64_t        ub_mmp_config;
+
+       /*
+        * ub_checkpoint_txg indicates two things about the current uberblock:
+        *
+        * 1] If it is not zero then this uberblock is a checkpoint. If it is
+        *    zero, then this uberblock is not a checkpoint.
+        *
+        * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+        *    the ub_txg that the uberblock had at the time we moved it to
+        *    the MOS config.
+        *
+        * The field is set when we checkpoint the uberblock and continues to
+        * hold that value even after we've rewound (unlike the ub_txg that
+        * is reset to a higher value).
+        *
+        * Besides checks used to determine whether we are reopening the
+        * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+        * the value of the field is used to determine which ZIL blocks have
+        * been allocated according to the ms_sm when we are rewinding to a
+        * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+        * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+        */
+       uint64_t        ub_checkpoint_txg;
+} uberblock_t;
 
 /*
  * Flags.
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to