svn commit: r354323 - in head: stand/libsa/zfs sys/cddl/boot/zfs
Toomas Soome
tsoome at FreeBSD.org
Sun Nov 3 21:19:52 UTC 2019
Author: tsoome
Date: Sun Nov 3 21:19:52 2019
New Revision: 354323
URL: https://svnweb.freebsd.org/changeset/base/354323
Log:
loader: factor out label and uberblock load from vdev_probe, add MMP checks
Clean up the label read.
Modified:
head/stand/libsa/zfs/zfsimpl.c
head/sys/cddl/boot/zfs/zfsimpl.h
Modified: head/stand/libsa/zfs/zfsimpl.c
==============================================================================
--- head/stand/libsa/zfs/zfsimpl.c Sun Nov 3 21:17:50 2019 (r354322)
+++ head/stand/libsa/zfs/zfsimpl.c Sun Nov 3 21:19:52 2019 (r354323)
@@ -1549,71 +1549,104 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offs
}
static int
-vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
- vdev_t vtmp;
- vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
- vdev_phys_t *tmp_label;
- spa_t *spa;
- vdev_t *vdev, *top_vdev, *pool_vdev;
- off_t off;
+ unsigned int seq1 = 0;
+ unsigned int seq2 = 0;
+ int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+
+ if (cmp != 0)
+ return (cmp);
+
+ cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+ if (cmp != 0)
+ return (cmp);
+
+ if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+ seq1 = MMP_SEQ(ub1);
+
+ if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+ seq2 = MMP_SEQ(ub2);
+
+ return (AVL_CMP(seq1, seq2));
+}
+
+static int
+uberblock_verify(uberblock_t *ub)
+{
+ if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
+ byteswap_uint64_array(ub, sizeof (uberblock_t));
+ }
+
+ if (ub->ub_magic != UBERBLOCK_MAGIC ||
+ !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
+ return (EINVAL);
+
+ return (0);
+}
+
+static int
+vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
+ size_t size)
+{
blkptr_t bp;
- const unsigned char *nvlist = NULL;
- uint64_t val;
- uint64_t guid;
- uint64_t best_txg = 0;
- uint64_t pool_txg, pool_guid;
- const char *pool_name;
- const unsigned char *vdevs;
- const unsigned char *features;
- int i, l, rc, is_newer;
- char *upbuf;
- const struct uberblock *up;
+ off_t off;
- /*
- * Load the vdev label and figure out which
- * uberblock is most current.
- */
- memset(&vtmp, 0, sizeof(vtmp));
- vtmp.v_phys_read = _read;
- vtmp.v_read_priv = read_priv;
- vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
- (uint64_t)sizeof (vdev_label_t));
+ off = vdev_label_offset(vd->v_psize, l, offset);
- /* Test for minimum pool size. */
- if (vtmp.v_psize < SPA_MINDEVSIZE)
- return (EIO);
+ BP_ZERO(&bp);
+ BP_SET_LSIZE(&bp, size);
+ BP_SET_PSIZE(&bp, size);
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
+ ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
- tmp_label = zfs_alloc(sizeof(vdev_phys_t));
+ return (vdev_read_phys(vd, &bp, buf, off, size));
+}
- for (l = 0; l < VDEV_LABELS; l++) {
- off = vdev_label_offset(vtmp.v_psize, l,
- offsetof(vdev_label_t, vl_vdev_phys));
+static unsigned char *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+ vdev_phys_t *label;
+ uint64_t best_txg = 0;
+ uint64_t label_txg = 0;
+ uint64_t asize;
+ unsigned char *nvl;
+ size_t nvl_size;
+ int error;
- BP_ZERO(&bp);
- BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
- BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
- BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
- BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
- DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
- ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+ label = malloc(sizeof (vdev_phys_t));
+ if (label == NULL)
+ return (NULL);
- if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
- continue;
+ nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
+ nvl = malloc(nvl_size);
+ if (nvl == NULL) {
+ free(label);
+ return (NULL);
+ }
- if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ const unsigned char *nvlist;
+
+ if (vdev_label_read(vd, l, label,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t)))
continue;
- nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
- if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
- DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
+ if (label->vp_nvlist[0] != NV_ENCODE_XDR)
continue;
- if (best_txg <= pool_txg) {
- uint64_t asize;
+ nvlist = (const unsigned char *) label->vp_nvlist + 4;
+ error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
+ DATA_TYPE_UINT64, NULL, &label_txg);
+ if (error != 0 || label_txg == 0)
+ return (nvl);
- best_txg = pool_txg;
- memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
+ if (label_txg <= txg && label_txg > best_txg) {
+ best_txg = label_txg;
+ memcpy(nvl, nvlist, nvl_size);
/*
* Use asize from pool config. We need this
@@ -1621,30 +1654,87 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
*/
if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
DATA_TYPE_UINT64, NULL, &asize) == 0) {
- vtmp.v_psize = asize +
+ vd->v_psize = asize +
VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
}
}
}
- zfs_free(tmp_label, sizeof (vdev_phys_t));
+ if (best_txg == 0) {
+ free(nvl);
+ nvl = NULL;
+ }
+ return (nvl);
+}
- if (best_txg == 0)
+static void
+vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
+{
+ uberblock_t *buf;
+
+ buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
+ if (buf == NULL)
+ return;
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ if (vdev_label_read(vd, l, buf,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd)))
+ continue;
+ if (uberblock_verify(buf) != 0)
+ continue;
+
+ if (vdev_uberblock_compare(buf, ub) > 0)
+ *ub = *buf;
+ }
+ }
+ free(buf);
+}
+
+static int
+vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
+{
+ vdev_t vtmp;
+ spa_t *spa;
+ vdev_t *vdev, *top_vdev, *pool_vdev;
+ unsigned char *nvlist;
+ uint64_t val;
+ uint64_t guid;
+ uint64_t pool_txg, pool_guid;
+ const char *pool_name;
+ const unsigned char *vdevs;
+ const unsigned char *features;
+ int rc, is_newer;
+
+ /*
+ * Load the vdev label and figure out which
+ * uberblock is most current.
+ */
+ memset(&vtmp, 0, sizeof(vtmp));
+ vtmp.v_phys_read = _read;
+ vtmp.v_read_priv = read_priv;
+ vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
+ (uint64_t)sizeof (vdev_label_t));
+
+ /* Test for minimum device size. */
+ if (vtmp.v_psize < SPA_MINDEVSIZE)
return (EIO);
- if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
+ nvlist = vdev_label_read_config(&vtmp, UINT64_MAX);
+ if (nvlist == NULL)
return (EIO);
- nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
-
if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
NULL, &val) != 0) {
+ free(nvlist);
return (EIO);
}
if (!SPA_VERSION_IS_SUPPORTED(val)) {
printf("ZFS: unsupported ZFS version %u (should be %u)\n",
(unsigned) val, (unsigned) SPA_VERSION);
+ free(nvlist);
return (EIO);
}
@@ -1652,16 +1742,19 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
DATA_TYPE_NVLIST, NULL, &features) == 0 &&
nvlist_check_features_for_read(features) != 0) {
+ free(nvlist);
return (EIO);
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
NULL, &val) != 0) {
+ free(nvlist);
return (EIO);
}
if (val == POOL_STATE_DESTROYED) {
/* We don't boot only from destroyed pools. */
+ free(nvlist);
return (EIO);
}
@@ -1675,12 +1768,13 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
* Cache and spare devices end up here - just ignore
* them.
*/
- /*printf("ZFS: can't find pool details\n");*/
+ free(nvlist);
return (EIO);
}
if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
NULL, &val) == 0 && val != 0) {
+ free(nvlist);
return (EIO);
}
@@ -1690,8 +1784,10 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
spa = spa_find_by_guid(pool_guid);
if (spa == NULL) {
spa = spa_create(pool_guid, pool_name);
- if (spa == NULL)
+ if (spa == NULL) {
+ free(nvlist);
return (ENOMEM);
+ }
}
if (pool_txg > spa->spa_txg) {
spa->spa_txg = pool_txg;
@@ -1708,18 +1804,24 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
*/
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &guid) != 0) {
+ free(nvlist);
return (EIO);
}
vdev = vdev_find(guid);
- if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
+ /* Has this vdev already been inited? */
+ if (vdev && vdev->v_phys_read) {
+ free(nvlist);
return (EIO);
+ }
if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
NULL, &vdevs)) {
+ free(nvlist);
return (EIO);
}
rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
+ free(nvlist);
if (rc != 0)
return (rc);
@@ -1729,6 +1831,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
if (top_vdev == pool_vdev)
break;
+
if (!pool_vdev && top_vdev) {
top_vdev->spa = spa;
STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
@@ -1765,36 +1868,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
* the best uberblock and then we can actually access
* the contents of the pool.
*/
- upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
- up = (const struct uberblock *)upbuf;
- for (l = 0; l < VDEV_LABELS; l++) {
- for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
- off = vdev_label_offset(vdev->v_psize, l,
- VDEV_UBERBLOCK_OFFSET(vdev, i));
- BP_ZERO(&bp);
- DVA_SET_OFFSET(&bp.blk_dva[0], off);
- BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
- BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
- BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
- BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
- ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
-
- if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
- continue;
-
- if (up->ub_magic != UBERBLOCK_MAGIC)
- continue;
- if (up->ub_txg < spa->spa_txg)
- continue;
- if (up->ub_txg > spa->spa_uberblock.ub_txg ||
- (up->ub_txg == spa->spa_uberblock.ub_txg &&
- up->ub_timestamp >
- spa->spa_uberblock.ub_timestamp)) {
- spa->spa_uberblock = *up;
- }
- }
- }
- zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
+ vdev_uberblock_load(vdev, &spa->spa_uberblock);
vdev->spa = spa;
if (spap != NULL)
Modified: head/sys/cddl/boot/zfs/zfsimpl.h
==============================================================================
--- head/sys/cddl/boot/zfs/zfsimpl.h Sun Nov 3 21:17:50 2019 (r354322)
+++ head/sys/cddl/boot/zfs/zfsimpl.h Sun Nov 3 21:19:52 2019 (r354323)
@@ -63,6 +63,14 @@
#define _NOTE(s)
+/*
+ * AVL comparator helpers
+ */
+#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define AVL_PCMP(a, b) \
+ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
typedef enum { B_FALSE, B_TRUE } boolean_t;
/* CRC64 table */
@@ -490,8 +498,16 @@ typedef struct zio_gbh {
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define MMP_BLOCKS_PER_LABEL 1
+
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT (13)
#define VDEV_UBERBLOCK_SHIFT(vd) \
- MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT)
+ MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \
@@ -841,14 +857,88 @@ typedef enum pool_state {
#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */
#define UBERBLOCK_SHIFT 10 /* up to 1K */
-struct uberblock {
+#define MMP_MAGIC 0xa11cea11 /* all-see-all */
+
+#define MMP_INTERVAL_VALID_BIT 0x01
+#define MMP_SEQ_VALID_BIT 0x02
+#define MMP_FAIL_INT_VALID_BIT 0x04
+
+#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
+ ubp->ub_mmp_magic == MMP_MAGIC)
+#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_INTERVAL_VALID_BIT))
+#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_SEQ_VALID_BIT))
+#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+ MMP_FAIL_INT_VALID_BIT))
+
+#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+ >> 8)
+#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+ >> 32)
+#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+ >> 48)
+
+typedef struct uberblock {
uint64_t ub_magic; /* UBERBLOCK_MAGIC */
uint64_t ub_version; /* SPA_VERSION */
uint64_t ub_txg; /* txg of last sync */
uint64_t ub_guid_sum; /* sum of all vdev guids */
uint64_t ub_timestamp; /* UTC time of last sync */
blkptr_t ub_rootbp; /* MOS objset_phys_t */
-};
+ /* highest SPA_VERSION supported by software that wrote this txg */
+ uint64_t ub_software_version;
+ /* Maybe missing in uberblocks we read, but always written */
+ uint64_t ub_mmp_magic;
+ /*
+ * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
+ * Otherwise, nanosec since last MMP write.
+ */
+ uint64_t ub_mmp_delay;
+
+ /*
+ * The ub_mmp_config contains the multihost write interval, multihost
+ * fail intervals, sequence number for sub-second granularity, and
+ * valid bit mask. This layout is as follows:
+ *
+ *     64      56      48      40      32      24      16      8       0
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ *  0  | Fail Intervals|      Seq      |   Write Interval (ms) | VALID |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * This allows a write_interval of (2^24/1000)s, over 4.5 hours
+ *
+ * VALID Bits:
+ * - 0x01 - Write Interval (ms)
+ * - 0x02 - Sequence number exists
+ * - 0x04 - Fail Intervals
+ * - 0xf8 - Reserved
+ */
+ uint64_t ub_mmp_config;
+
+ /*
+ * ub_checkpoint_txg indicates two things about the current uberblock:
+ *
+ * 1] If it is not zero then this uberblock is a checkpoint. If it is
+ * zero, then this uberblock is not a checkpoint.
+ *
+ * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+ * the ub_txg that the uberblock had at the time we moved it to
+ * the MOS config.
+ *
+ * The field is set when we checkpoint the uberblock and continues to
+ * hold that value even after we've rewound (unlike the ub_txg that
+ * is reset to a higher value).
+ *
+ * Besides checks used to determine whether we are reopening the
+ * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+ * the value of the field is used to determine which ZIL blocks have
+ * been allocated according to the ms_sm when we are rewinding to a
+ * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+ * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+ */
+ uint64_t ub_checkpoint_txg;
+} uberblock_t;
/*
* Flags.
More information about the svn-src-all
mailing list