svn commit: r354948 - in head: cddl/contrib/opensolaris/cmd/zdb sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Andriy Gapon
avg at FreeBSD.org
Thu Nov 21 13:35:44 UTC 2019
Author: avg
Date: Thu Nov 21 13:35:43 2019
New Revision: 354948
URL: https://svnweb.freebsd.org/changeset/base/354948
Log:
MFV r354383: 10592 misc. metaslab and vdev related ZoL bug fixes
illumos/illumos-gate at 555d674d5d4b8191dc83723188349d28278b2431
https://github.com/illumos/illumos-gate/commit/555d674d5d4b8191dc83723188349d28278b2431
https://www.illumos.org/issues/10592
This is a collection of recent fixes from ZoL:
8eef997679b Error path in metaslab_load_impl() forgets to drop ms_sync_lock
928e8ad47d3 Introduce auxiliary metaslab histograms
425d3237ee8 Get rid of space_map_update() for ms_synced_length
6c926f426a2 Simplify log vdev removal code
21e7cf5da89 zdb -L should skip leak detection altogether
df72b8bebe0 Rename range_tree_verify to range_tree_verify_not_present
75058f33034 Remove unused vdev_t fields
Portions contributed by: Jerry Jelinek <jerry.jelinek at joyent.com>
Author: Serapheim Dimitropoulos <serapheim at delphix.com>
MFC after: 4 weeks
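
For the "Get rid of space_map_update() for ms_synced_length" item above, the zdb.c hunks below show the new caller pattern: the space_map_update() refresh is gone and space_map_iterate() takes an explicit length instead. A minimal sketch of that pattern follows; it is not part of the commit, and the helper and callback names (count_sm_entries*) are made up for illustration:

#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/space_map.h>

/* Hypothetical callback: just count the entries it is handed. */
static int
count_sm_entries_cb(space_map_entry_t *sme, void *arg)
{
	uint64_t *count = arg;

	(*count)++;
	return (0);
}

/* Hypothetical caller: open a space map object and walk its synced entries. */
static uint64_t
count_sm_entries(spa_t *spa, vdev_t *vd, uint64_t sm_obj)
{
	space_map_t *sm = NULL;
	uint64_t count = 0;

	VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sm_obj,
	    0, vd->vdev_asize, vd->vdev_ashift));

	/*
	 * No space_map_update() call any more: iterate up to the length
	 * that is synced on disk right now.
	 */
	VERIFY0(space_map_iterate(sm, space_map_length(sm),
	    count_sm_entries_cb, &count));

	space_map_close(sm);
	return (count);
}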
Modified:
head/cddl/contrib/opensolaris/cmd/zdb/zdb.8
head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
Directory Properties:
head/cddl/contrib/opensolaris/ (props changed)
head/cddl/contrib/opensolaris/cmd/zdb/ (props changed)
head/sys/cddl/contrib/opensolaris/ (props changed)
Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.8
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.8 Thu Nov 21 13:22:23 2019 (r354947)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.8 Thu Nov 21 13:35:43 2019 (r354948)
@@ -10,7 +10,7 @@
.\"
.\"
.\" Copyright 2012, Richard Lowe.
-.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\"
.Dd October 06, 2017
@@ -187,7 +187,7 @@ If the
.Fl u
option is also specified, also display the uberblocks on this device.
.It Fl L
-Disable leak tracing and the loading of space maps.
+Disable leak detection and the loading of space maps.
By default,
.Nm
verifies that all non-free blocks are referenced, which can be very expensive.
Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Thu Nov 21 13:22:23 2019 (r354947)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Thu Nov 21 13:35:43 2019 (r354948)
@@ -785,18 +785,21 @@ dump_spacemap(objset_t *os, space_map_t *sm)
return;
(void) printf("space map object %llu:\n",
- (longlong_t)sm->sm_phys->smp_object);
- (void) printf(" smp_objsize = 0x%llx\n",
- (longlong_t)sm->sm_phys->smp_objsize);
+ (longlong_t)sm->sm_object);
+ (void) printf(" smp_length = 0x%llx\n",
+ (longlong_t)sm->sm_phys->smp_length);
(void) printf(" smp_alloc = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_alloc);
+ if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+ return;
+
/*
* Print out the freelist entries in both encoded and decoded form.
*/
uint8_t mapshift = sm->sm_shift;
int64_t alloc = 0;
- uint64_t word;
+ uint64_t word, entry_id = 0;
for (uint64_t offset = 0; offset < space_map_length(sm);
offset += sizeof (word)) {
@@ -804,11 +807,12 @@ dump_spacemap(objset_t *os, space_map_t *sm)
sizeof (word), &word, DMU_READ_PREFETCH));
if (sm_entry_is_debug(word)) {
- (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
- (u_longlong_t)(offset / sizeof (word)),
+ (void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
+ (u_longlong_t)entry_id,
ddata[SM_DEBUG_ACTION_DECODE(word)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(word),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+ entry_id++;
continue;
}
@@ -846,7 +850,7 @@ dump_spacemap(objset_t *os, space_map_t *sm)
(void) printf("\t [%6llu] %c range:"
" %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
- (u_longlong_t)(offset / sizeof (word)),
+ (u_longlong_t)entry_id,
entry_type, (u_longlong_t)entry_off,
(u_longlong_t)(entry_off + entry_run),
(u_longlong_t)entry_run,
@@ -856,8 +860,9 @@ dump_spacemap(objset_t *os, space_map_t *sm)
alloc += entry_run;
else
alloc -= entry_run;
+ entry_id++;
}
- if ((uint64_t)alloc != space_map_allocated(sm)) {
+ if (alloc != space_map_allocated(sm)) {
(void) printf("space_map_object alloc (%lld) INCONSISTENT "
"with space map summary (%lld)\n",
(longlong_t)space_map_allocated(sm), (longlong_t)alloc);
@@ -921,11 +926,8 @@ dump_metaslab(metaslab_t *msp)
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
- if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
- ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
-
- dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
- }
+ ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+ dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
}
static void
@@ -3140,6 +3142,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
ddt_entry_t dde;
int error;
+ ASSERT(!dump_opt['L']);
+
bzero(&ddb, sizeof (ddb));
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
@@ -3163,12 +3167,10 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
zcb->zcb_dedup_blocks++;
}
}
- if (!dump_opt['L']) {
- ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
- ddt_enter(ddt);
- VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
- ddt_exit(ddt);
- }
+ ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+ ddt_enter(ddt);
+ VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ ddt_exit(ddt);
}
ASSERT(error == ENOENT);
@@ -3210,6 +3212,9 @@ claim_segment_cb(void *arg, uint64_t offset, uint64_t
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
+ if (dump_opt['L'])
+ return;
+
if (spa->spa_vdev_removal == NULL)
return;
@@ -3301,7 +3306,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
space_map_t *prev_obsolete_sm = NULL;
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
- space_map_update(prev_obsolete_sm);
vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
prev_obsolete_sm);
space_map_close(prev_obsolete_sm);
@@ -3395,9 +3399,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
- space_map_update(checkpoint_sm);
VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
checkpoint_sm_exclude_entry_cb, &cseea));
space_map_close(checkpoint_sm);
@@ -3407,6 +3411,8 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_
static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
+ ASSERT(!dump_opt['L']);
+
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
@@ -3503,6 +3509,8 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab
static void
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
{
+ ASSERT(!dump_opt['L']);
+
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
@@ -3549,67 +3557,63 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
zcb->zcb_spa = spa;
- if (!dump_opt['L']) {
- dsl_pool_t *dp = spa->spa_dsl_pool;
- vdev_t *rvd = spa->spa_root_vdev;
+ if (dump_opt['L'])
+ return;
- /*
- * We are going to be changing the meaning of the metaslab's
- * ms_allocatable. Ensure that the allocator doesn't try to
- * use the tree.
- */
- spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
- spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ vdev_t *rvd = spa->spa_root_vdev;
- zcb->zcb_vd_obsolete_counts =
- umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
- UMEM_NOFAIL);
+ /*
+ * We are going to be changing the meaning of the metaslab's
+ * ms_allocatable. Ensure that the allocator doesn't try to
+ * use the tree.
+ */
+ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+ spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
- /*
- * For leak detection, we overload the ms_allocatable trees
- * to contain allocated segments instead of free segments.
- * As a result, we can't use the normal metaslab_load/unload
- * interfaces.
- */
- zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
- load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
+ zcb->zcb_vd_obsolete_counts =
+ umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
+ UMEM_NOFAIL);
- /*
- * On load_concrete_ms_allocatable_trees() we loaded all the
- * allocated entries from the ms_sm to the ms_allocatable for
- * each metaslab. If the pool has a checkpoint or is in the
- * middle of discarding a checkpoint, some of these blocks
- * may have been freed but their ms_sm may not have been
- * updated because they are referenced by the checkpoint. In
- * order to avoid false-positives during leak-detection, we
- * go through the vdev's checkpoint space map and exclude all
- * its entries from their relevant ms_allocatable.
- *
- * We also aggregate the space held by the checkpoint and add
- * it to zcb_checkpoint_size.
- *
- * Note that at this point we are also verifying that all the
- * entries on the checkpoint_sm are marked as allocated in
- * the ms_sm of their relevant metaslab.
- * [see comment in checkpoint_sm_exclude_entry_cb()]
- */
- zdb_leak_init_exclude_checkpoint(spa, zcb);
+ /*
+ * For leak detection, we overload the ms_allocatable trees
+ * to contain allocated segments instead of free segments.
+ * As a result, we can't use the normal metaslab_load/unload
+ * interfaces.
+ */
+ zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+ load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
- /* for cleaner progress output */
- (void) fprintf(stderr, "\n");
+ /*
+ * On load_concrete_ms_allocatable_trees() we loaded all the
+ * allocated entries from the ms_sm to the ms_allocatable for
+ * each metaslab. If the pool has a checkpoint or is in the
+ * middle of discarding a checkpoint, some of these blocks
+ * may have been freed but their ms_sm may not have been
+ * updated because they are referenced by the checkpoint. In
+ * order to avoid false-positives during leak-detection, we
+ * go through the vdev's checkpoint space map and exclude all
+ * its entries from their relevant ms_allocatable.
+ *
+ * We also aggregate the space held by the checkpoint and add
+ * it to zcb_checkpoint_size.
+ *
+ * Note that at this point we are also verifying that all the
+ * entries on the checkpoint_sm are marked as allocated in
+ * the ms_sm of their relevant metaslab.
+ * [see comment in checkpoint_sm_exclude_entry_cb()]
+ */
+ zdb_leak_init_exclude_checkpoint(spa, zcb);
+ ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
- if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
- ASSERT(spa_feature_is_enabled(spa,
- SPA_FEATURE_DEVICE_REMOVAL));
- (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
- increment_indirect_mapping_cb, zcb, NULL);
- }
- } else {
- /*
- * If leak tracing is disabled, we still need to consider
- * any checkpointed space in our space verification.
- */
- zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+
+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_DEVICE_REMOVAL));
+ (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
+ increment_indirect_mapping_cb, zcb, NULL);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -3690,52 +3694,58 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
+ if (dump_opt['L'])
+ return (B_FALSE);
+
boolean_t leaks = B_FALSE;
- if (!dump_opt['L']) {
- vdev_t *rvd = spa->spa_root_vdev;
- for (unsigned c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- metaslab_group_t *mg = vd->vdev_mg;
- if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
- leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
- }
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (unsigned c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+#if DEBUG
+ metaslab_group_t *mg = vd->vdev_mg;
+#endif
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- ASSERT3P(mg, ==, msp->ms_group);
+ if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
+ leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+ }
- /*
- * ms_allocatable has been overloaded
- * to contain allocated segments. Now that
- * we finished traversing all blocks, any
- * block that remains in the ms_allocatable
- * represents an allocated block that we
- * did not claim during the traversal.
- * Claimed blocks would have been removed
- * from the ms_allocatable. For indirect
- * vdevs, space remaining in the tree
- * represents parts of the mapping that are
- * not referenced, which is not a bug.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- range_tree_vacate(msp->ms_allocatable,
- NULL, NULL);
- } else {
- range_tree_vacate(msp->ms_allocatable,
- zdb_leak, vd);
- }
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT3P(mg, ==, msp->ms_group);
- if (msp->ms_loaded) {
- msp->ms_loaded = B_FALSE;
- }
+ /*
+ * ms_allocatable has been overloaded
+ * to contain allocated segments. Now that
+ * we finished traversing all blocks, any
+ * block that remains in the ms_allocatable
+ * represents an allocated block that we
+ * did not claim during the traversal.
+ * Claimed blocks would have been removed
+ * from the ms_allocatable. For indirect
+ * vdevs, space remaining in the tree
+ * represents parts of the mapping that are
+ * not referenced, which is not a bug.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ range_tree_vacate(msp->ms_allocatable,
+ NULL, NULL);
+ } else {
+ range_tree_vacate(msp->ms_allocatable,
+ zdb_leak, vd);
}
+
+ if (msp->ms_loaded) {
+ msp->ms_loaded = B_FALSE;
+ }
}
- umem_free(zcb->zcb_vd_obsolete_counts,
- rvd->vdev_children * sizeof (uint32_t *));
- zcb->zcb_vd_obsolete_counts = NULL;
}
+
+ umem_free(zcb->zcb_vd_obsolete_counts,
+ rvd->vdev_children * sizeof (uint32_t *));
+ zcb->zcb_vd_obsolete_counts = NULL;
+
return (leaks);
}
@@ -3774,13 +3784,18 @@ dump_block_stats(spa_t *spa)
!dump_opt['L'] ? "nothing leaked " : "");
/*
- * Load all space maps as SM_ALLOC maps, then traverse the pool
- * claiming each block we discover. If the pool is perfectly
- * consistent, the space maps will be empty when we're done.
- * Anything left over is a leak; any block we can't claim (because
- * it's not part of any space map) is a double allocation,
- * reference to a freed block, or an unclaimed log block.
+ * When leak detection is enabled we load all space maps as SM_ALLOC
+ * maps, then traverse the pool claiming each block we discover. If
+ * the pool is perfectly consistent, the segment trees will be empty
+ * when we're done. Anything left over is a leak; any block we can't
+ * claim (because it's not part of any space map) is a double
+ * allocation, reference to a freed block, or an unclaimed log block.
+ *
+ * When leak detection is disabled (-L option) we still traverse the
+ * pool claiming each block we discover, but we skip opening any space
+ * maps.
*/
+ bzero(&zcb, sizeof (zdb_cb_t));
zdb_leak_init(spa, &zcb);
/*
@@ -3859,11 +3874,10 @@ dump_block_stats(spa_t *spa)
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
- if (total_found == total_alloc) {
- if (!dump_opt['L'])
- (void) printf("\n\tNo leaks (block sum matches space"
- " maps exactly)\n");
- } else {
+ if (total_found == total_alloc && !dump_opt['L']) {
+ (void) printf("\n\tNo leaks (block sum matches space"
+ " maps exactly)\n");
+ } else if (!dump_opt['L']) {
(void) printf("block traversal size %llu != alloc %llu "
"(%s %lld)\n",
(u_longlong_t)total_found,
@@ -4203,7 +4217,6 @@ verify_device_removal_feature_counts(spa_t *spa)
spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object,
0, vd->vdev_asize, 0));
- space_map_update(prev_obsolete_sm);
dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
(void) printf("\n");
space_map_close(prev_obsolete_sm);
@@ -4409,7 +4422,8 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme,
* their respective ms_allocateable trees should not contain them.
*/
mutex_enter(&ms->ms_lock);
- range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
+ range_tree_verify_not_present(ms->ms_allocatable,
+ sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);
return (0);
@@ -4472,7 +4486,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, sp
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
checkpoint_sm_obj, 0, current_vd->vdev_asize,
current_vd->vdev_ashift));
- space_map_update(checkpoint_sm);
verify_checkpoint_sm_entry_cb_arg_t vcsec;
vcsec.vcsec_vd = ckpoint_vd;
@@ -4480,6 +4493,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, sp
vcsec.vcsec_num_entries =
space_map_length(checkpoint_sm) / sizeof (uint64_t);
VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
verify_checkpoint_sm_entry_cb, &vcsec));
dump_spacemap(current->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
@@ -4559,7 +4573,7 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_
* are part of the checkpoint were freed by mistake.
*/
range_tree_walk(ckpoint_msp->ms_allocatable,
- (range_tree_func_t *)range_tree_verify,
+ (range_tree_func_t *)range_tree_verify_not_present,
current_msp->ms_allocatable);
}
}
@@ -4571,6 +4585,8 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_
static void
verify_checkpoint_blocks(spa_t *spa)
{
+ ASSERT(!dump_opt['L']);
+
spa_t *checkpoint_spa;
char *checkpoint_pool;
nvlist_t *config = NULL;
@@ -4636,7 +4652,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
- space_map_update(checkpoint_sm);
dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
}
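
The range_tree_verify() -> range_tree_verify_not_present() rename applied throughout zdb.c above is about naming the semantics: the helper asserts that a given segment is absent from the tree. A simplified stand-in for the body is sketched here; the real implementation lives in range_tree.c and may differ in how it reports a hit (range_tree_contains() is used purely for illustration):

#include <sys/range_tree.h>

void
range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
{
	/*
	 * The segment [off, off + size) must not overlap anything that
	 * is already present in the tree.
	 */
	VERIFY(!range_tree_contains(rt, off, size));
}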
Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Thu Nov 21 13:22:23 2019 (r354947)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Thu Nov 21 13:35:43 2019 (r354948)
@@ -584,45 +584,62 @@ metaslab_compare(const void *x1, const void *x2)
return (AVL_CMP(m1->ms_start, m2->ms_start));
}
+uint64_t
+metaslab_allocated_space(metaslab_t *msp)
+{
+ return (msp->ms_allocated_space);
+}
+
/*
* Verify that the space accounting on disk matches the in-core range_trees.
*/
-void
+static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- uint64_t allocated = 0;
+ uint64_t allocating = 0;
uint64_t sm_free_space, msp_free_space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_condensing);
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/*
* We can only verify the metaslab space when we're called
- * from syncing context with a loaded metaslab that has an allocated
- * space map. Calling this in non-syncing context does not
- * provide a consistent view of the metaslab since we're performing
- * allocations in the future.
+ * from syncing context with a loaded metaslab that has an
+ * allocated space map. Calling this in non-syncing context
+ * does not provide a consistent view of the metaslab since
+ * we're performing allocations in the future.
*/
if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
!msp->ms_loaded)
return;
- sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
- space_map_alloc_delta(msp->ms_sm);
+ /*
+ * Even though the smp_alloc field can get negative (e.g.
+ * see vdev_checkpoint_sm), that should never be the case
+ * when it comes to a metaslab's space map.
+ */
+ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
+ sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
+
/*
- * Account for future allocations since we would have already
- * deducted that space from the ms_freetree.
+ * Account for future allocations since we would have
+ * already deducted that space from the ms_allocatable.
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
- allocated +=
+ allocating +=
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
- msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
+ ASSERT3U(msp->ms_deferspace, ==,
+ range_tree_space(msp->ms_defer[0]) +
+ range_tree_space(msp->ms_defer[1]));
+
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
@@ -929,6 +946,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT(msp != NULL);
/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
@@ -1470,7 +1488,204 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
* ==========================================================================
*/
+static void
+metaslab_aux_histograms_clear(metaslab_t *msp)
+{
+ /*
+ * Auxiliary histograms are only cleared when resetting them,
+ * which can only happen while the metaslab is loaded.
+ */
+ ASSERT(msp->ms_loaded);
+
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+}
+
+static void
+metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
+ range_tree_t *rt)
+{
+ /*
+ * This is modeled after space_map_histogram_add(), so refer to that
+ * function for implementation details. We want this to work like
+ * the space map histogram, and not the range tree histogram, as we
+ * are essentially constructing a delta that will be later subtracted
+ * from the space map histogram.
+ */
+ int idx = 0;
+ for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(i, >=, idx + shift);
+ histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
+
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
+
/*
+ * Called at every sync pass that the metaslab gets synced.
+ *
+ * The reason is that we want our auxiliary histograms to be updated
+ * wherever the metaslab's space map histogram is updated. This way
+ * we stay consistent on which parts of the metaslab space map's
+ * histogram are currently not available for allocations (e.g because
+ * they are in the defer, freed, and freeing trees).
+ */
+static void
+metaslab_aux_histograms_update(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(sm != NULL);
+
+ /*
+ * This is similar to the metaslab's space map histogram updates
+ * that take place in metaslab_sync(). The only difference is that
+ * we only care about segments that haven't made it into the
+ * ms_allocatable tree yet.
+ */
+ if (msp->ms_loaded) {
+ metaslab_aux_histograms_clear(msp);
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freed);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ metaslab_aux_histogram_add(msp->ms_deferhist[t],
+ sm->sm_shift, msp->ms_defer[t]);
+ }
+ }
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freeing);
+}
+
+/*
+ * Called every time we are done syncing (writing to) the metaslab,
+ * i.e. at the end of each sync pass.
+ * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
+ */
+static void
+metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ space_map_t *sm = msp->ms_sm;
+
+ if (sm == NULL) {
+ /*
+ * We came here from metaslab_init() when creating/opening a
+ * pool, looking at a metaslab that hasn't had any allocations
+ * yet.
+ */
+ return;
+ }
+
+ /*
+ * This is similar to the actions that we take for the ms_freed
+ * and ms_defer trees in metaslab_sync_done().
+ */
+ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
+ if (defer_allowed) {
+ bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_synchist));
+ } else {
+ bzero(msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_deferhist[hist_index]));
+ }
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+}
+
+/*
+ * Ensure that the metaslab's weight and fragmentation are consistent
+ * with the contents of the histogram (either the range tree's histogram
+ * or the space map's depending whether the metaslab is loaded).
+ */
+static void
+metaslab_verify_weight_and_frag(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /* see comment in metaslab_verify_unflushed_changes() */
+ if (msp->ms_group == NULL)
+ return;
+
+ /*
+ * Devices being removed always return a weight of 0 and leave
+ * fragmentation and ms_max_size as is - there is nothing for
+ * us to verify here.
+ */
+ vdev_t *vd = msp->ms_group->mg_vd;
+ if (vd->vdev_removing)
+ return;
+
+ /*
+ * If the metaslab is dirty it probably means that we've done
+ * some allocations or frees that have changed our histograms
+ * and thus the weight.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&vd->vdev_ms_list, msp, t))
+ return;
+ }
+
+ /*
+ * This verification checks that our in-memory state is consistent
+ * with what's on disk. If the pool is read-only then there aren't
+ * any changes and we just have the initially-loaded state.
+ */
+ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
+ return;
+
+ /* some extra verification for in-core tree if you can */
+ if (msp->ms_loaded) {
+ range_tree_stat_verify(msp->ms_allocatable);
+ VERIFY(space_map_histogram_verify(msp->ms_sm,
+ msp->ms_allocatable));
+ }
+
+ uint64_t weight = msp->ms_weight;
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
+ uint64_t frag = msp->ms_fragmentation;
+ uint64_t max_segsize = msp->ms_max_size;
+
+ msp->ms_weight = 0;
+ msp->ms_fragmentation = 0;
+ msp->ms_max_size = 0;
+
+ /*
+ * This function is used for verification purposes. Regardless of
+ * whether metaslab_weight() thinks this metaslab should be active or
+ * not, we want to ensure that the actual weight (and therefore the
+ * value of ms_weight) would be the same if it was to be recalculated
+ * at this point.
+ */
+ msp->ms_weight = metaslab_weight(msp) | was_active;
+
+ VERIFY3U(max_segsize, ==, msp->ms_max_size);
+
+ /*
+ * If the weight type changed then there is no point in doing
+ * verification. Revert fields to their original values.
+ */
+ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
+ (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
+ msp->ms_fragmentation = frag;
+ msp->ms_weight = weight;
+ return;
+ }
+
+ VERIFY3U(msp->ms_fragmentation, ==, frag);
+ VERIFY3U(msp->ms_weight, ==, weight);
+}
+
+/*
* Wait for any in-progress metaslab loads to complete.
*/
static void
@@ -1491,47 +1706,94 @@ metaslab_load_impl(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
/*
- * Nobody else can manipulate a loading metaslab, so it's now safe
- * to drop the lock. This way we don't have to hold the lock while
- * reading the spacemap from disk.
+ * We temporarily drop the lock to unblock other operations while we
+ * are reading the space map. Therefore, metaslab_sync() and
+ * metaslab_sync_done() can run at the same time as we do.
+ *
+ * metaslab_sync() can append to the space map while we are loading.
+ * Therefore we load only entries that existed when we started the
+ * load. Additionally, metaslab_sync_done() has to wait for the load
+ * to complete because there are potential races like metaslab_load()
+ * loading parts of the space map that are currently being appended
+ * by metaslab_sync(). If we didn't, the ms_allocatable would have
+ * entries that metaslab_sync_done() would try to re-add later.
+ *
+ * That's why before dropping the lock we remember the synced length
+ * of the metaslab and read up to that point of the space map,
+ * ignoring entries appended by metaslab_sync() that happen after we
+ * drop the lock.
*/
+ uint64_t length = msp->ms_synced_length;
mutex_exit(&msp->ms_lock);
- /*
- * If the space map has not been allocated yet, then treat
- * all the space in the metaslab as free and add it to ms_allocatable.
- */
if (msp->ms_sm != NULL) {
- error = space_map_load(msp->ms_sm, msp->ms_allocatable,
- SM_FREE);
+ error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE, length);
} else {
+ /*
+ * The space map has not been allocated yet, so treat
+ * all the space in the metaslab as free and add it to the
+ * ms_allocatable tree.
+ */
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
}
+ /*
+ * We need to grab the ms_sync_lock to prevent metaslab_sync() from
+ * changing the ms_sm and the metaslab's range trees while we are
+ * about to use them and populate the ms_allocatable. The ms_lock
+ * is insufficient for this because metaslab_sync() doesn't hold
+ * the ms_lock while writing the ms_checkpointing tree to disk.
+ */
+ mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
+ ASSERT(!msp->ms_condensing);
- if (error != 0)
+ if (error != 0) {
+ mutex_exit(&msp->ms_sync_lock);
return (error);
+ }
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
/*
- * If the metaslab already has a spacemap, then we need to
- * remove all segments from the defer tree; otherwise, the
- * metaslab is completely empty and we can skip this.
+ * The ms_allocatable contains the segments that exist in the
+ * ms_defer trees [see ms_synced_length]. Thus we need to remove
+ * them from ms_allocatable as they will be added again in
+ * metaslab_sync_done().
*/
- if (msp->ms_sm != NULL) {
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defer[t],
- range_tree_remove, msp->ms_allocatable);
- }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
}
+
+ /*
+ * Call metaslab_recalculate_weight_and_sort() now that the
+ * metaslab is loaded so we get the metaslab's real weight.
+ *
+ * Unless this metaslab was created with older software and
+ * has not yet been converted to use segment-based weight, we
+ * expect the new weight to be better or equal to the weight
+ * that the metaslab had while it was not loaded. This is
+ * because the old weight does not take into account the
+ * consolidation of adjacent segments between TXGs. [see
+ * comment for ms_synchist and ms_deferhist[] for more info]
+ */
+ uint64_t weight = msp->ms_weight;
+ metaslab_recalculate_weight_and_sort(msp);
+ if (!WEIGHT_IS_SPACEBASED(weight))
+ ASSERT3U(weight, <=, msp->ms_weight);
msp->ms_max_size = metaslab_block_maxsize(msp);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ metaslab_verify_space(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_sync_lock);
+
return (0);
}
@@ -1548,6 +1810,7 @@ metaslab_load(metaslab_t *msp)
if (msp->ms_loaded)
return (0);
VERIFY(!msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
msp->ms_loading = B_TRUE;
int error = metaslab_load_impl(msp);
@@ -1561,10 +1824,29 @@ void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_verify_weight_and_frag(msp);
+
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
+
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
+
+ /*
+ * We explicitly recalculate the metaslab's weight based on its space
+ * map (as it is now not loaded). We want unloaded metaslabs to always
+ * have their weights calculated from the space map histograms, while
+ * loaded ones have it calculated from their in-core range tree
+ * [see metaslab_load()]. This way, the weight reflects the information
+ * available in-core, whether it is loaded or not
+ *
+ * If ms_group == NULL, it means that we came here from metaslab_fini(),
+ * at which point it doesn't make sense for us to do the recalculation
+ * and the sorting.
+ */
+ if (msp->ms_group != NULL)
+ metaslab_recalculate_weight_and_sort(msp);
}
static void
@@ -1604,6 +1886,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint6
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
+ *
+ * Note:
+ * When called from vdev_expand(), we can't call into the DMU as
+ * we are holding the spa_config_lock as a writer and we would
+ * deadlock [see relevant comment in vdev_metaslab_init()]. In
+ * that case, the object parameter is zero though, so we won't
+ * call into the DMU.
*/
if (object != 0) {
error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
@@ -1615,14 +1904,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint6
}
ASSERT(ms->ms_sm != NULL);
+ ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
+ ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
/*
- * We create the main range tree here, but we don't create the
+ * We create the ms_allocatable here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
- * addition of new space; and for debugging, it ensures that we'd
- * data fault on any attempt to use this metaslab before it's ready.
+ * addition of new space; and for debugging, it ensures that
+ * we'd data fault on any attempt to use this metaslab before
+ * it's ready.
*/
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
metaslab_rangesize_compare, 0);
@@ -1639,8 +1931,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint6
* out this txg. This ensures that we don't attempt to allocate
* from it before we have initialized it completely.
*/
- if (txg <= TXG_INITIAL)
+ if (txg <= TXG_INITIAL) {
metaslab_sync_done(ms, 0);
+ metaslab_space_update(vd, mg->mg_class,
+ metaslab_allocated_space(ms), 0, 0);
+ }
/*
* If metaslab_debug_load is set and we're initializing a metaslab
@@ -1674,7 +1969,7 @@ metaslab_fini(metaslab_t *msp)
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
metaslab_space_update(vd, mg->mg_class,
- -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
+ -metaslab_allocated_space(msp), 0, -msp->ms_size);
space_map_close(msp->ms_sm);
@@ -1695,6 +1990,9 @@ metaslab_fini(metaslab_t *msp)
range_tree_destroy(msp->ms_checkpointing);
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
+
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@@ -1710,7 +2008,7 @@ metaslab_fini(metaslab_t *msp)
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
- * multiplying that by the fragmetation metric in this table. Doing
+ * multiplying that by the fragmentation metric in this table. Doing
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
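
The truncated metaslab.c portion above is where the ms_synced_length and metaslab_load_impl() changes meet: the synced length is sampled under ms_lock, the lock is dropped for the I/O, and only entries up to that length are loaded. A condensed, illustrative version of that flow (the real function also takes ms_sync_lock, removes the ms_defer segments from ms_allocatable again, and recalculates the weight; error handling is simplified and the function name is made up):

#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/range_tree.h>

static int
example_load_synced(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * Remember how much of the space map was synced before dropping
	 * the lock, so that entries appended by a concurrent
	 * metaslab_sync() are ignored.
	 */
	uint64_t length = msp->ms_synced_length;
	mutex_exit(&msp->ms_lock);

	int error = 0;
	if (msp->ms_sm != NULL) {
		error = space_map_load_length(msp->ms_sm,
		    msp->ms_allocatable, SM_FREE, length);
	} else {
		/* No space map yet: the whole metaslab is free space. */
		range_tree_add(msp->ms_allocatable,
		    msp->ms_start, msp->ms_size);
	}

	mutex_enter(&msp->ms_lock);
	return (error);
}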