svn commit: r258371 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c vendor-s...
Andriy Gapon
avg at FreeBSD.org
Wed Nov 20 10:41:12 UTC 2013
Author: avg
Date: Wed Nov 20 10:41:10 2013
New Revision: 258371
URL: http://svnweb.freebsd.org/changeset/base/258371
Log:
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4104 ::spa_space no longer works
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Revision illumos/illumos-gate at 0713e232b7712cd27d99e1e935ebb8d5de61c57d
Added:
vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c/
vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c/range_tree.c (contents, props changed)
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_reftree.h/
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_reftree.h/space_reftree.h (contents, props changed)
Modified:
vendor-sys/illumos/dist/common/zfs/zfeature_common.c
vendor-sys/illumos/dist/common/zfs/zfeature_common.h
vendor-sys/illumos/dist/uts/common/Makefile.files
vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfeature.h
vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zfeature.c
Changes in other areas also in this revision:
Modified:
vendor/illumos/dist/cmd/zdb/zdb.c
vendor/illumos/dist/cmd/ztest/ztest.c
vendor/illumos/dist/man/man5/zpool-features.5
Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.c Wed Nov 20 10:38:37 2013 (r258370)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c Wed Nov 20 10:41:10 2013 (r258371)
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -164,4 +164,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
"com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
"Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, NULL);
+ zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
+ "com.delphix:spacemap_histogram", "spacemap_histogram",
+ "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, NULL);
}
Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.h Wed Nov 20 10:38:37 2013 (r258370)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h Wed Nov 20 10:41:10 2013 (r258371)
@@ -20,7 +20,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -56,6 +56,7 @@ enum spa_feature {
SPA_FEATURE_EMPTY_BPOBJ,
SPA_FEATURE_LZ4_COMPRESS,
SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM,
SPA_FEATURES
} spa_feature_t;
Modified: vendor-sys/illumos/dist/uts/common/Makefile.files
==============================================================================
--- vendor-sys/illumos/dist/uts/common/Makefile.files Wed Nov 20 10:38:37 2013 (r258370)
+++ vendor-sys/illumos/dist/uts/common/Makefile.files Wed Nov 20 10:41:10 2013 (r258371)
@@ -22,7 +22,7 @@
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2013 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
#
@@ -1359,6 +1359,7 @@ ZFS_COMMON_OBJS += \
lz4.o \
lzjb.o \
metaslab.o \
+ range_tree.o \
refcount.o \
rrwlock.o \
sa.o \
@@ -1369,6 +1370,7 @@ ZFS_COMMON_OBJS += \
spa_history.o \
spa_misc.o \
space_map.o \
+ space_reftree.o \
txg.o \
uberblock.o \
unique.o \
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c Wed Nov 20 10:38:37 2013 (r258370)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c Wed Nov 20 10:41:10 2013 (r258371)
@@ -1334,7 +1334,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t si
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
/* Check for any allocated blocks beyond the first */
- if (dn->dn_phys->dn_maxblkid != 0)
+ if (dn->dn_maxblkid != 0)
goto fail;
mutex_enter(&dn->dn_dbufs_mtx);
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c Wed Nov 20 10:38:37 2013 (r258370)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c Wed Nov 20 10:41:10 2013 (r258371)
@@ -31,6 +31,7 @@
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+#include <sys/spa_impl.h>
/*
* Allow allocations to switch to gang blocks quickly. We do this to
@@ -44,6 +45,11 @@
(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
METASLAB_GANG_AVOID)))
+#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
+#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
+#define METASLAB_ACTIVE_MASK \
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
@@ -79,9 +85,14 @@ int zfs_mg_alloc_failures = 0;
int zfs_mg_noalloc_threshold = 0;
/*
- * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ * When set will load all metaslabs when pool is first opened.
*/
-static int metaslab_debug = 0;
+int metaslab_debug_load = 0;
+
+/*
+ * When set will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
/*
* Minimum size which forces the dynamic allocator to change
@@ -106,14 +117,16 @@ int metaslab_df_free_pct = 4;
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
/*
- * Max number of space_maps to prefetch.
+ * Percentage of all cpus that can be used by the metaslab taskq.
*/
-int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+int metaslab_load_pct = 50;
/*
- * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ * Determines how many txgs a metaslab may remain loaded without having any
+ * allocations from it. As long as a metaslab continues to be used we will
+ * keep it loaded.
*/
-int metaslab_smo_bonus_pct = 150;
+int metaslab_unload_delay = TXG_SIZE * 2;
/*
* Should we be willing to write data to degraded vdevs?
@@ -121,12 +134,28 @@ int metaslab_smo_bonus_pct = 150;
boolean_t zfs_write_to_degraded = B_FALSE;
/*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Enable/disable preloading of metaslab.
+ */
+boolean_t metaslab_preload_enabled = B_TRUE;
+
+/*
+ * Enable/disable additional weight factor for each metaslab.
+ */
+boolean_t metaslab_weight_factor_enable = B_FALSE;
+
+
+/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
metaslab_class_t *mc;
@@ -230,9 +259,9 @@ metaslab_compare(const void *x1, const v
/*
* If the weights are identical, use the offset to force uniqueness.
*/
- if (m1->ms_map->sm_start < m2->ms_map->sm_start)
+ if (m1->ms_start < m2->ms_start)
return (-1);
- if (m1->ms_map->sm_start > m2->ms_map->sm_start)
+ if (m1->ms_start > m2->ms_start)
return (1);
ASSERT3P(m1, ==, m2);
@@ -300,6 +329,9 @@ metaslab_group_create(metaslab_class_t *
mg->mg_class = mc;
mg->mg_activation_count = 0;
+ mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
+ minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
+
return (mg);
}
@@ -368,6 +400,8 @@ metaslab_group_passivate(metaslab_group_
return;
}
+ taskq_wait(mg->mg_taskq);
+
mgprev = mg->mg_prev;
mgnext = mg->mg_next;
@@ -447,130 +481,200 @@ metaslab_group_allocatable(metaslab_grou
/*
* ==========================================================================
- * Common allocator routines
+ * Range tree callbacks
* ==========================================================================
*/
+
+/*
+ * Comparison function for the private size-ordered tree. Tree is sorted
+ * by size, larger sizes at the end of the tree.
+ */
static int
-metaslab_segsize_compare(const void *x1, const void *x2)
+metaslab_rangesize_compare(const void *x1, const void *x2)
{
- const space_seg_t *s1 = x1;
- const space_seg_t *s2 = x2;
- uint64_t ss_size1 = s1->ss_end - s1->ss_start;
- uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+ const range_seg_t *r1 = x1;
+ const range_seg_t *r2 = x2;
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
- if (ss_size1 < ss_size2)
+ if (rs_size1 < rs_size2)
return (-1);
- if (ss_size1 > ss_size2)
+ if (rs_size1 > rs_size2)
return (1);
- if (s1->ss_start < s2->ss_start)
+ if (r1->rs_start < r2->rs_start)
return (-1);
- if (s1->ss_start > s2->ss_start)
+
+ if (r1->rs_start > r2->rs_start)
return (1);
return (0);
}
/*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
*/
-static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
- uint64_t align)
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
{
- space_seg_t *ss, ssearch;
- avl_index_t where;
-
- ssearch.ss_start = *cursor;
- ssearch.ss_end = *cursor + size;
-
- ss = avl_find(t, &ssearch, &where);
- if (ss == NULL)
- ss = avl_nearest(t, where, AVL_AFTER);
+ metaslab_t *msp = arg;
- while (ss != NULL) {
- uint64_t offset = P2ROUNDUP(ss->ss_start, align);
-
- if (offset + size <= ss->ss_end) {
- *cursor = offset + size;
- return (offset);
- }
- ss = AVL_NEXT(t, ss);
- }
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT(msp->ms_tree == NULL);
- /*
- * If we know we've searched the whole map (*cursor == 0), give up.
- * Otherwise, reset the cursor to the beginning and try again.
- */
- if (*cursor == 0)
- return (-1ULL);
-
- *cursor = 0;
- return (metaslab_block_picker(t, cursor, size, align));
+ avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+ sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
+/*
+ * Destroy the block allocator specific components.
+ */
static void
-metaslab_pp_load(space_map_t *sm)
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
- space_seg_t *ss;
+ metaslab_t *msp = arg;
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT3P(msp->ms_tree, ==, rt);
+ ASSERT0(avl_numnodes(&msp->ms_size_tree));
- sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
- avl_create(sm->sm_pp_root, metaslab_segsize_compare,
- sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
- for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
- avl_add(sm->sm_pp_root, ss);
+ avl_destroy(&msp->ms_size_tree);
}
static void
-metaslab_pp_unload(space_map_t *sm)
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
- void *cookie = NULL;
-
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
+ metaslab_t *msp = arg;
- while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
- /* tear down the tree */
- }
-
- avl_destroy(sm->sm_pp_root);
- kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
- sm->sm_pp_root = NULL;
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT3P(msp->ms_tree, ==, rt);
+ VERIFY(!msp->ms_condensing);
+ avl_add(&msp->ms_size_tree, rs);
}
-/* ARGSUSED */
static void
-metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
- /* No need to update cursor */
+ metaslab_t *msp = arg;
+
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT3P(msp->ms_tree, ==, rt);
+ VERIFY(!msp->ms_condensing);
+ avl_remove(&msp->ms_size_tree, rs);
}
-/* ARGSUSED */
static void
-metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
- /* No need to update cursor */
+ metaslab_t *msp = arg;
+
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT3P(msp->ms_tree, ==, rt);
+
+ /*
+ * Normally one would walk the tree freeing nodes along the way.
+ * Since the nodes are shared with the range trees we can avoid
+ * walking all nodes and just reinitialize the avl tree. The nodes
+ * will be freed by the range tree, so we don't want to free them here.
+ */
+ avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+ sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
+static range_tree_ops_t metaslab_rt_ops = {
+ metaslab_rt_create,
+ metaslab_rt_destroy,
+ metaslab_rt_add,
+ metaslab_rt_remove,
+ metaslab_rt_vacate
+};
+
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
/*
* Return the maximum contiguous segment within the metaslab.
*/
uint64_t
-metaslab_pp_maxsize(space_map_t *sm)
+metaslab_block_maxsize(metaslab_t *msp)
{
- avl_tree_t *t = sm->sm_pp_root;
- space_seg_t *ss;
+ avl_tree_t *t = &msp->ms_size_tree;
+ range_seg_t *rs;
- if (t == NULL || (ss = avl_last(t)) == NULL)
+ if (t == NULL || (rs = avl_last(t)) == NULL)
return (0ULL);
- return (ss->ss_end - ss->ss_start);
+ return (rs->rs_end - rs->rs_start);
+}
+
+uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size)
+{
+ uint64_t start;
+ range_tree_t *rt = msp->ms_tree;
+
+ VERIFY(!msp->ms_condensing);
+
+ start = msp->ms_ops->msop_alloc(msp, size);
+ if (start != -1ULL) {
+ vdev_t *vd = msp->ms_group->mg_vd;
+
+ VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+ range_tree_remove(rt, start, size);
+ }
+ return (start);
+}
+
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
+static uint64_t
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+ uint64_t align)
+{
+ range_seg_t *rs, rsearch;
+ avl_index_t where;
+
+ rsearch.rs_start = *cursor;
+ rsearch.rs_end = *cursor + size;
+
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL)
+ rs = avl_nearest(t, where, AVL_AFTER);
+
+ while (rs != NULL) {
+ uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+
+ if (offset + size <= rs->rs_end) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ rs = AVL_NEXT(t, rs);
+ }
+
+ /*
+ * If we know we've searched the whole map (*cursor == 0), give up.
+ * Otherwise, reset the cursor to the beginning and try again.
+ */
+ if (*cursor == 0)
+ return (-1ULL);
+
+ *cursor = 0;
+ return (metaslab_block_picker(t, cursor, size, align));
}
/*
@@ -579,29 +683,31 @@ metaslab_pp_maxsize(space_map_t *sm)
* ==========================================================================
*/
static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &sm->sm_root;
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+ * bucket) but it does not guarantee that other allocations sizes
+ * may exist in the same region.
+ */
uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+ uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
+ avl_tree_t *t = &msp->ms_tree->rt_root;
return (metaslab_block_picker(t, cursor, size, align));
}
/* ARGSUSED */
-boolean_t
-metaslab_ff_fragmented(space_map_t *sm)
+static boolean_t
+metaslab_ff_fragmented(metaslab_t *msp)
{
return (B_TRUE);
}
-static space_map_ops_t metaslab_ff_ops = {
- metaslab_pp_load,
- metaslab_pp_unload,
+static metaslab_ops_t metaslab_ff_ops = {
metaslab_ff_alloc,
- metaslab_pp_claim,
- metaslab_pp_free,
- metaslab_pp_maxsize,
metaslab_ff_fragmented
};
@@ -614,16 +720,24 @@ static space_map_ops_t metaslab_ff_ops =
* ==========================================================================
*/
static uint64_t
-metaslab_df_alloc(space_map_t *sm, uint64_t size)
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &sm->sm_root;
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+ * bucket) but it does not guarantee that other allocations sizes
+ * may exist in the same region.
+ */
uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
- uint64_t max_size = metaslab_pp_maxsize(sm);
- int free_pct = sm->sm_space * 100 / sm->sm_size;
+ uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
+ range_tree_t *rt = msp->ms_tree;
+ avl_tree_t *t = &rt->rt_root;
+ uint64_t max_size = metaslab_block_maxsize(msp);
+ int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
- ASSERT(MUTEX_HELD(sm->sm_lock));
- ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
if (max_size < size)
return (-1ULL);
@@ -634,7 +748,7 @@ metaslab_df_alloc(space_map_t *sm, uint6
*/
if (max_size < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
- t = sm->sm_pp_root;
+ t = &msp->ms_size_tree;
*cursor = 0;
}
@@ -642,10 +756,11 @@ metaslab_df_alloc(space_map_t *sm, uint6
}
static boolean_t
-metaslab_df_fragmented(space_map_t *sm)
+metaslab_df_fragmented(metaslab_t *msp)
{
- uint64_t max_size = metaslab_pp_maxsize(sm);
- int free_pct = sm->sm_space * 100 / sm->sm_size;
+ range_tree_t *rt = msp->ms_tree;
+ uint64_t max_size = metaslab_block_maxsize(msp);
+ int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
if (max_size >= metaslab_df_alloc_threshold &&
free_pct >= metaslab_df_free_pct)
@@ -654,182 +769,228 @@ metaslab_df_fragmented(space_map_t *sm)
return (B_TRUE);
}
-static space_map_ops_t metaslab_df_ops = {
- metaslab_pp_load,
- metaslab_pp_unload,
+static metaslab_ops_t metaslab_df_ops = {
metaslab_df_alloc,
- metaslab_pp_claim,
- metaslab_pp_free,
- metaslab_pp_maxsize,
metaslab_df_fragmented
};
/*
* ==========================================================================
- * Other experimental allocators
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
* ==========================================================================
*/
static uint64_t
-metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &sm->sm_root;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd;
- uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
- uint64_t max_size = metaslab_pp_maxsize(sm);
- uint64_t rsize = size;
+ range_tree_t *rt = msp->ms_tree;
+ avl_tree_t *t = &msp->ms_size_tree;
+ uint64_t *cursor = &msp->ms_lbas[0];
+ uint64_t *cursor_end = &msp->ms_lbas[1];
uint64_t offset = 0;
- ASSERT(MUTEX_HELD(sm->sm_lock));
- ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
-
- if (max_size < size)
- return (-1ULL);
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
- ASSERT3U(*extent_end, >=, *cursor);
+ ASSERT3U(*cursor_end, >=, *cursor);
- /*
- * If we're running low on space switch to using the size
- * sorted AVL tree (best-fit).
- */
- if ((*cursor + size) > *extent_end) {
+ if ((*cursor + size) > *cursor_end) {
+ range_seg_t *rs;
- t = sm->sm_pp_root;
- *cursor = *extent_end = 0;
+ rs = avl_last(&msp->ms_size_tree);
+ if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+ return (-1ULL);
- if (max_size > 2 * SPA_MAXBLOCKSIZE)
- rsize = MIN(metaslab_min_alloc_size, max_size);
- offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
- if (offset != -1)
- *cursor = offset + size;
- } else {
- offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+ *cursor = rs->rs_start;
+ *cursor_end = rs->rs_end;
}
- ASSERT3U(*cursor, <=, *extent_end);
+
+ offset = *cursor;
+ *cursor += size;
+
return (offset);
}
static boolean_t
-metaslab_cdf_fragmented(space_map_t *sm)
+metaslab_cf_fragmented(metaslab_t *msp)
{
- uint64_t max_size = metaslab_pp_maxsize(sm);
-
- if (max_size > (metaslab_min_alloc_size * 10))
- return (B_FALSE);
- return (B_TRUE);
+ return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
}
-static space_map_ops_t metaslab_cdf_ops = {
- metaslab_pp_load,
- metaslab_pp_unload,
- metaslab_cdf_alloc,
- metaslab_pp_claim,
- metaslab_pp_free,
- metaslab_pp_maxsize,
- metaslab_cdf_fragmented
+static metaslab_ops_t metaslab_cf_ops = {
+ metaslab_cf_alloc,
+ metaslab_cf_fragmented
};
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
uint64_t metaslab_ndf_clump_shift = 4;
static uint64_t
-metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &sm->sm_root;
+ avl_tree_t *t = &msp->ms_tree->rt_root;
avl_index_t where;
- space_seg_t *ss, ssearch;
+ range_seg_t *rs, rsearch;
uint64_t hbit = highbit(size);
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
- uint64_t max_size = metaslab_pp_maxsize(sm);
+ uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+ uint64_t max_size = metaslab_block_maxsize(msp);
- ASSERT(MUTEX_HELD(sm->sm_lock));
- ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
if (max_size < size)
return (-1ULL);
- ssearch.ss_start = *cursor;
- ssearch.ss_end = *cursor + size;
+ rsearch.rs_start = *cursor;
+ rsearch.rs_end = *cursor + size;
- ss = avl_find(t, &ssearch, &where);
- if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
- t = sm->sm_pp_root;
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+ t = &msp->ms_size_tree;
- ssearch.ss_start = 0;
- ssearch.ss_end = MIN(max_size,
+ rsearch.rs_start = 0;
+ rsearch.rs_end = MIN(max_size,
1ULL << (hbit + metaslab_ndf_clump_shift));
- ss = avl_find(t, &ssearch, &where);
- if (ss == NULL)
- ss = avl_nearest(t, where, AVL_AFTER);
- ASSERT(ss != NULL);
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL)
+ rs = avl_nearest(t, where, AVL_AFTER);
+ ASSERT(rs != NULL);
}
- if (ss != NULL) {
- if (ss->ss_start + size <= ss->ss_end) {
- *cursor = ss->ss_start + size;
- return (ss->ss_start);
- }
+ if ((rs->rs_end - rs->rs_start) >= size) {
+ *cursor = rs->rs_start + size;
+ return (rs->rs_start);
}
return (-1ULL);
}
static boolean_t
-metaslab_ndf_fragmented(space_map_t *sm)
+metaslab_ndf_fragmented(metaslab_t *msp)
{
- uint64_t max_size = metaslab_pp_maxsize(sm);
-
- if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
- return (B_FALSE);
- return (B_TRUE);
+ return (metaslab_block_maxsize(msp) <=
+ (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
}
-
-static space_map_ops_t metaslab_ndf_ops = {
- metaslab_pp_load,
- metaslab_pp_unload,
+static metaslab_ops_t metaslab_ndf_ops = {
metaslab_ndf_alloc,
- metaslab_pp_claim,
- metaslab_pp_free,
- metaslab_pp_maxsize,
metaslab_ndf_fragmented
};
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
/*
* ==========================================================================
* Metaslabs
* ==========================================================================
*/
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+void
+metaslab_load_wait(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_loading) {
+ ASSERT(!msp->ms_loaded);
+ cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+ }
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_loaded);
+ ASSERT(!msp->ms_loading);
+
+ msp->ms_loading = B_TRUE;
+
+ /*
+ * If the space map has not been allocated yet, then treat
+ * all the space in the metaslab as free and add it to the
+ * ms_tree.
+ */
+ if (msp->ms_sm != NULL)
+ error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
+ else
+ range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+
+ msp->ms_loaded = (error == 0);
+ msp->ms_loading = B_FALSE;
+
+ if (msp->ms_loaded) {
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defertree[t],
+ range_tree_remove, msp->ms_tree);
+ }
+ }
+ cv_broadcast(&msp->ms_load_cv);
+ return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ range_tree_vacate(msp->ms_tree, NULL, NULL);
+ msp->ms_loaded = B_FALSE;
+ msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+}
+
metaslab_t *
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
- uint64_t start, uint64_t size, uint64_t txg)
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
{
vdev_t *vd = mg->mg_vd;
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
metaslab_t *msp;
msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
+ msp->ms_id = id;
+ msp->ms_start = id << vd->vdev_ms_shift;
+ msp->ms_size = 1ULL << vd->vdev_ms_shift;
- msp->ms_smo_syncing = *smo;
+ /*
+ * We only open space map objects that already exist. All others
+ * will be opened when we finally allocate an object for it.
+ */
+ if (object != 0) {
+ VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
+ msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
+ ASSERT(msp->ms_sm != NULL);
+ }
/*
- * We create the main space map here, but we don't create the
- * allocmaps and freemaps until metaslab_sync_done(). This serves
+ * We create the main range tree here, but we don't create the
+ * alloctree and freetree until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
- msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
- space_map_create(msp->ms_map, start, size,
- vd->vdev_ashift, &msp->ms_lock);
-
+ msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
metaslab_group_add(mg, msp);
- if (metaslab_debug && smo->smo_object != 0) {
- mutex_enter(&msp->ms_lock);
- VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
- SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
- mutex_exit(&msp->ms_lock);
- }
+ msp->ms_ops = mg->mg_class->mc_ops;
/*
* If we're opening an existing pool (txg == 0) or creating
@@ -840,6 +1001,17 @@ metaslab_init(metaslab_group_t *mg, spac
if (txg <= TXG_INITIAL)
metaslab_sync_done(msp, 0);
+ /*
+ * If metaslab_debug_load is set and we're initializing a metaslab
+ * that has an allocated space_map object then load the its space
+ * map so that can verify frees.
+ */
+ if (metaslab_debug_load && msp->ms_sm != NULL) {
+ mutex_enter(&msp->ms_lock);
+ VERIFY0(metaslab_load(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+
if (txg != 0) {
vdev_dirty(vd, 0, NULL, txg);
vdev_dirty(vd, VDD_METASLAB, msp, txg);
@@ -853,48 +1025,103 @@ metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
- vdev_space_update(mg->mg_vd,
- -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
-
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
- space_map_unload(msp->ms_map);
- space_map_destroy(msp->ms_map);
- kmem_free(msp->ms_map, sizeof (*msp->ms_map));
+ VERIFY(msp->ms_group == NULL);
+ vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
+ 0, -msp->ms_size);
+ space_map_close(msp->ms_sm);
+
+ metaslab_unload(msp);
+ range_tree_destroy(msp->ms_tree);
for (int t = 0; t < TXG_SIZE; t++) {
- space_map_destroy(msp->ms_allocmap[t]);
- space_map_destroy(msp->ms_freemap[t]);
- kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
- kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
+ range_tree_destroy(msp->ms_alloctree[t]);
+ range_tree_destroy(msp->ms_freetree[t]);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- space_map_destroy(msp->ms_defermap[t]);
- kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
+ range_tree_destroy(msp->ms_defertree[t]);
}
ASSERT0(msp->ms_deferspace);
mutex_exit(&msp->ms_lock);
+ cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
kmem_free(msp, sizeof (metaslab_t));
}
-#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
-#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
-#define METASLAB_ACTIVE_MASK \
- (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+/*
+ * Apply a weighting factor based on the histogram information for this
+ * metaslab. The current weighting factor is somewhat arbitrary and requires
+ * additional investigation. The implementation provides a measure of
+ * "weighted" free space and gives a higher weighting for larger contiguous
+ * regions. The weighting factor is determined by counting the number of
+ * sm_shift sectors that exist in each region represented by the histogram.
+ * That value is then multiplied by the power of 2 exponent and the sm_shift
+ * value.
+ *
+ * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
+ * metaslab has an sm_shift value of 9 (512B):
+ *
+ * 1) calculate the number of sm_shift sectors in the region:
+ * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
+ * 2) multiply by the power of 2 exponent and the sm_shift value:
+ * 16384 * 21 * 9 = 3096576
+ * This value will be added to the weighting of the metaslab.
+ */
+static uint64_t
+metaslab_weight_factor(metaslab_t *msp)
+{
+ uint64_t factor = 0;
+ uint64_t sectors;
+ int i;
+
+ /*
+ * A null space map means that the entire metaslab is free,
+ * calculate a weight factor that spans the entire size of the
+ * metaslab.
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list