svn commit: r212671 - in stable/8: cddl/contrib/opensolaris/cmd/zdb sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys

Martin Matuska mm at FreeBSD.org
Wed Sep 15 16:14:38 UTC 2010


Author: mm
Date: Wed Sep 15 16:14:38 2010
New Revision: 212671
URL: http://svn.freebsd.org/changeset/base/212671

Log:
  MFC r211931:
  
  Update ZFS metaslab code from OpenSolaris.
  This provides a noticeable write speedup, especially on pools with
  less than 30% of free space.
  
  Detailed information (OpenSolaris onnv changesets and Bug IDs):
  
  11146:7e58f40bcb1c
  6826241 Sync write IOPS drops dramatically during TXG sync
  6869229 zfs should switch to shiny new metaslabs more frequently
  
  11728:59fdb3b856f6
  6918420 zdb -m has issues printing metaslab statistics
  
  12047:7c1fcc8419ca
  6917066 zfs block picking can be improved
  
  Approved by:	delphij (mentor)
  Obtained from:	OpenSolaris (Bug ID 6826241, 6869229, 6918420, 6917066)

Modified:
  stable/8/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
  stable/8/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/   (props changed)
  stable/8/sys/amd64/include/xen/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/contrib/dev/acpica/   (props changed)
  stable/8/sys/contrib/pf/   (props changed)
  stable/8/sys/dev/xen/xenpci/   (props changed)

Modified: stable/8/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- stable/8/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Wed Sep 15 16:14:38 2010	(r212671)
@@ -491,35 +491,37 @@ dump_metaslab_stats(metaslab_t *msp)
 static void
 dump_metaslab(metaslab_t *msp)
 {
-	char freebuf[5];
-	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
+	space_map_t *sm = &msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo;
+	char freebuf[5];
 
-	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+	nicenum(sm->sm_size - smo->smo_alloc, freebuf);
 
 	(void) printf(
 	    "\tvdev %5llu   offset %12llx   spacemap %6llu   free    %5s\n",
-	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
-	    (u_longlong_t)smo->smo_object, freebuf);
+	    (u_longlong_t)(sm->sm_start / sm->sm_size),
+	    (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
 
 	if (dump_opt['m'] > 1) {
 		mutex_enter(&msp->ms_lock);
-		VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
-		    SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
+		space_map_load_wait(sm);
+		if (!sm->sm_loaded)
+			VERIFY(space_map_load(sm, zfs_metaslab_ops,
+			    SM_FREE, smo, spa->spa_meta_objset) == 0);
 		dump_metaslab_stats(msp);
-		space_map_unload(&msp->ms_map);
+		space_map_unload(sm);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
-		ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+		ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
 
 		mutex_enter(&msp->ms_lock);
-		dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+		dump_spacemap(spa->spa_meta_objset, smo, sm);
 		mutex_exit(&msp->ms_lock);
 	}
-
 }
 
 static void

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Wed Sep 15 16:14:38 2010	(r212671)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -37,7 +36,7 @@ uint64_t metaslab_gang_bang = SPA_MAXBLO
 
 /*
  * Minimum size which forces the dynamic allocator to change
- * it's allocation strategy. Once the space map cannot satisfy
+ * it's allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
@@ -49,7 +48,23 @@ uint64_t metaslab_df_alloc_threshold = S
  * Once the space_map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
-int metaslab_df_free_pct = 30;
+int metaslab_df_free_pct = 4;
+
+/*
+ * A metaslab is considered "free" if it contains a contiguous
+ * segment which is greater than metaslab_min_alloc_size.
+ */
+uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+
+/*
+ * Max number of space_maps to prefetch.
+ */
+int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ */
+int metaslab_smo_bonus_pct = 150;
 
 /*
  * ==========================================================================
@@ -219,6 +234,32 @@ metaslab_group_sort(metaslab_group_t *mg
 }
 
 /*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+static int
+metaslab_segsize_compare(const void *x1, const void *x2)
+{
+	const space_seg_t *s1 = x1;
+	const space_seg_t *s2 = x2;
+	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+	if (ss_size1 < ss_size2)
+		return (-1);
+	if (ss_size1 > ss_size2)
+		return (1);
+
+	if (s1->ss_start < s2->ss_start)
+		return (-1);
+	if (s1->ss_start > s2->ss_start)
+		return (1);
+
+	return (0);
+}
+
+/*
  * This is a helper function that can be used by the allocator to find
  * a suitable block to allocate. This will search the specified AVL
  * tree looking for a block that matches the specified criteria.
@@ -258,68 +299,58 @@ metaslab_block_picker(avl_tree_t *t, uin
 	return (metaslab_block_picker(t, cursor, size, align));
 }
 
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
 static void
-metaslab_ff_load(space_map_t *sm)
+metaslab_pp_load(space_map_t *sm)
 {
+	space_seg_t *ss;
+
 	ASSERT(sm->sm_ppd == NULL);
 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-	sm->sm_pp_root = NULL;
+
+	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
+	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+		avl_add(sm->sm_pp_root, ss);
 }
 
 static void
-metaslab_ff_unload(space_map_t *sm)
+metaslab_pp_unload(space_map_t *sm)
 {
+	void *cookie = NULL;
+
 	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
 	sm->sm_ppd = NULL;
-}
 
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
-	avl_tree_t *t = &sm->sm_root;
-	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+		/* tear down the tree */
+	}
 
-	return (metaslab_block_picker(t, cursor, size, align));
+	avl_destroy(sm->sm_pp_root);
+	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+	sm->sm_pp_root = NULL;
 }
 
 /* ARGSUSED */
 static void
-metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
 {
 	/* No need to update cursor */
 }
 
 /* ARGSUSED */
 static void
-metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
 {
 	/* No need to update cursor */
 }
 
-static space_map_ops_t metaslab_ff_ops = {
-	metaslab_ff_load,
-	metaslab_ff_unload,
-	metaslab_ff_alloc,
-	metaslab_ff_claim,
-	metaslab_ff_free,
-	NULL	/* maxsize */
-};
-
 /*
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * Return the maximum contiguous segment within the metaslab.
  */
-
 uint64_t
-metaslab_df_maxsize(space_map_t *sm)
+metaslab_pp_maxsize(space_map_t *sm)
 {
 	avl_tree_t *t = sm->sm_pp_root;
 	space_seg_t *ss;
@@ -330,67 +361,53 @@ metaslab_df_maxsize(space_map_t *sm)
 	return (ss->ss_end - ss->ss_start);
 }
 
-static int
-metaslab_df_seg_compare(const void *x1, const void *x2)
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
 {
-	const space_seg_t *s1 = x1;
-	const space_seg_t *s2 = x2;
-	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
-	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
-
-	if (ss_size1 < ss_size2)
-		return (-1);
-	if (ss_size1 > ss_size2)
-		return (1);
-
-	if (s1->ss_start < s2->ss_start)
-		return (-1);
-	if (s1->ss_start > s2->ss_start)
-		return (1);
+	avl_tree_t *t = &sm->sm_root;
+	uint64_t align = size & -size;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 
-	return (0);
+	return (metaslab_block_picker(t, cursor, size, align));
 }
 
-static void
-metaslab_df_load(space_map_t *sm)
+/* ARGSUSED */
+boolean_t
+metaslab_ff_fragmented(space_map_t *sm)
 {
-	space_seg_t *ss;
-
-	ASSERT(sm->sm_ppd == NULL);
-	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-
-	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-	avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
-	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		avl_add(sm->sm_pp_root, ss);
+	return (B_TRUE);
 }
 
-static void
-metaslab_df_unload(space_map_t *sm)
-{
-	void *cookie = NULL;
-
-	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-	sm->sm_ppd = NULL;
-
-	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
-		/* tear down the tree */
-	}
-
-	avl_destroy(sm->sm_pp_root);
-	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
-	sm->sm_pp_root = NULL;
-}
+static space_map_ops_t metaslab_ff_ops = {
+	metaslab_pp_load,
+	metaslab_pp_unload,
+	metaslab_ff_alloc,
+	metaslab_pp_claim,
+	metaslab_pp_free,
+	metaslab_pp_maxsize,
+	metaslab_ff_fragmented
+};
 
+/*
+ * ==========================================================================
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space get low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * ==========================================================================
+ */
 static uint64_t
 metaslab_df_alloc(space_map_t *sm, uint64_t size)
 {
 	avl_tree_t *t = &sm->sm_root;
 	uint64_t align = size & -size;
 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-	uint64_t max_size = metaslab_df_maxsize(sm);
+	uint64_t max_size = metaslab_pp_maxsize(sm);
 	int free_pct = sm->sm_space * 100 / sm->sm_size;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -412,30 +429,158 @@ metaslab_df_alloc(space_map_t *sm, uint6
 	return (metaslab_block_picker(t, cursor, size, 1ULL));
 }
 
-/* ARGSUSED */
-static void
-metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+static boolean_t
+metaslab_df_fragmented(space_map_t *sm)
 {
-	/* No need to update cursor */
-}
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
 
-/* ARGSUSED */
-static void
-metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	/* No need to update cursor */
+	if (max_size >= metaslab_df_alloc_threshold &&
+	    free_pct >= metaslab_df_free_pct)
+		return (B_FALSE);
+
+	return (B_TRUE);
 }
 
 static space_map_ops_t metaslab_df_ops = {
-	metaslab_df_load,
-	metaslab_df_unload,
+	metaslab_pp_load,
+	metaslab_pp_unload,
 	metaslab_df_alloc,
-	metaslab_df_claim,
-	metaslab_df_free,
-	metaslab_df_maxsize
+	metaslab_pp_claim,
+	metaslab_pp_free,
+	metaslab_pp_maxsize,
+	metaslab_df_fragmented
+};
+
+/*
+ * ==========================================================================
+ * Other experimental allocators
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
+	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+	uint64_t rsize = size;
+	uint64_t offset = 0;
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+	if (max_size < size)
+		return (-1ULL);
+
+	ASSERT3U(*extent_end, >=, *cursor);
+
+	/*
+	 * If we're running low on space switch to using the size
+	 * sorted AVL tree (best-fit).
+	 */
+	if ((*cursor + size) > *extent_end) {
+
+		t = sm->sm_pp_root;
+		*cursor = *extent_end = 0;
+
+		if (max_size > 2 * SPA_MAXBLOCKSIZE)
+			rsize = MIN(metaslab_min_alloc_size, max_size);
+		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
+		if (offset != -1)
+			*cursor = offset + size;
+	} else {
+		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+	}
+	ASSERT3U(*cursor, <=, *extent_end);
+	return (offset);
+}
+
+static boolean_t
+metaslab_cdf_fragmented(space_map_t *sm)
+{
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+
+	if (max_size > (metaslab_min_alloc_size * 10))
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+static space_map_ops_t metaslab_cdf_ops = {
+	metaslab_pp_load,
+	metaslab_pp_unload,
+	metaslab_cdf_alloc,
+	metaslab_pp_claim,
+	metaslab_pp_free,
+	metaslab_pp_maxsize,
+	metaslab_cdf_fragmented
+};
+
+uint64_t metaslab_ndf_clump_shift = 4;
+
+static uint64_t
+metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+{
+	avl_tree_t *t = &sm->sm_root;
+	avl_index_t where;
+	space_seg_t *ss, ssearch;
+	uint64_t hbit = highbit(size);
+	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+
+	ASSERT(MUTEX_HELD(sm->sm_lock));
+	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+	if (max_size < size)
+		return (-1ULL);
+
+	ssearch.ss_start = *cursor;
+	ssearch.ss_end = *cursor + size;
+
+	ss = avl_find(t, &ssearch, &where);
+	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
+		t = sm->sm_pp_root;
+
+		ssearch.ss_start = 0;
+		ssearch.ss_end = MIN(max_size,
+		    1ULL << (hbit + metaslab_ndf_clump_shift));
+		ss = avl_find(t, &ssearch, &where);
+		if (ss == NULL)
+			ss = avl_nearest(t, where, AVL_AFTER);
+		ASSERT(ss != NULL);
+	}
+
+	if (ss != NULL) {
+		if (ss->ss_start + size <= ss->ss_end) {
+			*cursor = ss->ss_start + size;
+			return (ss->ss_start);
+		}
+	}
+	return (-1ULL);
+}
+
+static boolean_t
+metaslab_ndf_fragmented(space_map_t *sm)
+{
+	uint64_t max_size = metaslab_pp_maxsize(sm);
+
+	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+
+static space_map_ops_t metaslab_ndf_ops = {
+	metaslab_pp_load,
+	metaslab_pp_unload,
+	metaslab_ndf_alloc,
+	metaslab_pp_claim,
+	metaslab_pp_free,
+	metaslab_pp_maxsize,
+	metaslab_ndf_fragmented
 };
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
 
 /*
  * ==========================================================================
@@ -522,7 +667,6 @@ metaslab_fini(metaslab_t *msp)
 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
 #define	METASLAB_ACTIVE_MASK		\
 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-#define	METASLAB_SMO_BONUS_MULTIPLIER	2
 
 static uint64_t
 metaslab_weight(metaslab_t *msp)
@@ -555,25 +699,60 @@ metaslab_weight(metaslab_t *msp)
 	ASSERT(weight >= space && weight <= 2 * space);
 
 	/*
-	 * For locality, assign higher weight to metaslabs we've used before.
+	 * For locality, assign higher weight to metaslabs which have
+	 * a lower offset than what we've already activated.
 	 */
-	if (smo->smo_object != 0)
-		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+	if (sm->sm_start <= mg->mg_bonus_area)
+		weight *= (metaslab_smo_bonus_pct / 100);
 	ASSERT(weight >= space &&
-	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
+
+	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
+		/*
+		 * If this metaslab is one we're actively using, adjust its
+		 * weight to make it preferable to any inactive metaslab so
+		 * we'll polish it off.
+		 */
+		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+	}
+	return (weight);
+}
+
+static void
+metaslab_prefetch(metaslab_group_t *mg)
+{
+	spa_t *spa = mg->mg_vd->vdev_spa;
+	metaslab_t *msp;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	int m;
+
+	mutex_enter(&mg->mg_lock);
 
 	/*
-	 * If this metaslab is one we're actively using, adjust its weight to
-	 * make it preferable to any inactive metaslab so we'll polish it off.
+	 * Prefetch the next potential metaslabs
 	 */
-	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
+		space_map_t *sm = &msp->ms_map;
+		space_map_obj_t *smo = &msp->ms_smo;
 
-	return (weight);
+		/* If we have reached our prefetch limit then we're done */
+		if (m >= metaslab_prefetch_limit)
+			break;
+
+		if (!sm->sm_loaded && smo->smo_object != 0) {
+			mutex_exit(&mg->mg_lock);
+			dmu_prefetch(spa->spa_meta_objset, smo->smo_object,
+			    0ULL, smo->smo_objsize);
+			mutex_enter(&mg->mg_lock);
+		}
+	}
+	mutex_exit(&mg->mg_lock);
 }
 
 static int
 metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 {
+	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 
@@ -588,6 +767,15 @@ metaslab_activate(metaslab_t *msp, uint6
 		}
 
 		/*
+		 * Track the bonus area as we activate new metaslabs.
+		 */
+		if (sm->sm_start > mg->mg_bonus_area) {
+			mutex_enter(&mg->mg_lock);
+			mg->mg_bonus_area = sm->sm_start;
+			mutex_exit(&mg->mg_lock);
+		}
+
+		/*
 		 * If we were able to load the map then make sure
 		 * that this map is still able to satisfy our request.
 		 */
@@ -773,6 +961,32 @@ metaslab_sync_done(metaslab_t *msp, uint
 	mutex_exit(&msp->ms_lock);
 }
 
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+
+	/*
+	 * Re-evaluate all metaslabs which have lower offsets than the
+	 * bonus area.
+	 */
+	for (int m = 0; m < vd->vdev_ms_count; m++) {
+		metaslab_t *msp = vd->vdev_ms[m];
+
+		if (msp->ms_map.sm_start > mg->mg_bonus_area)
+			break;
+
+		mutex_enter(&msp->ms_lock);
+		metaslab_group_sort(mg, msp, metaslab_weight(msp));
+		mutex_exit(&msp->ms_lock);
+	}
+
+	/*
+	 * Prefetch the next potential metaslabs
+	 */
+	metaslab_prefetch(mg);
+}
+
 static uint64_t
 metaslab_distance(metaslab_t *msp, dva_t *dva)
 {
@@ -868,7 +1082,7 @@ metaslab_group_alloc(metaslab_group_t *m
 		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
 			break;
 
-		metaslab_passivate(msp, size - 1);
+		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
 	}

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	Wed Sep 15 16:14:38 2010	(r212671)
@@ -74,35 +74,38 @@ enum zti_modes {
 	zti_mode_fixed,			/* value is # of threads (min 1) */
 	zti_mode_online_percent,	/* value is % of online CPUs */
 	zti_mode_tune,			/* fill from zio_taskq_tune_* */
+	zti_mode_null,			/* don't create a taskq */
 	zti_nmodes
 };
 
-#define	ZTI_THREAD_FIX(n)	{ zti_mode_fixed, (n) }
-#define	ZTI_THREAD_PCT(n)	{ zti_mode_online_percent, (n) }
-#define	ZTI_THREAD_TUNE		{ zti_mode_tune, 0 }
+#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
+#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
+#define	ZTI_TUNE	{ zti_mode_tune, 0 }
+#define	ZTI_NULL	{ zti_mode_null, 0 }
 
-#define	ZTI_THREAD_ONE		ZTI_THREAD_FIX(1)
+#define	ZTI_ONE		ZTI_FIX(1)
 
 typedef struct zio_taskq_info {
-	const char *zti_name;
-	struct {
-		enum zti_modes zti_mode;
-		uint_t zti_value;
-	} zti_nthreads[ZIO_TASKQ_TYPES];
+	enum zti_modes zti_mode;
+	uint_t zti_value;
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
-				"issue",		"intr"
+		"issue", "issue_high", "intr", "intr_high"
 };
 
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
-	/*			ISSUE			INTR		*/
-	{ "spa_zio_null",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
-	{ "spa_zio_read",	{ ZTI_THREAD_FIX(8),	ZTI_THREAD_TUNE } },
-	{ "spa_zio_write",	{ ZTI_THREAD_TUNE,	ZTI_THREAD_FIX(8) } },
-	{ "spa_zio_free",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
-	{ "spa_zio_claim",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
-	{ "spa_zio_ioctl",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
+/*
+ * Define the taskq threads for the following I/O types:
+ * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
+	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_TUNE,	ZTI_NULL },
+	{ ZTI_TUNE,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 };
 
 enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
@@ -581,14 +584,14 @@ spa_activate(spa_t *spa, int mode)
 	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
-		const zio_taskq_info_t *ztip = &zio_taskqs[t];
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
-			uint_t value = ztip->zti_nthreads[q].zti_value;
+			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+			enum zti_modes mode = ztip->zti_mode;
+			uint_t value = ztip->zti_value;
 			char name[32];
 
 			(void) snprintf(name, sizeof (name),
-			    "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
 
 			if (mode == zti_mode_tune) {
 				mode = zio_taskq_tune_mode;
@@ -613,6 +616,10 @@ spa_activate(spa_t *spa, int mode)
 				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
 				break;
 
+			case zti_mode_null:
+				spa->spa_zio_taskq[t][q] = NULL;
+				break;
+
 			case zti_mode_tune:
 			default:
 				panic("unrecognized mode for "
@@ -659,7 +666,8 @@ spa_deactivate(spa_t *spa)
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			taskq_destroy(spa->spa_zio_taskq[t][q]);
+			if (spa->spa_zio_taskq[t][q] != NULL)
+				taskq_destroy(spa->spa_zio_taskq[t][q]);
 			spa->spa_zio_taskq[t][q] = NULL;
 		}
 	}

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	Wed Sep 15 16:14:38 2010	(r212671)
@@ -368,10 +368,8 @@ space_map_unload(space_map_t *sm)
 uint64_t
 space_map_maxsize(space_map_t *sm)
 {
-	if (sm->sm_loaded && sm->sm_ops != NULL)
-		return (sm->sm_ops->smop_max(sm));
-	else
-		return (-1ULL);
+	ASSERT(sm->sm_ops != NULL);
+	return (sm->sm_ops->smop_max(sm));
 }
 
 uint64_t

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	Wed Sep 15 16:14:38 2010	(r212671)
@@ -46,6 +46,7 @@ extern metaslab_t *metaslab_init(metasla
 extern void metaslab_fini(metaslab_t *msp);
 extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_reassess(metaslab_group_t *mg);
 
 #define	METASLAB_HINTBP_FAVOR	0x0
 #define	METASLAB_HINTBP_AVOID	0x1

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	Wed Sep 15 16:14:38 2010	(r212671)
@@ -46,6 +46,7 @@ struct metaslab_group {
 	kmutex_t		mg_lock;
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
+	uint64_t		mg_bonus_area;
 	int64_t			mg_bias;
 	metaslab_class_t	*mg_class;
 	vdev_t			*mg_vd;

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	Wed Sep 15 16:14:38 2010	(r212671)
@@ -87,7 +87,9 @@ typedef enum spa_log_state {
 
 enum zio_taskq_type {
 	ZIO_TASKQ_ISSUE = 0,
+	ZIO_TASKQ_ISSUE_HIGH,
 	ZIO_TASKQ_INTERRUPT,
+	ZIO_TASKQ_INTERRUPT_HIGH,
 	ZIO_TASKQ_TYPES
 };
 

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	Wed Sep 15 16:14:38 2010	(r212671)
@@ -77,6 +77,7 @@ struct space_map_ops {
 	void	(*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
 	void	(*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
 	uint64_t (*smop_max)(space_map_t *sm);
+	boolean_t (*smop_fragmented)(space_map_t *sm);
 };
 
 /*

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Wed Sep 15 16:14:38 2010	(r212671)
@@ -107,14 +107,15 @@ enum zio_compress {
 #define	ZIO_PRIORITY_NOW		(zio_priority_table[0])
 #define	ZIO_PRIORITY_SYNC_READ		(zio_priority_table[1])
 #define	ZIO_PRIORITY_SYNC_WRITE		(zio_priority_table[2])
-#define	ZIO_PRIORITY_ASYNC_READ		(zio_priority_table[3])
-#define	ZIO_PRIORITY_ASYNC_WRITE	(zio_priority_table[4])
-#define	ZIO_PRIORITY_FREE		(zio_priority_table[5])
-#define	ZIO_PRIORITY_CACHE_FILL		(zio_priority_table[6])
-#define	ZIO_PRIORITY_LOG_WRITE		(zio_priority_table[7])
-#define	ZIO_PRIORITY_RESILVER		(zio_priority_table[8])
-#define	ZIO_PRIORITY_SCRUB		(zio_priority_table[9])
-#define	ZIO_PRIORITY_TABLE_SIZE		10
+#define	ZIO_PRIORITY_LOG_WRITE		(zio_priority_table[3])
+#define	ZIO_PRIORITY_CACHE_FILL		(zio_priority_table[4])
+#define	ZIO_PRIORITY_AGG		(zio_priority_table[5])
+#define	ZIO_PRIORITY_FREE		(zio_priority_table[6])
+#define	ZIO_PRIORITY_ASYNC_WRITE	(zio_priority_table[7])
+#define	ZIO_PRIORITY_ASYNC_READ		(zio_priority_table[8])
+#define	ZIO_PRIORITY_RESILVER		(zio_priority_table[9])
+#define	ZIO_PRIORITY_SCRUB		(zio_priority_table[10])
+#define	ZIO_PRIORITY_TABLE_SIZE		11
 
 #define	ZIO_FLAG_MUSTSUCCEED		0x00000
 #define	ZIO_FLAG_CANFAIL		0x00001

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	Wed Sep 15 16:14:38 2010	(r212671)
@@ -1773,9 +1773,13 @@ void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	metaslab_t *msp;
+	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
 		metaslab_sync_done(msp, txg);
+
+	if (reassess)
+		metaslab_sync_reassess(vd->vdev_mg);
 }
 
 void

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	Wed Sep 15 16:14:38 2010	(r212671)
@@ -233,7 +233,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq,
 		ASSERT(size <= zfs_vdev_aggregation_limit);
 
 		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
+		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
 		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 		    vdev_queue_agg_io_done, NULL);
 

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Wed Sep 15 16:10:38 2010	(r212670)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Wed Sep 15 16:14:38 2010	(r212671)
@@ -49,11 +49,12 @@ uint8_t zio_priority_table[ZIO_PRIORITY_
 	0,	/* ZIO_PRIORITY_NOW		*/
 	0,	/* ZIO_PRIORITY_SYNC_READ	*/
 	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
-	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
-	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
-	4,	/* ZIO_PRIORITY_FREE		*/
-	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
 	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
+	1,	/* ZIO_PRIORITY_CACHE_FILL	*/
+	1,	/* ZIO_PRIORITY_AGG		*/
+	4,	/* ZIO_PRIORITY_FREE		*/
+	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
+	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
 	10,	/* ZIO_PRIORITY_RESILVER	*/
 	20,	/* ZIO_PRIORITY_SCRUB		*/
 };
@@ -64,7 +65,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_
  * ==========================================================================
  */
 char *zio_type_name[ZIO_TYPES] = {
-	"null", "read", "write", "free", "claim", "ioctl" };
+	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
+	"zio_ioctl"
+};
 
 #define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
 #define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
@@ -942,6 +945,7 @@ zio_write_bp_init(zio_t *zio)
 static void
 zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
 {
+	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
 
 	/*
@@ -958,7 +962,15 @@ zio_taskq_dispatch(zio_t *zio, enum zio_
 	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
 		t = ZIO_TYPE_NULL;
 
-	(void) taskq_dispatch_safe(zio->io_spa->spa_zio_taskq[t][q],
+	/*
+	 * If this is a high priority I/O, then use the high priority taskq.
+	 */
+	if (zio->io_priority == ZIO_PRIORITY_NOW &&
+	    spa->spa_zio_taskq[t][q + 1] != NULL)
+		q++;
+
+	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+	(void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
 	    (task_func_t *)zio_execute, zio, &zio->io_task);
 }
 


More information about the svn-src-stable-8 mailing list