svn commit: r354954 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/zinject

Andriy Gapon <avg@FreeBSD.org>
Thu Nov 21 14:00:12 UTC 2019


Author: avg
Date: Thu Nov 21 14:00:09 2019
New Revision: 354954
URL: https://svnweb.freebsd.org/changeset/base/354954

Log:
  10566 Multiple DVA Scrubbing Fix
  
  illumos/illumos-gate@12a8814c13fbb1d6d58616cf090ea5815dc107f9
  https://github.com/illumos/illumos-gate/commit/12a8814c13fbb1d6d58616cf090ea5815dc107f9
  
  https://www.illumos.org/issues/10566
    ZoL PR 8453
    Author: Tom Caputi <tcaputi@datto.com>
    Date:   Fri Mar 15 17:14:31 2019 -0400
  
        Multiple DVA Scrubbing Fix
  
        Currently, there is an issue in the sequential scrub code which
        prevents self healing from working in some cases. The scrub code
        will split up all DVA copies of a bp and issue each of them
        separately. The problem is that, since each of the DVAs is no
        longer associated with the others, the self healing code doesn't
        have the opportunity to repair problems that show up in one of the
        DVAs with the data from the others.
  
        This patch fixes this issue by ensuring that all IOs issued by the
        sequential scrub code include all DVAs. Initially, only the first
        DVA of each is attempted. If an issue arises, the IO is retried
        with all available copies, giving the self healing code a chance
        to correct the issue.
  
        To test this change, this patch also adds the ability for zinject
        to specify individual DVAs to inject read errors into. We then
        add a new test case that utilizes this functionality to ensure
        scrubs and self-healing reads can handle and transparently fix
        issues with individual copies of blocks.
  
    This update is a follow-up to #10405.
    While porting this update, the following ZoL updates were also included:
  
        551905dd4 vdev_mirror: kstat observables for preferred vdev
        d6c6590c5 vdev_mirror: load balancing fixes
        9f500936c FreeBSD r256956: Improve ZFS N-way mirror read performance by
  
  Portions contributed by: Toomas Soome <tsoome@me.com>
  Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
  Author: Tom Caputi <tcaputi@datto.com>
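
The repair mechanism described in the log is easiest to see in miniature.
Below is a hedged, user-space C sketch of the "issue the first DVA, retry
with all copies on error" policy; every name in it (read_dva, scrub_read)
is an illustrative stand-in, not a kernel interface from this commit:

#include <stdbool.h>
#include <stdio.h>

#define SPA_DVAS_PER_BP 3   /* ZFS stores at most three copies per bp */

/* Hypothetical single-DVA read; pretend the copy at DVA 0 is damaged. */
static bool
read_dva(int dva)
{
    return (dva != 0);
}

/*
 * Issue only the first (sorted) DVA.  If that read fails, retry with
 * every copy so that an intact DVA can be found and used to repair
 * the damaged one, as the sequential scrub now does.
 */
static bool
scrub_read(int ndvas)
{
    if (read_dva(0))
        return (true);
    for (int d = 1; d < ndvas; d++) {
        if (read_dva(d))
            return (true);  /* good copy found; self-heal DVA 0 */
    }
    return (false);
}

int
main(void)
{
    printf("repaired: %d\n", scrub_read(SPA_DVAS_PER_BP));
    return (0);
}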

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio_inject.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zinject/zinject.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c	Thu Nov 21 14:00:09 2019	(r354954)
@@ -249,24 +249,43 @@ typedef enum {
  */
 typedef struct scan_io {
 	/* fields from blkptr_t */
-	uint64_t		sio_offset;
 	uint64_t		sio_blk_prop;
 	uint64_t		sio_phys_birth;
 	uint64_t		sio_birth;
 	zio_cksum_t		sio_cksum;
-	uint32_t		sio_asize;
+	uint32_t		sio_nr_dvas;
 
 	/* fields from zio_t */
-	int			sio_flags;
+	uint32_t		sio_flags;
 	zbookmark_phys_t	sio_zb;
 
 	/* members for queue sorting */
 	union {
-		avl_node_t	sio_addr_node; /* link into issueing queue */
+		avl_node_t	sio_addr_node; /* link into issuing queue */
 		list_node_t	sio_list_node; /* link for issuing to disk */
 	} sio_nodes;
+
+	/*
+	 * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
+	 * depending on how many were in the original bp. Only the
+	 * first DVA is really used for sorting and issuing purposes.
+	 * The other DVAs (if provided) simply exist so that the zio
+	 * layer can find additional copies to repair from in the
+	 * event of an error. This array must go at the end of the
+	 * struct to allow this for the variable number of elements.
+	 */
+	dva_t			sio_dva[0];
 } scan_io_t;
 
+#define	SIO_SET_OFFSET(sio, x)		DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
+#define	SIO_SET_ASIZE(sio, x)		DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
+#define	SIO_GET_OFFSET(sio)		DVA_GET_OFFSET(&(sio)->sio_dva[0])
+#define	SIO_GET_ASIZE(sio)		DVA_GET_ASIZE(&(sio)->sio_dva[0])
+#define	SIO_GET_END_OFFSET(sio)		\
+	(SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
+#define	SIO_GET_MUSED(sio)		\
+	(sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
+
 struct dsl_scan_io_queue {
 	dsl_scan_t	*q_scn; /* associated dsl_scan_t */
 	vdev_t		*q_vd; /* top-level vdev that this queue represents */
@@ -275,6 +294,7 @@ struct dsl_scan_io_queue {
 	range_tree_t	*q_exts_by_addr;
 	avl_tree_t	q_exts_by_size;
 	avl_tree_t	q_sios_by_addr;
+	uint64_t	q_sio_memused;
 
 	/* members for zio rate limiting */
 	uint64_t	q_maxinflight_bytes;
@@ -313,8 +333,28 @@ static void scan_io_queue_insert_impl(dsl_scan_io_queu
 static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
 static void scan_io_queues_destroy(dsl_scan_t *scn);
 
-static kmem_cache_t *sio_cache;
+static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
 
+/* sio->sio_nr_dvas must be set so we know which cache to free from */
+static void
+sio_free(scan_io_t *sio)
+{
+	ASSERT3U(sio->sio_nr_dvas, >, 0);
+	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+	kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
+}
+
+/* It is up to the caller to set sio->sio_nr_dvas for freeing */
+static scan_io_t *
+sio_alloc(unsigned short nr_dvas)
+{
+	ASSERT3U(nr_dvas, >, 0);
+	ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
+
+	return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
+}
+
 void
 scan_init(void)
 {
@@ -328,14 +368,22 @@ scan_init(void)
 	 */
 	fill_weight = zfs_scan_fill_weight;
 
-	sio_cache = kmem_cache_create("sio_cache",
-	    sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+		char name[36];
+
+		(void) sprintf(name, "sio_cache_%d", i);
+		sio_cache[i] = kmem_cache_create(name,
+		    (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
+		    0, NULL, NULL, NULL, NULL, NULL, 0);
+	}
 }
 
 void
 scan_fini(void)
 {
-	kmem_cache_destroy(sio_cache);
+	for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+		kmem_cache_destroy(sio_cache[i]);
+	}
 }
 
 static inline boolean_t
@@ -352,29 +400,39 @@ dsl_scan_resilvering(dsl_pool_t *dp)
 }
 
 static inline void
-sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+sio2bp(const scan_io_t *sio, blkptr_t *bp)
 {
 	bzero(bp, sizeof (*bp));
-	DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
-	DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
-	DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
 	bp->blk_prop = sio->sio_blk_prop;
 	bp->blk_phys_birth = sio->sio_phys_birth;
 	bp->blk_birth = sio->sio_birth;
 	bp->blk_fill = 1;	/* we always only work with data pointers */
 	bp->blk_cksum = sio->sio_cksum;
+
+	ASSERT3U(sio->sio_nr_dvas, >, 0);
+	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+	bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
 }
 
 static inline void
 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
 {
-	/* we discard the vdev id, since we can deduce it from the queue */
-	sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
-	sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
 	sio->sio_blk_prop = bp->blk_prop;
 	sio->sio_phys_birth = bp->blk_phys_birth;
 	sio->sio_birth = bp->blk_birth;
 	sio->sio_cksum = bp->blk_cksum;
+	sio->sio_nr_dvas = BP_GET_NDVAS(bp);
+
+	/*
+	 * Copy the DVAs to the sio. We need all copies of the block so
+	 * that the self healing code can use the alternate copies if the
+	 * first is corrupted. We want the DVA at index dva_i to be first
+	 * in the sio since this is the primary one that we want to issue.
+	 */
+	for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
+		sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
+	}
 }
 
 int
@@ -1076,11 +1134,9 @@ dsl_scan_should_clear(dsl_scan_t *scn)
 		mutex_enter(&tvd->vdev_scan_io_queue_lock);
 		queue = tvd->vdev_scan_io_queue;
 		if (queue != NULL) {
-			/* #extents in exts_by_size = # in exts_by_addr */
+			/* # extents in exts_by_size = # in exts_by_addr */
 			mused += avl_numnodes(&queue->q_exts_by_size) *
-			    sizeof (range_seg_t) +
-			    avl_numnodes(&queue->q_sios_by_addr) *
-			    sizeof (scan_io_t);
+			    sizeof (range_seg_t) + queue->q_sio_memused;
 		}
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
@@ -2546,13 +2602,13 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t
 			break;
 		}
 
-		sio2bp(sio, &bp, queue->q_vd->vdev_id);
-		bytes_issued += sio->sio_asize;
+		sio2bp(sio, &bp);
+		bytes_issued += SIO_GET_ASIZE(sio);
 		scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
 		    &sio->sio_zb, queue);
 		(void) list_remove_head(io_list);
 		scan_io_queues_update_zio_stats(queue, &bp);
-		kmem_free(sio, sizeof (*sio));
+		sio_free(sio);
 	}
 
 	atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
@@ -2569,7 +2625,7 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t
 static boolean_t
 scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
 {
-	scan_io_t srch_sio, *sio, *next_sio;
+	scan_io_t *srch_sio, *sio, *next_sio;
 	avl_index_t idx;
 	uint_t num_sios = 0;
 	int64_t bytes_issued = 0;
@@ -2577,24 +2633,30 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range
 	ASSERT(rs != NULL);
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
-	srch_sio.sio_offset = rs->rs_start;
+	srch_sio = sio_alloc(1);
+	srch_sio->sio_nr_dvas = 1;
+	SIO_SET_OFFSET(srch_sio, rs->rs_start);
 
 	/*
 	 * The exact start of the extent might not contain any matching zios,
 	 * so if that's the case, examine the next one in the tree.
 	 */
-	sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+	sio_free(srch_sio);
+
 	if (sio == NULL)
 		sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
 
-	while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
-		ASSERT3U(sio->sio_offset, >=, rs->rs_start);
-		ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+	while (sio != NULL &&
+	    SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
+		ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
+		ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
 
 		next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
 		avl_remove(&queue->q_sios_by_addr, sio);
+		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 
-		bytes_issued += sio->sio_asize;
+		bytes_issued += SIO_GET_ASIZE(sio);
 		num_sios++;
 		list_insert_tail(list, sio);
 		sio = next_sio;
@@ -2606,11 +2668,11 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range
 	 * in the segment we update it to reflect the work we were able to
 	 * complete. Otherwise, we remove it from the range tree entirely.
 	 */
-	if (sio != NULL && sio->sio_offset < rs->rs_end) {
+	if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
 		range_tree_adjust_fill(queue->q_exts_by_addr, rs,
 		    -bytes_issued);
 		range_tree_resize_segment(queue->q_exts_by_addr, rs,
-		    sio->sio_offset, rs->rs_end - sio->sio_offset);
+		    SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
 
 		return (B_TRUE);
 	} else {
@@ -2715,9 +2777,9 @@ scan_io_queues_run_one(void *arg)
 			first_sio = list_head(&sio_list);
 			last_sio = list_tail(&sio_list);
 
-			seg_end = last_sio->sio_offset + last_sio->sio_asize;
+			seg_end = SIO_GET_END_OFFSET(last_sio);
 			if (seg_start == 0)
-				seg_start = first_sio->sio_offset;
+				seg_start = SIO_GET_OFFSET(first_sio);
 
 			/*
 			 * Issuing sios can take a long time so drop the
@@ -3369,10 +3431,23 @@ count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, 
 {
 	int i;
 
-	/* update the spa's stats on how many bytes we have issued */
-	for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+	/*
+	 * Update the spa's stats on how many bytes we have issued.
+	 * Sequential scrubs create a zio for each DVA of the bp. Each
+	 * of these will include all DVAs for repair purposes, but the
+	 * zio code will only try the first one unless there is an issue.
+	 * Therefore, we should only count the first DVA for these IOs.
+	 */
+	if (scn->scn_is_sorted) {
 		atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
-		    DVA_GET_ASIZE(&bp->blk_dva[i]));
+		    DVA_GET_ASIZE(&bp->blk_dva[0]));
+	} else {
+		spa_t *spa = scn->scn_dp->dp_spa;
+
+		for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+			atomic_add_64(&spa->spa_scan_pass_issued,
+			    DVA_GET_ASIZE(&bp->blk_dva[i]));
+		}
 	}
 
 	/*
@@ -3426,7 +3501,7 @@ static void
 scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
 {
 	avl_index_t idx;
-	int64_t asize = sio->sio_asize;
+	int64_t asize = SIO_GET_ASIZE(sio);
 	dsl_scan_t *scn = queue->q_scn;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3434,11 +3509,12 @@ scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, 
 	if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
 		/* block is already scheduled for reading */
 		atomic_add_64(&scn->scn_bytes_pending, -asize);
-		kmem_free(sio, sizeof (*sio));
+		sio_free(sio);
 		return;
 	}
 	avl_insert(&queue->q_sios_by_addr, sio, idx);
-	range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
+	queue->q_sio_memused += SIO_GET_MUSED(sio);
+	range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
 }
 
 /*
@@ -3452,7 +3528,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const
     int zio_flags, const zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = queue->q_scn;
-	scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP);
+	scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
 
 	ASSERT0(BP_IS_GANG(bp));
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
@@ -3466,7 +3542,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const
 	 * get an integer underflow in case the worker processes the
 	 * zio before we get to incrementing this counter.
 	 */
-	atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+	atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
 
 	scan_io_queue_insert_impl(queue, sio);
 }
@@ -3699,15 +3775,11 @@ ext_size_compare(const void *x, const void *y)
  * based on LBA-order (from lowest to highest).
  */
 static int
-io_addr_compare(const void *x, const void *y)
+sio_addr_compare(const void *x, const void *y)
 {
 	const scan_io_t *a = x, *b = y;
 
-	if (a->sio_offset < b->sio_offset)
-		return (-1);
-	if (a->sio_offset == b->sio_offset)
-		return (0);
-	return (1);
+	return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
 }
 
 /* IO queues are created on demand when they are needed. */
@@ -3719,10 +3791,11 @@ scan_io_queue_create(vdev_t *vd)
 
 	q->q_scn = scn;
 	q->q_vd = vd;
+	q->q_sio_memused = 0;
 	cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
 	q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
 	    &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
-	avl_create(&q->q_sios_by_addr, io_addr_compare,
+	avl_create(&q->q_sios_by_addr, sio_addr_compare,
 	    sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
 
 	return (q);
@@ -3746,11 +3819,13 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
 	while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
 	    NULL) {
 		ASSERT(range_tree_contains(queue->q_exts_by_addr,
-		    sio->sio_offset, sio->sio_asize));
-		bytes_dequeued += sio->sio_asize;
-		kmem_free(sio, sizeof (*sio));
+		    SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
+		bytes_dequeued += SIO_GET_ASIZE(sio);
+		queue->q_sio_memused -= SIO_GET_MUSED(sio);
+		sio_free(sio);
 	}
 
+	ASSERT0(queue->q_sio_memused);
 	atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
 	range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
 	range_tree_destroy(queue->q_exts_by_addr);
@@ -3805,7 +3880,7 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int
 	vdev_t *vdev;
 	kmutex_t *q_lock;
 	dsl_scan_io_queue_t *queue;
-	scan_io_t srch, *sio;
+	scan_io_t *srch_sio, *sio;
 	avl_index_t idx;
 	uint64_t start, size;
 
@@ -3820,9 +3895,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int
 		return;
 	}
 
-	bp2sio(bp, &srch, dva_i);
-	start = srch.sio_offset;
-	size = srch.sio_asize;
+	srch_sio = sio_alloc(BP_GET_NDVAS(bp));
+	bp2sio(bp, srch_sio, dva_i);
+	start = SIO_GET_OFFSET(srch_sio);
+	size = SIO_GET_ASIZE(srch_sio);
 
 	/*
 	 * We can find the zio in two states:
@@ -3842,15 +3918,18 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int
 	 *	be done with issuing the zio's it gathered and will
 	 *	signal us.
 	 */
-	sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+	sio_free(srch_sio);
+
 	if (sio != NULL) {
-		int64_t asize = sio->sio_asize;
+		int64_t asize = SIO_GET_ASIZE(sio);
 		blkptr_t tmpbp;
 
 		/* Got it while it was cold in the queue */
-		ASSERT3U(start, ==, sio->sio_offset);
+		ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
 		ASSERT3U(size, ==, asize);
 		avl_remove(&queue->q_sios_by_addr, sio);
+		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 
 		ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
 		range_tree_remove_fill(queue->q_exts_by_addr, start, size);
@@ -3863,10 +3942,10 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int
 		atomic_add_64(&scn->scn_bytes_pending, -asize);
 
 		/* count the block as though we issued it */
-		sio2bp(sio, &tmpbp, dva_i);
+		sio2bp(sio, &tmpbp);
 		count_block(scn, dp->dp_blkstats, &tmpbp);
 
-		kmem_free(sio, sizeof (*sio));
+		sio_free(sio);
 	}
 	mutex_exit(q_lock);
 }
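
A note on the bp2sio() change above: the loop rotates the block pointer's
DVA array so that the copy being issued lands at sio_dva[0] while the
remaining copies stay behind it for repair. A stand-alone sketch of that
rotation (dva_t reduced to an int purely for illustration):

#include <stdio.h>

static void
rotate_dvas(const int *bp_dva, int *sio_dva, int nr_dvas, int dva_i)
{
    /* Same modular walk as bp2sio(): start at dva_i, wrap around. */
    for (int i = 0, j = dva_i; i < nr_dvas; i++, j++)
        sio_dva[i] = bp_dva[j % nr_dvas];
}

int
main(void)
{
    int bp[3] = { 10, 11, 12 }, sio[3];

    rotate_dvas(bp, sio, 3, 1);     /* issue DVA 1 first */
    printf("%d %d %d\n", sio[0], sio[1], sio[2]);   /* prints: 11 12 10 */
    return (0);
}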

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Thu Nov 21 14:00:09 2019	(r354954)
@@ -2069,7 +2069,7 @@ metaslab_space_weight(metaslab_t *msp)
 	 * In effect, this means that we'll select the metaslab with the most
 	 * free bandwidth rather than simply the one with the most free space.
 	 */
-	if (metaslab_lba_weighting_enabled) {
+	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
 		ASSERT(weight >= space && weight <= 2 * space);
 	}
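
The one-line metaslab.c change stops applying LBA weighting to vdevs that
report themselves as non-rotating, where "outer track" bandwidth has no
meaning. For reference, the weighting it skips scales a metaslab's free
space by position, from 2x at ms_id 0 down to roughly 1x at the last
metaslab; a small sketch of that arithmetic:

#include <inttypes.h>
#include <stdio.h>

/* weight = 2*space - (ms_id * space) / ms_count, per the code above */
static uint64_t
lba_weight(uint64_t space, uint64_t ms_id, uint64_t ms_count)
{
    return (2 * space - (ms_id * space) / ms_count);
}

int
main(void)
{
    /* 1 GiB free: the outermost metaslab weighs ~2x, the innermost ~1x */
    printf("%" PRIu64 " %" PRIu64 "\n",
        lba_weight(1ULL << 30, 0, 200), lba_weight(1ULL << 30, 199, 200));
    return (0);
}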

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c	Thu Nov 21 14:00:09 2019	(r354954)
@@ -2036,6 +2036,7 @@ spa_init(int mode)
 	dmu_init();
 	zil_init();
 	vdev_cache_stat_init();
+	vdev_mirror_stat_init();
 	zfs_prop_init();
 	zpool_prop_init();
 	zpool_feature_init();
@@ -2052,6 +2053,7 @@ spa_fini(void)
 	spa_evict_all();
 
 	vdev_cache_stat_fini();
+	vdev_mirror_stat_fini();
 	zil_fini();
 	dmu_fini();
 	zio_fini();

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h	Thu Nov 21 14:00:09 2019	(r354954)
@@ -907,6 +907,10 @@ extern void spa_get_errlists(spa_t *spa, avl_tree_t *l
 extern void vdev_cache_stat_init(void);
 extern void vdev_cache_stat_fini(void);
 
+/* vdev mirror */
+extern void vdev_mirror_stat_init(void);
+extern void vdev_mirror_stat_fini(void);
+
 /* Initialization and termination */
 extern void spa_init(int flags);
 extern void spa_fini(void);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h	Thu Nov 21 14:00:09 2019	(r354954)
@@ -139,6 +139,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
 extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h	Thu Nov 21 14:00:09 2019	(r354954)
@@ -225,6 +225,7 @@ struct vdev {
 	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
 	boolean_t	vdev_expanding;	/* expand the vdev?		*/
 	boolean_t	vdev_reopening;	/* reopen in progress?		*/
+	boolean_t	vdev_nonrot;	/* true if solid state		*/
 	int		vdev_open_error; /* error on last open		*/
 	kthread_t	*vdev_open_thread; /* thread opening children	*/
 	uint64_t	vdev_crtxg;	/* txg when top-level was added */

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h	Thu Nov 21 14:00:09 2019	(r354954)
@@ -318,12 +318,14 @@ typedef struct zinject_record {
 	uint64_t	zi_timer;
 	uint64_t	zi_nlanes;
 	uint32_t	zi_cmd;
-	uint32_t	zi_pad;
+	uint32_t	zi_dvas;
 } zinject_record_t;
 
 #define	ZINJECT_NULL		0x1
 #define	ZINJECT_FLUSH_ARC	0x2
 #define	ZINJECT_UNLOAD_SPA	0x4
+
+#define	ZI_NO_DVA		(-1)
 
 typedef enum zinject_type {
 	ZINJECT_UNINITIALIZED,
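
zi_dvas takes over the old zi_pad slot so an injection handler can be
limited to particular copies of a block. In the ZoL change this is ported
from, the field is treated as a bitmask of DVA indices; the sketch below
assumes that encoding, and zi_dva_match() is a hypothetical name rather
than a function from this commit:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ZI_NO_DVA   (-1)

/*
 * Assumed encoding: bit i set in zi_dvas means "inject into DVA i";
 * zero means the handler applies to every DVA.
 */
static bool
zi_dva_match(uint32_t zi_dvas, int dva)
{
    if (zi_dvas == 0)
        return (true);      /* no DVA restriction configured */
    if (dva == ZI_NO_DVA)
        return (false);     /* DVA unknown; a restricted handler skips it */
    return ((zi_dvas & (1U << dva)) != 0);
}

int
main(void)
{
    printf("%d %d\n", zi_dva_match(0x1, 0), zi_dva_match(0x1, 1)); /* 1 0 */
    return (0);
}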

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c	Thu Nov 21 14:00:09 2019	(r354954)
@@ -1479,19 +1479,27 @@ vdev_open_children(vdev_t *vd)
 	 * spa_namespace_lock
 	 */
 	if (vdev_uses_zvols(vd)) {
+retry_sync:
 		for (int c = 0; c < children; c++)
 			vd->vdev_child[c]->vdev_open_error =
 			    vdev_open(vd->vdev_child[c]);
-		return;
+	} else {
+		tq = taskq_create("vdev_open", children, minclsyspri,
+		    children, children, TASKQ_PREPOPULATE);
+		if (tq == NULL)
+			goto retry_sync;
+
+		for (int c = 0; c < children; c++)
+			VERIFY(taskq_dispatch(tq, vdev_open_child,
+			    vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+
+		taskq_destroy(tq);
 	}
-	tq = taskq_create("vdev_open", children, minclsyspri,
-	    children, children, TASKQ_PREPOPULATE);
 
-	for (int c = 0; c < children; c++)
-		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
-		    TQ_SLEEP) != TASKQID_INVALID);
+	vd->vdev_nonrot = B_TRUE;
 
-	taskq_destroy(tq);
+	for (int c = 0; c < children; c++)
+		vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
 }
 
 /*

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c	Thu Nov 21 14:00:09 2019	(r354954)
@@ -606,6 +606,16 @@ skip_open:
 	 */
 	vd->vdev_nowritecache = B_FALSE;
 
+	/* Inform the ZIO pipeline that we are non-rotational */
+	vd->vdev_nonrot = B_FALSE;
+	if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+	    "device-solid-state")) {
+		if (ldi_prop_get_int(dvd->vd_lh,
+		    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+		    "device-solid-state", B_FALSE) != 0)
+			vd->vdev_nonrot = B_TRUE;
+	}
+
 	return (0);
 }
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c	Thu Nov 21 14:00:09 2019	(r354954)
@@ -58,6 +58,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *
 	vattr_t vattr;
 	int error;
 
+	/* Rotational optimizations only make sense on block devices */
+	vd->vdev_nonrot = B_TRUE;
+
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c	Thu Nov 21 13:59:06 2019	(r354953)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c	Thu Nov 21 14:00:09 2019	(r354954)
@@ -38,6 +38,65 @@
 #include <sys/fs/zfs.h>
 
 /*
+ * Vdev mirror kstats
+ */
+static kstat_t *mirror_ksp = NULL;
+
+typedef struct mirror_stats {
+	kstat_named_t vdev_mirror_stat_rotating_linear;
+	kstat_named_t vdev_mirror_stat_rotating_offset;
+	kstat_named_t vdev_mirror_stat_rotating_seek;
+	kstat_named_t vdev_mirror_stat_non_rotating_linear;
+	kstat_named_t vdev_mirror_stat_non_rotating_seek;
+
+	kstat_named_t vdev_mirror_stat_preferred_found;
+	kstat_named_t vdev_mirror_stat_preferred_not_found;
+} mirror_stats_t;
+
+static mirror_stats_t mirror_stats = {
+	/* New I/O follows directly the last I/O */
+	{ "rotating_linear",			KSTAT_DATA_UINT64 },
+	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
+	{ "rotating_offset",			KSTAT_DATA_UINT64 },
+	/* New I/O requires random seek */
+	{ "rotating_seek",			KSTAT_DATA_UINT64 },
+	/* New I/O follows directly the last I/O  (nonrot) */
+	{ "non_rotating_linear",		KSTAT_DATA_UINT64 },
+	/* New I/O requires random seek (nonrot) */
+	{ "non_rotating_seek",			KSTAT_DATA_UINT64 },
+	/* Preferred child vdev found */
+	{ "preferred_found",			KSTAT_DATA_UINT64 },
+	/* Preferred child vdev not found or equal load  */
+	{ "preferred_not_found",		KSTAT_DATA_UINT64 },
+
+};
+
+#define	MIRROR_STAT(stat)		(mirror_stats.stat.value.ui64)
+#define	MIRROR_INCR(stat, val)		atomic_add_64(&MIRROR_STAT(stat), val)
+#define	MIRROR_BUMP(stat)		MIRROR_INCR(stat, 1)
+
+void
+vdev_mirror_stat_init(void)
+{
+	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
+	    "misc", KSTAT_TYPE_NAMED,
+	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (mirror_ksp != NULL) {
+		mirror_ksp->ks_data = &mirror_stats;
+		kstat_install(mirror_ksp);
+	}
+}
+
+void
+vdev_mirror_stat_fini(void)
+{
+	if (mirror_ksp != NULL) {
+		kstat_delete(mirror_ksp);
+		mirror_ksp = NULL;
+	}
+}
+
+/*
  * Virtual device vector for mirroring.
  */
 
@@ -45,48 +104,182 @@ typedef struct mirror_child {
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
 	int		mc_error;
+	int		mc_load;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
 } mirror_child_t;
 
 typedef struct mirror_map {
+	int		*mm_preferred;
+	int		mm_preferred_cnt;
 	int		mm_children;
 	int		mm_resilvering;
-	int		mm_preferred;
 	int		mm_root;
-	mirror_child_t	mm_child[1];
+	mirror_child_t	mm_child[];
 } mirror_map_t;
 
 int vdev_mirror_shift = 21;
 
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs which are more likely
+ * to have a higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration. */
+static int zfs_vdev_mirror_non_rotating_inc = 0;
+static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+	return (offsetof(mirror_map_t, mm_child[children]) +
+	    sizeof (int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+	mirror_map_t *mm;
+
+	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+	mm->mm_children = children;
+	mm->mm_resilvering = resilvering;
+	mm->mm_root = root;
+	mm->mm_preferred = (int *)((uintptr_t)mm +
+	    offsetof(mirror_map_t, mm_child[children]));
+
+	return (mm);
+}
+
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
-	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
 }
 
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
-	vdev_mirror_map_free,
-	zio_vsd_default_cksum_report
+	.vsd_free = vdev_mirror_map_free,
+	.vsd_cksum_report = zio_vsd_default_cksum_report
 };
 
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+	uint64_t last_offset;
+	int64_t offset_diff;
+	int load;
+
+	/* All DVAs have equal weight at the root. */
+	if (mm->mm_root)
+		return (INT_MAX);
+
+	/*
+	 * We don't return INT_MAX if the device is resilvering i.e.
+	 * vdev_resilver_txg != 0 as when tested performance was slightly
+	 * worse overall when resilvering with compared to without.
+	 */
+
+	/* Fix zio_offset for leaf vdevs */
+	if (vd->vdev_ops->vdev_op_leaf)
+		zio_offset += VDEV_LABEL_START_SIZE;
+
+	/* Standard load based on pending queue length. */
+	load = vdev_queue_length(vd);
+	last_offset = vdev_queue_last_offset(vd);
+
+	if (vd->vdev_nonrot) {
+		/* Non-rotating media. */
+		if (last_offset == zio_offset) {
+			MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
+			return (load + zfs_vdev_mirror_non_rotating_inc);
+		}
+
+		/*
+		 * Apply a seek penalty even for non-rotating devices as
+		 * sequential I/O's can be aggregated into fewer operations on
+		 * the device, thus avoiding unnecessary per-command overhead
+		 * and boosting performance.
+		 */
+		MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
+		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
+	}
+
+	/* Rotating media I/O's which directly follow the last I/O. */
+	if (last_offset == zio_offset) {
+		MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
+		return (load + zfs_vdev_mirror_rotating_inc);
+	}
+
+	/*
+	 * Apply half the seek increment to I/O's within seek offset
+	 * of the last I/O issued to this vdev as they should incur less
+	 * of a seek increment.
+	 */
+	offset_diff = (int64_t)(last_offset - zio_offset);
+	if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
+		MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
+		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
+	}
+
+	/* Apply the full seek increment to all other I/O's. */
+	MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
+	return (load + zfs_vdev_mirror_rotating_seek_inc);
+}
+
 static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
-	int c, d;
+	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
+		dsl_scan_t *scn = NULL;
 		dva_t dva_copy[SPA_DVAS_PER_BP];
 
-		c = BP_GET_NDVAS(zio->io_bp);
+		if (spa->spa_dsl_pool != NULL) {
+			scn = spa->spa_dsl_pool->dp_scan;
+		}
+		/*
+		 * The sequential scrub code sorts and issues all DVAs
+		 * of a bp separately. Each of these IOs includes all
+		 * original DVA copies so that repairs can be performed
+		 * in the event of an error, but we only actually want
+		 * to check the first DVA since the others will be
+		 * checked by their respective sorted IOs. Only if we
+		 * hit an error will we try all DVAs upon retrying.
+		 *
+		 * Note: This check is safe even if the user switches
+		 * from a legacy scrub to a sequential one in the middle
+		 * of processing, since scn_is_sorted isn't updated until
+		 * all outstanding IOs from the previous scrub pass
+		 * complete.
+		 */
+		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+		    !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
+		    scn != NULL &&
+		    scn->scn_is_sorted &&
+		    dsl_scan_scrubbing(spa->spa_dsl_pool)) {
+			c = 1;
+		} else {
+			c = BP_GET_NDVAS(zio->io_bp);
+		}
 
 		/*
 		 * If we do not trust the pool config, some DVAs might be
@@ -110,24 +303,7 @@ vdev_mirror_map_alloc(zio_t *zio)
 			}
 		}
 
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_resilvering = B_FALSE;
-		mm->mm_preferred = spa_get_random(c);
-		mm->mm_root = B_TRUE;
-
-		/*
-		 * Check the other, lower-index DVAs to see if they're on
-		 * the same vdev as the child we picked.  If they are, use
-		 * them since they are likely to have been allocated from
-		 * the primary metaslab in use at the time, and hence are
-		 * more likely to have locality with single-copy data.
-		 */
-		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
-			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
-				mm->mm_preferred = d;
-		}
-
+		mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 
@@ -135,12 +311,6 @@ vdev_mirror_map_alloc(zio_t *zio)
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
-		int replacing;
-
-		c = vd->vdev_children;
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
 		/*
 		 * If we are resilvering, then we should handle scrub reads
 		 * differently; we shouldn't issue them to the resilvering
@@ -164,25 +334,12 @@ vdev_mirror_map_alloc(zio_t *zio)
 		 * automatically removed from the pool after the user replaces
 		 * the device that originally failed.
 		 */
-		replacing = (vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-		/*
-		 * If a spa load is in progress, then spa_dsl_pool may be
-		 * uninitialized.  But we shouldn't be resilvering during a spa
-		 * load anyway.
-		 */
-		if (replacing &&
-		    (spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE) &&
-		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool)) {
-			mm->mm_resilvering = B_TRUE;
-		} else {
-			mm->mm_resilvering = B_FALSE;
-		}
-
-		mm->mm_preferred = mm->mm_resilvering ? 0 :
-		    (zio->io_offset >> vdev_mirror_shift) % c;
-		mm->mm_root = B_FALSE;
-
+		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+		    vd->vdev_ops == &vdev_spare_ops) &&
+		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+		    B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
@@ -269,6 +426,7 @@ vdev_mirror_scrub_done(zio_t *zio)
 		}
 		mutex_exit(&zio->io_lock);
 	}
+
 	abd_free(zio->io_abd);
 
 	mc->mc_error = zio->io_error;
@@ -277,6 +435,54 @@ vdev_mirror_scrub_done(zio_t *zio)
 }
 
 /*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked.  If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+	dva_t *dva = zio->io_bp->blk_dva;
+	mirror_map_t *mm = zio->io_vsd;
+	int preferred;
+	int c;
+
+	preferred = mm->mm_preferred[p];
+	for (p--; p >= 0; p--) {
+		c = mm->mm_preferred[p];
+		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+			preferred = c;
+	}
+	return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	int p;
+
+	if (mm->mm_root) {
+		p = spa_get_random(mm->mm_preferred_cnt);
+		return (vdev_mirror_dva_select(zio, p));
+	}
+
+	/*
+	 * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSD's, we
+	 * use the I/O offset as a pseudo random seed into the vdevs
+	 * which have the lowest load.
+	 */
+	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+	return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read
+ * preferring vdevs based on determined load.
+ *
  * Try to find a child whose DTL doesn't contain the block we want to read.
  * If we can't, try the read on any vdev we haven't already tried.
  */
@@ -284,43 +490,64 @@ static int
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
-	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
-	int i, c;
+	int c, lowest_load;
 
 	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
-	/*
-	 * Try to find a child whose DTL doesn't contain the block to read.
-	 * If a child is known to be completely inaccessible (indicated by
-	 * vdev_readable() returning B_FALSE), don't even try.
-	 */
-	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
-		if (c >= mm->mm_children)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
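
The diff is cut off just as the rewritten vdev_mirror_child_select() begins.
In the ZoL commits this is ported from (d6c6590c5 and 9f500936c), that
function computes vdev_mirror_load() for each readable child, gathers the
children tied for the lowest load into mm_preferred[], and then picks among
the ties by I/O offset via vdev_mirror_preferred_child_randomize(). A
stand-alone sketch of that selection, with deliberately simplified types:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    int mc_load;    /* would come from vdev_mirror_load() */
} mirror_child_t;

static int
child_select(const mirror_child_t *mc, int children, int *preferred,
    uint64_t zio_offset, int mirror_shift)
{
    int lowest_load = INT_MAX, cnt = 0;

    for (int c = 0; c < children; c++) {
        if (mc[c].mc_load < lowest_load) {
            lowest_load = mc[c].mc_load;
            cnt = 0;
            preferred[cnt++] = c;
        } else if (mc[c].mc_load == lowest_load) {
            preferred[cnt++] = c;
        }
    }
    /* Offset-based pick among ties, per the wear-leveling comment above. */
    return (preferred[(zio_offset >> mirror_shift) % cnt]);
}

int
main(void)
{
    mirror_child_t mc[3] = { { 7 }, { 5 }, { 5 } };
    int preferred[3];

    /* children 1 and 2 tie at load 5; the offset chooses between them */
    printf("child %d\n", child_select(mc, 3, preferred, 1ULL << 22, 21));
    return (0);
}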

