svn commit: r256956 - in head/sys: cam/ata cam/scsi cddl/contrib/opensolaris/uts/common/fs/zfs cddl/contrib/opensolaris/uts/common/fs/zfs/sys geom sys

Steven Hartland smh at FreeBSD.org
Wed Oct 23 09:55:02 UTC 2013


Author: smh
Date: Wed Oct 23 09:54:58 2013
New Revision: 256956
URL: http://svnweb.freebsd.org/changeset/base/256956

Log:
  Improve ZFS N-way mirror read performance by using load and locality
  information.
  
  The existing algorithm selects a preferred leaf vdev based on offset of the zio
  request modulo the number of members in the mirror. It assumes the devices are
  of equal performance and that spreading the requests randomly over both drives
  will be sufficient to saturate them. In practice this results in the leaf vdevs
  being under utilized.
  
  The new algorithm takes into account the following additional factors:
  * Load of the vdevs (number of outstanding I/O requests)
  * The locality of last queued I/O vs the new I/O request.
  
  Within the locality calculation additional knowledge about the underlying vdev
  is considered, such as whether the device backing the vdev is a rotating media
  device.
  
  This results in performance increases across the board as well as significant
  increases for predominantly streaming loads and for configurations which don't
  have evenly performing devices.
  
  The following are results from a setup with 3 Way Mirror with 2 x HD's and
  1 x SSD from a basic test running multiple parallel dd's.
  
  With pre-fetch disabled (vfs.zfs.prefetch_disable=1):
  
  == Stripe Balanced (default) ==
  Read 15360MB using bs: 1048576, readers: 3, took 161 seconds @ 95 MB/s
  == Load Balanced (zfslinux) ==
  Read 15360MB using bs: 1048576, readers: 3, took 297 seconds @ 51 MB/s
  == Load Balanced (locality freebsd) ==
  Read 15360MB using bs: 1048576, readers: 3, took 54 seconds @ 284 MB/s
  
  With pre-fetch enabled (vfs.zfs.prefetch_disable=0):
  
  == Stripe Balanced (default) ==
  Read 15360MB using bs: 1048576, readers: 3, took 91 seconds @ 168 MB/s
  == Load Balanced (zfslinux) ==
  Read 15360MB using bs: 1048576, readers: 3, took 108 seconds @ 142 MB/s
  == Load Balanced (locality freebsd) ==
  Read 15360MB using bs: 1048576, readers: 3, took 48 seconds @ 320 MB/s
  
  In addition to the performance changes the code was also restructured, with
  the help of Justin Gibbs, to provide a more logical flow which also ensures
  vdev loads are only calculated from the set of valid candidates.
  
  The following additional sysctls were added to allow the administrator
  to tune the behaviour of the load algorithm:
  * vfs.zfs.vdev.mirror.rotating_inc
  * vfs.zfs.vdev.mirror.rotating_seek_inc
  * vfs.zfs.vdev.mirror.rotating_seek_offset
  * vfs.zfs.vdev.mirror.non_rotating_inc
  * vfs.zfs.vdev.mirror.non_rotating_seek_inc
  
  These changes were based on work started by the zfsonlinux developers:
  https://github.com/zfsonlinux/zfs/pull/1487
  
  Reviewed by:	gibbs, mav, will
  MFC after:	2 weeks
  Sponsored by:	Multiplay

Modified:
  head/sys/cam/ata/ata_da.c
  head/sys/cam/scsi/scsi_all.h
  head/sys/cam/scsi/scsi_da.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  head/sys/geom/geom.h
  head/sys/geom/geom_disk.c
  head/sys/geom/geom_disk.h
  head/sys/geom/geom_subr.c
  head/sys/sys/ata.h

Modified: head/sys/cam/ata/ata_da.c
==============================================================================
--- head/sys/cam/ata/ata_da.c	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/cam/ata/ata_da.c	Wed Oct 23 09:54:58 2013	(r256956)
@@ -1222,12 +1222,13 @@ adaregister(struct cam_periph *periph, v
 	    "kern.cam.ada.%d.write_cache", periph->unit_number);
 	TUNABLE_INT_FETCH(announce_buf, &softc->write_cache);
 	/* Disable queue sorting for non-rotational media by default. */
-	if (cgd->ident_data.media_rotation_rate == 1)
+	if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING)
 		softc->sort_io_queue = 0;
 	else
 		softc->sort_io_queue = -1;
 	adagetparams(periph, cgd);
 	softc->disk = disk_alloc();
+	softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate;
 	softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
 			  periph->unit_number, softc->params.secsize,
 			  DEVSTAT_ALL_SUPPORTED,

Modified: head/sys/cam/scsi/scsi_all.h
==============================================================================
--- head/sys/cam/scsi/scsi_all.h	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/cam/scsi/scsi_all.h	Wed Oct 23 09:54:58 2013	(r256956)
@@ -1451,7 +1451,7 @@ struct scsi_vpd_block_characteristics
 	u_int8_t page_length[2];
 	u_int8_t medium_rotation_rate[2];
 #define SVPD_BDC_RATE_NOT_REPORTED	0x00
-#define SVPD_BDC_RATE_NONE_ROTATING	0x01
+#define SVPD_BDC_RATE_NON_ROTATING	0x01
 	u_int8_t reserved1;
 	u_int8_t nominal_form_factor;
 #define SVPD_BDC_FORM_NOT_REPORTED	0x00

Modified: head/sys/cam/scsi/scsi_da.c
==============================================================================
--- head/sys/cam/scsi/scsi_da.c	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/cam/scsi/scsi_da.c	Wed Oct 23 09:54:58 2013	(r256956)
@@ -3377,9 +3377,18 @@ dadone(struct cam_periph *periph, union 
 			 * Disable queue sorting for non-rotational media
 			 * by default.
 			 */
-			if (scsi_2btoul(bdc->medium_rotation_rate) ==
-			    SVPD_BDC_RATE_NONE_ROTATING)
+			u_int old_rate = softc->disk->d_rotation_rate;
+
+			softc->disk->d_rotation_rate =
+				scsi_2btoul(bdc->medium_rotation_rate);
+			if (softc->disk->d_rotation_rate ==
+			    SVPD_BDC_RATE_NON_ROTATING) {
 				softc->sort_io_queue = 0;
+			}
+			if (softc->disk->d_rotation_rate != old_rate) {
+				disk_attr_changed(softc->disk,
+				    "GEOM::rotation_rate", M_NOWAIT);
+			}
 		} else {
 			int error;
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
@@ -3414,6 +3423,8 @@ dadone(struct cam_periph *periph, union 
 		ptr = (uint16_t *)ata_params;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
+			uint16_t old_rate;
+
 			for (i = 0; i < sizeof(*ata_params) / 2; i++)
 				ptr[i] = le16toh(ptr[i]);
 			if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM) {
@@ -3428,8 +3439,18 @@ dadone(struct cam_periph *periph, union 
 			 * Disable queue sorting for non-rotational media
 			 * by default.
 			 */
-			if (ata_params->media_rotation_rate == 1)
+			old_rate = softc->disk->d_rotation_rate;
+			softc->disk->d_rotation_rate =
+			    ata_params->media_rotation_rate;
+			if (softc->disk->d_rotation_rate ==
+			    ATA_RATE_NON_ROTATING) {
 				softc->sort_io_queue = 0;
+			}
+
+			if (softc->disk->d_rotation_rate != old_rate) {
+				disk_attr_changed(softc->disk,
+				    "GEOM::rotation_rate", M_NOWAIT);
+			}
 		} else {
 			int error;
 			error = daerror(done_ccb, CAM_RETRY_SELTO,

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	Wed Oct 23 09:54:58 2013	(r256956)
@@ -120,6 +120,9 @@ extern void vdev_queue_init(vdev_t *vd);
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
+extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	Wed Oct 23 09:54:58 2013	(r256956)
@@ -106,6 +106,7 @@ struct vdev_queue {
 	avl_tree_t	vq_pending_tree;
 	hrtime_t	vq_io_complete_ts;
 	kmutex_t	vq_lock;
+	uint64_t	vq_lastoffset;
 };
 
 /*
@@ -217,7 +218,10 @@ struct vdev {
 	spa_aux_vdev_t	*vdev_aux;	/* for l2cache vdevs		*/
 	zio_t		*vdev_probe_zio; /* root of current probe	*/
 	vdev_aux_t	vdev_label_aux;	/* on-disk aux state		*/
-	struct trim_map	*vdev_trimmap;
+	struct trim_map	*vdev_trimmap;	/* map on outstanding trims	*/ 
+	uint16_t	vdev_rotation_rate; /* rotational rate of the media */
+#define	VDEV_RATE_UNKNOWN	0
+#define	VDEV_RATE_NON_ROTATING	1
 
 	/*
 	 * For DTrace to work in userland (libzpool) context, these fields must

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	Wed Oct 23 09:54:58 2013	(r256956)
@@ -42,9 +42,11 @@
  * Virtual device vector for GEOM.
  */
 
+static g_attrchanged_t vdev_geom_attrchanged;
 struct g_class zfs_vdev_class = {
 	.name = "ZFS::VDEV",
 	.version = G_VERSION,
+	.attrchanged = vdev_geom_attrchanged,
 };
 
 DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
@@ -62,6 +64,34 @@ SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_
     &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
 
 static void
+vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
+{ 
+	int error;
+	uint16_t rate;
+
+	error = g_getattr("GEOM::rotation_rate", cp, &rate);
+	if (error == 0)
+		vd->vdev_rotation_rate = rate;
+	else
+		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+	vdev_t *vd;
+
+	vd = cp->private;
+	if (vd == NULL)
+		return;
+
+	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
+		vdev_geom_set_rotation_rate(vd, cp);
+		return;
+	}
+}
+
+static void
 vdev_geom_orphan(struct g_consumer *cp)
 {
 	vdev_t *vd;
@@ -683,6 +713,11 @@ vdev_geom_open(vdev_t *vd, uint64_t *psi
 	vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
 	snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
 
+	/*
+	 * Determine the device's rotation rate.
+	 */
+	vdev_geom_set_rotation_rate(vd, cp);
+
 	return (0);
 }
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	Wed Oct 23 09:54:58 2013	(r256956)
@@ -41,27 +41,97 @@ typedef struct mirror_child {
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
 	int		mc_error;
+	int		mc_load;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
 } mirror_child_t;
 
 typedef struct mirror_map {
+	int		*mm_preferred;
+	int		mm_preferred_cnt;
 	int		mm_children;
-	int		mm_replacing;
-	int		mm_preferred;
-	int		mm_root;
-	mirror_child_t	mm_child[1];
+	boolean_t	mm_replacing;
+	boolean_t	mm_root;
+	mirror_child_t	mm_child[];
 } mirror_map_t;
 
-int vdev_mirror_shift = 21;
+static int vdev_mirror_shift = 21;
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+    "ZFS VDEV Mirror");
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results as it
+ * will direct more reads to the non-rotating vdevs which are more
+ * likely to have a higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int rotating_inc = 0;
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_inc", &rotating_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RW,
+    &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
+
+static int rotating_seek_inc = 5;
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_inc", &rotating_seek_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RW,
+    &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
+
+static int rotating_seek_offset = 1 * 1024 * 1024;
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_offset", &rotating_seek_offset);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RW,
+    &rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
+    "triggers a reduced rotating media seek increment");
+
+/* Non-rotating media load calculation configuration. */
+static int non_rotating_inc = 0;
+TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_inc", &non_rotating_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RW,
+    &non_rotating_inc, 0,
+    "Non-rotating media load increment for non-seeking I/O's");
+
+static int non_rotating_seek_inc = 1;
+TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_seek_inc",
+     &non_rotating_seek_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RW,
+    &non_rotating_seek_inc, 0,
+    "Non-rotating media load increment for seeking I/O's");
+
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+	return (offsetof(mirror_map_t, mm_child[children]) +
+	    sizeof(int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root)
+{
+	mirror_map_t *mm;
+
+	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+	mm->mm_children = children;
+	mm->mm_replacing = replacing;
+	mm->mm_root = root;
+	mm->mm_preferred = (int *)((uintptr_t)mm + 
+	    offsetof(mirror_map_t, mm_child[children]));
+
+	return mm;
+}
 
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
-	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
 }
 
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
@@ -69,55 +139,80 @@ static const zio_vsd_ops_t vdev_mirror_v
 	zio_vsd_default_cksum_report
 };
 
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+	uint64_t lastoffset;
+	int load;
+
+	/* All DVAs have equal weight at the root. */
+	if (mm->mm_root)
+		return (INT_MAX);
+
+	/*
+	 * We don't return INT_MAX if the device is resilvering i.e.
+	 * vdev_resilver_txg != 0 as when tested performance was slightly
+	 * worse overall when resilvering with compared to without.
+	 */
+
+	/* Standard load based on pending queue length. */
+	load = vdev_queue_length(vd);
+	lastoffset = vdev_queue_lastoffset(vd);
+
+	if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) {
+		/* Non-rotating media. */
+		if (lastoffset == zio_offset)
+			return (load + non_rotating_inc);
+
+		/*
+		 * Apply a seek penalty even for non-rotating devices as
+		 * sequential I/O'a can be aggregated into fewer operations
+		 * on the device, thus avoiding unnecessary per-command
+		 * overhead and boosting performance.
+		 */
+		return (load + non_rotating_seek_inc);
+	}
+
+	/* Rotating media I/O's which directly follow the last I/O. */
+	if (lastoffset == zio_offset)
+		return (load + rotating_inc);
+
+	/*
+	 * Apply half the seek increment to I/O's within seek offset
+	 * of the last I/O queued to this vdev as they should incure less
+	 * of a seek increment.
+	 */
+	if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
+		return (load + (rotating_seek_inc / 2));
+
+	/* Apply the full seek increment to all other I/O's. */
+	return (load + rotating_seek_inc);
+}
+
+
 static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
-	int c, d;
+	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
 
-		c = BP_GET_NDVAS(zio->io_bp);
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = B_FALSE;
-		mm->mm_preferred = spa_get_random(c);
-		mm->mm_root = B_TRUE;
-
-		/*
-		 * Check the other, lower-index DVAs to see if they're on
-		 * the same vdev as the child we picked.  If they are, use
-		 * them since they are likely to have been allocated from
-		 * the primary metaslab in use at the time, and hence are
-		 * more likely to have locality with single-copy data.
-		 */
-		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
-			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
-				mm->mm_preferred = d;
-		}
-
+		mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
+		    B_TRUE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
-
 			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
-		c = vd->vdev_children;
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = mm->mm_replacing ? 0 :
-		    (zio->io_offset >> vdev_mirror_shift) % c;
-		mm->mm_root = B_FALSE;
-
+		mm = vdev_mirror_map_alloc(vd->vdev_children,
+		    (vd->vdev_ops == &vdev_replacing_ops ||
+                    vd->vdev_ops == &vdev_spare_ops), B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
@@ -211,50 +306,121 @@ vdev_mirror_scrub_done(zio_t *zio)
 }
 
 /*
- * Try to find a child whose DTL doesn't contain the block we want to read.
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked.  If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int preferred)
+{
+	dva_t *dva = zio->io_bp->blk_dva;
+	mirror_map_t *mm = zio->io_vsd;
+	int c;
+
+	for (c = preferred - 1; c >= 0; c--) {
+		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+			preferred = c;
+	}
+	return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	int p;
+
+	if (mm->mm_root) {
+		p = spa_get_random(mm->mm_preferred_cnt);
+		return (vdev_mirror_dva_select(zio, mm->mm_preferred[p]));
+	}
+
+	/*
+	 * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSD's, we
+	 * use the I/O offset as a pseudo random seed into the vdevs
+	 * which have the lowest load.
+	 */
+	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+	return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read
+ * prefering vdevs based on determined load.
+ *
  * If we can't, try the read on any vdev we haven't already tried.
  */
 static int
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
-	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
-	int i, c;
+	int c, lowest_load;
 
 	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
-	/*
-	 * Try to find a child whose DTL doesn't contain the block to read.
-	 * If a child is known to be completely inaccessible (indicated by
-	 * vdev_readable() returning B_FALSE), don't even try.
-	 */
-	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
-		if (c >= mm->mm_children)
-			c = 0;
+	lowest_load = INT_MAX;
+	mm->mm_preferred_cnt = 0;
+	for (c = 0; c < mm->mm_children; c++) {
+		mirror_child_t *mc;
+
 		mc = &mm->mm_child[c];
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;
+
 		if (!vdev_readable(mc->mc_vd)) {
 			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
 			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
-			return (c);
-		mc->mc_error = SET_ERROR(ESTALE);
-		mc->mc_skipped = 1;
-		mc->mc_speculative = 1;
+
+		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+			mc->mc_error = SET_ERROR(ESTALE);
+			mc->mc_skipped = 1;
+			mc->mc_speculative = 1;
+			continue;
+		}
+
+		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+		if (mc->mc_load > lowest_load)
+			continue;
+
+		if (mc->mc_load < lowest_load) {
+			lowest_load = mc->mc_load;
+			mm->mm_preferred_cnt = 0;
+		}
+		mm->mm_preferred[mm->mm_preferred_cnt] = c;
+		mm->mm_preferred_cnt++;
+	}
+
+	if (mm->mm_preferred_cnt == 1) {
+		vdev_queue_register_lastoffset(
+		    mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
+		return (mm->mm_preferred[0]);
+	}
+
+	if (mm->mm_preferred_cnt > 1) {
+		int c = vdev_mirror_preferred_child_randomize(zio);
+
+		vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
+		return (c);
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
 	 */
-	for (c = 0; c < mm->mm_children; c++)
-		if (!mm->mm_child[c].mc_tried)
+	for (c = 0; c < mm->mm_children; c++) {
+		if (!mm->mm_child[c].mc_tried) {
+			vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
+			    zio);
 			return (c);
+		}
+	}
 
 	/*
 	 * Every child failed.  There's no place left to look.
@@ -269,7 +435,7 @@ vdev_mirror_io_start(zio_t *zio)
 	mirror_child_t *mc;
 	int c, children;
 
-	mm = vdev_mirror_map_alloc(zio);
+	mm = vdev_mirror_map_init(zio);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	Wed Oct 23 09:54:58 2013	(r256956)
@@ -155,6 +155,8 @@ vdev_queue_init(vdev_t *vd)
 
 	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+
+	vq->vq_lastoffset = 0;
 }
 
 void
@@ -446,3 +448,26 @@ vdev_queue_io_done(zio_t *zio)
 
 	mutex_exit(&vq->vq_lock);
 }
+
+/*
+ * As these three methods are only used for load calculations we're not concerned
+ * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex
+ * use here, instead we prefer to keep it lock free for performance.
+ */ 
+int
+vdev_queue_length(vdev_t *vd)
+{
+	return (avl_numnodes(&vd->vdev_queue.vq_pending_tree));
+}
+
+uint64_t
+vdev_queue_lastoffset(vdev_t *vd)
+{
+	return (vd->vdev_queue.vq_lastoffset);
+}
+
+void
+vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
+{
+	vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
+}

Modified: head/sys/geom/geom.h
==============================================================================
--- head/sys/geom/geom.h	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/geom/geom.h	Wed Oct 23 09:54:58 2013	(r256956)
@@ -274,6 +274,7 @@ int g_handleattr(struct bio *bp, const c
     int len);
 int g_handleattr_int(struct bio *bp, const char *attribute, int val);
 int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val);
+int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val);
 int g_handleattr_str(struct bio *bp, const char *attribute, const char *str);
 struct g_consumer * g_new_consumer(struct g_geom *gp);
 struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...)

Modified: head/sys/geom/geom_disk.c
==============================================================================
--- head/sys/geom/geom_disk.c	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/geom/geom_disk.c	Wed Oct 23 09:54:58 2013	(r256956)
@@ -387,22 +387,25 @@ g_disk_start(struct bio *bp)
 			break;
 		else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident))
 			break;
-		else if (g_handleattr(bp, "GEOM::hba_vendor",
-		    &dp->d_hba_vendor, 2))
+		else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor",
+		    dp->d_hba_vendor))
 			break;
-		else if (g_handleattr(bp, "GEOM::hba_device",
-		    &dp->d_hba_device, 2))
+		else if (g_handleattr_uint16_t(bp, "GEOM::hba_device",
+		    dp->d_hba_device))
 			break;
-		else if (g_handleattr(bp, "GEOM::hba_subvendor",
-		    &dp->d_hba_subvendor, 2))
+		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor",
+		    dp->d_hba_subvendor))
 			break;
-		else if (g_handleattr(bp, "GEOM::hba_subdevice",
-		    &dp->d_hba_subdevice, 2))
+		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice",
+		    dp->d_hba_subdevice))
 			break;
 		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
 			g_disk_kerneldump(bp, dp);
 		else if (!strcmp(bp->bio_attribute, "GEOM::setstate"))
 			g_disk_setstate(bp, sc);
+		else if (g_handleattr_uint16_t(bp, "GEOM::rotation_rate",
+		    dp->d_rotation_rate))
+			break;
 		else 
 			error = ENOIOCTL;
 		break;

Modified: head/sys/geom/geom_disk.h
==============================================================================
--- head/sys/geom/geom_disk.h	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/geom/geom_disk.h	Wed Oct 23 09:54:58 2013	(r256956)
@@ -97,6 +97,7 @@ struct disk {
 	uint16_t		d_hba_device;
 	uint16_t		d_hba_subvendor;
 	uint16_t		d_hba_subdevice;
+	uint16_t		d_rotation_rate;
 
 	/* Fields private to the driver */
 	void			*d_drv1;
@@ -121,7 +122,8 @@ int disk_resize(struct disk *dp, int fla
 #define DISK_VERSION_01		0x5856105a
 #define DISK_VERSION_02		0x5856105b
 #define DISK_VERSION_03		0x5856105c
-#define DISK_VERSION		DISK_VERSION_03
+#define DISK_VERSION_04		0x5856105d
+#define DISK_VERSION		DISK_VERSION_04
 
 #endif /* _KERNEL */
 #endif /* _GEOM_GEOM_DISK_H_ */

Modified: head/sys/geom/geom_subr.c
==============================================================================
--- head/sys/geom/geom_subr.c	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/geom/geom_subr.c	Wed Oct 23 09:54:58 2013	(r256956)
@@ -951,6 +951,13 @@ g_handleattr_int(struct bio *bp, const c
 }
 
 int
+g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val)
+{
+
+	return (g_handleattr(bp, attribute, &val, sizeof val));
+}
+
+int
 g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val)
 {
 

Modified: head/sys/sys/ata.h
==============================================================================
--- head/sys/sys/ata.h	Wed Oct 23 09:53:37 2013	(r256955)
+++ head/sys/sys/ata.h	Wed Oct 23 09:54:58 2013	(r256956)
@@ -259,6 +259,8 @@ struct ata_params {
 /*215*/ u_int16_t       nv_cache_size_1;
 	u_int16_t       nv_cache_size_2;
 /*217*/ u_int16_t       media_rotation_rate;
+#define ATA_RATE_NOT_REPORTED		0x0000
+#define ATA_RATE_NON_ROTATING		0x0001
 	u_int16_t       reserved218;
 /*219*/ u_int16_t       nv_cache_opt;
 /*220*/ u_int16_t       wrv_mode;


More information about the svn-src-head mailing list