svn commit: r271238 - in stable/10/sys: cam/ata cam/scsi cddl/contrib/opensolaris/uts/common/fs/zfs cddl/contrib/opensolaris/uts/common/fs/zfs/sys geom sys

Steven Hartland smh at FreeBSD.org
Sun Sep 7 21:30:50 UTC 2014


Author: smh
Date: Sun Sep  7 21:30:47 2014
New Revision: 271238
URL: http://svnweb.freebsd.org/changeset/base/271238

Log:
  MFC r256956:
  Improve ZFS N-way mirror read performance by using load and locality
  information.
  
  MFC r260713:
  Fix ZFS mirror code for handling multiple DVA's
  
  Also make the addition of the d_rotation_rate binary compatible. This allows
  storage drivers compiled for 10.0 to work by preserving the ABI for disks.
  
  Approved by:	re (gjb)
  Sponsored by:	Multiplay

Modified:
  stable/10/sys/cam/ata/ata_da.c
  stable/10/sys/cam/scsi/scsi_all.h
  stable/10/sys/cam/scsi/scsi_da.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  stable/10/sys/geom/geom.h
  stable/10/sys/geom/geom_disk.c
  stable/10/sys/geom/geom_disk.h
  stable/10/sys/geom/geom_subr.c
  stable/10/sys/sys/ata.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/cam/ata/ata_da.c
==============================================================================
--- stable/10/sys/cam/ata/ata_da.c	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/cam/ata/ata_da.c	Sun Sep  7 21:30:47 2014	(r271238)
@@ -1229,12 +1229,13 @@ adaregister(struct cam_periph *periph, v
 	    "kern.cam.ada.%d.write_cache", periph->unit_number);
 	TUNABLE_INT_FETCH(announce_buf, &softc->write_cache);
 	/* Disable queue sorting for non-rotational media by default. */
-	if (cgd->ident_data.media_rotation_rate == 1)
+	if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING)
 		softc->sort_io_queue = 0;
 	else
 		softc->sort_io_queue = -1;
 	adagetparams(periph, cgd);
 	softc->disk = disk_alloc();
+	softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate;
 	softc->disk->d_devstat = devstat_new_entry(periph->periph_name,
 			  periph->unit_number, softc->params.secsize,
 			  DEVSTAT_ALL_SUPPORTED,

Modified: stable/10/sys/cam/scsi/scsi_all.h
==============================================================================
--- stable/10/sys/cam/scsi/scsi_all.h	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/cam/scsi/scsi_all.h	Sun Sep  7 21:30:47 2014	(r271238)
@@ -2302,7 +2302,7 @@ struct scsi_vpd_block_characteristics
 	u_int8_t page_length[2];
 	u_int8_t medium_rotation_rate[2];
 #define SVPD_BDC_RATE_NOT_REPORTED	0x00
-#define SVPD_BDC_RATE_NONE_ROTATING	0x01
+#define SVPD_BDC_RATE_NON_ROTATING	0x01
 	u_int8_t reserved1;
 	u_int8_t nominal_form_factor;
 #define SVPD_BDC_FORM_NOT_REPORTED	0x00

Modified: stable/10/sys/cam/scsi/scsi_da.c
==============================================================================
--- stable/10/sys/cam/scsi/scsi_da.c	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/cam/scsi/scsi_da.c	Sun Sep  7 21:30:47 2014	(r271238)
@@ -3390,9 +3390,18 @@ dadone(struct cam_periph *periph, union 
 			 * Disable queue sorting for non-rotational media
 			 * by default.
 			 */
-			if (scsi_2btoul(bdc->medium_rotation_rate) ==
-			    SVPD_BDC_RATE_NONE_ROTATING)
+			u_int old_rate = softc->disk->d_rotation_rate;
+
+			softc->disk->d_rotation_rate =
+				scsi_2btoul(bdc->medium_rotation_rate);
+			if (softc->disk->d_rotation_rate ==
+			    SVPD_BDC_RATE_NON_ROTATING) {
 				softc->sort_io_queue = 0;
+			}
+			if (softc->disk->d_rotation_rate != old_rate) {
+				disk_attr_changed(softc->disk,
+				    "GEOM::rotation_rate", M_NOWAIT);
+			}
 		} else {
 			int error;
 			error = daerror(done_ccb, CAM_RETRY_SELTO,
@@ -3427,6 +3436,8 @@ dadone(struct cam_periph *periph, union 
 		ptr = (uint16_t *)ata_params;
 
 		if ((csio->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
+			uint16_t old_rate;
+
 			for (i = 0; i < sizeof(*ata_params) / 2; i++)
 				ptr[i] = le16toh(ptr[i]);
 			if (ata_params->support_dsm & ATA_SUPPORT_DSM_TRIM &&
@@ -3442,8 +3453,18 @@ dadone(struct cam_periph *periph, union 
 			 * Disable queue sorting for non-rotational media
 			 * by default.
 			 */
-			if (ata_params->media_rotation_rate == 1)
+			old_rate = softc->disk->d_rotation_rate;
+			softc->disk->d_rotation_rate =
+			    ata_params->media_rotation_rate;
+			if (softc->disk->d_rotation_rate ==
+			    ATA_RATE_NON_ROTATING) {
 				softc->sort_io_queue = 0;
+			}
+
+			if (softc->disk->d_rotation_rate != old_rate) {
+				disk_attr_changed(softc->disk,
+				    "GEOM::rotation_rate", M_NOWAIT);
+			}
 		} else {
 			int error;
 			error = daerror(done_ccb, CAM_RETRY_SELTO,

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	Sun Sep  7 21:30:47 2014	(r271238)
@@ -120,6 +120,9 @@ extern void vdev_queue_init(vdev_t *vd);
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
+extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	Sun Sep  7 21:30:47 2014	(r271238)
@@ -116,6 +116,7 @@ struct vdev_queue {
 	uint64_t	vq_last_offset;
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	kmutex_t	vq_lock;
+	uint64_t	vq_lastoffset;
 };
 
 /*
@@ -227,7 +228,10 @@ struct vdev {
 	spa_aux_vdev_t	*vdev_aux;	/* for l2cache vdevs		*/
 	zio_t		*vdev_probe_zio; /* root of current probe	*/
 	vdev_aux_t	vdev_label_aux;	/* on-disk aux state		*/
-	struct trim_map	*vdev_trimmap;
+	struct trim_map	*vdev_trimmap;	/* map on outstanding trims	*/ 
+	uint16_t	vdev_rotation_rate; /* rotational rate of the media */
+#define	VDEV_RATE_UNKNOWN	0
+#define	VDEV_RATE_NON_ROTATING	1
 
 	/*
 	 * For DTrace to work in userland (libzpool) context, these fields must

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	Sun Sep  7 21:30:47 2014	(r271238)
@@ -42,9 +42,11 @@
  * Virtual device vector for GEOM.
  */
 
+static g_attrchanged_t vdev_geom_attrchanged;
 struct g_class zfs_vdev_class = {
 	.name = "ZFS::VDEV",
 	.version = G_VERSION,
+	.attrchanged = vdev_geom_attrchanged,
 };
 
 DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
@@ -62,6 +64,34 @@ SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_
     &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
 
 static void
+vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
+{ 
+	int error;
+	uint16_t rate;
+
+	error = g_getattr("GEOM::rotation_rate", cp, &rate);
+	if (error == 0)
+		vd->vdev_rotation_rate = rate;
+	else
+		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+	vdev_t *vd;
+
+	vd = cp->private;
+	if (vd == NULL)
+		return;
+
+	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
+		vdev_geom_set_rotation_rate(vd, cp);
+		return;
+	}
+}
+
+static void
 vdev_geom_orphan(struct g_consumer *cp)
 {
 	vdev_t *vd;
@@ -689,6 +719,11 @@ vdev_geom_open(vdev_t *vd, uint64_t *psi
 	vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
 	snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
 
+	/*
+	 * Determine the device's rotation rate.
+	 */
+	vdev_geom_set_rotation_rate(vd, cp);
+
 	return (0);
 }
 

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	Sun Sep  7 21:30:47 2014	(r271238)
@@ -41,27 +41,97 @@ typedef struct mirror_child {
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
 	int		mc_error;
+	int		mc_load;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
 } mirror_child_t;
 
 typedef struct mirror_map {
+	int		*mm_preferred;
+	int		mm_preferred_cnt;
 	int		mm_children;
-	int		mm_replacing;
-	int		mm_preferred;
-	int		mm_root;
-	mirror_child_t	mm_child[1];
+	boolean_t	mm_replacing;
+	boolean_t	mm_root;
+	mirror_child_t	mm_child[];
 } mirror_map_t;
 
-int vdev_mirror_shift = 21;
+static int vdev_mirror_shift = 21;
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+    "ZFS VDEV Mirror");
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results as it
+ * will direct more reads to the non-rotating vdevs which are more
+ * likely to have a higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int rotating_inc = 0;
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_inc", &rotating_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RW,
+    &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
+
+static int rotating_seek_inc = 5;
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_inc", &rotating_seek_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RW,
+    &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
+
+static int rotating_seek_offset = 1 * 1024 * 1024;
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_offset", &rotating_seek_offset);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RW,
+    &rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
+    "triggers a reduced rotating media seek increment");
+
+/* Non-rotating media load calculation configuration. */
+static int non_rotating_inc = 0;
+TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_inc", &non_rotating_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RW,
+    &non_rotating_inc, 0,
+    "Non-rotating media load increment for non-seeking I/O's");
+
+static int non_rotating_seek_inc = 1;
+TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_seek_inc",
+     &non_rotating_seek_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RW,
+    &non_rotating_seek_inc, 0,
+    "Non-rotating media load increment for seeking I/O's");
+
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+	return (offsetof(mirror_map_t, mm_child[children]) +
+	    sizeof(int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root)
+{
+	mirror_map_t *mm;
+
+	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+	mm->mm_children = children;
+	mm->mm_replacing = replacing;
+	mm->mm_root = root;
+	mm->mm_preferred = (int *)((uintptr_t)mm + 
+	    offsetof(mirror_map_t, mm_child[children]));
+
+	return mm;
+}
 
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
-	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
 }
 
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
@@ -69,55 +139,80 @@ static const zio_vsd_ops_t vdev_mirror_v
 	zio_vsd_default_cksum_report
 };
 
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+	uint64_t lastoffset;
+	int load;
+
+	/* All DVAs have equal weight at the root. */
+	if (mm->mm_root)
+		return (INT_MAX);
+
+	/*
+	 * We don't return INT_MAX if the device is resilvering i.e.
+	 * vdev_resilver_txg != 0 as when tested performance was slightly
+	 * worse overall when resilvering with compared to without.
+	 */
+
+	/* Standard load based on pending queue length. */
+	load = vdev_queue_length(vd);
+	lastoffset = vdev_queue_lastoffset(vd);
+
+	if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) {
+		/* Non-rotating media. */
+		if (lastoffset == zio_offset)
+			return (load + non_rotating_inc);
+
+		/*
+		 * Apply a seek penalty even for non-rotating devices as
+		 * sequential I/O'a can be aggregated into fewer operations
+		 * on the device, thus avoiding unnecessary per-command
+		 * overhead and boosting performance.
+		 */
+		return (load + non_rotating_seek_inc);
+	}
+
+	/* Rotating media I/O's which directly follow the last I/O. */
+	if (lastoffset == zio_offset)
+		return (load + rotating_inc);
+
+	/*
+	 * Apply half the seek increment to I/O's within seek offset
+	 * of the last I/O queued to this vdev as they should incure less
+	 * of a seek increment.
+	 */
+	if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
+		return (load + (rotating_seek_inc / 2));
+
+	/* Apply the full seek increment to all other I/O's. */
+	return (load + rotating_seek_inc);
+}
+
+
 static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
-	int c, d;
+	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
 
-		c = BP_GET_NDVAS(zio->io_bp);
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = B_FALSE;
-		mm->mm_preferred = spa_get_random(c);
-		mm->mm_root = B_TRUE;
-
-		/*
-		 * Check the other, lower-index DVAs to see if they're on
-		 * the same vdev as the child we picked.  If they are, use
-		 * them since they are likely to have been allocated from
-		 * the primary metaslab in use at the time, and hence are
-		 * more likely to have locality with single-copy data.
-		 */
-		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
-			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
-				mm->mm_preferred = d;
-		}
-
+		mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
+		    B_TRUE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
-
 			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
-		c = vd->vdev_children;
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = mm->mm_replacing ? 0 :
-		    (zio->io_offset >> vdev_mirror_shift) % c;
-		mm->mm_root = B_FALSE;
-
+		mm = vdev_mirror_map_alloc(vd->vdev_children,
+		    (vd->vdev_ops == &vdev_replacing_ops ||
+                    vd->vdev_ops == &vdev_spare_ops), B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
@@ -211,50 +306,124 @@ vdev_mirror_scrub_done(zio_t *zio)
 }
 
 /*
- * Try to find a child whose DTL doesn't contain the block we want to read.
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked.  If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+	dva_t *dva = zio->io_bp->blk_dva;
+	mirror_map_t *mm = zio->io_vsd;
+	int preferred;
+	int c;
+
+	preferred = mm->mm_preferred[p];
+	for (p-- ; p >= 0; p--) {
+		c = mm->mm_preferred[p];
+		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+			preferred = c;
+	}
+	return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	int p;
+
+	if (mm->mm_root) {
+		p = spa_get_random(mm->mm_preferred_cnt);
+		return (vdev_mirror_dva_select(zio, p));
+	}
+
+	/*
+	 * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSD's, we
+	 * use the I/O offset as a pseudo random seed into the vdevs
+	 * which have the lowest load.
+	 */
+	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+	return (mm->mm_preferred[p]);
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read
+ * prefering vdevs based on determined load.
+ *
  * If we can't, try the read on any vdev we haven't already tried.
  */
 static int
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
-	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
-	int i, c;
+	int c, lowest_load;
 
 	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
-	/*
-	 * Try to find a child whose DTL doesn't contain the block to read.
-	 * If a child is known to be completely inaccessible (indicated by
-	 * vdev_readable() returning B_FALSE), don't even try.
-	 */
-	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
-		if (c >= mm->mm_children)
-			c = 0;
+	lowest_load = INT_MAX;
+	mm->mm_preferred_cnt = 0;
+	for (c = 0; c < mm->mm_children; c++) {
+		mirror_child_t *mc;
+
 		mc = &mm->mm_child[c];
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;
+
 		if (!vdev_readable(mc->mc_vd)) {
 			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
 			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
-			return (c);
-		mc->mc_error = SET_ERROR(ESTALE);
-		mc->mc_skipped = 1;
-		mc->mc_speculative = 1;
+
+		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+			mc->mc_error = SET_ERROR(ESTALE);
+			mc->mc_skipped = 1;
+			mc->mc_speculative = 1;
+			continue;
+		}
+
+		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+		if (mc->mc_load > lowest_load)
+			continue;
+
+		if (mc->mc_load < lowest_load) {
+			lowest_load = mc->mc_load;
+			mm->mm_preferred_cnt = 0;
+		}
+		mm->mm_preferred[mm->mm_preferred_cnt] = c;
+		mm->mm_preferred_cnt++;
+	}
+
+	if (mm->mm_preferred_cnt == 1) {
+		vdev_queue_register_lastoffset(
+		    mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
+		return (mm->mm_preferred[0]);
+	}
+
+	if (mm->mm_preferred_cnt > 1) {
+		int c = vdev_mirror_preferred_child_randomize(zio);
+
+		vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
+		return (c);
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
 	 */
-	for (c = 0; c < mm->mm_children; c++)
-		if (!mm->mm_child[c].mc_tried)
+	for (c = 0; c < mm->mm_children; c++) {
+		if (!mm->mm_child[c].mc_tried) {
+			vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
+			    zio);
 			return (c);
+		}
+	}
 
 	/*
 	 * Every child failed.  There's no place left to look.
@@ -269,7 +438,7 @@ vdev_mirror_io_start(zio_t *zio)
 	mirror_child_t *mc;
 	int c, children;
 
-	mm = vdev_mirror_map_alloc(zio);
+	mm = vdev_mirror_map_init(zio);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	Sun Sep  7 21:30:47 2014	(r271238)
@@ -289,6 +289,8 @@ vdev_queue_init(vdev_t *vd)
 		    vdev_queue_offset_compare,
 		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	}
+
+	vq->vq_lastoffset = 0;
 }
 
 void
@@ -815,3 +817,26 @@ vdev_queue_io_done(zio_t *zio)
 
 	mutex_exit(&vq->vq_lock);
 }
+
+/*
+ * As these three methods are only used for load calculations we're not concerned
+ * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex
+ * use here, instead we prefer to keep it lock free for performance.
+ */ 
+int
+vdev_queue_length(vdev_t *vd)
+{
+	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_lastoffset(vdev_t *vd)
+{
+	return (vd->vdev_queue.vq_lastoffset);
+}
+
+void
+vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
+{
+	vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
+}

Modified: stable/10/sys/geom/geom.h
==============================================================================
--- stable/10/sys/geom/geom.h	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/geom/geom.h	Sun Sep  7 21:30:47 2014	(r271238)
@@ -274,6 +274,7 @@ int g_handleattr(struct bio *bp, const c
     int len);
 int g_handleattr_int(struct bio *bp, const char *attribute, int val);
 int g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val);
+int g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val);
 int g_handleattr_str(struct bio *bp, const char *attribute, const char *str);
 struct g_consumer * g_new_consumer(struct g_geom *gp);
 struct g_geom * g_new_geomf(struct g_class *mp, const char *fmt, ...)

Modified: stable/10/sys/geom/geom_disk.c
==============================================================================
--- stable/10/sys/geom/geom_disk.c	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/geom/geom_disk.c	Sun Sep  7 21:30:47 2014	(r271238)
@@ -412,23 +412,32 @@ g_disk_start(struct bio *bp)
 			break;
 		else if (g_handleattr_str(bp, "GEOM::ident", dp->d_ident))
 			break;
-		else if (g_handleattr(bp, "GEOM::hba_vendor",
-		    &dp->d_hba_vendor, 2))
+		else if (g_handleattr_uint16_t(bp, "GEOM::hba_vendor",
+		    dp->d_hba_vendor))
 			break;
-		else if (g_handleattr(bp, "GEOM::hba_device",
-		    &dp->d_hba_device, 2))
+		else if (g_handleattr_uint16_t(bp, "GEOM::hba_device",
+		    dp->d_hba_device))
 			break;
-		else if (g_handleattr(bp, "GEOM::hba_subvendor",
-		    &dp->d_hba_subvendor, 2))
+		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subvendor",
+		    dp->d_hba_subvendor))
 			break;
-		else if (g_handleattr(bp, "GEOM::hba_subdevice",
-		    &dp->d_hba_subdevice, 2))
+		else if (g_handleattr_uint16_t(bp, "GEOM::hba_subdevice",
+		    dp->d_hba_subdevice))
 			break;
 		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
 			g_disk_kerneldump(bp, dp);
 		else if (!strcmp(bp->bio_attribute, "GEOM::setstate"))
 			g_disk_setstate(bp, sc);
-		else 
+		else if (!strcmp(bp->bio_attribute, "GEOM::rotation_rate")) {
+			uint64_t v;
+
+			if ((dp->d_flags & DISKFLAG_LACKS_ROTRATE) == 0)
+				v = dp->d_rotation_rate;
+			else
+				v = 0; /* rate unknown */
+			g_handleattr_uint16_t(bp, "GEOM::rotation_rate", v);
+			break;
+		} else 
 			error = ENOIOCTL;
 		break;
 	case BIO_FLUSH:
@@ -694,6 +703,8 @@ disk_create(struct disk *dp, int version
 		    dp->d_name, dp->d_unit);
 		return;
 	}
+	if (version < DISK_VERSION_04)
+		dp->d_flags |= DISKFLAG_LACKS_ROTRATE;
 	KASSERT(dp->d_strategy != NULL, ("disk_create need d_strategy"));
 	KASSERT(dp->d_name != NULL, ("disk_create need d_name"));
 	KASSERT(*dp->d_name != 0, ("disk_create need d_name"));

Modified: stable/10/sys/geom/geom_disk.h
==============================================================================
--- stable/10/sys/geom/geom_disk.h	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/geom/geom_disk.h	Sun Sep  7 21:30:47 2014	(r271238)
@@ -100,6 +100,9 @@ struct disk {
 
 	/* Fields private to the driver */
 	void			*d_drv1;
+
+	/* New field - don't use if DISKFLAG_LACKS_ROTRATE is set */
+	uint16_t		d_rotation_rate;
 };
 
 #define DISKFLAG_NEEDSGIANT	0x1
@@ -108,6 +111,7 @@ struct disk {
 #define DISKFLAG_CANFLUSHCACHE	0x8
 #define	DISKFLAG_UNMAPPED_BIO	0x10
 #define	DISKFLAG_DIRECT_COMPLETION	0x20
+#define DISKFLAG_LACKS_ROTRATE	0x40
 
 struct disk *disk_alloc(void);
 void disk_create(struct disk *disk, int version);
@@ -122,7 +126,8 @@ int disk_resize(struct disk *dp, int fla
 #define DISK_VERSION_01		0x5856105a
 #define DISK_VERSION_02		0x5856105b
 #define DISK_VERSION_03		0x5856105c
-#define DISK_VERSION		DISK_VERSION_03
+#define DISK_VERSION_04		0x5856105d
+#define DISK_VERSION		DISK_VERSION_04
 
 #endif /* _KERNEL */
 #endif /* _GEOM_GEOM_DISK_H_ */

Modified: stable/10/sys/geom/geom_subr.c
==============================================================================
--- stable/10/sys/geom/geom_subr.c	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/geom/geom_subr.c	Sun Sep  7 21:30:47 2014	(r271238)
@@ -952,6 +952,13 @@ g_handleattr_int(struct bio *bp, const c
 }
 
 int
+g_handleattr_uint16_t(struct bio *bp, const char *attribute, uint16_t val)
+{
+
+	return (g_handleattr(bp, attribute, &val, sizeof val));
+}
+
+int
 g_handleattr_off_t(struct bio *bp, const char *attribute, off_t val)
 {
 

Modified: stable/10/sys/sys/ata.h
==============================================================================
--- stable/10/sys/sys/ata.h	Sun Sep  7 20:27:48 2014	(r271237)
+++ stable/10/sys/sys/ata.h	Sun Sep  7 21:30:47 2014	(r271238)
@@ -262,6 +262,8 @@ struct ata_params {
 /*215*/ u_int16_t       nv_cache_size_1;
 	u_int16_t       nv_cache_size_2;
 /*217*/ u_int16_t       media_rotation_rate;
+#define ATA_RATE_NOT_REPORTED		0x0000
+#define ATA_RATE_NON_ROTATING		0x0001
 	u_int16_t       reserved218;
 /*219*/ u_int16_t       nv_cache_opt;
 /*220*/ u_int16_t       wrv_mode;


More information about the svn-src-all mailing list