svn commit: r270312 - in stable/10/sys/cddl: compat/opensolaris/sys contrib/opensolaris/uts/common/fs/zfs contrib/opensolaris/uts/common/fs/zfs/sys

Steven Hartland smh at FreeBSD.org
Thu Aug 21 22:44:11 UTC 2014


Author: smh
Date: Thu Aug 21 22:44:08 2014
New Revision: 270312
URL: http://svnweb.freebsd.org/changeset/base/270312

Log:
  MFC r265152 - Reintroduce priority for the TRIM ZIOs instead of using the "NOW" priority
  MFC r265321 - Fix double fault panic when returning EOPNOTSUPP
  MFC r269407 - Don't return ZIO_PIPELINE_CONTINUE from vdev_op_io_start methods
  
  Sponsored by:	Multiplay

Modified:
  stable/10/sys/cddl/compat/opensolaris/sys/dkio.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/cddl/compat/opensolaris/sys/dkio.h
==============================================================================
--- stable/10/sys/cddl/compat/opensolaris/sys/dkio.h	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/compat/opensolaris/sys/dkio.h	Thu Aug 21 22:44:08 2014	(r270312)
@@ -75,8 +75,6 @@ extern "C" {
  */
 #define	DKIOCFLUSHWRITECACHE	(DKIOC|34)	/* flush cache to phys medium */
 
-#define	DKIOCTRIM		(DKIOC|35)	/* TRIM a block */
-
 struct dk_callback {
 	void (*dkc_callback)(void *dkc_cookie, int error);
 	void *dkc_cookie;

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Thu Aug 21 22:44:08 2014	(r270312)
@@ -146,9 +146,10 @@ typedef enum zio_priority {
 	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
 	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
 	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
+	ZIO_PRIORITY_TRIM,		/* free requests used for TRIM */
 	ZIO_PRIORITY_NUM_QUEUEABLE,
 
-	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
+	ZIO_PRIORITY_NOW		/* non-queued I/Os (e.g. ioctl) */
 } zio_priority_t;
 
 #define	ZIO_PIPELINE_CONTINUE		0x100
@@ -361,7 +362,7 @@ typedef struct zio_transform {
 	struct zio_transform	*zt_next;
 } zio_transform_t;
 
-typedef int zio_pipe_stage_t(zio_t **ziop);
+typedef int zio_pipe_stage_t(zio_t *zio);
 
 /*
  * The io_reexecute flags are distinct from io_flags because the child must
@@ -520,7 +521,7 @@ extern zio_t *zio_claim(zio_t *pio, spa_
 
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
     uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
-    enum zio_flag flags);
+    zio_priority_t priority, enum zio_flag flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	Thu Aug 21 22:44:08 2014	(r270312)
@@ -215,6 +215,10 @@ enum zio_stage {
 	ZIO_STAGE_FREE_BP_INIT |		\
 	ZIO_STAGE_DVA_FREE)
 
+#define	ZIO_FREE_PHYS_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_VDEV_IO_STAGES)
+
 #define	ZIO_DDT_FREE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_STAGE_FREE_BP_INIT |		\

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -449,7 +449,7 @@ trim_map_vdev_commit(spa_t *spa, zio_t *
 {
 	trim_map_t *tm = vd->vdev_trimmap;
 	trim_seg_t *ts;
-	uint64_t size, txgtarget, txgsafe;
+	uint64_t size, offset, txgtarget, txgsafe;
 	hrtime_t timelimit;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -477,9 +477,20 @@ trim_map_vdev_commit(spa_t *spa, zio_t *
 		avl_remove(&tm->tm_queued_frees, ts);
 		avl_add(&tm->tm_inflight_frees, ts);
 		size = ts->ts_end - ts->ts_start;
-		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
+		offset = ts->ts_start;
 		TRIM_MAP_SDEC(tm, size);
 		TRIM_MAP_QDEC(tm);
+		/*
+		 * We drop the lock while we call zio_nowait as the IO
+		 * scheduler can result in a different IO being run e.g.
+		 * a write which would result in a recursive lock.
+		 */
+		mutex_exit(&tm->tm_lock);
+
+		zio_nowait(zio_trim(zio, spa, vd, offset, size));
+
+		mutex_enter(&tm->tm_lock);
+		ts = trim_map_first(tm, txgtarget, txgsafe, timelimit);
 	}
 	mutex_exit(&tm->tm_lock);
 }

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -684,7 +684,7 @@ vdev_disk_io_intr(buf_t *bp)
 	 * Rather than teach the rest of the stack about other error
 	 * possibilities (EFAULT, etc), we normalize the error value here.
 	 */
-	zio->io_error = (geterror(bp) != 0 ? EIO : 0);
+	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
 
 	if (zio->io_error == 0 && bp->b_resid != 0)
 		zio->io_error = SET_ERROR(EIO);
@@ -730,15 +730,17 @@ vdev_disk_io_start(zio_t *zio)
 	 * Nothing to be done here but return failure.
 	 */
 	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
-		zio->io_error = ENXIO;
-		return (ZIO_PIPELINE_CONTINUE);
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 
 	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
-			return (ZIO_PIPELINE_CONTINUE);
+			zio_interrupt(zio);
+			return (ZIO_PIPELINE_STOP);
 		}
 
 		switch (zio->io_cmd) {
@@ -790,7 +792,8 @@ vdev_disk_io_start(zio_t *zio)
 			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 
 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -164,7 +164,8 @@ vdev_file_io_start(zio_t *zio)
 
 	if (!vdev_readable(vd)) {
 		zio->io_error = SET_ERROR(ENXIO);
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 
 	vf = vd->vdev_tsd;
@@ -180,7 +181,8 @@ vdev_file_io_start(zio_t *zio)
 			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 
 	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -716,7 +716,7 @@ vdev_geom_io_intr(struct bio *bp)
 	vd = zio->io_vd;
 	zio->io_error = bp->bio_error;
 	if (zio->io_error == 0 && bp->bio_resid != 0)
-		zio->io_error = EIO;
+		zio->io_error = SET_ERROR(EIO);
 
 	switch(zio->io_error) {
 	case ENOTSUP:
@@ -765,41 +765,43 @@ vdev_geom_io_start(zio_t *zio)
 
 	vd = zio->io_vd;
 
-	if (zio->io_type == ZIO_TYPE_IOCTL) {
+	switch (zio->io_type) {
+	case ZIO_TYPE_IOCTL:
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
-			zio->io_error = ENXIO;
-			return (ZIO_PIPELINE_CONTINUE);
+			zio->io_error = SET_ERROR(ENXIO);
+		} else {
+			switch (zio->io_cmd) {
+			case DKIOCFLUSHWRITECACHE:
+				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
+					break;
+				if (vd->vdev_nowritecache) {
+					zio->io_error = SET_ERROR(ENOTSUP);
+					break;
+				}
+				goto sendreq;
+			default:
+				zio->io_error = SET_ERROR(ENOTSUP);
+			}
 		}
 
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-			if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
-				break;
-			if (vd->vdev_nowritecache) {
-				zio->io_error = ENOTSUP;
-				break;
-			}
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
+	case ZIO_TYPE_FREE:
+		if (vd->vdev_notrim) {
+			zio->io_error = SET_ERROR(ENOTSUP);
+		} else if (!vdev_geom_bio_delete_disable) {
 			goto sendreq;
-		case DKIOCTRIM:
-			if (vdev_geom_bio_delete_disable)
-				break;
-			if (vd->vdev_notrim) {
-				zio->io_error = ENOTSUP;
-				break;
-			}
-			goto sendreq;
-		default:
-			zio->io_error = ENOTSUP;
 		}
-
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 sendreq:
 	cp = vd->vdev_tsd;
 	if (cp == NULL) {
-		zio->io_error = ENXIO;
-		return (ZIO_PIPELINE_CONTINUE);
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 	bp = g_alloc_bio();
 	bp->bio_caller1 = zio;
@@ -811,22 +813,18 @@ sendreq:
 		bp->bio_offset = zio->io_offset;
 		bp->bio_length = zio->io_size;
 		break;
+	case ZIO_TYPE_FREE:
+		bp->bio_cmd = BIO_DELETE;
+		bp->bio_data = NULL;
+		bp->bio_offset = zio->io_offset;
+		bp->bio_length = zio->io_size;
+		break;
 	case ZIO_TYPE_IOCTL:
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-			bp->bio_cmd = BIO_FLUSH;
-			bp->bio_flags |= BIO_ORDERED;
-			bp->bio_data = NULL;
-			bp->bio_offset = cp->provider->mediasize;
-			bp->bio_length = 0;
-			break;
-		case DKIOCTRIM:
-			bp->bio_cmd = BIO_DELETE;
-			bp->bio_data = NULL;
-			bp->bio_offset = zio->io_offset;
-			bp->bio_length = zio->io_size;
-			break;
-		}
+		bp->bio_cmd = BIO_FLUSH;
+		bp->bio_flags |= BIO_ORDERED;
+		bp->bio_data = NULL;
+		bp->bio_offset = cp->provider->mediasize;
+		bp->bio_length = 0;
 		break;
 	}
 	bp->bio_done = vdev_geom_io_intr;

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -287,7 +287,8 @@ vdev_mirror_io_start(zio_t *zio)
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_mirror_scrub_done, mc));
 			}
-			return (ZIO_PIPELINE_CONTINUE);
+			zio_interrupt(zio);
+			return (ZIO_PIPELINE_STOP);
 		}
 		/*
 		 * For normal reads just pick one child.
@@ -314,7 +315,8 @@ vdev_mirror_io_start(zio_t *zio)
 		c++;
 	}
 
-	return (ZIO_PIPELINE_CONTINUE);
+	zio_interrupt(zio);
+	return (ZIO_PIPELINE_STOP);
 }
 
 static int

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -71,7 +71,8 @@ static int
 vdev_missing_io_start(zio_t *zio)
 {
 	zio->io_error = SET_ERROR(ENOTSUP);
-	return (ZIO_PIPELINE_CONTINUE);
+	zio_interrupt(zio);
+	return (ZIO_PIPELINE_STOP);
 }
 
 /* ARGSUSED */

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -40,9 +40,9 @@
  *
  * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
  * I/O scheduler determines when and in what order those operations are
- * issued.  The I/O scheduler divides operations into five I/O classes
+ * issued.  The I/O scheduler divides operations into six I/O classes
  * prioritized in the following order: sync read, sync write, async read,
- * async write, and scrub/resilver.  Each queue defines the minimum and
+ * async write, scrub/resilver and trim.  Each queue defines the minimum and
  * maximum number of concurrent operations that may be issued to the device.
  * In addition, the device has an aggregate maximum. Note that the sum of the
  * per-queue minimums must not exceed the aggregate maximum, and if the
@@ -61,7 +61,7 @@
  * done in the order specified above. No further operations are issued if the
  * aggregate maximum number of concurrent operations has been hit or if there
  * are no operations queued for an I/O class that has not hit its maximum.
- * Every time an i/o is queued or an operation completes, the I/O scheduler
+ * Every time an I/O is queued or an operation completes, the I/O scheduler
  * looks for new operations to issue.
  *
  * All I/O classes have a fixed maximum number of outstanding operations
@@ -70,7 +70,7 @@
  * transaction groups (see txg.c). Transaction groups enter the syncing state
  * periodically so the number of queued async writes will quickly burst up and
  * then bleed down to zero. Rather than servicing them as quickly as possible,
- * the I/O scheduler changes the maximum number of active async write i/os
+ * the I/O scheduler changes the maximum number of active async write I/Os
  * according to the amount of dirty data in the pool (see dsl_pool.c). Since
  * both throughput and latency typically increase with the number of
  * concurrent operations issued to physical devices, reducing the burstiness
@@ -113,14 +113,14 @@
  */
 
 /*
- * The maximum number of i/os active to each device.  Ideally, this will be >=
+ * The maximum number of I/Os active to each device.  Ideally, this will be >=
  * the sum of each queue's max_active.  It must be at least the sum of each
  * queue's min_active.
  */
 uint32_t zfs_vdev_max_active = 1000;
 
 /*
- * Per-queue limits on the number of i/os active to each device.  If the
+ * Per-queue limits on the number of I/Os active to each device.  If the
  * sum of the queue's max_active is < zfs_vdev_max_active, then the
  * min_active comes into play.  We will send min_active from each queue,
  * and then select from queues in the order defined by zio_priority_t.
@@ -145,6 +145,14 @@ uint32_t zfs_vdev_async_write_min_active
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
 uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_trim_min_active = 1;
+/*
+ * TRIM max active is large in comparison to the other values due to the fact
+ * that TRIM IOs are coalesced at the device layer. This value is set such
+ * that a typical SSD can process the queued IOs in a single request.
+ */
+uint32_t zfs_vdev_trim_max_active = 64;
+
 
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -171,7 +179,7 @@ SYSCTL_DECL(_vfs_zfs_vdev);
 TUNABLE_INT("vfs.zfs.vdev.max_active", &zfs_vdev_max_active);
 SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RW,
     &zfs_vdev_max_active, 0,
-    "The maximum number of i/os of all types active for each device.");
+    "The maximum number of I/Os of all types active for each device.");
 
 #define ZFS_VDEV_QUEUE_KNOB_MIN(name)					\
 TUNABLE_INT("vfs.zfs.vdev." #name "_min_active",			\
@@ -199,6 +207,8 @@ ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
 ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
 ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
 ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
+ZFS_VDEV_QUEUE_KNOB_MIN(trim);
+ZFS_VDEV_QUEUE_KNOB_MAX(trim);
 
 #undef ZFS_VDEV_QUEUE_KNOB
 
@@ -297,6 +307,7 @@ static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
 
@@ -313,6 +324,7 @@ static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
 
@@ -401,6 +413,8 @@ vdev_queue_class_min_active(zio_priority
 		return (zfs_vdev_async_write_min_active);
 	case ZIO_PRIORITY_SCRUB:
 		return (zfs_vdev_scrub_min_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_min_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -460,6 +474,8 @@ vdev_queue_class_max_active(spa_t *spa, 
 		return (vdev_queue_max_async_writes(spa));
 	case ZIO_PRIORITY_SCRUB:
 		return (zfs_vdev_scrub_max_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_max_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -476,6 +492,8 @@ vdev_queue_class_to_issue(vdev_queue_t *
 	spa_t *spa = vq->vq_vdev->vdev_spa;
 	zio_priority_t p;
 
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
 	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
@@ -517,10 +535,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, z
 	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
 	uint64_t maxgap = 0;
 	uint64_t size;
-	boolean_t stretch = B_FALSE;
-	vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
-	avl_tree_t *t = &vqc->vqc_queued_tree;
-	enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+	boolean_t stretch;
+	avl_tree_t *t;
+	enum zio_flag flags;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
 	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
 		return (NULL);
@@ -558,6 +577,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, z
 	 * Walk backwards through sufficiently contiguous I/Os
 	 * recording the last non-option I/O.
 	 */
+	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+	t = &vq->vq_class[zio->io_priority].vqc_queued_tree;
 	while ((dio = AVL_PREV(t, first)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
@@ -597,6 +618,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, z
 	 * non-optional I/O is close enough to make aggregation
 	 * worthwhile.
 	 */
+	stretch = B_FALSE;
 	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
 		zio_t *nio = last;
 		while ((dio = AVL_NEXT(t, nio)) != NULL &&
@@ -737,11 +759,13 @@ vdev_queue_io(zio_t *zio)
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_SCRUB)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
-	} else {
-		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_FREE);
+		zio->io_priority = ZIO_PRIORITY_TRIM;
 	}
 
 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -1755,7 +1755,9 @@ vdev_raidz_io_start(zio_t *zio)
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
-		return (ZIO_PIPELINE_CONTINUE);
+
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -1787,7 +1789,8 @@ vdev_raidz_io_start(zio_t *zio)
 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_interrupt(zio);
+		return (ZIO_PIPELINE_STOP);
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1827,7 +1830,8 @@ vdev_raidz_io_start(zio_t *zio)
 		}
 	}
 
-	return (ZIO_PIPELINE_CONTINUE);
+	zio_interrupt(zio);
+	return (ZIO_PIPELINE_STOP);
 }
 
 

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Thu Aug 21 22:42:02 2014	(r270311)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Thu Aug 21 22:44:08 2014	(r270312)
@@ -807,6 +807,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, ui
 	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
 		stage |= ZIO_STAGE_ISSUE_ASYNC;
 
+	flags |= ZIO_FLAG_DONT_QUEUE;
+
 	zio = zio_create(pio, spa, txg, bp, NULL, size,
 	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
 	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
@@ -851,14 +853,14 @@ zio_claim(zio_t *pio, spa_t *spa, uint64
 zio_t *
 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
     uint64_t size, zio_done_func_t *done, void *private,
-    enum zio_flag flags)
+    zio_priority_t priority, enum zio_flag flags)
 {
 	zio_t *zio;
 	int c;
 
 	if (vd->vdev_children == 0) {
 		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
-		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, offset, NULL,
+		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 
 		zio->io_cmd = cmd;
@@ -867,7 +869,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t
 
 		for (c = 0; c < vd->vdev_children; c++)
 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
-			    offset, size, done, private, flags));
+			    offset, size, done, private, priority, flags));
 	}
 
 	return (zio);
@@ -952,6 +954,10 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 	}
 
+	/* Not all IO types require vdev io done stage e.g. free */
+	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
+		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
+
 	if (vd->vdev_children == 0)
 		offset += VDEV_LABEL_START_SIZE;
 
@@ -997,7 +1003,7 @@ void
 zio_flush(zio_t *zio, vdev_t *vd)
 {
 	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
-	    NULL, NULL,
+	    NULL, NULL, ZIO_PRIORITY_NOW,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 }
 
@@ -1007,9 +1013,10 @@ zio_trim(zio_t *zio, spa_t *spa, vdev_t 
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
-	return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
-	    NULL, NULL,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
+	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
+	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
 }
 
 void
@@ -1036,9 +1043,8 @@ zio_shrink(zio_t *zio, uint64_t size)
  */
 
 static int
-zio_read_bp_init(zio_t **ziop)
+zio_read_bp_init(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
@@ -1071,9 +1077,8 @@ zio_read_bp_init(zio_t **ziop)
 }
 
 static int
-zio_write_bp_init(zio_t **ziop)
+zio_write_bp_init(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	spa_t *spa = zio->io_spa;
 	zio_prop_t *zp = &zio->io_prop;
 	enum zio_compress compress = zp->zp_compress;
@@ -1253,9 +1258,8 @@ zio_write_bp_init(zio_t **ziop)
 }
 
 static int
-zio_free_bp_init(zio_t **ziop)
+zio_free_bp_init(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
@@ -1338,10 +1342,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_t
 }
 
 static int
-zio_issue_async(zio_t **ziop)
+zio_issue_async(zio_t *zio)
 {
-	zio_t *zio = *ziop;
-
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 
 	return (ZIO_PIPELINE_STOP);
@@ -1409,7 +1411,7 @@ zio_execute(zio_t *zio)
 		}
 
 		zio->io_stage = stage;
-		rv = zio_pipeline[highbit64(stage) - 1](&zio);
+		rv = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (rv == ZIO_PIPELINE_STOP)
 			return;
@@ -1843,9 +1845,8 @@ zio_gang_tree_issue(zio_t *pio, zio_gang
 }
 
 static int
-zio_gang_assemble(zio_t **ziop)
+zio_gang_assemble(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
@@ -1859,9 +1860,8 @@ zio_gang_assemble(zio_t **ziop)
 }
 
 static int
-zio_gang_issue(zio_t **ziop)
+zio_gang_issue(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
@@ -1995,9 +1995,8 @@ zio_write_gang_block(zio_t *pio)
  * writes) and as a result is mutually exclusive with dedup.
  */
 static int
-zio_nop_write(zio_t **ziop)
+zio_nop_write(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	zio_prop_t *zp = &zio->io_prop;
@@ -2068,9 +2067,8 @@ zio_ddt_child_read_done(zio_t *zio)
 }
 
 static int
-zio_ddt_read_start(zio_t **ziop)
+zio_ddt_read_start(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_GET_DEDUP(bp));
@@ -2112,9 +2110,8 @@ zio_ddt_read_start(zio_t **ziop)
 }
 
 static int
-zio_ddt_read_done(zio_t **ziop)
+zio_ddt_read_done(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
@@ -2282,9 +2279,8 @@ zio_ddt_ditto_write_done(zio_t *zio)
 }
 
 static int
-zio_ddt_write(zio_t **ziop)
+zio_ddt_write(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
@@ -2395,9 +2391,8 @@ zio_ddt_write(zio_t **ziop)
 ddt_entry_t *freedde; /* for debugging */
 
 static int
-zio_ddt_free(zio_t **ziop)
+zio_ddt_free(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(spa, bp);
@@ -2422,9 +2417,8 @@ zio_ddt_free(zio_t **ziop)
  * ==========================================================================
  */
 static int
-zio_dva_allocate(zio_t **ziop)
+zio_dva_allocate(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	spa_t *spa = zio->io_spa;
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
@@ -2466,19 +2460,16 @@ zio_dva_allocate(zio_t **ziop)
 }
 
 static int
-zio_dva_free(zio_t **ziop)
+zio_dva_free(zio_t *zio)
 {
-	zio_t *zio = *ziop;
-
 	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static int
-zio_dva_claim(zio_t **ziop)
+zio_dva_claim(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	int error;
 
 	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
@@ -2572,12 +2563,12 @@ zio_free_zil(spa_t *spa, uint64_t txg, b
  * ==========================================================================
  */
 static int
-zio_vdev_io_start(zio_t **ziop)
+zio_vdev_io_start(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
+	int ret;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
@@ -2592,7 +2583,8 @@ zio_vdev_io_start(zio_t **ziop)
 		return (vdev_mirror_ops.vdev_op_io_start(zio));
 	}
 
-	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
+	    zio->io_priority == ZIO_PRIORITY_NOW) {
 		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
@@ -2677,41 +2669,44 @@ zio_vdev_io_start(zio_t **ziop)
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
-	if (vd->vdev_ops->vdev_op_leaf &&
-	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
-
-		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
-			return (ZIO_PIPELINE_CONTINUE);
+	if (vd->vdev_ops->vdev_op_leaf) {
+		switch (zio->io_type) {
+		case ZIO_TYPE_READ:
+			if (vdev_cache_read(zio))
+				return (ZIO_PIPELINE_CONTINUE);
+			/* FALLTHROUGH */
+		case ZIO_TYPE_WRITE:
+		case ZIO_TYPE_FREE:
+			if ((zio = vdev_queue_io(zio)) == NULL)
+				return (ZIO_PIPELINE_STOP);
 
-		if ((zio = vdev_queue_io(zio)) == NULL)
-			return (ZIO_PIPELINE_STOP);
-		*ziop = zio;
-
-		if (!vdev_accessible(vd, zio)) {
-			zio->io_error = SET_ERROR(ENXIO);
-			zio_interrupt(zio);
-			return (ZIO_PIPELINE_STOP);
+			if (!vdev_accessible(vd, zio)) {
+				zio->io_error = SET_ERROR(ENXIO);
+				zio_interrupt(zio);
+				return (ZIO_PIPELINE_STOP);
+			}
+			break;
 		}
-	}
-
-	/*
-	 * Note that we ignore repair writes for TRIM because they can conflict
-	 * with normal writes. This isn't an issue because, by definition, we
-	 * only repair blocks that aren't freed.
-	 */
-	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
-	    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
-		if (!trim_map_write_start(zio))
+		/*
+		 * Note that we ignore repair writes for TRIM because they can
+		 * conflict with normal writes. This isn't an issue because, by
+		 * definition, we only repair blocks that aren't freed.
+		 */
+		if (zio->io_type == ZIO_TYPE_WRITE &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+		    !trim_map_write_start(zio))
 			return (ZIO_PIPELINE_STOP);
 	}
 
-	return (vd->vdev_ops->vdev_op_io_start(zio));
+	ret = vd->vdev_ops->vdev_op_io_start(zio);
+	ASSERT(ret == ZIO_PIPELINE_STOP);
+
+	return (ret);
 }
 
 static int
-zio_vdev_io_done(zio_t **ziop)
+zio_vdev_io_done(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	vdev_t *vd = zio->io_vd;
 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
 	boolean_t unexpected_error = B_FALSE;
@@ -2723,7 +2718,8 @@ zio_vdev_io_done(zio_t **ziop)
 	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
 
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
-	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_FREE)) {
 
 		if (zio->io_type == ZIO_TYPE_WRITE &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
@@ -2785,9 +2781,8 @@ zio_vsd_default_cksum_report(zio_t *zio,
 }
 
 static int
-zio_vdev_io_assess(zio_t **ziop)
+zio_vdev_io_assess(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	vdev_t *vd = zio->io_vd;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
@@ -2804,7 +2799,8 @@ zio_vdev_io_assess(zio_t **ziop)
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
-	if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
+	if (zio->io_type == ZIO_TYPE_FREE &&
+	    zio->io_priority != ZIO_PRIORITY_NOW) {
 		switch (zio->io_error) {
 		case 0:
 			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
@@ -2817,6 +2813,7 @@ zio_vdev_io_assess(zio_t **ziop)
 			ZIO_TRIM_STAT_BUMP(failed);
 			break;
 		}
+	}
 
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
@@ -2900,9 +2897,8 @@ zio_vdev_io_bypass(zio_t *zio)
  * ==========================================================================
  */
 static int
-zio_checksum_generate(zio_t **ziop)
+zio_checksum_generate(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum;
 
@@ -2932,9 +2928,8 @@ zio_checksum_generate(zio_t **ziop)
 }
 
 static int
-zio_checksum_verify(zio_t **ziop)
+zio_checksum_verify(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	zio_bad_cksum_t info;
 	blkptr_t *bp = zio->io_bp;
 	int error;
@@ -3005,9 +3000,8 @@ zio_worst_error(int e1, int e2)
  * ==========================================================================
  */
 static int
-zio_ready(zio_t **ziop)
+zio_ready(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
 
@@ -3064,9 +3058,8 @@ zio_ready(zio_t **ziop)
 }
 
 static int
-zio_done(zio_t **ziop)
+zio_done(zio_t *zio)
 {
-	zio_t *zio = *ziop;
 	spa_t *spa = zio->io_spa;
 	zio_t *lio = zio->io_logical;
 	blkptr_t *bp = zio->io_bp;


More information about the svn-src-all mailing list