svn commit: r277430 - in vendor-sys/illumos/dist/uts/common/fs/zfs: . sys

Xin LI delphij at FreeBSD.org
Tue Jan 20 20:14:51 UTC 2015


Author: delphij
Date: Tue Jan 20 20:14:50 2015
New Revision: 277430
URL: https://svnweb.freebsd.org/changeset/base/277430

Log:
  5313 Allow I/Os to be aggregated across ZIO priority classes
  Reviewed by: Andriy Gapon <avg at FreeBSD.org>
  Reviewed by: Will Andrews <willa at SpectraLogic.com>
  Reviewed by: Matt Ahrens <mahrens at delphix.com>
  Reviewed by: George Wilson <george at delphix.com>
  Approved by: Robert Mustacchi <rm at joyent.com>
  Author: Justin T. Gibbs <justing at spectralogic.com>
  
  illumos/illumos-gate at fe319232d24f4ae183730a5a24a09423d8ab4429

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h	Tue Jan 20 20:13:46 2015	(r277429)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h	Tue Jan 20 20:14:50 2015	(r277430)
@@ -113,6 +113,8 @@ struct vdev_queue {
 	vdev_t		*vq_vdev;
 	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
 	avl_tree_t	vq_active_tree;
+	avl_tree_t	vq_read_offset_tree;
+	avl_tree_t	vq_write_offset_tree;
 	uint64_t	vq_last_offset;
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	kmutex_t	vq_lock;

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h	Tue Jan 20 20:13:46 2015	(r277429)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h	Tue Jan 20 20:14:50 2015	(r277430)
@@ -419,6 +419,7 @@ struct zio {
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;
 	avl_node_t	io_queue_node;
+	avl_node_t	io_offset_node;
 
 	/* Internal pipeline state */
 	enum zio_flag	io_flags;

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c	Tue Jan 20 20:13:46 2015	(r277429)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c	Tue Jan 20 20:14:50 2015	(r277430)
@@ -185,6 +185,22 @@ vdev_queue_offset_compare(const void *x1
 	return (0);
 }
 
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+	return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+	ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
+	if (t == ZIO_TYPE_READ)
+		return (&vq->vq_read_offset_tree);
+	else
+		return (&vq->vq_write_offset_tree);
+}
+
 int
 vdev_queue_timestamp_compare(const void *x1, const void *x2)
 {
@@ -214,19 +230,27 @@ vdev_queue_init(vdev_t *vd)
 
 	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
 	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
+	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
+	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
 
 	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		int (*compfn) (const void *, const void *);
+
 		/*
-		 * The synchronous i/o queues are FIFO rather than LBA ordered.
-		 * This provides more consistent latency for these i/os, and
-		 * they tend to not be tightly clustered anyway so there is
-		 * little to no throughput loss.
+		 * The synchronous i/o queues are dispatched in FIFO rather
+		 * than LBA order.  This provides more consistent latency for
+		 * these i/os.
 		 */
-		boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ ||
-		    p == ZIO_PRIORITY_SYNC_WRITE);
-		avl_create(&vq->vq_class[p].vqc_queued_tree,
-		    fifo ? vdev_queue_timestamp_compare :
-		    vdev_queue_offset_compare,
+		if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+			compfn = vdev_queue_timestamp_compare;
+		else
+			compfn = vdev_queue_offset_compare;
+
+		avl_create(vdev_queue_class_tree(vq, p), compfn,
 		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	}
 }
@@ -237,8 +261,10 @@ vdev_queue_fini(vdev_t *vd)
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
-		avl_destroy(&vq->vq_class[p].vqc_queued_tree);
+		avl_destroy(vdev_queue_class_tree(vq, p));
 	avl_destroy(&vq->vq_active_tree);
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 
 	mutex_destroy(&vq->vq_lock);
 }
@@ -248,7 +274,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_
 {
 	spa_t *spa = zio->io_spa;
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 
 	mutex_enter(&spa->spa_iokstat_lock);
 	spa->spa_queue_stats[zio->io_priority].spa_queued++;
@@ -262,7 +289,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, z
 {
 	spa_t *spa = zio->io_spa;
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 
 	mutex_enter(&spa->spa_iokstat_lock);
 	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
@@ -423,7 +451,7 @@ vdev_queue_class_to_issue(vdev_queue_t *
 
 	/* find a queue that has not reached its minimum # outstanding i/os */
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-		if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
 		    vdev_queue_class_min_active(p))
 			return (p);
@@ -434,7 +462,7 @@ vdev_queue_class_to_issue(vdev_queue_t *
 	 * maximum # outstanding i/os.
 	 */
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-		if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
+		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
 		    vdev_queue_class_max_active(spa, p))
 			return (p);
@@ -460,22 +488,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, z
 	uint64_t maxgap = 0;
 	uint64_t size;
 	boolean_t stretch = B_FALSE;
-	vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
-	avl_tree_t *t = &vqc->vqc_queued_tree;
+	avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
 	enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 
 	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
 		return (NULL);
 
-	/*
-	 * The synchronous i/o queues are not sorted by LBA, so we can't
-	 * find adjacent i/os.  These i/os tend to not be tightly clustered,
-	 * or too large to aggregate, so this has little impact on performance.
-	 */
-	if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
-	    zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
-		return (NULL);
-
 	first = last = zio;
 
 	if (zio->io_type == ZIO_TYPE_READ)
@@ -607,7 +625,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
 	zio_t *zio, *aio;
 	zio_priority_t p;
 	avl_index_t idx;
-	vdev_queue_class_t *vqc;
+	avl_tree_t *tree;
 	zio_t search;
 
 again:
@@ -626,13 +644,13 @@ again:
 	 *
 	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
 	 */
-	vqc = &vq->vq_class[p];
+	tree = vdev_queue_class_tree(vq, p);
 	search.io_timestamp = 0;
 	search.io_offset = vq->vq_last_offset + 1;
-	VERIFY3P(avl_find(&vqc->vqc_queued_tree, &search, &idx), ==, NULL);
-	zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
+	VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
+	zio = avl_nearest(tree, idx, AVL_AFTER);
 	if (zio == NULL)
-		zio = avl_first(&vqc->vqc_queued_tree);
+		zio = avl_first(tree);
 	ASSERT3U(zio->io_priority, ==, p);
 
 	aio = vdev_queue_aggregate(vq, zio);


More information about the svn-src-all mailing list