Call for Review: Suport for BIO_ORDERED bios

Justin T. Gibbs gibbs at scsiguy.com
Thu Aug 12 21:25:47 UTC 2010


The following patches introduce the BIO_ORDERED flag for bios.
A bio tagged with the BIO_ORDERED flag has barrier semantics:
all bios queued before the BIO_ORDERED bio are executed before
the BIO_ORDERED bio and any bios queued after the BIO_ORDERED bio
are executed after the BIO_ORDERED bio.

FreeBSD has offered these semantics before.  I added support for
ordered bufs over a decade ago, but with no consumers of that
interface in the tree, it was removed.  Today the landscape has
changed: ZFS requires ordered semantics to be able to safely
commit transactions, Jeff Roberson has talked of modifying
UFS to take advantage of this feature to speed up soft-updates,
Linux ext3/4 filesystems will take advantage of this feature,
and VMs using storage exported via Xen's blkback interface can
also querry for this feature.

My changes are sufficient to allow ZFS and the Xen blkback driver
(more on that driver in another email) to perform ordered I/O.
Additional, mostly trivial, changes will be required to pass ordering
information through the buf interface if/when other buf clients
grow to use this capability.

Are there any comments/concerns about these changes before I commit
them?

Thanks,
Justin

Change 216125 by justing at spectrabsd.import on 2010/07/29 16:24:07

	Add the BIO_ORDERED flag for struct bio and update bio clients to
	use it.
	
	sys/sys/bio.h:
		Add BIO_ORDERED as bit 4 of the bio_flags field in struct bio.
	
	sys/kern/subr_disk.c:
		In bioq_disksort(), bypass the normal sort for bios with the
		BIO_ORDERED attribute and instead insert them into the queue
		with bioq_insert_tail().  bioq_insert_tail() not only gives
		the desired command order during insertion, but also provides
		barrier semantics so that commands disksorted in the future
		cannot pass the just enqueued transaction.
	
	sys/cam/ata/ata_da.c:
	sys/cam/scsi/scsi_da.c
		Use an ordered command for SCSI/ATA-NCQ commands issued in
		response to bios with the BIO_ORDERED flag set.
	
	sys/cam/scsi/scsi_da.c
		Use an ordered tag when issuing a synchronize cache command.
	
		Wrap some lines to 80 columns.
	
	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
	sys/geom/geom_io.c
		Mark bios with the BIO_FLUSH command as BIO_ORDERED.
	
Affected files ...

... //depot/SpectraBSD/head/sys/cam/ata/ata_da.c#5 edit
... //depot/SpectraBSD/head/sys/cam/scsi/scsi_da.c#4 edit
... //depot/SpectraBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c#3 edit
... //depot/SpectraBSD/head/sys/geom/geom_io.c#4 edit
... //depot/SpectraBSD/head/sys/kern/subr_disk.c#3 edit
... //depot/SpectraBSD/head/sys/sys/bio.h#3 edit

Change 216126 by justing at spectrabsd.import on 2010/07/29 16:35:46

	Correct bioq_disksort so that bioq_insert_tail() offers barrier
	semantic.
	
	The barrier semantics of bioq_insert_tail() were broken in two ways:
	
	 o In bioq_disksort(), an added bio could be inserted at the head of
	   the queue, even when a barrier was present, if the sort key for
	   the new entry was less than that of the last queued barrier bio.
	
	 o The last_offset used to generate the sort key for newly queued bios
	   did not stay at the position of the barrier until either the
	   barrier was de-queued, or a new barrier (which updates last_offset)
	   was queued.  When a barrier is in effect, we know that the disk
	   will pass through the barrier position just before the
	   "blocked bios" are released, so using the barrier's offset for
	   last_offset is the optimal choice.
	
	sys/geom/sched/subr_disk.c:
	sys/kern/subr_disk.c:
	 o Update last_offset in bioq_insert_tail().
	
	 o Only update last_offset in bioq_remove() if the removed bio is
	   at the head of the queue (typically due to a call via
	   bioq_takefirst()) and no barrier is active.
	
	 o In bioq_disksort(), if we have a barrier (insert_point is non-NULL),
	   set prev to the barrier and cur to it's next element.  Now that
	   last_offset is kept at the barrier position, this change isn't
	   strictly necessary, but since we have to take a decision branch
	   anyway, it does avoid one, no-op, loop iteration in the while
	   loop that immediately follows.

Affected files ...

... //depot/SpectraBSD/head/sys/geom/sched/subr_disk.c#3 edit
... //depot/SpectraBSD/head/sys/kern/subr_disk.c#4 edit


--- //depot/vendor/FreeBSD/head/sys/cam/ata/ata_da.c	2010-07-25 15:43:52.000000000 -0600
+++ /home/justing/perforce/FreeBSD/head/sys/cam/ata/ata_da.c	2010-07-25 15:43:52.000000000 -0600
@@ -874,7 +874,8 @@
 		}
 		bioq_remove(&softc->bio_queue, bp);

-		if ((softc->flags & ADA_FLAG_NEED_OTAG) != 0) {
+		if ((bp->bio_flags & BIO_ORDERED) != 0
+		 || (softc->flags & ADA_FLAG_NEED_OTAG) != 0) {
 			softc->flags &= ~ADA_FLAG_NEED_OTAG;
 			softc->ordered_tag_count++;
 			tag_code = 0;
--- //depot/vendor/FreeBSD/head/sys/cam/scsi/scsi_da.c	2010-07-25 15:43:52.000000000 -0600
+++ /home/justing/perforce/FreeBSD/head/sys/cam/scsi/scsi_da.c	2010-07-25 15:43:52.000000000 -0600
@@ -1354,7 +1354,8 @@

 			bioq_remove(&softc->bio_queue, bp);

-			if ((softc->flags & DA_FLAG_NEED_OTAG) != 0) {
+			if ((bp->bio_flags & BIO_ORDERED) != 0
+			 || (softc->flags & DA_FLAG_NEED_OTAG) != 0) {
 				softc->flags &= ~DA_FLAG_NEED_OTAG;
 				softc->ordered_tag_count++;
 				tag_code = MSG_ORDERED_Q_TAG;
@@ -1368,7 +1369,8 @@
 						/*retries*/da_retry_count,
 						/*cbfcnp*/dadone,
 						/*tag_action*/tag_code,
-						/*read_op*/bp->bio_cmd == BIO_READ,
+						/*read_op*/bp->bio_cmd
+							== BIO_READ,
 						/*byte2*/0,
 						softc->minimum_cmd_size,
 						/*lba*/bp->bio_pblkno,
@@ -1377,17 +1379,24 @@
 						/*data_ptr*/ bp->bio_data,
 						/*dxfer_len*/ bp->bio_bcount,
 						/*sense_len*/SSD_FULL_SIZE,
-						/*timeout*/da_default_timeout*1000);
+						da_default_timeout * 1000);
 				break;
 			case BIO_FLUSH:
+				/*
+				 * BIO_FLUSH doesn't currently communicate
+				 * range data, so we synchronize the cache
+				 * over the whole disk.  We also force
+				 * ordered tag semantics the flush applies
+				 * to all previously queued I/O.
+				 */
 				scsi_synchronize_cache(&start_ccb->csio,
 						       /*retries*/1,
 						       /*cbfcnp*/dadone,
-						       MSG_SIMPLE_Q_TAG,
-						       /*begin_lba*/0,/* Cover the whole disk */
+						       MSG_ORDERED_Q_TAG,
+						       /*begin_lba*/0,
 						       /*lb_count*/0,
 						       SSD_FULL_SIZE,
-						       /*timeout*/da_default_timeout*1000);
+						       da_default_timeout*1000);
 				break;
 			}
 			start_ccb->ccb_h.ccb_state = DA_CCB_BUFFER_IO;
--- //depot/vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	2010-07-12 23:49:04.000000000 -0600
+++ /home/justing/perforce/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c	2010-07-12 23:49:04.000000000 -0600
@@ -598,6 +598,7 @@
 		break;
 	case ZIO_TYPE_IOCTL:
 		bp->bio_cmd = BIO_FLUSH;
+		bp->bio_flags |= BIO_ORDERED;
 		bp->bio_data = NULL;
 		bp->bio_offset = cp->provider->mediasize;
 		bp->bio_length = 0;
--- //depot/vendor/FreeBSD/head/sys/geom/geom_io.c	2010-06-10 17:49:36.000000000 -0600
+++ /home/justing/perforce/FreeBSD/head/sys/geom/geom_io.c	2010-06-10 17:49:36.000000000 -0600
@@ -265,6 +265,7 @@
 	g_trace(G_T_BIO, "bio_flush(%s)", cp->provider->name);
 	bp = g_alloc_bio();
 	bp->bio_cmd = BIO_FLUSH;
+	bp->bio_flags |= BIO_ORDERED;
 	bp->bio_done = NULL;
 	bp->bio_attribute = NULL;
 	bp->bio_offset = cp->provider->mediasize;
--- //depot/vendor/FreeBSD/head/sys/geom/sched/subr_disk.c	2010-04-12 16:37:45.000000000 -0600
+++ /home/justing/perforce/FreeBSD/head/sys/geom/sched/subr_disk.c	2010-04-12 16:37:45.000000000 -0600
@@ -86,7 +86,7 @@
  * bioq_remove()	remove a generic element from the queue, act as
  *		bioq_takefirst() if invoked on the head of the queue.
  *
- * The semantic of these methods is the same of the operations
+ * The semantic of these methods is the same as the operations
  * on the underlying TAILQ, but with additional guarantees on
  * subsequent bioq_disksort() calls. E.g. bioq_insert_tail()
  * can be useful for making sure that all previous ops are flushed
@@ -115,10 +115,10 @@
 gs_bioq_remove(struct bio_queue_head *head, struct bio *bp)
 {

-	if (bp == TAILQ_FIRST(&head->queue))
-		head->last_offset = bp->bio_offset + bp->bio_length;
-
-	if (bp == head->insert_point)
+	if (head->insert_point == NULL) {
+		if (bp == TAILQ_FIRST(&head->queue))
+			head->last_offset = bp->bio_offset + bp->bio_length;
+	} else if (bp == head->insert_point)
 		head->insert_point = NULL;

 	TAILQ_REMOVE(&head->queue, bp, bio_queue);
@@ -137,7 +137,8 @@
 gs_bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
 {

-	head->last_offset = bp->bio_offset;
+	if (head->insert_point == NULL)
+		head->last_offset = bp->bio_offset;
 	TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
 }

@@ -147,6 +148,7 @@

 	TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue);
 	head->insert_point = bp;
+	head->last_offset = bp->bio_offset;
 }

 struct bio *
@@ -189,13 +191,28 @@
 void
 gs_bioq_disksort(struct bio_queue_head *head, struct bio *bp)
 {
-	struct bio *cur, *prev = NULL;
-	uoff_t key = gs_bioq_bio_key(head, bp);
+	struct bio *cur, *prev;
+	uoff_t key;
+
+	if ((bp->bio_flags & BIO_ORDERED) != 0) {
+		/*
+		 * Ordered transactions can only be dispatched
+		 * after any currently queued transactions.  They
+		 * also have barrier semantics - no transactions
+		 * queued in the future can pass them.
+		 */
+		gs_bioq_insert_tail(head, bp);
+		return;
+	}

+	prev = NULL;
+	key = gs_bioq_bio_key(head, bp);
 	cur = TAILQ_FIRST(&head->queue);

-	if (head->insert_point)
-		cur = head->insert_point;
+	if (head->insert_point) {
+		prev = head->insert_point;
+		cur = TAILQ_NEXT(head->insert_point, bio_queue);
+	}

 	while (cur != NULL && key >= gs_bioq_bio_key(head, cur)) {
 		prev = cur;
--- //depot/vendor/FreeBSD/head/sys/kern/subr_disk.c	2010-07-18 20:57:53.000000000 -0600
+++ /home/justing/perforce/FreeBSD/head/sys/kern/subr_disk.c	2010-07-18 20:57:53.000000000 -0600
@@ -127,7 +127,7 @@
  * bioq_remove()	remove a generic element from the queue, act as
  *		bioq_takefirst() if invoked on the head of the queue.
  *
- * The semantic of these methods is the same of the operations
+ * The semantic of these methods is the same as the operations
  * on the underlying TAILQ, but with additional guarantees on
  * subsequent bioq_disksort() calls. E.g. bioq_insert_tail()
  * can be useful for making sure that all previous ops are flushed
@@ -156,10 +156,10 @@
 bioq_remove(struct bio_queue_head *head, struct bio *bp)
 {

-	if (bp == TAILQ_FIRST(&head->queue))
-		head->last_offset = bp->bio_offset + bp->bio_length;
-
-	if (bp == head->insert_point)
+	if (head->insert_point == NULL) {
+		if (bp == TAILQ_FIRST(&head->queue))
+			head->last_offset = bp->bio_offset + bp->bio_length;
+	} else if (bp == head->insert_point)
 		head->insert_point = NULL;

 	TAILQ_REMOVE(&head->queue, bp, bio_queue);
@@ -178,7 +178,8 @@
 bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
 {

-	head->last_offset = bp->bio_offset;
+	if (head->insert_point == NULL)
+		head->last_offset = bp->bio_offset;
 	TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
 }

@@ -188,6 +189,7 @@

 	TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue);
 	head->insert_point = bp;
+	head->last_offset = bp->bio_offset;
 }

 struct bio *
@@ -230,13 +232,28 @@
 void
 bioq_disksort(struct bio_queue_head *head, struct bio *bp)
 {
-	struct bio *cur, *prev = NULL;
-	uoff_t key = bioq_bio_key(head, bp);
+	struct bio *cur, *prev;
+	uoff_t key;
+
+	if ((bp->bio_flags & BIO_ORDERED) != 0) {
+		/*
+		 * Ordered transactions can only be dispatched
+		 * after any currently queued transactions.  They
+		 * also have barrier semantics - no transactions
+		 * queued in the future can pass them.
+		 */
+		bioq_insert_tail(head, bp);
+		return;
+	}

+	prev = NULL;
+	key = bioq_bio_key(head, bp);
 	cur = TAILQ_FIRST(&head->queue);

-	if (head->insert_point)
-		cur = head->insert_point;
+	if (head->insert_point) {
+		prev = head->insert_point;
+		cur = TAILQ_NEXT(head->insert_point, bio_queue);
+	}

 	while (cur != NULL && key >= bioq_bio_key(head, cur)) {
 		prev = cur;
--- //depot/vendor/FreeBSD/head/sys/sys/bio.h	2009-12-11 10:35:58.000000000 -0700
+++ /home/justing/perforce/FreeBSD/head/sys/sys/bio.h	2009-12-11 10:35:58.000000000 -0700
@@ -54,6 +54,7 @@
 #define BIO_ERROR	0x01
 #define BIO_DONE	0x02
 #define BIO_ONQUEUE	0x04
+#define BIO_ORDERED	0x08

 #ifdef _KERNEL
 struct disk;


More information about the freebsd-current mailing list