svn commit: r243013 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/ztest

Martin Matuska mm at FreeBSD.org
Wed Nov 14 00:43:28 UTC 2012


Author: mm
Date: Wed Nov 14 00:43:27 2012
New Revision: 243013
URL: http://svnweb.freebsd.org/changeset/base/243013

Log:
  Update vendor/illumos/dist and vendor/illumos-sys/dist
  to illumos-gate 13887:196932ec9e6a
  (illumos zfs issue #3236)

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zvol.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/ztest/ztest.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c	Wed Nov 14 00:43:27 2012	(r243013)
@@ -3418,6 +3418,12 @@ arc_write_done(zio_t *zio)
 				arc_hdr_destroy(exists);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
+			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+				/* nopwrite */
+				ASSERT(zio->io_prop.zp_nopwrite);
+				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+					panic("bad nopwrite, hdr=%p exists=%p",
+					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
 				ASSERT(hdr->b_datacnt == 1);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Wed Nov 14 00:43:27 2012	(r243013)
@@ -768,13 +768,15 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
-	if (!BP_IS_HOLE(bp)) {
+	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
 		spa_t *spa;
 
 		DB_GET_SPA(&spa, db);
 		zio_free(spa, txg, bp);
 	}
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+	dr->dt.dl.dr_nopwrite = B_FALSE;
+
 	/*
 	 * Release the already-written buffer, so we leave it in
 	 * a consistent dirty state.  Note that all callers are
@@ -2189,6 +2191,13 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
 	return (res);
 }
 
+blkptr_t *
+dmu_buf_get_blkptr(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	return (dbi->db_blkptr);
+}
+
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
@@ -2527,7 +2536,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b
 	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
-	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+	/*
+	 * For nopwrites and rewrites we ensure that the bp matches our
+	 * original and bypass all the accounting.
+	 */
+	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		objset_t *os;
@@ -2718,7 +2731,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-		    dr->dt.dl.dr_copies);
+		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c	Wed Nov 14 00:43:27 2012	(r243013)
@@ -40,12 +40,18 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
 #include <sys/sa.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
 #endif
 
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
@@ -1281,6 +1287,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
+		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+		if (dr->dt.dl.dr_nopwrite) {
+			blkptr_t *bp = zio->io_bp;
+			blkptr_t *bp_orig = &zio->io_bp_orig;
+			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
+
+			ASSERT(BP_EQUAL(bp, bp_orig));
+			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+			ASSERT(zio_checksum_table[chksum].ci_dedup);
+		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
@@ -1302,11 +1318,22 @@ dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
 
 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
-		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
-		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
-		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+		/*
+		 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
+		 * then there is nothing to do here. Otherwise, free the
+		 * newly allocated block in this txg.
+		 */
+		if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+			ASSERT(BP_EQUAL(bp, bp_orig));
+		} else {
+			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+		}
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
@@ -1351,7 +1378,7 @@ dmu_sync_late_arrival(zio_t *pio, objset
  *
  * Return values:
  *
- *	EEXIST: this txg has already been synced, so there's nothing to to.
+ *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
@@ -1383,7 +1410,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 	dnode_t *dn;
 
 	ASSERT(pio != NULL);
-	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(txg != 0);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
@@ -1438,6 +1464,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 		return (ENOENT);
 	}
 
+	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
+
+	/*
+	 * Assume the on-disk data is X, the current syncing data is Y,
+	 * and the current in-memory data is Z (currently in dmu_sync).
+	 * X and Z are identical but Y is has been modified. Normally,
+	 * when X and Z are the same we will perform a nopwrite but if Y
+	 * is different we must disable nopwrite since the resulting write
+	 * of Y to disk can free the block containing X. If we allowed a
+	 * nopwrite to occur the block pointing to Z would reference a freed
+	 * block. Since this is a rare case we simplify this by disabling
+	 * nopwrite if the current dmu_sync-ing dbuf has been modified in
+	 * a previous transaction.
+	 */
+	if (dr->dr_next)
+		zp.zp_nopwrite = B_FALSE;
+
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
@@ -1522,15 +1565,26 @@ dmu_write_policy(objset_t *os, dnode_t *
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
-	boolean_t dedup;
+	boolean_t dedup, nopwrite;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	int copies = os->os_copies;
 
 	/*
-	 * Determine checksum setting.
+	 * We maintain different write policies for each of the following
+	 * types of data:
+	 *	 1. metadata
+	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		/*
+		 * XXX -- we should design a compression algorithm
+		 * that specializes in arrays of bps.
+		 */
+		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+		    ZIO_COMPRESS_LZJB;
+
+		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
@@ -1540,45 +1594,52 @@ dmu_write_policy(objset_t *os, dnode_t *
 		if (zio_checksum_table[checksum].ci_correctable < 1 ||
 		    zio_checksum_table[checksum].ci_eck)
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
-	} else {
-		checksum = zio_checksum_select(dn->dn_checksum, checksum);
-	}
 
-	/*
-	 * Determine compression setting.
-	 */
-	if (ismd) {
+		dedup = B_FALSE;
+		nopwrite = B_FALSE;
+	} else if (wp & WP_NOFILL) {
+		ASSERT(level == 0);
+
 		/*
-		 * XXX -- we should design a compression algorithm
-		 * that specializes in arrays of bps.
+		 * If we're writing preallocated blocks, we aren't actually
+		 * writing them so don't set any policy properties.  These
+		 * blocks are currently only used by an external subsystem
+		 * outside of zfs (i.e. dump) and not written by the zio
+		 * pipeline.
 		 */
-		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-		    ZIO_COMPRESS_LZJB;
+		compress = ZIO_COMPRESS_OFF;
+		checksum = ZIO_CHECKSUM_OFF;
+		dedup = B_FALSE;
+		nopwrite = B_FALSE;
 	} else {
 		compress = zio_compress_select(dn->dn_compress, compress);
-	}
 
-	/*
-	 * Determine dedup setting.  If we are in dmu_sync(), we won't
-	 * actually dedup now because that's all done in syncing context;
-	 * but we do want to use the dedup checkum.  If the checksum is not
-	 * strong enough to ensure unique signatures, force dedup_verify.
-	 */
-	dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
-	if (dedup) {
-		checksum = dedup_checksum;
-		if (!zio_checksum_table[checksum].ci_dedup)
-			dedup_verify = 1;
-	}
+		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+		    zio_checksum_select(dn->dn_checksum, checksum) :
+		    dedup_checksum;
 
-	if (wp & WP_DMU_SYNC)
-		dedup = 0;
+		/*
+		 * Determine dedup setting.  If we are in dmu_sync(),
+		 * we won't actually dedup now because that's all
+		 * done in syncing context; but we do want to use the
+		 * dedup checkum.  If the checksum is not strong
+		 * enough to ensure unique signatures, force
+		 * dedup_verify.
+		 */
+		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+			if (!zio_checksum_table[checksum].ci_dedup)
+				dedup_verify = B_TRUE;
+		}
 
-	if (wp & WP_NOFILL) {
-		ASSERT(!ismd && level == 0);
-		checksum = ZIO_CHECKSUM_OFF;
-		compress = ZIO_COMPRESS_OFF;
-		dedup = B_FALSE;
+		/*
+		 * Enable nopwrite if we have a cryptographically secure
+		 * checksum that has no known collisions (i.e. SHA-256)
+		 * and compression is enabled.  We don't enable nopwrite if
+		 * dedup is enabled as the two features are mutually exclusive.
+		 */
+		nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
 	zp->zp_checksum = checksum;
@@ -1588,6 +1649,7 @@ dmu_write_policy(objset_t *os, dnode_t *
 	zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
+	zp->zp_nopwrite = nopwrite;
 }
 
 int

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c	Wed Nov 14 00:43:27 2012	(r243013)
@@ -415,7 +415,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t t
 	 * clean up our in-memory structures accumulated while syncing:
 	 *
 	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
-	 *  - clean up zil records
 	 *  - release hold from dsl_dataset_dirty()
 	 */
 	while (ds = list_remove_head(&synced_datasets)) {

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c	Wed Nov 14 00:43:27 2012	(r243013)
@@ -260,7 +260,6 @@ extern int zfs_txg_synctime_ms;
  * Secondly, the value determines if an I/O is considered "hung".
  * Any I/O that has not completed in zfs_deadman_synctime is considered
  * "hung" resulting in a system panic.
- * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
  */
 uint64_t zfs_deadman_synctime = 1000ULL;
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	Wed Nov 14 00:43:27 2012	(r243013)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
@@ -130,6 +131,7 @@ typedef struct dbuf_dirty_record {
 			blkptr_t dr_overridden_by;
 			override_states_t dr_override_state;
 			uint8_t dr_copies;
+			boolean_t dr_nopwrite;
 		} dl;
 	} dt;
 } dbuf_dirty_record_t;

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h	Wed Nov 14 00:43:27 2012	(r243013)
@@ -496,6 +496,11 @@ void dmu_evict_user(objset_t *os, dmu_bu
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 /*
+ * Returns the blkptr associated with this dbuf, or NULL if not set.
+ */
+struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
+
+/*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h	Wed Nov 14 00:43:27 2012	(r243013)
@@ -187,7 +187,9 @@ enum zio_flag {
 	ZIO_FLAG_RAW		= 1 << 21,
 	ZIO_FLAG_GANG_CHILD	= 1 << 22,
 	ZIO_FLAG_DDT_CHILD	= 1 << 23,
-	ZIO_FLAG_GODFATHER	= 1 << 24
+	ZIO_FLAG_GODFATHER	= 1 << 24,
+	ZIO_FLAG_NOPWRITE	= 1 << 25,
+	ZIO_FLAG_REEXECUTED	= 1 << 26,
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
@@ -285,8 +287,9 @@ typedef struct zio_prop {
 	dmu_object_type_t	zp_type;
 	uint8_t			zp_level;
 	uint8_t			zp_copies;
-	uint8_t			zp_dedup;
-	uint8_t			zp_dedup_verify;
+	boolean_t		zp_dedup;
+	boolean_t		zp_dedup_verify;
+	boolean_t		zp_nopwrite;
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
@@ -453,7 +456,8 @@ extern zio_t *zio_rewrite(zio_t *pio, sp
     void *data, uint64_t size, zio_done_func_t *done, void *private,
     int priority, enum zio_flag flags, zbookmark_t *zb);
 
-extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
+    boolean_t nopwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_impl.h	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_impl.h	Wed Nov 14 00:43:27 2012	(r243013)
@@ -38,6 +38,70 @@ extern "C" {
 #endif
 
 /*
+ * XXX -- Describe ZFS I/O pipleine here. Fill in as needed.
+ *
+ * The ZFS I/O pipeline is comprised of various stages which are defined
+ * in the zio_stage enum below. The individual stages are used to construct
+ * these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
+ *
+ * I/O operations: (XXX - provide detail for each of the operations)
+ *
+ * Read:
+ * Write:
+ * Free:
+ * Claim:
+ * Ioctl:
+ *
+ * Although the most common pipeline are used by the basic I/O operations
+ * above, there are some helper pipelines (one could consider them
+ * sub-pipelines) which are used internally by the ZIO module and are
+ * explained below:
+ *
+ * Interlock Pipeline:
+ * The interlock pipeline is the most basic pipeline and is used by all
+ * of the I/O operations. The interlock pipeline does not perform any I/O
+ * and is used to coordinate the dependencies between I/Os that are being
+ * issued (i.e. the parent/child relationship).
+ *
+ * Vdev child Pipeline:
+ * The vdev child pipeline is responsible for performing the physical I/O.
+ * It is in this pipeline where the I/O are queued and possibly cached.
+ *
+ * In addition to performing I/O, the pipeline is also responsible for
+ * data transformations. The transformations performed are based on the
+ * specific properties that user may have selected and modify the
+ * behavior of the pipeline. Examples of supported transformations are
+ * compression, dedup, and nop writes. Transformations will either modify
+ * the data or the pipeline. This list below further describes each of
+ * the supported transformations:
+ *
+ * Compression:
+ * ZFS supports three different flavors of compression -- gzip, lzjb, and
+ * zle. Compression occurs as part of the write pipeline and is performed
+ * in the ZIO_STAGE_WRITE_BP_INIT stage.
+ *
+ * Dedup:
+ * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
+ * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
+ * read pipeline if the dedup bit is set on the block pointer.
+ * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage
+ * and added to a write pipeline if a user has enabled dedup on that
+ * particular dataset.
+ *
+ * NOP Write:
+ * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
+ * and is added to an existing write pipeline if a crypographically
+ * secure checksum (i.e. SHA256) is enabled and compression is turned on.
+ * The NOP write stage will compare the checksums of the current data
+ * on-disk (level-0 blocks only) and the data that is currently being written.
+ * If the checksum values are identical then the pipeline is converted to
+ * an interlock pipeline skipping block allocation and bypassing the
+ * physical I/O.  The nop write feature can handle writes in either
+ * syncing or open context (i.e. zil writes) and as a result is mutually
+ * exclusive with dedup.
+ */
+
+/*
  * zio pipeline stage definitions
  */
 enum zio_stage {
@@ -50,27 +114,29 @@ enum zio_stage {
 
 	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 5,	/* -W--- */
 
-	ZIO_STAGE_DDT_READ_START	= 1 << 6,	/* R---- */
-	ZIO_STAGE_DDT_READ_DONE		= 1 << 7,	/* R---- */
-	ZIO_STAGE_DDT_WRITE		= 1 << 8,	/* -W--- */
-	ZIO_STAGE_DDT_FREE		= 1 << 9,	/* --F-- */
+	ZIO_STAGE_NOP_WRITE		= 1 << 6,	/* -W--- */
+
+	ZIO_STAGE_DDT_READ_START	= 1 << 7,	/* R---- */
+	ZIO_STAGE_DDT_READ_DONE		= 1 << 8,	/* R---- */
+	ZIO_STAGE_DDT_WRITE		= 1 << 9,	/* -W--- */
+	ZIO_STAGE_DDT_FREE		= 1 << 10,	/* --F-- */
 
-	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 10,	/* RWFC- */
-	ZIO_STAGE_GANG_ISSUE		= 1 << 11,	/* RWFC- */
+	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 11,	/* RWFC- */
+	ZIO_STAGE_GANG_ISSUE		= 1 << 12,	/* RWFC- */
 
-	ZIO_STAGE_DVA_ALLOCATE		= 1 << 12,	/* -W--- */
-	ZIO_STAGE_DVA_FREE		= 1 << 13,	/* --F-- */
-	ZIO_STAGE_DVA_CLAIM		= 1 << 14,	/* ---C- */
+	ZIO_STAGE_DVA_ALLOCATE		= 1 << 13,	/* -W--- */
+	ZIO_STAGE_DVA_FREE		= 1 << 14,	/* --F-- */
+	ZIO_STAGE_DVA_CLAIM		= 1 << 15,	/* ---C- */
 
-	ZIO_STAGE_READY			= 1 << 15,	/* RWFCI */
+	ZIO_STAGE_READY			= 1 << 16,	/* RWFCI */
 
-	ZIO_STAGE_VDEV_IO_START		= 1 << 16,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 17,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 18,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 17,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 18,	/* RW--I */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 19,	/* RW--I */
 
-	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 19,	/* R---- */
+	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 20,	/* R---- */
 
-	ZIO_STAGE_DONE			= 1 << 20	/* RWFCI */
+	ZIO_STAGE_DONE			= 1 << 21	/* RWFCI */
 };
 
 #define	ZIO_INTERLOCK_STAGES			\

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c	Wed Nov 14 00:43:27 2012	(r243013)
@@ -1053,6 +1053,12 @@ zfs_get_data(void *arg, lr_write_t *lr, 
 			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c	Wed Nov 14 00:43:27 2012	(r243013)
@@ -648,9 +648,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64
 	    DMU_OT_IS_VALID(zp->zp_type) &&
 	    zp->zp_level < 32 &&
 	    zp->zp_copies > 0 &&
-	    zp->zp_copies <= spa_max_replication(spa) &&
-	    zp->zp_dedup <= 1 &&
-	    zp->zp_dedup_verify <= 1);
+	    zp->zp_copies <= spa_max_replication(spa));
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
@@ -678,13 +676,20 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint
 }
 
 void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 {
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 
+	/*
+	 * We must reset the io_prop to match the values that existed
+	 * when the bp was first written by dmu_sync() keeping in mind
+	 * that nopwrite and dedup are mutually exclusive.
+	 */
+	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+	zio->io_prop.zp_nopwrite = nopwrite;
 	zio->io_prop.zp_copies = copies;
 	zio->io_bp_override = bp;
 }
@@ -972,6 +977,19 @@ zio_write_bp_init(zio_t *zio)
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+		/*
+		 * If we've been overridden and nopwrite is set then
+		 * set the flag accordingly to indicate that a nopwrite
+		 * has already occurred.
+		 */
+		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+			ASSERT(!zp->zp_dedup);
+			zio->io_flags |= ZIO_FLAG_NOPWRITE;
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+
+		ASSERT(!zp->zp_nopwrite);
+
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (ZIO_PIPELINE_CONTINUE);
 
@@ -1059,6 +1077,11 @@ zio_write_bp_init(zio_t *zio)
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
+		if (zp->zp_nopwrite) {
+			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+		}
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
@@ -1280,6 +1303,7 @@ zio_reexecute(zio_t *pio)
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
+	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_error = 0;
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_state[w] = 0;
@@ -1755,8 +1779,9 @@ zio_write_gang_block(zio_t *pio)
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
-		zp.zp_dedup = 0;
-		zp.zp_dedup_verify = 0;
+		zp.zp_dedup = B_FALSE;
+		zp.zp_dedup_verify = B_FALSE;
+		zp.zp_nopwrite = B_FALSE;
 
 		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1776,6 +1801,62 @@ zio_write_gang_block(zio_t *pio)
 }
 
 /*
+ * The zio_nop_write stage in the pipeline determines if allocating
+ * a new bp is necessary.  By leveraging a cryptographically secure checksum,
+ * such as SHA256, we can compare the checksums of the new data and the old
+ * to determine if allocating a new block is required.  The nopwrite
+ * feature can handle writes in either syncing or open context (i.e. zil
+ * writes) and as a result is mutually exclusive with dedup.
+ */
+static int
+zio_nop_write(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	zio_prop_t *zp = &zio->io_prop;
+
+	ASSERT(BP_GET_LEVEL(bp) == 0);
+	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+	ASSERT(zp->zp_nopwrite);
+	ASSERT(!zp->zp_dedup);
+	ASSERT(zio->io_bp_override == NULL);
+	ASSERT(IO_IS_ALLOCATING(zio));
+
+	/*
+	 * Check to see if the original bp and the new bp have matching
+	 * characteristics (i.e. same checksum, compression algorithms, etc).
+	 * If they don't then just continue with the pipeline which will
+	 * allocate a new bp.
+	 */
+	if (BP_IS_HOLE(bp_orig) ||
+	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
+		return (ZIO_PIPELINE_CONTINUE);
+
+	/*
+	 * If the checksums match then reset the pipeline so that we
+	 * avoid allocating a new bp and issuing any I/O.
+	 */
+	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+		    sizeof (uint64_t)) == 0);
+
+		*bp = *bp_orig;
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+		zio->io_flags |= ZIO_FLAG_NOPWRITE;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
  * ==========================================================================
  * Dedup
  * ==========================================================================
@@ -2047,7 +2128,7 @@ zio_ddt_write(zio_t *zio)
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
-			zp->zp_dedup = 0;
+			zp->zp_dedup = B_FALSE;
 		}
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
@@ -2669,7 +2750,8 @@ zio_ready(zio_t *zio)
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
-		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
@@ -2751,6 +2833,8 @@ zio_done(zio_t *zio)
 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
 		}
+		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
 	}
 
 	/*
@@ -2860,7 +2944,7 @@ zio_done(zio_t *zio)
 
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
-	    !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
+	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
@@ -2999,6 +3083,7 @@ static zio_pipe_stage_t *zio_pipeline[] 
 	zio_issue_async,
 	zio_write_bp_init,
 	zio_checksum_generate,
+	zio_nop_write,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zvol.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zvol.c	Wed Nov 14 00:40:40 2012	(r243012)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zvol.c	Wed Nov 14 00:43:27 2012	(r243013)
@@ -80,6 +80,7 @@
 #include <sys/zvol.h>
 #include <sys/dumphdr.h>
 #include <sys/zil_impl.h>
+#include <sys/dbuf.h>
 
 #include "zfs_namecheck.h"
 
@@ -972,6 +973,12 @@ zvol_get_data(void *arg, lr_write_t *lr,
 		error = dmu_buf_hold(os, object, offset, zgd, &db,
 		    DMU_READ_NO_PREFETCH);
 		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 


More information about the svn-src-all mailing list