svn commit: r322226 - in head: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs

Andriy Gapon avg at FreeBSD.org
Tue Aug 8 10:46:53 UTC 2017


Author: avg
Date: Tue Aug  8 10:46:51 2017
New Revision: 322226
URL: https://svnweb.freebsd.org/changeset/base/322226

Log:
  MFV r322223: 8378 crash due to bp in-memory modification of nopwrite block
  
  illumos/illumos-gate at b7edcb940884114e61382937505433c4c38c0278
  https://github.com/illumos/illumos-gate/commit/b7edcb940884114e61382937505433c4c38c0278
  
  https://www.illumos.org/issues/8378
    The problem is that zfs_get_data() supplies a stale zgd_bp to dmu_sync(), which
    we then nopwrite against.
    zfs_get_data() doesn't hold any DMU-related locks, so after it copies db_blkptr
    to zgd_bp, dbuf_write_ready()
    could change db_blkptr, and dbuf_write_done() could remove the dirty record.
    dmu_sync() then sees the stale
    BP and that the dbuf it not dirty, so it is eligible for nop-writing.
    The fix is for dmu_sync() to copy db_blkptr to zgd_bp after acquiring the
    db_mtx. We could still see a stale
    db_blkptr, but if it is stale then the dirty record will still exist and thus
    we won't attempt to nopwrite.
  
  Reviewed by: Prakash Surya <prakash.surya at delphix.com>
  Reviewed by: George Wilson <george.wilson at delphix.com>
  Approved by: Robert Mustacchi <rm at joyent.com>
  Author: Matthew Ahrens <mahrens at delphix.com>
  
  MFC after:	2 weeks

Modified:
  head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
Directory Properties:
  head/cddl/contrib/opensolaris/   (props changed)
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Tue Aug  8 10:45:22 2017	(r322225)
+++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Tue Aug  8 10:46:51 2017	(r322226)
@@ -1838,7 +1838,6 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, z
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
-	blkptr_t *bp = &lr->lr_blkptr;
 	uint64_t txg = lr->lr_common.lrc_txg;
 	uint64_t crtxg;
 	dmu_object_info_t doi;
@@ -1892,11 +1891,7 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, z
 		    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
-			blkptr_t *obp = dmu_buf_get_blkptr(db);
-			if (obp) {
-				ASSERT(BP_IS_HOLE(bp));
-				*bp = *obp;
-			}
+			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Tue Aug  8 10:45:22 2017	(r322225)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Tue Aug  8 10:46:51 2017	(r322226)
@@ -1657,6 +1657,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
 			ASSERT(BP_EQUAL(bp, bp_orig));
+			VERIFY(BP_EQUAL(bp, db->db_blkptr));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 			ASSERT(zio_checksum_table[chksum].ci_flags &
 			    ZCHECKSUM_FLAG_NOPWRITE);
@@ -1697,19 +1698,11 @@ dmu_sync_late_arrival_done(zio_t *zio)
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 
 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
-		/*
-		 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
-		 * then there is nothing to do here. Otherwise, free the
-		 * newly allocated block in this txg.
-		 */
-		if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
-			ASSERT(BP_EQUAL(bp, bp_orig));
-		} else {
-			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
-			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
-			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
-			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
-		}
+		ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
+		ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
@@ -1741,6 +1734,29 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sy
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
+	/*
+	 * Since we are currently syncing this txg, it's nontrivial to
+	 * determine what BP to nopwrite against, so we disable nopwrite.
+	 *
+	 * When syncing, the db_blkptr is initially the BP of the previous
+	 * txg.  We can not nopwrite against it because it will be changed
+	 * (this is similar to the non-late-arrival case where the dbuf is
+	 * dirty in a future txg).
+	 *
+	 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
+	 * We can not nopwrite against it because although the BP will not
+	 * (typically) be changed, the data has not yet been persisted to this
+	 * location.
+	 *
+	 * Finally, when dbuf_write_done() is called, it is theoretically
+	 * possible to always nopwrite, because the data that was written in
+	 * this txg is the same data that we are trying to write.  However we
+	 * would need to check that this dbuf is not dirty in any future
+	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
+	 * don't nopwrite in this case.
+	 */
+	zp->zp_nopwrite = B_FALSE;
+
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
@@ -1778,7 +1794,6 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sy
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
-	blkptr_t *bp = zgd->zgd_bp;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
@@ -1845,6 +1860,21 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done
 
 	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
 
+	if (db->db_blkptr != NULL) {
+		/*
+		 * We need to fill in zgd_bp with the current blkptr so that
+		 * the nopwrite code can check if we're writing the same
+		 * data that's already on disk.  We can only nopwrite if we
+		 * are sure that after making the copy, db_blkptr will not
+		 * change until our i/o completes.  We ensure this by
+		 * holding the db_mtx, and only allowing nopwrite if the
+		 * block is not already dirty (see below).  This is verified
+		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
+		 * not changed.
+		 */
+		*zgd->zgd_bp = *db->db_blkptr;
+	}
+
 	/*
 	 * Assume the on-disk data is X, the current syncing data (in
 	 * txg - 1) is Y, and the current in-memory data is Z (currently
@@ -1896,7 +1926,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
-	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
+	    zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
 	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	Tue Aug  8 10:45:22 2017	(r322225)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	Tue Aug  8 10:46:51 2017	(r322226)
@@ -1297,7 +1297,6 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
-	blkptr_t *bp = &lr->lr_blkptr;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error = 0;
@@ -1374,11 +1373,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio
 			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
-			blkptr_t *obp = dmu_buf_get_blkptr(db);
-			if (obp) {
-				ASSERT(BP_IS_HOLE(bp));
-				*bp = *obp;
-			}
+			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Tue Aug  8 10:45:22 2017	(r322225)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Tue Aug  8 10:46:51 2017	(r322226)
@@ -27,7 +27,7 @@
  * Portions Copyright 2010 Robert Milkowski
  *
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
@@ -1340,7 +1340,6 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zi
 	uint64_t object = ZVOL_OBJ;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;	/* length of user data */
-	blkptr_t *bp = &lr->lr_blkptr;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error;
@@ -1368,11 +1367,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zi
 		error = dmu_buf_hold(os, object, offset, zgd, &db,
 		    DMU_READ_NO_PREFETCH);
 		if (error == 0) {
-			blkptr_t *obp = dmu_buf_get_blkptr(db);
-			if (obp) {
-				ASSERT(BP_IS_HOLE(bp));
-				*bp = *obp;
-			}
+			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;


More information about the svn-src-all mailing list