svn commit: r285202 - in stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys

Andriy Gapon avg at FreeBSD.org
Mon Jul 6 10:40:55 UTC 2015


Author: avg
Date: Mon Jul  6 10:40:51 2015
New Revision: 285202
URL: https://svnweb.freebsd.org/changeset/base/285202

Log:
  MFC r284593: MFV r284412: 5911 ZFS "hangs" while deleting file
  
  illumos/illumos-gate at 46e1baa6cf6d5432f5fd231bb588df8f9570c858
  https://www.illumos.org/issues/5911
  Sometimes ZFS appears to hang while deleting a file. It is actually
  making slow progress at the file deletion, but other operations
  (administrative and writes via the data path) "hang" until the file
  removal completes, which can take a long time if the file has many
  blocks. The deletion (or most of it) happens in a single txg, and the
  sync thread spends most of its time reading indirect blocks...
  
  Reviewed by: Bayard Bell <buffer.g.overflow at gmail.com>
  Reviewed by: Alek Pinchuk <alek at nexenta.com>
  Reviewed by: Simon Klinkert <simon.klinkert at gmail.com>
  Reviewed by: Dan McDonald <danmcd at omniti.com>
  Approved by: Richard Lowe <richlowe at richlowe.net>
  Author: Matthew Ahrens <mahrens at delphix.com>
  
  PR:	199775
  Approved by:	re(kib)

Modified:
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Mon Jul  6 09:57:40 2015	(r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Mon Jul  6 10:40:51 2015	(r285202)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
@@ -1298,6 +1298,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 	dbuf_dirty_record_t *dr, **drp;
 
 	ASSERT(txg != 0);
+
+	/*
+	 * Due to our use of dn_nlevels below, this can only be called
+	 * in open context, unless we are operating on the MOS.
+	 * From syncing context, dn_nlevels may be different from the
+	 * dn_nlevels used when dbuf was dirtied.
+	 */
+	ASSERT(db->db_objset ==
+	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1320,11 +1330,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 
 	ASSERT(db->db.db_size != 0);
 
-	/*
-	 * Any space we accounted for in dp_dirty_* will be cleaned up by
-	 * dsl_pool_sync().  This is relatively rare so the discrepancy
-	 * is not a big deal.
-	 */
+	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+	    dr->dr_accounted, txg);
 
 	*drp = dr->dr_next;
 
@@ -1339,7 +1346,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
-	    db->db_level+1 == dn->dn_nlevels) {
+	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1356,11 +1363,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
 	}
 
-	if (db->db_level != 0) {
-		mutex_destroy(&dr->dt.di.dr_mtx);
-		list_destroy(&dr->dt.di.dr_children);
-	}
-
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
@@ -2318,7 +2320,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *
 
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
-	dbuf_sync_list(&dr->dt.di.dr_children, tx);
+	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
@@ -2464,7 +2466,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 }
 
 void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
@@ -2481,6 +2483,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *t
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
+		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Mon Jul  6 09:57:40 2015	(r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Mon Jul  6 10:40:51 2015	(r285202)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -686,7 +686,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
 			uint64_t ibyte = i << shift;
 			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
-			if (err == ESRCH)
+			if (err == ESRCH || i > end)
 				break;
 			if (err) {
 				tx->tx_err = err;

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Mon Jul  6 09:57:40 2015	(r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c	Mon Jul  6 10:40:51 2015	(r285202)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1492,6 +1492,16 @@ out:
 		rw_downgrade(&dn->dn_struct_rwlock);
 }
 
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+	if (db != NULL) {
+		dmu_buf_will_dirty(&db->db, tx);
+		dbuf_rele(db, FTAG);
+	}
+}
+
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
@@ -1612,27 +1622,67 @@ dnode_free_range(dnode_t *dn, uint64_t o
 		nblks += 1;
 
 	/*
-	 * Dirty the first and last indirect blocks, as they (and/or their
-	 * parents) will need to be written out if they were only
-	 * partially freed.  Interior indirect blocks will be themselves freed,
-	 * by free_children(), so they need not be dirtied.  Note that these
-	 * interior blocks have already been prefetched by dmu_tx_hold_free().
+	 * Dirty all the indirect blocks in this range.  Note that only
+	 * the first and last indirect blocks can actually be written
+	 * (if they were partially freed) -- they must be dirtied, even if
+	 * they do not exist on disk yet.  The interior blocks will
+	 * be freed by free_children(), so they will not actually be written.
+	 * Even though these interior blocks will not be written, we
+	 * dirty them for two reasons:
+	 *
+	 *  - It ensures that the indirect blocks remain in memory until
+	 *    syncing context.  (They have already been prefetched by
+	 *    dmu_tx_hold_free(), so we don't have to worry about reading
+	 *    them serially here.)
+	 *
+	 *  - The dirty space accounting will put pressure on the txg sync
+	 *    mechanism to begin syncing, and to delay transactions if there
+	 *    is a large amount of freeing.  Even though these indirect
+	 *    blocks will not be written, we could need to write the same
+	 *    amount of space if we copy the freed BPs into deadlists.
 	 */
 	if (dn->dn_nlevels > 1) {
 		uint64_t first, last;
 
 		first = blkid >> epbs;
-		if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
-			dmu_buf_will_dirty(&db->db, tx);
-			dbuf_rele(db, FTAG);
-		}
+		dnode_dirty_l1(dn, first, tx);
 		if (trunc)
 			last = dn->dn_maxblkid >> epbs;
 		else
 			last = (blkid + nblks - 1) >> epbs;
-		if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-			dmu_buf_will_dirty(&db->db, tx);
-			dbuf_rele(db, FTAG);
+		if (last != first)
+			dnode_dirty_l1(dn, last, tx);
+
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		for (uint64_t i = first + 1; i < last; i++) {
+			/*
+			 * Set i to the blockid of the next non-hole
+			 * level-1 indirect block at or after i.  Note
+			 * that dnode_next_offset() operates in terms of
+			 * level-0-equivalent bytes.
+			 */
+			uint64_t ibyte = i << shift;
+			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+			    &ibyte, 2, 1, 0);
+			i = ibyte >> shift;
+			if (i >= last)
+				break;
+
+			/*
+			 * Normally we should not see an error, either
+			 * from dnode_next_offset() or dbuf_hold_level()
+			 * (except for ESRCH from dnode_next_offset).
+			 * If there is an i/o error, then when we read
+			 * this block in syncing context, it will use
+			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+			 * to the "failmode" property.  dnode_next_offset()
+			 * doesn't have a flag to indicate MUSTSUCCEED.
+			 */
+			if (err != 0)
+				break;
+
+			dnode_dirty_l1(dn, i, tx);
 		}
 	}
 

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	Mon Jul  6 09:57:40 2015	(r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	Mon Jul  6 10:40:51 2015	(r285202)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -712,7 +712,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		mutex_exit(&dn->dn_mtx);
 	}
 
-	dbuf_sync_list(list, tx);
+	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
 
 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		ASSERT3P(list_head(list), ==, NULL);

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Mon Jul  6 09:57:40 2015	(r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Mon Jul  6 10:40:51 2015	(r285202)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
@@ -287,7 +287,7 @@ void dbuf_evict(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,


More information about the svn-src-stable-10 mailing list