ZFS: Deadlock during vnode recycling

Sat Sep 29 23:06:57 UTC 2012

On Sep 29, 2012, at 8:45 AM, Pawel Jakub Dawidek <pjd at freebsd.org> wrote:

> On Tue, Sep 18, 2012 at 09:14:22AM -0600, Justin T. Gibbs wrote:
>> One of our systems became unresponsive due to an inability to recycle
>> vnodes.  We tracked this down to a deadlock in zfs_zget().  I've attached
>> the stack trace from the vnlru process to the end of this email.
>> 
>> We are currently testing the following patch. Since this issue is hard to
>> replicate I would appreciate review and feedback before I commit it to
>> FreeBSD.
> 
> I think the patch is ok. The whole mess we have to deal when to comes to
> reclamation won't get much more messy with this patch in place:)

We're running now with something a little more invasive than the
last patch I presented.  I added stats for the two different types
of collisions, but doing so made zfs_zget() even more messy, so I
refactored it.  I also made the stats in the file real kstats.

--
Justin

Both together:

Change 635310 by justing at justing_ns1_spectrabsd on 2012/09/17 15:30:14

For most vnode consumers of ZFS, the appropriate behavior
when encountering a vnode that is in the process of being
reclaimed is to wait for that process to complete and then
allocate a new vnode.  This behavior is enforced in zfs_zget()
by checking for the VI_DOOMED vnode flag.  In the case of
the thread actually reclaiming the vnode, zfs_zget() must
return the current vnode, otherwise a deadlock will occur.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h:
	Create a virtual znode field, z_reclaim_td, which is
	implemeted as a macro that redirects to z_task.ta_context.

	z_task is only used by the reclaim code to perform the
	final cleanup of a znode in a secondary thread.  Since
	this can only occur after any calls to zfs_zget(), it
	is safe to reuse the ta_context field.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:
	In zfs_freebsd_reclaim(), record curthread in the
	znode being reclaimed.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c:
	o Null out z_reclaim_td when znode_ts are constructed.

	o In zfs_zget(), return a "doomed vnode" if the current
	  thread is actively reclaiming this object.

Change 635653 on 2012/09/19 by justing at justing_ns1_spectrabsd

Add statistics for zfs_zget() vnode reclamation collisions.

sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c:
	o Convert the existing, but sun only, znode stats to
	  use the solaris kstat facility much like the rest
	  of ZFS.
	o Create a debug class of znode stats which are only
	  collected and published if ZFS's DEBUG macro is defined.
	  These are incremented using ZNODE_DEBUG_STAT_ADD()
	  instead of ZNODE_STAT_ADD().  All of the pre-existing
	  stats were debug only, and are now in this class.
	o Add two new (always available) stat entries:
	    zget_reclaim_collisions:
		The number of times ZFS has slept in zfs_zget()
		waiting for a vnode to be reclaimed.

	    zget_reclaim_td_collisions:
		The number of times ZFS has entered zfs_zget()
		from the thread reclaiming the vnode that is being
		reclaimed.  In this case the "doomed" vnode is
		returned imemediately so that reclamation can
		complete.
	o Refactor zfs_zget()
	  Two new helper functions were created:     
	    zfs_zget_znode_alloc():
		Called by zfs_zget() when no existing znode is found.

	    zfs_zget_reclaim_pause():
		Called by zfs_zget() when it must pause to wait for
		an in-progress vnode reclamation to complete.

	  Using these routines, restructure zfs_zget() to remove
	  a goto and nested blocks that pushed the common path
	  way off to the right.

--- /usr/home/justing/perforce/vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.h	2012-09-14 16:27:27.869936258 -0600
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.h	2012-09-20 10:56:49.908850009 -0600
@@ -241,6 +241,7 @@
 	struct task	z_task;
 } znode_t;
 
+#define	z_reclaim_td z_task.ta_context
 
 /*
  * Convert between znode pointers and vnode pointers

--- /usr/home/justing/perforce/vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	2012-09-14 16:27:27.869936258 -0600
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	2012-09-20 10:56:49.908850009 -0600
@@ -6083,6 +6083,13 @@ zfs_freebsd_reclaim(ap)
 
 	ASSERT(zp != NULL);
 
+ 	/*
+	 * Mark the znode so that operations that typically block
+	 * waiting for reclamation to complete will return the current,
+	 * "doomed vnode", for this thread.
+	 */
+	zp->z_reclaim_td = curthread;
+
 	/*
 	 * Destroy the vm object and flush associated pages.
 	 */

--- /usr/home/justing/perforce/vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	2012-09-14 16:27:27.869936258 -0600
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c	2012-09-20 10:56:49.908850009 -0600
@@ -43,6 +42,7 @@
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/atomic.h>
+#include <sys/kstat.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
@@ -70,19 +70,53 @@
 SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
     "sizeof(znode_t)");
 
-/*
- * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
+typedef struct znode_stats {
+	kstat_named_t zns_zget_reclaim_collisions;
+	kstat_named_t zns_zget_reclaim_td_collisions;
+} znode_stats_t;
+
+static znode_stats_t znode_stats = {
+	{ "zget_reclaim_collisions",	KSTAT_DATA_UINT64 },
+	{ "zget_reclaim_td_collisions",	KSTAT_DATA_UINT64 },
+};
+kstat_t	*znode_ksp;
+
+#define	ZNODE_STAT_ADD(stat)	\
+    atomic_add_64(&znode_stats.zns_##stat.value.ui64, 1)
+
 #ifdef	DEBUG
-#define	ZNODE_STATS
-#endif	/* DEBUG */
+typedef struct znode_debug_stats {
+#ifdef sun
+	kstat_named_t znds_move_zfsvfs_invald;
+	kstat_named_t znds_move_zfsvfs_recheck1;
+	kstat_named_t znds_move_zfsvfs_unmounted;
+	kstat_named_t znds_move_zfsvfs_recheck2;
+	kstat_named_t znds_move_obj_held;
+	kstat_named_t znds_move_vnode_locked;
+	kstat_named_t znds_move_not_only_dnlc;
+	kstat_named_t znds_zget_reclaim_collision;
+	kstat_named_t znds_zget_reclaim_td_collision;
+#endif	/* sun */
+} znode_debug_stats_t;
 
-#ifdef	ZNODE_STATS
-#define	ZNODE_STAT_ADD(stat)			((stat)++)
+static znode_debug_stats_t znode_debug_stats = {
+#ifdef sun
+	{ "move_zfsvfs_invalid",	KSTAT_DATA_UINT64 },
+	{ "move_zfsvfs_recheck1",	KSTAT_DATA_UINT64 },
+	{ "move_zfsvfs_unmounted",	KSTAT_DATA_UINT64 },
+	{ "move_zfsvfs_recheck2",	KSTAT_DATA_UINT64 },
+	{ "move_obj_held",		KSTAT_DATA_UINT64 },
+	{ "move_vnode_locked",		KSTAT_DATA_UINT64 },
+	{ "move_not_only_dnlc",		KSTAT_DATA_UINT64 },
+#endif	/* sun */
+};
+kstat_t	*znode_debug_ksp;
+
+#define	ZNODE_DEBUG_STAT_ADD(stat)	\
+    atomic_add_64(&znode_debug_stats.znds_##stat.value.ui64, 1)
 #else
-#define	ZNODE_STAT_ADD(stat)			/* nothing */
-#endif	/* ZNODE_STATS */
+#define	ZNODE_DEBUG_STAT_ADD(stat) do { } while (0)
+#endif	/* DEBUG */
 
 /*
  * Functions needed for userland (ie: libzpool) are not put under
@@ -159,6 +193,7 @@ zfs_znode_cache_constructor(void *buf, v
 	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
 	zp->z_moved = 0;
+	zp->z_reclaim_td = NULL;
 	return (0);
 }
 
@@ -277,7 +312,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 	 */
 	zfsvfs = ozp->z_zfsvfs;
 	if (!POINTER_IS_VALID(zfsvfs)) {
-		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
+		ZNODE_DEBUG_STAT_ADD(move_zfsvfs_invalid);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
@@ -290,7 +325,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 	rw_enter(&zfsvfs_lock, RW_WRITER);
 	if (zfsvfs != ozp->z_zfsvfs) {
 		rw_exit(&zfsvfs_lock);
-		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
+		ZNODE_DEBUG_STAT_ADD(move_zfsvfs_recheck1);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
@@ -304,7 +339,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 	if (zfsvfs->z_unmounted) {
 		ZFS_EXIT(zfsvfs);
 		rw_exit(&zfsvfs_lock);
-		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
+		ZNODE_DEBUG_STAT_ADD(move_zfsvfs_unmounted);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 	rw_exit(&zfsvfs_lock);
@@ -317,7 +352,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 	if (zfsvfs != ozp->z_zfsvfs) {
 		mutex_exit(&zfsvfs->z_znodes_lock);
 		ZFS_EXIT(zfsvfs);
-		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
+		ZNODE_DEBUG_STAT_ADD(move_zfsvfs_recheck2);
 		return (KMEM_CBRC_DONT_KNOW);
 	}
 
@@ -329,7 +364,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
 		mutex_exit(&zfsvfs->z_znodes_lock);
 		ZFS_EXIT(zfsvfs);
-		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
+		ZNODE_DEBUG_STAT_ADD(move_obj_held);
 		return (KMEM_CBRC_LATER);
 	}
 
@@ -338,7 +373,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
 		mutex_exit(&zfsvfs->z_znodes_lock);
 		ZFS_EXIT(zfsvfs);
-		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
+		ZNODE_DEBUG_STAT_ADD(move_vnode_locked);
 		return (KMEM_CBRC_LATER);
 	}
 
@@ -348,7 +383,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
 		mutex_exit(&zfsvfs->z_znodes_lock);
 		ZFS_EXIT(zfsvfs);
-		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
+		ZNODE_DEBUG_STAT_ADD(move_not_only_dnlc);
 		return (KMEM_CBRC_LATER);
 	}
 
@@ -380,6 +415,27 @@ zfs_znode_init(void)
 	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
 	kmem_cache_set_move(znode_cache, zfs_znode_move);
+
+	znode_ksp = kstat_create("zfs", 0, "znodestats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (znode_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (znode_ksp != NULL) {
+		znode_ksp->ks_data = &znode_stats;
+		kstat_install(znode_ksp);
+	}
+
+#ifdef DEBUG
+	znode_debug_ksp = kstat_create("zfs", 0, "znodestats", "debug",
+	    KSTAT_TYPE_NAMED,
+	    sizeof (znode_debug_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (znode_debug_ksp != NULL) {
+		znode_debug_ksp->ks_data = &znode_debug_stats;
+		kstat_install(znode_debug_ksp);
+	}
+#endif	/* DEBUG */
 }
 
 void
@@ -399,6 +455,17 @@ zfs_znode_fini(void)
 		kmem_cache_destroy(znode_cache);
 	znode_cache = NULL;
 	rw_destroy(&zfsvfs_lock);
+
+	if (znode_ksp != NULL) {
+		kstat_delete(znode_ksp);
+		znode_ksp = NULL;
+	}
+#ifdef DEBUG
+	if (znode_debug_ksp != NULL) {
+		kstat_delete(znode_debug_ksp);
+		znode_debug_ksp = NULL;
+	}
+#endif	/* DEBUG */
 }
 
 #ifdef sun
@@ -1140,6 +1203,69 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xv
 	}
 }
 
+/*
+ * Safely sleep for a short period while waiting for vnode
+ * reclamation to complete.
+ */
+static void
+zfs_zget_reclaim_pause(zfsvfs_t *zfsvfs, uint64_t obj_num, dmu_buf_t *db,
+    znode_t *zp, vnode_t *vp)
+{
+	sa_buf_rele(db, NULL);
+	mutex_exit(&zp->z_lock);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+	if (vp != NULL) {
+		/*
+		 * We must VN_RELE() the vnode here, after dropping
+		 * the znode lock, instead of closer to the VN_HOLD()
+		 * where the dying vnode is first detected.  VN_RELE()
+		 * will call vn_lock() when the last reference is
+		 * dropped, but the defined locking order is vnode lock
+		 * followed by the znode lock.
+		 */
+		VN_RELE(vp);
+	}
+	ZNODE_STAT_ADD(zget_reclaim_collisions);
+	tsleep(zp, 0, "zcollide", 1);
+}
+
+/*
+ * Create a new znode/vnode but only if the file exists.
+ *
+ * Note: There is a small window where zfs_vget() could
+ *       find this object while a file create is still in
+ *       progress.  This is checked for in zfs_znode_alloc().
+ *
+ * Note: If zfs_znode_alloc() fails it will drop the hold on the
+ *       bonus buffer.
+ */
+static int
+zfs_zget_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, dmu_object_info_t *doi,
+    znode_t **zpp)
+{
+	znode_t	*zp;
+	vnode_t *vp;
+	int	err;
+
+	*zpp = NULL;
+	zp = zfs_znode_alloc(zfsvfs, db, doi->doi_data_block_size,
+	    doi->doi_bonus_type, NULL);
+	if (zp == NULL)
+		return (ENOENT);
+
+	vp = ZTOV(zp);
+	err = insmntque(vp, zfsvfs->z_vfs);
+	if (err != 0) {
+		zp->z_vnode = NULL;
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_free(zp);
+		return (err);
+	}
+	*zpp = zp;
+	VOP_UNLOCK(vp, 0);
+	return (0);
+}
+
 int
 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 {
@@ -1148,33 +1274,38 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_
 	znode_t		*zp;
 	int err;
 	sa_handle_t	*hdl;
-	int first = 1;
 
 	*zpp = NULL;
 
-again:
-	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+	for (;;) {
+		vnode_t	*vp;
 
-	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
-	if (err) {
-		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-		return (err);
-	}
+		ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
-	dmu_object_info_from_db(db, &doi);
-	if (doi.doi_bonus_type != DMU_OT_SA &&
-	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
-	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
-	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
-		sa_buf_rele(db, NULL);
-		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-		return (EINVAL);
-	}
+		err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+		if (err) {
+			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+			return (err);
+		}
 
-	hdl = dmu_buf_get_user(db);
-	if (hdl != NULL) {
-		zp  = sa_get_userdata(hdl);
+		dmu_object_info_from_db(db, &doi);
+		if (doi.doi_bonus_type != DMU_OT_SA &&
+		    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+		    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+		    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+			sa_buf_rele(db, NULL);
+			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+			return (EINVAL);
+		}
+
+		hdl = (sa_handle_t *)dmu_buf_get_user(db);
+		if (hdl == NULL) {
+			err = zfs_zget_znode_alloc(zfsvfs, db, &doi, zpp);
+			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+			return (err);
+		}
 
+		zp = sa_get_userdata(hdl);
 
 		/*
 		 * Since "SA" does immediate eviction we
@@ -1188,83 +1319,36 @@ again:
 		ASSERT3U(zp->z_id, ==, obj_num);
 		if (zp->z_unlinked) {
 			err = ENOENT;
-		} else {
-			vnode_t *vp;
-			int dying = 0;
+			break;
+		}
 
-			vp = ZTOV(zp);
-			if (vp == NULL)
-				dying = 1;
-			else {
-				VN_HOLD(vp);
-				if ((vp->v_iflag & VI_DOOMED) != 0) {
-					dying = 1;
-					/*
-					 * Don't VN_RELE() vnode here, because
-					 * it can call vn_lock() which creates
-					 * LOR between vnode lock and znode
-					 * lock. We will VN_RELE() the vnode
-					 * after droping znode lock.
-					 */
-				}
-			}
-			if (dying) {
-				if (first) {
-					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
-					first = 0;
-				}
-				/*
-				 * znode is dying so we can't reuse it, we must
-				 * wait until destruction is completed.
-				 */
-				sa_buf_rele(db, NULL);
-				mutex_exit(&zp->z_lock);
-				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-				if (vp != NULL)
-					VN_RELE(vp);
-				tsleep(zp, 0, "zcollide", 1);
-				goto again;
-			}
-			*zpp = zp;
-			err = 0;
+		vp = ZTOV(zp);
+		if (vp == NULL) {
+			zfs_zget_reclaim_pause(zfsvfs, obj_num, db, zp, vp);
+			continue;
 		}
-		sa_buf_rele(db, NULL);
-		mutex_exit(&zp->z_lock);
-		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-		return (err);
-	}
 
-	/*
-	 * Not found create new znode/vnode
-	 * but only if file exists.
-	 *
-	 * There is a small window where zfs_vget() could
-	 * find this object while a file create is still in
-	 * progress.  This is checked for in zfs_znode_alloc()
-	 *
-	 * if zfs_znode_alloc() fails it will drop the hold on the
-	 * bonus buffer.
-	 */
-	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
-	    doi.doi_bonus_type, NULL);
-	if (zp == NULL) {
-		err = ENOENT;
-	} else {
-		*zpp = zp;
-	}
-	if (err == 0) {
-		vnode_t *vp = ZTOV(zp);
+		VN_HOLD(vp);
+		if ((vp->v_iflag & VI_DOOMED) != 0) {
 
-		err = insmntque(vp, zfsvfs->z_vfs);
-		if (err == 0)
-			VOP_UNLOCK(vp, 0);
-		else {
-			zp->z_vnode = NULL;
-			zfs_znode_dmu_fini(zp);
-			zfs_znode_free(zp);
-			*zpp = NULL;
+			if ((zp->z_reclaim_td != curthread)) {
+				zfs_zget_reclaim_pause(zfsvfs, obj_num, db,
+				    zp, vp);
+				continue;
+			}
+
+			/*
+			 * The the thread reclaiming this vnode
+			 * is allowed to use it.
+			 */
+			ZNODE_STAT_ADD(zget_reclaim_td_collisions);
 		}
+		*zpp = zp;
+		err = 0;
+		break;
 	}
+	sa_buf_rele(db, NULL);
+	mutex_exit(&zp->z_lock);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 	return (err);
 }