ZFS: Deadlock during vnode recycling
Justin T. Gibbs
gibbs at scsiguy.com
Sat Sep 29 23:06:57 UTC 2012
On Sep 29, 2012, at 8:45 AM, Pawel Jakub Dawidek <pjd at freebsd.org> wrote:
> On Tue, Sep 18, 2012 at 09:14:22AM -0600, Justin T. Gibbs wrote:
>> One of our systems became unresponsive due to an inability to recycle
>> vnodes. We tracked this down to a deadlock in zfs_zget(). I've attached
>> the stack trace from the vnlru process to the end of this email.
>>
>> We are currently testing the following patch. Since this issue is hard to
>> replicate I would appreciate review and feedback before I commit it to
>> FreeBSD.
>
> I think the patch is ok. The whole mess we have to deal when to comes to
> reclamation won't get much more messy with this patch in place:)
We're running now with something a little more invasive than the
last patch I presented. I added stats for the two different types
of collisions, but doing so made zfs_zget() even more messy, so I
refactored it. I also made the stats in the file real kstats.
--
Justin
Both together:
Change 635310 by justing at justing_ns1_spectrabsd on 2012/09/17 15:30:14
For most vnode consumers of ZFS, the appropriate behavior
when encountering a vnode that is in the process of being
reclaimed is to wait for that process to complete and then
allocate a new vnode. This behavior is enforced in zfs_zget()
by checking for the VI_DOOMED vnode flag. In the case of
the thread actually reclaiming the vnode, zfs_zget() must
return the current vnode, otherwise a deadlock will occur.
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h:
Create a virtual znode field, z_reclaim_td, which is
implemeted as a macro that redirects to z_task.ta_context.
z_task is only used by the reclaim code to perform the
final cleanup of a znode in a secondary thread. Since
this can only occur after any calls to zfs_zget(), it
is safe to reuse the ta_context field.
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:
In zfs_freebsd_reclaim(), record curthread in the
znode being reclaimed.
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c:
o Null out z_reclaim_td when znode_ts are constructed.
o In zfs_zget(), return a "doomed vnode" if the current
thread is actively reclaiming this object.
Change 635653 on 2012/09/19 by justing at justing_ns1_spectrabsd
Add statistics for zfs_zget() vnode reclamation collisions.
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c:
o Convert the existing, but sun only, znode stats to
use the solaris kstat facility much like the rest
of ZFS.
o Create a debug class of znode stats which are only
collected and published if ZFS's DEBUG macro is defined.
These are incremented using ZNODE_DEBUG_STAT_ADD()
instead of ZNODE_STAT_ADD(). All of the pre-existing
stats were debug only, and are now in this class.
o Add two new (always available) stat entries:
zget_reclaim_collisions:
The number of times ZFS has slept in zfs_zget()
waiting for a vnode to be reclaimed.
zget_reclaim_td_collisions:
The number of times ZFS has entered zfs_zget()
from the thread reclaiming the vnode that is being
reclaimed. In this case the "doomed" vnode is
returned imemediately so that reclamation can
complete.
o Refactor zfs_zget()
Two new helper functions were created:
zfs_zget_znode_alloc():
Called by zfs_zget() when no existing znode is found.
zfs_zget_reclaim_pause():
Called by zfs_zget() when it must pause to wait for
an in-progress vnode reclamation to complete.
Using these routines, restructure zfs_zget() to remove
a goto and nested blocks that pushed the common path
way off to the right.
--- /usr/home/justing/perforce/vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.h 2012-09-14 16:27:27.869936258 -0600
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.h 2012-09-20 10:56:49.908850009 -0600
@@ -241,6 +241,7 @@
struct task z_task;
} znode_t;
+#define z_reclaim_td z_task.ta_context
/*
* Convert between znode pointers and vnode pointers
--- /usr/home/justing/perforce/vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 2012-09-14 16:27:27.869936258 -0600
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 2012-09-20 10:56:49.908850009 -0600
@@ -6083,6 +6083,13 @@ zfs_freebsd_reclaim(ap)
ASSERT(zp != NULL);
+ /*
+ * Mark the znode so that operations that typically block
+ * waiting for reclamation to complete will return the current,
+ * "doomed vnode", for this thread.
+ */
+ zp->z_reclaim_td = curthread;
+
/*
* Destroy the vm object and flush associated pages.
*/
--- /usr/home/justing/perforce/vendor/FreeBSD/head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c 2012-09-14 16:27:27.869936258 -0600
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c 2012-09-20 10:56:49.908850009 -0600
@@ -43,6 +42,7 @@
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/atomic.h>
+#include <sys/kstat.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
@@ -70,19 +70,53 @@
SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
"sizeof(znode_t)");
-/*
- * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
+typedef struct znode_stats {
+ kstat_named_t zns_zget_reclaim_collisions;
+ kstat_named_t zns_zget_reclaim_td_collisions;
+} znode_stats_t;
+
+static znode_stats_t znode_stats = {
+ { "zget_reclaim_collisions", KSTAT_DATA_UINT64 },
+ { "zget_reclaim_td_collisions", KSTAT_DATA_UINT64 },
+};
+kstat_t *znode_ksp;
+
+#define ZNODE_STAT_ADD(stat) \
+ atomic_add_64(&znode_stats.zns_##stat.value.ui64, 1)
+
#ifdef DEBUG
-#define ZNODE_STATS
-#endif /* DEBUG */
+typedef struct znode_debug_stats {
+#ifdef sun
+ kstat_named_t znds_move_zfsvfs_invald;
+ kstat_named_t znds_move_zfsvfs_recheck1;
+ kstat_named_t znds_move_zfsvfs_unmounted;
+ kstat_named_t znds_move_zfsvfs_recheck2;
+ kstat_named_t znds_move_obj_held;
+ kstat_named_t znds_move_vnode_locked;
+ kstat_named_t znds_move_not_only_dnlc;
+ kstat_named_t znds_zget_reclaim_collision;
+ kstat_named_t znds_zget_reclaim_td_collision;
+#endif /* sun */
+} znode_debug_stats_t;
-#ifdef ZNODE_STATS
-#define ZNODE_STAT_ADD(stat) ((stat)++)
+static znode_debug_stats_t znode_debug_stats = {
+#ifdef sun
+ { "move_zfsvfs_invalid", KSTAT_DATA_UINT64 },
+ { "move_zfsvfs_recheck1", KSTAT_DATA_UINT64 },
+ { "move_zfsvfs_unmounted", KSTAT_DATA_UINT64 },
+ { "move_zfsvfs_recheck2", KSTAT_DATA_UINT64 },
+ { "move_obj_held", KSTAT_DATA_UINT64 },
+ { "move_vnode_locked", KSTAT_DATA_UINT64 },
+ { "move_not_only_dnlc", KSTAT_DATA_UINT64 },
+#endif /* sun */
+};
+kstat_t *znode_debug_ksp;
+
+#define ZNODE_DEBUG_STAT_ADD(stat) \
+ atomic_add_64(&znode_debug_stats.znds_##stat.value.ui64, 1)
#else
-#define ZNODE_STAT_ADD(stat) /* nothing */
-#endif /* ZNODE_STATS */
+#define ZNODE_DEBUG_STAT_ADD(stat) do { } while (0)
+#endif /* DEBUG */
/*
* Functions needed for userland (ie: libzpool) are not put under
@@ -159,6 +193,7 @@ zfs_znode_cache_constructor(void *buf, v
zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
zp->z_moved = 0;
+ zp->z_reclaim_td = NULL;
return (0);
}
@@ -277,7 +312,7 @@ zfs_znode_move(void *buf, void *newbuf,
*/
zfsvfs = ozp->z_zfsvfs;
if (!POINTER_IS_VALID(zfsvfs)) {
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
+ ZNODE_DEBUG_STAT_ADD(move_zfsvfs_invalid);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -290,7 +325,7 @@ zfs_znode_move(void *buf, void *newbuf,
rw_enter(&zfsvfs_lock, RW_WRITER);
if (zfsvfs != ozp->z_zfsvfs) {
rw_exit(&zfsvfs_lock);
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
+ ZNODE_DEBUG_STAT_ADD(move_zfsvfs_recheck1);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -304,7 +339,7 @@ zfs_znode_move(void *buf, void *newbuf,
if (zfsvfs->z_unmounted) {
ZFS_EXIT(zfsvfs);
rw_exit(&zfsvfs_lock);
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
+ ZNODE_DEBUG_STAT_ADD(move_zfsvfs_unmounted);
return (KMEM_CBRC_DONT_KNOW);
}
rw_exit(&zfsvfs_lock);
@@ -317,7 +352,7 @@ zfs_znode_move(void *buf, void *newbuf,
if (zfsvfs != ozp->z_zfsvfs) {
mutex_exit(&zfsvfs->z_znodes_lock);
ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
+ ZNODE_DEBUG_STAT_ADD(move_zfsvfs_recheck2);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -329,7 +364,7 @@ zfs_znode_move(void *buf, void *newbuf,
if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
mutex_exit(&zfsvfs->z_znodes_lock);
ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
+ ZNODE_DEBUG_STAT_ADD(move_obj_held);
return (KMEM_CBRC_LATER);
}
@@ -338,7 +373,7 @@ zfs_znode_move(void *buf, void *newbuf,
ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
mutex_exit(&zfsvfs->z_znodes_lock);
ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
+ ZNODE_DEBUG_STAT_ADD(move_vnode_locked);
return (KMEM_CBRC_LATER);
}
@@ -348,7 +383,7 @@ zfs_znode_move(void *buf, void *newbuf,
ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
mutex_exit(&zfsvfs->z_znodes_lock);
ZFS_EXIT(zfsvfs);
- ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
+ ZNODE_DEBUG_STAT_ADD(move_not_only_dnlc);
return (KMEM_CBRC_LATER);
}
@@ -380,6 +415,27 @@ zfs_znode_init(void)
sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
kmem_cache_set_move(znode_cache, zfs_znode_move);
+
+ znode_ksp = kstat_create("zfs", 0, "znodestats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (znode_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (znode_ksp != NULL) {
+ znode_ksp->ks_data = &znode_stats;
+ kstat_install(znode_ksp);
+ }
+
+#ifdef DEBUG
+ znode_debug_ksp = kstat_create("zfs", 0, "znodestats", "debug",
+ KSTAT_TYPE_NAMED,
+ sizeof (znode_debug_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (znode_debug_ksp != NULL) {
+ znode_debug_ksp->ks_data = &znode_debug_stats;
+ kstat_install(znode_debug_ksp);
+ }
+#endif /* DEBUG */
}
void
@@ -399,6 +455,17 @@ zfs_znode_fini(void)
kmem_cache_destroy(znode_cache);
znode_cache = NULL;
rw_destroy(&zfsvfs_lock);
+
+ if (znode_ksp != NULL) {
+ kstat_delete(znode_ksp);
+ znode_ksp = NULL;
+ }
+#ifdef DEBUG
+ if (znode_debug_ksp != NULL) {
+ kstat_delete(znode_debug_ksp);
+ znode_debug_ksp = NULL;
+ }
+#endif /* DEBUG */
}
#ifdef sun
@@ -1140,6 +1203,69 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xv
}
}
+/*
+ * Safely sleep for a short period while waiting for vnode
+ * reclamation to complete.
+ */
+static void
+zfs_zget_reclaim_pause(zfsvfs_t *zfsvfs, uint64_t obj_num, dmu_buf_t *db,
+ znode_t *zp, vnode_t *vp)
+{
+ sa_buf_rele(db, NULL);
+ mutex_exit(&zp->z_lock);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ if (vp != NULL) {
+ /*
+ * We must VN_RELE() the vnode here, after dropping
+ * the znode lock, instead of closer to the VN_HOLD()
+ * where the dying vnode is first detected. VN_RELE()
+ * will call vn_lock() when the last reference is
+ * dropped, but the defined locking order is vnode lock
+ * followed by the znode lock.
+ */
+ VN_RELE(vp);
+ }
+ ZNODE_STAT_ADD(zget_reclaim_collisions);
+ tsleep(zp, 0, "zcollide", 1);
+}
+
+/*
+ * Create a new znode/vnode but only if the file exists.
+ *
+ * Note: There is a small window where zfs_vget() could
+ * find this object while a file create is still in
+ * progress. This is checked for in zfs_znode_alloc().
+ *
+ * Note: If zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
+ */
+static int
+zfs_zget_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, dmu_object_info_t *doi,
+ znode_t **zpp)
+{
+ znode_t *zp;
+ vnode_t *vp;
+ int err;
+
+ *zpp = NULL;
+ zp = zfs_znode_alloc(zfsvfs, db, doi->doi_data_block_size,
+ doi->doi_bonus_type, NULL);
+ if (zp == NULL)
+ return (ENOENT);
+
+ vp = ZTOV(zp);
+ err = insmntque(vp, zfsvfs->z_vfs);
+ if (err != 0) {
+ zp->z_vnode = NULL;
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return (err);
+ }
+ *zpp = zp;
+ VOP_UNLOCK(vp, 0);
+ return (0);
+}
+
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
@@ -1148,33 +1274,38 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_
znode_t *zp;
int err;
sa_handle_t *hdl;
- int first = 1;
*zpp = NULL;
-again:
- ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+ for (;;) {
+ vnode_t *vp;
- err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
- if (err) {
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (err);
- }
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- dmu_object_info_from_db(db, &doi);
- if (doi.doi_bonus_type != DMU_OT_SA &&
- (doi.doi_bonus_type != DMU_OT_ZNODE ||
- (doi.doi_bonus_type == DMU_OT_ZNODE &&
- doi.doi_bonus_size < sizeof (znode_phys_t)))) {
- sa_buf_rele(db, NULL);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (EINVAL);
- }
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
- hdl = dmu_buf_get_user(db);
- if (hdl != NULL) {
- zp = sa_get_userdata(hdl);
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (EINVAL);
+ }
+
+ hdl = (sa_handle_t *)dmu_buf_get_user(db);
+ if (hdl == NULL) {
+ err = zfs_zget_znode_alloc(zfsvfs, db, &doi, zpp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+ zp = sa_get_userdata(hdl);
/*
* Since "SA" does immediate eviction we
@@ -1188,83 +1319,36 @@ again:
ASSERT3U(zp->z_id, ==, obj_num);
if (zp->z_unlinked) {
err = ENOENT;
- } else {
- vnode_t *vp;
- int dying = 0;
+ break;
+ }
- vp = ZTOV(zp);
- if (vp == NULL)
- dying = 1;
- else {
- VN_HOLD(vp);
- if ((vp->v_iflag & VI_DOOMED) != 0) {
- dying = 1;
- /*
- * Don't VN_RELE() vnode here, because
- * it can call vn_lock() which creates
- * LOR between vnode lock and znode
- * lock. We will VN_RELE() the vnode
- * after droping znode lock.
- */
- }
- }
- if (dying) {
- if (first) {
- ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
- first = 0;
- }
- /*
- * znode is dying so we can't reuse it, we must
- * wait until destruction is completed.
- */
- sa_buf_rele(db, NULL);
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- if (vp != NULL)
- VN_RELE(vp);
- tsleep(zp, 0, "zcollide", 1);
- goto again;
- }
- *zpp = zp;
- err = 0;
+ vp = ZTOV(zp);
+ if (vp == NULL) {
+ zfs_zget_reclaim_pause(zfsvfs, obj_num, db, zp, vp);
+ continue;
}
- sa_buf_rele(db, NULL);
- mutex_exit(&zp->z_lock);
- ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (err);
- }
- /*
- * Not found create new znode/vnode
- * but only if file exists.
- *
- * There is a small window where zfs_vget() could
- * find this object while a file create is still in
- * progress. This is checked for in zfs_znode_alloc()
- *
- * if zfs_znode_alloc() fails it will drop the hold on the
- * bonus buffer.
- */
- zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
- doi.doi_bonus_type, NULL);
- if (zp == NULL) {
- err = ENOENT;
- } else {
- *zpp = zp;
- }
- if (err == 0) {
- vnode_t *vp = ZTOV(zp);
+ VN_HOLD(vp);
+ if ((vp->v_iflag & VI_DOOMED) != 0) {
- err = insmntque(vp, zfsvfs->z_vfs);
- if (err == 0)
- VOP_UNLOCK(vp, 0);
- else {
- zp->z_vnode = NULL;
- zfs_znode_dmu_fini(zp);
- zfs_znode_free(zp);
- *zpp = NULL;
+ if ((zp->z_reclaim_td != curthread)) {
+ zfs_zget_reclaim_pause(zfsvfs, obj_num, db,
+ zp, vp);
+ continue;
+ }
+
+ /*
+ * The the thread reclaiming this vnode
+ * is allowed to use it.
+ */
+ ZNODE_STAT_ADD(zget_reclaim_td_collisions);
}
+ *zpp = zp;
+ err = 0;
+ break;
}
+ sa_buf_rele(db, NULL);
+ mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (err);
}
More information about the freebsd-fs
mailing list