svn commit: r187478 - in stable/7/sys: . conf contrib/pf dev/ath/ath_hal dev/cxgb kern nfsclient ufs/ffs ufs/ufs

Tue Jan 20 11:02:01 PST 2009

Author: jhb
Date: Tue Jan 20 19:01:59 2009
New Revision: 187478
URL: http://svn.freebsd.org/changeset/base/187478

Log:
  MFC: Close several races with using shared vnode locks for pathname lookups
  with UFS and enable shared lookups for UFS.
  - Change the name cache to fail lookups with EBADF if a directory vnode
    is recycled while it waits for a lock upgrade.
  - Rework the locking of the dirhash to use an sx lock and reference count
    on each hash structure.  Using an sx lock instead of a mutex allows the
    lock to be held across disk I/O closing a number of races when using
    shared vnode locks that were previously handled by exclusive vnode
    locks.
  - Remove the 'i_ino' and 'i_reclen' fields from the i-node.  i_ino is now
    a local variable in ufs_lookup(), and i_reclen is not needed since
    ufs_dirremove() always has the entire block holding the directory
    entry in memory when it updates the directory.
  - 'i_diroff' and 'i_offset' are now local variables in ufs_lookup().
    'i_diroff' is updated after a successful lookup.
  - Only set i_offset in the parent directory's i-node during a lookup for
    non-LOOKUP operations.
  - Remove the LOOKUP_SHARED option.  One can set vfs.lookup_shared to 1
    in either loader.conf or sysctl.conf instead.  The default setting for
    vfs.lookup_shared is not changed and remains off by default.

Modified:
  stable/7/sys/   (props changed)
  stable/7/sys/conf/options
  stable/7/sys/contrib/pf/   (props changed)
  stable/7/sys/dev/ath/ath_hal/   (props changed)
  stable/7/sys/dev/cxgb/   (props changed)
  stable/7/sys/kern/vfs_cache.c
  stable/7/sys/kern/vfs_lookup.c
  stable/7/sys/nfsclient/nfs_vnops.c
  stable/7/sys/ufs/ffs/ffs_vfsops.c
  stable/7/sys/ufs/ufs/dirhash.h
  stable/7/sys/ufs/ufs/inode.h
  stable/7/sys/ufs/ufs/ufs_dirhash.c
  stable/7/sys/ufs/ufs/ufs_lookup.c

Modified: stable/7/sys/conf/options
==============================================================================

--- stable/7/sys/conf/options	Tue Jan 20 18:16:31 2009	(r187477)
+++ stable/7/sys/conf/options	Tue Jan 20 19:01:59 2009	(r187478)
@@ -741,9 +741,6 @@ NI4BTEL			opt_i4b.h
 #XXXBZ#NI4BING			opt_i4b.h
 #XXXBZ#NI4BISPPP		opt_i4b.h
 
-# VFS options
-LOOKUP_SHARED		opt_vfs.h
-
 # HWPMC options
 HWPMC_HOOKS
 

Modified: stable/7/sys/kern/vfs_cache.c
==============================================================================
--- stable/7/sys/kern/vfs_cache.c	Tue Jan 20 18:16:31 2009	(r187477)
+++ stable/7/sys/kern/vfs_cache.c	Tue Jan 20 19:01:59 2009	(r187478)
@@ -300,7 +300,9 @@ cache_zap(ncp)
  * succeeds, the vnode is returned in *vpp, and a status of -1 is
  * returned. If the lookup determines that the name does not exist
  * (negative cacheing), a status of ENOENT is returned. If the lookup
- * fails, a status of zero is returned.
+ * fails, a status of zero is returned.  If the directory vnode is
+ * recycled out from under us due to a forced unmount, a status of
+ * EBADF is returned.
  *
  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
  * unlocked.  If we're looking up . an extra ref is taken, but the lock is
@@ -425,11 +427,19 @@ success:
 		 * When we lookup "." we still can be asked to lock it
 		 * differently...
 		 */
-		ltype = cnp->cn_lkflags & (LK_SHARED | LK_EXCLUSIVE);
-		if (ltype == VOP_ISLOCKED(*vpp, td))
-			return (-1);
-		else if (ltype == LK_EXCLUSIVE)
-			vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
+		ltype = cnp->cn_lkflags & LK_TYPE_MASK;
+		if (ltype != VOP_ISLOCKED(*vpp, td)) {
+			if (ltype == LK_EXCLUSIVE) {
+				vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
+				if ((*vpp)->v_iflag & VI_DOOMED) {
+					/* forced unmount */
+					vrele(*vpp);
+					*vpp = NULL;
+					return (EBADF);
+				}
+			} else
+				vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY, td);
+		}
 		return (-1);
 	}
 	ltype = 0;	/* silence gcc warning */
@@ -442,12 +452,14 @@ success:
 	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, td);
 	if (cnp->cn_flags & ISDOTDOT)
 		vn_lock(dvp, ltype | LK_RETRY, td);
-	if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_EXCLUSIVE))
-		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 	if (error) {
 		*vpp = NULL;
 		goto retry;
 	}
+	if ((cnp->cn_flags & ISLASTCN) &&
+	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
+		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
+	}
 	return (-1);
 }
 
@@ -663,9 +675,9 @@ vfs_cache_lookup(ap)
 	error = cache_lookup(dvp, vpp, cnp);
 	if (error == 0)
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
-	if (error == ENOENT)
-		return (error);
-	return (0);
+	if (error == -1)
+		return (0);
+	return (error);
 }
 
 

Modified: stable/7/sys/kern/vfs_lookup.c
==============================================================================
--- stable/7/sys/kern/vfs_lookup.c	Tue Jan 20 18:16:31 2009	(r187477)
+++ stable/7/sys/kern/vfs_lookup.c	Tue Jan 20 19:01:59 2009	(r187478)
@@ -39,7 +39,6 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
-#include "opt_vfs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -88,13 +87,10 @@ nameiinit(void *dummy __unused)
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
 
-#ifdef LOOKUP_SHARED
-static int lookup_shared = 1;
-#else
 static int lookup_shared = 0;
-#endif
 SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
     "Enables/Disables shared locks for path name translation");
+TUNABLE_INT("vfs.lookup_shared", &lookup_shared);
 
 /*
  * Convert a pathname into a pointer to a locked vnode.

Modified: stable/7/sys/nfsclient/nfs_vnops.c
==============================================================================
--- stable/7/sys/nfsclient/nfs_vnops.c	Tue Jan 20 18:16:31 2009	(r187477)
+++ stable/7/sys/nfsclient/nfs_vnops.c	Tue Jan 20 19:01:59 2009	(r187478)
@@ -868,7 +868,10 @@ nfs_lookup(struct vop_lookup_args *ap)
 		*vpp = NULLVP;
 		return (error);
 	}
-	if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) {
+	error = cache_lookup(dvp, vpp, cnp);
+	if (error > 0 && error != ENOENT)
+		return (error);
+	if (error == -1) {
 		struct vattr vattr;
 
 		newvp = *vpp;

Modified: stable/7/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- stable/7/sys/ufs/ffs/ffs_vfsops.c	Tue Jan 20 18:16:31 2009	(r187477)
+++ stable/7/sys/ufs/ffs/ffs_vfsops.c	Tue Jan 20 19:01:59 2009	(r187478)
@@ -852,7 +852,7 @@ ffs_mountfs(devvp, mp, td)
 	 * Initialize filesystem stat information in mount struct.
 	 */
 	MNT_ILOCK(mp);
-	mp->mnt_kern_flag |= MNTK_MPSAFE;
+	mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED;
 	MNT_IUNLOCK(mp);
 #ifdef UFS_EXTATTR
 #ifdef UFS_EXTATTR_AUTOSTART

Modified: stable/7/sys/ufs/ufs/dirhash.h
==============================================================================
--- stable/7/sys/ufs/ufs/dirhash.h	Tue Jan 20 18:16:31 2009	(r187477)
+++ stable/7/sys/ufs/ufs/dirhash.h	Tue Jan 20 19:01:59 2009	(r187478)
@@ -28,6 +28,9 @@
 #ifndef _UFS_UFS_DIRHASH_H_
 #define _UFS_UFS_DIRHASH_H_
 
+#include <sys/_lock.h>
+#include <sys/_sx.h>
+
 /*
  * For fast operations on large directories, we maintain a hash
  * that maps the file name to the offset of the directory entry within
@@ -80,12 +83,14 @@
     ((dh)->dh_hash[(slot) >> DH_BLKOFFSHIFT][(slot) & DH_BLKOFFMASK])
 
 struct dirhash {
-	struct mtx dh_mtx;	/* protects all fields except dh_list */
+	struct sx dh_lock;	/* protects all fields except list & score */
+	int	dh_refcount;
 
 	doff_t	**dh_hash;	/* the hash array (2-level) */
 	int	dh_narrays;	/* number of entries in dh_hash */
 	int	dh_hlen;	/* total slots in the 2-level hash array */
 	int	dh_hused;	/* entries in use */
+	int	dh_memreq;	/* Memory used. */
 
 	/* Free space statistics. XXX assumes DIRBLKSIZ is 512. */
 	u_int8_t *dh_blkfree;	/* free DIRALIGN words in each dir block */

Modified: stable/7/sys/ufs/ufs/inode.h
==============================================================================
--- stable/7/sys/ufs/ufs/inode.h	Tue Jan 20 18:16:31 2009	(r187477)
+++ stable/7/sys/ufs/ufs/inode.h	Tue Jan 20 19:01:59 2009	(r187478)
@@ -82,8 +82,6 @@ struct inode {
 	doff_t	  i_endoff;	/* End of useful stuff in directory. */
 	doff_t	  i_diroff;	/* Offset in dir, where we found last entry. */
 	doff_t	  i_offset;	/* Offset of free space in directory. */
-	ino_t	  i_ino;	/* Inode number of found directory. */
-	u_int32_t i_reclen;	/* Size of found directory entry. */
 
 	union {
 		struct dirhash *dirhash; /* Hashing for large directories. */

Modified: stable/7/sys/ufs/ufs/ufs_dirhash.c
==============================================================================
--- stable/7/sys/ufs/ufs/ufs_dirhash.c	Tue Jan 20 18:16:31 2009	(r187477)
+++ stable/7/sys/ufs/ufs/ufs_dirhash.c	Tue Jan 20 19:01:59 2009	(r187478)
@@ -46,7 +46,9 @@ __FBSDID("$FreeBSD$");
 #include <sys/buf.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/refcount.h>
 #include <sys/sysctl.h>
+#include <sys/sx.h>
 #include <vm/uma.h>
 
 #include <ufs/ufs/quota.h>
@@ -88,15 +90,16 @@ static int ufsdirhash_findslot(struct di
 	   doff_t offset);
 static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
 static int ufsdirhash_recycle(int wanted);
+static void ufsdirhash_free_locked(struct inode *ip);
 
 static uma_zone_t	ufsdirhash_zone;
 
 #define DIRHASHLIST_LOCK() 		mtx_lock(&ufsdirhash_mtx)
 #define DIRHASHLIST_UNLOCK() 		mtx_unlock(&ufsdirhash_mtx)
-#define DIRHASH_LOCK(dh)		mtx_lock(&(dh)->dh_mtx)
-#define DIRHASH_UNLOCK(dh) 		mtx_unlock(&(dh)->dh_mtx)
 #define DIRHASH_BLKALLOC_WAITOK() 	uma_zalloc(ufsdirhash_zone, M_WAITOK)
 #define DIRHASH_BLKFREE(ptr) 		uma_zfree(ufsdirhash_zone, (ptr))
+#define	DIRHASH_ASSERT_LOCKED(dh)					\
+    sx_assert(&(dh)->dh_lock, SA_LOCKED)
 
 /* Dirhash list; recently-used entries are near the tail. */
 static TAILQ_HEAD(, dirhash) ufsdirhash_list;
@@ -105,14 +108,199 @@ static TAILQ_HEAD(, dirhash) ufsdirhash_
 static struct mtx	ufsdirhash_mtx;
 
 /*
- * Locking order:
- *	ufsdirhash_mtx
- *	dh_mtx
+ * Locking:
  *
- * The dh_mtx mutex should be acquired either via the inode lock, or via
- * ufsdirhash_mtx. Only the owner of the inode may free the associated
- * dirhash, but anything can steal its memory and set dh_hash to NULL.
+ * The relationship between inode and dirhash is protected either by an
+ * exclusive vnode lock or the vnode interlock where a shared vnode lock
+ * may be used.  The dirhash_mtx is acquired after the dirhash lock.  To
+ * handle teardown races, code wishing to lock the dirhash for an inode
+ * when using a shared vnode lock must obtain a private reference on the
+ * dirhash while holding the vnode interlock.  They can drop it once they
+ * have obtained the dirhash lock and verified that the dirhash wasn't
+ * recycled while they waited for the dirhash lock.
+ *
+ * ufsdirhash_build() acquires a shared lock on the dirhash when it is
+ * successful.  This lock is released after a call to ufsdirhash_lookup().
+ *
+ * Functions requiring exclusive access use ufsdirhash_acquire() which may
+ * free a dirhash structure that was recycled by ufsdirhash_recycle().
+ *
+ * The dirhash lock may be held across io operations.
+ */
+
+static void
+ufsdirhash_hold(struct dirhash *dh)
+{
+
+	refcount_acquire(&dh->dh_refcount);
+}
+
+static void
+ufsdirhash_drop(struct dirhash *dh)
+{
+
+	if (refcount_release(&dh->dh_refcount)) {
+		sx_destroy(&dh->dh_lock);
+		free(dh, M_DIRHASH);
+	}
+}
+
+/*
+ * Release the lock on a dirhash.
+ */
+static void
+ufsdirhash_release(struct dirhash *dh)
+{
+
+	sx_unlock(&dh->dh_lock);
+}
+
+/*
+ * Either acquire an existing hash locked shared or create a new hash and
+ * return it exclusively locked.  May return NULL if the allocation fails.
+ *
+ * The vnode interlock is used to protect the i_dirhash pointer from
+ * simultaneous access while only a shared vnode lock is held.
+ */
+static struct dirhash *
+ufsdirhash_create(struct inode *ip)
+{
+	struct dirhash *ndh;
+	struct dirhash *dh;
+	struct vnode *vp;
+	int error;
+
+	error = 0;
+	ndh = dh = NULL;
+	vp = ip->i_vnode;
+	for (;;) {
+		/* Racy check for i_dirhash to prefetch an dirhash structure. */
+		if (ip->i_dirhash == NULL && ndh == NULL) {
+			MALLOC(ndh, struct dirhash *, sizeof *dh, M_DIRHASH,
+			    M_NOWAIT | M_ZERO);
+			if (ndh == NULL)
+				return (NULL);
+			refcount_init(&ndh->dh_refcount, 1);
+			sx_init(&ndh->dh_lock, "dirhash");
+			sx_xlock(&ndh->dh_lock);
+		}
+		/*
+		 * Check i_dirhash.  If it's NULL just try to use a
+		 * preallocated structure.  If none exists loop and try again.
+		 */
+		VI_LOCK(vp);
+		dh = ip->i_dirhash;
+		if (dh == NULL) {
+			ip->i_dirhash = ndh;
+			VI_UNLOCK(vp);
+			if (ndh == NULL)
+				continue;
+			return (ndh);
+		}
+		ufsdirhash_hold(dh);
+		VI_UNLOCK(vp);
+
+		/* Acquire a shared lock on existing hashes. */
+		sx_slock(&dh->dh_lock);
+
+		/* The hash could've been recycled while we were waiting. */
+		VI_LOCK(vp);
+		if (ip->i_dirhash != dh) {
+			VI_UNLOCK(vp);
+			ufsdirhash_release(dh);
+			ufsdirhash_drop(dh);
+			continue;
+		}
+		VI_UNLOCK(vp);
+		ufsdirhash_drop(dh);
+
+		/* If the hash is still valid we've succeeded. */
+		if (dh->dh_hash != NULL)
+			break;
+		/*
+		 * If the hash is NULL it has been recycled.  Try to upgrade
+		 * so we can recreate it.  If we fail the upgrade, drop our
+		 * lock and try again.
+		 */
+		if (sx_try_upgrade(&dh->dh_lock))
+			break;
+		sx_sunlock(&dh->dh_lock);
+	}
+	/* Free the preallocated structure if it was not necessary. */
+	if (ndh) {
+		ufsdirhash_release(ndh);
+		ufsdirhash_drop(ndh);
+	}
+	return (dh);
+}
+
+/*
+ * Acquire an exclusive lock on an existing hash.  Requires an exclusive
+ * vnode lock to protect the i_dirhash pointer.  hashes that have been
+ * recycled are reclaimed here and NULL is returned.
+ */
+static struct dirhash *
+ufsdirhash_acquire(struct inode *ip)
+{
+	struct dirhash *dh;
+	struct vnode *vp;
+
+	ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__);
+
+	vp = ip->i_vnode;
+	dh = ip->i_dirhash;
+	if (dh == NULL)
+		return (NULL);
+	sx_xlock(&dh->dh_lock);
+	if (dh->dh_hash != NULL)
+		return (dh);
+	ufsdirhash_free_locked(ip);
+	return (NULL);
+}
+
+/*
+ * Acquire exclusively and free the hash pointed to by ip.  Works with a
+ * shared or exclusive vnode lock.
  */
+void
+ufsdirhash_free(struct inode *ip)
+{
+	struct dirhash *dh;
+	struct vnode *vp;
+
+	vp = ip->i_vnode;
+	for (;;) {
+		/* Grab a reference on this inode's dirhash if it has one. */
+		VI_LOCK(vp);
+		dh = ip->i_dirhash;
+		if (dh == NULL) {
+			VI_UNLOCK(vp);
+			return;
+		}
+		ufsdirhash_hold(dh);
+		VI_UNLOCK(vp);
+
+		/* Exclusively lock the dirhash. */
+		sx_xlock(&dh->dh_lock);
+
+		/* If this dirhash still belongs to this inode, then free it. */
+		VI_LOCK(vp);
+		if (ip->i_dirhash == dh) {
+			VI_UNLOCK(vp);
+			ufsdirhash_drop(dh);
+			break;
+		}
+		VI_UNLOCK(vp);
+
+		/*
+		 * This inode's dirhash has changed while we were
+		 * waiting for the dirhash lock, so try again.
+		 */
+		ufsdirhash_release(dh);
+		ufsdirhash_drop(dh);
+	}
+	ufsdirhash_free_locked(ip);
+}
 
 /*
  * Attempt to build up a hash table for the directory contents in
@@ -128,27 +316,23 @@ ufsdirhash_build(struct inode *ip)
 	doff_t bmask, pos;
 	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
 
-	/* Check if we can/should use dirhash. */
-	if (ip->i_dirhash == NULL) {
-		if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
+	/* Take care of a decreased sysctl value. */
+	while (ufs_dirhashmem > ufs_dirhashmaxmem)
+		if (ufsdirhash_recycle(0) != 0)
 			return (-1);
-	} else {
-		/* Hash exists, but sysctls could have changed. */
-		if (ip->i_size < ufs_mindirhashsize ||
-		    ufs_dirhashmem > ufs_dirhashmaxmem) {
+
+	/* Check if we can/should use dirhash. */
+	if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) ||
+	    ip->i_effnlink == 0) {
+		if (ip->i_dirhash)
 			ufsdirhash_free(ip);
-			return (-1);
-		}
-		/* Check if hash exists and is intact (note: unlocked read). */
-		if (ip->i_dirhash->dh_hash != NULL)
-			return (0);
-		/* Free the old, recycled hash and build a new one. */
-		ufsdirhash_free(ip);
+		return (-1);
 	}
-
-	/* Don't hash removed directories. */
-	if (ip->i_effnlink == 0)
+	dh = ufsdirhash_create(ip);
+	if (dh == NULL)
 		return (-1);
+	if (dh->dh_hash != NULL)
+		return (0);
 
 	vp = ip->i_vnode;
 	/* Allocate 50% more entries than this dir size could ever need. */
@@ -159,7 +343,6 @@ ufsdirhash_build(struct inode *ip)
 	nslots = narrays * DH_NBLKOFF;
 	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
 	nblocks = (dirblocks * 3 + 1) / 2;
-
 	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
 	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
 	    nblocks * sizeof(*dh->dh_blkfree);
@@ -167,33 +350,40 @@ ufsdirhash_build(struct inode *ip)
 	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
 		DIRHASHLIST_UNLOCK();
 		if (memreqd > ufs_dirhashmaxmem / 2)
-			return (-1);
-
+			goto fail;
 		/* Try to free some space. */
 		if (ufsdirhash_recycle(memreqd) != 0)
-			return (-1);
+			goto fail;
 		/* Enough was freed, and list has been locked. */
 	}
 	ufs_dirhashmem += memreqd;
 	DIRHASHLIST_UNLOCK();
 
+	/* Initialise the hash table and block statistics. */
+	dh->dh_memreq = memreqd;
+	dh->dh_narrays = narrays;
+	dh->dh_hlen = nslots;
+	dh->dh_nblk = nblocks;
+	dh->dh_dirblks = dirblocks;
+	for (i = 0; i < DH_NFSTATS; i++)
+		dh->dh_firstfree[i] = -1;
+	dh->dh_firstfree[DH_NFSTATS] = 0;
+	dh->dh_hused = 0;
+	dh->dh_seqopt = 0;
+	dh->dh_seqoff = 0;
+	dh->dh_score = DH_SCOREINIT;
+
 	/*
 	 * Use non-blocking mallocs so that we will revert to a linear
 	 * lookup on failure rather than potentially blocking forever.
 	 */
-	MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
-	if (dh == NULL) {
-		DIRHASHLIST_LOCK();
-		ufs_dirhashmem -= memreqd;
-		DIRHASHLIST_UNLOCK();
-		return (-1);
-	}
-	mtx_init(&dh->dh_mtx, "dirhash", NULL, MTX_DEF);
 	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
 	    M_DIRHASH, M_NOWAIT | M_ZERO);
+	if (dh->dh_hash == NULL)
+		goto fail;
 	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
 	    M_DIRHASH, M_NOWAIT);
-	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
+	if (dh->dh_blkfree == NULL)
 		goto fail;
 	for (i = 0; i < narrays; i++) {
 		if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL)
@@ -201,22 +391,8 @@ ufsdirhash_build(struct inode *ip)
 		for (j = 0; j < DH_NBLKOFF; j++)
 			dh->dh_hash[i][j] = DIRHASH_EMPTY;
 	}
-
-	/* Initialise the hash table and block statistics. */
-	dh->dh_narrays = narrays;
-	dh->dh_hlen = nslots;
-	dh->dh_nblk = nblocks;
-	dh->dh_dirblks = dirblocks;
 	for (i = 0; i < dirblocks; i++)
 		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
-	for (i = 0; i < DH_NFSTATS; i++)
-		dh->dh_firstfree[i] = -1;
-	dh->dh_firstfree[DH_NFSTATS] = 0;
-	dh->dh_seqopt = 0;
-	dh->dh_seqoff = 0;
-	dh->dh_score = DH_SCOREINIT;
-	ip->i_dirhash = dh;
-
 	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
 	pos = 0;
 	while (pos < ip->i_size) {
@@ -254,63 +430,70 @@ ufsdirhash_build(struct inode *ip)
 	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
 	dh->dh_onlist = 1;
 	DIRHASHLIST_UNLOCK();
+	sx_downgrade(&dh->dh_lock);
 	return (0);
 
 fail:
-	if (dh->dh_hash != NULL) {
-		for (i = 0; i < narrays; i++)
-			if (dh->dh_hash[i] != NULL)
-				DIRHASH_BLKFREE(dh->dh_hash[i]);
-		FREE(dh->dh_hash, M_DIRHASH);
-	}
-	if (dh->dh_blkfree != NULL)
-		FREE(dh->dh_blkfree, M_DIRHASH);
-	mtx_destroy(&dh->dh_mtx);
-	FREE(dh, M_DIRHASH);
-	ip->i_dirhash = NULL;
-	DIRHASHLIST_LOCK();
-	ufs_dirhashmem -= memreqd;
-	DIRHASHLIST_UNLOCK();
+	ufsdirhash_free_locked(ip);
 	return (-1);
 }
 
 /*
  * Free any hash table associated with inode 'ip'.
  */
-void
-ufsdirhash_free(struct inode *ip)
+static void
+ufsdirhash_free_locked(struct inode *ip)
 {
 	struct dirhash *dh;
-	int i, mem;
+	struct vnode *vp;
+	int i;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
+	DIRHASH_ASSERT_LOCKED(ip->i_dirhash);
+
+	/*
+	 * Clear the pointer in the inode to prevent new threads from
+	 * finding the dead structure.
+	 */
+	vp = ip->i_vnode;
+	VI_LOCK(vp);
+	dh = ip->i_dirhash;
+	ip->i_dirhash = NULL;
+	VI_UNLOCK(vp);
+
+	/*
+	 * Remove the hash from the list since we are going to free its
+	 * memory.
+	 */
 	DIRHASHLIST_LOCK();
-	DIRHASH_LOCK(dh);
 	if (dh->dh_onlist)
 		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
-	DIRHASH_UNLOCK(dh);
+	ufs_dirhashmem -= dh->dh_memreq;
 	DIRHASHLIST_UNLOCK();
 
-	/* The dirhash pointed to by 'dh' is exclusively ours now. */
+	/*
+	 * At this point, any waiters for the lock should hold their
+	 * own reference on the dirhash structure.  They will drop
+	 * that reference once they grab the vnode interlock and see
+	 * that ip->i_dirhash is NULL.
+	 */
+	sx_xunlock(&dh->dh_lock);
 
-	mem = sizeof(*dh);
+	/*
+	 * Handle partially recycled as well as fully constructed hashes.
+	 */
 	if (dh->dh_hash != NULL) {
 		for (i = 0; i < dh->dh_narrays; i++)
-			DIRHASH_BLKFREE(dh->dh_hash[i]);
+			if (dh->dh_hash[i] != NULL)
+				DIRHASH_BLKFREE(dh->dh_hash[i]);
 		FREE(dh->dh_hash, M_DIRHASH);
-		FREE(dh->dh_blkfree, M_DIRHASH);
-		mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
-		    dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
-		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
+		if (dh->dh_blkfree != NULL)
+			FREE(dh->dh_blkfree, M_DIRHASH);
 	}
-	mtx_destroy(&dh->dh_mtx);
-	FREE(dh, M_DIRHASH);
-	ip->i_dirhash = NULL;
 
-	DIRHASHLIST_LOCK();
-	ufs_dirhashmem -= mem;
-	DIRHASHLIST_UNLOCK();
+	/*
+	 * Drop the inode's reference to the data structure.
+	 */
+	ufsdirhash_drop(dh);
 }
 
 /*
@@ -323,6 +506,8 @@ ufsdirhash_free(struct inode *ip)
  * prevoffp is non-NULL, the offset of the previous entry within
  * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
  * is the first in a block, the start of the block is used).
+ *
+ * Must be called with the hash locked.  Returns with the hash unlocked.
  */
 int
 ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
@@ -334,48 +519,36 @@ ufsdirhash_lookup(struct inode *ip, char
 	struct buf *bp;
 	doff_t blkoff, bmask, offset, prevoff;
 	int i, slot;
+	int error;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return (EJUSTRETURN);
+	dh = ip->i_dirhash;
+	KASSERT(dh != NULL && dh->dh_hash != NULL,
+	    ("ufsdirhash_lookup: Invalid dirhash %p\n", dh));
+	DIRHASH_ASSERT_LOCKED(dh);
 	/*
 	 * Move this dirhash towards the end of the list if it has a
-	 * score higher than the next entry, and acquire the dh_mtx.
-	 * Optimise the case where it's already the last by performing
-	 * an unlocked read of the TAILQ_NEXT pointer.
-	 *
-	 * In both cases, end up holding just dh_mtx.
+	 * score higher than the next entry, and acquire the dh_lock.
 	 */
+	DIRHASHLIST_LOCK();
 	if (TAILQ_NEXT(dh, dh_list) != NULL) {
-		DIRHASHLIST_LOCK();
-		DIRHASH_LOCK(dh);
 		/*
 		 * If the new score will be greater than that of the next
 		 * entry, then move this entry past it. With both mutexes
 		 * held, dh_next won't go away, but its dh_score could
 		 * change; that's not important since it is just a hint.
 		 */
-		if (dh->dh_hash != NULL &&
-		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
+		if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
 		    dh->dh_score >= dh_next->dh_score) {
 			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
 			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
 			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
 			    dh_list);
 		}
-		DIRHASHLIST_UNLOCK();
-	} else {
-		/* Already the last, though that could change as we wait. */
-		DIRHASH_LOCK(dh);
-	}
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return (EJUSTRETURN);
 	}
-
 	/* Update the score. */
 	if (dh->dh_score < DH_SCOREMAX)
 		dh->dh_score++;
+	DIRHASHLIST_UNLOCK();
 
 	vp = ip->i_vnode;
 	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
@@ -410,23 +583,23 @@ restart:
 	    slot = WRAPINCR(slot, dh->dh_hlen)) {
 		if (offset == DIRHASH_DEL)
 			continue;
-		DIRHASH_UNLOCK(dh);
-
 		if (offset < 0 || offset >= ip->i_size)
 			panic("ufsdirhash_lookup: bad offset in hash array");
 		if ((offset & ~bmask) != blkoff) {
 			if (bp != NULL)
 				brelse(bp);
 			blkoff = offset & ~bmask;
-			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
-				return (EJUSTRETURN);
+			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) {
+				error = EJUSTRETURN;
+				goto fail;
+			}
 		}
 		dp = (struct direct *)(bp->b_data + (offset & bmask));
 		if (dp->d_reclen == 0 || dp->d_reclen >
 		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
 			/* Corrupted directory. */
-			brelse(bp);
-			return (EJUSTRETURN);
+			error = EJUSTRETURN;
+			goto fail;
 		}
 		if (dp->d_namlen == namelen &&
 		    bcmp(dp->d_name, name, namelen) == 0) {
@@ -436,8 +609,8 @@ restart:
 					prevoff = ufsdirhash_getprev(dp,
 					    offset);
 					if (prevoff == -1) {
-						brelse(bp);
-						return (EJUSTRETURN);
+						error = EJUSTRETURN;
+						goto fail;
 					}
 				} else
 					prevoff = offset;
@@ -448,20 +621,12 @@ restart:
 			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
 				dh->dh_seqopt = 1;
 			dh->dh_seqoff = offset + DIRSIZ(0, dp);
-
 			*bpp = bp;
 			*offp = offset;
+			ufsdirhash_release(dh);
 			return (0);
 		}
 
-		DIRHASH_LOCK(dh);
-		if (dh->dh_hash == NULL) {
-			DIRHASH_UNLOCK(dh);
-			if (bp != NULL)
-				brelse(bp);
-			ufsdirhash_free(ip);
-			return (EJUSTRETURN);
-		}
 		/*
 		 * When the name doesn't match in the seqopt case, go back
 		 * and search normally.
@@ -471,10 +636,12 @@ restart:
 			goto restart;
 		}
 	}
-	DIRHASH_UNLOCK(dh);
+	error = ENOENT;
+fail:
+	ufsdirhash_release(dh);
 	if (bp != NULL)
 		brelse(bp);
-	return (ENOENT);
+	return (error);
 }
 
 /*
@@ -502,29 +669,22 @@ ufsdirhash_findfree(struct inode *ip, in
 	doff_t pos, slotstart;
 	int dirblock, error, freebytes, i;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return (-1);
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return (-1);
-	}
+	dh = ip->i_dirhash;
+	KASSERT(dh != NULL && dh->dh_hash != NULL,
+	    ("ufsdirhash_findfree: Invalid dirhash %p\n", dh));
+	DIRHASH_ASSERT_LOCKED(dh);
 
 	/* Find a directory block with the desired free space. */
 	dirblock = -1;
 	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
 		if ((dirblock = dh->dh_firstfree[i]) != -1)
 			break;
-	if (dirblock == -1) {
-		DIRHASH_UNLOCK(dh);
+	if (dirblock == -1)
 		return (-1);
-	}
 
 	KASSERT(dirblock < dh->dh_nblk &&
 	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
 	    ("ufsdirhash_findfree: bad stats"));
-	DIRHASH_UNLOCK(dh);
 	pos = dirblock * DIRBLKSIZ;
 	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
 	if (error)
@@ -582,24 +742,18 @@ ufsdirhash_enduseful(struct inode *ip)
 	struct dirhash *dh;
 	int i;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return (-1);
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return (-1);
-	}
+	dh = ip->i_dirhash;
+	DIRHASH_ASSERT_LOCKED(dh);
+	KASSERT(dh != NULL && dh->dh_hash != NULL,
+	    ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh));
 
-	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
-		DIRHASH_UNLOCK(dh);
+	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN)
 		return (-1);
-	}
 
 	for (i = dh->dh_dirblks - 1; i >= 0; i--)
 		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
 			break;
-	DIRHASH_UNLOCK(dh);
+
 	return ((doff_t)(i + 1) * DIRBLKSIZ);
 }
 
@@ -614,15 +768,9 @@ ufsdirhash_add(struct inode *ip, struct 
 	struct dirhash *dh;
 	int slot;
 
-	if ((dh = ip->i_dirhash) == NULL)
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return;
-	}
-
+	
 	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
 	    ("ufsdirhash_add: bad offset"));
 	/*
@@ -630,8 +778,7 @@ ufsdirhash_add(struct inode *ip, struct 
 	 * remove the hash entirely and let it be rebuilt later.
 	 */
 	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+		ufsdirhash_free_locked(ip);
 		return;
 	}
 
@@ -645,7 +792,7 @@ ufsdirhash_add(struct inode *ip, struct 
 
 	/* Update the per-block summary info. */
 	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -659,14 +806,8 @@ ufsdirhash_remove(struct inode *ip, stru
 	struct dirhash *dh;
 	int slot;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	}
 
 	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
 	    ("ufsdirhash_remove: bad offset"));
@@ -678,7 +819,7 @@ ufsdirhash_remove(struct inode *ip, stru
 
 	/* Update the per-block summary info. */
 	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -692,14 +833,8 @@ ufsdirhash_move(struct inode *ip, struct
 	struct dirhash *dh;
 	int slot;
 
-	if ((dh = ip->i_dirhash) == NULL)
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
-		return;
-	}
 
 	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
 	    newoff < dh->dh_dirblks * DIRBLKSIZ,
@@ -707,7 +842,7 @@ ufsdirhash_move(struct inode *ip, struct
 	/* Find the entry, and update the offset. */
 	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
 	DH_ENTRY(dh, slot) = newoff;
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -720,22 +855,15 @@ ufsdirhash_newblk(struct inode *ip, doff
 	struct dirhash *dh;
 	int block;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	}
 
 	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
 	    ("ufsdirhash_newblk: bad offset"));
 	block = offset / DIRBLKSIZ;
 	if (block >= dh->dh_nblk) {
 		/* Out of space; must rebuild. */
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+		ufsdirhash_free_locked(ip);
 		return;
 	}
 	dh->dh_dirblks = block + 1;
@@ -744,7 +872,7 @@ ufsdirhash_newblk(struct inode *ip, doff
 	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
 	if (dh->dh_firstfree[DH_NFSTATS] == -1)
 		dh->dh_firstfree[DH_NFSTATS] = block;
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);
 }
 
 /*
@@ -756,14 +884,8 @@ ufsdirhash_dirtrunc(struct inode *ip, do
 	struct dirhash *dh;
 	int block, i;
 
-	if ((dh = ip->i_dirhash) == NULL)
-		return;
-	DIRHASH_LOCK(dh);
-	if (dh->dh_hash == NULL) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+	if ((dh = ufsdirhash_acquire(ip)) == NULL)
 		return;
-	}
 
 	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
 	    ("ufsdirhash_dirtrunc: bad offset"));
@@ -775,8 +897,7 @@ ufsdirhash_dirtrunc(struct inode *ip, do
 	 * if necessary.
 	 */
 	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
-		DIRHASH_UNLOCK(dh);
-		ufsdirhash_free(ip);
+		ufsdirhash_free_locked(ip);
 		return;
 	}
 
@@ -794,7 +915,7 @@ ufsdirhash_dirtrunc(struct inode *ip, do
 		if (dh->dh_firstfree[i] >= block)
 			panic("ufsdirhash_dirtrunc: first free corrupt");
 	dh->dh_dirblks = block;
-	DIRHASH_UNLOCK(dh);
+	ufsdirhash_release(dh);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***