svn commit: r187478 - in stable/7/sys: . conf contrib/pf
dev/ath/ath_hal dev/cxgb kern nfsclient ufs/ffs ufs/ufs
John Baldwin
jhb at FreeBSD.org
Tue Jan 20 11:02:01 PST 2009
Author: jhb
Date: Tue Jan 20 19:01:59 2009
New Revision: 187478
URL: http://svn.freebsd.org/changeset/base/187478
Log:
MFC: Close several races with using shared vnode locks for pathname lookups
with UFS and enable shared lookups for UFS.
- Change the name cache to fail lookups with EBADF if a directory vnode
is recycled while it waits for a lock upgrade.
- Rework the locking of the dirhash to use an sx lock and reference count
on each hash structure. Using an sx lock instead of a mutex allows the
lock to be held across disk I/O closing a number of races when using
shared vnode locks that were previously handled by exclusive vnode
locks.
- Remove the 'i_ino' and 'i_reclen' fields from the i-node. i_ino is now
a local variable in ufs_lookup(), and i_reclen is not needed since
ufs_dirremove() always has the entire block holding the directory
entry in memory when it updates the directory.
- 'i_diroff' and 'i_offset' are now local variables in ufs_lookup().
'i_diroff' is updated after a successful lookup.
- Only set i_offset in the parent directory's i-node during a lookup for
non-LOOKUP operations.
- Remove the LOOKUP_SHARED option. One can set vfs.lookup_shared to 1
in either loader.conf or sysctl.conf instead. The default setting for
vfs.lookup_shared is not changed and remains off by default.
Modified:
stable/7/sys/ (props changed)
stable/7/sys/conf/options
stable/7/sys/contrib/pf/ (props changed)
stable/7/sys/dev/ath/ath_hal/ (props changed)
stable/7/sys/dev/cxgb/ (props changed)
stable/7/sys/kern/vfs_cache.c
stable/7/sys/kern/vfs_lookup.c
stable/7/sys/nfsclient/nfs_vnops.c
stable/7/sys/ufs/ffs/ffs_vfsops.c
stable/7/sys/ufs/ufs/dirhash.h
stable/7/sys/ufs/ufs/inode.h
stable/7/sys/ufs/ufs/ufs_dirhash.c
stable/7/sys/ufs/ufs/ufs_lookup.c
Modified: stable/7/sys/conf/options
==============================================================================
--- stable/7/sys/conf/options Tue Jan 20 18:16:31 2009 (r187477)
+++ stable/7/sys/conf/options Tue Jan 20 19:01:59 2009 (r187478)
@@ -741,9 +741,6 @@ NI4BTEL opt_i4b.h
#XXXBZ#NI4BING opt_i4b.h
#XXXBZ#NI4BISPPP opt_i4b.h
-# VFS options
-LOOKUP_SHARED opt_vfs.h
-
# HWPMC options
HWPMC_HOOKS
Modified: stable/7/sys/kern/vfs_cache.c
==============================================================================
--- stable/7/sys/kern/vfs_cache.c Tue Jan 20 18:16:31 2009 (r187477)
+++ stable/7/sys/kern/vfs_cache.c Tue Jan 20 19:01:59 2009 (r187478)
@@ -300,7 +300,9 @@ cache_zap(ncp)
* succeeds, the vnode is returned in *vpp, and a status of -1 is
* returned. If the lookup determines that the name does not exist
* (negative cacheing), a status of ENOENT is returned. If the lookup
- * fails, a status of zero is returned.
+ * fails, a status of zero is returned. If the directory vnode is
+ * recycled out from under us due to a forced unmount, a status of
+ * EBADF is returned.
*
* vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is
* unlocked. If we're looking up . an extra ref is taken, but the lock is
@@ -425,11 +427,19 @@ success:
* When we lookup "." we still can be asked to lock it
* differently...
*/
- ltype = cnp->cn_lkflags & (LK_SHARED | LK_EXCLUSIVE);
- if (ltype == VOP_ISLOCKED(*vpp, td))
- return (-1);
- else if (ltype == LK_EXCLUSIVE)
- vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
+ ltype = cnp->cn_lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(*vpp, td)) {
+ if (ltype == LK_EXCLUSIVE) {
+ vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
+ if ((*vpp)->v_iflag & VI_DOOMED) {
+ /* forced unmount */
+ vrele(*vpp);
+ *vpp = NULL;
+ return (EBADF);
+ }
+ } else
+ vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY, td);
+ }
return (-1);
}
ltype = 0; /* silence gcc warning */
@@ -442,12 +452,14 @@ success:
error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, td);
if (cnp->cn_flags & ISDOTDOT)
vn_lock(dvp, ltype | LK_RETRY, td);
- if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_EXCLUSIVE))
- ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
if (error) {
*vpp = NULL;
goto retry;
}
+ if ((cnp->cn_flags & ISLASTCN) &&
+ (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
+ ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
+ }
return (-1);
}
@@ -663,9 +675,9 @@ vfs_cache_lookup(ap)
error = cache_lookup(dvp, vpp, cnp);
if (error == 0)
return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
- if (error == ENOENT)
- return (error);
- return (0);
+ if (error == -1)
+ return (0);
+ return (error);
}
Modified: stable/7/sys/kern/vfs_lookup.c
==============================================================================
--- stable/7/sys/kern/vfs_lookup.c Tue Jan 20 18:16:31 2009 (r187477)
+++ stable/7/sys/kern/vfs_lookup.c Tue Jan 20 19:01:59 2009 (r187478)
@@ -39,7 +39,6 @@ __FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include "opt_mac.h"
-#include "opt_vfs.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -88,13 +87,10 @@ nameiinit(void *dummy __unused)
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
-#ifdef LOOKUP_SHARED
-static int lookup_shared = 1;
-#else
static int lookup_shared = 0;
-#endif
SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
"Enables/Disables shared locks for path name translation");
+TUNABLE_INT("vfs.lookup_shared", &lookup_shared);
/*
* Convert a pathname into a pointer to a locked vnode.
Modified: stable/7/sys/nfsclient/nfs_vnops.c
==============================================================================
--- stable/7/sys/nfsclient/nfs_vnops.c Tue Jan 20 18:16:31 2009 (r187477)
+++ stable/7/sys/nfsclient/nfs_vnops.c Tue Jan 20 19:01:59 2009 (r187478)
@@ -868,7 +868,10 @@ nfs_lookup(struct vop_lookup_args *ap)
*vpp = NULLVP;
return (error);
}
- if ((error = cache_lookup(dvp, vpp, cnp)) && error != ENOENT) {
+ error = cache_lookup(dvp, vpp, cnp);
+ if (error > 0 && error != ENOENT)
+ return (error);
+ if (error == -1) {
struct vattr vattr;
newvp = *vpp;
Modified: stable/7/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- stable/7/sys/ufs/ffs/ffs_vfsops.c Tue Jan 20 18:16:31 2009 (r187477)
+++ stable/7/sys/ufs/ffs/ffs_vfsops.c Tue Jan 20 19:01:59 2009 (r187478)
@@ -852,7 +852,7 @@ ffs_mountfs(devvp, mp, td)
* Initialize filesystem stat information in mount struct.
*/
MNT_ILOCK(mp);
- mp->mnt_kern_flag |= MNTK_MPSAFE;
+ mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED;
MNT_IUNLOCK(mp);
#ifdef UFS_EXTATTR
#ifdef UFS_EXTATTR_AUTOSTART
Modified: stable/7/sys/ufs/ufs/dirhash.h
==============================================================================
--- stable/7/sys/ufs/ufs/dirhash.h Tue Jan 20 18:16:31 2009 (r187477)
+++ stable/7/sys/ufs/ufs/dirhash.h Tue Jan 20 19:01:59 2009 (r187478)
@@ -28,6 +28,9 @@
#ifndef _UFS_UFS_DIRHASH_H_
#define _UFS_UFS_DIRHASH_H_
+#include <sys/_lock.h>
+#include <sys/_sx.h>
+
/*
* For fast operations on large directories, we maintain a hash
* that maps the file name to the offset of the directory entry within
@@ -80,12 +83,14 @@
((dh)->dh_hash[(slot) >> DH_BLKOFFSHIFT][(slot) & DH_BLKOFFMASK])
struct dirhash {
- struct mtx dh_mtx; /* protects all fields except dh_list */
+ struct sx dh_lock; /* protects all fields except list & score */
+ int dh_refcount;
doff_t **dh_hash; /* the hash array (2-level) */
int dh_narrays; /* number of entries in dh_hash */
int dh_hlen; /* total slots in the 2-level hash array */
int dh_hused; /* entries in use */
+ int dh_memreq; /* Memory used. */
/* Free space statistics. XXX assumes DIRBLKSIZ is 512. */
u_int8_t *dh_blkfree; /* free DIRALIGN words in each dir block */
Modified: stable/7/sys/ufs/ufs/inode.h
==============================================================================
--- stable/7/sys/ufs/ufs/inode.h Tue Jan 20 18:16:31 2009 (r187477)
+++ stable/7/sys/ufs/ufs/inode.h Tue Jan 20 19:01:59 2009 (r187478)
@@ -82,8 +82,6 @@ struct inode {
doff_t i_endoff; /* End of useful stuff in directory. */
doff_t i_diroff; /* Offset in dir, where we found last entry. */
doff_t i_offset; /* Offset of free space in directory. */
- ino_t i_ino; /* Inode number of found directory. */
- u_int32_t i_reclen; /* Size of found directory entry. */
union {
struct dirhash *dirhash; /* Hashing for large directories. */
Modified: stable/7/sys/ufs/ufs/ufs_dirhash.c
==============================================================================
--- stable/7/sys/ufs/ufs/ufs_dirhash.c Tue Jan 20 18:16:31 2009 (r187477)
+++ stable/7/sys/ufs/ufs/ufs_dirhash.c Tue Jan 20 19:01:59 2009 (r187478)
@@ -46,7 +46,9 @@ __FBSDID("$FreeBSD$");
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
+#include <sys/refcount.h>
#include <sys/sysctl.h>
+#include <sys/sx.h>
#include <vm/uma.h>
#include <ufs/ufs/quota.h>
@@ -88,15 +90,16 @@ static int ufsdirhash_findslot(struct di
doff_t offset);
static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
static int ufsdirhash_recycle(int wanted);
+static void ufsdirhash_free_locked(struct inode *ip);
static uma_zone_t ufsdirhash_zone;
#define DIRHASHLIST_LOCK() mtx_lock(&ufsdirhash_mtx)
#define DIRHASHLIST_UNLOCK() mtx_unlock(&ufsdirhash_mtx)
-#define DIRHASH_LOCK(dh) mtx_lock(&(dh)->dh_mtx)
-#define DIRHASH_UNLOCK(dh) mtx_unlock(&(dh)->dh_mtx)
#define DIRHASH_BLKALLOC_WAITOK() uma_zalloc(ufsdirhash_zone, M_WAITOK)
#define DIRHASH_BLKFREE(ptr) uma_zfree(ufsdirhash_zone, (ptr))
+#define DIRHASH_ASSERT_LOCKED(dh) \
+ sx_assert(&(dh)->dh_lock, SA_LOCKED)
/* Dirhash list; recently-used entries are near the tail. */
static TAILQ_HEAD(, dirhash) ufsdirhash_list;
@@ -105,14 +108,199 @@ static TAILQ_HEAD(, dirhash) ufsdirhash_
static struct mtx ufsdirhash_mtx;
/*
- * Locking order:
- * ufsdirhash_mtx
- * dh_mtx
+ * Locking:
*
- * The dh_mtx mutex should be acquired either via the inode lock, or via
- * ufsdirhash_mtx. Only the owner of the inode may free the associated
- * dirhash, but anything can steal its memory and set dh_hash to NULL.
+ * The relationship between inode and dirhash is protected either by an
+ * exclusive vnode lock or the vnode interlock where a shared vnode lock
+ * may be used. The ufsdirhash_mtx is acquired after the dirhash lock. To
+ * handle teardown races, code wishing to lock the dirhash for an inode
+ * when using a shared vnode lock must obtain a private reference on the
+ * dirhash while holding the vnode interlock. They can drop it once they
+ * have obtained the dirhash lock and verified that the dirhash wasn't
+ * recycled while they waited for the dirhash lock.
+ *
+ * ufsdirhash_build() acquires a shared lock on the dirhash when it is
+ * successful. This lock is released after a call to ufsdirhash_lookup().
+ *
+ * Functions requiring exclusive access use ufsdirhash_acquire() which may
+ * free a dirhash structure that was recycled by ufsdirhash_recycle().
+ *
+ * The dirhash lock may be held across io operations.
+ */
+
+static void
+ufsdirhash_hold(struct dirhash *dh)
+{
+
+ refcount_acquire(&dh->dh_refcount);
+}
+
+static void
+ufsdirhash_drop(struct dirhash *dh)
+{
+
+ if (refcount_release(&dh->dh_refcount)) {
+ sx_destroy(&dh->dh_lock);
+ free(dh, M_DIRHASH);
+ }
+}
+
+/*
+ * Release the lock on a dirhash.
+ */
+static void
+ufsdirhash_release(struct dirhash *dh)
+{
+
+ sx_unlock(&dh->dh_lock);
+}
+
+/*
+ * Either acquire an existing hash locked shared or create a new hash and
+ * return it exclusively locked. May return NULL if the allocation fails.
+ *
+ * The vnode interlock is used to protect the i_dirhash pointer from
+ * simultaneous access while only a shared vnode lock is held.
+ */
+static struct dirhash *
+ufsdirhash_create(struct inode *ip)
+{
+ struct dirhash *ndh;
+ struct dirhash *dh;
+ struct vnode *vp;
+ int error;
+
+ error = 0;
+ ndh = dh = NULL;
+ vp = ip->i_vnode;
+ for (;;) {
+ /* Racy check for i_dirhash to prefetch a dirhash structure. */
+ if (ip->i_dirhash == NULL && ndh == NULL) {
+ MALLOC(ndh, struct dirhash *, sizeof *dh, M_DIRHASH,
+ M_NOWAIT | M_ZERO);
+ if (ndh == NULL)
+ return (NULL);
+ refcount_init(&ndh->dh_refcount, 1);
+ sx_init(&ndh->dh_lock, "dirhash");
+ sx_xlock(&ndh->dh_lock);
+ }
+ /*
+ * Check i_dirhash. If it's NULL just try to use a
+ * preallocated structure. If none exists loop and try again.
+ */
+ VI_LOCK(vp);
+ dh = ip->i_dirhash;
+ if (dh == NULL) {
+ ip->i_dirhash = ndh;
+ VI_UNLOCK(vp);
+ if (ndh == NULL)
+ continue;
+ return (ndh);
+ }
+ ufsdirhash_hold(dh);
+ VI_UNLOCK(vp);
+
+ /* Acquire a shared lock on existing hashes. */
+ sx_slock(&dh->dh_lock);
+
+ /* The hash could've been recycled while we were waiting. */
+ VI_LOCK(vp);
+ if (ip->i_dirhash != dh) {
+ VI_UNLOCK(vp);
+ ufsdirhash_release(dh);
+ ufsdirhash_drop(dh);
+ continue;
+ }
+ VI_UNLOCK(vp);
+ ufsdirhash_drop(dh);
+
+ /* If the hash is still valid we've succeeded. */
+ if (dh->dh_hash != NULL)
+ break;
+ /*
+ * If the hash is NULL it has been recycled. Try to upgrade
+ * so we can recreate it. If we fail the upgrade, drop our
+ * lock and try again.
+ */
+ if (sx_try_upgrade(&dh->dh_lock))
+ break;
+ sx_sunlock(&dh->dh_lock);
+ }
+ /* Free the preallocated structure if it was not necessary. */
+ if (ndh) {
+ ufsdirhash_release(ndh);
+ ufsdirhash_drop(ndh);
+ }
+ return (dh);
+}
+
+/*
+ * Acquire an exclusive lock on an existing hash. Requires an exclusive
+ * vnode lock to protect the i_dirhash pointer. Hashes that have been
+ * recycled are reclaimed here and NULL is returned.
+ */
+static struct dirhash *
+ufsdirhash_acquire(struct inode *ip)
+{
+ struct dirhash *dh;
+ struct vnode *vp;
+
+ ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__);
+
+ vp = ip->i_vnode;
+ dh = ip->i_dirhash;
+ if (dh == NULL)
+ return (NULL);
+ sx_xlock(&dh->dh_lock);
+ if (dh->dh_hash != NULL)
+ return (dh);
+ ufsdirhash_free_locked(ip);
+ return (NULL);
+}
+
+/*
+ * Acquire exclusively and free the hash pointed to by ip. Works with a
+ * shared or exclusive vnode lock.
*/
+void
+ufsdirhash_free(struct inode *ip)
+{
+ struct dirhash *dh;
+ struct vnode *vp;
+
+ vp = ip->i_vnode;
+ for (;;) {
+ /* Grab a reference on this inode's dirhash if it has one. */
+ VI_LOCK(vp);
+ dh = ip->i_dirhash;
+ if (dh == NULL) {
+ VI_UNLOCK(vp);
+ return;
+ }
+ ufsdirhash_hold(dh);
+ VI_UNLOCK(vp);
+
+ /* Exclusively lock the dirhash. */
+ sx_xlock(&dh->dh_lock);
+
+ /* If this dirhash still belongs to this inode, then free it. */
+ VI_LOCK(vp);
+ if (ip->i_dirhash == dh) {
+ VI_UNLOCK(vp);
+ ufsdirhash_drop(dh);
+ break;
+ }
+ VI_UNLOCK(vp);
+
+ /*
+ * This inode's dirhash has changed while we were
+ * waiting for the dirhash lock, so try again.
+ */
+ ufsdirhash_release(dh);
+ ufsdirhash_drop(dh);
+ }
+ ufsdirhash_free_locked(ip);
+}
/*
* Attempt to build up a hash table for the directory contents in
@@ -128,27 +316,23 @@ ufsdirhash_build(struct inode *ip)
doff_t bmask, pos;
int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
- /* Check if we can/should use dirhash. */
- if (ip->i_dirhash == NULL) {
- if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
+ /* Take care of a decreased sysctl value. */
+ while (ufs_dirhashmem > ufs_dirhashmaxmem)
+ if (ufsdirhash_recycle(0) != 0)
return (-1);
- } else {
- /* Hash exists, but sysctls could have changed. */
- if (ip->i_size < ufs_mindirhashsize ||
- ufs_dirhashmem > ufs_dirhashmaxmem) {
+
+ /* Check if we can/should use dirhash. */
+ if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) ||
+ ip->i_effnlink == 0) {
+ if (ip->i_dirhash)
ufsdirhash_free(ip);
- return (-1);
- }
- /* Check if hash exists and is intact (note: unlocked read). */
- if (ip->i_dirhash->dh_hash != NULL)
- return (0);
- /* Free the old, recycled hash and build a new one. */
- ufsdirhash_free(ip);
+ return (-1);
}
-
- /* Don't hash removed directories. */
- if (ip->i_effnlink == 0)
+ dh = ufsdirhash_create(ip);
+ if (dh == NULL)
return (-1);
+ if (dh->dh_hash != NULL)
+ return (0);
vp = ip->i_vnode;
/* Allocate 50% more entries than this dir size could ever need. */
@@ -159,7 +343,6 @@ ufsdirhash_build(struct inode *ip)
nslots = narrays * DH_NBLKOFF;
dirblocks = howmany(ip->i_size, DIRBLKSIZ);
nblocks = (dirblocks * 3 + 1) / 2;
-
memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
nblocks * sizeof(*dh->dh_blkfree);
@@ -167,33 +350,40 @@ ufsdirhash_build(struct inode *ip)
if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
DIRHASHLIST_UNLOCK();
if (memreqd > ufs_dirhashmaxmem / 2)
- return (-1);
-
+ goto fail;
/* Try to free some space. */
if (ufsdirhash_recycle(memreqd) != 0)
- return (-1);
+ goto fail;
/* Enough was freed, and list has been locked. */
}
ufs_dirhashmem += memreqd;
DIRHASHLIST_UNLOCK();
+ /* Initialise the hash table and block statistics. */
+ dh->dh_memreq = memreqd;
+ dh->dh_narrays = narrays;
+ dh->dh_hlen = nslots;
+ dh->dh_nblk = nblocks;
+ dh->dh_dirblks = dirblocks;
+ for (i = 0; i < DH_NFSTATS; i++)
+ dh->dh_firstfree[i] = -1;
+ dh->dh_firstfree[DH_NFSTATS] = 0;
+ dh->dh_hused = 0;
+ dh->dh_seqopt = 0;
+ dh->dh_seqoff = 0;
+ dh->dh_score = DH_SCOREINIT;
+
/*
* Use non-blocking mallocs so that we will revert to a linear
* lookup on failure rather than potentially blocking forever.
*/
- MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
- if (dh == NULL) {
- DIRHASHLIST_LOCK();
- ufs_dirhashmem -= memreqd;
- DIRHASHLIST_UNLOCK();
- return (-1);
- }
- mtx_init(&dh->dh_mtx, "dirhash", NULL, MTX_DEF);
MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
M_DIRHASH, M_NOWAIT | M_ZERO);
+ if (dh->dh_hash == NULL)
+ goto fail;
MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
M_DIRHASH, M_NOWAIT);
- if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
+ if (dh->dh_blkfree == NULL)
goto fail;
for (i = 0; i < narrays; i++) {
if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL)
@@ -201,22 +391,8 @@ ufsdirhash_build(struct inode *ip)
for (j = 0; j < DH_NBLKOFF; j++)
dh->dh_hash[i][j] = DIRHASH_EMPTY;
}
-
- /* Initialise the hash table and block statistics. */
- dh->dh_narrays = narrays;
- dh->dh_hlen = nslots;
- dh->dh_nblk = nblocks;
- dh->dh_dirblks = dirblocks;
for (i = 0; i < dirblocks; i++)
dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
- for (i = 0; i < DH_NFSTATS; i++)
- dh->dh_firstfree[i] = -1;
- dh->dh_firstfree[DH_NFSTATS] = 0;
- dh->dh_seqopt = 0;
- dh->dh_seqoff = 0;
- dh->dh_score = DH_SCOREINIT;
- ip->i_dirhash = dh;
-
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
pos = 0;
while (pos < ip->i_size) {
@@ -254,63 +430,70 @@ ufsdirhash_build(struct inode *ip)
TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 1;
DIRHASHLIST_UNLOCK();
+ sx_downgrade(&dh->dh_lock);
return (0);
fail:
- if (dh->dh_hash != NULL) {
- for (i = 0; i < narrays; i++)
- if (dh->dh_hash[i] != NULL)
- DIRHASH_BLKFREE(dh->dh_hash[i]);
- FREE(dh->dh_hash, M_DIRHASH);
- }
- if (dh->dh_blkfree != NULL)
- FREE(dh->dh_blkfree, M_DIRHASH);
- mtx_destroy(&dh->dh_mtx);
- FREE(dh, M_DIRHASH);
- ip->i_dirhash = NULL;
- DIRHASHLIST_LOCK();
- ufs_dirhashmem -= memreqd;
- DIRHASHLIST_UNLOCK();
+ ufsdirhash_free_locked(ip);
return (-1);
}
/*
* Free any hash table associated with inode 'ip'.
*/
-void
-ufsdirhash_free(struct inode *ip)
+static void
+ufsdirhash_free_locked(struct inode *ip)
{
struct dirhash *dh;
- int i, mem;
+ struct vnode *vp;
+ int i;
- if ((dh = ip->i_dirhash) == NULL)
- return;
+ DIRHASH_ASSERT_LOCKED(ip->i_dirhash);
+
+ /*
+ * Clear the pointer in the inode to prevent new threads from
+ * finding the dead structure.
+ */
+ vp = ip->i_vnode;
+ VI_LOCK(vp);
+ dh = ip->i_dirhash;
+ ip->i_dirhash = NULL;
+ VI_UNLOCK(vp);
+
+ /*
+ * Remove the hash from the list since we are going to free its
+ * memory.
+ */
DIRHASHLIST_LOCK();
- DIRHASH_LOCK(dh);
if (dh->dh_onlist)
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
- DIRHASH_UNLOCK(dh);
+ ufs_dirhashmem -= dh->dh_memreq;
DIRHASHLIST_UNLOCK();
- /* The dirhash pointed to by 'dh' is exclusively ours now. */
+ /*
+ * At this point, any waiters for the lock should hold their
+ * own reference on the dirhash structure. They will drop
+ * that reference once they grab the vnode interlock and see
+ * that ip->i_dirhash is NULL.
+ */
+ sx_xunlock(&dh->dh_lock);
- mem = sizeof(*dh);
+ /*
+ * Handle partially recycled as well as fully constructed hashes.
+ */
if (dh->dh_hash != NULL) {
for (i = 0; i < dh->dh_narrays; i++)
- DIRHASH_BLKFREE(dh->dh_hash[i]);
+ if (dh->dh_hash[i] != NULL)
+ DIRHASH_BLKFREE(dh->dh_hash[i]);
FREE(dh->dh_hash, M_DIRHASH);
- FREE(dh->dh_blkfree, M_DIRHASH);
- mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
- dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
- dh->dh_nblk * sizeof(*dh->dh_blkfree);
+ if (dh->dh_blkfree != NULL)
+ FREE(dh->dh_blkfree, M_DIRHASH);
}
- mtx_destroy(&dh->dh_mtx);
- FREE(dh, M_DIRHASH);
- ip->i_dirhash = NULL;
- DIRHASHLIST_LOCK();
- ufs_dirhashmem -= mem;
- DIRHASHLIST_UNLOCK();
+ /*
+ * Drop the inode's reference to the data structure.
+ */
+ ufsdirhash_drop(dh);
}
/*
@@ -323,6 +506,8 @@ ufsdirhash_free(struct inode *ip)
* prevoffp is non-NULL, the offset of the previous entry within
* the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
* is the first in a block, the start of the block is used).
+ *
+ * Must be called with the hash locked. Returns with the hash unlocked.
*/
int
ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
@@ -334,48 +519,36 @@ ufsdirhash_lookup(struct inode *ip, char
struct buf *bp;
doff_t blkoff, bmask, offset, prevoff;
int i, slot;
+ int error;
- if ((dh = ip->i_dirhash) == NULL)
- return (EJUSTRETURN);
+ dh = ip->i_dirhash;
+ KASSERT(dh != NULL && dh->dh_hash != NULL,
+ ("ufsdirhash_lookup: Invalid dirhash %p\n", dh));
+ DIRHASH_ASSERT_LOCKED(dh);
/*
* Move this dirhash towards the end of the list if it has a
- * score higher than the next entry, and acquire the dh_mtx.
- * Optimise the case where it's already the last by performing
- * an unlocked read of the TAILQ_NEXT pointer.
- *
- * In both cases, end up holding just dh_mtx.
+ * score higher than the next entry, and acquire the dh_lock.
*/
+ DIRHASHLIST_LOCK();
if (TAILQ_NEXT(dh, dh_list) != NULL) {
- DIRHASHLIST_LOCK();
- DIRHASH_LOCK(dh);
/*
* If the new score will be greater than that of the next
* entry, then move this entry past it. With both mutexes
* held, dh_next won't go away, but its dh_score could
* change; that's not important since it is just a hint.
*/
- if (dh->dh_hash != NULL &&
- (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
+ if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
dh->dh_score >= dh_next->dh_score) {
KASSERT(dh->dh_onlist, ("dirhash: not on list"));
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
dh_list);
}
- DIRHASHLIST_UNLOCK();
- } else {
- /* Already the last, though that could change as we wait. */
- DIRHASH_LOCK(dh);
- }
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
- return (EJUSTRETURN);
}
-
/* Update the score. */
if (dh->dh_score < DH_SCOREMAX)
dh->dh_score++;
+ DIRHASHLIST_UNLOCK();
vp = ip->i_vnode;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
@@ -410,23 +583,23 @@ restart:
slot = WRAPINCR(slot, dh->dh_hlen)) {
if (offset == DIRHASH_DEL)
continue;
- DIRHASH_UNLOCK(dh);
-
if (offset < 0 || offset >= ip->i_size)
panic("ufsdirhash_lookup: bad offset in hash array");
if ((offset & ~bmask) != blkoff) {
if (bp != NULL)
brelse(bp);
blkoff = offset & ~bmask;
- if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
- return (EJUSTRETURN);
+ if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) {
+ error = EJUSTRETURN;
+ goto fail;
+ }
}
dp = (struct direct *)(bp->b_data + (offset & bmask));
if (dp->d_reclen == 0 || dp->d_reclen >
DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
/* Corrupted directory. */
- brelse(bp);
- return (EJUSTRETURN);
+ error = EJUSTRETURN;
+ goto fail;
}
if (dp->d_namlen == namelen &&
bcmp(dp->d_name, name, namelen) == 0) {
@@ -436,8 +609,8 @@ restart:
prevoff = ufsdirhash_getprev(dp,
offset);
if (prevoff == -1) {
- brelse(bp);
- return (EJUSTRETURN);
+ error = EJUSTRETURN;
+ goto fail;
}
} else
prevoff = offset;
@@ -448,20 +621,12 @@ restart:
if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
dh->dh_seqopt = 1;
dh->dh_seqoff = offset + DIRSIZ(0, dp);
-
*bpp = bp;
*offp = offset;
+ ufsdirhash_release(dh);
return (0);
}
- DIRHASH_LOCK(dh);
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- if (bp != NULL)
- brelse(bp);
- ufsdirhash_free(ip);
- return (EJUSTRETURN);
- }
/*
* When the name doesn't match in the seqopt case, go back
* and search normally.
@@ -471,10 +636,12 @@ restart:
goto restart;
}
}
- DIRHASH_UNLOCK(dh);
+ error = ENOENT;
+fail:
+ ufsdirhash_release(dh);
if (bp != NULL)
brelse(bp);
- return (ENOENT);
+ return (error);
}
/*
@@ -502,29 +669,22 @@ ufsdirhash_findfree(struct inode *ip, in
doff_t pos, slotstart;
int dirblock, error, freebytes, i;
- if ((dh = ip->i_dirhash) == NULL)
- return (-1);
- DIRHASH_LOCK(dh);
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
- return (-1);
- }
+ dh = ip->i_dirhash;
+ KASSERT(dh != NULL && dh->dh_hash != NULL,
+ ("ufsdirhash_findfree: Invalid dirhash %p\n", dh));
+ DIRHASH_ASSERT_LOCKED(dh);
/* Find a directory block with the desired free space. */
dirblock = -1;
for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
if ((dirblock = dh->dh_firstfree[i]) != -1)
break;
- if (dirblock == -1) {
- DIRHASH_UNLOCK(dh);
+ if (dirblock == -1)
return (-1);
- }
KASSERT(dirblock < dh->dh_nblk &&
dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
("ufsdirhash_findfree: bad stats"));
- DIRHASH_UNLOCK(dh);
pos = dirblock * DIRBLKSIZ;
error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
if (error)
@@ -582,24 +742,18 @@ ufsdirhash_enduseful(struct inode *ip)
struct dirhash *dh;
int i;
- if ((dh = ip->i_dirhash) == NULL)
- return (-1);
- DIRHASH_LOCK(dh);
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
- return (-1);
- }
+ dh = ip->i_dirhash;
+ DIRHASH_ASSERT_LOCKED(dh);
+ KASSERT(dh != NULL && dh->dh_hash != NULL,
+ ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh));
- if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
- DIRHASH_UNLOCK(dh);
+ if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN)
return (-1);
- }
for (i = dh->dh_dirblks - 1; i >= 0; i--)
if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
break;
- DIRHASH_UNLOCK(dh);
+
return ((doff_t)(i + 1) * DIRBLKSIZ);
}
@@ -614,15 +768,9 @@ ufsdirhash_add(struct inode *ip, struct
struct dirhash *dh;
int slot;
- if ((dh = ip->i_dirhash) == NULL)
+ if ((dh = ufsdirhash_acquire(ip)) == NULL)
return;
- DIRHASH_LOCK(dh);
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
- return;
- }
-
+
KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_add: bad offset"));
/*
@@ -630,8 +778,7 @@ ufsdirhash_add(struct inode *ip, struct
* remove the hash entirely and let it be rebuilt later.
*/
if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
+ ufsdirhash_free_locked(ip);
return;
}
@@ -645,7 +792,7 @@ ufsdirhash_add(struct inode *ip, struct
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
- DIRHASH_UNLOCK(dh);
+ ufsdirhash_release(dh);
}
/*
@@ -659,14 +806,8 @@ ufsdirhash_remove(struct inode *ip, stru
struct dirhash *dh;
int slot;
- if ((dh = ip->i_dirhash) == NULL)
- return;
- DIRHASH_LOCK(dh);
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
+ if ((dh = ufsdirhash_acquire(ip)) == NULL)
return;
- }
KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_remove: bad offset"));
@@ -678,7 +819,7 @@ ufsdirhash_remove(struct inode *ip, stru
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
- DIRHASH_UNLOCK(dh);
+ ufsdirhash_release(dh);
}
/*
@@ -692,14 +833,8 @@ ufsdirhash_move(struct inode *ip, struct
struct dirhash *dh;
int slot;
- if ((dh = ip->i_dirhash) == NULL)
+ if ((dh = ufsdirhash_acquire(ip)) == NULL)
return;
- DIRHASH_LOCK(dh);
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
- return;
- }
KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
newoff < dh->dh_dirblks * DIRBLKSIZ,
@@ -707,7 +842,7 @@ ufsdirhash_move(struct inode *ip, struct
/* Find the entry, and update the offset. */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
DH_ENTRY(dh, slot) = newoff;
- DIRHASH_UNLOCK(dh);
+ ufsdirhash_release(dh);
}
/*
@@ -720,22 +855,15 @@ ufsdirhash_newblk(struct inode *ip, doff
struct dirhash *dh;
int block;
- if ((dh = ip->i_dirhash) == NULL)
- return;
- DIRHASH_LOCK(dh);
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
+ if ((dh = ufsdirhash_acquire(ip)) == NULL)
return;
- }
KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_newblk: bad offset"));
block = offset / DIRBLKSIZ;
if (block >= dh->dh_nblk) {
/* Out of space; must rebuild. */
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
+ ufsdirhash_free_locked(ip);
return;
}
dh->dh_dirblks = block + 1;
@@ -744,7 +872,7 @@ ufsdirhash_newblk(struct inode *ip, doff
dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
if (dh->dh_firstfree[DH_NFSTATS] == -1)
dh->dh_firstfree[DH_NFSTATS] = block;
- DIRHASH_UNLOCK(dh);
+ ufsdirhash_release(dh);
}
/*
@@ -756,14 +884,8 @@ ufsdirhash_dirtrunc(struct inode *ip, do
struct dirhash *dh;
int block, i;
- if ((dh = ip->i_dirhash) == NULL)
- return;
- DIRHASH_LOCK(dh);
- if (dh->dh_hash == NULL) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
+ if ((dh = ufsdirhash_acquire(ip)) == NULL)
return;
- }
KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
("ufsdirhash_dirtrunc: bad offset"));
@@ -775,8 +897,7 @@ ufsdirhash_dirtrunc(struct inode *ip, do
* if necessary.
*/
if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
- DIRHASH_UNLOCK(dh);
- ufsdirhash_free(ip);
+ ufsdirhash_free_locked(ip);
return;
}
@@ -794,7 +915,7 @@ ufsdirhash_dirtrunc(struct inode *ip, do
if (dh->dh_firstfree[i] >= block)
panic("ufsdirhash_dirtrunc: first free corrupt");
dh->dh_dirblks = block;
- DIRHASH_UNLOCK(dh);
+ ufsdirhash_release(dh);
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-stable-7
mailing list