svn commit: r292895 - in stable/10/sys: fs/nfs fs/nfsclient kern ufs/ffs

Kirk McKusick mckusick at FreeBSD.org
Wed Dec 30 00:04:35 UTC 2015


Author: mckusick
Date: Wed Dec 30 00:04:33 2015
New Revision: 292895
URL: https://svnweb.freebsd.org/changeset/base/292895

Log:
  MFC of 291244, 291380, 291459, 291460, 291671, and 291743:
  
    This MFC includes changes to better manage the vnode freelist
    and to streamline the allocation and freeing of vnodes.
  
    Note that, to maintain the KPI, the VI_AGE flag is left defined
    in sys/vnode.h even though its use is dropped as described in 291380.
    To maintain the KBI, the vfs.vlru_allow_cache_src sysctl variable
    remains, though it no longer has any effect, as described in 291244.
  
  MFC of 291244:
    Move the comment about resident pages preventing a vnode from leaving
    the active list into the header comment for vdrop(), which is the
    function that decides whether to leave the vnode on the list.  Note
    that dirty page write-out in vinactive() is asynchronous.
  
    Discussed with: alc
    Sponsored by:   The FreeBSD Foundation
  
  MFC of 291380:
    Remove the VI_AGE vnode iflag; it is unused.
  
    Noted by:       bde
    Sponsored by:   The FreeBSD Foundation
  
  MFC of 291459:
    For performance reasons, it is useful to pass a single string as the
    filesystem name in the first parameter of the getnewvnode() function.
    Most filesystems call getnewvnode() from just one place, so they can
    use a literal string as the first parameter. However, NFS calls
    getnewvnode() from two places, so we create a global constant string
    that can be shared by both call sites (a short sketch follows the
    attributions below). This change also collapses two instances of
    getnewvnode() in the UFS filesystem into a single call.
  
    Reviewed by: kib
    Tested by:   Peter Holm
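
    A minimal sketch of the resulting pattern, using the names this diff
    introduces (the getnewvnode() fragment is paraphrased from the
    vfs_subr.c hunk further down; it is illustrative, not authoritative):

      /* nfsport.h: one constant tag shared by every NFS call site. */
      extern const char nfs_vnode_tag[];

      /* nfs_clnode.c: the single definition of that tag. */
      const char nfs_vnode_tag[] = "newnfs";

      /* Both NFS call sites now pass the same pointer. */
      error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp);

      /*
       * In getnewvnode(), a constant per-filesystem tag lets the lock
       * rename (and the WITNESS re-registration it requires) be skipped
       * with a plain pointer compare on the common path:
       */
      lo = &vp->v_vnlock->lock_object;
      if (lo->lo_name != tag) {
              lo->lo_name = tag;
              WITNESS_DESTROY(lo);
              WITNESS_INIT(lo, tag);
      }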
  
  MFC of 291460:
    As the kernel allocates and frees vnodes, it fully initializes them
    on every allocation and fully releases them on every free.  These
    are not trivial costs: each allocation starts by zeroing a large
    structure and then initializes a mutex, a lock manager lock, an rw
    lock, four lists, and six pointers. Judging by vfs.vnodes_created,
    these operations are done millions of times an hour on a busy machine.
  
    As a performance optimization, this code update uses the uma_init
    and uma_fini hooks of the vnode zone to do these initializations
    and cleanups only as vnodes enter and leave vnode_zone (see the
    sketch below). With this change the initializations are done only
    about kern.maxvnodes times as the zone is populated and then only
    rarely again. The frees are done only if the vnode_zone shrinks,
    which never happens in practice. For those curious about the
    avoided work, look at the vnode_init() and vnode_fini() functions in
    kern/vfs_subr.c to see the code that has been removed from the main
    vnode allocation/free path.
  
    Reviewed by: kib
    Tested by:   Peter Holm
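
    In UMA terms the change amounts to the vntblinit() hunk below; a
    sketch with the hook semantics spelled out (a ctor/dtor pair would
    run on every uma_zalloc()/uma_zfree(), whereas init/fini run only
    when an item first enters or finally leaves the zone):

      vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode),
          NULL, NULL,               /* no per-allocation ctor/dtor */
          vnode_init, vnode_fini,   /* setup/teardown per zone item */
          UMA_ALIGN_PTR, 0);

      /*
       * vnode_init() now does the bzero plus the mutex, lockmgr lock,
       * bufobj rwlock, list, and rangelock initializations; vnode_fini()
       * undoes them.  getnewvnode() and vdropl() only reset the fields
       * that change between uses of a recycled vnode.
       */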
  
  MFC of 291671:
    We need to zero out the union of pointers in a freed vnode structure.
  
    Fix from:  Mateusz Guzik
    Tested by: Jason Unovitch
  
  MFC of 291743:
    We need to zero out the clustering variables in a freed vnode structure.
    For completeness add a VNASSERT that there are no threads waiting on a
    range lock (this was previously checked on every vnode free).
  
    Reported by: Rick Macklem
    Fix from:    Mateusz Guzik

Modified:
  stable/10/sys/fs/nfs/nfsport.h
  stable/10/sys/fs/nfsclient/nfs_clnode.c
  stable/10/sys/fs/nfsclient/nfs_clport.c
  stable/10/sys/kern/vfs_subr.c
  stable/10/sys/ufs/ffs/ffs_vfsops.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/fs/nfs/nfsport.h
==============================================================================
--- stable/10/sys/fs/nfs/nfsport.h	Tue Dec 29 23:25:26 2015	(r292894)
+++ stable/10/sys/fs/nfs/nfsport.h	Wed Dec 30 00:04:33 2015	(r292895)
@@ -981,6 +981,13 @@ struct nfsreq {
 #define	NFSVNO_DELEGOK(v)	(1)
 #endif
 
+/*
+ * Name used by getnewvnode() to describe filesystem, "newnfs".
+ * For performance reasons it is useful to have the same string
+ * used in both places that call getnewvnode().
+ */
+extern const char nfs_vnode_tag[];
+
 #endif	/* _KERNEL */
 
 #endif	/* _NFS_NFSPORT_H */

Modified: stable/10/sys/fs/nfsclient/nfs_clnode.c
==============================================================================
--- stable/10/sys/fs/nfsclient/nfs_clnode.c	Tue Dec 29 23:25:26 2015	(r292894)
+++ stable/10/sys/fs/nfsclient/nfs_clnode.c	Wed Dec 30 00:04:33 2015	(r292895)
@@ -66,6 +66,8 @@ MALLOC_DECLARE(M_NEWNFSREQ);
 
 uma_zone_t newnfsnode_zone;
 
+const char nfs_vnode_tag[] = "newnfs";
+
 static void	nfs_freesillyrename(void *arg, __unused int pending);
 
 void
@@ -124,7 +126,7 @@ ncl_nget(struct mount *mntp, u_int8_t *f
 	}
 	np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO);
 
-	error = getnewvnode("newnfs", mntp, &newnfs_vnodeops, &nvp);
+	error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp);
 	if (error) {
 		uma_zfree(newnfsnode_zone, np);
 		return (error);
@@ -332,4 +334,3 @@ ncl_invalcaches(struct vnode *vp)
 	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	mtx_unlock(&np->n_mtx);
 }
-

Modified: stable/10/sys/fs/nfsclient/nfs_clport.c
==============================================================================
--- stable/10/sys/fs/nfsclient/nfs_clport.c	Tue Dec 29 23:25:26 2015	(r292894)
+++ stable/10/sys/fs/nfsclient/nfs_clport.c	Wed Dec 30 00:04:33 2015	(r292895)
@@ -199,7 +199,7 @@ nfscl_nget(struct mount *mntp, struct vn
 	}
 	np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO);
 
-	error = getnewvnode("newnfs", mntp, &newnfs_vnodeops, &nvp);
+	error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp);
 	if (error) {
 		uma_zfree(newnfsnode_zone, np);
 		FREE((caddr_t)nfhp, M_NFSFH);

Modified: stable/10/sys/kern/vfs_subr.c
==============================================================================
--- stable/10/sys/kern/vfs_subr.c	Tue Dec 29 23:25:26 2015	(r292894)
+++ stable/10/sys/kern/vfs_subr.c	Wed Dec 30 00:04:33 2015	(r292895)
@@ -145,24 +145,51 @@ int vttoif_tab[10] = {
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
- * Free vnode target.  Free vnodes may simply be files which have been stat'd
- * but not read.  This is somewhat common, and a small cache of such files
- * should be kept to avoid recreation costs.
+ * "Free" vnode target.  Free vnodes are rarely completely free, but are
+ * just ones that are cheap to recycle.  Usually they are for files which
+ * have been stat'd but not read; these usually have inode and namecache
+ * data attached to them.  This target is the preferred minimum size of a
+ * sub-cache consisting mostly of such files. The system balances the size
+ * of this sub-cache with its complement to try to prevent either from
+ * thrashing while the other is relatively inactive.  The targets express
+ * a preference for the best balance.
+ *
+ * "Above" this target there are 2 further targets (watermarks) related
+ * to recycling of free vnodes.  In the best-operating case, the cache is
+ * exactly full, the free list has size between vlowat and vhiwat above the
+ * free target, and recycling from it and normal use maintains this state.
+ * Sometimes the free list is below vlowat or even empty, but this state
+ * is even better for immediate use provided the cache is not full.
+ * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
+ * ones) to reach one of these states.  The watermarks are currently hard-
+ * coded as 4% and 9% of the available space higher.  These and the default
+ * of 25% for wantfreevnodes are too large if the memory size is large.
+ * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
+ * whenever vnlru_proc() becomes active.
  */
 static u_long wantfreevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
-/* Number of vnodes in the free list. */
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
+    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
 static u_long freevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
-    "Number of vnodes in the free list");
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
+    &freevnodes, 0, "Number of \"free\" vnodes");
 
+/*
+ * The vfs.vlru_allow_cache_src sysctl variable is no longer used but
+ * the sysctl remains to provide ABI compatibility. The new code frees
+ * namecache sources as the last chance to satisfy the highest watermark,
+ * instead of selecting the source vnodes randomly. This provides good
+ * enough behaviour to keep vn_fullpath() working in most situations.
+ * The filesystem layout with deep trees, where the deprecated knob was
+ * required, is thus handled automatically.
+ */
 static int vlru_allow_cache_src;
 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
-    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
+    &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)");
 
 static u_long recycles_count;
 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
-    "Number of vnodes recycled to avoid exceding kern.maxvnodes");
+    "Number of vnodes recycled to meet vnode cache targets");
 
 /*
  * Various variables used for debugging the new implementation of
@@ -272,14 +299,13 @@ static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
-/*
- * Number of vnodes we want to exist at any one time.  This is mostly used
- * to size hash tables in vnode-related code.  It is normally not used in
- * getnewvnode(), as wantfreevnodes is normally nonzero.)
- *
- * XXX desiredvnodes is historical cruft and should not exist.
- */
+/* Target for maximum number of vnodes. */
 int desiredvnodes;
+static int gapvnodes;		/* gap between wanted and desired */
+static int vhiwat;		/* enough extras after expansion */
+static int vlowat;		/* minimal extras before expansion */
+static int vstir;		/* nonzero to stir non-free vnodes */
+static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
 
 static int
 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
@@ -290,6 +316,8 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL
 	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
 		return (error);
 	if (old_desiredvnodes != desiredvnodes) {
+		wantfreevnodes = desiredvnodes / 4;
+		/* XXX locking seems to be incomplete. */
 		vfs_hash_changesize(desiredvnodes);
 		cache_changesize(desiredvnodes);
 	}
@@ -298,9 +326,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL
 
 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
-    sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
+    sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
-    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
+    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
@@ -331,11 +359,71 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_tr
  *
  * Reevaluate the following cap on the number of vnodes after the physical
  * memory size exceeds 512GB.  In the limit, as the physical memory size
- * grows, the ratio of physical pages to vnodes approaches sixteen to one.
+ * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
  */
 #ifndef	MAXVNODES_MAX
-#define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
+#define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
 #endif
+
+/*
+ * Initialize a vnode as it first enters the zone.
+ */
+static int
+vnode_init(void *mem, int size, int flags)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+
+	vp = mem;
+	bzero(vp, size);
+	/*
+	 * Setup locks.
+	 */
+	vp->v_vnlock = &vp->v_lock;
+	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+	/*
+	 * By default, don't allow shared locks unless filesystems opt-in.
+	 */
+	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
+	    LK_NOSHARE | LK_IS_VNODE);
+	/*
+	 * Initialize bufobj.
+	 */
+	bo = &vp->v_bufobj;
+	bo->__bo_vnode = vp;
+	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
+	bo->bo_private = vp;
+	TAILQ_INIT(&bo->bo_clean.bv_hd);
+	TAILQ_INIT(&bo->bo_dirty.bv_hd);
+	/*
+	 * Initialize namecache.
+	 */
+	LIST_INIT(&vp->v_cache_src);
+	TAILQ_INIT(&vp->v_cache_dst);
+	/*
+	 * Initialize rangelocks.
+	 */
+	rangelock_init(&vp->v_rl);
+	return (0);
+}
+
+/*
+ * Free a vnode when it is cleared from the zone.
+ */
+static void
+vnode_fini(void *mem, int size)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+
+	vp = mem;
+	rangelock_destroy(&vp->v_rl);
+	lockdestroy(vp->v_vnlock);
+	mtx_destroy(&vp->v_interlock);
+	bo = &vp->v_bufobj;
+	rw_destroy(BO_LOCKPTR(bo));
+}
+
 static void
 vntblinit(void *dummy __unused)
 {
@@ -345,15 +433,16 @@ vntblinit(void *dummy __unused)
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
 	 * kernel's heap size.  Generally speaking, it scales with the
-	 * physical memory size.  The ratio of desiredvnodes to physical pages
-	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
-	 * marginal ratio of desiredvnodes to physical pages is one to
-	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
+	 * physical memory size.  The ratio of desiredvnodes to the physical
+	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
+	 * Thereafter, the
+	 * marginal ratio of desiredvnodes to the physical memory size is
+	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
 	 * size.  The memory required by desiredvnodes vnodes and vm objects
-	 * may not exceed one seventh of the kernel's heap size.
+	 * must not exceed 1/7th of the kernel's heap size.
 	 */
-	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
-	    cnt.v_page_count) / 16;
+	physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 +
+	    3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64;
 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode)));
 	desiredvnodes = min(physvnodes, virtvnodes);
@@ -368,7 +457,7 @@ vntblinit(void *dummy __unused)
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
@@ -742,35 +831,41 @@ vattr_null(struct vattr *vap)
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
 static int
-vlrureclaim(struct mount *mp)
+vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
 {
 	struct vnode *vp;
-	int done;
-	int trigger;
-	int usevnodes;
-	int count;
+	int count, done, target;
 
-	/*
-	 * Calculate the trigger point, don't allow user
-	 * screwups to blow us up.   This prevents us from
-	 * recycling vnodes with lots of resident pages.  We
-	 * aren't trying to free memory, we are trying to
-	 * free vnodes.
-	 */
-	usevnodes = desiredvnodes;
-	if (usevnodes <= 0)
-		usevnodes = 1;
-	trigger = cnt.v_page_count * 2 / usevnodes;
 	done = 0;
 	vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
-	count = mp->mnt_nvnodelistsize / 10 + 1;
-	while (count != 0) {
+	count = mp->mnt_nvnodelistsize;
+	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
+	target = target / 10 + 1;
+	while (count != 0 && done < target) {
 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		while (vp != NULL && vp->v_type == VMARKER)
 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
 		if (vp == NULL)
 			break;
+		/*
+		 * XXX LRU is completely broken for non-free vnodes.  First
+		 * by calling here in mountpoint order, then by moving
+		 * unselected vnodes to the end here, and most grossly by
+		 * removing the vlruvp() function that was supposed to
+		 * maintain the order.  (This function was born broken
+		 * since syncer problems prevented it doing anything.)  The
+		 * order is closer to LRC (C = Created).
+		 *
+		 * LRU reclaiming of vnodes seems to have last worked in
+		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
+		 * Then there was no hold count, and inactive vnodes were
+		 * simply put on the free list in LRU order.  The separate
+		 * lists also break LRU.  We prefer to reclaim from the
+		 * free list for technical reasons.  This tends to thrash
+		 * the free list to keep very unrecently used held vnodes.
+		 * The problem is mitigated by keeping the free list large.
+		 */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		--count;
@@ -779,10 +874,12 @@ vlrureclaim(struct mount *mp)
 		/*
 		 * If it's been deconstructed already, it's still
 		 * referenced, or it exceeds the trigger, skip it.
+		 * Also skip free vnodes.  We are trying to make space
+		 * to expand the free list, not reduce it.
 		 */
 		if (vp->v_usecount ||
-		    (!vlru_allow_cache_src &&
-			!LIST_EMPTY(&(vp)->v_cache_src)) ||
+		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
+		    ((vp->v_iflag & VI_FREE) != 0) ||
 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VI_UNLOCK(vp);
@@ -808,8 +905,8 @@ vlrureclaim(struct mount *mp)
 		 * vnode lock before our VOP_LOCK() call fails.
 		 */
 		if (vp->v_usecount ||
-		    (!vlru_allow_cache_src &&
-			!LIST_EMPTY(&(vp)->v_cache_src)) ||
+		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
+		    (vp->v_iflag & VI_FREE) != 0 ||
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp, LK_INTERLOCK);
@@ -842,7 +939,7 @@ relock_mnt:
 }
 
 /*
- * Attempt to keep the free list at wantfreevnodes length.
+ * Attempt to reduce the free list by the requested amount.
  */
 static void
 vnlru_free(int count)
@@ -899,6 +996,24 @@ vnlru_free(int count)
 		mtx_lock(&vnode_free_list_mtx);
 	}
 }
+
+/* XXX some names and initialization are bad for limits and watermarks. */
+static int
+vspace(void)
+{
+	int space;
+
+	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
+	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
+	vlowat = vhiwat / 2;
+	if (numvnodes > desiredvnodes)
+		return (0);
+	space = desiredvnodes - numvnodes;
+	if (freevnodes > wantfreevnodes)
+		space += freevnodes - wantfreevnodes;
+	return (space);
+}
+
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrurecycle() from the bowels of filesystem code has some
@@ -911,18 +1026,36 @@ static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
-	int done;
-	struct proc *p = vnlruproc;
+	unsigned long ofreevnodes, onumvnodes;
+	int done, force, reclaim_nc_src, trigger, usevnodes;
 
-	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
+	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
 	    SHUTDOWN_PRI_FIRST);
 
+	force = 0;
 	for (;;) {
-		kproc_suspend_check(p);
+		kproc_suspend_check(vnlruproc);
 		mtx_lock(&vnode_free_list_mtx);
-		if (freevnodes > wantfreevnodes)
-			vnlru_free(freevnodes - wantfreevnodes);
-		if (numvnodes <= desiredvnodes * 9 / 10) {
+		/*
+		 * If numvnodes is too large (due to desiredvnodes being
+		 * adjusted using its sysctl, or emergency growth), first
+		 * try to reduce it by discarding from the free list.
+		 */
+		if (numvnodes > desiredvnodes && freevnodes > 0)
+			vnlru_free(ulmin(numvnodes - desiredvnodes,
+			    freevnodes));
+		/*
+		 * Sleep if the vnode cache is in a good state.  This is
+		 * when it is not over-full and has space for about a 4%
+		 * or 9% expansion (by growing its size or inexcessively
+		 * reducing its free list).  Otherwise, try to reclaim
+		 * space for a 10% expansion.
+		 */
+		if (vstir && force == 0) {
+			force = 1;
+			vstir = 0;
+		}
+		if (vspace() >= vlowat && force == 0) {
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			msleep(vnlruproc, &vnode_free_list_mtx,
@@ -931,30 +1064,66 @@ vnlru_proc(void)
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
+		ofreevnodes = freevnodes;
+		onumvnodes = numvnodes;
+		/*
+		 * Calculate parameters for recycling.  These are the same
+		 * throughout the loop to give some semblance of fairness.
+		 * The trigger point is to avoid recycling vnodes with lots
+		 * of resident pages.  We aren't trying to free memory; we
+		 * are trying to recycle or at least free vnodes.
+		 */
+		if (numvnodes <= desiredvnodes)
+			usevnodes = numvnodes - freevnodes;
+		else
+			usevnodes = numvnodes;
+		if (usevnodes <= 0)
+			usevnodes = 1;
+		/*
+		 * The trigger value is chosen to give a conservatively
+		 * large value to ensure that it alone doesn't prevent
+		 * making progress.  The value can easily be so large that
+		 * it is effectively infinite in some congested and
+		 * misconfigured cases, and this is necessary.  Normally
+		 * it is about 8 to 100 (pages), which is quite large.
+		 */
+		trigger = cnt.v_page_count * 2 / usevnodes;
+		if (force < 2)
+			trigger = vsmalltrigger;
+		reclaim_nc_src = force >= 3;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
-			done += vlrureclaim(mp);
+			done += vlrureclaim(mp, reclaim_nc_src, trigger);
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp);
 		}
 		mtx_unlock(&mountlist_mtx);
+		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
+			uma_reclaim();
 		if (done == 0) {
-#if 0
-			/* These messages are temporary debugging aids */
-			if (vnlru_nowhere < 5)
-				printf("vnlru process getting nowhere..\n");
-			else if (vnlru_nowhere == 5)
-				printf("vnlru process messages stopped.\n");
-#endif
+			if (force == 0 || force == 1) {
+				force = 2;
+				continue;
+			}
+			if (force == 2) {
+				force = 3;
+				continue;
+			}
+			force = 0;
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else
 			kern_yield(PRI_USER);
+		/*
+		 * After becoming active to expand above low water, keep
+		 * active until above high water.
+		 */
+		force = vspace() < vhiwat;
 	}
 }
 
@@ -1028,22 +1197,31 @@ vtryrecycle(struct vnode *vp)
 	return (0);
 }
 
+static void
+vcheckspace(void)
+{
+
+	if (vspace() < vlowat && vnlruproc_sig == 0) {
+		vnlruproc_sig = 1;
+		wakeup(vnlruproc);
+	}
+}
+
 /*
- * Wait for available vnodes.
+ * Wait if necessary for space for a new vnode.
  */
 static int
 getnewvnode_wait(int suspended)
 {
 
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
-	if (numvnodes > desiredvnodes) {
+	if (numvnodes >= desiredvnodes) {
 		if (suspended) {
 			/*
-			 * File system is beeing suspended, we cannot risk a
-			 * deadlock here, so allocate new vnode anyway.
+			 * The file system is being suspended.  We cannot
+			 * risk a deadlock here, so allow allocation of
+			 * another vnode even if this would give too many.
 			 */
-			if (freevnodes > wantfreevnodes)
-				vnlru_free(freevnodes - wantfreevnodes);
 			return (0);
 		}
 		if (vnlruproc_sig == 0) {
@@ -1053,18 +1231,34 @@ getnewvnode_wait(int suspended)
 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 		    "vlruwk", hz);
 	}
-	return (numvnodes > desiredvnodes ? ENFILE : 0);
+	/* Post-adjust like the pre-adjust in getnewvnode(). */
+	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
+		vnlru_free(1);
+	return (numvnodes >= desiredvnodes ? ENFILE : 0);
 }
 
+/*
+ * This hack is fragile, and probably not needed any more now that the
+ * watermark handling works.
+ */
 void
 getnewvnode_reserve(u_int count)
 {
 	struct thread *td;
 
+	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
+	/* XXX no longer so quick, but this part is not racy. */
+	mtx_lock(&vnode_free_list_mtx);
+	if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
+		vnlru_free(ulmin(numvnodes + count - desiredvnodes,
+		    freevnodes - wantfreevnodes));
+	mtx_unlock(&vnode_free_list_mtx);
+
 	td = curthread;
 	/* First try to be quick and racy. */
 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
 		td->td_vp_reserv += count;
+		vcheckspace();	/* XXX no longer so quick, but more racy */
 		return;
 	} else
 		atomic_subtract_long(&numvnodes, count);
@@ -1077,9 +1271,18 @@ getnewvnode_reserve(u_int count)
 			atomic_add_long(&numvnodes, 1);
 		}
 	}
+	vcheckspace();
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
+/*
+ * This hack is fragile, especially if desiredvnodes or wantvnodes are
+ * misconfigured or changed significantly.  Reducing desiredvnodes below
+ * the reserved amount should cause bizarre behaviour like reducing it
+ * below the number of active vnodes -- the system will try to reduce
+ * numvnodes to match, but should fail, so the subtraction below should
+ * not overflow.
+ */
 void
 getnewvnode_drop_reserve(void)
 {
@@ -1098,8 +1301,9 @@ getnewvnode(const char *tag, struct moun
     struct vnode **vpp)
 {
 	struct vnode *vp;
-	struct bufobj *bo;
 	struct thread *td;
+	struct lock_object *lo;
+	static int cyclecount;
 	int error;
 
 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
@@ -1110,57 +1314,77 @@ getnewvnode(const char *tag, struct moun
 		goto alloc;
 	}
 	mtx_lock(&vnode_free_list_mtx);
-	/*
-	 * Lend our context to reclaim vnodes if they've exceeded the max.
-	 */
-	if (freevnodes > wantfreevnodes)
+	if (numvnodes < desiredvnodes)
+		cyclecount = 0;
+	else if (cyclecount++ >= freevnodes) {
+		cyclecount = 0;
+		vstir = 1;
+	}
+	/*
+	 * Grow the vnode cache if it will not be above its target max
+	 * after growing.  Otherwise, if the free list is nonempty, try
+	 * to reclaim 1 item from it before growing the cache (possibly
+	 * above its target max if the reclamation failed or is delayed).
+	 * Otherwise, wait for some space.  In all cases, schedule
+	 * vnlru_proc() if we are getting short of space.  The watermarks
+	 * should be chosen so that we never wait or even reclaim from
+	 * the free list to below its target minimum.
+	 */
+	if (numvnodes + 1 <= desiredvnodes)
+		;
+	else if (freevnodes > 0)
 		vnlru_free(1);
-	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
-	    MNTK_SUSPEND));
+	else {
+		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+		    MNTK_SUSPEND));
 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
-	if (error != 0) {
-		mtx_unlock(&vnode_free_list_mtx);
-		return (error);
-	}
+		if (error != 0) {
+			mtx_unlock(&vnode_free_list_mtx);
+			return (error);
+		}
 #endif
+	}
+	vcheckspace();
 	atomic_add_long(&numvnodes, 1);
 	mtx_unlock(&vnode_free_list_mtx);
 alloc:
 	atomic_add_long(&vnodes_created, 1);
-	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
-	/*
-	 * Setup locks.
-	 */
-	vp->v_vnlock = &vp->v_lock;
-	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
-	/*
-	 * By default, don't allow shared locks unless filesystems
-	 * opt-in.
-	 */
-	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
+	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
 	/*
-	 * Initialize bufobj.
+	 * Locks are given the generic name "vnode" when created.
+	 * Follow the historic practice of using the filesystem
+	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
+	 *
+	 * Locks live in a witness group keyed on their name. Thus,
+	 * when a lock is renamed, it must also move from the witness
+	 * group of its old name to the witness group of its new name.
+	 *
+	 * The change only needs to be made when the vnode moves
+	 * from one filesystem type to another. We ensure that each
+	 * filesystem uses a single static name pointer for its tag so
+	 * that we can compare pointers rather than doing a strcmp().
 	 */
-	bo = &vp->v_bufobj;
-	bo->__bo_vnode = vp;
-	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
-	bo->bo_ops = &buf_ops_bio;
-	bo->bo_private = vp;
-	TAILQ_INIT(&bo->bo_clean.bv_hd);
-	TAILQ_INIT(&bo->bo_dirty.bv_hd);
+	lo = &vp->v_vnlock->lock_object;
+	if (lo->lo_name != tag) {
+		lo->lo_name = tag;
+		WITNESS_DESTROY(lo);
+		WITNESS_INIT(lo, tag);
+	}
 	/*
-	 * Initialize namecache.
+	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
-	LIST_INIT(&vp->v_cache_src);
-	TAILQ_INIT(&vp->v_cache_dst);
+	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 	/*
 	 * Finalize various vnode identity bits.
 	 */
+	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
+	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
+	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	v_incr_usecount(vp);
-	vp->v_data = NULL;
+	vp->v_bufobj.bo_ops = &buf_ops_bio;
 #ifdef MAC
 	mac_vnode_init(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
@@ -1169,11 +1393,10 @@ alloc:
 		printf("NULL mp in getnewvnode()\n");
 #endif
 	if (mp != NULL) {
-		bo->bo_bsize = mp->mnt_stat.f_iosize;
+		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
-	rangelock_init(&vp->v_rl);
 
 	/*
 	 * For the filesystems which do not use vfs_hash_insert(),
@@ -2399,7 +2622,7 @@ vholdl(struct vnode *vp)
 	mtx_lock(&vnode_free_list_mtx);
 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
 	freevnodes--;
-	vp->v_iflag &= ~(VI_FREE|VI_AGE);
+	vp->v_iflag &= ~VI_FREE;
 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 	    ("Activating already active vnode"));
 	vp->v_iflag |= VI_ACTIVE;
@@ -2467,15 +2690,9 @@ vdropl(struct vnode *vp)
 				    v_actfreelist);
 				mp->mnt_activevnodelistsize--;
 			}
-			if (vp->v_iflag & VI_AGE) {
-				TAILQ_INSERT_HEAD(&vnode_free_list, vp,
-				    v_actfreelist);
-			} else {
-				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
-				    v_actfreelist);
-			}
+			TAILQ_INSERT_TAIL(&vnode_free_list, vp,
+			    v_actfreelist);
 			freevnodes++;
-			vp->v_iflag &= ~VI_AGE;
 			vp->v_iflag |= VI_FREE;
 			mtx_unlock(&vnode_free_list_mtx);
 		} else {
@@ -2486,6 +2703,12 @@ vdropl(struct vnode *vp)
 	}
 	/*
 	 * The vnode has been marked for destruction, so free it.
+	 *
+	 * The vnode will be returned to the zone where it will
+	 * normally remain until it is needed for another vnode. We
+	 * need to cleanup (or verify that the cleanup has already
+	 * been done) any residual data left from its current use
+	 * so as not to contaminate the freshly allocated vnode.
 	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	atomic_subtract_long(&numvnodes, 1);
@@ -2506,20 +2729,25 @@ vdropl(struct vnode *vp)
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
+	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
+	    ("Dangling rangelock waiters"));
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_vnode_destroy(vp);
 #endif
-	if (vp->v_pollinfo != NULL)
+	if (vp->v_pollinfo != NULL) {
 		destroy_vpollinfo(vp->v_pollinfo);
+		vp->v_pollinfo = NULL;
+	}
 #ifdef INVARIANTS
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
-	rangelock_destroy(&vp->v_rl);
-	lockdestroy(vp->v_vnlock);
-	mtx_destroy(&vp->v_interlock);
-	rw_destroy(BO_LOCKPTR(bo));
+	bzero(&vp->v_un, sizeof(vp->v_un));
+	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+	vp->v_iflag = 0;
+	vp->v_vflag = 0;
+	bo->bo_flag = 0;
 	uma_zfree(vnode_zone, vp);
 }
 
@@ -2884,6 +3112,7 @@ vgonel(struct vnode *vp)
 	 * Clear the advisory locks and wake up waiting threads.
 	 */
 	(void)VOP_ADVLOCKPURGE(vp);
+	vp->v_lockf = NULL;
 	/*
 	 * Delete from old mount point vnode list.
 	 */
@@ -2986,8 +3215,6 @@ vn_printf(struct vnode *vp, const char *
 	}
 	if (vp->v_iflag & VI_MOUNT)
 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
-	if (vp->v_iflag & VI_AGE)
-		strlcat(buf, "|VI_AGE", sizeof(buf));
 	if (vp->v_iflag & VI_DOOMED)
 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
 	if (vp->v_iflag & VI_FREE)
@@ -2998,7 +3225,7 @@ vn_printf(struct vnode *vp, const char *
 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 	if (vp->v_iflag & VI_OWEINACT)
 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
-	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
+	flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE |
 	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);

Modified: stable/10/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- stable/10/sys/ufs/ffs/ffs_vfsops.c	Tue Dec 29 23:25:26 2015	(r292894)
+++ stable/10/sys/ufs/ffs/ffs_vfsops.c	Wed Dec 30 00:04:33 2015	(r292895)
@@ -1670,10 +1670,8 @@ ffs_vgetf(mp, ino, flags, vpp, ffs_flags
 	ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO);
 
 	/* Allocate a new vnode/inode. */
-	if (fs->fs_magic == FS_UFS1_MAGIC)
-		error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp);
-	else
-		error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp);
+	error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ?
+	    &ffs_vnodeops1 : &ffs_vnodeops2, &vp);
 	if (error) {
 		*vpp = NULL;
 		uma_zfree(uma_inode, ip);

