svn commit: r292909 - in user/ngie/stable-10-libnv: gnu/usr.bin/binutils/ld sys/dev/virtio/balloon sys/dev/xen/balloon sys/fs/nfs sys/fs/nfsclient sys/kern sys/sys sys/ufs/ffs

Garrett Cooper ngie at FreeBSD.org
Wed Dec 30 08:55:16 UTC 2015


Author: ngie
Date: Wed Dec 30 08:55:13 2015
New Revision: 292909
URL: https://svnweb.freebsd.org/changeset/base/292909

Log:
  MFstable/10 @ r292908

Modified:
  user/ngie/stable-10-libnv/gnu/usr.bin/binutils/ld/Makefile
  user/ngie/stable-10-libnv/sys/dev/virtio/balloon/virtio_balloon.c
  user/ngie/stable-10-libnv/sys/dev/xen/balloon/balloon.c
  user/ngie/stable-10-libnv/sys/fs/nfs/nfsport.h
  user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clnode.c
  user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clport.c
  user/ngie/stable-10-libnv/sys/kern/vfs_subr.c
  user/ngie/stable-10-libnv/sys/sys/param.h
  user/ngie/stable-10-libnv/sys/ufs/ffs/ffs_vfsops.c
Directory Properties:
  user/ngie/stable-10-libnv/   (props changed)

Modified: user/ngie/stable-10-libnv/gnu/usr.bin/binutils/ld/Makefile
==============================================================================
--- user/ngie/stable-10-libnv/gnu/usr.bin/binutils/ld/Makefile	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/gnu/usr.bin/binutils/ld/Makefile	Wed Dec 30 08:55:13 2015	(r292909)
@@ -52,6 +52,7 @@ CLEANFILES+=	ldemul-list.h stringify.sed
 
 FILES=		${LDSCRIPTS:S|^|ldscripts/|}
 FILESDIR=	${SCRIPTDIR}
+LINKS=		${BINDIR}/ld ${BINDIR}/ld.bfd
 
 HOST=		${TARGET_TUPLE}
 LIBSEARCHPATH=	\"=/lib\":\"=/usr/lib\"
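
In the BSD make infrastructure, LINKS is consumed at install time as
pairs of pathnames, with the second name of each pair created as a hard
link to the first; the added line therefore installs ld.bfd as a hard
link to ld.  This follows the binutils convention of making the BFD
linker reachable under the explicit name "ld.bfd" (as distinct from,
e.g., ld.gold).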

Modified: user/ngie/stable-10-libnv/sys/dev/virtio/balloon/virtio_balloon.c
==============================================================================
--- user/ngie/stable-10-libnv/sys/dev/virtio/balloon/virtio_balloon.c	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/sys/dev/virtio/balloon/virtio_balloon.c	Wed Dec 30 08:55:13 2015	(r292909)
@@ -438,8 +438,7 @@ vtballoon_alloc_page(struct vtballoon_so
 {
 	vm_page_t m;
 
-	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
-	    VM_ALLOC_NOOBJ);
+	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
 	if (m != NULL)
 		sc->vtballoon_current_npages++;
 
@@ -450,7 +449,6 @@ static void
 vtballoon_free_page(struct vtballoon_softc *sc, vm_page_t m)
 {
 
-	vm_page_unwire(m, 0);
 	vm_page_free(m);
 	sc->vtballoon_current_npages--;
 }
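
The two hunks above pair up: ballooned pages live only on the driver's
internal queue and are never mapped, so wiring them bought nothing.
With VM_ALLOC_WIRED dropped from the allocation, the matching
vm_page_unwire() call before vm_page_free() goes too.  A minimal sketch
of the resulting pattern, assuming the stable/10 vm_page(9) API (the
softc bookkeeping from the driver is elided):

	vm_page_t m;

	/* Inflate: take an unmanaged page with no backing VM object. */
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);

	/* Deflate: hand it straight back; no vm_page_unwire() needed. */
	if (m != NULL)
		vm_page_free(m);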

Modified: user/ngie/stable-10-libnv/sys/dev/xen/balloon/balloon.c
==============================================================================
--- user/ngie/stable-10-libnv/sys/dev/xen/balloon/balloon.c	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/sys/dev/xen/balloon/balloon.c	Wed Dec 30 08:55:13 2015	(r292909)
@@ -94,13 +94,8 @@ SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO,
 SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD,
     &bs.balloon_high, 0, "High-mem balloon");
 
-struct balloon_entry {
-	vm_page_t page;
-	STAILQ_ENTRY(balloon_entry) list;
-};
-
 /* List of ballooned pages, threaded through the mem_map array. */
-static STAILQ_HEAD(,balloon_entry) ballooned_pages;
+static TAILQ_HEAD(,vm_page) ballooned_pages;
 
 /* Main work function, always executed in process context. */
 static void balloon_process(void *unused);
@@ -110,47 +105,6 @@ static void balloon_process(void *unused
 #define WPRINTK(fmt, args...) \
 	printk(KERN_WARNING "xen_mem: " fmt, ##args)
 
-/* balloon_append: add the given page to the balloon. */
-static int
-balloon_append(vm_page_t page)
-{
-	struct balloon_entry *entry;
-
-	mtx_assert(&balloon_mutex, MA_OWNED);
-
-	entry = malloc(sizeof(struct balloon_entry), M_BALLOON, M_NOWAIT);
-	if (!entry)
-		return (ENOMEM);
-	entry->page = page;
-	STAILQ_INSERT_HEAD(&ballooned_pages, entry, list);
-	bs.balloon_low++;
-
-	return (0);
-}
-
-/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
-static vm_page_t
-balloon_retrieve(void)
-{
-	vm_page_t page;
-	struct balloon_entry *entry;
-
-	mtx_assert(&balloon_mutex, MA_OWNED);
-
-	if (STAILQ_EMPTY(&ballooned_pages))
-		return (NULL);
-
-	entry = STAILQ_FIRST(&ballooned_pages);
-	STAILQ_REMOVE_HEAD(&ballooned_pages, list);
-
-	page = entry->page;
-	free(entry, M_BALLOON);
-	
-	bs.balloon_low--;
-
-	return (page);
-}
-
 static unsigned long 
 current_target(void)
 {
@@ -203,7 +157,6 @@ static int 
 increase_reservation(unsigned long nr_pages)
 {
 	unsigned long  pfn, i;
-	struct balloon_entry *entry;
 	vm_page_t      page;
 	long           rc;
 	struct xen_memory_reservation reservation = {
@@ -217,10 +170,9 @@ increase_reservation(unsigned long nr_pa
 	if (nr_pages > nitems(frame_list))
 		nr_pages = nitems(frame_list);
 
-	for (entry = STAILQ_FIRST(&ballooned_pages), i = 0;
-	     i < nr_pages; i++, entry = STAILQ_NEXT(entry, list)) {
-		KASSERT(entry, ("ballooned_pages list corrupt"));
-		page = entry->page;
+	for (page = TAILQ_FIRST(&ballooned_pages), i = 0;
+	    i < nr_pages; i++, page = TAILQ_NEXT(page, plinks.q)) {
+		KASSERT(page != NULL, ("ballooned_pages list corrupt"));
 		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
 	}
 
@@ -245,8 +197,10 @@ increase_reservation(unsigned long nr_pa
 	}
 
 	for (i = 0; i < nr_pages; i++) {
-		page = balloon_retrieve();
-		KASSERT(page, ("balloon_retrieve failed"));
+		page = TAILQ_FIRST(&ballooned_pages);
+		KASSERT(page != NULL, ("Unable to get ballooned page"));
+		TAILQ_REMOVE(&ballooned_pages, page, plinks.q);
+		bs.balloon_low--;
 
 		pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
 		KASSERT((xen_feature(XENFEAT_auto_translated_physmap) ||
@@ -255,7 +209,6 @@ increase_reservation(unsigned long nr_pa
 
 		set_phys_to_machine(pfn, frame_list[i]);
 
-		vm_page_unwire(page, 0);
 		vm_page_free(page);
 	}
 
@@ -286,24 +239,29 @@ decrease_reservation(unsigned long nr_pa
 	for (i = 0; i < nr_pages; i++) {
 		if ((page = vm_page_alloc(NULL, 0, 
 			    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 
-			    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
+			    VM_ALLOC_ZERO)) == NULL) {
 			nr_pages = i;
 			need_sleep = 1;
 			break;
 		}
 
+		if ((page->flags & PG_ZERO) == 0) {
+			/*
+			 * Zero the page, or else we might be leaking
+			 * important data to other domains on the same
+			 * host. Xen doesn't scrub ballooned out memory
+			 * pages, the guest is in charge of making
+			 * sure that no information is leaked.
+			 */
+			pmap_zero_page(page);
+		}
+
 		pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
 		frame_list[i] = PFNTOMFN(pfn);
 
 		set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
-		if (balloon_append(page) != 0) {
-			vm_page_unwire(page, 0);
-			vm_page_free(page);
-
-			nr_pages = i;
-			need_sleep = 1;
-			break;
-		}
+		TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q);
+		bs.balloon_low++;
 	}
 
 	set_xen_guest_handle(reservation.extent_start, frame_list);
@@ -438,7 +396,8 @@ balloon_init(void *arg)
 	/* Initialise the balloon with excess memory space. */
 	for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
 		page = PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT);
-		balloon_append(page);
+		TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q);
+		bs.balloon_low++;
 	}
 #undef max_pfn
 #endif
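
The heart of this change is replacing the malloc'ed struct
balloon_entry wrappers with an intrusive list: struct vm_page already
carries queue linkage in its plinks.q member, so ballooned pages can be
threaded onto a TAILQ directly.  That removes one allocation (and its
ENOMEM failure path) per ballooned page, which is why balloon_append()
and balloon_retrieve() collapse into bare queue operations.  A sketch
of the pattern, using the list and member names from the diff (the
surrounding code is illustrative only):

	static TAILQ_HEAD(, vm_page) ballooned_pages =
	    TAILQ_HEAD_INITIALIZER(ballooned_pages);

	/* Enqueue: no allocation, so this cannot fail. */
	TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q);

	/* Dequeue the most recently ballooned page, if any. */
	page = TAILQ_FIRST(&ballooned_pages);
	if (page != NULL)
		TAILQ_REMOVE(&ballooned_pages, page, plinks.q);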

Modified: user/ngie/stable-10-libnv/sys/fs/nfs/nfsport.h
==============================================================================
--- user/ngie/stable-10-libnv/sys/fs/nfs/nfsport.h	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/sys/fs/nfs/nfsport.h	Wed Dec 30 08:55:13 2015	(r292909)
@@ -981,6 +981,13 @@ struct nfsreq {
 #define	NFSVNO_DELEGOK(v)	(1)
 #endif
 
+/*
+ * Name used by getnewvnode() to describe filesystem, "newnfs".
+ * For performance reasons it is useful to have the same string
+ * used in both places that call getnewvnode().
+ */
+extern const char nfs_vnode_tag[];
+
 #endif	/* _KERNEL */
 
 #endif	/* _NFS_NFSPORT_H */

Modified: user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clnode.c
==============================================================================
--- user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clnode.c	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clnode.c	Wed Dec 30 08:55:13 2015	(r292909)
@@ -66,6 +66,8 @@ MALLOC_DECLARE(M_NEWNFSREQ);
 
 uma_zone_t newnfsnode_zone;
 
+const char nfs_vnode_tag[] = "newnfs";
+
 static void	nfs_freesillyrename(void *arg, __unused int pending);
 
 void
@@ -124,7 +126,7 @@ ncl_nget(struct mount *mntp, u_int8_t *f
 	}
 	np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO);
 
-	error = getnewvnode("newnfs", mntp, &newnfs_vnodeops, &nvp);
+	error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp);
 	if (error) {
 		uma_zfree(newnfsnode_zone, np);
 		return (error);
@@ -332,4 +334,3 @@ ncl_invalcaches(struct vnode *vp)
 	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	mtx_unlock(&np->n_mtx);
 }
-

Modified: user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clport.c
==============================================================================
--- user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clport.c	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clport.c	Wed Dec 30 08:55:13 2015	(r292909)
@@ -199,7 +199,7 @@ nfscl_nget(struct mount *mntp, struct vn
 	}
 	np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO);
 
-	error = getnewvnode("newnfs", mntp, &newnfs_vnodeops, &nvp);
+	error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp);
 	if (error) {
 		uma_zfree(newnfsnode_zone, np);
 		FREE((caddr_t)nfhp, M_NFSFH);
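
The three NFS hunks share one purpose: both getnewvnode() call sites
now pass the same nfs_vnode_tag pointer instead of separate "newnfs"
string literals.  As the vfs_subr.c part of this merge explains,
getnewvnode() renames a recycled vnode's lock (and migrates it between
witness(4) groups) only when the tag changes, and it detects the change
by comparing pointers rather than calling strcmp(), which is why a
single static tag per filesystem matters.  The comparison this enables,
quoted from the vfs_subr.c hunk further down:

	lo = &vp->v_vnlock->lock_object;
	if (lo->lo_name != tag) {	/* pointer compare, not strcmp() */
		lo->lo_name = tag;
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}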

Modified: user/ngie/stable-10-libnv/sys/kern/vfs_subr.c
==============================================================================
--- user/ngie/stable-10-libnv/sys/kern/vfs_subr.c	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/sys/kern/vfs_subr.c	Wed Dec 30 08:55:13 2015	(r292909)
@@ -145,24 +145,51 @@ int vttoif_tab[10] = {
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
- * Free vnode target.  Free vnodes may simply be files which have been stat'd
- * but not read.  This is somewhat common, and a small cache of such files
- * should be kept to avoid recreation costs.
+ * "Free" vnode target.  Free vnodes are rarely completely free, but are
+ * just ones that are cheap to recycle.  Usually they are for files which
+ * have been stat'd but not read; these usually have inode and namecache
+ * data attached to them.  This target is the preferred minimum size of a
+ * sub-cache consisting mostly of such files. The system balances the size
+ * of this sub-cache with its complement to try to prevent either from
+ * thrashing while the other is relatively inactive.  The targets express
+ * a preference for the best balance.
+ *
+ * "Above" this target there are 2 further targets (watermarks) related
+ * to recycling of free vnodes.  In the best-operating case, the cache is
+ * exactly full, the free list has size between vlowat and vhiwat above the
+ * free target, and recycling from it and normal use maintains this state.
+ * Sometimes the free list is below vlowat or even empty, but this state
+ * is even better for immediate use provided the cache is not full.
+ * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
+ * ones) to reach one of these states.  The watermarks are currently hard-
+ * coded as 4% and 9% of the available space higher.  These and the default
+ * of 25% for wantfreevnodes are too large if the memory size is large.
+ * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
+ * whenever vnlru_proc() becomes active.
  */
 static u_long wantfreevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
-/* Number of vnodes in the free list. */
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
+    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
 static u_long freevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
-    "Number of vnodes in the free list");
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
+    &freevnodes, 0, "Number of \"free\" vnodes");
 
+/*
+ * The vfs.vlru_allow_cache_src sysctl variable is no longer used but
+ * the sysctl remains to provide ABI compatibility. The new code frees
+ * namecache sources as the last chance to satisfy the highest watermark,
+ * instead of selecting the source vnodes randomly. This provides good
+ * enough behaviour to keep vn_fullpath() working in most situations.
+ * The filesystem layout with deep trees, where the deprecated knob was
+ * required, is thus handled automatically.
+ */
 static int vlru_allow_cache_src;
 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
-    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
+    &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)");
 
 static u_long recycles_count;
 SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
-    "Number of vnodes recycled to avoid exceding kern.maxvnodes");
+    "Number of vnodes recycled to meet vnode cache targets");
 
 /*
  * Various variables used for debugging the new implementation of
@@ -272,14 +299,13 @@ static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
-/*
- * Number of vnodes we want to exist at any one time.  This is mostly used
- * to size hash tables in vnode-related code.  It is normally not used in
- * getnewvnode(), as wantfreevnodes is normally nonzero.)
- *
- * XXX desiredvnodes is historical cruft and should not exist.
- */
+/* Target for maximum number of vnodes. */
 int desiredvnodes;
+static int gapvnodes;		/* gap between wanted and desired */
+static int vhiwat;		/* enough extras after expansion */
+static int vlowat;		/* minimal extras before expansion */
+static int vstir;		/* nonzero to stir non-free vnodes */
+static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
 
 static int
 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
@@ -290,6 +316,8 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL
 	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
 		return (error);
 	if (old_desiredvnodes != desiredvnodes) {
+		wantfreevnodes = desiredvnodes / 4;
+		/* XXX locking seems to be incomplete. */
 		vfs_hash_changesize(desiredvnodes);
 		cache_changesize(desiredvnodes);
 	}
@@ -298,9 +326,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL
 
 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
-    sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
+    sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
-    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
+    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
@@ -331,11 +359,71 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_tr
  *
  * Reevaluate the following cap on the number of vnodes after the physical
  * memory size exceeds 512GB.  In the limit, as the physical memory size
- * grows, the ratio of physical pages to vnodes approaches sixteen to one.
+ * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
  */
 #ifndef	MAXVNODES_MAX
-#define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
+#define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
 #endif
+
+/*
+ * Initialize a vnode as it first enters the zone.
+ */
+static int
+vnode_init(void *mem, int size, int flags)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+
+	vp = mem;
+	bzero(vp, size);
+	/*
+	 * Setup locks.
+	 */
+	vp->v_vnlock = &vp->v_lock;
+	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+	/*
+	 * By default, don't allow shared locks unless filesystems opt-in.
+	 */
+	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
+	    LK_NOSHARE | LK_IS_VNODE);
+	/*
+	 * Initialize bufobj.
+	 */
+	bo = &vp->v_bufobj;
+	bo->__bo_vnode = vp;
+	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
+	bo->bo_private = vp;
+	TAILQ_INIT(&bo->bo_clean.bv_hd);
+	TAILQ_INIT(&bo->bo_dirty.bv_hd);
+	/*
+	 * Initialize namecache.
+	 */
+	LIST_INIT(&vp->v_cache_src);
+	TAILQ_INIT(&vp->v_cache_dst);
+	/*
+	 * Initialize rangelocks.
+	 */
+	rangelock_init(&vp->v_rl);
+	return (0);
+}
+
+/*
+ * Free a vnode when it is cleared from the zone.
+ */
+static void
+vnode_fini(void *mem, int size)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+
+	vp = mem;
+	rangelock_destroy(&vp->v_rl);
+	lockdestroy(vp->v_vnlock);
+	mtx_destroy(&vp->v_interlock);
+	bo = &vp->v_bufobj;
+	rw_destroy(BO_LOCKPTR(bo));
+}
+
 static void
 vntblinit(void *dummy __unused)
 {
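
vnode_init() and vnode_fini() are uma(9) zone init/fini hooks: the next
hunk passes them to uma_zcreate(), so this setup and teardown runs only
when items enter or leave the zone, not on every getnewvnode() and
vdropl() cycle.  Two consequences are visible later in the diff:
getnewvnode() drops M_ZERO from its uma_zalloc() call, and vdropl() now
scrubs residual state before uma_zfree() so that a recycled item looks
freshly initialized.  A condensed sketch of the lifecycle, assuming the
standard uma(9) semantics:

	zone = uma_zcreate("VNODE", sizeof(struct vnode), NULL, NULL,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);

	vp = uma_zalloc(zone, M_WAITOK);	/* cached item: init not rerun */
	/* ... use vp, then scrub reusable fields ... */
	uma_zfree(zone, vp);		/* fini runs only when UMA reclaims */
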
@@ -345,15 +433,16 @@ vntblinit(void *dummy __unused)
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
 	 * kernel's heap size.  Generally speaking, it scales with the
-	 * physical memory size.  The ratio of desiredvnodes to physical pages
-	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
-	 * marginal ratio of desiredvnodes to physical pages is one to
-	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
+	 * physical memory size.  The ratio of desiredvnodes to the physical
+	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
+	 * Thereafter, the
+	 * marginal ratio of desiredvnodes to the physical memory size is
+	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
 	 * size.  The memory required by desiredvnodes vnodes and vm objects
-	 * may not exceed one seventh of the kernel's heap size.
+	 * must not exceed 1/7th of the kernel's heap size.
 	 */
-	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
-	    cnt.v_page_count) / 16;
+	physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 +
+	    3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64;
 	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode)));
 	desiredvnodes = min(physvnodes, virtvnodes);
@@ -368,7 +457,7 @@ vntblinit(void *dummy __unused)
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
@@ -742,35 +831,41 @@ vattr_null(struct vattr *vap)
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
 static int
-vlrureclaim(struct mount *mp)
+vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
 {
 	struct vnode *vp;
-	int done;
-	int trigger;
-	int usevnodes;
-	int count;
+	int count, done, target;
 
-	/*
-	 * Calculate the trigger point, don't allow user
-	 * screwups to blow us up.   This prevents us from
-	 * recycling vnodes with lots of resident pages.  We
-	 * aren't trying to free memory, we are trying to
-	 * free vnodes.
-	 */
-	usevnodes = desiredvnodes;
-	if (usevnodes <= 0)
-		usevnodes = 1;
-	trigger = cnt.v_page_count * 2 / usevnodes;
 	done = 0;
 	vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
-	count = mp->mnt_nvnodelistsize / 10 + 1;
-	while (count != 0) {
+	count = mp->mnt_nvnodelistsize;
+	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
+	target = target / 10 + 1;
+	while (count != 0 && done < target) {
 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		while (vp != NULL && vp->v_type == VMARKER)
 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
 		if (vp == NULL)
 			break;
+		/*
+		 * XXX LRU is completely broken for non-free vnodes.  First
+		 * by calling here in mountpoint order, then by moving
+		 * unselected vnodes to the end here, and most grossly by
+		 * removing the vlruvp() function that was supposed to
+		 * maintain the order.  (This function was born broken
+		 * since syncer problems prevented it doing anything.)  The
+		 * order is closer to LRC (C = Created).
+		 *
+		 * LRU reclaiming of vnodes seems to have last worked in
+		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
+		 * Then there was no hold count, and inactive vnodes were
+		 * simply put on the free list in LRU order.  The separate
+		 * lists also break LRU.  We prefer to reclaim from the
+		 * free list for technical reasons.  This tends to thrash
+		 * the free list to keep very unrecently used held vnodes.
+		 * The problem is mitigated by keeping the free list large.
+		 */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		--count;
@@ -779,10 +874,12 @@ vlrureclaim(struct mount *mp)
 		/*
 		 * If it's been deconstructed already, it's still
 		 * referenced, or it exceeds the trigger, skip it.
+		 * Also skip free vnodes.  We are trying to make space
+		 * to expand the free list, not reduce it.
 		 */
 		if (vp->v_usecount ||
-		    (!vlru_allow_cache_src &&
-			!LIST_EMPTY(&(vp)->v_cache_src)) ||
+		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
+		    ((vp->v_iflag & VI_FREE) != 0) ||
 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VI_UNLOCK(vp);
@@ -808,8 +905,8 @@ vlrureclaim(struct mount *mp)
 		 * vnode lock before our VOP_LOCK() call fails.
 		 */
 		if (vp->v_usecount ||
-		    (!vlru_allow_cache_src &&
-			!LIST_EMPTY(&(vp)->v_cache_src)) ||
+		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
+		    (vp->v_iflag & VI_FREE) != 0 ||
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp, LK_INTERLOCK);
@@ -842,7 +939,7 @@ relock_mnt:
 }
 
 /*
- * Attempt to keep the free list at wantfreevnodes length.
+ * Attempt to reduce the free list by the requested amount.
  */
 static void
 vnlru_free(int count)
@@ -899,6 +996,24 @@ vnlru_free(int count)
 		mtx_lock(&vnode_free_list_mtx);
 	}
 }
+
+/* XXX some names and initialization are bad for limits and watermarks. */
+static int
+vspace(void)
+{
+	int space;
+
+	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
+	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
+	vlowat = vhiwat / 2;
+	if (numvnodes > desiredvnodes)
+		return (0);
+	space = desiredvnodes - numvnodes;
+	if (freevnodes > wantfreevnodes)
+		space += freevnodes - wantfreevnodes;
+	return (space);
+}
+
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrurecycle() from the bowels of filesystem code has some
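
To make the watermark scheme described at the top of this file
concrete, here is the vspace() arithmetic worked through for one
illustrative configuration (the numbers are examples, not from the
commit).  With desiredvnodes = 500000 and the default wantfreevnodes =
desiredvnodes / 4 = 125000:

	gapvnodes = imax(500000 - 125000, 100) = 375000
	vhiwat    = 375000 / 11 = 34090		(the ~9% watermark)
	vlowat    = 34090 / 2   = 17045		(the ~4% watermark)

If numvnodes = 490000 and freevnodes = 130000, vspace() returns
(500000 - 490000) + (130000 - 125000) = 15000, which is below vlowat,
so vcheckspace() would wake vnlru_proc().  The same arithmetic checks
the comment near the top of the file: 9% of 75% of the new 8M
MAXVNODES_MAX is roughly the 566000 vnodes quoted there.
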
@@ -911,18 +1026,36 @@ static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
-	int done;
-	struct proc *p = vnlruproc;
+	unsigned long ofreevnodes, onumvnodes;
+	int done, force, reclaim_nc_src, trigger, usevnodes;
 
-	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
+	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
 	    SHUTDOWN_PRI_FIRST);
 
+	force = 0;
 	for (;;) {
-		kproc_suspend_check(p);
+		kproc_suspend_check(vnlruproc);
 		mtx_lock(&vnode_free_list_mtx);
-		if (freevnodes > wantfreevnodes)
-			vnlru_free(freevnodes - wantfreevnodes);
-		if (numvnodes <= desiredvnodes * 9 / 10) {
+		/*
+		 * If numvnodes is too large (due to desiredvnodes being
+		 * adjusted using its sysctl, or emergency growth), first
+		 * try to reduce it by discarding from the free list.
+		 */
+		if (numvnodes > desiredvnodes && freevnodes > 0)
+			vnlru_free(ulmin(numvnodes - desiredvnodes,
+			    freevnodes));
+		/*
+		 * Sleep if the vnode cache is in a good state.  This is
+		 * when it is not over-full and has space for about a 4%
+		 * or 9% expansion (by growing its size or inexcessively
+		 * reducing its free list).  Otherwise, try to reclaim
+		 * space for a 10% expansion.
+		 */
+		if (vstir && force == 0) {
+			force = 1;
+			vstir = 0;
+		}
+		if (vspace() >= vlowat && force == 0) {
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			msleep(vnlruproc, &vnode_free_list_mtx,
@@ -931,30 +1064,66 @@ vnlru_proc(void)
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
+		ofreevnodes = freevnodes;
+		onumvnodes = numvnodes;
+		/*
+		 * Calculate parameters for recycling.  These are the same
+		 * throughout the loop to give some semblance of fairness.
+		 * The trigger point is to avoid recycling vnodes with lots
+		 * of resident pages.  We aren't trying to free memory; we
+		 * are trying to recycle or at least free vnodes.
+		 */
+		if (numvnodes <= desiredvnodes)
+			usevnodes = numvnodes - freevnodes;
+		else
+			usevnodes = numvnodes;
+		if (usevnodes <= 0)
+			usevnodes = 1;
+		/*
+		 * The trigger value is chosen to give a conservatively
+		 * large value to ensure that it alone doesn't prevent
+		 * making progress.  The value can easily be so large that
+		 * it is effectively infinite in some congested and
+		 * misconfigured cases, and this is necessary.  Normally
+		 * it is about 8 to 100 (pages), which is quite large.
+		 */
+		trigger = cnt.v_page_count * 2 / usevnodes;
+		if (force < 2)
+			trigger = vsmalltrigger;
+		reclaim_nc_src = force >= 3;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
-			done += vlrureclaim(mp);
+			done += vlrureclaim(mp, reclaim_nc_src, trigger);
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp);
 		}
 		mtx_unlock(&mountlist_mtx);
+		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
+			uma_reclaim();
 		if (done == 0) {
-#if 0
-			/* These messages are temporary debugging aids */
-			if (vnlru_nowhere < 5)
-				printf("vnlru process getting nowhere..\n");
-			else if (vnlru_nowhere == 5)
-				printf("vnlru process messages stopped.\n");
-#endif
+			if (force == 0 || force == 1) {
+				force = 2;
+				continue;
+			}
+			if (force == 2) {
+				force = 3;
+				continue;
+			}
+			force = 0;
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else
 			kern_yield(PRI_USER);
+		/*
+		 * After becoming active to expand above low water, keep
+		 * active until above high water.
+		 */
+		force = vspace() < vhiwat;
 	}
 }
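
When a pass reclaims nothing, the rewritten vnlru_proc() escalates
through the small "force" ladder set up above: at force 0 or 1 the
trigger is vsmalltrigger, so only vnodes with at most 8 resident pages
are recycled; force 2 switches to the large computed trigger; force 3
additionally sets reclaim_nc_src, freeing namecache source vnodes as
the last resort described in the vlru_allow_cache_src comment earlier
in the diff.  In outline (condensed from the hunk above):

	trigger = cnt.v_page_count * 2 / usevnodes;
	if (force < 2)
		trigger = vsmalltrigger;	/* spare large vnodes */
	reclaim_nc_src = force >= 3;		/* last resort */
	/* ... one vlrureclaim() pass over the mount list ... */
	if (done == 0)
		force = force <= 1 ? 2 : (force == 2 ? 3 : 0);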
 
@@ -1028,22 +1197,31 @@ vtryrecycle(struct vnode *vp)
 	return (0);
 }
 
+static void
+vcheckspace(void)
+{
+
+	if (vspace() < vlowat && vnlruproc_sig == 0) {
+		vnlruproc_sig = 1;
+		wakeup(vnlruproc);
+	}
+}
+
 /*
- * Wait for available vnodes.
+ * Wait if necessary for space for a new vnode.
  */
 static int
 getnewvnode_wait(int suspended)
 {
 
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
-	if (numvnodes > desiredvnodes) {
+	if (numvnodes >= desiredvnodes) {
 		if (suspended) {
 			/*
-			 * File system is beeing suspended, we cannot risk a
-			 * deadlock here, so allocate new vnode anyway.
+			 * The file system is being suspended.  We cannot
+			 * risk a deadlock here, so allow allocation of
+			 * another vnode even if this would give too many.
 			 */
-			if (freevnodes > wantfreevnodes)
-				vnlru_free(freevnodes - wantfreevnodes);
 			return (0);
 		}
 		if (vnlruproc_sig == 0) {
@@ -1053,18 +1231,34 @@ getnewvnode_wait(int suspended)
 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 		    "vlruwk", hz);
 	}
-	return (numvnodes > desiredvnodes ? ENFILE : 0);
+	/* Post-adjust like the pre-adjust in getnewvnode(). */
+	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
+		vnlru_free(1);
+	return (numvnodes >= desiredvnodes ? ENFILE : 0);
 }
 
+/*
+ * This hack is fragile, and probably not needed any more now that the
+ * watermark handling works.
+ */
 void
 getnewvnode_reserve(u_int count)
 {
 	struct thread *td;
 
+	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
+	/* XXX no longer so quick, but this part is not racy. */
+	mtx_lock(&vnode_free_list_mtx);
+	if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
+		vnlru_free(ulmin(numvnodes + count - desiredvnodes,
+		    freevnodes - wantfreevnodes));
+	mtx_unlock(&vnode_free_list_mtx);
+
 	td = curthread;
 	/* First try to be quick and racy. */
 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
 		td->td_vp_reserv += count;
+		vcheckspace();	/* XXX no longer so quick, but more racy */
 		return;
 	} else
 		atomic_subtract_long(&numvnodes, count);
@@ -1077,9 +1271,18 @@ getnewvnode_reserve(u_int count)
 			atomic_add_long(&numvnodes, 1);
 		}
 	}
+	vcheckspace();
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
+/*
+ * This hack is fragile, especially if desiredvnodes or wantfreevnodes are
+ * misconfigured or changed significantly.  Reducing desiredvnodes below
+ * the reserved amount should cause bizarre behaviour like reducing it
+ * below the number of active vnodes -- the system will try to reduce
+ * numvnodes to match, but should fail, so the subtraction below should
+ * not overflow.
+ */
 void
 getnewvnode_drop_reserve(void)
 {
@@ -1098,8 +1301,9 @@ getnewvnode(const char *tag, struct moun
     struct vnode **vpp)
 {
 	struct vnode *vp;
-	struct bufobj *bo;
 	struct thread *td;
+	struct lock_object *lo;
+	static int cyclecount;
 	int error;
 
 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
@@ -1110,57 +1314,77 @@ getnewvnode(const char *tag, struct moun
 		goto alloc;
 	}
 	mtx_lock(&vnode_free_list_mtx);
-	/*
-	 * Lend our context to reclaim vnodes if they've exceeded the max.
-	 */
-	if (freevnodes > wantfreevnodes)
+	if (numvnodes < desiredvnodes)
+		cyclecount = 0;
+	else if (cyclecount++ >= freevnodes) {
+		cyclecount = 0;
+		vstir = 1;
+	}
+	/*
+	 * Grow the vnode cache if it will not be above its target max
+	 * after growing.  Otherwise, if the free list is nonempty, try
+	 * to reclaim 1 item from it before growing the cache (possibly
+	 * above its target max if the reclamation failed or is delayed).
+	 * Otherwise, wait for some space.  In all cases, schedule
+	 * vnlru_proc() if we are getting short of space.  The watermarks
+	 * should be chosen so that we never wait or even reclaim from
+	 * the free list to below its target minimum.
+	 */
+	if (numvnodes + 1 <= desiredvnodes)
+		;
+	else if (freevnodes > 0)
 		vnlru_free(1);
-	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
-	    MNTK_SUSPEND));
+	else {
+		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+		    MNTK_SUSPEND));
 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
-	if (error != 0) {
-		mtx_unlock(&vnode_free_list_mtx);
-		return (error);
-	}
+		if (error != 0) {
+			mtx_unlock(&vnode_free_list_mtx);
+			return (error);
+		}
 #endif
+	}
+	vcheckspace();
 	atomic_add_long(&numvnodes, 1);
 	mtx_unlock(&vnode_free_list_mtx);
 alloc:
 	atomic_add_long(&vnodes_created, 1);
-	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
-	/*
-	 * Setup locks.
-	 */
-	vp->v_vnlock = &vp->v_lock;
-	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
-	/*
-	 * By default, don't allow shared locks unless filesystems
-	 * opt-in.
-	 */
-	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
+	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
 	/*
-	 * Initialize bufobj.
+	 * Locks are given the generic name "vnode" when created.
+	 * Follow the historic practice of using the filesystem
+	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
+	 *
+	 * Locks live in a witness group keyed on their name. Thus,
+	 * when a lock is renamed, it must also move from the witness
+	 * group of its old name to the witness group of its new name.
+	 *
+	 * The change only needs to be made when the vnode moves
+	 * from one filesystem type to another. We ensure that each
+	 * filesystem use a single static name pointer for its tag so
+	 * that we can compare pointers rather than doing a strcmp().
 	 */
-	bo = &vp->v_bufobj;
-	bo->__bo_vnode = vp;
-	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
-	bo->bo_ops = &buf_ops_bio;
-	bo->bo_private = vp;
-	TAILQ_INIT(&bo->bo_clean.bv_hd);
-	TAILQ_INIT(&bo->bo_dirty.bv_hd);
+	lo = &vp->v_vnlock->lock_object;
+	if (lo->lo_name != tag) {
+		lo->lo_name = tag;
+		WITNESS_DESTROY(lo);
+		WITNESS_INIT(lo, tag);
+	}
 	/*
-	 * Initialize namecache.
+	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
-	LIST_INIT(&vp->v_cache_src);
-	TAILQ_INIT(&vp->v_cache_dst);
+	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 	/*
 	 * Finalize various vnode identity bits.
 	 */
+	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
+	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
+	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	v_incr_usecount(vp);
-	vp->v_data = NULL;
+	vp->v_bufobj.bo_ops = &buf_ops_bio;
 #ifdef MAC
 	mac_vnode_init(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
@@ -1169,11 +1393,10 @@ alloc:
 		printf("NULL mp in getnewvnode()\n");
 #endif
 	if (mp != NULL) {
-		bo->bo_bsize = mp->mnt_stat.f_iosize;
+		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
-	rangelock_init(&vp->v_rl);
 
 	/*
 	 * For the filesystems which do not use vfs_hash_insert(),
@@ -2399,7 +2622,7 @@ vholdl(struct vnode *vp)
 	mtx_lock(&vnode_free_list_mtx);
 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
 	freevnodes--;
-	vp->v_iflag &= ~(VI_FREE|VI_AGE);
+	vp->v_iflag &= ~VI_FREE;
 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 	    ("Activating already active vnode"));
 	vp->v_iflag |= VI_ACTIVE;
@@ -2467,15 +2690,9 @@ vdropl(struct vnode *vp)
 				    v_actfreelist);
 				mp->mnt_activevnodelistsize--;
 			}
-			if (vp->v_iflag & VI_AGE) {
-				TAILQ_INSERT_HEAD(&vnode_free_list, vp,
-				    v_actfreelist);
-			} else {
-				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
-				    v_actfreelist);
-			}
+			TAILQ_INSERT_TAIL(&vnode_free_list, vp,
+			    v_actfreelist);
 			freevnodes++;
-			vp->v_iflag &= ~VI_AGE;
 			vp->v_iflag |= VI_FREE;
 			mtx_unlock(&vnode_free_list_mtx);
 		} else {
@@ -2486,6 +2703,12 @@ vdropl(struct vnode *vp)
 	}
 	/*
 	 * The vnode has been marked for destruction, so free it.
+	 *
+	 * The vnode will be returned to the zone where it will
+	 * normally remain until it is needed for another vnode. We
+	 * need to cleanup (or verify that the cleanup has already
+	 * been done) any residual data left from its current use
+	 * so as not to contaminate the freshly allocated vnode.
 	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	atomic_subtract_long(&numvnodes, 1);
@@ -2506,20 +2729,25 @@ vdropl(struct vnode *vp)
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
+	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
+	    ("Dangling rangelock waiters"));
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_vnode_destroy(vp);
 #endif
-	if (vp->v_pollinfo != NULL)
+	if (vp->v_pollinfo != NULL) {
 		destroy_vpollinfo(vp->v_pollinfo);
+		vp->v_pollinfo = NULL;
+	}
 #ifdef INVARIANTS
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
-	rangelock_destroy(&vp->v_rl);
-	lockdestroy(vp->v_vnlock);
-	mtx_destroy(&vp->v_interlock);
-	rw_destroy(BO_LOCKPTR(bo));
+	bzero(&vp->v_un, sizeof(vp->v_un));
+	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+	vp->v_iflag = 0;
+	vp->v_vflag = 0;
+	bo->bo_flag = 0;
 	uma_zfree(vnode_zone, vp);
 }
 
@@ -2884,6 +3112,7 @@ vgonel(struct vnode *vp)
 	 * Clear the advisory locks and wake up waiting threads.
 	 */
 	(void)VOP_ADVLOCKPURGE(vp);
+	vp->v_lockf = NULL;
 	/*
 	 * Delete from old mount point vnode list.
 	 */
@@ -2986,8 +3215,6 @@ vn_printf(struct vnode *vp, const char *
 	}
 	if (vp->v_iflag & VI_MOUNT)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

