svn commit: r289279 - in head/sys: kern vm

Jeff Roberson jeff at FreeBSD.org
Wed Oct 14 02:10:08 UTC 2015


Author: jeff
Date: Wed Oct 14 02:10:07 2015
New Revision: 289279
URL: https://svnweb.freebsd.org/changeset/base/289279

Log:
  Parallelize the buffer cache and rewrite getnewbuf().  This results in an
  8x performance improvement in a microbenchmark on a 4-socket machine.
  
   - Get buffer headers from a per-cpu uma cache that sits in front of the
     free queue.
   - Use a per-cpu quantum cache in vmem to eliminate contention for kva.
   - Use multiple clean queues according to buffer cache size to eliminate
     clean queue lock contention.
   - Introduce a bufspace daemon that attempts to prevent getnewbuf() callers
     from blocking or doing direct recycling.
   - Close some bufspace allocation races that could lead to endless
     recycling.
   - Further the transition to a more modern style of small functions grouped
     by prefix in order to manage growing complexity.
  
  Sponsored by:	EMC / Isilon
  Reviewed by:	kib
  Tested by:	pho
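
  The first log item, the per-cpu cache in front of the global free queue, has
  the classic magazine shape.  The sketch below is a minimal userspace
  illustration of that shape under stated assumptions (plain C11, pthreads,
  invented names); the real code uses UMA with buf_import() and buf_release()
  as shown in the diff further down, and does considerably more.

/*
 * Userspace sketch of the "per-cpu cache in front of a global free
 * list" shape used by the new buf_zone (buf_import()/buf_release()):
 * a small per-thread magazine is refilled from, and drained back to,
 * a locked global list in batches.  All names are invented for the
 * example; UMA itself does considerably more than this.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

#define	GLOBAL_ITEMS	64
#define	BATCH		8

struct item {
	struct item	*next;
};

static struct item pool[GLOBAL_ITEMS];
static struct item *global_list;
static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;

/* Per-thread "magazine", the analogue of the per-cpu UMA bucket. */
static _Thread_local struct item *cache;
static _Thread_local int cache_cnt;

/* Refill the local cache from the global list, like buf_import(). */
static void
cache_import(void)
{
	struct item *it;

	pthread_mutex_lock(&global_lock);
	while (cache_cnt < BATCH && (it = global_list) != NULL) {
		global_list = it->next;
		it->next = cache;
		cache = it;
		cache_cnt++;
	}
	pthread_mutex_unlock(&global_lock);
}

/* Most allocations are satisfied locally without taking any lock. */
static struct item *
item_alloc(void)
{
	struct item *it;

	if (cache == NULL)
		cache_import();
	if ((it = cache) != NULL) {
		cache = it->next;
		cache_cnt--;
	}
	return (it);
}

/*
 * Frees go back to the local cache; overflow goes to the global list
 * (UMA drains whole buckets at a time via buf_release()).
 */
static void
item_free(struct item *it)
{

	if (cache_cnt >= BATCH) {
		pthread_mutex_lock(&global_lock);
		it->next = global_list;
		global_list = it;
		pthread_mutex_unlock(&global_lock);
		return;
	}
	it->next = cache;
	cache = it;
	cache_cnt++;
}

int
main(void)
{
	struct item *it;
	int i;

	for (i = 0; i < GLOBAL_ITEMS; i++) {	/* seed the global list */
		pool[i].next = global_list;
		global_list = &pool[i];
	}
	it = item_alloc();
	printf("allocated %p via the lock-free local cache path\n", (void *)it);
	item_free(it);
	return (0);
}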

Modified:
  head/sys/kern/vfs_bio.c
  head/sys/vm/vm_init.c
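
The multiple-clean-queue item in the log works by picking a queue round-robin
with an atomic counter, as the new bqcleanq() in the diff below does, so that
lock traffic spreads over several small locks instead of one global
clean-queue lock.  The following is a standalone C11/pthreads sketch with
invented names, not the kernel code; compile with cc -pthread.  Each worker
lands on a different queue lock most of the time, which mirrors the
contention relief the log describes.

/*
 * Minimal userspace sketch (not the kernel code) of the round-robin
 * selection idea behind the new bqcleanq(): an atomic counter spreads
 * work across several independently locked queues, so no single
 * "clean queue" lock is hit by every thread.  Names are illustrative.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define	NQUEUES		4
#define	NTHREADS	4
#define	NITEMS		10000

struct queue {
	pthread_mutex_t	lock;
	int		depth;		/* stand-in for a TAILQ of bufs */
};

static struct queue queues[NQUEUES];
static atomic_uint nextq;

/* Pick the next queue round-robin, as bqcleanq() does with its modulus. */
static struct queue *
pick_queue(void)
{

	return (&queues[atomic_fetch_add(&nextq, 1) % NQUEUES]);
}

static void *
worker(void *arg)
{
	struct queue *q;
	int i;

	for (i = 0; i < NITEMS; i++) {
		q = pick_queue();
		pthread_mutex_lock(&q->lock);
		q->depth++;		/* insert a buf in the real code */
		pthread_mutex_unlock(&q->lock);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t td[NTHREADS];
	int i;

	for (i = 0; i < NQUEUES; i++)
		pthread_mutex_init(&queues[i].lock, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&td[i], NULL, worker, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(td[i], NULL);
	for (i = 0; i < NQUEUES; i++)
		printf("queue %d: %d items\n", i, queues[i].depth);
	return (0);
}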

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c	Wed Oct 14 00:43:29 2015	(r289278)
+++ head/sys/kern/vfs_bio.c	Wed Oct 14 02:10:07 2015	(r289279)
@@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmem.h>
@@ -100,6 +101,7 @@ caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
+struct proc *bufspacedaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -116,11 +118,18 @@ static void vfs_vmio_extend(struct buf *
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static int buf_flush(struct vnode *vp, int);
+static int buf_recycle(bool);
+static int buf_scan(bool);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
+static void bufkva_reclaim(vmem_t *, int);
+static void bufkva_free(struct buf *);
+static int buf_import(void *, void **, int, int);
+static void buf_release(void *, void **, int);
+
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -145,23 +154,23 @@ static long bufkvaspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
-    "Maximum allowed value of bufspace (including buf_daemon)");
+SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
+    "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 static long maxbufmallocspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
-    "Maximum amount of malloced memory for buffers");
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
+    0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
+SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
 long hibufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
-    "Maximum allowed value of bufspace (excluding buf_daemon)");
-static int bufreusecnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
-    "Number of times we have reused a buffer");
+SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
+    "Maximum allowed value of bufspace (excluding metadata)");
+long bufspacethresh;
+SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
+    0, "Bufspace consumed before waking the daemon to free some");
 static int buffreekvacnt;
 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
     "Number of times we have freed the KVA space from some buffer");
@@ -205,10 +214,10 @@ SYSCTL_INT(_vfs, OID_AUTO, numfreebuffer
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
-   "XXX Unused");
+   "Target number of free buffers");
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
-   "XXX Complicatedly unused");
+   "Threshold for clean buffer recycling");
 static int getnewbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
@@ -219,6 +228,9 @@ static int mappingrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
+static int numbufallocfails;
+SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
+    "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
@@ -233,16 +245,6 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_
     "Permit the use of the unmapped i/o");
 
 /*
- * Lock for the non-dirty bufqueues
- */
-static struct mtx_padalign bqclean;
-
-/*
- * Lock for the dirty queue.
- */
-static struct mtx_padalign bqdirty;
-
-/*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx_padalign bdlock;
@@ -271,6 +273,11 @@ static struct mtx_padalign bdirtylock;
 static int bd_request;
 
 /*
+ * Request/wakeup point for the bufspace daemon.
+ */
+static int bufspace_request;
+
+/*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuf.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
@@ -298,7 +305,7 @@ static int runningbufreq;
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
 static volatile int needsbuffer;
@@ -311,14 +318,21 @@ static int bdirtywait;
 /*
  * Definitions for the buffer free lists.
  */
-#define BUFFER_QUEUES	4	/* number of free buffer queues */
-
 #define QUEUE_NONE	0	/* on no queue */
-#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
+#define QUEUE_EMPTY	1	/* empty buffer headers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
-#define QUEUE_EMPTY	3	/* empty buffer headers */
+#define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL	1024	/* not a queue index, but mark for sentinel */
 
+/* Maximum number of clean buffer queues. */
+#define	CLEAN_QUEUES	16
+
+/* Configured number of clean queues. */
+static int clean_queues;
+
+/* Maximum number of buffer queues. */
+#define BUFFER_QUEUES	(QUEUE_CLEAN + CLEAN_QUEUES)
+
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 #ifdef INVARIANTS
@@ -326,15 +340,21 @@ static int bq_len[BUFFER_QUEUES];
 #endif
 
 /*
+ * Lock for each bufqueue
+ */
+static struct mtx_padalign bqlocks[BUFFER_QUEUES];
+
+/*
+ * per-cpu empty buffer cache.
+ */
+uma_zone_t buf_zone;
+
+/*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
-#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
-#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
-#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
-
 static int
 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 {
@@ -382,6 +402,21 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 }
 #endif
 
+static int
+bqcleanq(void)
+{
+	static int nextq;
+
+	return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+}
+
+static int
+bqisclean(int qindex)
+{
+
+	return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
+}
+
 /*
  *	bqlock:
  *
@@ -391,9 +426,7 @@ static inline struct mtx *
 bqlock(int qindex)
 {
 
-	if (qindex == QUEUE_DIRTY)
-		return (struct mtx *)(&bqdirty);
-	return (struct mtx *)(&bqclean);
+	return (struct mtx *)&bqlocks[qindex];
 }
 
 /*
@@ -447,62 +480,255 @@ bdirtyadd(void)
 }
 
 /*
- *	bufspacewakeup:
+ *	bufspace_wakeup:
  *
  *	Called when buffer space is potentially available for recovery.
  *	getnewbuf() will block on this flag when it is unable to free 
  *	sufficient buffer space.  Buffer space becomes recoverable when 
  *	bp's get placed back in the queues.
  */
-static __inline void
-bufspacewakeup(void)
+static void
+bufspace_wakeup(void)
 {
-	int need_wakeup, on;
 
 	/*
-	 * If someone is waiting for bufspace, wake them up.  Even
-	 * though we may not have freed the kva space yet, the waiting
-	 * process will be able to now.
+	 * If someone is waiting for bufspace, wake them up.
+	 *
+	 * Since needsbuffer is set prior to doing an additional queue
+	 * scan it is safe to check for the flag prior to acquiring the
+	 * lock.  The thread that is preparing to scan again before
+	 * blocking would discover the buf we released.
 	 */
+	if (needsbuffer) {
+		rw_rlock(&nblock);
+		if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
+			wakeup(__DEVOLATILE(void *, &needsbuffer));
+		rw_runlock(&nblock);
+	}
+}
+
+/*
+ *	bufspace_daemonwakeup:
+ *
+ *	Wakeup the daemon responsible for freeing clean bufs.
+ */
+static void
+bufspace_daemonwakeup(void)
+{
 	rw_rlock(&nblock);
-	for (;;) {
-		need_wakeup = 0;
-		on = needsbuffer;
-		if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
-			break;
-		need_wakeup = 1;
-		if (atomic_cmpset_rel_int(&needsbuffer, on,
-		    on & ~VFS_BIO_NEED_BUFSPACE))
-			break;
+	if (bufspace_request == 0) {
+		bufspace_request = 1;
+		wakeup(&bufspace_request);
 	}
-	if (need_wakeup)
-		wakeup(__DEVOLATILE(void *, &needsbuffer));
 	rw_runlock(&nblock);
 }
 
 /*
- *	bufspaceadjust:
+ *	bufspace_adjust:
  *
  *	Adjust the reported bufspace for a KVA managed buffer, possibly
  * 	waking any waiters.
  */
 static void
-bufspaceadjust(struct buf *bp, int bufsize)
+bufspace_adjust(struct buf *bp, int bufsize)
 {
+	long space;
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
-	    ("bufspaceadjust: malloc buf %p", bp));
+	    ("bufspace_adjust: malloc buf %p", bp));
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
 		atomic_subtract_long(&bufspace, -diff);
-		bufspacewakeup();
-	} else
-		atomic_add_long(&bufspace, diff);
+		bufspace_wakeup();
+	} else {
+		space = atomic_fetchadd_long(&bufspace, diff);
+		/* Wake up the daemon on the transition. */
+		if (space < bufspacethresh && space + diff >= bufspacethresh)
+			bufspace_daemonwakeup();
+	}
 	bp->b_bufsize = bufsize;
 }
 
 /*
+ *	bufspace_reserve:
+ *
+ *	Reserve bufspace before calling allocbuf().  Metadata has a
+ *	different space limit than data.
+ */
+static int
+bufspace_reserve(int size, bool metadata)
+{
+	long limit;
+	long space;
+
+	if (metadata)
+		limit = maxbufspace;
+	else
+		limit = hibufspace;
+	do {
+		space = bufspace;
+		if (space + size > limit)
+			return (ENOSPC);
+	} while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+
+	/* Wake up the daemon on the transition. */
+	if (space < bufspacethresh && space + size >= bufspacethresh)
+		bufspace_daemonwakeup();
+
+	return (0);
+}
+
+/*
+ *	bufspace_release:
+ *
+ *	Release reserved bufspace after bufspace_adjust() has consumed it.
+ */
+static void
+bufspace_release(int size)
+{
+	atomic_subtract_long(&bufspace, size);
+	bufspace_wakeup();
+}
+
+/*
+ *	bufspace_wait:
+ *
+ *	Wait for bufspace, acting as the buf daemon if a locked vnode is
+ *	supplied.  needsbuffer must be set in a safe fashion prior to
+ *	polling for space.  The operation must be re-tried on return.
+ */
+static void
+bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+{
+	struct thread *td;
+	int error, fl, norunbuf;
+
+	if ((gbflags & GB_NOWAIT_BD) != 0)
+		return;
+
+	td = curthread;
+	rw_wlock(&nblock);
+	while (needsbuffer != 0) {
+		if (vp != NULL && vp->v_type != VCHR &&
+		    (td->td_pflags & TDP_BUFNEED) == 0) {
+			rw_wunlock(&nblock);
+			/*
+			 * getblk() is called with a vnode locked, and
+			 * some majority of the dirty buffers may as
+			 * well belong to the vnode.  Flushing the
+			 * buffers there would make a progress that
+			 * cannot be achieved by the buf_daemon, that
+			 * cannot lock the vnode.
+			 */
+			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+			    (td->td_pflags & TDP_NORUNNINGBUF);
+
+			/*
+			 * Play bufdaemon.  The getnewbuf() function
+			 * may be called while the thread owns lock
+			 * for another dirty buffer for the same
+			 * vnode, which makes it impossible to use
+			 * VOP_FSYNC() there, due to the buffer lock
+			 * recursion.
+			 */
+			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+			fl = buf_flush(vp, flushbufqtarget);
+			td->td_pflags &= norunbuf;
+			rw_wlock(&nblock);
+			if (fl != 0)
+				continue;
+			if (needsbuffer == 0)
+				break;
+		}
+		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
+		if (error != 0)
+			break;
+	}
+	rw_wunlock(&nblock);
+}
+
+
+/*
+ *	bufspace_daemon:
+ *
+ *	buffer space management daemon.  Tries to maintain some marginal
+ *	amount of free buffer space so that requesting processes neither
+ *	block nor work to reclaim buffers.
+ */
+static void
+bufspace_daemon(void)
+{
+	for (;;) {
+		kproc_suspend_check(bufspacedaemonproc);
+
+		/*
+		 * Free buffers from the clean queue until we meet our
+		 * targets.
+		 *
+		 * Theory of operation:  The buffer cache is most efficient
+		 * when some free buffer headers and space are always
+		 * available to getnewbuf().  This daemon attempts to prevent
+		 * the excessive blocking and synchronization associated
+		 * with shortfall.  It goes through three phases according
+		 * to demand:
+		 *
+		 * 1)	The daemon wakes up voluntarily once per second
+		 *	during idle periods when the counters are below
+		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
+		 *
+		 * 2)	The daemon wakes up as we cross the thresholds
+		 *	ahead of any potential blocking.  This may bounce
+		 *	slightly according to the rate of consumption and
+		 *	release.
+		 *
+		 * 3)	The daemon and consumers are starved for working
+		 *	clean buffers.  This is the 'bufspace' sleep below
+		 *	which will inefficiently trade bufs with bqrelse
+		 *	until we return to condition 2.
+		 */
+		while (bufspace > lobufspace ||
+		    numfreebuffers < hifreebuffers) {
+			if (buf_recycle(false) != 0) {
+				atomic_set_int(&needsbuffer, 1);
+				if (buf_recycle(false) != 0) {
+					rw_wlock(&nblock);
+					if (needsbuffer)
+						rw_sleep(__DEVOLATILE(void *,
+						    &needsbuffer), &nblock,
+						    PRIBIO|PDROP, "bufspace",
+						    hz/10);
+					else
+						rw_wunlock(&nblock);
+				}
+			}
+			maybe_yield();
+		}
+
+		/*
+		 * Re-check our limits under the exclusive nblock.
+		 */
+		rw_wlock(&nblock);
+		if (bufspace < bufspacethresh &&
+		    numfreebuffers > lofreebuffers) {
+			bufspace_request = 0;
+			rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
+			    "-", hz);
+		} else
+			rw_wunlock(&nblock);
+	}
+}
+
+static struct kproc_desc bufspace_kp = {
+	"bufspacedaemon",
+	bufspace_daemon,
+	&bufspacedaemonproc
+};
+SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
+    &bufspace_kp);
+
+/*
  *	bufmallocadjust:
  *
  *	Adjust the reported bufspace for a malloc managed buffer, possibly
@@ -516,10 +742,9 @@ bufmallocadjust(struct buf *bp, int bufs
 	KASSERT((bp->b_flags & B_MALLOC) != 0,
 	    ("bufmallocadjust: non-malloc buf %p", bp));
 	diff = bufsize - bp->b_bufsize;
-	if (diff < 0) {
+	if (diff < 0)
 		atomic_subtract_long(&bufmallocspace, -diff);
-		bufspacewakeup();
-	} else
+	else
 		atomic_add_long(&bufmallocspace, diff);
 	bp->b_bufsize = bufsize;
 }
@@ -571,67 +796,6 @@ runningbufwakeup(struct buf *bp)
 }
 
 /*
- *	bufcountadd:
- *
- *	Called when a buffer has been added to one of the free queues to
- *	account for the buffer and to wakeup anyone waiting for free buffers.
- *	This typically occurs when large amounts of metadata are being handled
- *	by the buffer cache ( else buffer space runs out first, usually ).
- */
-static __inline void
-bufcountadd(struct buf *bp)
-{
-	int mask, need_wakeup, old, on;
-
-	KASSERT((bp->b_flags & B_INFREECNT) == 0,
-	    ("buf %p already counted as free", bp));
-	bp->b_flags |= B_INFREECNT;
-	old = atomic_fetchadd_int(&numfreebuffers, 1);
-	KASSERT(old >= 0 && old < nbuf,
-	    ("numfreebuffers climbed to %d", old + 1));
-	mask = VFS_BIO_NEED_ANY;
-	if (numfreebuffers >= hifreebuffers)
-		mask |= VFS_BIO_NEED_FREE;
-	rw_rlock(&nblock);
-	for (;;) {
-		need_wakeup = 0;
-		on = needsbuffer;
-		if (on == 0)
-			break;
-		need_wakeup = 1;
-		if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
-			break;
-	}
-	if (need_wakeup)
-		wakeup(__DEVOLATILE(void *, &needsbuffer));
-	rw_runlock(&nblock);
-}
-
-/*
- *	bufcountsub:
- *
- *	Decrement the numfreebuffers count as needed.
- */
-static void
-bufcountsub(struct buf *bp)
-{
-	int old;
-
-	/*
-	 * Fixup numfreebuffers count.  If the buffer is invalid or not
-	 * delayed-write, the buffer was free and we must decrement
-	 * numfreebuffers.
-	 */
-	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
-		KASSERT((bp->b_flags & B_INFREECNT) != 0,
-		    ("buf %p not counted in numfreebuffers", bp));
-		bp->b_flags &= ~B_INFREECNT;
-		old = atomic_fetchadd_int(&numfreebuffers, -1);
-		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
-	}
-}
-
-/*
  *	waitrunningbufspace()
  *
  *	runningbufspace is a measure of the amount of I/O currently
@@ -847,8 +1011,10 @@ bufinit(void)
 	int i;
 
 	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
-	mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
-	mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
+	mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
+	mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
+	for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
+		mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	rw_init(&nblock, "needsbuffer lock");
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
@@ -864,7 +1030,7 @@ bufinit(void)
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
-		bp->b_flags = B_INVAL | B_INFREECNT;
+		bp->b_flags = B_INVAL;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_EMPTY;
@@ -881,18 +1047,19 @@ bufinit(void)
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are 
 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
-	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
-	 * used by most other processes.  The differential is required to 
-	 * ensure that buf_daemon is able to run when other processes might 
-	 * be blocked waiting for buffer space.
+	 * is nominally used by metadata.  hibufspace is the nominal maximum
+	 * used by most other requests.  The differential is required to 
+	 * ensure that metadata deadlocks don't occur.
 	 *
 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
 	 * this may result in KVM fragmentation which is not handled optimally
-	 * by the system.
+	 * by the system. XXX This is less true with vmem.  We could use
+	 * PAGE_SIZE.
 	 */
 	maxbufspace = (long)nbuf * BKVASIZE;
 	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
-	lobufspace = hibufspace - MAXBCACHEBUF;
+	lobufspace = (hibufspace / 20) * 19; /* 95% */
+	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
 
 	/*
 	 * Note: The 16 MiB upper limit for hirunningspace was chosen
@@ -906,44 +1073,61 @@ bufinit(void)
 	    16 * 1024 * 1024), 1024 * 1024);
 	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
 
-/*
- * Limit the amount of malloc memory since it is wired permanently into
- * the kernel space.  Even though this is accounted for in the buffer
- * allocation, we don't want the malloced region to grow uncontrolled.
- * The malloc scheme improves memory utilization significantly on average
- * (small) directories.
- */
+	/*
+	 * Limit the amount of malloc memory since it is wired permanently into
+	 * the kernel space.  Even though this is accounted for in the buffer
+	 * allocation, we don't want the malloced region to grow uncontrolled.
+	 * The malloc scheme improves memory utilization significantly on
+	 * average (small) directories.
+	 */
 	maxbufmallocspace = hibufspace / 20;
 
-/*
- * Reduce the chance of a deadlock occuring by limiting the number
- * of delayed-write dirty buffers we allow to stack up.
- */
+	/*
+	 * Reduce the chance of a deadlock occurring by limiting the number
+	 * of delayed-write dirty buffers we allow to stack up.
+	 */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
 	numdirtybuffers = 0;
-/*
- * To support extreme low-memory systems, make sure hidirtybuffers cannot
- * eat up all available buffer space.  This occurs when our minimum cannot
- * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
- * BKVASIZE'd buffers.
- */
+	/*
+	 * To support extreme low-memory systems, make sure hidirtybuffers
+	 * cannot eat up all available buffer space.  This occurs when our
+	 * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
+	 * buffer space assuming BKVASIZE'd buffers.
+	 */
 	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
 	lodirtybuffers = hidirtybuffers / 2;
 
-/*
- * Try to keep the number of free buffers in the specified range,
- * and give special processes (e.g. like buf_daemon) access to an 
- * emergency reserve.
- */
-	lofreebuffers = nbuf / 18 + 5;
-	hifreebuffers = 2 * lofreebuffers;
+	/*
+	 * lofreebuffers should be sufficient to avoid stalling waiting on
+	 * buf headers under heavy utilization.  The bufs in per-cpu caches
+	 * are counted as free but will be unavailable to threads executing
+	 * on other cpus.
+	 *
+	 * hifreebuffers is the free target for the bufspace daemon.  This
+	 * should be set appropriately to limit work per-iteration.
+	 */
+	lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
+	hifreebuffers = (3 * lofreebuffers) / 2;
 	numfreebuffers = nbuf;
 
 	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+
+	/* Setup the kva and free list allocators. */
+	vmem_set_reclaim(buffer_arena, bufkva_reclaim);
+	buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
+	    NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
+
+	/*
+	 * Size the clean queue according to the amount of buffer space.
+	 * One queue per 256MB up to the max.  More queues give better
+	 * concurrency but less accurate LRU.
+	 */
+	clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+
 }
 
 #ifdef INVARIANTS
@@ -1129,10 +1313,25 @@ binsfree(struct buf *bp, int qindex)
 {
 	struct mtx *olock, *nlock;
 
-	BUF_ASSERT_XLOCKED(bp);
+	if (qindex != QUEUE_EMPTY) {
+		BUF_ASSERT_XLOCKED(bp);
+	}
+
+	/*
+	 * Stick to the same clean queue for the lifetime of the buf to
+	 * limit locking below.  Otherwise pick one sequentially.
+	 */
+	if (qindex == QUEUE_CLEAN) {
+		if (bqisclean(bp->b_qindex))
+			qindex = bp->b_qindex;
+		else
+			qindex = bqcleanq();
+	}
 
+	/*
+	 * Handle delayed bremfree() processing.
+	 */
 	nlock = bqlock(qindex);
-	/* Handle delayed bremfree() processing. */
 	if (bp->b_flags & B_REMFREE) {
 		olock = bqlock(bp->b_qindex);
 		mtx_lock(olock);
@@ -1156,15 +1355,263 @@ binsfree(struct buf *bp, int qindex)
 	bq_len[bp->b_qindex]++;
 #endif
 	mtx_unlock(nlock);
+}
+
+/*
+ * buf_free:
+ *
+ *	Free a buffer to the buf zone once it no longer has valid contents.
+ */
+static void
+buf_free(struct buf *bp)
+{
+
+	if (bp->b_flags & B_REMFREE)
+		bremfreef(bp);
+	if (bp->b_vflags & BV_BKGRDINPROG)
+		panic("losing buffer 1");
+	if (bp->b_rcred != NOCRED) {
+		crfree(bp->b_rcred);
+		bp->b_rcred = NOCRED;
+	}
+	if (bp->b_wcred != NOCRED) {
+		crfree(bp->b_wcred);
+		bp->b_wcred = NOCRED;
+	}
+	if (!LIST_EMPTY(&bp->b_dep))
+		buf_deallocate(bp);
+	bufkva_free(bp);
+	BUF_UNLOCK(bp);
+	uma_zfree(buf_zone, bp);
+	atomic_add_int(&numfreebuffers, 1);
+	bufspace_wakeup();
+}
+
+/*
+ * buf_import:
+ *
+ *	Import bufs into the uma cache from the buf list.  The system still
+ *	expects a static array of bufs and much of the synchronization
+ *	around bufs assumes type stable storage.  As a result, UMA is used
+ *	only as a per-cpu cache of bufs still maintained on a global list.
+ */
+static int
+buf_import(void *arg, void **store, int cnt, int flags)
+{
+	struct buf *bp;
+	int i;
+
+	mtx_lock(&bqlocks[QUEUE_EMPTY]);
+	for (i = 0; i < cnt; i++) {
+		bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+		if (bp == NULL)
+			break;
+		bremfreel(bp);
+		store[i] = bp;
+	}
+	mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+
+	return (i);
+}
+
+/*
+ * buf_release:
+ *
+ *	Release bufs from the uma cache back to the buffer queues.
+ */
+static void
+buf_release(void *arg, void **store, int cnt)
+{
+	int i;
+
+	for (i = 0; i < cnt; i++)
+		binsfree(store[i], QUEUE_EMPTY);
+}
+
+/*
+ * buf_alloc:
+ *
+ *	Allocate an empty buffer header.
+ */
+static struct buf *
+buf_alloc(void)
+{
+	struct buf *bp;
+
+	bp = uma_zalloc(buf_zone, M_NOWAIT);
+	if (bp == NULL) {
+		bufspace_daemonwakeup();
+		atomic_add_int(&numbufallocfails, 1);
+		return (NULL);
+	}
+
+	/*
+	 * Wake-up the bufspace daemon on transition.
+	 */
+	if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
+		bufspace_daemonwakeup();
+
+	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+		panic("buf_alloc: Locked buf %p on free queue.", bp);
+	
+	KASSERT(bp->b_vp == NULL,
+	    ("bp: %p still has vnode %p.", bp, bp->b_vp));
+	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
+	    ("invalid buffer %p flags %#x", bp, bp->b_flags));
+	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+	KASSERT(bp->b_npages == 0,
+	    ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
+	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
+	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+
+	bp->b_flags = 0;
+	bp->b_ioflags = 0;
+	bp->b_xflags = 0;
+	bp->b_vflags = 0;
+	bp->b_vp = NULL;
+	bp->b_blkno = bp->b_lblkno = 0;
+	bp->b_offset = NOOFFSET;
+	bp->b_iodone = 0;
+	bp->b_error = 0;
+	bp->b_resid = 0;
+	bp->b_bcount = 0;
+	bp->b_npages = 0;
+	bp->b_dirtyoff = bp->b_dirtyend = 0;
+	bp->b_bufobj = NULL;
+	bp->b_pin_count = 0;
+	bp->b_data = bp->b_kvabase = unmapped_buf;
+	bp->b_fsprivate1 = NULL;
+	bp->b_fsprivate2 = NULL;
+	bp->b_fsprivate3 = NULL;
+	LIST_INIT(&bp->b_dep);
+
+	return (bp);
+}
+
+/*
+ *	buf_qrecycle:
+ *
+ *	Free a buffer from the given bufqueue.  kva controls whether the
+ *	freed buf must own some kva resources.  This is used for
+ *	defragmenting.
+ */
+static int
+buf_qrecycle(int qindex, bool kva)
+{
+	struct buf *bp, *nbp;
+
+	if (kva)
+		atomic_add_int(&bufdefragcnt, 1);
+	nbp = NULL;
+	mtx_lock(&bqlocks[qindex]);
+	nbp = TAILQ_FIRST(&bufqueues[qindex]);
+
+	/*
+	 * Run scan, possibly freeing data and/or kva mappings on the fly
+	 * depending.
+	 * depending on whether we are defragmenting.
+	while ((bp = nbp) != NULL) {
+		/*
+		 * Calculate next bp (we can only use it if we do not
+		 * release the bqlock).
+		 */
+		nbp = TAILQ_NEXT(bp, b_freelist);
+
+		/*
+		 * If we are defragging then we need a buffer with 
+		 * some kva to reclaim.
+		 */
+		if (kva && bp->b_kvasize == 0)
+			continue;
+
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+			continue;
+
+		/*
+		 * Skip buffers with background writes in progress.
+		 */
+		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
+			BUF_UNLOCK(bp);
+			continue;
+		}
+
+		KASSERT(bp->b_qindex == qindex,
+		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+		/*
+		 * NOTE:  nbp is now entirely invalid.  We can only restart
+		 * the scan from this point on.
+		 */
+		bremfreel(bp);
+		mtx_unlock(&bqlocks[qindex]);
+
+		/*
+		 * Requeue the background write buffer with error and
+		 * restart the scan.
+		 */
+		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
+			bqrelse(bp);
+			mtx_lock(&bqlocks[qindex]);
+			nbp = TAILQ_FIRST(&bufqueues[qindex]);
+			continue;
+		}
+		bp->b_flags |= B_INVAL;
+		brelse(bp);
+		return (0);
+	}
+	mtx_unlock(&bqlocks[qindex]);
+
+	return (ENOBUFS);
+}
+
+/*
+ *	buf_recycle:
+ *
+ *	Iterate through all clean queues until we find a buf to recycle or
+ *	exhaust the search.
+ */
+static int
+buf_recycle(bool kva)
+{
+	int qindex, first_qindex;
+
+	qindex = first_qindex = bqcleanq();
+	do {
+		if (buf_qrecycle(qindex, kva) == 0)
+			return (0);
+		if (++qindex == QUEUE_CLEAN + clean_queues)
+			qindex = QUEUE_CLEAN;
+	} while (qindex != first_qindex);
+
+	return (ENOBUFS);
+}
+
+/*
+ *	buf_scan:
+ *
+ *	Scan the clean queues looking for a buffer to recycle.  needsbuffer
+ *	is set on failure so that the caller may optionally bufspace_wait()
+ *	in a race-free fashion.
+ */
+static int
+buf_scan(bool defrag)
+{
+	int error;
 
 	/*
-	 * Something we can maybe free or reuse.
-	 */
-	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
-		bufspacewakeup();
-
-	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
-		bufcountadd(bp);
+	 * To avoid heavy synchronization and wakeup races we set
+	 * needsbuffer and re-poll before failing.  This ensures that
+	 * no frees can be missed between an unsuccessful poll and
+	 * going to sleep in a synchronized fashion.
+	 */

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
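
The space-accounting pattern that the new bufspace_reserve() and
bufspace_adjust() shown above rely on can be restated compactly: a
compare-and-swap loop refuses to let the running total cross the hard limit,
and the space daemon is woken only on the transition across bufspacethresh
rather than on every allocation.  The sketch below is a plain C11
restatement with invented names and sizes, not the kernel code.

/*
 * Standalone sketch of the reserve/adjust pattern from the new
 * bufspace_reserve()/bufspace_adjust(): a CAS loop refuses to let the
 * running total cross the hard limit, and the daemon is woken only on
 * the transition across the threshold.  Names and sizes are invented
 * for the example; this is not the kernel implementation.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_long space;			/* bytes currently reserved */
static const long limit = 1024 * 1024;		/* hard cap, cf. hibufspace */
static const long thresh = 768 * 1024;		/* wakeup point, cf. bufspacethresh */

static void
daemon_wakeup(void)
{

	printf("wake the space daemon\n");
}

/* Reserve 'size' bytes or fail with ENOSPC; never overshoot the limit. */
static int
space_reserve(long size)
{
	long old;

	do {
		old = atomic_load(&space);
		if (old + size > limit)
			return (ENOSPC);
	} while (!atomic_compare_exchange_weak(&space, &old, old + size));

	/* Wake the daemon only when this reservation crosses the threshold. */
	if (old < thresh && old + size >= thresh)
		daemon_wakeup();
	return (0);
}

/* Give space back, as bufspace_release()/bufspace_adjust() do on shrink. */
static void
space_release(long size)
{

	atomic_fetch_sub(&space, size);
}

int
main(void)
{
	long i;

	for (i = 0; i < 20; i++)
		if (space_reserve(64 * 1024) != 0)
			printf("reservation %ld failed: ENOSPC\n", i);
	space_release(128 * 1024);
	return (0);
}

Waking only at the crossing is what keeps the daemon asleep while space is
plentiful and still gets it running before callers have to block in
bufspace_wait().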

