svn commit: r289279 - in head/sys: kern vm

Adrian Chadd adrian.chadd at gmail.com
Mon Nov 2 03:20:15 UTC 2015


hiya jeff,

this broke low-memory, no-swap boards (e.g. MIPS).

On a MIPS board (carambola2) with 32MB of RAM, just scp'ing a kernel
into the rootfs on USB hangs the system. After doing some digging, I
found this:


INTERNAL: Allocating one item from buf free cache(0x83fea7e0)
uma_zalloc_arg: Bucketzone returned NULL
INTERNAL: Allocating one item from buf free cache(0x83fea7e0)
uma_zalloc_arg: Bucketzone returned NULL

.. and it was stuck in a loop: trying to allocate a buf, failing, and
trying again.
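
To make the failure mode concrete, here's a rough userspace sketch of
what I *think* is going on (an assumption on my part, not the actual
vfs_bio.c / UMA code): a non-blocking allocation that keeps failing on a
memory-starved board, with the caller retrying instead of sleeping or
freeing anything, so it never makes progress:

/*
 * Rough userspace sketch of the suspected livelock (assumption, not the
 * real kernel code): a non-blocking allocation that always fails on a
 * memory-starved box, with the caller retrying instead of sleeping.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Stands in for the exhausted bucket zone on a 32MB, no-swap board. */
static bool bucket_zone_exhausted = true;

/* Stands in for uma_zalloc(buf_zone, M_NOWAIT). */
static void *
fake_buf_alloc(void)
{
	if (bucket_zone_exhausted) {
		printf("uma_zalloc_arg: Bucketzone returned NULL\n");
		return (NULL);
	}
	return (malloc(1));
}

int
main(void)
{
	void *bp;
	int tries;

	/* Cap the retries here; the kernel loop has no such cap. */
	for (tries = 0; tries < 4; tries++) {
		bp = fake_buf_alloc();
		if (bp != NULL) {
			free(bp);
			return (0);
		}
		/*
		 * The real caller would poke the bufspace daemon and try
		 * again, but with no swap and no clean bufs to recycle
		 * that never makes progress -- hence the hang.
		 */
	}
	printf("no progress after %d tries\n", tries);
	return (1);
}

(The file name and helpers are made up; build it with cc and it prints
the same NULL message a few times before giving up.)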

I'll see if I can reproduce it under qemu with sufficiently low RAM,
so you don't need a MIPS router to see it.

It's sufficient to just start the scp; it runs out of RAM within a
couple of seconds.

Any ideas?



-adrian



On 13 October 2015 at 19:10, Jeff Roberson <jeff at freebsd.org> wrote:
> Author: jeff
> Date: Wed Oct 14 02:10:07 2015
> New Revision: 289279
> URL: https://svnweb.freebsd.org/changeset/base/289279
>
> Log:
>   Parallelize the buffer cache and rewrite getnewbuf().  This results in an
>   8x performance improvement in a micro-benchmark on a 4-socket machine.
>
>    - Get buffer headers from a per-cpu uma cache that sits in front of the
>      free queue.
>    - Use a per-cpu quantum cache in vmem to eliminate contention for kva.
>    - Use multiple clean queues according to buffer cache size to eliminate
>      clean queue lock contention.
>    - Introduce a bufspace daemon that attempts to prevent getnewbuf() callers
>      from blocking or doing direct recycling.
>    - Close some bufspace allocation races that could lead to endless
>      recycling.
>    - Further the transition to a more modern style of small functions grouped
>      by prefix in order to manage growing complexity.
>
>   Sponsored by: EMC / Isilon
>   Reviewed by:  kib
>   Tested by:    pho
>
> Modified:
>   head/sys/kern/vfs_bio.c
>   head/sys/vm/vm_init.c
>
> Modified: head/sys/kern/vfs_bio.c
> ==============================================================================
> --- head/sys/kern/vfs_bio.c     Wed Oct 14 00:43:29 2015        (r289278)
> +++ head/sys/kern/vfs_bio.c     Wed Oct 14 02:10:07 2015        (r289279)
> @@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/proc.h>
>  #include <sys/resourcevar.h>
>  #include <sys/rwlock.h>
> +#include <sys/smp.h>
>  #include <sys/sysctl.h>
>  #include <sys/sysproto.h>
>  #include <sys/vmem.h>
> @@ -100,6 +101,7 @@ caddr_t unmapped_buf;
>
>  /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
>  struct proc *bufdaemonproc;
> +struct proc *bufspacedaemonproc;
>
>  static int inmem(struct vnode *vp, daddr_t blkno);
>  static void vm_hold_free_pages(struct buf *bp, int newbsize);
> @@ -116,11 +118,18 @@ static void vfs_vmio_extend(struct buf *
>  static int vfs_bio_clcheck(struct vnode *vp, int size,
>                 daddr_t lblkno, daddr_t blkno);
>  static int buf_flush(struct vnode *vp, int);
> +static int buf_recycle(bool);
> +static int buf_scan(bool);
>  static int flushbufqueues(struct vnode *, int, int);
>  static void buf_daemon(void);
>  static void bremfreel(struct buf *bp);
>  static __inline void bd_wakeup(void);
>  static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
> +static void bufkva_reclaim(vmem_t *, int);
> +static void bufkva_free(struct buf *);
> +static int buf_import(void *, void **, int, int);
> +static void buf_release(void *, void **, int);
> +
>  #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
>      defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
>  static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
> @@ -145,23 +154,23 @@ static long bufkvaspace;
>  SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
>      "Kernel virtual memory used for buffers");
>  static long maxbufspace;
> -SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
> -    "Maximum allowed value of bufspace (including buf_daemon)");
> +SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
> +    "Maximum allowed value of bufspace (including metadata)");
>  static long bufmallocspace;
>  SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
>      "Amount of malloced memory for buffers");
>  static long maxbufmallocspace;
> -SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
> -    "Maximum amount of malloced memory for buffers");
> +SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
> +    0, "Maximum amount of malloced memory for buffers");
>  static long lobufspace;
> -SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
> +SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
>      "Minimum amount of buffers we want to have");
>  long hibufspace;
> -SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
> -    "Maximum allowed value of bufspace (excluding buf_daemon)");
> -static int bufreusecnt;
> -SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
> -    "Number of times we have reused a buffer");
> +SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
> +    "Maximum allowed value of bufspace (excluding metadata)");
> +long bufspacethresh;
> +SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
> +    0, "Bufspace consumed before waking the daemon to free some");
>  static int buffreekvacnt;
>  SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
>      "Number of times we have freed the KVA space from some buffer");
> @@ -205,10 +214,10 @@ SYSCTL_INT(_vfs, OID_AUTO, numfreebuffer
>      "Number of free buffers");
>  static int lofreebuffers;
>  SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
> -   "XXX Unused");
> +   "Target number of free buffers");
>  static int hifreebuffers;
>  SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
> -   "XXX Complicatedly unused");
> +   "Threshold for clean buffer recycling");
>  static int getnewbufcalls;
>  SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
>     "Number of calls to getnewbuf");
> @@ -219,6 +228,9 @@ static int mappingrestarts;
>  SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
>      "Number of times getblk has had to restart a buffer mapping for "
>      "unmapped buffer");
> +static int numbufallocfails;
> +SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
> +    "Number of times buffer allocations failed");
>  static int flushbufqtarget = 100;
>  SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
>      "Amount of work to do in flushbufqueues when helping bufdaemon");
> @@ -233,16 +245,6 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_
>      "Permit the use of the unmapped i/o");
>
>  /*
> - * Lock for the non-dirty bufqueues
> - */
> -static struct mtx_padalign bqclean;
> -
> -/*
> - * Lock for the dirty queue.
> - */
> -static struct mtx_padalign bqdirty;
> -
> -/*
>   * This lock synchronizes access to bd_request.
>   */
>  static struct mtx_padalign bdlock;
> @@ -271,6 +273,11 @@ static struct mtx_padalign bdirtylock;
>  static int bd_request;
>
>  /*
> + * Request/wakeup point for the bufspace daemon.
> + */
> +static int bufspace_request;
> +
> +/*
>   * Request for the buf daemon to write more buffers than is indicated by
>   * lodirtybuf.  This may be necessary to push out excess dependencies or
>   * defragment the address space where a simple count of the number of dirty
> @@ -298,7 +305,7 @@ static int runningbufreq;
>   * Synchronization (sleep/wakeup) variable for buffer requests.
>   * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
>   * by and/or.
> - * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
> + * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
>   * getnewbuf(), and getblk().
>   */
>  static volatile int needsbuffer;
> @@ -311,14 +318,21 @@ static int bdirtywait;
>  /*
>   * Definitions for the buffer free lists.
>   */
> -#define BUFFER_QUEUES  4       /* number of free buffer queues */
> -
>  #define QUEUE_NONE     0       /* on no queue */
> -#define QUEUE_CLEAN    1       /* non-B_DELWRI buffers */
> +#define QUEUE_EMPTY    1       /* empty buffer headers */
>  #define QUEUE_DIRTY    2       /* B_DELWRI buffers */
> -#define QUEUE_EMPTY    3       /* empty buffer headers */
> +#define QUEUE_CLEAN    3       /* non-B_DELWRI buffers */
>  #define QUEUE_SENTINEL 1024    /* not an queue index, but mark for sentinel */
>
> +/* Maximum number of clean buffer queues. */
> +#define        CLEAN_QUEUES    16
> +
> +/* Configured number of clean queues. */
> +static int clean_queues;
> +
> +/* Maximum number of buffer queues. */
> +#define BUFFER_QUEUES  (QUEUE_CLEAN + CLEAN_QUEUES)
> +
>  /* Queues for free buffers with various properties */
>  static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
>  #ifdef INVARIANTS
> @@ -326,15 +340,21 @@ static int bq_len[BUFFER_QUEUES];
>  #endif
>
>  /*
> + * Lock for each bufqueue
> + */
> +static struct mtx_padalign bqlocks[BUFFER_QUEUES];
> +
> +/*
> + * per-cpu empty buffer cache.
> + */
> +uma_zone_t buf_zone;
> +
> +/*
>   * Single global constant for BUF_WMESG, to avoid getting multiple references.
>   * buf_wmesg is referred from macros.
>   */
>  const char *buf_wmesg = BUF_WMESG;
>
> -#define VFS_BIO_NEED_ANY       0x01    /* any freeable buffer */
> -#define VFS_BIO_NEED_FREE      0x04    /* wait for free bufs, hi hysteresis */
> -#define VFS_BIO_NEED_BUFSPACE  0x08    /* wait for buf space, lo hysteresis */
> -
>  static int
>  sysctl_runningspace(SYSCTL_HANDLER_ARGS)
>  {
> @@ -382,6 +402,21 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
>  }
>  #endif
>
> +static int
> +bqcleanq(void)
> +{
> +       static int nextq;
> +
> +       return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
> +}
> +
> +static int
> +bqisclean(int qindex)
> +{
> +
> +       return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
> +}
> +
>  /*
>   *     bqlock:
>   *
> @@ -391,9 +426,7 @@ static inline struct mtx *
>  bqlock(int qindex)
>  {
>
> -       if (qindex == QUEUE_DIRTY)
> -               return (struct mtx *)(&bqdirty);
> -       return (struct mtx *)(&bqclean);
> +       return (struct mtx *)&bqlocks[qindex];
>  }
>
>  /*
> @@ -447,62 +480,255 @@ bdirtyadd(void)
>  }
>
>  /*
> - *     bufspacewakeup:
> + *     bufspace_wakeup:
>   *
>   *     Called when buffer space is potentially available for recovery.
>   *     getnewbuf() will block on this flag when it is unable to free
>   *     sufficient buffer space.  Buffer space becomes recoverable when
>   *     bp's get placed back in the queues.
>   */
> -static __inline void
> -bufspacewakeup(void)
> +static void
> +bufspace_wakeup(void)
>  {
> -       int need_wakeup, on;
>
>         /*
> -        * If someone is waiting for bufspace, wake them up.  Even
> -        * though we may not have freed the kva space yet, the waiting
> -        * process will be able to now.
> +        * If someone is waiting for bufspace, wake them up.
> +        *
> +        * Since needsbuffer is set prior to doing an additional queue
> +        * scan it is safe to check for the flag prior to acquiring the
> +        * lock.  The thread that is preparing to scan again before
> +        * blocking would discover the buf we released.
>          */
> +       if (needsbuffer) {
> +               rw_rlock(&nblock);
> +               if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
> +                       wakeup(__DEVOLATILE(void *, &needsbuffer));
> +               rw_runlock(&nblock);
> +       }
> +}
> +
> +/*
> + *     bufspace_daemonwakeup:
> + *
> + *     Wakeup the daemon responsible for freeing clean bufs.
> + */
> +static void
> +bufspace_daemonwakeup(void)
> +{
>         rw_rlock(&nblock);
> -       for (;;) {
> -               need_wakeup = 0;
> -               on = needsbuffer;
> -               if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
> -                       break;
> -               need_wakeup = 1;
> -               if (atomic_cmpset_rel_int(&needsbuffer, on,
> -                   on & ~VFS_BIO_NEED_BUFSPACE))
> -                       break;
> +       if (bufspace_request == 0) {
> +               bufspace_request = 1;
> +               wakeup(&bufspace_request);
>         }
> -       if (need_wakeup)
> -               wakeup(__DEVOLATILE(void *, &needsbuffer));
>         rw_runlock(&nblock);
>  }
>
>  /*
> - *     bufspaceadjust:
> + *     bufspace_adjust:
>   *
>   *     Adjust the reported bufspace for a KVA managed buffer, possibly
>   *     waking any waiters.
>   */
>  static void
> -bufspaceadjust(struct buf *bp, int bufsize)
> +bufspace_adjust(struct buf *bp, int bufsize)
>  {
> +       long space;
>         int diff;
>
>         KASSERT((bp->b_flags & B_MALLOC) == 0,
> -           ("bufspaceadjust: malloc buf %p", bp));
> +           ("bufspace_adjust: malloc buf %p", bp));
>         diff = bufsize - bp->b_bufsize;
>         if (diff < 0) {
>                 atomic_subtract_long(&bufspace, -diff);
> -               bufspacewakeup();
> -       } else
> -               atomic_add_long(&bufspace, diff);
> +               bufspace_wakeup();
> +       } else {
> +               space = atomic_fetchadd_long(&bufspace, diff);
> +               /* Wake up the daemon on the transition. */
> +               if (space < bufspacethresh && space + diff >= bufspacethresh)
> +                       bufspace_daemonwakeup();
> +       }
>         bp->b_bufsize = bufsize;
>  }
>
>  /*
> + *     bufspace_reserve:
> + *
> + *     Reserve bufspace before calling allocbuf().  metadata has a
> + *     different space limit than data.
> + */
> +static int
> +bufspace_reserve(int size, bool metadata)
> +{
> +       long limit;
> +       long space;
> +
> +       if (metadata)
> +               limit = maxbufspace;
> +       else
> +               limit = hibufspace;
> +       do {
> +               space = bufspace;
> +               if (space + size > limit)
> +                       return (ENOSPC);
> +       } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
> +
> +       /* Wake up the daemon on the transition. */
> +       if (space < bufspacethresh && space + size >= bufspacethresh)
> +               bufspace_daemonwakeup();
> +
> +       return (0);
> +}
> +
> +/*
> + *     bufspace_release:
> + *
> + *     Release reserved bufspace after bufspace_adjust() has consumed it.
> + */
> +static void
> +bufspace_release(int size)
> +{
> +       atomic_subtract_long(&bufspace, size);
> +       bufspace_wakeup();
> +}
> +
> +/*
> + *     bufspace_wait:
> + *
> + *     Wait for bufspace, acting as the buf daemon if a locked vnode is
> + *     supplied.  needsbuffer must be set in a safe fashion prior to
> + *     polling for space.  The operation must be re-tried on return.
> + */
> +static void
> +bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
> +{
> +       struct thread *td;
> +       int error, fl, norunbuf;
> +
> +       if ((gbflags & GB_NOWAIT_BD) != 0)
> +               return;
> +
> +       td = curthread;
> +       rw_wlock(&nblock);
> +       while (needsbuffer != 0) {
> +               if (vp != NULL && vp->v_type != VCHR &&
> +                   (td->td_pflags & TDP_BUFNEED) == 0) {
> +                       rw_wunlock(&nblock);
> +                       /*
> +                        * getblk() is called with a vnode locked, and
> +                        * some majority of the dirty buffers may as
> +                        * well belong to the vnode.  Flushing the
> +                        * buffers there would make a progress that
> +                        * cannot be achieved by the buf_daemon, that
> +                        * cannot lock the vnode.
> +                        */
> +                       norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
> +                           (td->td_pflags & TDP_NORUNNINGBUF);
> +
> +                       /*
> +                        * Play bufdaemon.  The getnewbuf() function
> +                        * may be called while the thread owns lock
> +                        * for another dirty buffer for the same
> +                        * vnode, which makes it impossible to use
> +                        * VOP_FSYNC() there, due to the buffer lock
> +                        * recursion.
> +                        */
> +                       td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
> +                       fl = buf_flush(vp, flushbufqtarget);
> +                       td->td_pflags &= norunbuf;
> +                       rw_wlock(&nblock);
> +                       if (fl != 0)
> +                               continue;
> +                       if (needsbuffer == 0)
> +                               break;
> +               }
> +               error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
> +                   (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
> +               if (error != 0)
> +                       break;
> +       }
> +       rw_wunlock(&nblock);
> +}
> +
> +
> +/*
> + *     bufspace_daemon:
> + *
> + *     buffer space management daemon.  Tries to maintain some marginal
> + *     amount of free buffer space so that requesting processes neither
> + *     block nor work to reclaim buffers.
> + */
> +static void
> +bufspace_daemon(void)
> +{
> +       for (;;) {
> +               kproc_suspend_check(bufspacedaemonproc);
> +
> +               /*
> +                * Free buffers from the clean queue until we meet our
> +                * targets.
> +                *
> +                * Theory of operation:  The buffer cache is most efficient
> +                * when some free buffer headers and space are always
> +                * available to getnewbuf().  This daemon attempts to prevent
> +                * the excessive blocking and synchronization associated
> +                * with shortfall.  It goes through three phases according
>                 * to demand:
> +                *
> +                * 1)   The daemon wakes up voluntarily once per-second
> +                *      during idle periods when the counters are below
> +                *      the wakeup thresholds (bufspacethresh, lofreebuffers).
> +                *
> +                * 2)   The daemon wakes up as we cross the thresholds
> +                *      ahead of any potential blocking.  This may bounce
> +                *      slightly according to the rate of consumption and
> +                *      release.
> +                *
> +                * 3)   The daemon and consumers are starved for working
> +                *      clean buffers.  This is the 'bufspace' sleep below
> +                *      which will inefficiently trade bufs with bqrelse
> +                *      until we return to condition 2.
> +                */
> +               while (bufspace > lobufspace ||
> +                   numfreebuffers < hifreebuffers) {
> +                       if (buf_recycle(false) != 0) {
> +                               atomic_set_int(&needsbuffer, 1);
> +                               if (buf_recycle(false) != 0) {
> +                                       rw_wlock(&nblock);
> +                                       if (needsbuffer)
> +                                               rw_sleep(__DEVOLATILE(void *,
> +                                                   &needsbuffer), &nblock,
> +                                                   PRIBIO|PDROP, "bufspace",
> +                                                   hz/10);
> +                                       else
> +                                               rw_wunlock(&nblock);
> +                               }
> +                       }
> +                       maybe_yield();
> +               }
> +
> +               /*
> +                * Re-check our limits under the exclusive nblock.
> +                */
> +               rw_wlock(&nblock);
> +               if (bufspace < bufspacethresh &&
> +                   numfreebuffers > lofreebuffers) {
> +                       bufspace_request = 0;
> +                       rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
> +                           "-", hz);
> +               } else
> +                       rw_wunlock(&nblock);
> +       }
> +}
> +
> +static struct kproc_desc bufspace_kp = {
> +       "bufspacedaemon",
> +       bufspace_daemon,
> +       &bufspacedaemonproc
> +};
> +SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
> +    &bufspace_kp);
> +
> +/*
>   *     bufmallocadjust:
>   *
>   *     Adjust the reported bufspace for a malloc managed buffer, possibly
> @@ -516,10 +742,9 @@ bufmallocadjust(struct buf *bp, int bufs
>         KASSERT((bp->b_flags & B_MALLOC) != 0,
>             ("bufmallocadjust: non-malloc buf %p", bp));
>         diff = bufsize - bp->b_bufsize;
> -       if (diff < 0) {
> +       if (diff < 0)
>                 atomic_subtract_long(&bufmallocspace, -diff);
> -               bufspacewakeup();
> -       } else
> +       else
>                 atomic_add_long(&bufmallocspace, diff);
>         bp->b_bufsize = bufsize;
>  }
> @@ -571,67 +796,6 @@ runningbufwakeup(struct buf *bp)
>  }
>
>  /*
> - *     bufcountadd:
> - *
> - *     Called when a buffer has been added to one of the free queues to
> - *     account for the buffer and to wakeup anyone waiting for free buffers.
> - *     This typically occurs when large amounts of metadata are being handled
> - *     by the buffer cache ( else buffer space runs out first, usually ).
> - */
> -static __inline void
> -bufcountadd(struct buf *bp)
> -{
> -       int mask, need_wakeup, old, on;
> -
> -       KASSERT((bp->b_flags & B_INFREECNT) == 0,
> -           ("buf %p already counted as free", bp));
> -       bp->b_flags |= B_INFREECNT;
> -       old = atomic_fetchadd_int(&numfreebuffers, 1);
> -       KASSERT(old >= 0 && old < nbuf,
> -           ("numfreebuffers climbed to %d", old + 1));
> -       mask = VFS_BIO_NEED_ANY;
> -       if (numfreebuffers >= hifreebuffers)
> -               mask |= VFS_BIO_NEED_FREE;
> -       rw_rlock(&nblock);
> -       for (;;) {
> -               need_wakeup = 0;
> -               on = needsbuffer;
> -               if (on == 0)
> -                       break;
> -               need_wakeup = 1;
> -               if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
> -                       break;
> -       }
> -       if (need_wakeup)
> -               wakeup(__DEVOLATILE(void *, &needsbuffer));
> -       rw_runlock(&nblock);
> -}
> -
> -/*
> - *     bufcountsub:
> - *
> - *     Decrement the numfreebuffers count as needed.
> - */
> -static void
> -bufcountsub(struct buf *bp)
> -{
> -       int old;
> -
> -       /*
> -        * Fixup numfreebuffers count.  If the buffer is invalid or not
> -        * delayed-write, the buffer was free and we must decrement
> -        * numfreebuffers.
> -        */
> -       if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
> -               KASSERT((bp->b_flags & B_INFREECNT) != 0,
> -                   ("buf %p not counted in numfreebuffers", bp));
> -               bp->b_flags &= ~B_INFREECNT;
> -               old = atomic_fetchadd_int(&numfreebuffers, -1);
> -               KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
> -       }
> -}
> -
> -/*
>   *     waitrunningbufspace()
>   *
>   *     runningbufspace is a measure of the amount of I/O currently
> @@ -847,8 +1011,10 @@ bufinit(void)
>         int i;
>
>         CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
> -       mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
> -       mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
> +       mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
> +       mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
> +       for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
> +               mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
>         mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
>         rw_init(&nblock, "needsbuffer lock");
>         mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
> @@ -864,7 +1030,7 @@ bufinit(void)
>         for (i = 0; i < nbuf; i++) {
>                 bp = &buf[i];
>                 bzero(bp, sizeof *bp);
> -               bp->b_flags = B_INVAL | B_INFREECNT;
> +               bp->b_flags = B_INVAL;
>                 bp->b_rcred = NOCRED;
>                 bp->b_wcred = NOCRED;
>                 bp->b_qindex = QUEUE_EMPTY;
> @@ -881,18 +1047,19 @@ bufinit(void)
>         /*
>          * maxbufspace is the absolute maximum amount of buffer space we are
>          * allowed to reserve in KVM and in real terms.  The absolute maximum
> -        * is nominally used by buf_daemon.  hibufspace is the nominal maximum
> -        * used by most other processes.  The differential is required to
> -        * ensure that buf_daemon is able to run when other processes might
> -        * be blocked waiting for buffer space.
> +        * is nominally used by metadata.  hibufspace is the nominal maximum
> +        * used by most other requests.  The differential is required to
> +        * ensure that metadata deadlocks don't occur.
>          *
>          * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
>          * this may result in KVM fragmentation which is not handled optimally
> -        * by the system.
> +        * by the system. XXX This is less true with vmem.  We could use
> +        * PAGE_SIZE.
>          */
>         maxbufspace = (long)nbuf * BKVASIZE;
>         hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
> -       lobufspace = hibufspace - MAXBCACHEBUF;
> +       lobufspace = (hibufspace / 20) * 19; /* 95% */
> +       bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
>
>         /*
>          * Note: The 16 MiB upper limit for hirunningspace was chosen
> @@ -906,44 +1073,61 @@ bufinit(void)
>             16 * 1024 * 1024), 1024 * 1024);
>         lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
>
> -/*
> - * Limit the amount of malloc memory since it is wired permanently into
> - * the kernel space.  Even though this is accounted for in the buffer
> - * allocation, we don't want the malloced region to grow uncontrolled.
> - * The malloc scheme improves memory utilization significantly on average
> - * (small) directories.
> - */
> +       /*
> +        * Limit the amount of malloc memory since it is wired permanently into
> +        * the kernel space.  Even though this is accounted for in the buffer
> +        * allocation, we don't want the malloced region to grow uncontrolled.
> +        * The malloc scheme improves memory utilization significantly on
> +        * average (small) directories.
> +        */
>         maxbufmallocspace = hibufspace / 20;
>
> -/*
> - * Reduce the chance of a deadlock occuring by limiting the number
> - * of delayed-write dirty buffers we allow to stack up.
> - */
> +       /*
> +        * Reduce the chance of a deadlock occurring by limiting the number
> +        * of delayed-write dirty buffers we allow to stack up.
> +        */
>         hidirtybuffers = nbuf / 4 + 20;
>         dirtybufthresh = hidirtybuffers * 9 / 10;
>         numdirtybuffers = 0;
> -/*
> - * To support extreme low-memory systems, make sure hidirtybuffers cannot
> - * eat up all available buffer space.  This occurs when our minimum cannot
> - * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
> - * BKVASIZE'd buffers.
> - */
> +       /*
> +        * To support extreme low-memory systems, make sure hidirtybuffers
> +        * cannot eat up all available buffer space.  This occurs when our
> +        * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
> +        * buffer space assuming BKVASIZE'd buffers.
> +        */
>         while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
>                 hidirtybuffers >>= 1;
>         }
>         lodirtybuffers = hidirtybuffers / 2;
>
> -/*
> - * Try to keep the number of free buffers in the specified range,
> - * and give special processes (e.g. like buf_daemon) access to an
> - * emergency reserve.
> - */
> -       lofreebuffers = nbuf / 18 + 5;
> -       hifreebuffers = 2 * lofreebuffers;
> +       /*
> +        * lofreebuffers should be sufficient to avoid stalling waiting on
> +        * buf headers under heavy utilization.  The bufs in per-cpu caches
> +        * are counted as free but will be unavailable to threads executing
> +        * on other cpus.
> +        *
> +        * hifreebuffers is the free target for the bufspace daemon.  This
> +        * should be set appropriately to limit work per-iteration.
> +        */
> +       lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
> +       hifreebuffers = (3 * lofreebuffers) / 2;
>         numfreebuffers = nbuf;
>
>         bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
>             VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
> +
> +       /* Setup the kva and free list allocators. */
> +       vmem_set_reclaim(buffer_arena, bufkva_reclaim);
> +       buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
> +           NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
> +
> +       /*
> +        * Size the clean queue according to the amount of buffer space.
> +        * One queue per-256mb up to the max.  More queues gives better
> +        * concurrency but less accurate LRU.
> +        */
> +       clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
> +
>  }
>
>  #ifdef INVARIANTS
> @@ -1129,10 +1313,25 @@ binsfree(struct buf *bp, int qindex)
>  {
>         struct mtx *olock, *nlock;
>
> -       BUF_ASSERT_XLOCKED(bp);
> +       if (qindex != QUEUE_EMPTY) {
> +               BUF_ASSERT_XLOCKED(bp);
> +       }
> +
> +       /*
> +        * Stick to the same clean queue for the lifetime of the buf to
> +        * limit locking below.  Otherwise pick one sequentially.
> +        */
> +       if (qindex == QUEUE_CLEAN) {
> +               if (bqisclean(bp->b_qindex))
> +                       qindex = bp->b_qindex;
> +               else
> +                       qindex = bqcleanq();
> +       }
>
> +       /*
> +        * Handle delayed bremfree() processing.
> +        */
>         nlock = bqlock(qindex);
> -       /* Handle delayed bremfree() processing. */
>         if (bp->b_flags & B_REMFREE) {
>                 olock = bqlock(bp->b_qindex);
>                 mtx_lock(olock);
> @@ -1156,15 +1355,263 @@ binsfree(struct buf *bp, int qindex)
>         bq_len[bp->b_qindex]++;
>  #endif
>         mtx_unlock(nlock);
> +}
> +
> +/*
> + * buf_free:
> + *
> + *     Free a buffer to the buf zone once it no longer has valid contents.
> + */
> +static void
> +buf_free(struct buf *bp)
> +{
> +
> +       if (bp->b_flags & B_REMFREE)
> +               bremfreef(bp);
> +       if (bp->b_vflags & BV_BKGRDINPROG)
> +               panic("losing buffer 1");
> +       if (bp->b_rcred != NOCRED) {
> +               crfree(bp->b_rcred);
> +               bp->b_rcred = NOCRED;
> +       }
> +       if (bp->b_wcred != NOCRED) {
> +               crfree(bp->b_wcred);
> +               bp->b_wcred = NOCRED;
> +       }
> +       if (!LIST_EMPTY(&bp->b_dep))
> +               buf_deallocate(bp);
> +       bufkva_free(bp);
> +       BUF_UNLOCK(bp);
> +       uma_zfree(buf_zone, bp);
> +       atomic_add_int(&numfreebuffers, 1);
> +       bufspace_wakeup();
> +}
> +
> +/*
> + * buf_import:
> + *
> + *     Import bufs into the uma cache from the buf list.  The system still
> + *     expects a static array of bufs and much of the synchronization
> + *     around bufs assumes type stable storage.  As a result, UMA is used
> + *     only as a per-cpu cache of bufs still maintained on a global list.
> + */
> +static int
> +buf_import(void *arg, void **store, int cnt, int flags)
> +{
> +       struct buf *bp;
> +       int i;
> +
> +       mtx_lock(&bqlocks[QUEUE_EMPTY]);
> +       for (i = 0; i < cnt; i++) {
> +               bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
> +               if (bp == NULL)
> +                       break;
> +               bremfreel(bp);
> +               store[i] = bp;
> +       }
> +       mtx_unlock(&bqlocks[QUEUE_EMPTY]);
> +
> +       return (i);
> +}
> +
> +/*
> + * buf_release:
> + *
> + *     Release bufs from the uma cache back to the buffer queues.
> + */
> +static void
> +buf_release(void *arg, void **store, int cnt)
> +{
> +        int i;
> +
> +        for (i = 0; i < cnt; i++)
> +               binsfree(store[i], QUEUE_EMPTY);
> +}
> +
> +/*
> + * buf_alloc:
> + *
> + *     Allocate an empty buffer header.
> + */
> +static struct buf *
> +buf_alloc(void)
> +{
> +       struct buf *bp;
> +
> +       bp = uma_zalloc(buf_zone, M_NOWAIT);
> +       if (bp == NULL) {
> +               bufspace_daemonwakeup();
> +               atomic_add_int(&numbufallocfails, 1);
> +               return (NULL);
> +       }
> +
> +       /*
> +        * Wake-up the bufspace daemon on transition.
> +        */
> +       if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
> +               bufspace_daemonwakeup();
> +
> +       if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
> +               panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
> +
> +       KASSERT(bp->b_vp == NULL,
> +           ("bp: %p still has vnode %p.", bp, bp->b_vp));
> +       KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
> +           ("invalid buffer %p flags %#x", bp, bp->b_flags));
> +       KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
> +           ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
> +       KASSERT(bp->b_npages == 0,
> +           ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
> +       KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
> +       KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
> +
> +       bp->b_flags = 0;
> +       bp->b_ioflags = 0;
> +       bp->b_xflags = 0;
> +       bp->b_vflags = 0;
> +       bp->b_vp = NULL;
> +       bp->b_blkno = bp->b_lblkno = 0;
> +       bp->b_offset = NOOFFSET;
> +       bp->b_iodone = 0;
> +       bp->b_error = 0;
> +       bp->b_resid = 0;
> +       bp->b_bcount = 0;
> +       bp->b_npages = 0;
> +       bp->b_dirtyoff = bp->b_dirtyend = 0;
> +       bp->b_bufobj = NULL;
> +       bp->b_pin_count = 0;
> +       bp->b_data = bp->b_kvabase = unmapped_buf;
> +       bp->b_fsprivate1 = NULL;
> +       bp->b_fsprivate2 = NULL;
> +       bp->b_fsprivate3 = NULL;
> +       LIST_INIT(&bp->b_dep);
> +
> +       return (bp);
> +}
> +
> +/*
> + *     buf_qrecycle:
> + *
> + *     Free a buffer from the given bufqueue.  kva controls whether the
> + *     freed buf must own some kva resources.  This is used for
> + *     defragmenting.
> + */
> +static int
> +buf_qrecycle(int qindex, bool kva)
> +{
> +       struct buf *bp, *nbp;
> +
> +       if (kva)
> +               atomic_add_int(&bufdefragcnt, 1);
> +       nbp = NULL;
> +       mtx_lock(&bqlocks[qindex]);
> +       nbp = TAILQ_FIRST(&bufqueues[qindex]);
> +
> +       /*
> +        * Run scan, possibly freeing data and/or kva mappings on the fly
> +        * depending.
> +        */
> +       while ((bp = nbp) != NULL) {
> +               /*
> +                * Calculate next bp (we can only use it if we do not
> +                * release the bqlock).
> +                */
> +               nbp = TAILQ_NEXT(bp, b_freelist);
> +
> +               /*
> +                * If we are defragging then we need a buffer with
> +                * some kva to reclaim.
> +                */
> +               if (kva && bp->b_kvasize == 0)
> +                       continue;
> +
> +               if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
> +                       continue;
> +
> +               /*
> +                * Skip buffers with background writes in progress.
> +                */
> +               if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
> +                       BUF_UNLOCK(bp);
> +                       continue;
> +               }
> +
> +               KASSERT(bp->b_qindex == qindex,
> +                   ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
> +               /*
> +                * NOTE:  nbp is now entirely invalid.  We can only restart
> +                * the scan from this point on.
> +                */
> +               bremfreel(bp);
> +               mtx_unlock(&bqlocks[qindex]);
> +
> +               /*
> +                * Requeue the background write buffer with error and
> +                * restart the scan.
> +                */
> +               if ((bp->b_vflags & BV_BKGRDERR) != 0) {
> +                       bqrelse(bp);
> +                       mtx_lock(&bqlocks[qindex]);
> +                       nbp = TAILQ_FIRST(&bufqueues[qindex]);
> +                       continue;
> +               }
> +               bp->b_flags |= B_INVAL;
> +               brelse(bp);
> +               return (0);
> +       }
> +       mtx_unlock(&bqlocks[qindex]);
> +
> +       return (ENOBUFS);
> +}
> +
> +/*
> + *     buf_recycle:
> + *
> + *     Iterate through all clean queues until we find a buf to recycle or
> + *     exhaust the search.
> + */
> +static int
> +buf_recycle(bool kva)
> +{
> +       int qindex, first_qindex;
> +
> +       qindex = first_qindex = bqcleanq();
> +       do {
> +               if (buf_qrecycle(qindex, kva) == 0)
> +                       return (0);
> +               if (++qindex == QUEUE_CLEAN + clean_queues)
> +                       qindex = QUEUE_CLEAN;
> +       } while (qindex != first_qindex);
> +
> +       return (ENOBUFS);
> +}
> +
> +/*
> + *     buf_scan:
> + *
> + *     Scan the clean queues looking for a buffer to recycle.  needsbuffer
> + *     is set on failure so that the caller may optionally bufspace_wait()
> + *     in a race-free fashion.
> + */
> +static int
> +buf_scan(bool defrag)
> +{
> +       int error;
>
>         /*
> -        * Something we can maybe free or reuse.
> -        */
> -       if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
> -               bufspacewakeup();
> -
> -       if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
> -               bufcountadd(bp);
> +        * To avoid heavy synchronization and wakeup races we set
> +        * needsbuffer and re-poll before failing.  This ensures that
> +        * no frees can be missed between an unsuccessful poll and
> +        * going to sleep in a synchronized fashion.
> +        */
>
> *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
>

