svn commit: r329612 - in head/sys: kern sys
Jeff Roberson
jeff at FreeBSD.org
Tue Feb 20 00:06:08 UTC 2018
Author: jeff
Date: Tue Feb 20 00:06:07 2018
New Revision: 329612
URL: https://svnweb.freebsd.org/changeset/base/329612
Log:
Further parallelize the buffer cache.
Provide multiple clean queues partitioned into 'domains'. Each domain manages
its own bufspace and has its own bufspace daemon. Each domain has a set of
subqueues indexed by the current cpuid to reduce lock contention on the cleanq.
Refine the sleep/wakeup around the bufspace daemon to use atomics as much as
possible.
Add a B_REUSE flag that is used to requeue bufs during the scan to approximate
LRU rather than locking the queue on every use of a frequently accessed buf.
Implement bufspace_reserve with only atomic_fetchadd to avoid loop restarts.
Reviewed by: markj
Tested by: pho
Sponsored by: Netflix, Dell/EMC Isilon
Differential Revision: https://reviews.freebsd.org/D14274
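To picture the clean-queue routing described above: a released clean buf
goes onto the per-CPU subqueue of its domain (as binsfree() does in the
diff below) and, once a subqueue grows past bd_lim, its contents spill
into the domain's shared cleanq. The helper below is a simplified sketch
built from the structures in this diff (bdclean, bd_subq, bd_lim,
bd_cleanq); the function name is invented and the exact spill-over policy
of the committed bq_insert() is an assumption, not the committed code.

/*
 * Illustrative only: route a clean buf to the current CPU's subqueue and
 * spill the subqueue into the shared cleanq once it exceeds bd_lim.
 */
static void
clean_insert_sketch(struct buf *bp)
{
	struct bufdomain *bd;
	struct bufqueue *bq;

	bd = &bdclean[bp->b_domain];
	if (bd->bd_lim != 0)
		bq = &bd->bd_subq[PCPU_GET(cpuid)];	/* per-cpu cache */
	else
		bq = bd->bd_cleanq;			/* single shared queue */
	BQ_LOCK(bq);
	TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
	bq->bq_len++;
	bp->b_qindex = QUEUE_CLEAN;
	bp->b_subqueue = bq->bq_subqueue;
	/*
	 * Assumed policy: keep at most bd_lim bufs cached per cpu so only
	 * a small fraction of bufs is hidden from the shared LRU scan.
	 */
	if (bq != bd->bd_cleanq && bq->bq_len > bd->bd_lim) {
		BD_LOCK(bd);
		TAILQ_CONCAT(&bd->bd_cleanq->bq_queue, &bq->bq_queue,
		    b_freelist);
		bd->bd_cleanq->bq_len += bq->bq_len;
		bq->bq_len = 0;
		BD_UNLOCK(bd);
	}
	BQ_UNLOCK(bq);
}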
Modified:
head/sys/kern/vfs_bio.c
head/sys/kern/vfs_subr.c
head/sys/sys/buf.h
head/sys/sys/bufobj.h
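(The sys/buf.h and sys/bufobj.h hunks fall past the truncation point at
the end of this mail. From the fields the vfs_bio.c changes reference
(bp->b_domain, bp->b_subqueue and the new B_REUSE flag), the struct buf
additions are presumably along the following lines; the field widths and
the flag's bit value are guesses for illustration, not the committed
definitions.)

/* Fields added to struct buf (widths assumed): */
	uint8_t		b_domain;	/* clean bufdomain this buf belongs to */
	uint16_t	b_subqueue;	/* per-cpu clean subqueue index */

/* New flag (bit value is a placeholder): */
#define	B_REUSE		0x00001000	/* requeue for a second chance during the clean scan */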
Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c Mon Feb 19 22:56:04 2018 (r329611)
+++ head/sys/kern/vfs_bio.c Tue Feb 20 00:06:07 2018 (r329612)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/conf.h>
+#include <sys/counter.h>
#include <sys/buf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
@@ -105,7 +106,6 @@ caddr_t unmapped_buf;
/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
struct proc *bufdaemonproc;
-struct proc *bufspacedaemonproc;
static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -124,11 +124,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size,
static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
void (*)(struct buf *));
static int buf_flush(struct vnode *vp, int);
-static int buf_recycle(bool);
-static int buf_scan(bool);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
-static void bremfreel(struct buf *bp);
static __inline void bd_wakeup(void);
static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
static void bufkva_reclaim(vmem_t *, int);
@@ -137,28 +134,17 @@ static int buf_import(void *, void **, int, int, int);
static void buf_release(void *, void **, int);
static void maxbcachebuf_adjust(void);
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
- defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
-#endif
-
int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
"Use the VM system for directory writes");
long runningbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
"Amount of presently outstanding async buffer io");
-static long bufspace;
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
- defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
- &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
-#else
-SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
- "Physical memory used for buffers");
-#endif
-static long bufkvaspace;
-SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
+ NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
+static counter_u64_t bufkvaspace;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
"Kernel virtual memory used for buffers");
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
@@ -178,11 +164,11 @@ SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &h
long bufspacethresh;
SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
0, "Bufspace consumed before waking the daemon to free some");
-static int buffreekvacnt;
-SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
+static counter_u64_t buffreekvacnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
"Number of times we have freed the KVA space from some buffer");
-static int bufdefragcnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
+static counter_u64_t bufdefragcnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
"Number of times we have had to repeat buffer allocation to defragment");
static long lorunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
@@ -225,24 +211,26 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
"Threshold for clean buffer recycling");
-static int getnewbufcalls;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
- "Number of calls to getnewbuf");
-static int getnewbufrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
+static counter_u64_t getnewbufcalls;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
+ &getnewbufcalls, "Number of calls to getnewbuf");
+static counter_u64_t getnewbufrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
+ &getnewbufrestarts,
"Number of times getnewbuf has had to restart a buffer acquisition");
-static int mappingrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+static counter_u64_t mappingrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
+ &mappingrestarts,
"Number of times getblk has had to restart a buffer mapping for "
"unmapped buffer");
-static int numbufallocfails;
-SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
- "Number of times buffer allocations failed");
+static counter_u64_t numbufallocfails;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
+ &numbufallocfails, "Number of times buffer allocations failed");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflushes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
+static counter_u64_t notbufdflushes;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
"Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
@@ -266,11 +254,6 @@ static struct mtx_padalign __exclusive_cache_line bdlo
static struct mtx_padalign __exclusive_cache_line rbreqlock;
/*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
- */
-static struct rwlock_padalign __exclusive_cache_line nblock;
-
-/*
* Lock that protects bdirtywait.
*/
static struct mtx_padalign __exclusive_cache_line bdirtylock;
@@ -283,11 +266,6 @@ static struct mtx_padalign __exclusive_cache_line bdir
static int bd_request;
/*
- * Request/wakeup point for the bufspace daemon.
- */
-static int bufspace_request;
-
-/*
* Request for the buf daemon to write more buffers than is indicated by
* lodirtybuf. This may be necessary to push out excess dependencies or
* defragment the address space where a simple count of the number of dirty
@@ -302,15 +280,6 @@ static int bd_speedupreq;
*/
static int runningbufreq;
-/*
- * Synchronization (sleep/wakeup) variable for buffer requests.
- * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
- * by and/or.
- * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
- * getnewbuf(), and getblk().
- */
-static volatile int needsbuffer;
-
/*
* Synchronization for bwillwrite() waiters.
*/
@@ -323,29 +292,69 @@ static int bdirtywait;
#define QUEUE_EMPTY 1 /* empty buffer headers */
#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */
-#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */
+#define QUEUE_SENTINEL 4 /* not a queue index, but mark for sentinel */
-/* Maximum number of clean buffer queues. */
-#define CLEAN_QUEUES 16
+struct bufqueue {
+ struct mtx_padalign bq_lock;
+ TAILQ_HEAD(, buf) bq_queue;
+ uint8_t bq_index;
+ uint16_t bq_subqueue;
+ int bq_len;
+} __aligned(CACHE_LINE_SIZE);
+#define BQ_LOCKPTR(bq) (&(bq)->bq_lock)
+#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq)))
+#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq)))
+#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+
+struct bufqueue __exclusive_cache_line bqempty;
+struct bufqueue __exclusive_cache_line bqdirty;
+
+struct bufdomain {
+ struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
+ struct bufqueue *bd_cleanq;
+ struct mtx_padalign bd_run_lock;
+ /* Constants */
+ long bd_maxbufspace;
+ long bd_hibufspace;
+ long bd_lobufspace;
+ long bd_bufspacethresh;
+ int bd_hifreebuffers;
+ int bd_lofreebuffers;
+ int bd_lim;
+ /* atomics */
+ int bd_wanted;
+ int __aligned(CACHE_LINE_SIZE) bd_running;
+ long __aligned(CACHE_LINE_SIZE) bd_bufspace;
+ int __aligned(CACHE_LINE_SIZE) bd_freebuffers;
+} __aligned(CACHE_LINE_SIZE);
+
+#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq->bq_lock)
+#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd)))
+#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd)))
+#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
+#define BD_RUN_LOCKPTR(bd) (&(bd)->bd_run_lock)
+#define BD_RUN_LOCK(bd) mtx_lock(BD_RUN_LOCKPTR((bd)))
+#define BD_RUN_UNLOCK(bd) mtx_unlock(BD_RUN_LOCKPTR((bd)))
+#define BD_DOMAIN(bd) (bd - bdclean)
+
+/* Maximum number of clean buffer domains. */
+#define CLEAN_DOMAINS 8
+
/* Configured number of clean queues. */
-static int clean_queues;
+static int __read_mostly clean_domains;
-/* Maximum number of buffer queues. */
-#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES)
+struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS];
-/* Queues for free buffers with various properties */
-static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
-#ifdef INVARIANTS
-static int bq_len[BUFFER_QUEUES];
-#endif
+static void bq_remove(struct bufqueue *bq, struct buf *bp);
+static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
+static int buf_recycle(struct bufdomain *, bool kva);
+static void bq_init(struct bufqueue *bq, int qindex, int cpu,
+ const char *lockname);
+static void bd_init(struct bufdomain *bd);
+static int bd_flushall(struct bufdomain *bd);
/*
- * Lock for each bufqueue
- */
-static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES];
-
-/*
* per-cpu empty buffer cache.
*/
uma_zone_t buf_zone;
@@ -391,46 +400,34 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
long lvalue;
int ivalue;
+ int i;
+ lvalue = 0;
+ for (i = 0; i < clean_domains; i++)
+ lvalue += bdclean[i].bd_bufspace;
if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
- return (sysctl_handle_long(oidp, arg1, arg2, req));
- lvalue = *(long *)arg1;
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
if (lvalue > INT_MAX)
/* On overflow, still write out a long to trigger ENOMEM. */
return (sysctl_handle_long(oidp, &lvalue, 0, req));
ivalue = lvalue;
return (sysctl_handle_int(oidp, &ivalue, 0, req));
}
-#endif
-
+#else
static int
-bqcleanq(void)
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
- static int nextq;
+ long lvalue;
+ int i;
- return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+ lvalue = 0;
+ for (i = 0; i < clean_domains; i++)
+ lvalue += bdclean[i].bd_bufspace;
+ return (sysctl_handle_int(oidp, &lvalue, 0, req));
}
+#endif
-static int
-bqisclean(int qindex)
-{
-
- return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
-}
-
/*
- * bqlock:
- *
- * Return the appropriate queue lock based on the index.
- */
-static inline struct mtx *
-bqlock(int qindex)
-{
-
- return (struct mtx *)&bqlocks[qindex];
-}
-
-/*
* bdirtywakeup:
*
* Wakeup any bwillwrite() waiters.
@@ -481,47 +478,50 @@ bdirtyadd(void)
}
/*
- * bufspace_wakeup:
+ * bufspace_daemon_wakeup:
*
- * Called when buffer space is potentially available for recovery.
- * getnewbuf() will block on this flag when it is unable to free
- * sufficient buffer space. Buffer space becomes recoverable when
- * bp's get placed back in the queues.
+ * Wakeup the daemons responsible for freeing clean bufs.
*/
static void
-bufspace_wakeup(void)
+bufspace_daemon_wakeup(struct bufdomain *bd)
{
/*
- * If someone is waiting for bufspace, wake them up.
- *
- * Since needsbuffer is set prior to doing an additional queue
- * scan it is safe to check for the flag prior to acquiring the
- * lock. The thread that is preparing to scan again before
- * blocking would discover the buf we released.
+ * avoid the lock if the daemon is running.
*/
- if (needsbuffer) {
- rw_rlock(&nblock);
- if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
- wakeup(__DEVOLATILE(void *, &needsbuffer));
- rw_runlock(&nblock);
+ if (atomic_fetchadd_int(&bd->bd_running, 1) == 0) {
+ BD_RUN_LOCK(bd);
+ atomic_store_int(&bd->bd_running, 1);
+ wakeup(&bd->bd_running);
+ BD_RUN_UNLOCK(bd);
}
}
/*
- * bufspace_daemonwakeup:
+ * bufspace_daemon_wait:
*
- * Wakeup the daemon responsible for freeing clean bufs.
+ * Sleep until the domain falls below a limit or one second passes.
*/
static void
-bufspace_daemonwakeup(void)
+bufspace_daemon_wait(struct bufdomain *bd)
{
- rw_rlock(&nblock);
- if (bufspace_request == 0) {
- bufspace_request = 1;
- wakeup(&bufspace_request);
+ /*
+ * Re-check our limits and sleep. bd_running must be
+ * cleared prior to checking the limits to avoid missed
+ * wakeups. The waker will adjust one of bufspace or
+ * freebuffers prior to checking bd_running.
+ */
+ BD_RUN_LOCK(bd);
+ atomic_store_int(&bd->bd_running, 0);
+ if (bd->bd_bufspace < bd->bd_bufspacethresh &&
+ bd->bd_freebuffers > bd->bd_lofreebuffers) {
+ msleep(&bd->bd_running, BD_RUN_LOCKPTR(bd), PRIBIO|PDROP,
+ "-", hz);
+ } else {
+ /* Avoid spurious wakeups while running. */
+ atomic_store_int(&bd->bd_running, 1);
+ BD_RUN_UNLOCK(bd);
}
- rw_runlock(&nblock);
}
/*
@@ -533,20 +533,22 @@ bufspace_daemonwakeup(void)
static void
bufspace_adjust(struct buf *bp, int bufsize)
{
+ struct bufdomain *bd;
long space;
int diff;
KASSERT((bp->b_flags & B_MALLOC) == 0,
("bufspace_adjust: malloc buf %p", bp));
+ bd = &bdclean[bp->b_domain];
diff = bufsize - bp->b_bufsize;
if (diff < 0) {
- atomic_subtract_long(&bufspace, -diff);
- bufspace_wakeup();
+ atomic_subtract_long(&bd->bd_bufspace, -diff);
} else {
- space = atomic_fetchadd_long(&bufspace, diff);
+ space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
/* Wake up the daemon on the transition. */
- if (space < bufspacethresh && space + diff >= bufspacethresh)
- bufspace_daemonwakeup();
+ if (space < bd->bd_bufspacethresh &&
+ space + diff >= bd->bd_bufspacethresh)
+ bufspace_daemon_wakeup(bd);
}
bp->b_bufsize = bufsize;
}
@@ -558,24 +560,25 @@ bufspace_adjust(struct buf *bp, int bufsize)
* different space limit than data.
*/
static int
-bufspace_reserve(int size, bool metadata)
+bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
{
- long limit;
+ long limit, new;
long space;
if (metadata)
- limit = maxbufspace;
+ limit = bd->bd_maxbufspace;
else
- limit = hibufspace;
- do {
- space = bufspace;
- if (space + size > limit)
- return (ENOSPC);
- } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+ limit = bd->bd_hibufspace;
+ space = atomic_fetchadd_long(&bd->bd_bufspace, size);
+ new = space + size;
+ if (new > limit) {
+ atomic_subtract_long(&bd->bd_bufspace, size);
+ return (ENOSPC);
+ }
/* Wake up the daemon on the transition. */
- if (space < bufspacethresh && space + size >= bufspacethresh)
- bufspace_daemonwakeup();
+ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
+ bufspace_daemon_wakeup(bd);
return (0);
}
@@ -586,21 +589,22 @@ bufspace_reserve(int size, bool metadata)
* Release reserved bufspace after bufspace_adjust() has consumed it.
*/
static void
-bufspace_release(int size)
+bufspace_release(struct bufdomain *bd, int size)
{
- atomic_subtract_long(&bufspace, size);
- bufspace_wakeup();
+
+ atomic_subtract_long(&bd->bd_bufspace, size);
}
/*
* bufspace_wait:
*
* Wait for bufspace, acting as the buf daemon if a locked vnode is
- * supplied. needsbuffer must be set in a safe fashion prior to
- * polling for space. The operation must be re-tried on return.
+ * supplied. bd_wanted must be set prior to polling for space. The
+ * operation must be re-tried on return.
*/
static void
-bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
+ int slpflag, int slptimeo)
{
struct thread *td;
int error, fl, norunbuf;
@@ -609,11 +613,11 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
return;
td = curthread;
- rw_wlock(&nblock);
- while (needsbuffer != 0) {
+ BD_LOCK(bd);
+ while (bd->bd_wanted) {
if (vp != NULL && vp->v_type != VCHR &&
(td->td_pflags & TDP_BUFNEED) == 0) {
- rw_wunlock(&nblock);
+ BD_UNLOCK(bd);
/*
* getblk() is called with a vnode locked, and
* some majority of the dirty buffers may as
@@ -636,18 +640,18 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
fl = buf_flush(vp, flushbufqtarget);
td->td_pflags &= norunbuf;
- rw_wlock(&nblock);
+ BD_LOCK(bd);
if (fl != 0)
continue;
- if (needsbuffer == 0)
+ if (bd->bd_wanted == 0)
break;
}
- error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+ error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
(PRIBIO + 4) | slpflag, "newbuf", slptimeo);
if (error != 0)
break;
}
- rw_wunlock(&nblock);
+ BD_UNLOCK(bd);
}
@@ -659,10 +663,13 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
* block nor work to reclaim buffers.
*/
static void
-bufspace_daemon(void)
+bufspace_daemon(void *arg)
{
+ struct bufdomain *bd;
+
+ bd = arg;
for (;;) {
- kproc_suspend_check(bufspacedaemonproc);
+ kproc_suspend_check(curproc);
/*
* Free buffers from the clean queue until we meet our
@@ -689,46 +696,25 @@ bufspace_daemon(void)
* which will inefficiently trade bufs with bqrelse
* until we return to condition 2.
*/
- while (bufspace > lobufspace ||
- numfreebuffers < hifreebuffers) {
- if (buf_recycle(false) != 0) {
- atomic_set_int(&needsbuffer, 1);
- if (buf_recycle(false) != 0) {
- rw_wlock(&nblock);
- if (needsbuffer)
- rw_sleep(__DEVOLATILE(void *,
- &needsbuffer), &nblock,
- PRIBIO|PDROP, "bufspace",
- hz/10);
- else
- rw_wunlock(&nblock);
- }
+ do {
+ if (buf_recycle(bd, false) != 0) {
+ if (bd_flushall(bd))
+ continue;
+ BD_LOCK(bd);
+ if (bd->bd_wanted) {
+ msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
+ PRIBIO|PDROP, "bufspace", hz/10);
+ } else
+ BD_UNLOCK(bd);
}
maybe_yield();
- }
+ } while (bd->bd_bufspace > bd->bd_lobufspace ||
+ bd->bd_freebuffers < bd->bd_hifreebuffers);
- /*
- * Re-check our limits under the exclusive nblock.
- */
- rw_wlock(&nblock);
- if (bufspace < bufspacethresh &&
- numfreebuffers > lofreebuffers) {
- bufspace_request = 0;
- rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
- "-", hz);
- } else
- rw_wunlock(&nblock);
+ bufspace_daemon_wait(bd);
}
}
-static struct kproc_desc bufspace_kp = {
- "bufspacedaemon",
- bufspace_daemon,
- &bufspacedaemonproc
-};
-SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
- &bufspace_kp);
-
/*
* bufmallocadjust:
*
@@ -842,7 +828,7 @@ vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff,
}
/* Wake up the buffer daemon if necessary */
-static __inline void
+static void
bd_wakeup(void)
{
@@ -1038,19 +1024,12 @@ bufinit(void)
KASSERT(maxbcachebuf >= MAXBSIZE,
("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
MAXBSIZE));
- mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
- mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
- for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
- mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
+ bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
+ bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock");
mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
- rw_init(&nblock, "needsbuffer lock");
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
- /* next, make a null set of free lists */
- for (i = 0; i < BUFFER_QUEUES; i++)
- TAILQ_INIT(&bufqueues[i]);
-
unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
/* finally, initialize each buffer header and stick on empty q */
@@ -1060,15 +1039,14 @@ bufinit(void)
bp->b_flags = B_INVAL;
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
- bp->b_qindex = QUEUE_EMPTY;
+ bp->b_qindex = QUEUE_NONE;
+ bp->b_domain = -1;
+ bp->b_subqueue = mp_ncpus;
bp->b_xflags = 0;
bp->b_data = bp->b_kvabase = unmapped_buf;
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
-#ifdef INVARIANTS
- bq_len[QUEUE_EMPTY]++;
-#endif
+ bq_insert(&bqempty, bp, false);
}
/*
@@ -1150,8 +1128,31 @@ bufinit(void)
* One queue per-256mb up to the max. More queues gives better
* concurrency but less accurate LRU.
*/
- clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+ clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS);
+ for (i = 0 ; i < clean_domains; i++) {
+ struct bufdomain *bd;
+ bd = &bdclean[i];
+ bd_init(bd);
+ bd->bd_freebuffers = nbuf / clean_domains;
+ bd->bd_hifreebuffers = hifreebuffers / clean_domains;
+ bd->bd_lofreebuffers = lofreebuffers / clean_domains;
+ bd->bd_bufspace = 0;
+ bd->bd_maxbufspace = maxbufspace / clean_domains;
+ bd->bd_hibufspace = hibufspace / clean_domains;
+ bd->bd_lobufspace = lobufspace / clean_domains;
+ bd->bd_bufspacethresh = bufspacethresh / clean_domains;
+ /* Don't allow more than 2% of bufs in the per-cpu caches. */
+ bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus;
+ }
+ getnewbufcalls = counter_u64_alloc(M_WAITOK);
+ getnewbufrestarts = counter_u64_alloc(M_WAITOK);
+ mappingrestarts = counter_u64_alloc(M_WAITOK);
+ numbufallocfails = counter_u64_alloc(M_WAITOK);
+ notbufdflushes = counter_u64_alloc(M_WAITOK);
+ buffreekvacnt = counter_u64_alloc(M_WAITOK);
+ bufdefragcnt = counter_u64_alloc(M_WAITOK);
+ bufkvaspace = counter_u64_alloc(M_WAITOK);
}
#ifdef INVARIANTS
@@ -1326,58 +1327,92 @@ bpmap_qenter(struct buf *bp)
(vm_offset_t)(bp->b_offset & PAGE_MASK));
}
+static struct bufqueue *
+bufqueue(struct buf *bp)
+{
+
+ switch (bp->b_qindex) {
+ case QUEUE_NONE:
+ /* FALLTHROUGH */
+ case QUEUE_SENTINEL:
+ return (NULL);
+ case QUEUE_EMPTY:
+ return (&bqempty);
+ case QUEUE_DIRTY:
+ return (&bqdirty);
+ case QUEUE_CLEAN:
+ return (&bdclean[bp->b_domain].bd_subq[bp->b_subqueue]);
+ default:
+ break;
+ }
+ panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
+}
+
/*
+ * Return the locked bufqueue that bp is a member of.
+ */
+static struct bufqueue *
+bufqueue_acquire(struct buf *bp)
+{
+ struct bufqueue *bq, *nbq;
+
+ /*
+ * bp can be pushed from a per-cpu queue to the
+ * cleanq while we're waiting on the lock. Retry
+ * if the queues don't match.
+ */
+ bq = bufqueue(bp);
+ BQ_LOCK(bq);
+ for (;;) {
+ nbq = bufqueue(bp);
+ if (bq == nbq)
+ break;
+ BQ_UNLOCK(bq);
+ BQ_LOCK(nbq);
+ bq = nbq;
+ }
+ return (bq);
+}
+
+/*
* binsfree:
*
- * Insert the buffer into the appropriate free list.
+ * Insert the buffer into the appropriate free list. Requires a
+ * locked buffer on entry and buffer is unlocked before return.
*/
static void
binsfree(struct buf *bp, int qindex)
{
- struct mtx *olock, *nlock;
+ struct bufdomain *bd;
+ struct bufqueue *bq;
- if (qindex != QUEUE_EMPTY) {
- BUF_ASSERT_XLOCKED(bp);
- }
+ KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
+ ("binsfree: Invalid qindex %d", qindex));
+ BUF_ASSERT_XLOCKED(bp);
/*
- * Stick to the same clean queue for the lifetime of the buf to
- * limit locking below. Otherwise pick ont sequentially.
- */
- if (qindex == QUEUE_CLEAN) {
- if (bqisclean(bp->b_qindex))
- qindex = bp->b_qindex;
- else
- qindex = bqcleanq();
- }
-
- /*
* Handle delayed bremfree() processing.
*/
- nlock = bqlock(qindex);
if (bp->b_flags & B_REMFREE) {
- olock = bqlock(bp->b_qindex);
- mtx_lock(olock);
- bremfreel(bp);
- if (olock != nlock) {
- mtx_unlock(olock);
- mtx_lock(nlock);
+ if (bp->b_qindex == qindex) {
+ bp->b_flags |= B_REUSE;
+ bp->b_flags &= ~B_REMFREE;
+ BUF_UNLOCK(bp);
+ return;
}
+ bq = bufqueue_acquire(bp);
+ bq_remove(bq, bp);
+ BQ_UNLOCK(bq);
+ }
+ if (qindex == QUEUE_CLEAN) {
+ bd = &bdclean[bp->b_domain];
+ if (bd->bd_lim != 0)
+ bq = &bd->bd_subq[PCPU_GET(cpuid)];
+ else
+ bq = bd->bd_cleanq;
} else
- mtx_lock(nlock);
-
- if (bp->b_qindex != QUEUE_NONE)
- panic("binsfree: free buffer onto another queue???");
-
- bp->b_qindex = qindex;
- if (bp->b_flags & B_AGE)
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
- else
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-#ifdef INVARIANTS
- bq_len[bp->b_qindex]++;
-#endif
- mtx_unlock(nlock);
+ bq = &bqdirty;
+ bq_insert(bq, bp, true);
}
/*
@@ -1404,10 +1439,9 @@ buf_free(struct buf *bp)
if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
bufkva_free(bp);
+ atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1);
BUF_UNLOCK(bp);
uma_zfree(buf_zone, bp);
- atomic_add_int(&numfreebuffers, 1);
- bufspace_wakeup();
}
/*
@@ -1424,15 +1458,15 @@ buf_import(void *arg, void **store, int cnt, int domai
struct buf *bp;
int i;
- mtx_lock(&bqlocks[QUEUE_EMPTY]);
+ BQ_LOCK(&bqempty);
for (i = 0; i < cnt; i++) {
- bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ bp = TAILQ_FIRST(&bqempty.bq_queue);
if (bp == NULL)
break;
- bremfreel(bp);
+ bq_remove(&bqempty, bp);
store[i] = bp;
}
- mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+ BQ_UNLOCK(&bqempty);
return (i);
}
@@ -1445,10 +1479,21 @@ buf_import(void *arg, void **store, int cnt, int domai
static void
buf_release(void *arg, void **store, int cnt)
{
+ struct bufqueue *bq;
+ struct buf *bp;
int i;
- for (i = 0; i < cnt; i++)
- binsfree(store[i], QUEUE_EMPTY);
+ bq = &bqempty;
+ BQ_LOCK(bq);
+ for (i = 0; i < cnt; i++) {
+ bp = store[i];
+ /* Inline bq_insert() to batch locking. */
+ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+ bp->b_flags &= ~(B_AGE | B_REUSE);
+ bq->bq_len++;
+ bp->b_qindex = bq->bq_index;
+ }
+ BQ_UNLOCK(bq);
}
/*
@@ -1457,22 +1502,31 @@ buf_release(void *arg, void **store, int cnt)
* Allocate an empty buffer header.
*/
static struct buf *
-buf_alloc(void)
+buf_alloc(struct bufdomain *bd)
{
struct buf *bp;
+ int freebufs;
- bp = uma_zalloc(buf_zone, M_NOWAIT);
+ /*
+ * We can only run out of bufs in the buf zone if the average buf
+ * is less than BKVASIZE. In this case the actual wait/block will
+ * come from buf_recycle() failing to flush one of these small bufs.
+ */
+ bp = NULL;
+ freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
+ if (freebufs > 0)
+ bp = uma_zalloc(buf_zone, M_NOWAIT);
if (bp == NULL) {
- bufspace_daemonwakeup();
- atomic_add_int(&numbufallocfails, 1);
+ atomic_fetchadd_int(&bd->bd_freebuffers, 1);
+ bufspace_daemon_wakeup(bd);
+ counter_u64_add(numbufallocfails, 1);
return (NULL);
}
-
/*
- * Wake-up the bufspace daemon on transition.
+ * Wake-up the bufspace daemon on transition below threshold.
*/
- if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
- bufspace_daemonwakeup();
+ if (freebufs == bd->bd_lofreebuffers)
+ bufspace_daemon_wakeup(bd);
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
@@ -1488,6 +1542,7 @@ buf_alloc(void)
KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+ bp->b_domain = BD_DOMAIN(bd);
bp->b_flags = 0;
bp->b_ioflags = 0;
bp->b_xflags = 0;
@@ -1512,22 +1567,26 @@ buf_alloc(void)
}
/*
- * buf_qrecycle:
+ * buf_recycle:
*
* Free a buffer from the given bufqueue. kva controls whether the
* freed buf must own some kva resources. This is used for
* defragmenting.
*/
static int
-buf_qrecycle(int qindex, bool kva)
+buf_recycle(struct bufdomain *bd, bool kva)
{
+ struct bufqueue *bq;
struct buf *bp, *nbp;
if (kva)
- atomic_add_int(&bufdefragcnt, 1);
+ counter_u64_add(bufdefragcnt, 1);
nbp = NULL;
- mtx_lock(&bqlocks[qindex]);
- nbp = TAILQ_FIRST(&bufqueues[qindex]);
+ bq = bd->bd_cleanq;
+ BQ_LOCK(bq);
+ KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
+ ("buf_recycle: Locks don't match"));
+ nbp = TAILQ_FIRST(&bq->bq_queue);
/*
* Run scan, possibly freeing data and/or kva mappings on the fly
@@ -1551,6 +1610,18 @@ buf_qrecycle(int qindex, bool kva)
continue;
/*
+ * Implement a second chance algorithm for frequently
+ * accessed buffers.
+ */
+ if ((bp->b_flags & B_REUSE) != 0) {
+ TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
+ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+ bp->b_flags &= ~B_REUSE;
+ BUF_UNLOCK(bp);
+ continue;
+ }
+
+ /*
* Skip buffers with background writes in progress.
*/
if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
@@ -1558,14 +1629,18 @@ buf_qrecycle(int qindex, bool kva)
continue;
}
- KASSERT(bp->b_qindex == qindex,
- ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+ KASSERT(bp->b_qindex == QUEUE_CLEAN,
+ ("buf_recycle: inconsistent queue %d bp %p",
+ bp->b_qindex, bp));
+ KASSERT(bp->b_domain == BD_DOMAIN(bd),
+ ("getnewbuf: queue domain %d doesn't match request %d",
+ bp->b_domain, (int)BD_DOMAIN(bd)));
/*
* NOTE: nbp is now entirely invalid. We can only restart
* the scan from this point on.
*/
- bremfreel(bp);
- mtx_unlock(&bqlocks[qindex]);
+ bq_remove(bq, bp);
+ BQ_UNLOCK(bq);
/*
* Requeue the background write buffer with error and
@@ -1573,70 +1648,21 @@ buf_qrecycle(int qindex, bool kva)
*/
if ((bp->b_vflags & BV_BKGRDERR) != 0) {
bqrelse(bp);
- mtx_lock(&bqlocks[qindex]);
- nbp = TAILQ_FIRST(&bufqueues[qindex]);
+ BQ_LOCK(bq);
+ nbp = TAILQ_FIRST(&bq->bq_queue);
continue;
}
bp->b_flags |= B_INVAL;
brelse(bp);
return (0);
}
- mtx_unlock(&bqlocks[qindex]);
+ bd->bd_wanted = 1;
+ BQ_UNLOCK(bq);
return (ENOBUFS);
}
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
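For readers comparing the two reservation styles in the bufspace_reserve()
hunk above: the old code looped on atomic_cmpset and restarted under
contention, while the new code issues a single atomic_fetchadd and backs
the addition out if the limit was overshot (a brief overshoot is
tolerated). A minimal userspace analogue of the two patterns using C11
atomics follows; the names and the limit are invented for illustration.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long space;		/* stand-in for bd_bufspace */
static const long limit = 1024;		/* stand-in for bd_hibufspace */

/* Old style: cmpset loop, which restarts whenever another thread races us. */
static int
reserve_cmpset(long size)
{
	long old;

	do {
		old = atomic_load(&space);
		if (old + size > limit)
			return (-1);		/* ENOSPC */
	} while (!atomic_compare_exchange_weak(&space, &old, old + size));
	return (0);
}

/* New style: one fetchadd; undo the addition if the limit was exceeded. */
static int
reserve_fetchadd(long size)
{
	long old;

	old = atomic_fetch_add(&space, size);
	if (old + size > limit) {
		atomic_fetch_sub(&space, size);
		return (-1);			/* ENOSPC */
	}
	return (0);
}

int
main(void)
{
	printf("cmpset: %d fetchadd: %d space now: %ld\n",
	    reserve_cmpset(512), reserve_fetchadd(256),
	    (long)atomic_load(&space));
	return (0);
}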