svn commit: r328904 - in user/jeff/numa/sys: kern sys
Jeff Roberson
jeff at FreeBSD.org
Mon Feb 5 23:01:50 UTC 2018
Author: jeff
Date: Mon Feb 5 23:01:49 2018
New Revision: 328904
URL: https://svnweb.freebsd.org/changeset/base/328904
Log:
Re-implement the buffer queues with a number of independent silos each
having their own space allotment and bufspace daemon.
Use a per-cpu clean queue cache in front of the silo clean queue.
Move the common queue variables (queue, len, lock) into a structure so they
can be aligned and packed together.
Implement a REUSE flag to operate as a second chance in buf_recycle() so
we don't have to requeue frequently re-used buffers.
Move counters to the counter API.
Modified:
user/jeff/numa/sys/kern/vfs_bio.c
user/jeff/numa/sys/kern/vfs_subr.c
user/jeff/numa/sys/sys/buf.h
user/jeff/numa/sys/sys/bufobj.h
Modified: user/jeff/numa/sys/kern/vfs_bio.c
==============================================================================
--- user/jeff/numa/sys/kern/vfs_bio.c Mon Feb 5 22:21:51 2018 (r328903)
+++ user/jeff/numa/sys/kern/vfs_bio.c Mon Feb 5 23:01:49 2018 (r328904)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/conf.h>
+#include <sys/counter.h>
#include <sys/buf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
@@ -105,7 +106,6 @@ caddr_t unmapped_buf;
/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
struct proc *bufdaemonproc;
-struct proc *bufspacedaemonproc;
static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -124,11 +124,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size,
static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
void (*)(struct buf *));
static int buf_flush(struct vnode *vp, int);
-static int buf_recycle(bool);
-static int buf_scan(bool);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
-static void bremfreel(struct buf *bp);
static __inline void bd_wakeup(void);
static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
static void bufkva_reclaim(vmem_t *, int);
@@ -137,28 +134,17 @@ static int buf_import(void *, void **, int, int, int);
static void buf_release(void *, void **, int);
static void maxbcachebuf_adjust(void);
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
- defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
-#endif
-
int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
"Use the VM system for directory writes");
long runningbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
"Amount of presently outstanding async buffer io");
-static long bufspace;
-#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
- defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
- &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
-#else
-SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
- "Physical memory used for buffers");
-#endif
-static long bufkvaspace;
-SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
+ NULL, 0, sysctl_bufspace, "L", "Physical memory used for buffers");
+static counter_u64_t bufkvaspace;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
"Kernel virtual memory used for buffers");
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
@@ -178,11 +164,11 @@ SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &h
long bufspacethresh;
SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
0, "Bufspace consumed before waking the daemon to free some");
-static int buffreekvacnt;
-SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
+static counter_u64_t buffreekvacnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
"Number of times we have freed the KVA space from some buffer");
-static int bufdefragcnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
+static counter_u64_t bufdefragcnt;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt,
"Number of times we have had to repeat buffer allocation to defragment");
static long lorunningspace;
SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
@@ -225,24 +211,26 @@ SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
"Threshold for clean buffer recycling");
-static int getnewbufcalls;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
- "Number of calls to getnewbuf");
-static int getnewbufrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
+static counter_u64_t getnewbufcalls;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
+ &getnewbufcalls, "Number of calls to getnewbuf");
+static counter_u64_t getnewbufrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD,
+ &getnewbufrestarts,
"Number of times getnewbuf has had to restart a buffer acquisition");
-static int mappingrestarts;
-SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+static counter_u64_t mappingrestarts;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RD,
+ &mappingrestarts,
"Number of times getblk has had to restart a buffer mapping for "
"unmapped buffer");
-static int numbufallocfails;
-SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
- "Number of times buffer allocations failed");
+static counter_u64_t numbufallocfails;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW,
+ &numbufallocfails, "Number of times buffer allocations failed");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflushes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
+static counter_u64_t notbufdflushes;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes,
"Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
@@ -266,11 +254,6 @@ static struct mtx_padalign __exclusive_cache_line bdlo
static struct mtx_padalign __exclusive_cache_line rbreqlock;
/*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
- */
-static struct rwlock_padalign __exclusive_cache_line nblock;
-
-/*
* Lock that protects bdirtywait.
*/
static struct mtx_padalign __exclusive_cache_line bdirtylock;
@@ -283,11 +266,6 @@ static struct mtx_padalign __exclusive_cache_line bdir
static int bd_request;
/*
- * Request/wakeup point for the bufspace daemon.
- */
-static int bufspace_request;
-
-/*
* Request for the buf daemon to write more buffers than is indicated by
* lodirtybuf. This may be necessary to push out excess dependencies or
* defragment the address space where a simple count of the number of dirty
@@ -302,15 +280,6 @@ static int bd_speedupreq;
*/
static int runningbufreq;
-/*
- * Synchronization (sleep/wakeup) variable for buffer requests.
- * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
- * by and/or.
- * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
- * getnewbuf(), and getblk().
- */
-static volatile int needsbuffer;
-
/*
* Synchronization for bwillwrite() waiters.
*/
@@ -323,29 +292,65 @@ static int bdirtywait;
#define QUEUE_EMPTY 1 /* empty buffer headers */
#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */
-#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */
+#define QUEUE_SENTINEL 4 /* not an queue index, but mark for sentinel */
-/* Maximum number of clean buffer queues. */
-#define CLEAN_QUEUES 16
+struct bufqueue {
+ struct mtx_padalign bq_lock;
+ TAILQ_HEAD(, buf) bq_queue;
+ uint8_t bq_index;
+ uint16_t bq_cpu;
+ int bq_len;
+} __aligned(CACHE_LINE_SIZE);
+#define BQ_LOCKPTR(bq) (&(bq)->bq_lock)
+#define BQ_LOCK(bq) mtx_lock(BQ_LOCKPTR((bq)))
+#define BQ_UNLOCK(bq) mtx_unlock(BQ_LOCKPTR((bq)))
+#define BQ_ASSERT_LOCKED(bq) mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+
+struct bufqueue __exclusive_cache_line bqempty;
+struct bufqueue __exclusive_cache_line bqdirty;
+
+struct bufdomain {
+ struct bufqueue bd_cpuq[MAXCPU];
+ struct bufqueue bd_cleanq;
+ /* Constants */
+ long bd_maxbufspace;
+ long bd_hibufspace;
+ long bd_lobufspace;
+ long bd_bufspacethresh;
+ int bd_hifreebuffers;
+ int bd_lofreebuffers;
+ int bd_lim;
+ /* atomics */
+ int bd_wanted;
+ int __aligned(CACHE_LINE_SIZE) bd_request;
+ long __aligned(CACHE_LINE_SIZE) bd_bufspace;
+ int __aligned(CACHE_LINE_SIZE) bd_freebuffers;
+} __aligned(CACHE_LINE_SIZE);
+
+#define BD_LOCKPTR(bd) (&(bd)->bd_cleanq.bq_lock)
+#define BD_LOCK(bd) mtx_lock(BD_LOCKPTR((bd)))
+#define BD_UNLOCK(bd) mtx_unlock(BD_LOCKPTR((bd)))
+#define BD_ASSERT_LOCKED(bd) mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
+#define BD_DOMAIN(bd) (bd - bdclean)
+
+/* Maximum number of clean buffer domains. */
+#define CLEAN_DOMAINS 8
+
/* Configured number of clean queues. */
-static int clean_queues;
+static int __read_mostly clean_domains;
-/* Maximum number of buffer queues. */
-#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES)
+struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS];
-/* Queues for free buffers with various properties */
-static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
-#ifdef INVARIANTS
-static int bq_len[BUFFER_QUEUES];
-#endif
+static void bq_remove(struct bufqueue *bq, struct buf *bp);
+static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
+static int buf_recycle(struct bufdomain *, bool kva);
+static void bq_init(struct bufqueue *bq, int qindex, int cpu,
+ const char *lockname);
+static void bd_init(struct bufdomain *bd);
+static int bd_flushall(struct bufdomain *bd);
/*
- * Lock for each bufqueue
- */
-static struct mtx_padalign __exclusive_cache_line bqlocks[BUFFER_QUEUES];
-
-/*
* per-cpu empty buffer cache.
*/
uma_zone_t buf_zone;
@@ -391,46 +396,34 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
long lvalue;
int ivalue;
+ int i;
+ lvalue = 0;
+ for (i = 0; i < clean_domains; i++)
+ lvalue += bdclean[i].bd_bufspace;
if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
- return (sysctl_handle_long(oidp, arg1, arg2, req));
- lvalue = *(long *)arg1;
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
if (lvalue > INT_MAX)
/* On overflow, still write out a long to trigger ENOMEM. */
return (sysctl_handle_long(oidp, &lvalue, 0, req));
ivalue = lvalue;
return (sysctl_handle_int(oidp, &ivalue, 0, req));
}
-#endif
-
+#else
static int
-bqcleanq(void)
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
- static int nextq;
+ long lvalue;
+ int i;
- return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+ lvalue = 0;
+ for (i = 0; i < clean_domains; i++)
+ lvalue += bdclean[i].bd_bufspace;
+ return (sysctl_handle_int(oidp, &lvalue, 0, req));
}
+#endif
-static int
-bqisclean(int qindex)
-{
-
- return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
-}
-
/*
- * bqlock:
- *
- * Return the appropriate queue lock based on the index.
- */
-static inline struct mtx *
-bqlock(int qindex)
-{
-
- return (struct mtx *)&bqlocks[qindex];
-}
-
-/*
* bdirtywakeup:
*
* Wakeup any bwillwrite() waiters.
@@ -481,50 +474,23 @@ bdirtyadd(void)
}
/*
- * bufspace_wakeup:
+ * bufspace_daemonwakeup:
*
- * Called when buffer space is potentially available for recovery.
- * getnewbuf() will block on this flag when it is unable to free
- * sufficient buffer space. Buffer space becomes recoverable when
- * bp's get placed back in the queues.
+ * Wakeup the daemons responsible for freeing clean bufs.
*/
static void
-bufspace_wakeup(void)
+bufspace_daemonwakeup(struct bufdomain *bd)
{
- /*
- * If someone is waiting for bufspace, wake them up.
- *
- * Since needsbuffer is set prior to doing an additional queue
- * scan it is safe to check for the flag prior to acquiring the
- * lock. The thread that is preparing to scan again before
- * blocking would discover the buf we released.
- */
- if (needsbuffer) {
- rw_rlock(&nblock);
- if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
- wakeup(__DEVOLATILE(void *, &needsbuffer));
- rw_runlock(&nblock);
+ if (atomic_fetchadd_int(&bd->bd_request, 1) == 0) {
+ BD_LOCK(bd);
+ bd->bd_request = 1;
+ wakeup(&bd->bd_request);
+ BD_UNLOCK(bd);
}
}
/*
- * bufspace_daemonwakeup:
- *
- * Wakeup the daemon responsible for freeing clean bufs.
- */
-static void
-bufspace_daemonwakeup(void)
-{
- rw_rlock(&nblock);
- if (bufspace_request == 0) {
- bufspace_request = 1;
- wakeup(&bufspace_request);
- }
- rw_runlock(&nblock);
-}
-
-/*
* bufspace_adjust:
*
* Adjust the reported bufspace for a KVA managed buffer, possibly
@@ -533,20 +499,22 @@ bufspace_daemonwakeup(void)
static void
bufspace_adjust(struct buf *bp, int bufsize)
{
+ struct bufdomain *bd;
long space;
int diff;
KASSERT((bp->b_flags & B_MALLOC) == 0,
("bufspace_adjust: malloc buf %p", bp));
+ bd = &bdclean[bp->b_domain];
diff = bufsize - bp->b_bufsize;
if (diff < 0) {
- atomic_subtract_long(&bufspace, -diff);
- bufspace_wakeup();
+ atomic_subtract_long(&bd->bd_bufspace, -diff);
} else {
- space = atomic_fetchadd_long(&bufspace, diff);
+ space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
/* Wake up the daemon on the transition. */
- if (space < bufspacethresh && space + diff >= bufspacethresh)
- bufspace_daemonwakeup();
+ if (space < bd->bd_bufspacethresh &&
+ space + diff >= bd->bd_bufspacethresh)
+ bufspace_daemonwakeup(bd);
}
bp->b_bufsize = bufsize;
}
@@ -558,24 +526,25 @@ bufspace_adjust(struct buf *bp, int bufsize)
* different space limit than data.
*/
static int
-bufspace_reserve(int size, bool metadata)
+bufspace_reserve(struct bufdomain *bd, int size, bool metadata)
{
- long limit;
+ long limit, new;
long space;
if (metadata)
- limit = maxbufspace;
+ limit = bd->bd_maxbufspace;
else
- limit = hibufspace;
+ limit = bd->bd_hibufspace;
do {
- space = bufspace;
- if (space + size > limit)
+ space = bd->bd_bufspace;
+ new = space + size;
+ if (new > limit)
return (ENOSPC);
- } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+ } while (atomic_cmpset_long(&bd->bd_bufspace, space, new) == 0);
/* Wake up the daemon on the transition. */
- if (space < bufspacethresh && space + size >= bufspacethresh)
- bufspace_daemonwakeup();
+ if (space < bd->bd_bufspacethresh && new >= bd->bd_bufspacethresh)
+ bufspace_daemonwakeup(bd);
return (0);
}
@@ -586,21 +555,22 @@ bufspace_reserve(int size, bool metadata)
* Release reserved bufspace after bufspace_adjust() has consumed it.
*/
static void
-bufspace_release(int size)
+bufspace_release(struct bufdomain *bd, int size)
{
- atomic_subtract_long(&bufspace, size);
- bufspace_wakeup();
+
+ atomic_subtract_long(&bd->bd_bufspace, size);
}
/*
* bufspace_wait:
*
* Wait for bufspace, acting as the buf daemon if a locked vnode is
- * supplied. needsbuffer must be set in a safe fashion prior to
- * polling for space. The operation must be re-tried on return.
+ * supplied. bd_wanted must be set prior to polling for space. The
+ * operation must be re-tried on return.
*/
static void
-bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+bufspace_wait(struct bufdomain *bd, struct vnode *vp, int gbflags,
+ int slpflag, int slptimeo)
{
struct thread *td;
int error, fl, norunbuf;
@@ -609,11 +579,11 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
return;
td = curthread;
- rw_wlock(&nblock);
- while (needsbuffer != 0) {
+ BD_LOCK(bd);
+ while (bd->bd_wanted) {
if (vp != NULL && vp->v_type != VCHR &&
(td->td_pflags & TDP_BUFNEED) == 0) {
- rw_wunlock(&nblock);
+ BD_UNLOCK(bd);
/*
* getblk() is called with a vnode locked, and
* some majority of the dirty buffers may as
@@ -636,18 +606,18 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
fl = buf_flush(vp, flushbufqtarget);
td->td_pflags &= norunbuf;
- rw_wlock(&nblock);
+ BD_LOCK(bd);
if (fl != 0)
continue;
- if (needsbuffer == 0)
+ if (bd->bd_wanted == 0)
break;
}
- error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+ error = msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
(PRIBIO + 4) | slpflag, "newbuf", slptimeo);
if (error != 0)
break;
}
- rw_wunlock(&nblock);
+ BD_UNLOCK(bd);
}
@@ -659,10 +629,13 @@ bufspace_wait(struct vnode *vp, int gbflags, int slpfl
* block nor work to reclaim buffers.
*/
static void
-bufspace_daemon(void)
+bufspace_daemon(void *arg)
{
+ struct bufdomain *bd;
+
+ bd = arg;
for (;;) {
- kproc_suspend_check(bufspacedaemonproc);
+ kproc_suspend_check(curproc);
/*
* Free buffers from the clean queue until we meet our
@@ -689,46 +662,35 @@ bufspace_daemon(void)
* which will inefficiently trade bufs with bqrelse
* until we return to condition 2.
*/
- while (bufspace > lobufspace ||
- numfreebuffers < hifreebuffers) {
- if (buf_recycle(false) != 0) {
- atomic_set_int(&needsbuffer, 1);
- if (buf_recycle(false) != 0) {
- rw_wlock(&nblock);
- if (needsbuffer)
- rw_sleep(__DEVOLATILE(void *,
- &needsbuffer), &nblock,
- PRIBIO|PDROP, "bufspace",
- hz/10);
- else
- rw_wunlock(&nblock);
- }
+ do {
+ if (buf_recycle(bd, false) != 0) {
+ if (bd_flushall(bd))
+ continue;
+ BD_LOCK(bd);
+ if (bd->bd_wanted) {
+ msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
+ PRIBIO|PDROP, "bufspace", hz/10);
+ } else
+ BD_UNLOCK(bd);
}
maybe_yield();
- }
+ } while (bd->bd_bufspace > bd->bd_lobufspace ||
+ bd->bd_freebuffers < bd->bd_hifreebuffers);
/*
- * Re-check our limits under the exclusive nblock.
+ * Re-check our limits and sleep.
*/
- rw_wlock(&nblock);
- if (bufspace < bufspacethresh &&
- numfreebuffers > lofreebuffers) {
- bufspace_request = 0;
- rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
+ BD_LOCK(bd);
+ if (bd->bd_bufspace < bd->bd_bufspacethresh &&
+ bd->bd_freebuffers > bd->bd_lofreebuffers) {
+ bd->bd_request = 0;
+ msleep(&bd->bd_request, BD_LOCKPTR(bd), PRIBIO|PDROP,
"-", hz);
} else
- rw_wunlock(&nblock);
+ BD_UNLOCK(bd);
}
}
-static struct kproc_desc bufspace_kp = {
- "bufspacedaemon",
- bufspace_daemon,
- &bufspacedaemonproc
-};
-SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
- &bufspace_kp);
-
/*
* bufmallocadjust:
*
@@ -1038,38 +1000,32 @@ bufinit(void)
KASSERT(maxbcachebuf >= MAXBSIZE,
("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
MAXBSIZE));
- mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
- mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
- for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
- mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
+ bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
+ bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock");
mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
- rw_init(&nblock, "needsbuffer lock");
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
- /* next, make a null set of free lists */
- for (i = 0; i < BUFFER_QUEUES; i++)
- TAILQ_INIT(&bufqueues[i]);
-
unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
/* finally, initialize each buffer header and stick on empty q */
+ BQ_LOCK(&bqempty);
for (i = 0; i < nbuf; i++) {
bp = &buf[i];
bzero(bp, sizeof *bp);
bp->b_flags = B_INVAL;
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
- bp->b_qindex = QUEUE_EMPTY;
+ bp->b_qindex = QUEUE_NONE;
+ bp->b_domain = -1;
+ bp->b_cpu = -1;
bp->b_xflags = 0;
bp->b_data = bp->b_kvabase = unmapped_buf;
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
-#ifdef INVARIANTS
- bq_len[QUEUE_EMPTY]++;
-#endif
+ bq_insert(&bqempty, bp, false);
}
+ BQ_UNLOCK(&bqempty);
/*
* maxbufspace is the absolute maximum amount of buffer space we are
@@ -1150,8 +1106,31 @@ bufinit(void)
* One queue per-256mb up to the max. More queues gives better
* concurrency but less accurate LRU.
*/
- clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+ clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS);
+ for (i = 0 ; i < clean_domains; i++) {
+ struct bufdomain *bd;
+ bd = &bdclean[i];
+ bd_init(bd);
+ bd->bd_freebuffers = nbuf / clean_domains;
+ bd->bd_hifreebuffers = hifreebuffers / clean_domains;
+ bd->bd_lofreebuffers = lofreebuffers / clean_domains;
+ bd->bd_bufspace = 0;
+ bd->bd_maxbufspace = maxbufspace / clean_domains;
+ bd->bd_hibufspace = hibufspace / clean_domains;
+ bd->bd_lobufspace = lobufspace / clean_domains;
+ bd->bd_bufspacethresh = bufspacethresh / clean_domains;
+ /* Don't allow more than 2% of bufs in the per-cpu caches. */
+ bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus;
+ }
+ getnewbufcalls = counter_u64_alloc(M_WAITOK);
+ getnewbufrestarts = counter_u64_alloc(M_WAITOK);
+ mappingrestarts = counter_u64_alloc(M_WAITOK);
+ numbufallocfails = counter_u64_alloc(M_WAITOK);
+ notbufdflushes = counter_u64_alloc(M_WAITOK);
+ buffreekvacnt = counter_u64_alloc(M_WAITOK);
+ bufdefragcnt = counter_u64_alloc(M_WAITOK);
+ bufkvaspace = counter_u64_alloc(M_WAITOK);
}
#ifdef INVARIANTS
@@ -1326,58 +1305,77 @@ bpmap_qenter(struct buf *bp)
(vm_offset_t)(bp->b_offset & PAGE_MASK));
}
+static struct bufqueue *
+bufqueue(struct buf *bp)
+{
+ struct bufdomain *bd;
+
+ switch (bp->b_qindex) {
+ case QUEUE_NONE:
+ /* FALLTHROUGH */
+ case QUEUE_SENTINEL:
+ return (NULL);
+ case QUEUE_EMPTY:
+ return (&bqempty);
+ case QUEUE_DIRTY:
+ return (&bqdirty);
+ case QUEUE_CLEAN:
+ /* FALLTHROUGH */
+ break;
+ default:
+ panic("bufqueue(%p): Unhandled type %d\n", bp, bp->b_qindex);
+ }
+ bd = &bdclean[bp->b_domain];
+ if (bp->b_cpu > mp_maxid)
+ return (&bd->bd_cleanq);
+ return (&bd->bd_cpuq[bp->b_cpu]);
+
+}
+
/*
* binsfree:
*
- * Insert the buffer into the appropriate free list.
+ * Insert the buffer into the appropriate free list. Requires a
+ * locked buffer on entry and buffer is unlocked before return.
*/
static void
binsfree(struct buf *bp, int qindex)
{
- struct mtx *olock, *nlock;
+ struct bufdomain *bd;
+ struct bufqueue *bq;
- if (qindex != QUEUE_EMPTY) {
- BUF_ASSERT_XLOCKED(bp);
- }
+ KASSERT(qindex == QUEUE_CLEAN || qindex == QUEUE_DIRTY,
+ ("binsfree: Invalid qindex %d", qindex));
+ BUF_ASSERT_XLOCKED(bp);
/*
- * Stick to the same clean queue for the lifetime of the buf to
- * limit locking below. Otherwise pick ont sequentially.
- */
- if (qindex == QUEUE_CLEAN) {
- if (bqisclean(bp->b_qindex))
- qindex = bp->b_qindex;
- else
- qindex = bqcleanq();
- }
-
- /*
* Handle delayed bremfree() processing.
*/
- nlock = bqlock(qindex);
if (bp->b_flags & B_REMFREE) {
- olock = bqlock(bp->b_qindex);
- mtx_lock(olock);
- bremfreel(bp);
- if (olock != nlock) {
- mtx_unlock(olock);
- mtx_lock(nlock);
+ if (bp->b_qindex == qindex) {
+ bp->b_flags |= B_REUSE;
+ bp->b_flags &= ~B_REMFREE;
+ BUF_UNLOCK(bp);
+ return;
}
+ bq = bufqueue(bp);
+ BQ_LOCK(bq);
+ bq_remove(bq, bp);
+ BQ_UNLOCK(bq);
+ }
+ if (qindex == QUEUE_CLEAN) {
+ bd = &bdclean[bp->b_domain];
+ if (bd->bd_lim != 0)
+ bq = &bd->bd_cpuq[PCPU_GET(cpuid)];
+ else
+ bq = &bd->bd_cleanq;
} else
- mtx_lock(nlock);
+ bq = &bqdirty;
+ BQ_LOCK(bq);
+ bq_insert(bq, bp, true);
+ BQ_UNLOCK(bq);
- if (bp->b_qindex != QUEUE_NONE)
- panic("binsfree: free buffer onto another queue???");
-
- bp->b_qindex = qindex;
- if (bp->b_flags & B_AGE)
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
- else
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-#ifdef INVARIANTS
- bq_len[bp->b_qindex]++;
-#endif
- mtx_unlock(nlock);
+ return;
}
/*
@@ -1404,10 +1402,9 @@ buf_free(struct buf *bp)
if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
bufkva_free(bp);
+ atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1);
BUF_UNLOCK(bp);
uma_zfree(buf_zone, bp);
- atomic_add_int(&numfreebuffers, 1);
- bufspace_wakeup();
}
/*
@@ -1424,15 +1421,15 @@ buf_import(void *arg, void **store, int cnt, int domai
struct buf *bp;
int i;
- mtx_lock(&bqlocks[QUEUE_EMPTY]);
+ BQ_LOCK(&bqempty);
for (i = 0; i < cnt; i++) {
- bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ bp = TAILQ_FIRST(&bqempty.bq_queue);
if (bp == NULL)
break;
- bremfreel(bp);
+ bq_remove(&bqempty, bp);
store[i] = bp;
}
- mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+ BQ_UNLOCK(&bqempty);
return (i);
}
@@ -1447,8 +1444,10 @@ buf_release(void *arg, void **store, int cnt)
{
int i;
+ BQ_LOCK(&bqempty);
for (i = 0; i < cnt; i++)
- binsfree(store[i], QUEUE_EMPTY);
+ bq_insert(&bqempty, store[i], false);
+ BQ_UNLOCK(&bqempty);
}
/*
@@ -1457,22 +1456,31 @@ buf_release(void *arg, void **store, int cnt)
* Allocate an empty buffer header.
*/
static struct buf *
-buf_alloc(void)
+buf_alloc(struct bufdomain *bd)
{
struct buf *bp;
+ int freebufs;
- bp = uma_zalloc(buf_zone, M_NOWAIT);
+ /*
+ * We can only run out of bufs in the buf zone if the average buf
+ * is less than BKVASIZE. In this case the actual wait/block will
+ * come from buf_reycle() failing to flush one of these small bufs.
+ */
+ bp = NULL;
+ freebufs = atomic_fetchadd_int(&bd->bd_freebuffers, -1);
+ if (freebufs > 0)
+ bp = uma_zalloc(buf_zone, M_NOWAIT);
if (bp == NULL) {
- bufspace_daemonwakeup();
- atomic_add_int(&numbufallocfails, 1);
+ atomic_fetchadd_int(&bd->bd_freebuffers, 1);
+ bufspace_daemonwakeup(bd);
+ counter_u64_add(numbufallocfails, 1);
return (NULL);
}
-
/*
- * Wake-up the bufspace daemon on transition.
+ * Wake-up the bufspace daemon on transition below threshold.
*/
- if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
- bufspace_daemonwakeup();
+ if (freebufs == bd->bd_lofreebuffers)
+ bufspace_daemonwakeup(bd);
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
@@ -1488,6 +1496,7 @@ buf_alloc(void)
KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+ bp->b_domain = BD_DOMAIN(bd);
bp->b_flags = 0;
bp->b_ioflags = 0;
bp->b_xflags = 0;
@@ -1512,22 +1521,26 @@ buf_alloc(void)
}
/*
- * buf_qrecycle:
+ * buf_recycle:
*
* Free a buffer from the given bufqueue. kva controls whether the
* freed buf must own some kva resources. This is used for
* defragmenting.
*/
static int
-buf_qrecycle(int qindex, bool kva)
+buf_recycle(struct bufdomain *bd, bool kva)
{
+ struct bufqueue *bq;
struct buf *bp, *nbp;
if (kva)
- atomic_add_int(&bufdefragcnt, 1);
+ counter_u64_add(bufdefragcnt, 1);
nbp = NULL;
- mtx_lock(&bqlocks[qindex]);
- nbp = TAILQ_FIRST(&bufqueues[qindex]);
+ bq = &bd->bd_cleanq;
+ BQ_LOCK(bq);
+ KASSERT(BQ_LOCKPTR(bq) == BD_LOCKPTR(bd),
+ ("buf_recycle: Locks don't match"));
+ nbp = TAILQ_FIRST(&bq->bq_queue);
/*
* Run scan, possibly freeing data and/or kva mappings on the fly
@@ -1551,6 +1564,18 @@ buf_qrecycle(int qindex, bool kva)
continue;
/*
+ * Implement a second chance algorithm for frequently
+ * accessed buffers.
+ */
+ if ((bp->b_flags & B_REUSE) != 0) {
+ TAILQ_REMOVE(&bq->bq_queue, bp, b_freelist);
+ TAILQ_INSERT_TAIL(&bq->bq_queue, bp, b_freelist);
+ bp->b_flags &= ~B_REUSE;
+ BUF_UNLOCK(bp);
+ continue;
+ }
+
+ /*
* Skip buffers with background writes in progress.
*/
if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
@@ -1558,14 +1583,18 @@ buf_qrecycle(int qindex, bool kva)
continue;
}
- KASSERT(bp->b_qindex == qindex,
- ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+ KASSERT(bp->b_qindex == QUEUE_CLEAN,
+ ("buf_recycle: inconsistent queue %d bp %p",
+ bp->b_qindex, bp));
+ KASSERT(bp->b_domain == BD_DOMAIN(bd),
+ ("getnewbuf: queue domain %d doesn't match request %ld",
+ bp->b_domain, BD_DOMAIN(bd)));
/*
* NOTE: nbp is now entirely invalid. We can only restart
* the scan from this point on.
*/
- bremfreel(bp);
- mtx_unlock(&bqlocks[qindex]);
+ bq_remove(bq, bp);
+ BQ_UNLOCK(bq);
/*
* Requeue the background write buffer with error and
@@ -1573,70 +1602,21 @@ buf_qrecycle(int qindex, bool kva)
*/
if ((bp->b_vflags & BV_BKGRDERR) != 0) {
bqrelse(bp);
- mtx_lock(&bqlocks[qindex]);
- nbp = TAILQ_FIRST(&bufqueues[qindex]);
+ BQ_LOCK(bq);
+ nbp = TAILQ_FIRST(&bq->bq_queue);
continue;
}
bp->b_flags |= B_INVAL;
brelse(bp);
return (0);
}
- mtx_unlock(&bqlocks[qindex]);
+ bd->bd_wanted = 1;
+ BQ_UNLOCK(bq);
return (ENOBUFS);
}
/*
- * buf_recycle:
- *
- * Iterate through all clean queues until we find a buf to recycle or
- * exhaust the search.
- */
-static int
-buf_recycle(bool kva)
-{
- int qindex, first_qindex;
-
- qindex = first_qindex = bqcleanq();
- do {
- if (buf_qrecycle(qindex, kva) == 0)
- return (0);
- if (++qindex == QUEUE_CLEAN + clean_queues)
- qindex = QUEUE_CLEAN;
- } while (qindex != first_qindex);
-
- return (ENOBUFS);
-}
-
-/*
- * buf_scan:
- *
- * Scan the clean queues looking for a buffer to recycle. needsbuffer
- * is set on failure so that the caller may optionally bufspace_wait()
- * in a race-free fashion.
- */
-static int
-buf_scan(bool defrag)
-{
- int error;
-
- /*
- * To avoid heavy synchronization and wakeup races we set
- * needsbuffer and re-poll before failing. This ensures that
- * no frees can be missed between an unsuccessful poll and
- * going to sleep in a synchronized fashion.
- */
- if ((error = buf_recycle(defrag)) != 0) {
- atomic_set_int(&needsbuffer, 1);
- bufspace_daemonwakeup();
- error = buf_recycle(defrag);
- }
- if (error == 0)
- atomic_add_int(&getnewbufrestarts, 1);
- return (error);
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-user
mailing list