svn commit: r205231 - in
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Kip Macy
kmacy at FreeBSD.org
Tue Mar 16 22:17:22 UTC 2010
Author: kmacy
Date: Tue Mar 16 22:17:21 2010
New Revision: 205231
URL: http://svn.freebsd.org/changeset/base/205231
Log:
- reduce contention by breaking up ARC state locks into 16 for data
and 16 for metadata
- export L2ARC tunables as sysctls
- add several kstats to track L2ARC state more precisely
- avoid holding a contended lock when atomically incrementing a
contended counter (no lock protection needed for atomics)
Modified:
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Tue Mar 16 21:44:21 2010 (r205230)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Tue Mar 16 22:17:21 2010 (r205231)
@@ -131,6 +131,7 @@
#include <sys/kstat.h>
#include <sys/sdt.h>
+#include <sys/ktr.h>
#include <vm/vm_pageout.h>
static kmutex_t arc_reclaim_thr_lock;
@@ -186,6 +187,11 @@ SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min,
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
&zfs_mdcomp_disable, 0, "Disable metadata compression");
+#ifdef ZIO_USE_UMA
+extern kmem_cache_t *zio_buf_cache[];
+extern kmem_cache_t *zio_data_buf_cache[];
+#endif
+
/*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
@@ -218,13 +224,31 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_di
* second level ARC benefit from these fast lookups.
*/
+#define ARCS_LOCK_PAD 128
+struct arcs_lock {
+ kmutex_t arcs_lock;
+#ifdef _KERNEL
+ unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
+#endif
+};
+
+/*
+ * Each list count below must be a power of two: a list is selected by
+ * masking the buffer hash (or a rotating index) with (count - 1).
+ */
+#define ARC_BUFC_NUMDATALISTS 16
+#define ARC_BUFC_NUMMETADATALISTS 16
+#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS+ARC_BUFC_NUMDATALISTS)
+
typedef struct arc_state {
- list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
- kmutex_t arcs_mtx;
+ list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
+ struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(128);
} arc_state_t;
+#define ARCS_LOCK(s, i) &((s)->arcs_locks[(i)].arcs_lock)
+
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@@ -248,7 +272,9 @@ typedef struct arc_stats {
kstat_named_t arcstat_mru_ghost_hits;
kstat_named_t arcstat_mfu_hits;
kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_allocated;
kstat_named_t arcstat_deleted;
+ kstat_named_t arcstat_stolen;
kstat_named_t arcstat_recycle_miss;
kstat_named_t arcstat_mutex_miss;
kstat_named_t arcstat_evict_skip;
@@ -280,6 +306,19 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_hdr_size;
kstat_named_t arcstat_memory_throttle_count;
+ kstat_named_t arcstat_l2_write_trylock_fail;
+ kstat_named_t arcstat_l2_write_in_l2;
+ kstat_named_t arcstat_l2_write_passed_headroom;
+ kstat_named_t arcstat_l2_write_spa_mismatch;
+ kstat_named_t arcstat_l2_write_hdr_io_in_progress;
+ kstat_named_t arcstat_l2_write_not_cacheable;
+ kstat_named_t arcstat_l2_write_full;
+ kstat_named_t arcstat_l2_write_buffer_iter;
+ kstat_named_t arcstat_l2_write_pios;
+ kstat_named_t arcstat_l2_write_bytes_written;
+ kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
+ kstat_named_t arcstat_l2_write_buffer_list_iter;
+ kstat_named_t arcstat_l2_write_buffer_list_null_iter;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -297,7 +336,9 @@ static arc_stats_t arc_stats = {
{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
{ "mfu_hits", KSTAT_DATA_UINT64 },
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "allocated", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
+ { "stolen", KSTAT_DATA_UINT64 },
{ "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
@@ -328,7 +369,20 @@ static arc_stats_t arc_stats = {
{ "l2_io_error", KSTAT_DATA_UINT64 },
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "memory_throttle_count", KSTAT_DATA_UINT64 }
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
+ { "l2_write_in_l2", KSTAT_DATA_UINT64 },
+ { "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
+ { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
+ { "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
+ { "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
+ { "l2_write_full", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
+ { "l2_write_pios", KSTAT_DATA_UINT64 },
+ { "l2_write_bytes_written", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
+ { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -550,9 +604,10 @@ extern kmem_cache_t *zio_data_buf_cache[
* Level 2 ARC
*/
-#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 4 /* num of writes */
+#define L2ARC_WRITE_SIZE (64 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 128 /* num of writes */
#define L2ARC_FEED_SECS 1 /* caching interval */
+#define L2ARC_FEED_SECS_SHIFT 1 /* caching interval shift */
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
@@ -564,7 +619,66 @@ uint64_t l2arc_write_max = L2ARC_WRITE_S
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
-boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+uint64_t l2arc_feed_secs_shift = L2ARC_FEED_SECS_SHIFT; /* interval seconds shift */
+boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */
+
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+ &l2arc_write_max, 0, "max write size");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+ &l2arc_write_boost, 0, "extra write during warmup");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+ &l2arc_headroom, 0, "number of dev writes");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+ &l2arc_feed_secs, 0, "interval seconds");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs_shift, CTLFLAG_RW,
+ &l2arc_feed_secs_shift, 0, "power of 2 division of feed seconds");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+ &l2arc_noprefetch, 0, "don't cache prefetch bufs");
+
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+ &ARC_anon.arcs_size, 0, "size of anonymous state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
+ &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
+ &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+ &ARC_mru.arcs_size, 0, "size of mru state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
+ &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
+ &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+ "size of metadata in mru ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+ "size of data in mru ghost state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+ &ARC_mfu.arcs_size, 0, "size of mfu state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
+ &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
+ &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+ "size of metadata in mfu ghost state");
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+ "size of data in mfu ghost state");
+
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+ &ARC_l2c_only.arcs_size, 0, "size of mru state");
/*
* L2ARC Internals
@@ -958,20 +1072,42 @@ arc_buf_freeze(arc_buf_t *buf)
}
static void
+get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
+{
+ uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
+
+ if (ab->b_type == ARC_BUFC_METADATA)
+ buf_hashid &= (ARC_BUFC_NUMMETADATALISTS-1);
+ else {
+ buf_hashid &= (ARC_BUFC_NUMDATALISTS-1);
+ buf_hashid += ARC_BUFC_NUMMETADATALISTS;
+ }
+
+ *list = &state->arcs_lists[buf_hashid];
+ *lock = ARCS_LOCK(state, buf_hashid);
+}
+
+
+static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
+
ASSERT(MUTEX_HELD(hash_lock));
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
(ab->b_state != arc_anon)) {
+ list_t *list;
+ kmutex_t *lock;
uint64_t delta = ab->b_size * ab->b_datacnt;
- list_t *list = &ab->b_state->arcs_list[ab->b_type];
uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
- ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
- mutex_enter(&ab->b_state->arcs_mtx);
+ get_buf_info(ab, ab->b_state, &list, &lock);
+ ASSERT(!MUTEX_HELD(lock));
+ mutex_enter(lock);
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(list, ab);
+ mutex_exit(lock);
+
if (GHOST_STATE(ab->b_state)) {
ASSERT3U(ab->b_datacnt, ==, 0);
ASSERT3P(ab->b_buf, ==, NULL);
@@ -980,7 +1116,6 @@ add_reference(arc_buf_hdr_t *ab, kmutex_
ASSERT(delta > 0);
ASSERT3U(*size, >=, delta);
atomic_add_64(size, -delta);
- mutex_exit(&ab->b_state->arcs_mtx);
/* remove the prefetch flag if we get a reference */
if (ab->b_flags & ARC_PREFETCH)
ab->b_flags &= ~ARC_PREFETCH;
@@ -999,14 +1134,19 @@ remove_reference(arc_buf_hdr_t *ab, kmut
if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
uint64_t *size = &state->arcs_lsize[ab->b_type];
+ list_t *list;
+ kmutex_t *lock;
+
+ get_buf_info(ab, state, &list, &lock);
- ASSERT(!MUTEX_HELD(&state->arcs_mtx));
- mutex_enter(&state->arcs_mtx);
+ ASSERT(!MUTEX_HELD(lock));
+ mutex_enter(lock);
ASSERT(!list_link_active(&ab->b_arc_node));
- list_insert_head(&state->arcs_list[ab->b_type], ab);
+ list_insert_head(list, ab);
+ mutex_exit(lock);
+
ASSERT(ab->b_datacnt > 0);
atomic_add_64(size, ab->b_size * ab->b_datacnt);
- mutex_exit(&state->arcs_mtx);
}
return (cnt);
}
@@ -1021,6 +1161,8 @@ arc_change_state(arc_state_t *new_state,
arc_state_t *old_state = ab->b_state;
int64_t refcnt = refcount_count(&ab->b_refcnt);
uint64_t from_delta, to_delta;
+ list_t *list;
+ kmutex_t *lock;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(new_state != old_state);
@@ -1035,14 +1177,17 @@ arc_change_state(arc_state_t *new_state,
*/
if (refcnt == 0) {
if (old_state != arc_anon) {
- int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
+ int use_mutex;
uint64_t *size = &old_state->arcs_lsize[ab->b_type];
+ get_buf_info(ab, old_state, &list, &lock);
+ use_mutex = !MUTEX_HELD(lock);
+
if (use_mutex)
- mutex_enter(&old_state->arcs_mtx);
+ mutex_enter(lock);
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&old_state->arcs_list[ab->b_type], ab);
+ list_remove(list, ab);
/*
* If prefetching out of the ghost cache,
@@ -1057,16 +1202,20 @@ arc_change_state(arc_state_t *new_state,
atomic_add_64(size, -from_delta);
if (use_mutex)
- mutex_exit(&old_state->arcs_mtx);
+ mutex_exit(lock);
}
if (new_state != arc_anon) {
- int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
+ int use_mutex;
uint64_t *size = &new_state->arcs_lsize[ab->b_type];
+ get_buf_info(ab, new_state, &list, &lock);
+ use_mutex = !MUTEX_HELD(lock);
+
+
if (use_mutex)
- mutex_enter(&new_state->arcs_mtx);
+ mutex_enter(lock);
- list_insert_head(&new_state->arcs_list[ab->b_type], ab);
+ list_insert_head(list, ab);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
@@ -1077,7 +1226,7 @@ arc_change_state(arc_state_t *new_state,
atomic_add_64(size, to_delta);
if (use_mutex)
- mutex_exit(&new_state->arcs_mtx);
+ mutex_exit(lock);
}
}
@@ -1467,21 +1616,49 @@ arc_evict(arc_state_t *state, spa_t *spa
{
arc_state_t *evicted_state;
uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
+ int64_t bytes_remaining;
arc_buf_hdr_t *ab, *ab_prev = NULL;
- list_t *list = &state->arcs_list[type];
+ list_t *evicted_list, *list, *evicted_list_start, *list_start;
+ kmutex_t *lock, *evicted_lock;
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
+ static int evict_metadata_offset, evict_data_offset;
+ int i, idx, offset, list_count, count;
ASSERT(state == arc_mru || state == arc_mfu);
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ if (type == ARC_BUFC_METADATA) {
+ offset = 0;
+ list_count = ARC_BUFC_NUMMETADATALISTS;
+ list_start = &state->arcs_lists[0];
+ evicted_list_start = &evicted_state->arcs_lists[0];
+ idx = evict_metadata_offset;
+ } else {
+ offset = ARC_BUFC_NUMMETADATALISTS;
+
+ list_start = &state->arcs_lists[offset];
+ evicted_list_start = &evicted_state->arcs_lists[offset];
+ list_count = ARC_BUFC_NUMDATALISTS;
+ idx = evict_data_offset;
+ }
+ bytes_remaining = evicted_state->arcs_lsize[type];
+ count = 0;
+
+evict_start:
+ list = &list_start[idx];
+ evicted_list = &evicted_list_start[idx];
+ lock = ARCS_LOCK(state, (offset + idx));
+ evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
- mutex_enter(&state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
+ mutex_enter(lock);
+ mutex_enter(evicted_lock);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
+ bytes_remaining -= (ab->b_size * ab->b_datacnt);
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(ab) ||
(spa && ab->b_spa != spa) ||
@@ -1541,18 +1718,36 @@ arc_evict(arc_state_t *state, spa_t *spa
mutex_exit(hash_lock);
if (bytes >= 0 && bytes_evicted >= bytes)
break;
+ if (bytes_remaining > 0) {
+ mutex_exit(evicted_lock);
+ mutex_exit(lock);
+ idx = ((idx + 1)&(list_count-1));
+ count++;
+ goto evict_start;
+ }
} else {
missed += 1;
}
}
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&state->arcs_mtx);
-
- if (bytes_evicted < bytes)
- dprintf("only evicted %lld bytes from %x",
- (longlong_t)bytes_evicted, state);
+ mutex_exit(evicted_lock);
+ mutex_exit(lock);
+
+ idx = ((idx + 1)&(list_count-1));
+ count++;
+ if (bytes_evicted < bytes) {
+ if (count < list_count)
+ goto evict_start;
+ else
+ dprintf("only evicted %lld bytes from %x",
+ (longlong_t)bytes_evicted, state);
+ }
+ if (type == ARC_BUFC_METADATA)
+ evict_metadata_offset = idx;
+ else
+ evict_data_offset = idx;
+
if (skipped)
ARCSTAT_INCR(arcstat_evict_skip, skipped);
@@ -1579,6 +1774,8 @@ arc_evict(arc_state_t *state, spa_t *spa
arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
}
}
+ if (stolen)
+ ARCSTAT_BUMP(arcstat_stolen);
return (stolen);
}
@@ -1591,14 +1788,28 @@ static void
arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
{
arc_buf_hdr_t *ab, *ab_prev;
- list_t *list = &state->arcs_list[ARC_BUFC_DATA];
- kmutex_t *hash_lock;
+ list_t *list, *list_start;
+ kmutex_t *hash_lock, *lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
+ static int evict_offset;
+ int list_count, idx = evict_offset;
+ int offset, count = 0;
ASSERT(GHOST_STATE(state));
-top:
- mutex_enter(&state->arcs_mtx);
+
+ /*
+ * data lists come after metadata lists
+ */
+ list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
+ list_count = ARC_BUFC_NUMDATALISTS;
+ offset = ARC_BUFC_NUMMETADATALISTS;
+
+evict_start:
+ list = &list_start[idx];
+ lock = ARCS_LOCK(state, idx + offset);
+
+ mutex_enter(lock);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
if (spa && ab->b_spa != spa)
@@ -1628,20 +1839,31 @@ top:
break;
} else {
if (bytes < 0) {
- mutex_exit(&state->arcs_mtx);
+ /*
+ * we're draining the ARC, retry
+ */
+ mutex_exit(lock);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
- goto top;
+ goto evict_start;
}
bufs_skipped += 1;
}
}
- mutex_exit(&state->arcs_mtx);
-
- if (list == &state->arcs_list[ARC_BUFC_DATA] &&
+ mutex_exit(lock);
+ idx = ((idx + 1)&(ARC_BUFC_NUMDATALISTS-1));
+ count++;
+
+ if (count < list_count)
+ goto evict_start;
+
+ evict_offset = idx;
+ if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
(bytes < 0 || bytes_deleted < bytes)) {
- list = &state->arcs_list[ARC_BUFC_METADATA];
- goto top;
+ list_start = &state->arcs_lists[0];
+ list_count = ARC_BUFC_NUMMETADATALISTS;
+ offset = count = 0;
+ goto evict_start;
}
if (bufs_skipped) {
@@ -1755,22 +1977,22 @@ restart:
void
arc_flush(spa_t *spa)
{
- while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
+ while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
if (spa)
break;
}
- while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
+ while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
if (spa)
break;
}
- while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
+ while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
if (spa)
break;
}
- while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
+ while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
if (spa)
break;
@@ -2206,6 +2428,7 @@ out:
arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
arc_p = MIN(arc_c, arc_p + size);
}
+ ARCSTAT_BUMP(arcstat_allocated);
}
/*
@@ -2391,7 +2614,10 @@ arc_read_done(zio_t *zio)
hdr->b_flags &= ~ARC_L2_EVICTED;
if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
hdr->b_flags &= ~ARC_L2CACHE;
-
+#if 0
+ else if ((hdr->b_flags & ARC_PREFETCH) == 0)
+ hdr->b_flags |= ARC_L2CACHE;
+#endif
/* byteswap if necessary */
callback_list = hdr->b_acb;
ASSERT(callback_list != NULL);
@@ -2505,6 +2731,7 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_
uint32_t *arc_flags, const zbookmark_t *zb)
{
int err;
+ arc_buf_hdr_t *hdr = pbuf->b_hdr;
ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
@@ -2513,8 +2740,8 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_
err = arc_read_nolock(pio, spa, bp, done, private, priority,
zio_flags, arc_flags, zb);
+ ASSERT3P(hdr, ==, pbuf->b_hdr);
rw_exit(&pbuf->b_lock);
-
return (err);
}
@@ -2825,7 +3052,9 @@ arc_buf_evict(arc_buf_t *buf)
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
arc_buf_t **bufp;
-
+ list_t *list, *evicted_list;
+ kmutex_t *lock, *evicted_lock;
+
rw_enter(&buf->b_lock, RW_WRITER);
hdr = buf->b_hdr;
if (hdr == NULL) {
@@ -2873,16 +3102,18 @@ arc_buf_evict(arc_buf_t *buf)
evicted_state =
(old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
- mutex_enter(&old_state->arcs_mtx);
- mutex_enter(&evicted_state->arcs_mtx);
+ get_buf_info(hdr, old_state, &list, &lock);
+ get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
+ mutex_enter(lock);
+ mutex_enter(evicted_lock);
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
hdr->b_flags |= ARC_IN_HASH_TABLE;
hdr->b_flags &= ~ARC_BUF_AVAILABLE;
- mutex_exit(&evicted_state->arcs_mtx);
- mutex_exit(&old_state->arcs_mtx);
+ mutex_exit(evicted_lock);
+ mutex_exit(lock);
}
mutex_exit(hash_lock);
rw_exit(&buf->b_lock);
@@ -3428,7 +3659,8 @@ void
arc_init(void)
{
int prefetch_tunable_set = 0;
-
+ int i;
+
mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -3496,33 +3728,34 @@ arc_init(void)
arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
- mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
+
+ mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&arc_mru->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mru_ghost->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_mfu_ghost->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_lists[i],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ }
buf_init();
@@ -3596,7 +3829,8 @@ arc_init(void)
void
arc_fini(void)
{
-
+ int i;
+
mutex_enter(&arc_reclaim_thr_lock);
arc_thread_exit = 1;
cv_signal(&arc_reclaim_thr_cv);
@@ -3617,21 +3851,19 @@ arc_fini(void)
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
- list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
- list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
- list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
- list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
- list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
- list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
- list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
- list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
-
- mutex_destroy(&arc_anon->arcs_mtx);
- mutex_destroy(&arc_mru->arcs_mtx);
- mutex_destroy(&arc_mru_ghost->arcs_mtx);
- mutex_destroy(&arc_mfu->arcs_mtx);
- mutex_destroy(&arc_mfu_ghost->arcs_mtx);
-
+ for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
+ list_destroy(&arc_mru->arcs_lists[i]);
+ list_destroy(&arc_mru_ghost->arcs_lists[i]);
+ list_destroy(&arc_mfu->arcs_lists[i]);
+ list_destroy(&arc_mfu_ghost->arcs_lists[i]);
+
+ mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
+ mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
+ }
+
mutex_destroy(&zfs_write_limit_lock);
buf_fini();
@@ -4026,28 +4258,31 @@ static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
list_t *list;
+ int idx;
+
+ ASSERT(list_num >= 0 && list_num < 2*ARC_BUFC_NUMLISTS);
- ASSERT(list_num >= 0 && list_num <= 3);
-
- switch (list_num) {
- case 0:
- list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
- *lock = &arc_mfu->arcs_mtx;
- break;
- case 1:
- list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
- *lock = &arc_mru->arcs_mtx;
- break;
- case 2:
- list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
- *lock = &arc_mfu->arcs_mtx;
- break;
- case 3:
- list = &arc_mru->arcs_list[ARC_BUFC_DATA];
- *lock = &arc_mru->arcs_mtx;
- break;
+ if (list_num < ARC_BUFC_NUMMETADATALISTS) {
+ idx = list_num;
+ list = &arc_mfu->arcs_lists[idx];
+ *lock = ARCS_LOCK(arc_mfu, idx);
+ } else if (list_num < ARC_BUFC_NUMMETADATALISTS*2) {
+ idx = list_num - ARC_BUFC_NUMMETADATALISTS;
+ list = &arc_mru->arcs_lists[idx];
+ *lock = ARCS_LOCK(arc_mru, idx);
+ } else if (list_num < (ARC_BUFC_NUMMETADATALISTS*2 +
+ ARC_BUFC_NUMDATALISTS)) {
+ idx = list_num - ARC_BUFC_NUMMETADATALISTS;
+ list = &arc_mfu->arcs_lists[idx];
+ *lock = ARCS_LOCK(arc_mfu, idx);
+ } else {
+ idx = list_num - ARC_BUFC_NUMLISTS;
+ list = &arc_mru->arcs_lists[idx];
+ *lock = ARCS_LOCK(arc_mru, idx);
}
+ CTR3(KTR_SPARE2, "list=%p list_num=%d idx=%d",
+ list, list_num, idx);
ASSERT(!(MUTEX_HELD(*lock)));
mutex_enter(*lock);
return (list);
@@ -4212,13 +4447,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
head->b_flags |= ARC_L2_WRITE_HEAD;
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
/*
* Copy buffers for L2ARC writing.
*/
mutex_enter(&l2arc_buflist_mtx);
- for (try = 0; try <= 3; try++) {
+ for (try = 0; try < 2*ARC_BUFC_NUMLISTS; try++) {
list = l2arc_list_locked(try, &list_lock);
passed_sz = 0;
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
/*
* L2ARC fast warmup.
@@ -4231,52 +4468,66 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
ab = list_head(list);
else
ab = list_tail(list);
+ if (ab == NULL) {
+ ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
+ }
for (; ab; ab = ab_prev) {
if (arc_warm == B_FALSE)
ab_prev = list_next(list, ab);
else
ab_prev = list_prev(list, ab);
-
+ ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
+
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (!have_lock && !mutex_tryenter(hash_lock)) {
+ ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
/*
* Skip this buffer rather than waiting.
*/
continue;
}
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * Already in L2ARC.
+ */
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_in_l2);
+ continue;
+ }
+
passed_sz += ab->b_size;
if (passed_sz > headroom) {
/*
* Searched too far.
*/
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
break;
}
if (ab->b_spa != spa) {
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
continue;
}
- if (ab->b_l2hdr != NULL) {
- /*
- * Already in L2ARC.
- */
+ if (HDR_IO_IN_PROGRESS(ab)) {
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
continue;
}
-
- if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
+ if (!HDR_L2CACHE(ab)) {
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
continue;
}
-
if ((write_sz + ab->b_size) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_l2_write_full);
break;
}
@@ -4300,8 +4551,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
cb->l2wcb_head = head;
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
+ ARCSTAT_BUMP(arcstat_l2_write_pios);
}
+ ARCSTAT_INCR(arcstat_l2_write_bytes_written, ab->b_size);
/*
* Create and add a new L2ARC header.
*/
@@ -4309,7 +4562,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
hdrl2->b_dev = dev;
hdrl2->b_daddr = dev->l2ad_hand;
- ab->b_flags |= ARC_L2_WRITING;
ab->b_l2hdr = hdrl2;
list_insert_head(dev->l2ad_buflist, ab);
buf_data = ab->b_buf->b_data;
@@ -4397,7 +4649,7 @@ l2arc_feed_thread(void *dummy __unused)
*/
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
- hz * l2arc_feed_secs);
+ hz * l2arc_feed_secs >> l2arc_feed_secs_shift);
CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
/*
Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Tue Mar 16 21:44:21 2010 (r205230)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Tue Mar 16 22:17:21 2010 (r205231)
@@ -55,8 +55,8 @@ struct arc_buf {
};
typedef enum arc_buf_contents {
- ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_METADATA, /* buffer contains metadata */
+ ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_NUMTYPES
} arc_buf_contents_t;
/*
More information about the svn-src-head
mailing list