svn commit: r198458 - in user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Kip Macy
kmacy at FreeBSD.org
Sun Oct 25 00:42:03 UTC 2009
Author: kmacy
Date: Sun Oct 25 00:42:03 2009
New Revision: 198458
URL: http://svn.freebsd.org/changeset/base/198458
Log:
initial support for backing the ARC cache by the page cache
Modified:
user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Sun Oct 25 00:37:59 2009 (r198457)
+++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Sun Oct 25 00:42:03 2009 (r198458)
@@ -126,6 +126,7 @@
#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
+#include <sys/ktr.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -258,6 +259,7 @@ static arc_state_t ARC_l2c_only;
typedef struct arc_stats {
kstat_named_t arcstat_hits;
+ kstat_named_t arcstat_page_cache_hits;
kstat_named_t arcstat_misses;
kstat_named_t arcstat_demand_data_hits;
kstat_named_t arcstat_demand_data_misses;
@@ -307,6 +309,7 @@ typedef struct arc_stats {
static arc_stats_t arc_stats = {
{ "hits", KSTAT_DATA_UINT64 },
+ { "page_cache_hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "demand_data_hits", KSTAT_DATA_UINT64 },
{ "demand_data_misses", KSTAT_DATA_UINT64 },
@@ -512,6 +515,7 @@ static void arc_evict_ghost(arc_state_t
#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
#define ARC_STORED (1 << 19) /* has been store()d to */
+#define ARC_BUF_CLONING (1 << 21) /* is being cloned */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -632,9 +636,10 @@ struct l2arc_buf_hdr {
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
+ arc_buf_t *l2df_buf;
void *l2df_data;
size_t l2df_size;
- void (*l2df_func)(void *, size_t);
+ void (*l2df_func)(arc_buf_t *, void *, size_t);
list_node_t l2df_list_node;
} l2arc_data_free_t;
@@ -1190,8 +1195,8 @@ arc_data_buf_free(void *buf, uint64_t si
atomic_add_64(&arc_size, -size);
}
-arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+static arc_buf_t *
+_arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type, dva_t dva)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
@@ -1201,6 +1206,7 @@ arc_buf_alloc(spa_t *spa, int size, void
ASSERT(BUF_EMPTY(hdr));
hdr->b_size = size;
hdr->b_type = type;
+ hdr->b_dva = dva;
hdr->b_spa = spa;
hdr->b_state = arc_anon;
hdr->b_arc_access = 0;
@@ -1220,6 +1226,14 @@ arc_buf_alloc(spa_t *spa, int size, void
return (buf);
}
+arc_buf_t *
+arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+{
+ dva_t dva = {0ULL, 0ULL};
+
+ return (_arc_buf_alloc(spa, size, tag, type, dva));
+}
+
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
@@ -1234,8 +1248,11 @@ arc_buf_clone(arc_buf_t *from)
buf->b_private = NULL;
buf->b_next = hdr->b_buf;
hdr->b_buf = buf;
+ hdr->b_flags |= ARC_BUF_CLONING;
arc_get_data_buf(buf);
+#ifdef nomore
bcopy(from->b_data, buf->b_data, size);
+#endif
hdr->b_datacnt += 1;
return (buf);
}
@@ -1272,17 +1289,95 @@ arc_buf_add_ref(arc_buf_t *buf, void* ta
data, metadata, hits);
}
+static void
+arc_getblk(arc_buf_t *buf)
+{
+ uint64_t size = buf->b_hdr->b_size;
+ arc_buf_contents_t type = buf->b_hdr->b_type;
+ spa_t *spa = buf->b_hdr->b_spa;
+ off_t blkno = buf->b_hdr->b_dva.dva_word[1] & ~(1UL<<63);
+ struct buf *newbp, *bp;
+ arc_buf_t *tbuf;
+ struct vnode *vp;
+ int flags = 0;
+
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_consume(size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ flags = GB_NODUMP;
+ atomic_add_64(&arc_size, size);
+ }
+
+ if (buf->b_hdr->b_flags & ARC_BUF_CLONING) {
+ newbp = geteblk(size, flags);
+ tbuf = buf;
+
+ while (tbuf->b_next != NULL)
+ tbuf = tbuf->b_next;
+ bp = tbuf->b_bp;
+ vp = spa_get_vnode(spa);
+
+ KASSERT((bp->b_blkno == bp->b_lblkno) &&
+ (bp->b_blkno == blkno),
+ ("blkno mismatch b_blkno %ld b_lblkno %ld blkno %ld",
+ bp->b_blkno, bp->b_lblkno, blkno));
+ newbp->b_bufobj = &vp->v_bufobj;
+ newbp->b_lblkno = blkno;
+ newbp->b_blkno = blkno;
+ newbp->b_offset = (blkno<<9);
+
+ if (bp->b_vp != NULL) {
+ KASSERT(bp->b_xflags & BX_VNCLEAN, ("brelvp() on buffer that is not in splay"));
+ brelvp(bp);
+ }
+
+ BO_LOCK(&vp->v_bufobj);
+ bgetvp(vp, newbp);
+ BO_UNLOCK(&vp->v_bufobj);
+ newbp->b_flags &= ~B_INVAL;
+ newbp->b_flags |= B_CACHE;
+ bp->b_flags |= B_INVAL;
+ bp->b_flags &= ~B_CACHE;
+ bcopy(bp->b_data, newbp->b_data, size);
+ buf->b_hdr->b_flags &= ~ARC_BUF_CLONING;
+
+ } else if (BUF_EMPTY(buf->b_hdr)) {
+ newbp = geteblk(size, flags);
+ } else
+ newbp = getblk(spa_get_vnode(spa), blkno,
+ size, 0, 0, flags);
+ CTR2(KTR_BUF, "arc_getblk() bp=%p flags %X",
+ newbp, newbp->b_flags);
+
+ BUF_KERNPROC(newbp);
+ buf->b_bp = newbp;
+ buf->b_data = newbp->b_data;
+}
+
+static void
+arc_brelse(arc_buf_t *buf, void *data, size_t size)
+{
+
+#ifdef INVARIANTS
+ if (buf->b_bp->b_vp)
+ KASSERT(buf->b_bp->b_xflags & BX_VNCLEAN, ("brelse() on buffer that is not in splay"));
+#endif
+ brelse(buf->b_bp);
+}
+
/*
* Free the arc data buffer. If it is an l2arc write in progress,
* the buffer is placed on l2arc_free_on_write to be freed later.
*/
static void
-arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
- void *data, size_t size)
+arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(arc_buf_t *, void *, size_t),
+ arc_buf_t *buf, void *data, size_t size)
{
if (HDR_L2_WRITING(hdr)) {
l2arc_data_free_t *df;
df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df->l2df_buf = buf;
df->l2df_data = data;
df->l2df_size = size;
df->l2df_func = free_func;
@@ -1291,7 +1386,7 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, vo
mutex_exit(&l2arc_free_on_write_mtx);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
- free_func(data, size);
+ free_func(buf, data, size);
}
}
@@ -1309,13 +1404,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_
arc_cksum_verify(buf);
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
- arc_buf_data_free(buf->b_hdr, zio_buf_free,
- buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr, arc_brelse,
+ buf, buf->b_data, size);
arc_space_return(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
- arc_buf_data_free(buf->b_hdr,
- zio_data_buf_free, buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr, arc_brelse,
+ buf, buf->b_data, size);
atomic_add_64(&arc_size, -size);
}
}
@@ -1514,7 +1609,7 @@ arc_buf_size(arc_buf_t *buf)
* it can't get a hash_lock on, and so may not catch all candidates.
* It may also return without evicting as much space as requested.
*/
-static void *
+static struct buf *
arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
arc_buf_contents_t type)
{
@@ -1526,11 +1621,12 @@ arc_evict(arc_state_t *state, spa_t *spa
kmutex_t *lock, *evicted_lock;
kmutex_t *hash_lock;
boolean_t have_lock;
- void *stolen = NULL;
+ struct buf *stolen = NULL;
static int evict_metadata_offset, evict_data_offset;
int idx, offset, list_count, count;
ASSERT(state == arc_mru || state == arc_mfu);
+ ASSERT(recycle == FALSE);
evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
@@ -1598,7 +1694,7 @@ evict_start:
if (buf->b_efunc) {
mutex_enter(&arc_eviction_mtx);
arc_buf_destroy(buf,
- buf->b_data == stolen, FALSE);
+ buf->b_bp == stolen, FALSE);
ab->b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
@@ -1608,7 +1704,7 @@ evict_start:
} else {
rw_exit(&buf->b_lock);
arc_buf_destroy(buf,
- buf->b_data == stolen, TRUE);
+ buf->b_bp == stolen, TRUE);
}
}
if (ab->b_datacnt == 0) {
@@ -2267,14 +2363,7 @@ arc_get_data_buf(arc_buf_t *buf)
* just allocate a new buffer.
*/
if (!arc_evict_needed(type)) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- atomic_add_64(&arc_size, size);
- }
+ arc_getblk(buf);
goto out;
}
@@ -2297,17 +2386,8 @@ arc_get_data_buf(arc_buf_t *buf)
state = (arc_mru->arcs_lsize[type] > 0 &&
mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
}
- if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- atomic_add_64(&arc_size, size);
- }
- ARCSTAT_BUMP(arcstat_recycle_miss);
- }
+ (void) arc_evict(state, NULL, size, FALSE, type);
+ arc_getblk(buf);
ASSERT(buf->b_data != NULL);
out:
/*
@@ -2528,6 +2608,9 @@ arc_read_done(zio_t *zio)
arc_cksum_compute(buf, B_FALSE);
+ buf->b_bp->b_flags &= ~B_INVAL;
+ buf->b_bp->b_flags |= B_CACHE;
+
/* create copies of the data buffer for the callers */
abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
@@ -2734,9 +2817,9 @@ top:
/* this block is not in the cache */
arc_buf_hdr_t *exists;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
- buf = arc_buf_alloc(spa, size, private, type);
+ buf = _arc_buf_alloc(spa, size, private, type,
+ *BP_IDENTITY(bp));
hdr = buf->b_hdr;
- hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_birth = bp->blk_birth;
hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
exists = buf_hash_insert(hdr, &hash_lock);
@@ -2783,7 +2866,19 @@ top:
arc_get_data_buf(buf);
ASSERT(hdr->b_datacnt == 0);
hdr->b_datacnt = 1;
-
+ }
+ /*
+ * We hit in the page cache
+ *
+ */
+ if ((buf->b_bp->b_flags & (B_CACHE|B_INVAL)) == B_CACHE) {
+ /*
+ * track the number of times
+ * the buffer was found in the cache
+ */
+ ARCSTAT_BUMP(arcstat_page_cache_hits);
+ mutex_exit(hash_lock);
+ goto top;
}
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -3211,7 +3306,6 @@ arc_write_done(zio_t *zio)
arc_buf_hdr_t *hdr = buf->b_hdr;
hdr->b_acb = NULL;
-
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = zio->io_bp->blk_birth;
hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
@@ -3224,7 +3318,32 @@ arc_write_done(zio_t *zio)
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
+ /*
+ * Associate buffer with offset in the page cache
+ */
+ struct buf *bp = buf->b_bp;
+ struct vnode *vp = spa_get_vnode(hdr->b_spa);
+ off_t blkno = hdr->b_dva.dva_word[1] & ~(1UL<<63);
+
+ CTR2(KTR_BUF, "arc_write_done(%p) flags %X",
+ bp, bp->b_flags);
+
+ if ((hdr->b_buf == buf) &&
+ (bp->b_bufobj == NULL)) {
+
+ bp->b_bufobj = &vp->v_bufobj;
+ bp->b_lblkno = blkno;
+ bp->b_blkno = blkno;
+ bp->b_offset = (blkno << 9);
+ BO_LOCK(bp->b_bufobj);
+ bgetvp(vp, bp);
+ BO_UNLOCK(bp->b_bufobj);
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_CACHE;
+ }
+ /*
+ */
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
@@ -3987,7 +4106,7 @@ l2arc_do_free_on_write()
df_prev = list_prev(buflist, df);
ASSERT(df->l2df_data != NULL);
ASSERT(df->l2df_func != NULL);
- df->l2df_func(df->l2df_data, df->l2df_size);
+ df->l2df_func(df->l2df_buf, df->l2df_data, df->l2df_size);
list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c Sun Oct 25 00:37:59 2009 (r198457)
+++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c Sun Oct 25 00:42:03 2009 (r198458)
@@ -4299,3 +4299,10 @@ done:
#endif
#endif
}
+
+struct vnode *
+spa_get_vnode(spa_t *spa)
+{
+
+ return (spa->spa_root_vdev->vdev_vnode);
+}
Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
==============================================================================
--- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Sun Oct 25 00:37:59 2009 (r198457)
+++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Sun Oct 25 00:42:03 2009 (r198458)
@@ -52,6 +52,7 @@ struct arc_buf {
void *b_data;
arc_evict_func_t *b_efunc;
void *b_private;
+ struct buf *b_bp;
};
typedef enum arc_buf_contents {
Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
==============================================================================
--- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h Sun Oct 25 00:37:59 2009 (r198457)
+++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h Sun Oct 25 00:42:03 2009 (r198458)
@@ -534,6 +534,8 @@ extern void spa_prop_clear_bootfs(spa_t
/* asynchronous event notification */
extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name);
+extern struct vnode *spa_get_vnode(spa_t *spa);
+
#ifdef ZFS_DEBUG
#define dprintf_bp(bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Sun Oct 25 00:37:59 2009 (r198457)
+++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Sun Oct 25 00:42:03 2009 (r198458)
@@ -144,7 +144,8 @@ struct vdev {
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
-
+ struct vnode *vdev_vnode; /* container for page cache */
+
/*
* Leaf vdev state.
*/
Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
==============================================================================
--- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c Sun Oct 25 00:37:59 2009 (r198457)
+++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c Sun Oct 25 00:42:03 2009 (r198458)
@@ -1060,10 +1060,24 @@ vdev_open(vdev_t *vd)
* inconsistently account for existing bp's.
*/
if (vd->vdev_top == vd) {
+ struct vnode *vp;
+
vd->vdev_deflate_ratio = (1<<17) /
(vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
+
}
+ if (vd->vdev_parent == NULL) {
+ struct vnode *vp;
+ error = getnewvnode("zpool" , NULL, &dead_vnodeops, &vp);
+ if (error != 0)
+ return (error);
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vnode_create_vobject(vp, 512, curthread);
+ vd->vdev_vnode = vp;
+ VOP_UNLOCK(vp, 0);
+ }
/*
* If a leaf vdev has a DTL, and seems healthy, then kick off a
* resilver. But don't do this if we are doing a reopen for a
@@ -1192,6 +1206,8 @@ vdev_close(vdev_t *vd)
else
vd->vdev_state = VDEV_STATE_CLOSED;
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+
+ vn_free(vd->vdev_vnode);
}
void
More information about the svn-src-user mailing list