svn commit: r200428 - in
user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs:
. sys
Kip Macy
kmacy at FreeBSD.org
Sat Dec 12 03:35:50 UTC 2009
Author: kmacy
Date: Sat Dec 12 03:35:49 2009
New Revision: 200428
URL: http://svn.freebsd.org/changeset/base/200428
Log:
checkpoint mostly complete state of ARC / VM integration
Modified:
user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_bio.h
user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_bio.c
user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Modified: user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_bio.h
==============================================================================
--- user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_bio.h Sat Dec 12 02:34:00 2009 (r200427)
+++ user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_bio.h Sat Dec 12 03:35:49 2009 (r200428)
@@ -34,7 +34,7 @@ $FreeBSD$
#define ZBIO_BUF_CLONING (1 << 30) /* is being cloned */
-void zbio_sync_cache(spa_t *spa, blkptr_t *bp, uint64_t txg, uint64_t size);
+void zbio_sync_cache(spa_t *spa, blkptr_t *bp, uint64_t txg, void *data, uint64_t size, int bio_op);
void zbio_getblk(arc_buf_t *buf);
void zbio_data_getblk(arc_buf_t *buf);
void zbio_relse(arc_buf_t *buf, size_t size);
Modified: user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_bio.c
==============================================================================
--- user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_bio.c Sat Dec 12 02:34:00 2009 (r200427)
+++ user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_bio.c Sat Dec 12 03:35:49 2009 (r200428)
@@ -27,6 +27,57 @@ POSSIBILITY OF SUCH DAMAGE.
***************************************************************************/
+/**************************************************************************
+This module integrates the caching of pages associated with ARC buffers in a
+per-SPA vm object. Each SPA also has an associated "zbio_state_t" which
+tracks bufs allocated for the SPA in two splay trees.
+
+The first splay tree tracks bufs by the data pointer's virtual address.
+It is used for malloc'ed buffers, and buffers that are VMIO but do not have
+any pages in the SPA's vm object(s).
+
+Buffers are malloced if:
+ 1) the size is not a multiple of PAGE_SIZE
+ 2) the buffer is cloned
+
+There are two reasons why a VMIO buf would not have any pages in the vm object:
+ 1) the buffer has not yet been assigned an address on disk (and thus
+ has no offset in the vm object)
+ 2) the buffer did have pages in the vm object, but they were evicted
+ and replaced by the pages of a newer buffer
+
+The second splay tree tracks buffers by block address and is only used
+to track buffers whose pages are referenced by the vm object. It is used to
+ensure that buffers that belong to an older transaction group don't have their
+pages mapped by buffers belonging to a newer transaction group.
+
+zfs_bio assumes that buffers that are cloned and buffers whose pages
+are evicted from the vm object are not used for I/O (will not be referenced
+from zfs_bio_sync_cache).
+
+Pages in the vm object are marked valid on completion of a read or before the
+initiation of a write.
+
+
+
+There are two places where we synchronize the ARC with the vm object's
+page cache: getblk and sync_cache.
+
+In getblk for a malloced buffer we check if the page at the corresponding offset
+is valid; if it is, map it in and copy it into the new buffer. For a VMIO buffer
+we need to remove the pages for any existing overlapping buffers and free any
+other pages in the vm object.
+
+In sync_cache for a malloced buffer we need to evict pages belonging to overlapping
+VMIO buffers, then copy to/from any pages still in the vm object. For an unmapped
+VMIO buffer, we need to remove pages belonging to any existing buffers and free
+any remaining overlapping pages in the vm object. We then add the VMIO buffer's
+pages to the vm object. If the buffer is already mapped we mark the pages valid on a
+write, on a read we set a flag in the zio and mark the pages valid before calling
+the io_done I/O completion function.
+
+
+**************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
@@ -42,6 +93,7 @@ __FBSDID("$FreeBSD$");
#include <sys/kstat.h>
#include <sys/sdt.h>
+#include <sys/bitstring.h>
#include <vm/vm_pageout.h>
#ifdef _KERNEL
@@ -58,180 +110,734 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, page_cach
&zfs_page_cache_disable, 0, "Disable backing ARC with page cache ");
static eventhandler_tag zbio_event_shutdown = NULL;
-
+struct zbio_state;
+typedef struct zbio_state zbio_state_t;
+typedef struct buf buf_t;
+typedef uint64_t zbio_pindex_t;
+
+MALLOC_DEFINE(M_ZFS_BIO, "zfs_bio", "zfs buffer cache / vm");
+
+#define B_EVICTED B_00000800
+#define B_CLONED B_00001000
+#define B_ASSIGNED B_00004000
+
+#define ZB_EVICT_ALL 0x1
+
+#define btos(nbytes) ((nbytes)>>DEV_BSHIFT)
+#define stob(nsectors) ((nsectors)<<DEV_BSHIFT)
+
+#define b_arc_buf b_fsprivate2
+#define b_state b_fsprivate3
+
+struct zbio_state {
+ struct mtx mtx;
+ buf_t *blkno_root; /* track buf by blkno */
+ buf_t *va_root; /* track buf by data address */
+ spa_t *spa;
+ int generation;
+ int resident_count;
+ TAILQ_HEAD(, buf) blkno_memq; /* list of resident buffers */
+ TAILQ_HEAD(, buf) va_memq; /* list of resident buffers */
+};
+
+#define ZBIO_STATE_LOCK(zs) mtx_lock(&(zs)->mtx)
+#define ZBIO_STATE_UNLOCK(zs) mtx_unlock(&(zs)->mtx)
+
+#define spa_get_bio_state(spa) ((zbio_state_t *)spa_get_vnode((spa))->v_data)
+#define spa_get_vm_object(spa) spa_get_vnode((spa))->v_object
+#define zbio_buf_get_spa(bp) (((zbio_buf_hdr_t *)((arc_buf_t *)(bp->b_arc_buf))->b_hdr)->b_spa)
+
+static void zbio_buf_blkno_remove(buf_t *bp);
+static void zbio_buf_va_insert(buf_t *bp, zbio_state_t *object);
+
+/*
+ * zbio_buf_blkno_splay: [ internal use only ]
+ *
+ * Implements Sleator and Tarjan's top-down splay algorithm. Returns
+ * the buf containing the given blkno. If, however, that
+ * blkno is not found in the tree, returns a buf that is
+ * adjacent to the blkno, coming before or after it.
+ */
+static buf_t *
+zbio_buf_blkno_splay(daddr_t blkno, buf_t *root)
+{
+ buf_t dummy;
+ buf_t *lefttreemax, *righttreemin, *y;
+
+ if (root == NULL)
+ return (root);
+ lefttreemax = righttreemin = &dummy;
+ for (;; root = y) {
+ if (blkno < root->b_blkno) {
+ if ((y = root->b_left) == NULL)
+ break;
+ if (blkno < y->b_blkno) {
+ /* Rotate right. */
+ root->b_left = y->b_right;
+ y->b_right = root;
+ root = y;
+ if ((y = root->b_left) == NULL)
+ break;
+ }
+ /* Link into the new root's right tree. */
+ righttreemin->b_left = root;
+ righttreemin = root;
+ } else if (blkno > root->b_blkno) {
+ if ((y = root->b_right) == NULL)
+ break;
+ if (blkno > y->b_blkno) {
+ /* Rotate left. */
+ root->b_right = y->b_left;
+ y->b_left = root;
+ root = y;
+ if ((y = root->b_right) == NULL)
+ break;
+ }
+ /* Link into the new root's left tree. */
+ lefttreemax->b_right = root;
+ lefttreemax = root;
+ } else
+ break;
+ }
+ /* Assemble the new root. */
+ lefttreemax->b_right = root->b_left;
+ righttreemin->b_left = root->b_right;
+ root->b_left = dummy.b_right;
+ root->b_right = dummy.b_left;
+ return (root);
+}
+
+static buf_t *
+zbio_buf_va_splay(caddr_t va, buf_t *root)
+{
+ buf_t dummy;
+ buf_t *lefttreemax, *righttreemin, *y;
+
+ if (root == NULL)
+ return (root);
+ lefttreemax = righttreemin = &dummy;
+ for (;; root = y) {
+ if (va < root->b_data) {
+ if ((y = root->b_left) == NULL)
+ break;
+ if (va < y->b_data) {
+ /* Rotate right. */
+ root->b_left = y->b_right;
+ y->b_right = root;
+ root = y;
+ if ((y = root->b_left) == NULL)
+ break;
+ }
+ /* Link into the new root's right tree. */
+ righttreemin->b_left = root;
+ righttreemin = root;
+ } else if (va > root->b_data) {
+ if ((y = root->b_right) == NULL)
+ break;
+ if (va > y->b_data) {
+ /* Rotate left. */
+ root->b_right = y->b_left;
+ y->b_left = root;
+ root = y;
+ if ((y = root->b_right) == NULL)
+ break;
+ }
+ /* Link into the new root's left tree. */
+ lefttreemax->b_right = root;
+ lefttreemax = root;
+ } else
+ break;
+ }
+ /* Assemble the new root. */
+ lefttreemax->b_right = root->b_left;
+ righttreemin->b_left = root->b_right;
+ root->b_left = dummy.b_right;
+ root->b_right = dummy.b_left;
+ return (root);
+}
+
+/*
+ * zbio_buf_blkno_insert: [ internal use only ]
+ *
+ * Inserts the given buf into the state splay tree and state list.
+ *
+ * The object and page must be locked.
+ * This routine may not block.
+ */
static void
-_zbio_getblk(arc_buf_t *buf, int flags)
+zbio_buf_blkno_insert(buf_t *bp, zbio_state_t *object)
{
- zbio_buf_hdr_t *hdr = (zbio_buf_hdr_t *)buf->b_hdr;
- uint64_t size = hdr->b_size;
- spa_t *spa = hdr->b_spa;
- uint64_t blkno = hdr->b_dva.dva_word[1] & ~(1ULL<<63);
- void *data;
- struct vnode *vp;
- struct buf *newbp;
- struct bufobj *bo;
-
- vp = spa_get_vnode(spa);
- bo = &vp->v_bufobj;
- newbp = NULL;
- if ((size < PAGE_SIZE) || (hdr->b_flags & ZBIO_BUF_CLONING) ||
- zfs_page_cache_disable) {
- data = zio_buf_alloc(size);
- hdr->b_flags &= ~ZBIO_BUF_CLONING;
- } else if (BUF_EMPTY(hdr)) {
- newbp = geteblk(size, flags);
- data = newbp->b_data;
+ buf_t *root;
+ daddr_t root_blkno_end, blkno, blkno_end;
+
+ blkno = bp->b_blkno;
+ blkno_end = bp->b_blkno + btos(bp->b_bcount);
+
+ root = object->blkno_root;
+ if (root == NULL) {
+ bp->b_left = NULL;
+ bp->b_right = NULL;
+ TAILQ_INSERT_TAIL(&object->blkno_memq, bp, b_bobufs);
} else {
- newbp = getblk(vp, blkno, size, 0, 0, flags | GB_LOCK_NOWAIT);
- if (newbp == NULL)
- newbp = geteblk(size, flags);
- else
- brelvp(newbp);
- data = newbp->b_data;
- }
+ root = zbio_buf_blkno_splay(bp->b_blkno, root);
+ root_blkno_end = root->b_blkno + btos(root->b_bcount);
- if (newbp != NULL) {
- BUF_KERNPROC(newbp);
- newbp->b_bufobj = bo;
- CTR4(KTR_SPARE2, "arc_getblk() bp=%p flags %X "
- "blkno %ld npages %d",
- newbp, newbp->b_flags, blkno, newbp->b_npages);
+ if (blkno < root->b_blkno) {
+ KASSERT(blkno_end <= root->b_blkno, ("buffer overlap!"));
+ bp->b_left = root->b_left;
+ bp->b_right = root;
+ root->b_left = NULL;
+ TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
+ } else if (blkno == root->b_blkno) {
+ panic("zbio_buf_blkno_insert: blkno already allocated");
+ } else {
+ KASSERT(root_blkno_end <= blkno, ("buffer overlap!"));
+
+ bp->b_right = root->b_right;
+ bp->b_left = root;
+ root->b_right = NULL;
+ TAILQ_INSERT_AFTER(&object->blkno_memq, root, bp, b_bobufs);
+ }
}
+ object->blkno_root = bp;
+ object->generation++;
- buf->b_bp = newbp;
- buf->b_data = data;
+ /*
+ * show that the object has one more resident buffer.
+ */
+ object->resident_count++;
}
-void
-zbio_getblk(arc_buf_t *buf)
+/*
+ * zbio_buf_va_insert: [ internal use only ]
+ *
+ * Inserts the given buf into the state splay tree and state list.
+ *
+ * The object and page must be locked.
+ * This routine may not block.
+ */
+static void
+zbio_buf_va_insert(buf_t *bp, zbio_state_t *object)
{
+ buf_t *root;
+ caddr_t va = bp->b_data;
- _zbio_getblk(buf, 0);
+ bp->b_state = object;
+ root = object->va_root;
+ if (root == NULL) {
+ bp->b_left = NULL;
+ bp->b_right = NULL;
+ TAILQ_INSERT_TAIL(&object->va_memq, bp, b_bobufs);
+ } else {
+ root = zbio_buf_va_splay(bp->b_data, root);
+ if (va < root->b_data) {
+ bp->b_left = root->b_left;
+ bp->b_right = root;
+ root->b_left = NULL;
+ TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
+ } else if (va == root->b_data) {
+ panic("zbio_buf_va_insert: address already allocated");
+ } else {
+ bp->b_right = root->b_right;
+ bp->b_left = root;
+ root->b_right = NULL;
+ TAILQ_INSERT_AFTER(&object->va_memq, root, bp, b_bobufs);
+ }
+ }
+ object->va_root = bp;
+ object->generation++;
+
+ /*
+ * show that the object has one more resident buffer.
+ */
+ object->resident_count++;
}
-void
-zbio_data_getblk(arc_buf_t *buf)
+/*
+ * zbio_buf_blkno_remove:
+ *
+ * Removes the given buf from the spa's state tree
+ * buf list
+ *
+ * The state and buf must be locked.
+ * This routine may not block.
+ */
+static void
+zbio_buf_blkno_remove(buf_t *bp)
{
+ zbio_state_t *state;
+ buf_t *root;
+ daddr_t blkno, blkno_end;
- _zbio_getblk(buf, GB_NODUMP);
+ if ((state = bp->b_state) == NULL)
+ return;
+
+ /*
+ * Now remove from the object's list of backed pages.
+ */
+ if (bp != state->blkno_root)
+ zbio_buf_blkno_splay(bp->b_blkno, state->blkno_root);
+ if (bp->b_left == NULL)
+ root = bp->b_right;
+ else {
+ root = zbio_buf_blkno_splay(bp->b_blkno, bp->b_left);
+ root->b_right = bp->b_right;
+ }
+ state->blkno_root = root;
+ TAILQ_REMOVE(&state->blkno_memq, bp, b_bobufs);
+
+ /*
+ * And show that the object has one fewer resident page.
+ */
+ state->resident_count--;
+ state->generation++;
}
-void
-zbio_relse(arc_buf_t *buf, size_t size)
+/*
+ * zbio_buf_va_remove:
+ *
+ * Removes the given buf from the spa's state tree
+ * buf list
+ *
+ * The state and buf must be locked.
+ * This routine may not block.
+ */
+static void
+zbio_buf_va_remove(buf_t *bp)
{
- struct buf *bp = buf->b_bp;
- void * data = buf->b_data;
+ zbio_state_t *state;
+ buf_t *root;
+ vm_offset_t va;
- if (bp == NULL) {
- zio_buf_free(data, size);
+ if ((state = bp->b_state) == NULL)
return;
+
+ /*
+ * Now remove from the object's list of backed pages.
+ */
+ if (bp != state->va_root)
+ zbio_buf_va_splay(bp->b_data, state->va_root);
+ if (bp->b_left == NULL)
+ root = bp->b_right;
+ else {
+ root = zbio_buf_va_splay(bp->b_data, bp->b_left);
+ root->b_right = bp->b_right;
}
+ state->va_root = root;
+ TAILQ_REMOVE(&state->va_memq, bp, b_bobufs);
- CTR4(KTR_SPARE2, "arc_brelse() bp=%p flags %X"
- " size %ld blkno=%ld",
- bp, bp->b_flags, size, bp->b_blkno);
+ /*
+ * And show that the object has one fewer resident page.
+ */
+ state->resident_count--;
+ state->generation++;
+}
- bp->b_flags |= B_ZFS;
- brelse(bp);
+/*
+ * zbio_buf_va_lookup:
+ *
+ * Returns the range associated with the object/offset
+ * pair specified; if none is found, NULL is returned.
+ *
+ * The object must be locked.
+ * This routine may not block.
+ * This is a critical path routine
+ */
+static buf_t *
+zbio_buf_va_lookup(zbio_state_t *state, caddr_t va)
+{
+ buf_t *bp;
+
+ if ((bp = state->va_root) != NULL && bp->b_data != va) {
+ bp = zbio_buf_va_splay(va, bp);
+ if ((state->va_root = bp)->b_data != va)
+ bp = NULL;
+ }
+ return (bp);
}
-void
-zbio_sync_cache(spa_t *spa, blkptr_t *bp, uint64_t txg, uint64_t size)
+
+/*
+ * zbio_buf_blkno_lookup:
+ *
+ * Returns the range associated with the object/offset
+ * pair specified; if none is found, NULL is returned.
+ *
+ * The object must be locked.
+ * This routine may not block.
+ * This is a critical path routine
+ */
+static buf_t *
+zbio_buf_blkno_lookup(zbio_state_t *state, daddr_t blkno)
+{
+ buf_t *bp;
+
+ if ((bp = state->blkno_root) != NULL && bp->b_blkno != blkno) {
+ bp = zbio_buf_blkno_splay(blkno, bp);
+ if ((state->blkno_root = bp)->b_blkno != blkno)
+ bp = NULL;
+ }
+ return (bp);
+}
+
+static void
+zbio_buf_vm_object_copyin(buf_t *bp)
{
-#ifdef notyet
- uint64_t blkno, blkno_lookup;
- struct vnode *vp;
- struct bufobj *bo;
- struct buf *bp;
- vm_pindex_t start, end;
- vm_object_t object;
- vm_page_t m;
- int i;
- if (zfs_page_cache_disable)
- return;
- blkno_lookup = blkno = dva->dva_word[1] & ~(1ULL<<63);
- vp = spa_get_vnode(spa);
- bo = &vp->v_bufobj;
+
+}
- if (dva == NULL || spa == NULL || blkno == 0 || size == 0)
- return;
+static void
+zbio_buf_vm_object_copyout(buf_t *bp)
+{
- start = OFF_TO_IDX((blkno_lookup << 9));
- end = start + OFF_TO_IDX(size + PAGE_MASK);
- object = vp->v_object;
+
+}
- VM_OBJECT_LOCK(object);
- vm_page_cache_free(object, start, end);
- vm_object_page_remove(object, start, end, FALSE);
-#ifdef INVARIANTS
- for (i = 0; i < OFF_TO_IDX(size); i++) {
- KASSERT(vm_page_lookup(object, start + i) == NULL,
- ("found page at %ld blkno %ld blkno_lookup %ld",
- start + i, blkno, blkno_lookup));
- }
-#endif
- VM_OBJECT_UNLOCK(object);
-#endif
+static void
+zbio_buf_vm_object_evict(buf_t *bp)
+{
+ int i;
+
+ /*
+ * remove pages from backing vm_object
+ */
+ for (i = 0; i < bp->b_npages; i++)
+ vm_page_remove(bp->b_pages[i]);
}
-#if 0
static void
-arc_pcache(struct vnode *vp, struct buf *bp, uint64_t blkno)
+zbio_buf_vm_object_insert(buf_t *bp, int valid)
{
- vm_pindex_t start = OFF_TO_IDX((blkno << 9));
- vm_object_t object = vp->v_object;
- struct bufobj *bo = &vp->v_bufobj;
vm_page_t m;
+ vm_pindex_t start = OFF_TO_IDX(stob(bp->b_blkno));
+ spa_t *spa = zbio_buf_get_spa(bp);
+ struct vnode *vp = spa_get_vnode(spa);
+ struct vm_object *object = vp->v_object;
int i;
- CTR3(KTR_SPARE2, "arc_pcache() bp=%p blkno %ld npages %d",
- bp, blkno, bp->b_npages);
VM_OBJECT_LOCK(object);
- vm_page_lock_queues();
+ /*
+ * Insert buffer pages in the object
+ */
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
- m->valid = VM_PAGE_BITS_ALL;
+ if (valid)
+ m->valid = VM_PAGE_BITS_ALL;
vm_page_insert(m, object, start + i);
m->flags &= ~PG_UNMANAGED;
- vm_page_enqueue(PQ_INACTIVE, m);
vdrop(vp);
}
+ vm_page_lock_queues();
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ vm_page_enqueue(PQ_INACTIVE, m);
+ }
vm_page_unlock_queues();
VM_OBJECT_UNLOCK(object);
- bp->b_bufobj = bo;
- bp->b_flags |= B_VMIO;
+
}
+/*
+ * zbio_buf_evict_overlap: [ internal use only ]
+ *
+ * Evict the pages of any buffers overlapping with this range
+ *
+ * If ZB_EVICT_ALL is passed then evict all the pages in that range
+ * from the vm object
+ *
+ * The object and page must be locked.
+ * This routine may not block.
+ */
static void
-arc_bcache(arc_buf_t *buf)
-{
- uint64_t blkno = buf->b_hdr->b_dva.dva_word[1] & ~(1ULL<<63);
- struct buf *bp;
- struct vnode *vp = spa_get_vnode(buf->b_hdr->b_spa);
- arc_buf_hdr_t *hdr = buf->b_hdr;
- int cachebuf;
+zbio_buf_blkno_evict_overlap(daddr_t blkno, int size, zbio_state_t *state,
+ uint64_t txg, int evict_op, int locked)
+{
+ buf_t *root, *tmpbp;
+ daddr_t blkno_end, tmpblkno, tmpblkno_end;
+ struct cluster_list_head clh;
+ int i, collisions;
+ uint64_t tmptxg;
+ vm_pindex_t start, end;
+ vm_object_t object = spa_get_vm_object(state->spa);
- if (zfs_page_cache_disable)
+ if (!locked)
+ VM_OBJECT_LOCK(object);
+ if ((root = state->blkno_root) == NULL)
+ goto done;
+
+ collisions = 0;
+ root = zbio_buf_blkno_splay(blkno, root);
+ TAILQ_INIT(&clh);
+ if (blkno < root->b_blkno)
+ tmpbp = TAILQ_PREV(root, cluster_list_head, b_bobufs);
+
+ /*
+ * Find all existing buffers that overlap with this range
+ */
+ tmpbp = tmpbp != NULL ? tmpbp : root;
+ while (tmpbp != NULL && tmpbp->b_blkno < blkno_end) {
+ tmpblkno = tmpbp->b_blkno;
+ tmpblkno_end = tmpblkno + btos(tmpbp->b_bcount);
+ tmptxg = ((zbio_buf_hdr_t *)((arc_buf_t *)tmpbp->b_arc_buf)->b_hdr)->b_birth;
+
+ if (((tmpblkno >= blkno) && (tmpblkno < blkno_end)) ||
+ (tmpblkno_end > blkno) && (tmpblkno_end <= blkno_end) &&
+ ((txg == 0) || (tmptxg < txg))) {
+ TAILQ_INSERT_TAIL(&clh, tmpbp, b_freelist);
+ collisions++;
+ }
+ tmpbp = TAILQ_NEXT(tmpbp, b_bobufs);
+ }
+ while (!TAILQ_EMPTY(&clh)) {
+ tmpbp = TAILQ_FIRST(&clh);
+ TAILQ_REMOVE(&clh, tmpbp, b_freelist);
+ zbio_buf_vm_object_evict(tmpbp);
+
+ KASSERT(tmpbp->b_flags & B_EVICTED == 0,
+ ("buffer has already been evicted"));
+ tmpbp->b_flags |= B_EVICTED;
+ state->blkno_root = tmpbp;
+ /*
+ * move buffer to the unmanaged tree
+ */
+ zbio_buf_blkno_remove(tmpbp);
+ zbio_buf_va_insert(tmpbp, state);
+ }
+done:
+ if (!(collisions == 1 && tmpbp->b_blkno == blkno && tmpbp->b_bcount == size)
+ && (evict_op == ZB_EVICT_ALL)) {
+ start = OFF_TO_IDX(stob(blkno));
+ end = start + OFF_TO_IDX(size);
+ vm_page_cache_free(object, start, end);
+ vm_object_page_remove(object, start, end, FALSE);
+#ifdef INVARIANTS
+ for (i = 0; i < OFF_TO_IDX(size); i++) {
+ KASSERT(vm_page_lookup(object, start + i) == NULL,
+ ("found page at %ld blkno %ld ",start + i, blkno));
+ }
+#endif
+ }
+ if (!locked)
+ VM_OBJECT_UNLOCK(object);
+}
+
+/*
+Cases:
+
+A) B_MALLOC / address is known
+ 1) getblk:
+ a) page cached: copyin + mark B_CACHE
+ b) buffer+page cached: copyin + mark B_CACHE
+ c) default: N/A
+ 2) sync_cache:
+ a) page cached: copy{in, out}
+ b) buffer+page cached: evict overlapping pages
+ c) default: N/A
+B) B_MALLOC / address is !known
+ 1) getblk: N/A
+ 2) sync_cache:
+ a) page cached: copy{in, out}
+ b) buffer+page cached: evict overlapping pages
+ c) default: N/A
+
+C) !B_MALLOC / address is !known
+ 2) sync_cache:
+ a) page cached: evict/free old pages + replace
+ b) buffer+page cached: evict overlapping pages from object + replace
+ c) default: add pages to vm object
+
+D) !B_MALLOC / address is known
+ 1) getblk:
+ a) buffer+page cached: evict pages belonging to older buffer
+ b) default: N/A
+ 2) sync_cache: N/A - we should only be doing I/O on valid B_VMIO buffers
+
+*/
+
+static buf_t *
+_zbio_getblk_malloc(zbio_buf_hdr_t *hdr, int flags)
+{
+ buf_t *newbp, *tmpbp;
+ void *data;
+ daddr_t blkno;
+ uint64_t size = hdr->b_size;
+ uint64_t txg = hdr->b_birth;
+ zbio_state_t *state = spa_get_bio_state(hdr->b_spa);
+
+ if (flags & GB_NODUMP)
+ data = zio_data_buf_alloc(size);
+ else
+ data = zio_buf_alloc(size);
+ newbp = malloc(sizeof(struct buf), M_ZFS_BIO, M_WAITOK|M_ZERO);
+ newbp->b_data = data;
+ newbp->b_flags = (B_MALLOC|B_INVAL);
+ newbp->b_bcount = size;
+ if (!BUF_EMPTY(hdr) && !(hdr->b_flags & ZBIO_BUF_CLONING)) {
+ blkno = hdr->b_dva.dva_word[1] & ~(1ULL<<63);
+ zbio_buf_blkno_evict_overlap(blkno, size, state, txg, 0, FALSE);
+ newbp->b_blkno = blkno;
+ /*
+ * Copy in from the page cache if found & valid
+ * and mark B_CACHE
+ */
+ zbio_buf_vm_object_copyin(newbp);
+ }
+
+ if (hdr->b_flags & ZBIO_BUF_CLONING) {
+ newbp->b_flags |= B_CLONED;
+ hdr->b_flags &= ~ZBIO_BUF_CLONING;
+ }
+ zbio_buf_va_insert(newbp, state);
+}
+
+static buf_t *
+_zbio_getblk_vmio(zbio_buf_hdr_t *hdr, int flags)
+{
+ buf_t *newbp;
+ daddr_t blkno;
+ uint64_t size = hdr->b_size;
+ spa_t *spa = hdr->b_spa;
+ zbio_state_t *state = spa_get_bio_state(spa);
+ struct vnode *vp = spa_get_vnode(spa);
+ struct bufobj *bo = &vp->v_bufobj;
+
+ if (BUF_EMPTY(hdr)) {
+ newbp = geteblk(size, flags);
+ zbio_buf_va_insert(newbp, state);
+ } else {
+ blkno = hdr->b_dva.dva_word[1] & ~(1ULL<<63);
+ zbio_buf_blkno_evict_overlap(blkno, size, state, 0, 0, FALSE);
+
+ while (newbp == NULL)
+ newbp = getblk(vp, blkno, size, 0, 0, flags | GB_LOCK_NOWAIT);
+ brelvp(newbp);
+ newbp->b_flags |= B_ASSIGNED;
+ zbio_buf_blkno_insert(newbp, state);
+ }
+ newbp->b_bufobj = bo;
+ BUF_KERNPROC(newbp);
+ CTR4(KTR_SPARE2, "arc_getblk() bp=%p flags %X "
+ "blkno %ld npages %d",
+ newbp, newbp->b_flags, blkno, newbp->b_npages);
+
+ return (newbp);
+}
+
+static void
+_zbio_getblk(arc_buf_t *buf, int flags)
+{
+ zbio_buf_hdr_t *hdr = (zbio_buf_hdr_t *)buf->b_hdr;
+ uint64_t size = hdr->b_size;
+ buf_t *newbp;
+
+ if (zfs_page_cache_disable) {
+ buf->b_data = zio_buf_alloc(size);
+ hdr->b_flags &= ~ZBIO_BUF_CLONING;
+ return;
+ }
+
+ if ((size & PAGE_MASK) || (hdr->b_flags & ZBIO_BUF_CLONING))
+ newbp = _zbio_getblk_malloc(hdr, flags);
+ else
+ newbp = _zbio_getblk_vmio(hdr, flags);
+
+ buf->b_bp = newbp;
+ buf->b_data = newbp->b_data;
+ newbp->b_arc_buf = buf;
+}
+
+void
+zbio_getblk(arc_buf_t *buf)
+{
+
+ _zbio_getblk(buf, 0);
+}
+
+void
+zbio_data_getblk(arc_buf_t *buf)
+{
+
+ _zbio_getblk(buf, GB_NODUMP);
+}
+
+void
+zbio_relse(arc_buf_t *buf, size_t size)
+{
+ struct buf *bp = buf->b_bp;
+
+ if (zfs_page_cache_disable) {
+ zio_buf_free(buf->b_data, size);
return;
+ }
+
+ if (bp->b_flags & B_ASSIGNED)
+ zbio_buf_blkno_remove(bp);
+ else
+ zbio_buf_va_remove(bp);
+
+ if (bp->b_flags & B_MALLOC) {
+ zio_buf_free(bp->b_data, size);
+ free(bp, M_ZFS_BIO);
+ } else {
+ CTR4(KTR_SPARE2, "arc_brelse() bp=%p flags %X"
+ " size %ld blkno=%ld",
+ bp, bp->b_flags, size, bp->b_blkno);
+
+ bp->b_flags |= B_ZFS;
+ brelse(bp);
+ }
+}
- if (blkno == 0 || hdr->b_birth == 0)
+void
+zbio_sync_cache(spa_t *spa, blkptr_t *blkp, uint64_t txg, void *data, uint64_t size, int bio_op)
+{
+ buf_t *bp;
+ zbio_state_t *state = spa_get_bio_state(spa);
+ dva_t dva = *BP_IDENTITY(blkp);
+ daddr_t blkno = dva.dva_word[1] & ~(1ULL<<63);
+ struct vnode *vp = spa_get_vnode(spa);
+ vm_object_t object = vp->v_object;
+ vm_pindex_t start;
+ vm_page_t m;
+ int i;
+
+ if (zfs_page_cache_disable)
return;
+ /*
+ * XXX incomplete
+ */
- bp = buf->b_bp;
- bp->b_birth = hdr->b_birth;
- bp->b_blkno = bp->b_lblkno = blkno;
- bp->b_offset = (blkno << 9);
- cachebuf = ((hdr->b_datacnt == 1) &&
- !(hdr->b_flags & ARC_IO_ERROR) &&
- ((bp->b_flags & (B_INVAL|B_CACHE)) == B_CACHE) &&
- (blkno & 0x7) == 0);
-
- arc_binval(hdr->b_spa, &hdr->b_dva, hdr->b_size);
- if (cachebuf)
- arc_pcache(vp, bp, blkno);
+
+ if ((bp = zbio_buf_va_lookup(state, data)) != NULL) {
+ KASSERT(bp->b_flags & (B_CLONED|B_EVICTED) == 0,
+ ("doing I/O with cloned or evicted buffer 0x%x", bp->b_flags));
+
+ if (bp->b_flags & B_MALLOC) {
+ zbio_buf_blkno_evict_overlap(blkno, size, state, txg, 0, FALSE);
+
+ if (bio_op == BIO_READ) {
+ /*
+ * if page resident - copy in
+ * update zio pipeline
+ */
+ zbio_buf_vm_object_copyin(bp);
+ if (bp->b_flags & B_CACHE) {
+ /* update zio pipeline */
+ }
+ } else
+ zbio_buf_vm_object_copyout(bp);
+ } else {
+ zbio_buf_blkno_evict_overlap(blkno, size, state, 0, ZB_EVICT_ALL, TRUE);
+ bp->b_blkno = bp->b_lblkno = blkno;
+ bp->b_flags |= (B_VMIO|B_ASSIGNED);
+ zbio_buf_vm_object_insert(bp, bio_op == BIO_WRITE);
+ }
+ } else {
+ bp = zbio_buf_blkno_lookup(state, blkno);
+ KASSERT(bp != NULL, ("blkno=%ld data=%p unmanaged", blkno, bp->b_data));
+ }
}
-#endif
static void
zbio_shutdown(void *arg __unused, int howto __unused)
@@ -275,6 +881,9 @@ void
zbio_init(void)
{
+ if (zfs_page_cache_disable)
+ return;
+
zbio_event_shutdown = EVENTHANDLER_REGISTER(shutdown_pre_sync,
zbio_shutdown, NULL, EVENTHANDLER_PRI_FIRST);
}
@@ -285,7 +894,9 @@ zbio_fini(void)
if (zbio_event_shutdown != NULL)
EVENTHANDLER_DEREGISTER(shutdown_pre_sync, zbio_event_shutdown);
}
-#else
+
+
+#else /* !_KERNEL */
void
zbio_getblk(arc_buf_t *buf)
@@ -319,5 +930,5 @@ zbio_sync_cache(spa_t *spa, blkptr_t *bp
{
;
}
-
#endif
+
Modified: user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Sat Dec 12 02:34:00 2009 (r200427)
+++ user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Sat Dec 12 03:35:49 2009 (r200428)
@@ -436,8 +436,9 @@ zio_create(zio_t *pio, spa_t *spa, uint6
if (bp != NULL) {
- if ((vd == NULL) || (vd->vdev_parent == NULL))
- zbio_sync_cache(spa, bp, txg, size);
+ if (((vd == NULL) || (vd->vdev_parent == NULL)) &&
+ ((type == ZIO_TYPE_WRITE) || (type == ZIO_TYPE_READ)))
+ zbio_sync_cache(spa, bp, txg, data, size, type == ZIO_TYPE_WRITE ? BIO_WRITE : BIO_READ);
zio->io_bp = bp;
zio->io_bp_copy = *bp;
More information about the svn-src-user
mailing list