svn commit: r308026 - in head/sys: kern sys ufs/ffs

Konstantin Belousov kib at FreeBSD.org
Fri Oct 28 11:44:00 UTC 2016


Author: kib
Date: Fri Oct 28 11:43:59 2016
New Revision: 308026
URL: https://svnweb.freebsd.org/changeset/base/308026

Log:
  Generalize UFS buffer pager to allow it to serve other filesystems
  which also use buffer cache.
  
  The most important addition to the code is the handling of filesystems
  where the block size is less than the machine page size, which might
  require reading several buffers to validate a single page.
  
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks

Modified:
  head/sys/kern/vfs_bio.c
  head/sys/sys/buf.h
  head/sys/ufs/ffs/ffs_vnops.c

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c	Fri Oct 28 11:35:06 2016	(r308025)
+++ head/sys/kern/vfs_bio.c	Fri Oct 28 11:43:59 2016	(r308026)
@@ -75,9 +75,10 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
-#include <vm/vm_pageout.h>
-#include <vm/vm_page.h>
 #include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/swap_pager.h>
@@ -4636,6 +4637,161 @@ bdata2bio(struct buf *bp, struct bio *bi
 	}
 }
 
+static int buf_pager_relbuf;
+SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
+    &buf_pager_relbuf, 0,
+    "Make buffer pager release buffers after reading");
+
+/*
+ * The buffer pager.  It uses buffer reads to validate pages.
+ *
+ * In contrast to the generic local pager from vm/vnode_pager.c, this
+ * pager correctly and easily handles volumes where the underlying
+ * device block size is greater than the machine page size.  The
+ * buffer cache transparently extends the requested page run to be
+ * aligned at the block boundary, and does the necessary bogus page
+ * replacements in the addends to avoid obliterating already valid
+ * pages.
+ *
+ * The only non-trivial issue is that the exclusive busy state for
+ * pages, which is assumed by the vm_pager_getpages() interface, is
+ * incompatible with the VMIO buffer cache's desire to share-busy the
+ * pages.  This function performs a trivial downgrade of the pages'
+ * state before reading buffers, and a less trivial upgrade from the
+ * shared-busy to excl-busy state after the read.
+ */
+int
+vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count,
+    int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
+    vbg_get_blksize_t get_blksize)
+{
+	vm_page_t m;
+	vm_object_t object;
+	struct buf *bp;
+	daddr_t lbn, lbnp;
+	vm_ooffset_t la, lb, poff, poffe;
+	long bsize;
+	int bo_bs, error, i;
+	bool redo, lpart;
+
+	object = vp->v_object;
+	la = IDX_TO_OFF(ma[count - 1]->pindex);
+	if (la >= object->un_pager.vnp.vnp_size)
+		return (VM_PAGER_BAD);
+	lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
+	bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)));
+	if (rbehind != NULL) {
+		lb = IDX_TO_OFF(ma[0]->pindex);
+		*rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
+	}
+	if (rahead != NULL) {
+		*rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
+		if (la + IDX_TO_OFF(*rahead) >= object->un_pager.vnp.vnp_size) {
+			*rahead = OFF_TO_IDX(roundup2(object->un_pager.
+			    vnp.vnp_size, PAGE_SIZE) - la);
+		}
+	}
+	VM_OBJECT_WLOCK(object);
+again:
+	for (i = 0; i < count; i++)
+		vm_page_busy_downgrade(ma[i]);
+	VM_OBJECT_WUNLOCK(object);
+
+	lbnp = -1;
+	for (i = 0; i < count; i++) {
+		m = ma[i];
+
+		/*
+		 * Pages are shared busy and the object lock is not
+		 * owned, which together allow for the pages'
+		 * invalidation.  The racy test for validity avoids
+		 * useless creation of the buffer for the most typical
+		 * case when invalidation is not used in redo or for
+		 * parallel read.  The shared->excl upgrade loop at
+		 * the end of the function catches the race in a
+		 * reliable way (protected by the object lock).
+		 */
+		if (m->valid == VM_PAGE_BITS_ALL)
+			continue;
+
+		poff = IDX_TO_OFF(m->pindex);
+		poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size);
+		for (; poff < poffe; poff += bsize) {
+			lbn = get_lblkno(vp, poff);
+			if (lbn == lbnp)
+				goto next_page;
+			lbnp = lbn;
+
+			bsize = get_blksize(vp, lbn);
+			error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
+			    &bp);
+			if (error != 0)
+				goto end_pages;
+			if (LIST_EMPTY(&bp->b_dep)) {
+				/*
+				 * Invalidation clears m->valid, but
+				 * may leave B_CACHE flag if the
+				 * buffer existed at the invalidation
+				 * time.  In this case, recycle the
+				 * buffer to do real read on next
+				 * bread() after redo.
+				 *
+				 * Otherwise B_RELBUF is not strictly
+				 * necessary, enable to reduce buf
+				 * cache pressure.
+				 */
+				if (buf_pager_relbuf ||
+				    m->valid != VM_PAGE_BITS_ALL)
+					bp->b_flags |= B_RELBUF;
+
+				bp->b_flags &= ~B_NOCACHE;
+				brelse(bp);
+			} else {
+				bqrelse(bp);
+			}
+		}
+		KASSERT(1 /* racy, enable for debugging */ ||
+		    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
+		    ("buf %d %p invalid", i, m));
+		if (i == count - 1 && lpart) {
+			VM_OBJECT_WLOCK(object);
+			if (m->valid != 0 &&
+			    m->valid != VM_PAGE_BITS_ALL)
+				vm_page_zero_invalid(m, TRUE);
+			VM_OBJECT_WUNLOCK(object);
+		}
+next_page:;
+	}
+end_pages:
+
+	VM_OBJECT_WLOCK(object);
+	redo = false;
+	for (i = 0; i < count; i++) {
+		vm_page_sunbusy(ma[i]);
+		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
+
+		/*
+		 * Since the pages were only sbusy while neither the
+		 * buffer nor the object lock was held by us, or
+		 * reallocated while vm_page_grab() slept for busy
+		 * relinquish, they could have been invalidated.
+		 * Recheck the valid bits and re-read as needed.
+		 *
+		 * Note that the last page is made fully valid in the
+		 * read loop, and partial validity for the page at
+		 * index count - 1 could mean that the page was
+		 * invalidated or removed, so we must restart for
+		 * safety as well.
+		 */
+		if (ma[i]->valid != VM_PAGE_BITS_ALL)
+			redo = true;
+	}
+	if (redo && error == 0)
+		goto again;
+	VM_OBJECT_WUNLOCK(object);
+	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>

Modified: head/sys/sys/buf.h
==============================================================================
--- head/sys/sys/buf.h	Fri Oct 28 11:35:06 2016	(r308025)
+++ head/sys/sys/buf.h	Fri Oct 28 11:43:59 2016	(r308026)
@@ -68,6 +68,7 @@ extern struct bio_ops {
 } bioops;
 
 struct vm_object;
+struct vm_page;
 
 typedef unsigned char b_xflags_t;
 
@@ -537,6 +538,12 @@ struct	buf *trypbuf(int *);
 void	bwait(struct buf *, u_char, const char *);
 void	bdone(struct buf *);
 
+typedef daddr_t (vbg_get_lblkno_t)(struct vnode *, vm_ooffset_t);
+typedef int (vbg_get_blksize_t)(struct vnode *, daddr_t);
+int	vfs_bio_getpages(struct vnode *vp, struct vm_page **ma, int count,
+	    int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
+	    vbg_get_blksize_t get_blksize);
+
 #endif /* _KERNEL */
 
 #endif /* !_SYS_BUF_H_ */

Modified: head/sys/ufs/ffs/ffs_vnops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vnops.c	Fri Oct 28 11:35:06 2016	(r308025)
+++ head/sys/ufs/ffs/ffs_vnops.c	Fri Oct 28 11:43:59 2016	(r308026)
@@ -87,7 +87,6 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
-#include <vm/vm_pageout.h>
 #include <vm/vnode_pager.h>
 
 #include <ufs/ufs/extattr.h>
@@ -1791,160 +1790,33 @@ SYSCTL_DECL(_vfs_ffs);
 static int use_buf_pager = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
     "Always use buffer pager instead of bmap");
-static int buf_pager_relbuf;
-SYSCTL_INT(_vfs_ffs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
-    &buf_pager_relbuf, 0,
-    "Make buffer pager release buffers after reading");
 
-/*
- * The FFS pager.  It uses buffer reads to validate pages.
- *
- * In contrast to the generic local pager from vm/vnode_pager.c, this
- * pager correctly and easily handles volumes where the underlying
- * device block size is greater than the machine page size.  The
- * buffer cache transparently extends the requested page run to be
- * aligned at the block boundary, and does the necessary bogus page
- * replacements in the addends to avoid obliterating already valid
- * pages.
- *
- * The only non-trivial issue is that the exclusive busy state for
- * pages, which is assumed by the vm_pager_getpages() interface, is
- * incompatible with the VMIO buffer cache's desire to share-busy the
- * pages.  This function performs a trivial downgrade of the pages'
- * state before reading buffers, and a less trivial upgrade from the
- * shared-busy to excl-busy state after the read.
- */
+static daddr_t
+ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
+{
+
+	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
+}
+
+static int
+ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
+{
+
+	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
+}
+
 static int
 ffs_getpages(struct vop_getpages_args *ap)
 {
 	struct vnode *vp;
-	vm_page_t *ma, m;
-	vm_object_t object;
-	struct buf *bp;
 	struct ufsmount *um;
-	ufs_lbn_t lbn, lbnp;
-	vm_ooffset_t la, lb;
-	long bsize;
-	int bo_bs, count, error, i;
-	bool redo, lpart;
 
 	vp = ap->a_vp;
-	ma = ap->a_m;
-	count = ap->a_count;
+	um = VFSTOUFS(vp->v_mount);
 
-	um = VFSTOUFS(ap->a_vp->v_mount);
-	bo_bs = um->um_devvp->v_bufobj.bo_bsize;
-	if (!use_buf_pager && bo_bs <= PAGE_SIZE)
-		return (vnode_pager_generic_getpages(vp, ma, count,
+	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
+		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
 		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
-
-	object = vp->v_object;
-	la = IDX_TO_OFF(ma[count - 1]->pindex);
-	if (la >= object->un_pager.vnp.vnp_size)
-		return (VM_PAGER_BAD);
-	lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
-	if (ap->a_rbehind != NULL) {
-		lb = IDX_TO_OFF(ma[0]->pindex);
-		*ap->a_rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
-	}
-	if (ap->a_rahead != NULL) {
-		*ap->a_rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
-		if (la + IDX_TO_OFF(*ap->a_rahead) >=
-		    object->un_pager.vnp.vnp_size) {
-			*ap->a_rahead = OFF_TO_IDX(roundup2(object->un_pager.
-			    vnp.vnp_size, PAGE_SIZE) - la);
-		}
-	}
-	VM_OBJECT_WLOCK(object);
-again:
-	for (i = 0; i < count; i++)
-		vm_page_busy_downgrade(ma[i]);
-	VM_OBJECT_WUNLOCK(object);
-
-	lbnp = -1;
-	for (i = 0; i < count; i++) {
-		m = ma[i];
-
-		/*
-		 * Pages are shared busy and the object lock is not
-		 * owned, which together allow for the pages'
-		 * invalidation.  The racy test for validity avoids
-		 * useless creation of the buffer for the most typical
-		 * case when invalidation is not used in redo or for
-		 * parallel read.  The shared->excl upgrade loop at
-		 * the end of the function catches the race in a
-		 * reliable way (protected by the object lock).
-		 */
-		if (m->valid == VM_PAGE_BITS_ALL)
-			continue;
-
-		lbn = lblkno(um->um_fs, IDX_TO_OFF(m->pindex));
-		if (lbn != lbnp) {
-			bsize = blksize(um->um_fs, VTOI(vp), lbn);
-			error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
-			    &bp);
-			if (error != 0)
-				break;
-			KASSERT(1 /* racy, enable for debugging */ ||
-			    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
-			    ("buf %d %p invalid", i, m));
-			if (i == count - 1 && lpart) {
-				VM_OBJECT_WLOCK(object);
-				if (m->valid != 0 &&
-				    m->valid != VM_PAGE_BITS_ALL)
-					vm_page_zero_invalid(m, TRUE);
-				VM_OBJECT_WUNLOCK(object);
-			}
-			if (LIST_EMPTY(&bp->b_dep)) {
-				/*
-				 * Invalidation clears m->valid, but
-				 * may leave B_CACHE flag if the
-				 * buffer existed at the invalidation
-				 * time.  In this case, recycle the
-				 * buffer to do real read on next
-				 * bread() after redo.
-				 *
-				 * Otherwise B_RELBUF is not strictly
-				 * necessary, enable to reduce buf
-				 * cache pressure.
-				 */
-				if (buf_pager_relbuf ||
-				    m->valid != VM_PAGE_BITS_ALL)
-					bp->b_flags |= B_RELBUF;
-
-				bp->b_flags &= ~B_NOCACHE;
-				brelse(bp);
-			} else {
-				bqrelse(bp);
-			}
-			lbnp = lbn;
-		}
-	}
-
-	VM_OBJECT_WLOCK(object);
-	redo = false;
-	for (i = 0; i < count; i++) {
-		vm_page_sunbusy(ma[i]);
-		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
-
-		/*
-		 * Since the pages were only sbusy while neither the
-		 * buffer nor the object lock was held by us, or
-		 * reallocated while vm_page_grab() slept for busy
-		 * relinguish, they could have been invalidated.
-		 * Recheck the valid bits and re-read as needed.
-		 *
-		 * Note that the last page is made fully valid in the
-		 * read loop, and partial validity for the page at
-		 * index count - 1 could mean that the page was
-		 * invalidated or removed, so we must restart for
-		 * safety as well.
-		 */
-		if (ma[i]->valid != VM_PAGE_BITS_ALL)
-			redo = true;
-	}
-	if (redo && error == 0)
-		goto again;
-	VM_OBJECT_WUNLOCK(object);
-	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
+	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
 }


More information about the svn-src-head mailing list