svn commit: r333576 - in head/sys: kern sys ufs/ffs

Konstantin Belousov kib at FreeBSD.org
Sun May 13 09:47:30 UTC 2018


Author: kib
Date: Sun May 13 09:47:28 2018
New Revision: 333576
URL: https://svnweb.freebsd.org/changeset/base/333576

Log:
  Detect and optimize reads from the hole on UFS.
  
  - Create getblkx(9) variant of getblk(9) which can return error.
  - Add GB_NOSPARSE flag for getblk()/getblkx() which requests that a BMAP
    be performed before the buffer is created, and EJUSTRETURN returned
    in case the requested block does not exist.
  - Make ffs_read() use GB_NOSPARSE to avoid instantiating buffer (and
    allocating the pages for it), copying from zero_region instead.
  
  The end result is less page allocations and buffer recycling when a
  hole is read, which is important for some benchmarks.
  
  Requested and reviewed by:	jeff
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks
  Differential revision:	https://reviews.freebsd.org/D14917

Modified:
  head/sys/kern/vfs_bio.c
  head/sys/kern/vfs_cluster.c
  head/sys/sys/buf.h
  head/sys/ufs/ffs/ffs_vnops.c

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c	Sat May 12 20:00:29 2018	(r333575)
+++ head/sys/kern/vfs_bio.c	Sun May 13 09:47:28 2018	(r333576)
@@ -2138,30 +2138,37 @@ breadn_flags(struct vnode *vp, daddr_t blkno, int size
     void (*ckhashfunc)(struct buf *), struct buf **bpp)
 {
 	struct buf *bp;
-	int readwait, rv;
+	struct thread *td;
+	int error, readwait, rv;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
+	td = curthread;
 	/*
-	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
+	 * Can only return NULL if GB_LOCK_NOWAIT or GB_NOSPARSE flags
+	 * are specified.
 	 */
-	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
-	if (bp == NULL)
-		return (EBUSY);
+	error = getblkx(vp, blkno, size, 0, 0, flags, &bp);
+	if (error != 0) {
+		*bpp = NULL;
+		return (error);
+	}
+	flags &= ~GB_NOSPARSE;
+	*bpp = bp;
 
 	/*
 	 * If not found in cache, do some I/O
 	 */
 	readwait = 0;
 	if ((bp->b_flags & B_CACHE) == 0) {
-		if (!TD_IS_IDLETHREAD(curthread)) {
+		if (!TD_IS_IDLETHREAD(td)) {
 #ifdef RACCT
 			if (racct_enable) {
-				PROC_LOCK(curproc);
-				racct_add_buf(curproc, bp, 0);
-				PROC_UNLOCK(curproc);
+				PROC_LOCK(td->td_proc);
+				racct_add_buf(td->td_proc, bp, 0);
+				PROC_UNLOCK(td->td_proc);
 			}
 #endif /* RACCT */
-			curthread->td_ru.ru_inblock++;
+			td->td_ru.ru_inblock++;
 		}
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
@@ -3822,8 +3829,21 @@ has_addr:
 	}
 }
 
+struct buf *
+getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
+    int flags)
+{
+	struct buf *bp;
+	int error;
+
+	error = getblkx(vp, blkno, size, slpflag, slptimeo, flags, &bp);
+	if (error != 0)
+		return (NULL);
+	return (bp);
+}
+
 /*
- *	getblk:
+ *	getblkx:
  *
  *	Get a block given a specified block and offset into a file/device.
  *	The buffers B_DONE bit will be cleared on return, making it almost
@@ -3858,12 +3878,13 @@ has_addr:
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  */
-struct buf *
-getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
-    int flags)
+int
+getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
+    int flags, struct buf **bpp)
 {
 	struct buf *bp;
 	struct bufobj *bo;
+	daddr_t d_blkno;
 	int bsize, error, maxsize, vmio;
 	off_t offset;
 
@@ -3878,6 +3899,7 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int 
 		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
+	d_blkno = blkno;
 loop:
 	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
@@ -3889,7 +3911,7 @@ loop:
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
-		if (flags & GB_LOCK_NOWAIT)
+		if ((flags & GB_LOCK_NOWAIT) != 0)
 			lockflags |= LK_NOWAIT;
 
 		error = BUF_TIMELOCK(bp, lockflags,
@@ -3902,8 +3924,8 @@ loop:
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
-		else if (error)
-			return (NULL);
+		else if (error != 0)
+			return (error);
 		/* If recursed, assume caller knows the rules. */
 		else if (BUF_LOCKRECURSED(bp))
 			goto end;
@@ -4008,10 +4030,10 @@ loop:
 		 * here.
 		 */
 		if (flags & GB_NOCREAT)
-			return NULL;
+			return (EEXIST);
 		if (bdomain[bo->bo_domain].bd_freebuffers == 0 &&
 		    TD_IS_IDLETHREAD(curthread))
-			return NULL;
+			return (EBUSY);
 
 		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
 		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
@@ -4025,11 +4047,22 @@ loop:
 			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 		}
 		maxsize = imax(maxsize, bsize);
+		if ((flags & GB_NOSPARSE) != 0 && vmio &&
+		    !vn_isdisk(vp, NULL)) {
+			error = VOP_BMAP(vp, blkno, NULL, &d_blkno, 0, 0);
+			KASSERT(error != EOPNOTSUPP,
+			    ("GB_NOSPARSE from fs not supporting bmap, vp %p",
+			    vp));
+			if (error != 0)
+				return (error);
+			if (d_blkno == -1)
+				return (EJUSTRETURN);
+		}
 
 		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
-				return NULL;
+				return (ETIMEDOUT);
 			/*
 			 * XXX This is here until the sleep path is diagnosed
 			 * enough to work under very low memory conditions.
@@ -4075,7 +4108,8 @@ loop:
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
-		bp->b_blkno = bp->b_lblkno = blkno;
+		bp->b_lblkno = blkno;
+		bp->b_blkno = d_blkno;
 		bp->b_offset = offset;
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
@@ -4110,7 +4144,8 @@ end:
 	buf_track(bp, __func__);
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
-	return (bp);
+	*bpp = bp;
+	return (0);
 }
 
 /*

Modified: head/sys/kern/vfs_cluster.c
==============================================================================
--- head/sys/kern/vfs_cluster.c	Sat May 12 20:00:29 2018	(r333575)
+++ head/sys/kern/vfs_cluster.c	Sun May 13 09:47:28 2018	(r333576)
@@ -94,12 +94,14 @@ cluster_read(struct vnode *vp, u_quad_t filesize, dadd
 {
 	struct buf *bp, *rbp, *reqbp;
 	struct bufobj *bo;
+	struct thread *td;
 	daddr_t blkno, origblkno;
 	int maxra, racluster;
 	int error, ncontig;
 	int i;
 
 	error = 0;
+	td = curthread;
 	bo = &vp->v_bufobj;
 	if (!unmapped_buf_allowed)
 		gbflags &= ~GB_UNMAPPED;
@@ -118,10 +120,14 @@ cluster_read(struct vnode *vp, u_quad_t filesize, dadd
 	/*
 	 * get the requested block
 	 */
-	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
-	if (bp == NULL)
-		return (EBUSY);
+	error = getblkx(vp, lblkno, size, 0, 0, gbflags, &bp);
+	if (error != 0) {
+		*bpp = NULL;
+		return (error);
+	}
+	gbflags &= ~GB_NOSPARSE;
 	origblkno = lblkno;
+	*bpp = reqbp = bp;
 
 	/*
 	 * if it is in the cache, then check to see if the reads have been
@@ -243,12 +249,12 @@ cluster_read(struct vnode *vp, u_quad_t filesize, dadd
 		bstrategy(bp);
 #ifdef RACCT
 		if (racct_enable) {
-			PROC_LOCK(curproc);
-			racct_add_buf(curproc, bp, 0);
-			PROC_UNLOCK(curproc);
+			PROC_LOCK(td->td_proc);
+			racct_add_buf(td->td_proc, bp, 0);
+			PROC_UNLOCK(td->td_proc);
 		}
 #endif /* RACCT */
-		curthread->td_ru.ru_inblock++;
+		td->td_ru.ru_inblock++;
 	}
 
 	/*
@@ -303,12 +309,12 @@ cluster_read(struct vnode *vp, u_quad_t filesize, dadd
 		bstrategy(rbp);
 #ifdef RACCT
 		if (racct_enable) {
-			PROC_LOCK(curproc);
-			racct_add_buf(curproc, rbp, 0);
-			PROC_UNLOCK(curproc);
+			PROC_LOCK(td->td_proc);
+			racct_add_buf(td->td_proc, rbp, 0);
+			PROC_UNLOCK(td->td_proc);
 		}
 #endif /* RACCT */
-		curthread->td_ru.ru_inblock++;
+		td->td_ru.ru_inblock++;
 	}
 
 	if (reqbp) {

Modified: head/sys/sys/buf.h
==============================================================================
--- head/sys/sys/buf.h	Sat May 12 20:00:29 2018	(r333575)
+++ head/sys/sys/buf.h	Sun May 13 09:47:28 2018	(r333576)
@@ -479,6 +479,7 @@ buf_track(struct buf *bp, const char *location)
 #define	GB_UNMAPPED	0x0008		/* Do not mmap buffer pages. */
 #define	GB_KVAALLOC	0x0010		/* But allocate KVA. */
 #define	GB_CKHASH	0x0020		/* If reading, calc checksum hash */
+#define	GB_NOSPARSE	0x0040		/* Do not instantiate holes */
 
 #ifdef _KERNEL
 extern int	nbuf;			/* The number of buffer headers */
@@ -540,6 +541,8 @@ struct buf *     getpbuf(int *);
 struct buf *incore(struct bufobj *, daddr_t);
 struct buf *gbincore(struct bufobj *, daddr_t);
 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
+int	getblkx(struct vnode *vp, daddr_t blkno, int size, int slpflag,
+	    int slptimeo, int flags, struct buf **bpp);
 struct buf *geteblk(int, int);
 int	bufwait(struct buf *);
 int	bufwrite(struct buf *);

Modified: head/sys/ufs/ffs/ffs_vnops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vnops.c	Sat May 12 20:00:29 2018	(r333575)
+++ head/sys/ufs/ffs/ffs_vnops.c	Sun May 13 09:47:28 2018	(r333576)
@@ -462,6 +462,26 @@ ffs_lock(ap)
 #endif
 }
 
+static int
+ffs_read_hole(struct uio *uio, long xfersize, long *size)
+{
+	ssize_t saved_resid, tlen;
+	int error;
+
+	while (xfersize > 0) {
+		tlen = min(xfersize, ZERO_REGION_SIZE);
+		saved_resid = uio->uio_resid;
+		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
+		    tlen, uio);
+		if (error != 0)
+			return (error);
+		tlen = saved_resid - uio->uio_resid;
+		xfersize -= tlen;
+		*size -= tlen;
+	}
+	return (0);
+}
+
 /*
  * Vnode op for reading.
  */
@@ -483,9 +503,7 @@ ffs_read(ap)
 	off_t bytesinfile;
 	long size, xfersize, blkoffset;
 	ssize_t orig_resid;
-	int error;
-	int seqcount;
-	int ioflag;
+	int bflag, error, ioflag, seqcount;
 
 	vp = ap->a_vp;
 	uio = ap->a_uio;
@@ -529,6 +547,7 @@ ffs_read(ap)
 	    uio->uio_offset >= fs->fs_maxfilesize)
 		return (EOVERFLOW);
 
+	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 			break;
@@ -565,8 +584,7 @@ ffs_read(ap)
 			/*
 			 * Don't do readahead if this is the end of the file.
 			 */
-			error = bread_gb(vp, lbn, size, NOCRED,
-			    GB_UNMAPPED, &bp);
+			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			/*
 			 * Otherwise if we are allowed to cluster,
@@ -577,7 +595,7 @@ ffs_read(ap)
 			 */
 			error = cluster_read(vp, ip->i_size, lbn,
 			    size, NOCRED, blkoffset + uio->uio_resid,
-			    seqcount, GB_UNMAPPED, &bp);
+			    seqcount, bflag, &bp);
 		} else if (seqcount > 1) {
 			/*
 			 * If we are NOT allowed to cluster, then
@@ -589,17 +607,21 @@ ffs_read(ap)
 			 */
 			u_int nextsize = blksize(fs, ip, nextlbn);
 			error = breadn_flags(vp, lbn, size, &nextlbn,
-			    &nextsize, 1, NOCRED, GB_UNMAPPED, NULL, &bp);
+			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
 		} else {
 			/*
 			 * Failing all of the above, just read what the
 			 * user asked for. Interestingly, the same as
 			 * the first option above.
 			 */
-			error = bread_gb(vp, lbn, size, NOCRED,
-			    GB_UNMAPPED, &bp);
+			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
 		}
-		if (error) {
+		if (error == EJUSTRETURN) {
+			error = ffs_read_hole(uio, xfersize, &size);
+			if (error == 0)
+				continue;
+		}
+		if (error != 0) {
 			brelse(bp);
 			bp = NULL;
 			break;


More information about the svn-src-all mailing list