svn commit: r237274 - in head: lib/libc/sys sys/kern sys/sys

John Baldwin jhb at FreeBSD.org
Tue Jun 19 18:42:26 UTC 2012


Author: jhb
Date: Tue Jun 19 18:42:24 2012
New Revision: 237274
URL: http://svn.freebsd.org/changeset/base/237274

Log:
  Further refine the implementation of POSIX_FADV_NOREUSE.
  
  First, extend the changes in r230782 to better handle the common case
  of using NOREUSE with sequential reads.  A NOREUSE file descriptor
  will now track the last implicit DONTNEED request it made as a result
  of a NOREUSE read.  If a subsequent NOREUSE read is adjacent to the
  previous range, it will apply the DONTNEED request to the entire range
  of both the previous read and the current read.  The effect is that
  each read of a file accessed sequentially will apply the DONTNEED
  request to the entire range that has been read.  This allows NOREUSE
  to properly handle misaligned reads by flushing each buffer to cache
  once it has been completely read.
  
  Second, apply the same changes made to read(2) by r230782 and this
  change to writes.  This provides much better performance in the
  sequential write case as it allows writes to still be clustered.  It
  also provides much better performance for misaligned writes.  It does
  mean that NOREUSE will be generally ineffective for non-sequential
  writes as the current implementation relies on a future NOREUSE
  write's implicit DONTNEED request to flush the dirty buffer from the
  current write.
  
  MFC after:	2 weeks

Modified:
  head/lib/libc/sys/posix_fadvise.2
  head/sys/kern/vfs_syscalls.c
  head/sys/kern/vfs_vnops.c
  head/sys/sys/file.h

Modified: head/lib/libc/sys/posix_fadvise.2
==============================================================================
--- head/lib/libc/sys/posix_fadvise.2	Tue Jun 19 17:13:14 2012	(r237273)
+++ head/lib/libc/sys/posix_fadvise.2	Tue Jun 19 18:42:24 2012	(r237274)
@@ -28,7 +28,7 @@
 .\"	@(#)madvise.2	8.1 (Berkeley) 6/9/93
 .\" $FreeBSD$
 .\"
-.Dd February 25, 2012
+.Dd June 19, 2012
 .Dt POSIX_FADVISE 2
 .Os
 .Sh NAME
@@ -84,10 +84,9 @@ specified range and future access to thi
 .It Dv POSIX_FADV_NOREUSE
 Tells the system that the specified data will only be accessed once and
 then not reused.
-Accesses to data within the specified range are treated as if the file
-descriptor has the
-.Dv O_DIRECT
-flag enabled.
+The system may decrease the in-memory priority of data once it has been
+read or written.
+Future access to this data may require a read operation.
 .El
 .Sh RETURN VALUES
 .Rv -std posix_fadvise

Modified: head/sys/kern/vfs_syscalls.c
==============================================================================
--- head/sys/kern/vfs_syscalls.c	Tue Jun 19 17:13:14 2012	(r237273)
+++ head/sys/kern/vfs_syscalls.c	Tue Jun 19 18:42:24 2012	(r237274)
@@ -4872,6 +4872,8 @@ kern_posix_fadvise(struct thread *td, in
 			new->fa_advice = advice;
 			new->fa_start = offset;
 			new->fa_end = end;
+			new->fa_prevstart = 0;
+			new->fa_prevend = 0;
 			fp->f_advice = new;
 			new = fa;
 		}

Modified: head/sys/kern/vfs_vnops.c
==============================================================================
--- head/sys/kern/vfs_vnops.c	Tue Jun 19 17:13:14 2012	(r237273)
+++ head/sys/kern/vfs_vnops.c	Tue Jun 19 18:42:24 2012	(r237274)
@@ -542,7 +542,7 @@ vn_read(fp, uio, active_cred, flags, td)
 	int error, ioflag;
 	struct mtx *mtxp;
 	int advice, vfslocked;
-	off_t offset;
+	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -607,9 +607,38 @@ vn_read(fp, uio, active_cred, flags, td)
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-	    offset != uio->uio_offset)
-		error = VOP_ADVISE(vp, offset, uio->uio_offset - 1,
-		    POSIX_FADV_DONTNEED);
+	    offset != uio->uio_offset) {
+		/*
+		 * Use POSIX_FADV_DONTNEED to flush clean pages and
+		 * buffers for the backing file after a
+		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
+		 * case of using POSIX_FADV_NOREUSE with sequential
+		 * access, track the previous implicit DONTNEED
+		 * request and grow this request to include the
+		 * current read(2) in addition to the previous
+		 * DONTNEED.  With purely sequential access this will
+		 * cause the DONTNEED requests to continuously grow to
+		 * cover all of the previously read regions of the
+		 * file.  This allows filesystem blocks that are
+		 * accessed by multiple calls to read(2) to be flushed
+		 * once the last read(2) finishes.
+		 */
+		start = offset;
+		end = uio->uio_offset - 1;
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+				start = fp->f_advice->fa_prevstart;
+			else if (fp->f_advice->fa_prevstart != 0 &&
+			    fp->f_advice->fa_prevstart == end + 1)
+				end = fp->f_advice->fa_prevend;
+			fp->f_advice->fa_prevstart = start;
+			fp->f_advice->fa_prevend = end;
+		}
+		mtx_unlock(mtxp);
+		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
@@ -630,6 +659,7 @@ vn_write(fp, uio, active_cred, flags, td
 	int error, ioflag, lock_flags;
 	struct mtx *mtxp;
 	int advice, vfslocked;
+	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
@@ -664,6 +694,7 @@ vn_write(fp, uio, active_cred, flags, td
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = fp->f_offset;
 	advice = POSIX_FADV_NORMAL;
+	mtxp = NULL;
 	if (fp->f_advice != NULL) {
 		mtxp = mtx_pool_find(mtxpool_sleep, fp);
 		mtx_lock(mtxp);
@@ -676,19 +707,14 @@ vn_write(fp, uio, active_cred, flags, td
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
-	case POSIX_FADV_NOREUSE:
-		/*
-		 * Request the underlying FS to discard the buffers
-		 * and pages after the I/O is complete.
-		 */
-		ioflag |= IO_DIRECT;
-		break;
 	}
+	offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
@@ -701,6 +727,55 @@ vn_write(fp, uio, active_cred, flags, td
 	VOP_UNLOCK(vp, 0);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
+	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+	    offset != uio->uio_offset) {
+		/*
+		 * Use POSIX_FADV_DONTNEED to flush clean pages and
+		 * buffers for the backing file after a
+		 * POSIX_FADV_NOREUSE write(2).  To optimize the
+		 * common case of using POSIX_FADV_NOREUSE with
+		 * sequential access, track the previous implicit
+		 * DONTNEED request and grow this request to include
+		 * the current write(2) in addition to the previous
+		 * DONTNEED.  With purely sequential access this will
+		 * cause the DONTNEED requests to continuously grow to
+		 * cover all of the previously written regions of the
+		 * file.
+		 *
+		 * Note that the blocks just written are almost
+		 * certainly still dirty, so this only works when
+		 * VOP_ADVISE() calls from subsequent writes push out
+		 * the data written by this write(2) once the backing
+		 * buffers are clean.  However, as compared to forcing
+		 * IO_DIRECT, this gives much saner behavior.  Write
+		 * clustering is still allowed, and clean pages are
+		 * merely moved to the cache page queue rather than
+		 * outright thrown away.  This means a subsequent
+		 * read(2) can still avoid hitting the disk if the
+		 * pages have not been reclaimed.
+		 *
+		 * This does make POSIX_FADV_NOREUSE largely useless
+		 * with non-sequential access.  However, sequential
+		 * access is the more common use case and the flag is
+		 * merely advisory.
+		 */
+		start = offset;
+		end = uio->uio_offset - 1;
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+				start = fp->f_advice->fa_prevstart;
+			else if (fp->f_advice->fa_prevstart != 0 &&
+			    fp->f_advice->fa_prevstart == end + 1)
+				end = fp->f_advice->fa_prevend;
+			fp->f_advice->fa_prevstart = start;
+			fp->f_advice->fa_prevend = end;
+		}
+		mtx_unlock(mtxp);
+		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+	}
+	
 unlock:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);

Modified: head/sys/sys/file.h
==============================================================================
--- head/sys/sys/file.h	Tue Jun 19 17:13:14 2012	(r237273)
+++ head/sys/sys/file.h	Tue Jun 19 18:42:24 2012	(r237274)
@@ -126,6 +126,8 @@ struct fadvise_info {
 	int		fa_advice;	/* (f) FADV_* type. */
 	off_t		fa_start;	/* (f) Region start. */
 	off_t		fa_end;		/* (f) Region end. */
+	off_t		fa_prevstart;	/* (f) Previous NOREUSE start. */
+	off_t		fa_prevend;	/* (f) Previous NOREUSE end. */
 };
 
 struct file {


More information about the svn-src-all mailing list