svn commit: r235230 - head/sys/vm

Thu May 10 15:16:43 UTC 2012

Author: alc
Date: Thu May 10 15:16:42 2012
New Revision: 235230
URL: http://svn.freebsd.org/changeset/base/235230

Log:
  Give vm_fault()'s sequential access optimization a makeover.
  
  There are two aspects to the sequential access optimization: (1) read ahead
  of pages that are expected to be accessed in the near future and (2) unmap
  and cache behind of pages that are not expected to be accessed again.  This
  revision changes both aspects.
  
  The read ahead optimization is now more effective.  It starts with the same
  initial read window as before, but arithmetically grows the window on
  sequential page faults.  This can yield increased read bandwidth.  For
  example, on one of my machines, a program using mmap() to read a file that
  is several times larger than the machine's physical memory takes about 17%
  less time to complete.
  
  The unmap and cache behind optimization is now more selectively applied.
  The read ahead window must grow to its maximum size before unmap and cache
  behind is performed.  This significantly reduces the number of times that
  pages are unmapped and cached only to be reactivated a short time later.
  
  The unmap and cache behind optimization now clears each page's referenced
  flag.  Previously, in the case of dirty pages, if the containing file was
  still mapped at the time that the page daemon examined the dirty pages,
  they would be reactivated.
  
  From a stylistic standpoint, this revision also cleanly separates the
  implementation of the read ahead and unmap/cache behind optimizations.
  
  Glanced at:	kib
  MFC after:	2 weeks

Modified:
  head/sys/vm/vm_fault.c
  head/sys/vm/vm_map.c
  head/sys/vm/vm_map.h

Modified: head/sys/vm/vm_fault.c
==============================================================================

--- head/sys/vm/vm_fault.c	Thu May 10 14:27:49 2012	(r235229)
+++ head/sys/vm/vm_fault.c	Thu May 10 15:16:42 2012	(r235230)
@@ -118,9 +118,11 @@ static int prefault_pageorder[] = {
 static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
 static void vm_fault_prefault(pmap_t, vm_offset_t, vm_map_entry_t);
 
-#define VM_FAULT_READ_AHEAD 8
-#define VM_FAULT_READ_BEHIND 7
-#define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1)
+#define	VM_FAULT_READ_BEHIND	8
+#define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
+#define	VM_FAULT_NINCR		(VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
+#define	VM_FAULT_SUM		(VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
+#define	VM_FAULT_CACHE_BEHIND	(VM_FAULT_READ_BEHIND * VM_FAULT_SUM)
 
 struct faultstate {
 	vm_page_t m;
@@ -136,6 +138,8 @@ struct faultstate {
 	int vfslocked;
 };
 
+static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
+
 static inline void
 release_page(struct faultstate *fs)
 {
@@ -236,13 +240,13 @@ vm_fault_hold(vm_map_t map, vm_offset_t 
     int fault_flags, vm_page_t *m_hold)
 {
 	vm_prot_t prot;
-	int is_first_object_locked, result;
-	boolean_t growstack, wired;
+	long ahead, behind;
+	int alloc_req, era, faultcount, nera, reqpage, result;
+	boolean_t growstack, is_first_object_locked, wired;
 	int map_generation;
 	vm_object_t next_object;
-	vm_page_t marray[VM_FAULT_READ], mt, mt_prev;
+	vm_page_t marray[VM_FAULT_READ_MAX];
 	int hardfault;
-	int faultcount, ahead, behind, alloc_req;
 	struct faultstate fs;
 	struct vnode *vp;
 	int locked, error;
@@ -252,7 +256,7 @@ vm_fault_hold(vm_map_t map, vm_offset_t 
 	PCPU_INC(cnt.v_vm_faults);
 	fs.vp = NULL;
 	fs.vfslocked = 0;
-	faultcount = behind = 0;
+	faultcount = reqpage = 0;
 
 RetryFault:;
 
@@ -460,75 +464,47 @@ readrest:
 		 */
 		if (TRYPAGER) {
 			int rv;
-			int reqpage = 0;
 			u_char behavior = vm_map_entry_behavior(fs.entry);
 
 			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
 			    P_KILLED(curproc)) {
+				behind = 0;
 				ahead = 0;
+			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
 				behind = 0;
+				ahead = atop(fs.entry->end - vaddr) - 1;
+				if (ahead > VM_FAULT_READ_AHEAD_MAX)
+					ahead = VM_FAULT_READ_AHEAD_MAX;
+				if (fs.pindex == fs.entry->next_read)
+					vm_fault_cache_behind(&fs,
+					    VM_FAULT_READ_MAX);
 			} else {
-				behind = (vaddr - fs.entry->start) >> PAGE_SHIFT;
-				if (behind > VM_FAULT_READ_BEHIND)
-					behind = VM_FAULT_READ_BEHIND;
-
-				ahead = ((fs.entry->end - vaddr) >> PAGE_SHIFT) - 1;
-				if (ahead > VM_FAULT_READ_AHEAD)
-					ahead = VM_FAULT_READ_AHEAD;
-			}
-			is_first_object_locked = FALSE;
-			if ((behavior == MAP_ENTRY_BEHAV_SEQUENTIAL ||
-			     (behavior != MAP_ENTRY_BEHAV_RANDOM &&
-			      fs.pindex >= fs.entry->lastr &&
-			      fs.pindex < fs.entry->lastr + VM_FAULT_READ)) &&
-			    (fs.first_object == fs.object ||
-			     (is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object))) &&
-			    fs.first_object->type != OBJT_DEVICE &&
-			    fs.first_object->type != OBJT_PHYS &&
-			    fs.first_object->type != OBJT_SG) {
-				vm_pindex_t firstpindex;
-
-				if (fs.first_pindex < 2 * VM_FAULT_READ)
-					firstpindex = 0;
-				else
-					firstpindex = fs.first_pindex - 2 * VM_FAULT_READ;
-				mt = fs.first_object != fs.object ?
-				    fs.first_m : fs.m;
-				KASSERT(mt != NULL, ("vm_fault: missing mt"));
-				KASSERT((mt->oflags & VPO_BUSY) != 0,
-				    ("vm_fault: mt %p not busy", mt));
-				mt_prev = vm_page_prev(mt);
-
 				/*
-				 * note: partially valid pages cannot be 
-				 * included in the lookahead - NFS piecemeal
-				 * writes will barf on it badly.
+				 * If this is a sequential page fault, then
+				 * arithmetically increase the number of pages
+				 * in the read-ahead window.  Otherwise, reset
+				 * the read-ahead window to its smallest size.
 				 */
-				while ((mt = mt_prev) != NULL &&
-				    mt->pindex >= firstpindex &&
-				    mt->valid == VM_PAGE_BITS_ALL) {
-					mt_prev = vm_page_prev(mt);
-					if (mt->busy ||
-					    (mt->oflags & VPO_BUSY))
-						continue;
-					vm_page_lock(mt);
-					if (mt->hold_count ||
-					    mt->wire_count) {
-						vm_page_unlock(mt);
-						continue;
-					}
-					pmap_remove_all(mt);
-					if (mt->dirty != 0)
-						vm_page_deactivate(mt);
-					else
-						vm_page_cache(mt);
-					vm_page_unlock(mt);
-				}
-				ahead += behind;
-				behind = 0;
+				behind = atop(vaddr - fs.entry->start);
+				if (behind > VM_FAULT_READ_BEHIND)
+					behind = VM_FAULT_READ_BEHIND;
+				ahead = atop(fs.entry->end - vaddr) - 1;
+				era = fs.entry->read_ahead;
+				if (fs.pindex == fs.entry->next_read) {
+					nera = era + behind;
+					if (nera > VM_FAULT_READ_AHEAD_MAX)
+						nera = VM_FAULT_READ_AHEAD_MAX;
+					behind = 0;
+					if (ahead > nera)
+						ahead = nera;
+					if (era == VM_FAULT_READ_AHEAD_MAX)
+						vm_fault_cache_behind(&fs,
+						    VM_FAULT_CACHE_BEHIND);
+				} else if (ahead > VM_FAULT_READ_AHEAD_MIN)
+					ahead = VM_FAULT_READ_AHEAD_MIN;
+				if (era != ahead)
+					fs.entry->read_ahead = ahead;
 			}
-			if (is_first_object_locked)
-				VM_OBJECT_UNLOCK(fs.first_object);
 
 			/*
 			 * Call the pager to retrieve the data, if any, after
@@ -899,7 +875,7 @@ vnode_locked:
 	 * without holding a write lock on it.
 	 */
 	if (hardfault)
-		fs.entry->lastr = fs.pindex + faultcount - behind;
+		fs.entry->next_read = fs.pindex + faultcount - reqpage;
 
 	if ((prot & VM_PROT_WRITE) != 0 ||
 	    (fault_flags & VM_FAULT_DIRTY) != 0) {
@@ -992,6 +968,60 @@ vnode_locked:
 }
 
 /*
+ * Speed up the reclamation of up to "distance" pages that precede the
+ * faulting pindex within the first object of the shadow chain.
+ */
+static void
+vm_fault_cache_behind(const struct faultstate *fs, int distance)
+{
+	vm_object_t first_object, object;
+	vm_page_t m, m_prev;
+	vm_pindex_t pindex;
+
+	object = fs->object;
+	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	first_object = fs->first_object;
+	if (first_object != object) {
+		if (!VM_OBJECT_TRYLOCK(first_object)) {
+			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_LOCK(first_object);
+			VM_OBJECT_LOCK(object);
+		}
+	}
+	if (first_object->type != OBJT_DEVICE &&
+	    first_object->type != OBJT_PHYS && first_object->type != OBJT_SG) {
+		if (fs->first_pindex < distance)
+			pindex = 0;
+		else
+			pindex = fs->first_pindex - distance;
+		if (pindex < OFF_TO_IDX(fs->entry->offset))
+			pindex = OFF_TO_IDX(fs->entry->offset);
+		m = first_object != object ? fs->first_m : fs->m;
+		KASSERT((m->oflags & VPO_BUSY) != 0,
+		    ("vm_fault_cache_behind: page %p is not busy", m));
+		m_prev = vm_page_prev(m);
+		while ((m = m_prev) != NULL && m->pindex >= pindex &&
+		    m->valid == VM_PAGE_BITS_ALL) {
+			m_prev = vm_page_prev(m);
+			if (m->busy != 0 || (m->oflags & VPO_BUSY) != 0)
+				continue;
+			vm_page_lock(m);
+			if (m->hold_count == 0 && m->wire_count == 0) {
+				pmap_remove_all(m);
+				vm_page_aflag_clear(m, PGA_REFERENCED);
+				if (m->dirty != 0)
+					vm_page_deactivate(m);
+				else
+					vm_page_cache(m);
+			}
+			vm_page_unlock(m);
+		}
+	}
+	if (first_object != object)
+		VM_OBJECT_UNLOCK(first_object);
+}
+
+/*
  * vm_fault_prefault provides a quick way of clustering
  * pagefaults into a processes address space.  It is a "cousin"
  * of vm_map_pmap_enter, except it runs at page fault time instead

Modified: head/sys/vm/vm_map.c
==============================================================================
--- head/sys/vm/vm_map.c	Thu May 10 14:27:49 2012	(r235229)
+++ head/sys/vm/vm_map.c	Thu May 10 15:16:42 2012	(r235230)
@@ -1300,6 +1300,8 @@ charged:
 	new_entry->protection = prot;
 	new_entry->max_protection = max;
 	new_entry->wired_count = 0;
+	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
+	new_entry->next_read = OFF_TO_IDX(offset);
 
 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
 	    ("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry));

Modified: head/sys/vm/vm_map.h
==============================================================================
--- head/sys/vm/vm_map.h	Thu May 10 14:27:49 2012	(r235229)
+++ head/sys/vm/vm_map.h	Thu May 10 15:16:42 2012	(r235230)
@@ -112,8 +112,9 @@ struct vm_map_entry {
 	vm_prot_t protection;		/* protection code */
 	vm_prot_t max_protection;	/* maximum protection */
 	vm_inherit_t inheritance;	/* inheritance */
+	uint8_t read_ahead;		/* pages in the read-ahead window */
 	int wired_count;		/* can be paged if = 0 */
-	vm_pindex_t lastr;		/* last read */
+	vm_pindex_t next_read;		/* index of the next sequential read */
 	struct ucred *cred;		/* tmp storage for creator ref */
 };
 
@@ -330,6 +331,14 @@ long vmspace_wired_count(struct vmspace 
 #define	VM_FAULT_DIRTY 2		/* Dirty the page; use w/VM_PROT_COPY */
 
 /*
+ * Initially, mappings are slightly sequential.  The maximum window size must
+ * account for the map entry's "read_ahead" field being defined as an uint8_t.
+ */
+#define	VM_FAULT_READ_AHEAD_MIN		7
+#define	VM_FAULT_READ_AHEAD_INIT	15
+#define	VM_FAULT_READ_AHEAD_MAX		min(atop(MAXPHYS) - 1, UINT8_MAX)
+
+/*
  * The following "find_space" options are supported by vm_map_find()
  */
 #define	VMFS_NO_SPACE		0	/* don't find; use the given range */