svn commit: r356886 - head/sys/vm

Jeff Roberson jeff at FreeBSD.org
Sun Jan 19 18:30:24 UTC 2020


Author: jeff
Date: Sun Jan 19 18:30:23 2020
New Revision: 356886
URL: https://svnweb.freebsd.org/changeset/base/356886

Log:
  Make collapse synchronization more explicit and allow it to complete during
  paging.
  
  Shadow objects are marked with a COLLAPSING flag while they are collapsing with
  their backing object.  This gives us an explicit test rather than overloading
  paging-in-progress.  While a split is ongoing we mark the object with SPLIT.
  These two operations modify the swap tree, so they must be serialized, and
  swap_pager_getpages() can now detect these conditions directly and page
  more conservatively.
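
  For illustration, a minimal sketch of the conservative paging path described
  above, mirroring the swap_pager_getpages() hunk below.  All identifiers are
  taken from the diff; the surrounding setup and locking are omitted:

	if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
		/* A split or collapse may be moving swap blocks. */
		maxahead = reqcount - 1;	/* only the requested run */
		maxbehind = 0;			/* no read-behind clustering */
	}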
  
  Callers of vm_object_collapse() will now reliably wait for a collapse to finish
  so that the backing chain is as short as possible before other decisions that
  may inflate the object chain, such as split or coalesce, are made.  It is now
  safe to run fault concurrently with collapse.  It is safe to increase or
  decrease paging-in-progress with no lock held, so long as another valid
  reference is held for an increase.
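
  As an illustrative sketch of that waiting discipline (drawn from the new
  vm_object_collapse_wait() in the vm_object.c hunk below), a caller about to
  split or coalesce an object first drains any in-flight collapse so that it
  sees the shortest possible backing chain:

	VM_OBJECT_WLOCK(object);
	while ((object->flags & OBJ_COLLAPSING) != 0)
		vm_object_pip_wait(object, "vmcolwait");
	/* ... split, coalesce, etc. against a stable backing chain ... */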
  
  This change makes collapse more reliable as a secondary benefit.  The primary
  benefit is making it safe to drop the object lock much earlier in fault or
  never acquire it at all.
  
  This was tested with a new shadow chain test script that uncovered
  long-standing bugs; the script will be integrated into stress2.
  
  Reviewed by:	kib, markj
  Differential Revision:	https://reviews.freebsd.org/D22908

Modified:
  head/sys/vm/swap_pager.c
  head/sys/vm/vm_object.c
  head/sys/vm/vm_object.h

Modified: head/sys/vm/swap_pager.c
==============================================================================
--- head/sys/vm/swap_pager.c	Sun Jan 19 18:18:17 2020	(r356885)
+++ head/sys/vm/swap_pager.c	Sun Jan 19 18:30:23 2020	(r356886)
@@ -974,15 +974,12 @@ swp_pager_xfer_source(vm_object_t srcobject, vm_object
 	 * Destination has no swapblk and is not resident, transfer source.
 	 * swp_pager_meta_build() can sleep.
 	 */
-	vm_object_pip_add(srcobject, 1);
 	VM_OBJECT_WUNLOCK(srcobject);
-	vm_object_pip_add(dstobject, 1);
 	dstaddr = swp_pager_meta_build(dstobject, pindex, addr);
 	KASSERT(dstaddr == SWAPBLK_NONE,
 	    ("Unexpected destination swapblk"));
-	vm_object_pip_wakeup(dstobject);
 	VM_OBJECT_WLOCK(srcobject);
-	vm_object_pip_wakeup(srcobject);
+
 	return (true);
 }
 
@@ -995,8 +992,7 @@ swp_pager_xfer_source(vm_object_t srcobject, vm_object
  *	we keep the destination's.
  *
  *	This routine is allowed to sleep.  It may sleep allocating metadata
- *	indirectly through swp_pager_meta_build() or if paging is still in
- *	progress on the source.
+ *	indirectly through swp_pager_meta_build().
  *
  *	The source object contains no vm_page_t's (which is just as well)
  *
@@ -1019,18 +1015,14 @@ swap_pager_copy(vm_object_t srcobject, vm_object_t dst
 	 */
 	if (destroysource && (srcobject->flags & OBJ_ANON) == 0 &&
 	    srcobject->handle != NULL) {
-		vm_object_pip_add(srcobject, 1);
 		VM_OBJECT_WUNLOCK(srcobject);
-		vm_object_pip_add(dstobject, 1);
 		VM_OBJECT_WUNLOCK(dstobject);
 		sx_xlock(&sw_alloc_sx);
 		TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
 		    pager_object_list);
 		sx_xunlock(&sw_alloc_sx);
 		VM_OBJECT_WLOCK(dstobject);
-		vm_object_pip_wakeup(dstobject);
 		VM_OBJECT_WLOCK(srcobject);
-		vm_object_pip_wakeup(srcobject);
 	}
 
 	/*
@@ -1207,26 +1199,29 @@ swap_pager_getpages(vm_object_t object, vm_page_t *ma,
 
 	reqcount = count;
 
-	/*
-	 * Determine the final number of read-behind pages and
-	 * allocate them BEFORE releasing the object lock.  Otherwise,
-	 * there can be a problematic race with vm_object_split().
-	 * Specifically, vm_object_split() might first transfer pages
-	 * that precede ma[0] in the current object to a new object,
-	 * and then this function incorrectly recreates those pages as
-	 * read-behind pages in the current object.
-	 */
 	KASSERT(object->type == OBJT_SWAP,
 	    ("%s: object not swappable", __func__));
 	if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
 		return (VM_PAGER_FAIL);
 
+	KASSERT(reqcount - 1 <= maxahead,
+	    ("page count %d extends beyond swap block", reqcount));
+
 	/*
+	 * Do not transfer any pages other than those that are xbusied
+	 * when running during a split or collapse operation.  This
+	 * prevents clustering from re-creating pages which are being
+	 * moved into another object.
+	 */
+	if ((object->flags & (OBJ_SPLIT | OBJ_DEAD)) != 0) {
+		maxahead = reqcount - 1;
+		maxbehind = 0;
+	}
+
+	/*
 	 * Clip the readahead and readbehind ranges to exclude resident pages.
 	 */
 	if (rahead != NULL) {
-		KASSERT(reqcount - 1 <= maxahead,
-		    ("page count %d extends beyond swap block", reqcount));
 		*rahead = imin(*rahead, maxahead - (reqcount - 1));
 		pindex = ma[reqcount - 1]->pindex;
 		msucc = TAILQ_NEXT(ma[reqcount - 1], listq);

Modified: head/sys/vm/vm_object.c
==============================================================================
--- head/sys/vm/vm_object.c	Sun Jan 19 18:18:17 2020	(r356885)
+++ head/sys/vm/vm_object.c	Sun Jan 19 18:30:23 2020	(r356886)
@@ -116,8 +116,6 @@ static int	vm_object_page_collect_flush(vm_object_t ob
 		    boolean_t *eio);
 static boolean_t vm_object_page_remove_write(vm_page_t p, int flags,
 		    boolean_t *allclean);
-static void	vm_object_qcollapse(vm_object_t object);
-static void	vm_object_vndeallocate(vm_object_t object);
 static void	vm_object_backing_remove(vm_object_t object);
 
 /*
@@ -164,12 +162,18 @@ SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, bypasse
     &object_bypasses,
     "VM object bypasses");
 
+static counter_u64_t object_collapse_waits = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_vm_stats_object, OID_AUTO, collapse_waits, CTLFLAG_RD,
+    &object_collapse_waits,
+    "Number of sleeps for collapse");
+
 static void
 counter_startup(void)
 {
 
 	object_collapses = counter_u64_alloc(M_WAITOK);
 	object_bypasses = counter_u64_alloc(M_WAITOK);
+	object_collapse_waits = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(object_counters, SI_SUB_CPU, SI_ORDER_ANY, counter_startup, NULL);
 
@@ -376,6 +380,19 @@ vm_object_pip_wakeupn(vm_object_t object, short i)
 	refcount_releasen(&object->paging_in_progress, i);
 }
 
+/*
+ * Atomically drop the interlock and wait for pip to drain.  This protects
+ * from sleep/wakeup races due to identity changes.  The lock is not
+ * re-acquired on return.
+ */
+static void
+vm_object_pip_sleep(vm_object_t object, char *waitid)
+{
+
+	refcount_sleep_interlock(&object->paging_in_progress,
+	    &object->lock, waitid, PVM);
+}
+
 void
 vm_object_pip_wait(vm_object_t object, char *waitid)
 {
@@ -383,8 +400,7 @@ vm_object_pip_wait(vm_object_t object, char *waitid)
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	while (REFCOUNT_COUNT(object->paging_in_progress) > 0) {
-		VM_OBJECT_WUNLOCK(object);
-		refcount_wait(&object->paging_in_progress, waitid, PVM);
+		vm_object_pip_sleep(object, waitid);
 		VM_OBJECT_WLOCK(object);
 	}
 }
@@ -466,30 +482,17 @@ vm_object_allocate_anon(vm_pindex_t size, vm_object_t 
 	return (object);
 }
 
-
-/*
- *	vm_object_reference:
- *
- *	Gets another reference to the given object.  Note: OBJ_DEAD
- *	objects can be referenced during final cleaning.
- */
-void
-vm_object_reference(vm_object_t object)
+static void
+vm_object_reference_vnode(vm_object_t object)
 {
 	struct vnode *vp;
 	u_int old;
 
-	if (object == NULL)
-		return;
-
 	/*
-	 * Many places assume exclusive access to objects with a single
-	 * ref. vm_object_collapse() in particular will directly mainpulate
-	 * references for objects in this state.  vnode objects only need
-	 * the lock for the first ref to reference the vnode.
+	 * vnode objects need the lock for the first reference
+	 * to serialize with vnode_object_deallocate().
 	 */
-	if (!refcount_acquire_if_gt(&object->ref_count,
-	    object->type == OBJT_VNODE ? 0 : 1)) {
+	if (!refcount_acquire_if_gt(&object->ref_count, 0)) {
 		VM_OBJECT_RLOCK(object);
 		old = refcount_acquire(&object->ref_count);
 		if (object->type == OBJT_VNODE && old == 0) {
@@ -501,6 +504,26 @@ vm_object_reference(vm_object_t object)
 }
 
 /*
+ *	vm_object_reference:
+ *
+ *	Acquires a reference to the given object.
+ */
+void
+vm_object_reference(vm_object_t object)
+{
+
+	if (object == NULL)
+		return;
+
+	if (object->type == OBJT_VNODE)
+		vm_object_reference_vnode(object);
+	else
+		refcount_acquire(&object->ref_count);
+	KASSERT((object->flags & OBJ_DEAD) == 0,
+	    ("vm_object_reference: Referenced dead object."));
+}
+
+/*
  *	vm_object_reference_locked:
  *
  *	Gets another reference to the given object.
@@ -516,23 +539,23 @@ vm_object_reference_locked(vm_object_t object)
 	VM_OBJECT_ASSERT_LOCKED(object);
 	old = refcount_acquire(&object->ref_count);
 	if (object->type == OBJT_VNODE && old == 0) {
-		vp = object->handle;
-		vref(vp);
-	}
+		vp = object->handle; vref(vp); }
+	KASSERT((object->flags & OBJ_DEAD) == 0,
+	    ("vm_object_reference: Referenced dead object."));
 }
 
 /*
  * Handle deallocating an object of type OBJT_VNODE.
  */
 static void
-vm_object_vndeallocate(vm_object_t object)
+vm_object_deallocate_vnode(vm_object_t object)
 {
 	struct vnode *vp = (struct vnode *) object->handle;
 	bool last;
 
 	KASSERT(object->type == OBJT_VNODE,
-	    ("vm_object_vndeallocate: not a vnode object"));
-	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
+	    ("vm_object_deallocate_vnode: not a vnode object"));
+	KASSERT(vp != NULL, ("vm_object_deallocate_vnode: missing vp"));
 
 	/* Object lock to protect handle lookup. */
 	last = refcount_release(&object->ref_count);
@@ -548,7 +571,54 @@ vm_object_vndeallocate(vm_object_t object)
 	vrele(vp);
 }
 
+
 /*
+ * We dropped a reference on an object and discovered that it had a
+ * single remaining shadow.  This is a sibling of the reference we
+ * dropped.  Attempt to collapse the sibling and backing object.
+ */
+static vm_object_t
+vm_object_deallocate_anon(vm_object_t backing_object)
+{
+	vm_object_t object;
+
+	/* Fetch the final shadow.  */
+	object = LIST_FIRST(&backing_object->shadow_head);
+	KASSERT(object != NULL && backing_object->shadow_count == 1,
+	    ("vm_object_anon_deallocate: ref_count: %d, shadow_count: %d",
+	    backing_object->ref_count, backing_object->shadow_count));
+	KASSERT((object->flags & (OBJ_TMPFS_NODE | OBJ_ANON)) == OBJ_ANON,
+	    ("invalid shadow object %p", object));
+
+	if (!VM_OBJECT_TRYWLOCK(object)) {
+		/*
+		 * Prevent object from disappearing since we do not have a
+		 * reference.
+		 */
+		vm_object_pip_add(object, 1);
+		VM_OBJECT_WUNLOCK(backing_object);
+		VM_OBJECT_WLOCK(object);
+		vm_object_pip_wakeup(object);
+	} else
+		VM_OBJECT_WUNLOCK(backing_object);
+
+	/*
+	 * Check for a collapse/terminate race with the last reference holder.
+	 */
+	if ((object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) != 0 ||
+	    !refcount_acquire_if_not_zero(&object->ref_count)) {
+		VM_OBJECT_WUNLOCK(object);
+		return (NULL);
+	}
+	backing_object = object->backing_object;
+	if (backing_object != NULL && (backing_object->flags & OBJ_ANON) != 0)
+		vm_object_collapse(object);
+	VM_OBJECT_WUNLOCK(object);
+
+	return (object);
+}
+
+/*
  *	vm_object_deallocate:
  *
  *	Release a reference to the specified object,
@@ -562,7 +632,7 @@ vm_object_vndeallocate(vm_object_t object)
 void
 vm_object_deallocate(vm_object_t object)
 {
-	vm_object_t robject, temp;
+	vm_object_t temp;
 	bool released;
 
 	while (object != NULL) {
@@ -583,7 +653,7 @@ vm_object_deallocate(vm_object_t object)
 		if (object->type == OBJT_VNODE) {
 			VM_OBJECT_RLOCK(object);
 			if (object->type == OBJT_VNODE) {
-				vm_object_vndeallocate(object);
+				vm_object_deallocate_vnode(object);
 				return;
 			}
 			VM_OBJECT_RUNLOCK(object);
@@ -594,92 +664,30 @@ vm_object_deallocate(vm_object_t object)
 		    ("vm_object_deallocate: object deallocated too many times: %d",
 		    object->type));
 
-		if (refcount_release(&object->ref_count))
-			goto doterm;
-		if (object->ref_count > 1) {
-			VM_OBJECT_WUNLOCK(object);
-			return;
-		} else if (object->ref_count == 1) {
-			if (object->shadow_count == 0 &&
-			    (object->flags & OBJ_ANON) != 0) {
-				vm_object_set_flag(object, OBJ_ONEMAPPING);
-			} else if (object->shadow_count == 1) {
-				KASSERT((object->flags & OBJ_ANON) != 0,
-				    ("obj %p with shadow_count > 0 is not anon",
-				    object));
-				robject = LIST_FIRST(&object->shadow_head);
-				KASSERT(robject != NULL,
-				    ("vm_object_deallocate: ref_count: %d, "
-				    "shadow_count: %d", object->ref_count,
-				    object->shadow_count));
-				KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0,
-				    ("shadowed tmpfs v_object %p", object));
-				if (!VM_OBJECT_TRYWLOCK(robject)) {
-					/*
-					 * Avoid a potential deadlock.
-					 */
-					refcount_acquire(&object->ref_count);
-					VM_OBJECT_WUNLOCK(object);
-					/*
-					 * More likely than not the thread
-					 * holding robject's lock has lower
-					 * priority than the current thread.
-					 * Let the lower priority thread run.
-					 */
-					pause("vmo_de", 1);
-					continue;
-				}
-				/*
-				 * Collapse object into its shadow unless its
-				 * shadow is dead.  In that case, object will
-				 * be deallocated by the thread that is
-				 * deallocating its shadow.
-				 */
-				if ((robject->flags &
-				    (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON) {
-
-					refcount_acquire(&robject->ref_count);
-retry:
-					if (REFCOUNT_COUNT(robject->paging_in_progress) > 0) {
-						VM_OBJECT_WUNLOCK(object);
-						vm_object_pip_wait(robject,
-						    "objde1");
-						temp = robject->backing_object;
-						if (object == temp) {
-							VM_OBJECT_WLOCK(object);
-							goto retry;
-						}
-					} else if (REFCOUNT_COUNT(object->paging_in_progress) > 0) {
-						VM_OBJECT_WUNLOCK(robject);
-						VM_OBJECT_WUNLOCK(object);
-						refcount_wait(
-						    &object->paging_in_progress,
-						    "objde2", PVM);
-						VM_OBJECT_WLOCK(robject);
-						temp = robject->backing_object;
-						if (object == temp) {
-							VM_OBJECT_WLOCK(object);
-							goto retry;
-						}
-					} else
-						VM_OBJECT_WUNLOCK(object);
-
-					if (robject->ref_count == 1) {
-						refcount_release(&robject->ref_count);
-						object = robject;
-						goto doterm;
-					}
-					object = robject;
-					vm_object_collapse(object);
-					VM_OBJECT_WUNLOCK(object);
-					continue;
-				}
-				VM_OBJECT_WUNLOCK(robject);
+		/*
+		 * If this is not the final reference to an anonymous
+		 * object we may need to collapse the shadow chain.
+		 */
+		if (!refcount_release(&object->ref_count)) {
+			if (object->ref_count > 1 ||
+			    object->shadow_count == 0) {
+				if ((object->flags & OBJ_ANON) != 0 &&
+				    object->ref_count == 1)
+					vm_object_set_flag(object,
+					    OBJ_ONEMAPPING);
+				VM_OBJECT_WUNLOCK(object);
+				return;
 			}
-			VM_OBJECT_WUNLOCK(object);
-			return;
+
+			/* Handle collapsing last ref on anonymous objects. */
+			object = vm_object_deallocate_anon(object);
+			continue;
 		}
-doterm:
+
+		/*
+		 * Handle the final reference to an object.  We restart
+		 * the loop with the backing object to avoid recursion.
+		 */
 		umtx_shm_object_terminated(object);
 		temp = object->backing_object;
 		if (temp != NULL) {
@@ -687,16 +695,11 @@ doterm:
 			    ("shadowed tmpfs v_object 2 %p", object));
 			vm_object_backing_remove(object);
 		}
-		/*
-		 * Don't double-terminate, we could be in a termination
-		 * recursion due to the terminate having to sync data
-		 * to disk.
-		 */
-		if ((object->flags & OBJ_DEAD) == 0) {
-			vm_object_set_flag(object, OBJ_DEAD);
-			vm_object_terminate(object);
-		} else
-			VM_OBJECT_WUNLOCK(object);
+
+		KASSERT((object->flags & OBJ_DEAD) == 0,
+		    ("vm_object_deallocate: Terminating dead object."));
+		vm_object_set_flag(object, OBJ_DEAD);
+		vm_object_terminate(object);
 		object = temp;
 	}
 }
@@ -734,6 +737,9 @@ vm_object_backing_remove_locked(vm_object_t object)
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(backing_object);
 
+	KASSERT((object->flags & OBJ_COLLAPSING) == 0,
+	    ("vm_object_backing_remove: Removing collapsing object."));
+
 	if ((object->flags & OBJ_SHADOWLIST) != 0) {
 		LIST_REMOVE(object, shadow_list);
 		backing_object->shadow_count--;
@@ -788,8 +794,100 @@ vm_object_backing_insert(vm_object_t object, vm_object
 		object->backing_object = backing_object;
 }
 
+/*
+ * Insert an object into a backing_object's shadow list with an additional
+ * reference to the backing_object added.
+ */
+static void
+vm_object_backing_insert_ref(vm_object_t object, vm_object_t backing_object)
+{
 
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	if ((backing_object->flags & OBJ_ANON) != 0) {
+		VM_OBJECT_WLOCK(backing_object);
+		KASSERT((backing_object->flags & OBJ_DEAD) == 0,
+		    ("shadowing dead anonymous object"));
+		vm_object_reference_locked(backing_object);
+		vm_object_backing_insert_locked(object, backing_object);
+		vm_object_clear_flag(backing_object, OBJ_ONEMAPPING);
+		VM_OBJECT_WUNLOCK(backing_object);
+	} else {
+		vm_object_reference(backing_object);
+		object->backing_object = backing_object;
+	}
+}
+
 /*
+ * Transfer a backing reference from backing_object to object.
+ */
+static void
+vm_object_backing_transfer(vm_object_t object, vm_object_t backing_object)
+{
+	vm_object_t new_backing_object;
+
+	/*
+	 * Note that the reference to backing_object->backing_object
+	 * moves from within backing_object to within object.
+	 */
+	vm_object_backing_remove_locked(object);
+	new_backing_object = backing_object->backing_object;
+	if (new_backing_object == NULL)
+		return;
+	if ((new_backing_object->flags & OBJ_ANON) != 0) {
+		VM_OBJECT_WLOCK(new_backing_object);
+		vm_object_backing_remove_locked(backing_object);
+		vm_object_backing_insert_locked(object, new_backing_object);
+		VM_OBJECT_WUNLOCK(new_backing_object);
+	} else {
+		object->backing_object = new_backing_object;
+		backing_object->backing_object = NULL;
+	}
+}
+
+/*
+ * Wait for a concurrent collapse to settle.
+ */
+static void
+vm_object_collapse_wait(vm_object_t object)
+{
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	while ((object->flags & OBJ_COLLAPSING) != 0) {
+		vm_object_pip_wait(object, "vmcolwait");
+		counter_u64_add(object_collapse_waits, 1);
+	}
+}
+
+/*
+ * Waits for a backing object to clear a pending collapse and returns
+ * it locked if it is an ANON object.
+ */
+static vm_object_t
+vm_object_backing_collapse_wait(vm_object_t object)
+{
+	vm_object_t backing_object;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	for (;;) {
+		backing_object = object->backing_object;
+		if (backing_object == NULL ||
+		    (backing_object->flags & OBJ_ANON) == 0)
+			return (NULL);
+		VM_OBJECT_WLOCK(backing_object);
+		if ((backing_object->flags & (OBJ_DEAD | OBJ_COLLAPSING)) == 0)
+			break;
+		VM_OBJECT_WUNLOCK(object);
+		vm_object_pip_sleep(backing_object, "vmbckwait");
+		counter_u64_add(object_collapse_waits, 1);
+		VM_OBJECT_WLOCK(object);
+	}
+	return (backing_object);
+}
+
+/*
  *	vm_object_terminate_pages removes any remaining pageable pages
  *	from the object and resets the object to an empty state.
  */
@@ -843,9 +941,14 @@ vm_object_terminate_pages(vm_object_t object)
 void
 vm_object_terminate(vm_object_t object)
 {
+
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_DEAD) != 0,
 	    ("terminating non-dead obj %p", object));
+	KASSERT((object->flags & OBJ_COLLAPSING) == 0,
+	    ("terminating collapsing obj %p", object));
+	KASSERT(object->backing_object == NULL,
+	    ("terminating shadow obj %p", object));
 
 	/*
 	 * wait for the pageout daemon to be done with the object
@@ -853,11 +956,11 @@ vm_object_terminate(vm_object_t object)
 	vm_object_pip_wait(object, "objtrm");
 
 	KASSERT(!REFCOUNT_COUNT(object->paging_in_progress),
-		("vm_object_terminate: pageout in progress"));
+	    ("vm_object_terminate: pageout in progress"));
 
-	KASSERT(object->ref_count == 0, 
-		("vm_object_terminate: object with references, ref_count=%d",
-		object->ref_count));
+	KASSERT(object->ref_count == 0,
+	    ("vm_object_terminate: object with references, ref_count=%d",
+	    object->ref_count));
 
 	if ((object->flags & OBJ_PG_DTOR) == 0)
 		vm_object_terminate_pages(object);
@@ -1397,11 +1500,13 @@ void
 vm_object_split(vm_map_entry_t entry)
 {
 	vm_page_t m, m_next;
-	vm_object_t orig_object, new_object, source;
+	vm_object_t orig_object, new_object, backing_object;
 	vm_pindex_t idx, offidxstart;
 	vm_size_t size;
 
 	orig_object = entry->object.vm_object;
+	KASSERT((orig_object->flags & OBJ_ONEMAPPING) != 0,
+	    ("vm_object_split:  Splitting object with multiple mappings."));
 	if ((orig_object->flags & OBJ_ANON) == 0)
 		return;
 	if (orig_object->ref_count <= 1)
@@ -1419,35 +1524,25 @@ vm_object_split(vm_map_entry_t entry)
 	    orig_object->cred, ptoa(size));
 
 	/*
+	 * We must wait for the orig_object to complete any in-progress
+	 * collapse so that the swap blocks are stable below.  The
+	 * additional reference on backing_object by new object will
+	 * prevent further collapse operations until split completes.
+	 */
+	VM_OBJECT_WLOCK(orig_object);
+	vm_object_collapse_wait(orig_object);
+
+	/*
 	 * At this point, the new object is still private, so the order in
 	 * which the original and new objects are locked does not matter.
 	 */
 	VM_OBJECT_WLOCK(new_object);
-	VM_OBJECT_WLOCK(orig_object);
 	new_object->domain = orig_object->domain;
-	source = orig_object->backing_object;
-	if (source != NULL) {
-		if ((source->flags & (OBJ_ANON | OBJ_DEAD)) != 0) {
-			VM_OBJECT_WLOCK(source);
-			if ((source->flags & OBJ_DEAD) != 0) {
-				VM_OBJECT_WUNLOCK(source);
-				VM_OBJECT_WUNLOCK(orig_object);
-				VM_OBJECT_WUNLOCK(new_object);
-				new_object->cred = NULL;
-				vm_object_deallocate(new_object);
-				VM_OBJECT_WLOCK(orig_object);
-				return;
-			}
-			vm_object_backing_insert_locked(new_object, source);
-			vm_object_reference_locked(source);	/* for new_object */
-			vm_object_clear_flag(source, OBJ_ONEMAPPING);
-			VM_OBJECT_WUNLOCK(source);
-		} else {
-			vm_object_backing_insert(new_object, source);
-			vm_object_reference(source);
-		}
+	backing_object = orig_object->backing_object;
+	if (backing_object != NULL) {
+		vm_object_backing_insert_ref(new_object, backing_object);
 		new_object->backing_object_offset = 
-			orig_object->backing_object_offset + entry->offset;
+		    orig_object->backing_object_offset + entry->offset;
 	}
 	if (orig_object->cred != NULL) {
 		crhold(orig_object->cred);
@@ -1455,6 +1550,12 @@ vm_object_split(vm_map_entry_t entry)
 		    ("orig_object->charge < 0"));
 		orig_object->charge -= ptoa(size);
 	}
+
+	/*
+	 * Mark the split operation so that swap_pager_getpages() knows
+	 * that the object is in transition.
+	 */
+	vm_object_set_flag(orig_object, OBJ_SPLIT);
 retry:
 	m = vm_page_find_least(orig_object, offidxstart);
 	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
@@ -1523,6 +1624,7 @@ retry:
 		TAILQ_FOREACH(m, &new_object->memq, listq)
 			vm_page_xunbusy(m);
 	}
+	vm_object_clear_flag(orig_object, OBJ_SPLIT);
 	VM_OBJECT_WUNLOCK(orig_object);
 	VM_OBJECT_WUNLOCK(new_object);
 	entry->object.vm_object = new_object;
@@ -1531,12 +1633,8 @@ retry:
 	VM_OBJECT_WLOCK(new_object);
 }
 
-#define	OBSC_COLLAPSE_NOWAIT	0x0002
-#define	OBSC_COLLAPSE_WAIT	0x0004
-
 static vm_page_t
-vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
-    int op)
+vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p)
 {
 	vm_object_t backing_object;
 
@@ -1546,8 +1644,6 @@ vm_object_collapse_scan_wait(vm_object_t object, vm_pa
 
 	KASSERT(p == NULL || p->object == object || p->object == backing_object,
 	    ("invalid ownership %p %p %p", p, object, backing_object));
-	if ((op & OBSC_COLLAPSE_NOWAIT) != 0)
-		return (next);
 	/* The page is only NULL when rename fails. */
 	if (p == NULL) {
 		VM_OBJECT_WUNLOCK(object);
@@ -1632,8 +1728,8 @@ vm_object_scan_all_shadowed(vm_object_t object)
 	return (true);
 }
 
-static bool
-vm_object_collapse_scan(vm_object_t object, int op)
+static void
+vm_object_collapse_scan(vm_object_t object)
 {
 	vm_object_t backing_object;
 	vm_page_t next, p, pp;
@@ -1646,12 +1742,6 @@ vm_object_collapse_scan(vm_object_t object, int op)
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
-	 * Initial conditions
-	 */
-	if ((op & OBSC_COLLAPSE_WAIT) != 0)
-		vm_object_set_flag(backing_object, OBJ_DEAD);
-
-	/*
 	 * Our scan
 	 */
 	for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
@@ -1662,12 +1752,16 @@ vm_object_collapse_scan(vm_object_t object, int op)
 		 * Check for busy page
 		 */
 		if (vm_page_tryxbusy(p) == 0) {
-			next = vm_object_collapse_scan_wait(object, p, next, op);
+			next = vm_object_collapse_scan_wait(object, p);
 			continue;
 		}
 
+		KASSERT(object->backing_object == backing_object,
+		    ("vm_object_collapse_scan: backing object mismatch %p != %p",
+		    object->backing_object, backing_object));
 		KASSERT(p->object == backing_object,
-		    ("vm_object_collapse_scan: object mismatch"));
+		    ("vm_object_collapse_scan: object mismatch %p != %p",
+		    p->object, backing_object));
 
 		if (p->pindex < backing_offset_index ||
 		    new_pindex >= object->size) {
@@ -1689,16 +1783,9 @@ vm_object_collapse_scan(vm_object_t object, int op)
 			 * The page in the parent is busy and possibly not
 			 * (yet) valid.  Until its state is finalized by the
 			 * busy bit owner, we can't tell whether it shadows the
-			 * original page.  Therefore, we must either skip it
-			 * and the original (backing_object) page or wait for
-			 * its state to be finalized.
-			 *
-			 * This is due to a race with vm_fault() where we must
-			 * unbusy the original (backing_obj) page before we can
-			 * (re)lock the parent.  Hence we can get here.
+			 * original page.
 			 */
-			next = vm_object_collapse_scan_wait(object, pp, next,
-			    op);
+			next = vm_object_collapse_scan_wait(object, pp);
 			continue;
 		}
 
@@ -1742,10 +1829,7 @@ vm_object_collapse_scan(vm_object_t object, int op)
 		 */
 		if (vm_page_rename(p, object, new_pindex)) {
 			vm_page_xunbusy(p);
-			if (pp != NULL)
-				vm_page_xunbusy(pp);
-			next = vm_object_collapse_scan_wait(object, NULL, next,
-			    op);
+			next = vm_object_collapse_scan_wait(object, NULL);
 			continue;
 		}
 
@@ -1763,30 +1847,10 @@ vm_object_collapse_scan(vm_object_t object, int op)
 #endif
 		vm_page_xunbusy(p);
 	}
-	return (true);
+	return;
 }
 
-
 /*
- * this version of collapse allows the operation to occur earlier and
- * when paging_in_progress is true for an object...  This is not a complete
- * operation, but should plug 99.9% of the rest of the leaks.
- */
-static void
-vm_object_qcollapse(vm_object_t object)
-{
-	vm_object_t backing_object = object->backing_object;
-
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	VM_OBJECT_ASSERT_WLOCKED(backing_object);
-
-	if (backing_object->ref_count != 1)
-		return;
-
-	vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT);
-}
-
-/*
  *	vm_object_collapse:
  *
  *	Collapse an object with the object backing it.
@@ -1801,53 +1865,48 @@ vm_object_collapse(vm_object_t object)
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	while (TRUE) {
-		/*
-		 * Verify that the conditions are right for collapse:
-		 *
-		 * The object exists and the backing object exists.
-		 */
-		if ((backing_object = object->backing_object) == NULL)
-			break;
+		KASSERT((object->flags & (OBJ_DEAD | OBJ_ANON)) == OBJ_ANON,
+		    ("collapsing invalid object"));
 
 		/*
-		 * we check the backing object first, because it is most likely
-		 * not collapsable.
+		 * Wait for the backing_object to finish any pending
+		 * collapse so that the caller sees the shortest possible
+		 * shadow chain.
 		 */
-		if ((backing_object->flags & OBJ_ANON) == 0)
-			break;
-		VM_OBJECT_WLOCK(backing_object);
-		if ((backing_object->flags & OBJ_DEAD) != 0 ||
-		    (object->flags & (OBJ_DEAD | OBJ_ANON)) != OBJ_ANON) {
-			VM_OBJECT_WUNLOCK(backing_object);
-			break;
-		}
+		backing_object = vm_object_backing_collapse_wait(object);
+		if (backing_object == NULL)
+			return;
 
-		if (REFCOUNT_COUNT(object->paging_in_progress) > 0 ||
-		    REFCOUNT_COUNT(backing_object->paging_in_progress) > 0) {
-			vm_object_qcollapse(object);
-			VM_OBJECT_WUNLOCK(backing_object);
-			break;
-		}
+		KASSERT(object->ref_count > 0 &&
+		    object->ref_count > object->shadow_count,
+		    ("collapse with invalid ref %d or shadow %d count.",
+		    object->ref_count, object->shadow_count));
+		KASSERT((backing_object->flags &
+		    (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
+		    ("vm_object_collapse: Backing object already collapsing."));
+		KASSERT((object->flags & (OBJ_COLLAPSING | OBJ_DEAD)) == 0,
+		    ("vm_object_collapse: object is already collapsing."));
 
 		/*
-		 * We know that we can either collapse the backing object (if
-		 * the parent is the only reference to it) or (perhaps) have
+		 * We know that we can either collapse the backing object if
+		 * the parent is the only reference to it, or (perhaps) have
 		 * the parent bypass the object if the parent happens to shadow
 		 * all the resident pages in the entire backing object.
-		 *
-		 * This is ignoring pager-backed pages such as swap pages.
-		 * vm_object_collapse_scan fails the shadowing test in this
-		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
+			KASSERT(backing_object->shadow_count == 1,
+			    ("vm_object_collapse: shadow_count: %d",
+			    backing_object->shadow_count));
 			vm_object_pip_add(object, 1);
+			vm_object_set_flag(object, OBJ_COLLAPSING);
 			vm_object_pip_add(backing_object, 1);
+			vm_object_set_flag(backing_object, OBJ_DEAD);
 
 			/*
 			 * If there is exactly one reference to the backing
 			 * object, we can collapse it into the parent.
 			 */
-			vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT);
+			vm_object_collapse_scan(object);
 
 #if VM_NRESERVLEVEL > 0
 			/*
@@ -1866,31 +1925,24 @@ vm_object_collapse(vm_object_t object)
 				 * the backing_object's and object's locks are
 				 * released and reacquired.
 				 * Since swap_pager_copy() is being asked to
-				 * destroy the source, it will change the
-				 * backing_object's type to OBJT_DEFAULT.
+				 * destroy backing_object, it will change the
+				 * type to OBJT_DEFAULT.
 				 */
 				swap_pager_copy(
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
 			}
+
 			/*
 			 * Object now shadows whatever backing_object did.
-			 * Note that the reference to 
-			 * backing_object->backing_object moves from within 
-			 * backing_object to within object.
 			 */
-			vm_object_backing_remove_locked(object);
-			new_backing_object = backing_object->backing_object;
-			if (new_backing_object != NULL) {
-				VM_OBJECT_WLOCK(new_backing_object);
-				vm_object_backing_remove_locked(backing_object);
-				vm_object_backing_insert_locked(object,
-				    new_backing_object);
-				VM_OBJECT_WUNLOCK(new_backing_object);
-			}
+			vm_object_clear_flag(object, OBJ_COLLAPSING);
+			vm_object_backing_transfer(object, backing_object);
 			object->backing_object_offset +=
 			    backing_object->backing_object_offset;
+			VM_OBJECT_WUNLOCK(object);
+			vm_object_pip_wakeup(object);
 
 			/*
 			 * Discard backing_object.
@@ -1903,17 +1955,17 @@ vm_object_collapse(vm_object_t object)
 "backing_object %p was somehow re-referenced during collapse!",
 			    backing_object));
 			vm_object_pip_wakeup(backing_object);
-			backing_object->type = OBJT_DEAD;
-			refcount_release(&backing_object->ref_count);
-			VM_OBJECT_WUNLOCK(backing_object);
-			vm_object_destroy(backing_object);
-
-			vm_object_pip_wakeup(object);
+			(void)refcount_release(&backing_object->ref_count);
+			vm_object_terminate(backing_object);
 			counter_u64_add(object_collapses, 1);
+			VM_OBJECT_WLOCK(object);
 		} else {
 			/*
 			 * If we do not entirely shadow the backing object,
 			 * there is nothing we can do so we give up.
+			 *
+			 * The object lock and backing_object lock must not
+			 * be dropped during this sequence.
 			 */
 			if (!vm_object_scan_all_shadowed(object)) {
 				VM_OBJECT_WUNLOCK(backing_object);
@@ -1926,21 +1978,22 @@ vm_object_collapse(vm_object_t object)
 			 * it, since its reference count is at least 2.
 			 */
 			vm_object_backing_remove_locked(object);
-
 			new_backing_object = backing_object->backing_object;
 			if (new_backing_object != NULL) {
-				vm_object_backing_insert(object,
+				vm_object_backing_insert_ref(object,
 				    new_backing_object);
-				vm_object_reference(new_backing_object);
 				object->backing_object_offset +=
-					backing_object->backing_object_offset;
+				    backing_object->backing_object_offset;
 			}
 
 			/*
 			 * Drop the reference count on backing_object. Since
 			 * its ref_count was at least 2, it will not vanish.
 			 */
-			refcount_release(&backing_object->ref_count);
+			(void)refcount_release(&backing_object->ref_count);
+			KASSERT(backing_object->ref_count >= 1, (
+"backing_object %p was somehow dereferenced during collapse!",
+			    backing_object));
 			VM_OBJECT_WUNLOCK(backing_object);
 			counter_u64_add(object_bypasses, 1);
 		}
@@ -2155,7 +2208,7 @@ vm_object_coalesce(vm_object_t prev_object, vm_ooffset
 
 	VM_OBJECT_WLOCK(prev_object);
 	/*
-	 * Try to collapse the object first
+	 * Try to collapse the object first.
 	 */
 	vm_object_collapse(prev_object);
 

Modified: head/sys/vm/vm_object.h
==============================================================================
--- head/sys/vm/vm_object.h	Sun Jan 19 18:18:17 2020	(r356885)
+++ head/sys/vm/vm_object.h	Sun Jan 19 18:30:23 2020	(r356886)
@@ -190,6 +190,8 @@ struct vm_object {
 #define	OBJ_SIZEVNLOCK	0x0040		/* lock vnode to check obj size */
 #define	OBJ_PG_DTOR	0x0080		/* dont reset object, leave that for dtor */
 #define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
+#define	OBJ_SPLIT	0x0400		/* object is being split */
+#define	OBJ_COLLAPSING	0x0800		/* Parent of collapse. */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
 #define	OBJ_ONEMAPPING	0x2000		/* One USE (a single, non-forked) mapping flag */
 #define	OBJ_SHADOWLIST	0x4000		/* Object is on the shadow list. */

