svn commit: r351743 - head/sys/vm

Mark Johnston <markj@FreeBSD.org>
Tue Sep 3 14:30:00 UTC 2019


Author: markj
Date: Tue Sep  3 14:29:58 2019
New Revision: 351743
URL: https://svnweb.freebsd.org/changeset/base/351743

Log:
  Add preliminary support for atomic updates of per-page queue state.
  
  Queue operations on a page use the page lock when updating the page to
  reflect the desired queue state, and the page queue lock when physically
  enqueuing or dequeuing a page.  Multiple pages share a given page lock,
  but queue state is per-page; this false sharing results in heavy lock
  contention.
  
  Take a small step towards the use of atomic_cmpset to synchronize
  updates to per-page queue state by introducing vm_page_pqstate_cmpset()
  and using it in the page daemon.  In the longer term the plan is to stop
  using the page lock to protect page identity and rely only on the object
  and page busy locks.  However, since the page daemon avoids acquiring
  the object lock except when necessary, some synchronization with a
  concurrent free of the page is required.  vm_page_pqstate_cmpset() can
  be used to ensure that queue state updates are successful only if the
  page is not scheduled for a dequeue, which is sufficient for the page
  daemon.
  
  Add vm_page_swapqueue(), which moves a page from one queue to another
  using vm_page_pqstate_cmpset().  Use it in the active queue scan, which
  does not use the object lock.  Modify vm_page_dequeue_deferred() to
  use vm_page_pqstate_cmpset() as well.
  
  Reviewed by:	kib
  Discussed with:	jeff
  Sponsored by:	Netflix
  Differential Revision:	https://reviews.freebsd.org/D21257
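
As a rough picture of the scheme described above, here is a minimal userspace
sketch of the queue-state compare-and-swap, assuming a simplified layout in
which the queue-operation flags occupy the low byte of a 32-bit word and the
queue index occupies another byte of the same word.  The names, flag values,
and shifts are illustrative stand-ins rather than the kernel's, and the real
routine also deals with PGA_REQUEUE_HEAD:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define	PGA_DEQUEUE	0x01u		/* illustrative values, not the kernel's */
#define	PGA_REQUEUE	0x02u
#define	Q_SHIFT		16		/* queue index byte within the same word */
#define	Q_MASK		(0xffu << Q_SHIFT)

static bool
pqstate_cmpset(_Atomic uint32_t *state, uint32_t oldq, uint32_t newq,
    uint32_t fflags, uint32_t nflags)
{
	uint32_t oval, nval;

	oval = atomic_load(state);
	do {
		/* Fail if any forbidden flag is set or the queue changed. */
		if ((oval & fflags) != 0)
			return (false);
		if ((oval & Q_MASK) != (oldq << Q_SHIFT))
			return (false);
		/* Install the new queue index and flags in one update. */
		nval = (oval & ~(Q_MASK | PGA_DEQUEUE | PGA_REQUEUE)) |
		    nflags | (newq << Q_SHIFT);
	} while (!atomic_compare_exchange_weak(state, &oval, nval));
	return (true);
}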

Modified:
  head/sys/vm/vm_page.c
  head/sys/vm/vm_page.h
  head/sys/vm/vm_pageout.c

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c	Tue Sep  3 13:18:51 2019	(r351742)
+++ head/sys/vm/vm_page.c	Tue Sep  3 14:29:58 2019	(r351743)
@@ -3130,6 +3130,13 @@ vm_pqbatch_process(struct vm_pagequeue *pq, struct vm_
 	vm_batchqueue_init(bq);
 }
 
+/*
+ *	vm_page_pqbatch_submit:		[ internal use only ]
+ *
+ *	Enqueue a page in the specified page queue's batched work queue.
+ *	The caller must have encoded the requested operation in the page
+ *	structure's aflags field.
+ */
 void
 vm_page_pqbatch_submit(vm_page_t m, uint8_t queue)
 {
@@ -3251,17 +3258,26 @@ vm_page_dequeue_deferred(vm_page_t m)
 
 	if ((queue = vm_page_queue(m)) == PQ_NONE)
 		return;
-	vm_page_aflag_set(m, PGA_DEQUEUE);
-	vm_page_pqbatch_submit(m, queue);
+
+	/*
+	 * Set PGA_DEQUEUE if it is not already set to handle a concurrent call
+	 * to vm_page_dequeue_deferred_free().  In particular, avoid modifying
+	 * the page's queue state once vm_page_dequeue_deferred_free() has been
+	 * called.  In the event of a race, two batch queue entries for the page
+	 * will be created, but the second will have no effect.
+	 */
+	if (vm_page_pqstate_cmpset(m, queue, queue, PGA_DEQUEUE, PGA_DEQUEUE))
+		vm_page_pqbatch_submit(m, queue);
 }
 
 /*
  * A variant of vm_page_dequeue_deferred() that does not assert the page
- * lock and is only to be called from vm_page_free_prep().  It is just an
- * open-coded implementation of vm_page_dequeue_deferred().  Because the
- * page is being freed, we can assume that nothing else is scheduling queue
- * operations on this page, so we get for free the mutual exclusion that
- * is otherwise provided by the page lock.
+ * lock and is only to be called from vm_page_free_prep().  Because the
+ * page is being freed, we can assume that nothing other than the page
+ * daemon is scheduling queue operations on this page, so we get for
+ * free the mutual exclusion that is otherwise provided by the page lock.
+ * To handle races, the page daemon must take care to atomically check
+ * for PGA_DEQUEUE when updating queue state.
  */
 static void
 vm_page_dequeue_deferred_free(vm_page_t m)
@@ -3372,6 +3388,42 @@ vm_page_requeue(vm_page_t m)
 	if ((m->aflags & PGA_REQUEUE) == 0)
 		vm_page_aflag_set(m, PGA_REQUEUE);
 	vm_page_pqbatch_submit(m, atomic_load_8(&m->queue));
+}
+
+/*
+ *	vm_page_swapqueue:		[ internal use only ]
+ *
+ *	Move the page from one queue to another, or to the tail of its
+ *	current queue, in the face of a possible concurrent call to
+ *	vm_page_dequeue_deferred_free().
+ */
+void
+vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq)
+{
+	struct vm_pagequeue *pq;
+
+	KASSERT(oldq < PQ_COUNT && newq < PQ_COUNT && oldq != newq,
+	    ("vm_page_swapqueue: invalid queues (%d, %d)", oldq, newq));
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("vm_page_swapqueue: page %p is unmanaged", m));
+	vm_page_assert_locked(m);
+
+	/*
+	 * Atomically update the queue field and set PGA_REQUEUE while
+	 * ensuring that PGA_DEQUEUE has not been set.
+	 */
+	pq = &vm_pagequeue_domain(m)->vmd_pagequeues[oldq];
+	vm_pagequeue_lock(pq);
+	if (!vm_page_pqstate_cmpset(m, oldq, newq, PGA_DEQUEUE, PGA_REQUEUE)) {
+		vm_pagequeue_unlock(pq);
+		return;
+	}
+	if ((m->aflags & PGA_ENQUEUED) != 0) {
+		vm_pagequeue_remove(pq, m);
+		vm_page_aflag_clear(m, PGA_ENQUEUED);
+	}
+	vm_pagequeue_unlock(pq);
+	vm_page_pqbatch_submit(m, newq);
 }
 
 /*
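
The deferred-dequeue and swapqueue changes above can be exercised against the
userspace model sketched after the log message: once PGA_DEQUEUE has been
set, later queue-state updates back off, mirroring how vm_page_swapqueue()
and vm_page_dequeue_deferred() give up when a free is pending.  The queue
indices below are arbitrary illustrative values:

#include <assert.h>

enum { PQ_INACTIVE = 1, PQ_ACTIVE = 2 };	/* illustrative values only */

int
main(void)
{
	_Atomic uint32_t state = PQ_ACTIVE << Q_SHIFT;

	/* Active -> inactive succeeds while no dequeue is pending. */
	assert(pqstate_cmpset(&state, PQ_ACTIVE, PQ_INACTIVE,
	    PGA_DEQUEUE, PGA_REQUEUE));

	/* A freeing thread requests a deferred dequeue... */
	atomic_fetch_or(&state, PGA_DEQUEUE);

	/* ...after which further queue-state updates are refused. */
	assert(!pqstate_cmpset(&state, PQ_INACTIVE, PQ_ACTIVE,
	    PGA_DEQUEUE, PGA_REQUEUE));
	return (0);
}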

Modified: head/sys/vm/vm_page.h
==============================================================================
--- head/sys/vm/vm_page.h	Tue Sep  3 13:18:51 2019	(r351742)
+++ head/sys/vm/vm_page.h	Tue Sep  3 14:29:58 2019	(r351743)
@@ -147,28 +147,34 @@
  *	sleep until the page's busy state changes, after which the caller
  *	must re-lookup the page and re-evaluate its state.
  *
- *	The queue field is the index of the page queue containing the
- *	page, or PQ_NONE if the page is not enqueued.  The queue lock of a
- *	page is the page queue lock corresponding to the page queue index,
- *	or the page lock (P) for the page if it is not enqueued.  To modify
- *	the queue field, the queue lock for the old value of the field must
- *	be held.  It is invalid for a page's queue field to transition
- *	between two distinct page queue indices.  That is, when updating
- *	the queue field, either the new value or the old value must be
- *	PQ_NONE.
+ *	The queue field is the index of the page queue containing the page,
+ *	or PQ_NONE if the page is not enqueued.  The queue lock of a page is
+ *	the page queue lock corresponding to the page queue index, or the
+ *	page lock (P) for the page if it is not enqueued.  To modify the
+ *	queue field, the queue lock for the old value of the field must be
+ *	held.  There is one exception to this rule: the page daemon may
+ *	transition the queue field from PQ_INACTIVE to PQ_NONE immediately
+ *	prior to freeing a page during an inactive queue scan.  At that
+ *	point the page has already been physically dequeued and no other
+ *	references to that vm_page structure exist.
  *
  *	To avoid contention on page queue locks, page queue operations
- *	(enqueue, dequeue, requeue) are batched using per-CPU queues.
- *	A deferred operation is requested by inserting an entry into a
- *	batch queue; the entry is simply a pointer to the page, and the
- *	request type is encoded in the page's aflags field using the values
- *	in PGA_QUEUE_STATE_MASK.  The type-stability of struct vm_pages is
+ *	(enqueue, dequeue, requeue) are batched using per-CPU queues.  A
+ *	deferred operation is requested by inserting an entry into a batch
+ *	queue; the entry is simply a pointer to the page, and the request
+ *	type is encoded in the page's aflags field using the values in
+ *	PGA_QUEUE_STATE_MASK.  The type-stability of struct vm_pages is
  *	crucial to this scheme since the processing of entries in a given
- *	batch queue may be deferred indefinitely.  In particular, a page
- *	may be freed before its pending batch queue entries have been
- *	processed.  The page lock (P) must be held to schedule a batched
- *	queue operation, and the page queue lock must be held in order to
- *	process batch queue entries for the page queue.
+ *	batch queue may be deferred indefinitely.  In particular, a page may
+ *	be freed before its pending batch queue entries have been processed.
+ *	The page lock (P) must be held to schedule a batched queue
+ *	operation, and the page queue lock must be held in order to process
+ *	batch queue entries for the page queue.  There is one exception to
+ *	this rule: the thread freeing a page may schedule a dequeue without
+ *	holding the page lock.  In this scenario the only other thread which
+ *	may hold a reference to the page is the page daemon, which is
+ *	careful to avoid modifying the page's queue state once the dequeue
+ *	has been requested by setting PGA_DEQUEUE.
  */
 
 #if PAGE_SIZE == 4096
@@ -578,6 +584,7 @@ void vm_page_set_valid_range(vm_page_t m, int base, in
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
 void vm_page_sunbusy(vm_page_t m);
+void vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq);
 int vm_page_trysbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
 void vm_page_unswappable(vm_page_t m);
@@ -667,9 +674,31 @@ void vm_page_assert_pga_writeable(vm_page_t m, uint8_t
  * destinations.  In order that we can easily use a 32-bit operation, we
  * require that the aflags field be 32-bit aligned.
  */
-CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
+_Static_assert(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0,
+    "aflags field is not 32-bit aligned");
 
 /*
+ * We want to be able to update the aflags and queue fields atomically in
+ * the same operation.
+ */
+_Static_assert(offsetof(struct vm_page, aflags) / sizeof(uint32_t) ==
+    offsetof(struct vm_page, queue) / sizeof(uint32_t),
+    "aflags and queue fields do not belong to the same 32-bit word");
+_Static_assert(offsetof(struct vm_page, queue) % sizeof(uint32_t) == 2,
+    "queue field is at an unexpected offset");
+_Static_assert(sizeof(((struct vm_page *)NULL)->queue) == 1,
+    "queue field has an unexpected size");
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define	VM_PAGE_AFLAG_SHIFT	0
+#define	VM_PAGE_QUEUE_SHIFT	16
+#else
+#define	VM_PAGE_AFLAG_SHIFT	24
+#define	VM_PAGE_QUEUE_SHIFT	8
+#endif
+#define	VM_PAGE_QUEUE_MASK	(0xff << VM_PAGE_QUEUE_SHIFT)
+
+/*
  *	Clear the given bits in the specified page.
  */
 static inline void
@@ -689,12 +718,7 @@ vm_page_aflag_clear(vm_page_t m, uint8_t bits)
 	 * within this word are handled properly by the atomic update.
 	 */
 	addr = (void *)&m->aflags;
-	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
-	    ("vm_page_aflag_clear: aflags is misaligned"));
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
+	val = bits << VM_PAGE_AFLAG_SHIFT;
 	atomic_clear_32(addr, val);
 }
 
@@ -714,14 +738,44 @@ vm_page_aflag_set(vm_page_t m, uint8_t bits)
 	 * within this word are handled properly by the atomic update.
 	 */
 	addr = (void *)&m->aflags;
-	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
-	    ("vm_page_aflag_set: aflags is misaligned"));
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
+	val = bits << VM_PAGE_AFLAG_SHIFT;
 	atomic_set_32(addr, val);
-} 
+}
+
+/*
+ *	Atomically update the queue state of the page.  The operation fails if
+ *	any of the queue flags in "fflags" are set or if the "queue" field of
+ *	the page does not match the expected value; if the operation is
+ *	successful, the flags in "nflags" are set and all other queue state
+ *	flags are cleared.
+ */
+static inline bool
+vm_page_pqstate_cmpset(vm_page_t m, uint32_t oldq, uint32_t newq,
+    uint32_t fflags, uint32_t nflags)
+{
+	uint32_t *addr, nval, oval, qsmask;
+
+	vm_page_assert_locked(m);
+
+	fflags <<= VM_PAGE_AFLAG_SHIFT;
+	nflags <<= VM_PAGE_AFLAG_SHIFT;
+	newq <<= VM_PAGE_QUEUE_SHIFT;
+	oldq <<= VM_PAGE_QUEUE_SHIFT;
+	qsmask = ((PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD) <<
+	    VM_PAGE_AFLAG_SHIFT) | VM_PAGE_QUEUE_MASK;
+
+	addr = (void *)&m->aflags;
+	oval = atomic_load_32(addr);
+	do {
+		if ((oval & fflags) != 0)
+			return (false);
+		if ((oval & VM_PAGE_QUEUE_MASK) != oldq)
+			return (false);
+		nval = (oval & ~qsmask) | nflags | newq;
+	} while (!atomic_fcmpset_32(addr, &oval, nval));
+
+	return (true);
+}
 
 /*
  *	vm_page_dirty:
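
The new static asserts and byte-order-dependent shifts can be pictured with a
small model of the 32-bit word shared by the aflags and queue fields; the
structure below is a hypothetical stand-in, not struct vm_page itself:

#include <stddef.h>
#include <stdint.h>

struct pagestate {
	uint8_t	aflags;		/* byte 0 of the 32-bit word */
	uint8_t	byte1;		/* placeholder */
	uint8_t	queue;		/* byte 2, as the new assert requires */
	uint8_t	byte3;		/* placeholder */
};

_Static_assert(offsetof(struct pagestate, aflags) / sizeof(uint32_t) ==
    offsetof(struct pagestate, queue) / sizeof(uint32_t),
    "aflags and queue must share a 32-bit word");
_Static_assert(offsetof(struct pagestate, queue) % sizeof(uint32_t) == 2,
    "queue byte sits at offset 2 within the word");

/*
 * On a little-endian machine byte 0 maps to bits 0..7 of the word, so
 * aflags needs no shift and the queue byte at offset 2 lands at bits
 * 16..23.  On a big-endian machine byte 0 is the most significant byte,
 * so aflags lands at bits 24..31 and queue at bits 8..15, which is what
 * VM_PAGE_AFLAG_SHIFT and VM_PAGE_QUEUE_SHIFT encode.
 */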

Modified: head/sys/vm/vm_pageout.c
==============================================================================
--- head/sys/vm/vm_pageout.c	Tue Sep  3 13:18:51 2019	(r351742)
+++ head/sys/vm/vm_pageout.c	Tue Sep  3 14:29:58 2019	(r351743)
@@ -742,7 +742,7 @@ recheck:
 		 * chance.
 		 */
 		if ((m->aflags & PGA_REQUEUE) != 0) {
-			vm_page_requeue(m);
+			vm_page_pqbatch_submit(m, queue);
 			continue;
 		}
 
@@ -1256,9 +1256,9 @@ act_scan:
 			 * place them directly in the laundry queue to reduce
 			 * queuing overhead.
 			 */
-			if (page_shortage <= 0)
-				vm_page_deactivate(m);
-			else {
+			if (page_shortage <= 0) {
+				vm_page_swapqueue(m, PQ_ACTIVE, PQ_INACTIVE);
+			} else {
 				/*
 				 * Calling vm_page_test_dirty() here would
 				 * require acquisition of the object's write
@@ -1270,11 +1270,13 @@ act_scan:
 				 * dirty field by the pmap.
 				 */
 				if (m->dirty == 0) {
-					vm_page_deactivate(m);
+					vm_page_swapqueue(m, PQ_ACTIVE,
+					    PQ_INACTIVE);
 					page_shortage -=
 					    act_scan_laundry_weight;
 				} else {
-					vm_page_launder(m);
+					vm_page_swapqueue(m, PQ_ACTIVE,
+					    PQ_LAUNDRY);
 					page_shortage--;
 				}
 			}
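
For context, the first vm_pageout.c hunk hands the page straight to the batch
queue with vm_page_pqbatch_submit() because PGA_REQUEUE is already set.  The
per-CPU batching described in the vm_page.h comment records bare page
pointers and encodes the requested operation in the page's aflags; a minimal
sketch of such a fixed-size batch queue, with sizes and names assumed for
illustration rather than taken from the kernel's vm_batchqueue, might look
like this:

#include <stdbool.h>

#define	BQ_SIZE	8			/* illustrative size */

struct batchqueue {
	void	*bq_pa[BQ_SIZE];	/* entries are bare page pointers */
	int	bq_cnt;
};

/*
 * Record a deferred queue operation.  Returns false when the batch is
 * full, in which case the caller flushes it while holding the page queue
 * lock; the operation itself is encoded in the page's aflags, so a stale
 * or duplicate entry is harmless when it is eventually processed.
 */
static bool
batchqueue_insert(struct batchqueue *bq, void *m)
{
	if (bq->bq_cnt == BQ_SIZE)
		return (false);
	bq->bq_pa[bq->bq_cnt++] = m;
	return (true);
}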

