svn commit: r355765 - in head/sys: dev/md fs/tmpfs kern vm

Jeff Roberson jeff at FreeBSD.org
Sun Dec 15 03:15:08 UTC 2019


Author: jeff
Date: Sun Dec 15 03:15:06 2019
New Revision: 355765
URL: https://svnweb.freebsd.org/changeset/base/355765

Log:
  Add a deferred free mechanism for freeing swap space that does not require
  an exclusive object lock.
  
  Previously, swap space was freed on a best-effort basis when a page with
  a valid swap copy was dirtied, thus invalidating that copy.  This was done
  inconsistently and required the object lock, which is not always
  convenient to hold.
  
  Instead, track when swap space is present.  The first dirtying of a page
  is responsible for deleting the space or for setting PGA_SWAP_FREE, which
  triggers background scans to free the swap space.
  
  Simplify the locking in vm_fault_dirty() now that we can reliably
  identify the first dirtying.
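  
  For illustration, a minimal sketch of the caller-side pattern this change
  establishes.  The example_dirty() helper below is hypothetical; the KPIs
  and flags it refers to are the ones added or converted in the diff:
  
  	#include <sys/param.h>
  	#include <sys/systm.h>
  	#include <vm/vm.h>
  	#include <vm/vm_page.h>
  	#include <vm/vm_pager.h>
  
  	/* Hypothetical caller; the page is busied, as in the converted code. */
  	static void
  	example_dirty(vm_page_t m)
  	{
  
  		/*
  		 * Old pattern, which required the object write lock:
  		 *
  		 *	if (m->dirty != VM_PAGE_BITS_ALL) {
  		 *		vm_page_dirty(m);
  		 *		vm_pager_page_unswapped(m);
  		 *	}
  		 *
  		 * New pattern: vm_page_set_dirty() returns the previous
  		 * dirty bits; on the first dirtying it either frees the
  		 * swap space at once (object write lock held) or sets
  		 * PGA_SWAP_FREE so that a page daemon scan frees it later.
  		 */
  		vm_page_set_dirty(m);
  	}
  
  The deferred half of the pattern is completed by the vm_page.c and
  vm_pageout.c hunks below, which call vm_pager_page_unswapped() once the
  object lock is available.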
  
  Discussed with:	alc, kib, markj
  Differential Revision:	https://reviews.freebsd.org/D22654

Modified:
  head/sys/dev/md/md.c
  head/sys/fs/tmpfs/tmpfs_subr.c
  head/sys/kern/uipc_shm.c
  head/sys/vm/swap_pager.c
  head/sys/vm/vm_fault.c
  head/sys/vm/vm_page.c
  head/sys/vm/vm_page.h
  head/sys/vm/vm_pageout.c
  head/sys/vm/vm_pager.h

Modified: head/sys/dev/md/md.c
==============================================================================
--- head/sys/dev/md/md.c	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/dev/md/md.c	Sun Dec 15 03:15:06 2019	(r355765)
@@ -1118,10 +1118,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp)
 			}
 
 			vm_page_valid(m);
-			if (m->dirty != VM_PAGE_BITS_ALL) {
-				vm_page_dirty(m);
-				vm_pager_page_unswapped(m);
-			}
+			vm_page_set_dirty(m);
 		} else if (bp->bio_cmd == BIO_DELETE) {
 			if (len == PAGE_SIZE || vm_page_all_valid(m))
 				rv = VM_PAGER_OK;
@@ -1138,10 +1135,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp)
 				/* Page is valid. */
 				if (len != PAGE_SIZE) {
 					pmap_zero_page_area(m, offs, len);
-					if (m->dirty != VM_PAGE_BITS_ALL) {
-						vm_page_dirty(m);
-						vm_pager_page_unswapped(m);
-					}
+					vm_page_set_dirty(m);
 				} else {
 					vm_pager_page_unswapped(m);
 					vm_page_free(m);

Modified: head/sys/fs/tmpfs/tmpfs_subr.c
==============================================================================
--- head/sys/fs/tmpfs/tmpfs_subr.c	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/fs/tmpfs/tmpfs_subr.c	Sun Dec 15 03:15:06 2019	(r355765)
@@ -1505,9 +1505,8 @@ retry:
 			}
 			if (m != NULL) {
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
-				vm_page_dirty(m);
+				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
-				vm_pager_page_unswapped(m);
 			}
 		}
 

Modified: head/sys/kern/uipc_shm.c
==============================================================================
--- head/sys/kern/uipc_shm.c	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/kern/uipc_shm.c	Sun Dec 15 03:15:06 2019	(r355765)
@@ -198,7 +198,7 @@ uiomove_object_page(vm_object_t obj, size_t len, struc
 	 * type object.
 	 */
 	rv = vm_page_grab_valid(&m, obj, idx,
-	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_NOBUSY);
+	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
 	if (rv != VM_PAGER_OK) {
 		VM_OBJECT_WUNLOCK(obj);
 		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
@@ -207,13 +207,10 @@ uiomove_object_page(vm_object_t obj, size_t len, struc
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	error = uiomove_fromphys(&m, offset, tlen, uio);
-	if (uio->uio_rw == UIO_WRITE && error == 0) {
-		VM_OBJECT_WLOCK(obj);
-		vm_page_dirty(m);
-		vm_pager_page_unswapped(m);
-		VM_OBJECT_WUNLOCK(obj);
-	}
-	vm_page_unwire(m, PQ_ACTIVE);
+	if (uio->uio_rw == UIO_WRITE && error == 0)
+		vm_page_set_dirty(m);
+	vm_page_aflag_set(m, PGA_REFERENCED);
+	vm_page_sunbusy(m);
 
 	return (error);
 }
@@ -527,9 +524,8 @@ retry:
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				KASSERT(vm_page_all_valid(m),
 				    ("shm_dotruncate: page %p is invalid", m));
-				vm_page_dirty(m);
+				vm_page_set_dirty(m);
 				vm_page_xunbusy(m);
-				vm_pager_page_unswapped(m);
 			}
 		}
 		delta = IDX_TO_OFF(object->size - nobjsize);

Modified: head/sys/vm/swap_pager.c
==============================================================================
--- head/sys/vm/swap_pager.c	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/vm/swap_pager.c	Sun Dec 15 03:15:06 2019	(r355765)
@@ -155,6 +155,9 @@ static struct sx swdev_syscall_lock;	/* serialize swap
 static u_long swap_reserved;
 static u_long swap_total;
 static int sysctl_page_shift(SYSCTL_HANDLER_ARGS);
+
+static SYSCTL_NODE(_vm_stats, OID_AUTO, swap, CTLFLAG_RD, 0, "VM swap stats");
+
 SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_reserved, 0, sysctl_page_shift, "A", 
     "Amount of swap storage needed to back all allocated anonymous memory.");
@@ -173,6 +176,16 @@ static unsigned long swap_maxpages;
 SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
     "Maximum amount of swap supported");
 
+static counter_u64_t swap_free_deferred;
+SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_deferred,
+    CTLFLAG_RD, &swap_free_deferred,
+    "Number of pages that deferred freeing swap space");
+
+static counter_u64_t swap_free_completed;
+SYSCTL_COUNTER_U64(_vm_stats_swap, OID_AUTO, free_completed,
+    CTLFLAG_RD, &swap_free_completed,
+    "Number of deferred frees completed");
+
 /* bits from overcommit */
 #define	SWAP_RESERVE_FORCE_ON		(1 << 0)
 #define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
@@ -513,6 +526,15 @@ swap_pager_init(void)
 	sx_init(&swdev_syscall_lock, "swsysc");
 }
 
+static void
+swap_pager_counters(void)
+{
+
+	swap_free_deferred = counter_u64_alloc(M_WAITOK);
+	swap_free_completed = counter_u64_alloc(M_WAITOK);
+}
+SYSINIT(swap_counters, SI_SUB_CPU, SI_ORDER_ANY, swap_pager_counters, NULL);
+
 /*
  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  *
@@ -1112,14 +1134,37 @@ swap_pager_haspage(vm_object_t object, vm_pindex_t pin
  *
  *	This routine may not sleep.
  *
- *	The object containing the page must be locked.
+ *	The object containing the page may be locked.
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 	struct swblk *sb;
+	vm_object_t obj;
 
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	/*
+	 * Handle enqueuing deferred frees first.  If we do not have the
+	 * object lock, we wait for the page daemon to clear the space.
+	 */
+	obj = m->object;
+	if (!VM_OBJECT_WOWNED(obj)) {
+		VM_PAGE_OBJECT_BUSY_ASSERT(m);
+		/*
+		 * The caller is responsible for synchronization but we
+		 * will harmlessly handle races.  This is typically provided
+		 * by only calling unswapped() when a page transitions from
+		 * clean to dirty.
+		 */
+		if ((m->a.flags & (PGA_SWAP_SPACE | PGA_SWAP_FREE)) ==
+		    PGA_SWAP_SPACE) {
+			vm_page_aflag_set(m, PGA_SWAP_FREE);
+			counter_u64_add(swap_free_deferred, 1);
+		}
+		return;
+	}
+	if ((m->a.flags & PGA_SWAP_FREE) != 0)
+		counter_u64_add(swap_free_completed, 1);
+	vm_page_aflag_clear(m, PGA_SWAP_FREE | PGA_SWAP_SPACE);
 
 	/*
 	 * The meta data only exists if the object is OBJT_SWAP
@@ -1436,6 +1481,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *ma,
 		VM_OBJECT_WLOCK(object);
 		for (j = 0; j < n; ++j) {
 			mreq = ma[i + j];
+			vm_page_aflag_clear(mreq, PGA_SWAP_FREE);
 			addr = swp_pager_meta_build(mreq->object, mreq->pindex,
 			    blk + j);
 			if (addr != SWAPBLK_NONE)
@@ -1560,6 +1606,9 @@ swp_pager_async_iodone(struct buf *bp)
 			wakeup(&object->handle);
 		}
 
+		/* We always have space after I/O, successful or not. */
+		vm_page_aflag_set(m, PGA_SWAP_SPACE);
+
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
@@ -1581,6 +1630,7 @@ swp_pager_async_iodone(struct buf *bp)
 				 * then finish the I/O.
 				 */
 				MPASS(m->dirty == VM_PAGE_BITS_ALL);
+				/* PQ_UNSWAPPABLE? */
 				vm_page_lock(m);
 				vm_page_activate(m);
 				vm_page_unlock(m);

Modified: head/sys/vm/vm_fault.c
==============================================================================
--- head/sys/vm/vm_fault.c	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/vm/vm_fault.c	Sun Dec 15 03:15:06 2019	(r355765)
@@ -214,7 +214,7 @@ unlock_and_deallocate(struct faultstate *fs)
 
 static void
 vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
-    vm_prot_t fault_type, int fault_flags, bool excl)
+    vm_prot_t fault_type, int fault_flags)
 {
 	bool need_dirty;
 
@@ -223,7 +223,6 @@ vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_p
 	    (m->oflags & VPO_UNMANAGED) != 0)
 		return;
 
-	VM_OBJECT_ASSERT_LOCKED(m->object);
 	VM_PAGE_OBJECT_BUSY_ASSERT(m);
 
 	need_dirty = ((fault_type & VM_PROT_WRITE) != 0 &&
@@ -232,49 +231,29 @@ vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_p
 
 	vm_object_set_writeable_dirty(m->object);
 
-	if (!excl)
-		/*
-		 * If two callers of vm_fault_dirty() with excl ==
-		 * FALSE, one for the map entry with MAP_ENTRY_NOSYNC
-		 * flag set, other with flag clear, race, it is
-		 * possible for the no-NOSYNC thread to see m->dirty
-		 * != 0 and not clear PGA_NOSYNC.  Take vm_page lock
-		 * around manipulation of PGA_NOSYNC and
-		 * vm_page_dirty() call to avoid the race.
-		 */
-		vm_page_lock(m);
-
 	/*
-	 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
-	 * if the page is already dirty to prevent data written with
-	 * the expectation of being synced from not being synced.
-	 * Likewise if this entry does not request NOSYNC then make
-	 * sure the page isn't marked NOSYNC.  Applications sharing
-	 * data should use the same flags to avoid ping ponging.
-	 */
-	if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) {
-		if (m->dirty == 0) {
-			vm_page_aflag_set(m, PGA_NOSYNC);
-		}
-	} else {
-		vm_page_aflag_clear(m, PGA_NOSYNC);
-	}
-
-	/*
 	 * If the fault is a write, we know that this page is being
 	 * written NOW so dirty it explicitly to save on
 	 * pmap_is_modified() calls later.
 	 *
 	 * Also, since the page is now dirty, we can possibly tell
-	 * the pager to release any swap backing the page.  Calling
-	 * the pager requires a write lock on the object.
+	 * the pager to release any swap backing the page.
 	 */
-	if (need_dirty)
-		vm_page_dirty(m);
-	if (!excl)
-		vm_page_unlock(m);
-	else if (need_dirty)
-		vm_pager_page_unswapped(m);
+	if (need_dirty && vm_page_set_dirty(m) == 0) {
+		/*
+		 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
+		 * if the page is already dirty to prevent data written with
+		 * the expectation of being synced from not being synced.
+		 * Likewise if this entry does not request NOSYNC then make
+		 * sure the page isn't marked NOSYNC.  Applications sharing
+		 * data should use the same flags to avoid ping ponging.
+		 */
+		if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0)
+			vm_page_aflag_set(m, PGA_NOSYNC);
+		else
+			vm_page_aflag_clear(m, PGA_NOSYNC);
+	}
+
 }
 
 /*
@@ -344,7 +323,7 @@ vm_fault_soft_fast(struct faultstate *fs, vm_offset_t 
 		*m_hold = m;
 		vm_page_wire(m);
 	}
-	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
+	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags);
 	if (psind == 0 && !wired)
 		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
 	VM_OBJECT_RUNLOCK(fs->first_object);
@@ -502,7 +481,7 @@ vm_fault_populate(struct faultstate *fs, vm_prot_t pro
 		for (i = 0; i < npages; i++) {
 			vm_fault_populate_check_page(&m[i]);
 			vm_fault_dirty(fs->entry, &m[i], prot, fault_type,
-			    fault_flags, true);
+			    fault_flags);
 		}
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
@@ -1381,7 +1360,7 @@ readrest:
 		fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
 
 	vm_page_assert_xbusied(fs.m);
-	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
+	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags);
 
 	/*
 	 * Page must be completely valid or it is not fit to

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/vm/vm_page.c	Sun Dec 15 03:15:06 2019	(r355765)
@@ -1584,6 +1584,10 @@ vm_page_object_remove(vm_page_t m)
 	KASSERT((m->ref_count & VPRC_OBJREF) != 0,
 	    ("page %p is missing its object ref", m));
 
+	/* Deferred free of swap space. */
+	if ((m->a.flags & PGA_SWAP_FREE) != 0)
+		vm_pager_page_unswapped(m);
+
 	mrem = vm_radix_remove(&object->rtree, m->pindex);
 	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
 
@@ -4633,6 +4637,62 @@ vm_page_bits_clear(vm_page_t m, vm_page_bits_t *bits, 
 #endif		/* PAGE_SIZE */
 }
 
+static inline vm_page_bits_t
+vm_page_bits_swap(vm_page_t m, vm_page_bits_t *bits, vm_page_bits_t newbits)
+{
+#if PAGE_SIZE == 32768
+	uint64_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_64(bits, &old, newbits) == 0);
+	return (old);
+#elif PAGE_SIZE == 16384
+	uint32_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_32(bits, &old, newbits) == 0);
+	return (old);
+#elif (PAGE_SIZE == 8192) && defined(atomic_fcmpset_16)
+	uint16_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_16(bits, &old, newbits) == 0);
+	return (old);
+#elif (PAGE_SIZE == 4096) && defined(atomic_fcmpset_8)
+	uint8_t old;
+
+	old = *bits;
+	while (atomic_fcmpset_8(bits, &old, newbits) == 0);
+	return (old);
+#else		/* PAGE_SIZE <= 4096 */
+	uintptr_t addr;
+	uint32_t old, new, mask;
+	int shift;
+
+	addr = (uintptr_t)bits;
+	/*
+	 * Use a trick to perform a 32-bit atomic on the
+	 * containing aligned word, to not depend on the existence
+	 * of atomic_{set, swap, clear}_{8, 16}.
+	 */
+	shift = addr & (sizeof(uint32_t) - 1);
+#if BYTE_ORDER == BIG_ENDIAN
+	shift = (sizeof(uint32_t) - sizeof(vm_page_bits_t) - shift) * NBBY;
+#else
+	shift *= NBBY;
+#endif
+	addr &= ~(sizeof(uint32_t) - 1);
+	mask = VM_PAGE_BITS_ALL << shift;
+
+	old = *bits;
+	do {
+		new = old & ~mask;
+		new |= newbits << shift;
+	} while (atomic_fcmpset_32((uint32_t *)addr, &old, new) == 0);
+	return (old >> shift);
+#endif		/* PAGE_SIZE */
+}
+
 /*
  *	vm_page_set_valid_range:
  *
@@ -4688,6 +4748,28 @@ vm_page_set_valid_range(vm_page_t m, int base, int siz
 		m->valid |= pagebits;
 	else
 		vm_page_bits_set(m, &m->valid, pagebits);
+}
+
+/*
+ * Set the page dirty bits and free the invalid swap space if
+ * present.  Returns the previous dirty bits.
+ */
+vm_page_bits_t
+vm_page_set_dirty(vm_page_t m)
+{
+	vm_page_bits_t old;
+
+	VM_PAGE_OBJECT_BUSY_ASSERT(m);
+
+	if (vm_page_xbusied(m) && !pmap_page_is_write_mapped(m)) {
+		old = m->dirty;
+		m->dirty = VM_PAGE_BITS_ALL;
+	} else
+		old = vm_page_bits_swap(m, &m->dirty, VM_PAGE_BITS_ALL);
+	if (old == 0 && (m->a.flags & PGA_SWAP_SPACE) != 0)
+		vm_pager_page_unswapped(m);
+
+	return (old);
 }
 
 /*

Modified: head/sys/vm/vm_page.h
==============================================================================
--- head/sys/vm/vm_page.h	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/vm/vm_page.h	Sun Dec 15 03:15:06 2019	(r355765)
@@ -429,6 +429,10 @@ extern struct mtx_padalign pa_lock[];
  * PGA_REQUEUE_HEAD is a special flag for enqueuing pages near the head of
  * the inactive queue, thus bypassing LRU.  The page lock must be held to
  * set this flag, and the queue lock for the page must be held to clear it.
+ *
+ * PGA_SWAP_FREE is used to defer freeing swap space to the pageout daemon
+ * when the context that dirties the page does not have the object write lock
+ * held.
  */
 #define	PGA_WRITEABLE	0x0001		/* page may be mapped writeable */
 #define	PGA_REFERENCED	0x0002		/* page has been referenced */
@@ -438,6 +442,8 @@ extern struct mtx_padalign pa_lock[];
 #define	PGA_REQUEUE	0x0020		/* page is due to be requeued */
 #define	PGA_REQUEUE_HEAD 0x0040		/* page requeue should bypass LRU */
 #define	PGA_NOSYNC	0x0080		/* do not collect for syncer */
+#define	PGA_SWAP_FREE	0x0100		/* page with swap space was dirtied */
+#define	PGA_SWAP_SPACE	0x0200		/* page has allocated swap space */
 
 #define	PGA_QUEUE_OP_MASK	(PGA_DEQUEUE | PGA_REQUEUE | PGA_REQUEUE_HEAD)
 #define	PGA_QUEUE_STATE_MASK	(PGA_ENQUEUED | PGA_QUEUE_OP_MASK)
@@ -647,6 +653,7 @@ void vm_page_requeue(vm_page_t m);
 int vm_page_sbusied(vm_page_t m);
 vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start,
     vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options);
+vm_page_bits_t vm_page_set_dirty(vm_page_t m);
 void vm_page_set_valid_range(vm_page_t m, int base, int size);
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 int vm_page_sleep_if_xbusy(vm_page_t m, const char *msg);

Modified: head/sys/vm/vm_pageout.c
==============================================================================
--- head/sys/vm/vm_pageout.c	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/vm/vm_pageout.c	Sun Dec 15 03:15:06 2019	(r355765)
@@ -1307,6 +1307,14 @@ act_scan:
 			act_delta++;
 		}
 
+		/* Deferred free of swap space. */
+		if ((m->a.flags & PGA_SWAP_FREE) != 0 &&
+		    VM_OBJECT_TRYWLOCK(object)) {
+			if (m->object == object)
+				vm_pager_page_unswapped(m);
+			VM_OBJECT_WUNLOCK(object);
+		}
+
 		/*
 		 * Advance or decay the act_count based on recent usage.
 		 */
@@ -1541,6 +1549,10 @@ recheck:
 			addl_page_shortage++;
 			goto reinsert;
 		}
+
+		/* Deferred free of swap space. */
+		if ((m->a.flags & PGA_SWAP_FREE) != 0)
+			vm_pager_page_unswapped(m);
 
 		/*
 		 * Re-check for wirings now that we hold the object lock and

Modified: head/sys/vm/vm_pager.h
==============================================================================
--- head/sys/vm/vm_pager.h	Sun Dec 15 02:02:27 2019	(r355764)
+++ head/sys/vm/vm_pager.h	Sun Dec 15 03:15:06 2019	(r355765)
@@ -179,9 +179,6 @@ vm_pager_populate(vm_object_t object, vm_pindex_t pidx
  * 
  *	Destroy swap associated with the page.
  * 
- *	The object containing the page must be locked.
- *      This function may not block.
- *
  *	XXX: A much better name would be "vm_pager_page_dirtied()"
  *	XXX: It is not obvious if this could be profitably used by any
  *	XXX: pagers besides the swap_pager or if it should even be a
@@ -191,7 +188,6 @@ static __inline void
 vm_pager_page_unswapped(vm_page_t m)
 {
 
-	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if (pagertab[m->object->type]->pgo_pageunswapped)
 		(*pagertab[m->object->type]->pgo_pageunswapped)(m);
 }

