svn commit: r348644 - in stable/12/sys: riscv/include riscv/riscv vm

Mark Johnston markj at FreeBSD.org
Tue Jun 4 17:31:07 UTC 2019


Author: markj
Date: Tue Jun  4 17:31:05 2019
New Revision: 348644
URL: https://svnweb.freebsd.org/changeset/base/348644

Log:
  MFC r344106:
  Implement transparent 2MB superpage promotion for RISC-V.
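
  For reference, this change exports a vm.pmap.superpages_enabled tunable,
  event counters under the new vm.pmap.l2 sysctl node, and a second entry in
  pagesizes[] (2MB) when superpages are enabled.  The following userland
  sketch is illustrative only and not part of the commit; it assumes a
  stable/12 system with this revision applied and uses only stock FreeBSD
  interfaces (getpagesizes(3), sysctlbyname(3)) to observe the new state:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/mman.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	size_t pagesizes[3], len;
	u_long promotions;
	int i, n;

	/* MAXPAGESIZES is now 3, so up to three sizes may be reported. */
	n = getpagesizes(pagesizes, 3);
	if (n == -1)
		err(1, "getpagesizes");
	for (i = 0; i < n; i++)
		printf("page size %d: %zu bytes\n", i, pagesizes[i]);

	/* Counter added by this change under the vm.pmap.l2 node. */
	len = sizeof(promotions);
	if (sysctlbyname("vm.pmap.l2.promotions", &promotions, &len,
	    NULL, 0) == -1)
		err(1, "sysctlbyname");
	printf("2MB promotions: %lu\n", promotions);
	return (0);
}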

Modified:
  stable/12/sys/riscv/include/param.h
  stable/12/sys/riscv/include/pmap.h
  stable/12/sys/riscv/include/pte.h
  stable/12/sys/riscv/include/vmparam.h
  stable/12/sys/riscv/riscv/pmap.c
  stable/12/sys/vm/vm_fault.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/sys/riscv/include/param.h
==============================================================================
--- stable/12/sys/riscv/include/param.h	Tue Jun  4 17:30:22 2019	(r348643)
+++ stable/12/sys/riscv/include/param.h	Tue Jun  4 17:31:05 2019	(r348644)
@@ -82,7 +82,7 @@
 #define	PAGE_SIZE	(1 << PAGE_SHIFT)	/* Page size */
 #define	PAGE_MASK	(PAGE_SIZE - 1)
 
-#define	MAXPAGESIZES	1		/* maximum number of supported page sizes */
+#define	MAXPAGESIZES	3	/* maximum number of supported page sizes */
 
 #ifndef KSTACK_PAGES
 #define	KSTACK_PAGES	4	/* pages of kernel stack (with pcb) */

Modified: stable/12/sys/riscv/include/pmap.h
==============================================================================
--- stable/12/sys/riscv/include/pmap.h	Tue Jun  4 17:30:22 2019	(r348643)
+++ stable/12/sys/riscv/include/pmap.h	Tue Jun  4 17:31:05 2019	(r348644)
@@ -44,6 +44,8 @@
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
+#include <vm/_vm_radix.h>
+
 #ifdef _KERNEL
 
 #define	vtophys(va)	pmap_kextract((vm_offset_t)(va))
@@ -80,6 +82,7 @@ struct pmap {
 	pd_entry_t		*pm_l1;
 	TAILQ_HEAD(,pv_chunk)	pm_pvchunk;	/* list of mappings in pmap */
 	LIST_ENTRY(pmap)	pm_list;	/* List of all pmaps */
+	struct vm_radix		pm_root;
 };
 
 typedef struct pv_entry {
@@ -139,6 +142,7 @@ void	pmap_kenter_device(vm_offset_t, vm_size_t, vm_pad
 vm_paddr_t pmap_kextract(vm_offset_t va);
 void	pmap_kremove(vm_offset_t);
 void	pmap_kremove_device(vm_offset_t, vm_size_t);
+bool	pmap_ps_enabled(pmap_t);
 
 void	*pmap_mapdev(vm_offset_t, vm_size_t);
 void	*pmap_mapbios(vm_paddr_t, vm_size_t);

Modified: stable/12/sys/riscv/include/pte.h
==============================================================================
--- stable/12/sys/riscv/include/pte.h	Tue Jun  4 17:30:22 2019	(r348643)
+++ stable/12/sys/riscv/include/pte.h	Tue Jun  4 17:31:05 2019	(r348644)
@@ -62,7 +62,8 @@ typedef	uint64_t	pn_t;			/* page number */
 #define	L3_SIZE 	(1 << L3_SHIFT)
 #define	L3_OFFSET 	(L3_SIZE - 1)
 
-#define	Ln_ENTRIES	(1 << 9)
+#define	Ln_ENTRIES_SHIFT 9
+#define	Ln_ENTRIES	(1 << Ln_ENTRIES_SHIFT)
 #define	Ln_ADDR_MASK	(Ln_ENTRIES - 1)
 
 /* Bits 9:8 are reserved for software */
@@ -79,6 +80,8 @@ typedef	uint64_t	pn_t;			/* page number */
 #define	PTE_RWX		(PTE_R | PTE_W | PTE_X)
 #define	PTE_RX		(PTE_R | PTE_X)
 #define	PTE_KERN	(PTE_V | PTE_R | PTE_W | PTE_A | PTE_D)
+#define	PTE_PROMOTE	(PTE_V | PTE_RWX | PTE_D | PTE_A | PTE_G | PTE_U | \
+			 PTE_SW_MANAGED | PTE_SW_WIRED)
 
 #define	PTE_PPN0_S	10
 #define	PTE_PPN1_S	19

Modified: stable/12/sys/riscv/include/vmparam.h
==============================================================================
--- stable/12/sys/riscv/include/vmparam.h	Tue Jun  4 17:30:22 2019	(r348643)
+++ stable/12/sys/riscv/include/vmparam.h	Tue Jun  4 17:31:05 2019	(r348644)
@@ -99,10 +99,10 @@
 #define	VM_NFREEORDER		12
 
 /*
- * Disable superpage reservations.
+ * Enable superpage reservations: 1 level.
  */
 #ifndef	VM_NRESERVLEVEL
-#define	VM_NRESERVLEVEL		0
+#define	VM_NRESERVLEVEL		1
 #endif
 
 /*

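Setting VM_NRESERVLEVEL to 1 enables the superpage reservation system that
promotion depends on: vm_reserv hands out physically contiguous, 2MB-aligned
runs of 4KB pages so that a fully populated L3 page table can later be
promoted to a single L2 mapping.  As a hedged illustration (not part of this
diff), an application can make such a promotion more likely by requesting
superpage alignment from mmap(2) and then touching the whole region; the
flag and size below are ordinary FreeBSD mmap usage rather than new API
introduced here:

#include <sys/mman.h>

#include <err.h>
#include <string.h>

#define	SUPERPAGE_SIZE	(2UL * 1024 * 1024)	/* L2_SIZE on RISC-V Sv39 */

int
main(void)
{
	char *p;

	/* MAP_ALIGNED_SUPER asks for alignment suitable for a superpage. */
	p = mmap(NULL, SUPERPAGE_SIZE, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED_SUPER, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	/*
	 * Touching all 512 4KB pages with the same protection gives the
	 * pmap a chance to promote the range to a 2MB mapping.
	 */
	memset(p, 0xa5, SUPERPAGE_SIZE);

	return (0);
}
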
Modified: stable/12/sys/riscv/riscv/pmap.c
==============================================================================
--- stable/12/sys/riscv/riscv/pmap.c	Tue Jun  4 17:30:22 2019	(r348643)
+++ stable/12/sys/riscv/riscv/pmap.c	Tue Jun  4 17:31:05 2019	(r348644)
@@ -118,6 +118,7 @@ __FBSDID("$FreeBSD$");
  */
 
 #include <sys/param.h>
+#include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -145,6 +146,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
@@ -154,9 +156,8 @@ __FBSDID("$FreeBSD$");
 #include <machine/pcb.h>
 #include <machine/sbi.h>
 
-#define	NPDEPG		(PAGE_SIZE/(sizeof (pd_entry_t)))
-#define	NUPDE			(NPDEPG * NPDEPG)
-#define	NUSERPGTBLS		(NUPDE + NPDEPG)
+#define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
+#define	NUL2E		(Ln_ENTRIES * NUL1E)
 
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
@@ -175,11 +176,12 @@ __FBSDID("$FreeBSD$");
 #endif
 
 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
+#define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
 
 #define	NPV_LIST_LOCKS	MAXCPU
 
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
-			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
+			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])
 
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
@@ -230,13 +232,52 @@ CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_
 static struct rwlock_padalign pvh_global_lock;
 static struct mtx_padalign allpmaps_lock;
 
+static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
+    "VM/pmap parameters");
+
+static int superpages_enabled = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
+    CTLFLAG_RDTUN, &superpages_enabled, 0,
+    "Enable support for transparent superpages");
+
+static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
+    "2MB page mapping counters");
+
+static u_long pmap_l2_demotions;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
+    &pmap_l2_demotions, 0,
+    "2MB page demotions");
+
+static u_long pmap_l2_mappings;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
+    &pmap_l2_mappings, 0,
+    "2MB page mappings");
+
+static u_long pmap_l2_p_failures;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
+    &pmap_l2_p_failures, 0,
+    "2MB page promotion failures");
+
+static u_long pmap_l2_promotions;
+SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
+    &pmap_l2_promotions, 0,
+    "2MB page promotions");
+
 /*
  * Data for the pv entry allocation mechanism
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx pv_chunks_mutex;
 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
+static struct md_page *pv_table;
+static struct md_page pv_dummy;
 
+/*
+ * Internal flags for pmap_enter()'s helper functions.
+ */
+#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
+#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
+
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
@@ -244,6 +285,11 @@ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, 
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
+static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
+static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
+		    vm_offset_t va, struct rwlock **lockp);
+static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
+		    u_int flags, vm_page_t m, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
@@ -254,9 +300,9 @@ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap,
 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp);
 
-static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
+static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
-static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
+static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 
 #define	pmap_clear(pte)			pmap_store(pte, 0)
 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
@@ -636,7 +682,8 @@ pmap_page_init(vm_page_t m)
 void
 pmap_init(void)
 {
-	int i;
+	vm_size_t s;
+	int i, pv_npg;
 
 	/*
 	 * Initialize the pv chunk and pmap list mutexes.
@@ -649,6 +696,24 @@ pmap_init(void)
 	 */
 	for (i = 0; i < NPV_LIST_LOCKS; i++)
 		rw_init(&pv_list_locks[i], "pmap pv list");
+
+	/*
+	 * Calculate the size of the pv head table for superpages.
+	 */
+	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
+
+	/*
+	 * Allocate memory for the pv head table for superpages.
+	 */
+	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
+	s = round_page(s);
+	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
+	for (i = 0; i < pv_npg; i++)
+		TAILQ_INIT(&pv_table[i].pv_list);
+	TAILQ_INIT(&pv_dummy.pv_list);
+
+	if (superpages_enabled)
+		pagesizes[1] = L2_SIZE;
 }
 
 #ifdef SMP
@@ -999,6 +1064,13 @@ pmap_qremove(vm_offset_t sva, int count)
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
+bool
+pmap_ps_enabled(pmap_t pmap __unused)
+{
+
+	return (superpages_enabled);
+}
+
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
@@ -1018,6 +1090,34 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist
 		m->flags &= ~PG_ZERO;
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
+
+/*
+ * Inserts the specified page table page into the specified pmap's collection
+ * of idle page table pages.  Each of a pmap's page table pages is responsible
+ * for mapping a distinct range of virtual addresses.  The pmap's collection is
+ * ordered by this virtual address range.
+ */
+static __inline int
+pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	return (vm_radix_insert(&pmap->pm_root, ml3));
+}
+
+/*
+ * Removes the page table page mapping the specified virtual address from the
+ * specified pmap's collection of idle page table pages, and returns it.
+ * Otherwise, returns NULL if there is no page table page corresponding to the
+ * specified virtual address.
+ */
+static __inline vm_page_t
+pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
+}
 	
 /*
  * Decrements a page table page's wire count, which is used to record the
@@ -1026,12 +1126,12 @@ pmap_add_delayed_free_list(vm_page_t m, struct spglist
  * page table page was unmapped and FALSE otherwise.
  */
 static inline boolean_t
-pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
+pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	--m->wire_count;
 	if (m->wire_count == 0) {
-		_pmap_unwire_l3(pmap, va, m, free);
+		_pmap_unwire_ptp(pmap, va, m, free);
 		return (TRUE);
 	} else {
 		return (FALSE);
@@ -1039,36 +1139,30 @@ pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t 
 }
 
 static void
-_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
+_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 	vm_paddr_t phys;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	/*
-	 * unmap the page table page
-	 */
-	if (m->pindex >= NUPDE) {
-		/* PD page */
+	if (m->pindex >= NUL1E) {
 		pd_entry_t *l1;
 		l1 = pmap_l1(pmap, va);
 		pmap_clear(l1);
 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
 	} else {
-		/* PTE page */
 		pd_entry_t *l2;
 		l2 = pmap_l2(pmap, va);
 		pmap_clear(l2);
 	}
 	pmap_resident_count_dec(pmap, 1);
-	if (m->pindex < NUPDE) {
+	if (m->pindex < NUL1E) {
 		pd_entry_t *l1;
-		/* We just released a PT, unhold the matching PD */
 		vm_page_t pdpg;
 
 		l1 = pmap_l1(pmap, va);
 		phys = PTE_TO_PHYS(pmap_load(l1));
 		pdpg = PHYS_TO_VM_PAGE(phys);
-		pmap_unwire_l3(pmap, va, pdpg, free);
+		pmap_unwire_ptp(pmap, va, pdpg, free);
 	}
 	pmap_invalidate_page(pmap, va);
 
@@ -1082,24 +1176,20 @@ _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t
 }
 
 /*
- * After removing an l3 entry, this routine is used to
+ * After removing a page table entry, this routine is used to
  * conditionally free the page, and manage the hold/wire counts.
  */
 static int
-pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
-	vm_paddr_t phys;
 	vm_page_t mpte;
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return (0);
 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
-
-	phys = PTE_TO_PHYS(ptepde);
-
-	mpte = PHYS_TO_VM_PAGE(phys);
-	return (pmap_unwire_l3(pmap, va, mpte, free));
+	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
+	return (pmap_unwire_ptp(pmap, va, mpte, free));
 }
 
 void
@@ -1140,6 +1230,8 @@ pmap_pinit(pmap_t pmap)
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	mtx_unlock(&allpmaps_lock);
 
+	vm_radix_init(&pmap->pm_root);
+
 	return (1);
 }
 
@@ -1193,11 +1285,11 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
 	 * it isn't already there.
 	 */
 
-	if (ptepindex >= NUPDE) {
+	if (ptepindex >= NUL1E) {
 		pd_entry_t *l1;
 		vm_pindex_t l1index;
 
-		l1index = ptepindex - NUPDE;
+		l1index = ptepindex - NUL1E;
 		l1 = &pmap->pm_l1[l1index];
 
 		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
@@ -1213,7 +1305,7 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
 		l1 = &pmap->pm_l1[l1index];
 		if (pmap_load(l1) == 0) {
 			/* recurse for allocating page dir */
-			if (_pmap_alloc_l3(pmap, NUPDE + l1index,
+			if (_pmap_alloc_l3(pmap, NUL1E + l1index,
 			    lockp) == NULL) {
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
@@ -1241,6 +1333,29 @@ _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, str
 }
 
 static vm_page_t
+pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
+{
+	pd_entry_t *l1;
+	vm_page_t l2pg;
+	vm_pindex_t l2pindex;
+
+retry:
+	l1 = pmap_l1(pmap, va);
+	if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) {
+		/* Add a reference to the L2 page. */
+		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
+		l2pg->wire_count++;
+	} else {
+		/* Allocate a L2 page. */
+		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
+		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
+		if (l2pg == NULL && lockp != NULL)
+			goto retry;
+	}
+	return (l2pg);
+}
+
+static vm_page_t
 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t ptepindex;
@@ -1599,6 +1714,79 @@ retry:
 }
 
 /*
+ * Ensure that the number of spare PV entries in the specified pmap meets or
+ * exceeds the given count, "needed".
+ *
+ * The given PV list lock may be released.
+ */
+static void
+reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
+{
+	struct pch new_tail;
+	struct pv_chunk *pc;
+	vm_page_t m;
+	int avail, free;
+	bool reclaimed;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
+
+	/*
+	 * Newly allocated PV chunks must be stored in a private list until
+	 * the required number of PV chunks have been allocated.  Otherwise,
+	 * reclaim_pv_chunk() could recycle one of these chunks.  In
+	 * contrast, these chunks must be added to the pmap upon allocation.
+	 */
+	TAILQ_INIT(&new_tail);
+retry:
+	avail = 0;
+	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
+		bit_count((bitstr_t *)pc->pc_map, 0,
+		    sizeof(pc->pc_map) * NBBY, &free);
+		if (free == 0)
+			break;
+		avail += free;
+		if (avail >= needed)
+			break;
+	}
+	for (reclaimed = false; avail < needed; avail += _NPCPV) {
+		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+		    VM_ALLOC_WIRED);
+		if (m == NULL) {
+			m = reclaim_pv_chunk(pmap, lockp);
+			if (m == NULL)
+				goto retry;
+			reclaimed = true;
+		}
+		/* XXX PV STATS */
+#if 0
+		dump_add_page(m->phys_addr);
+#endif
+		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+		pc->pc_pmap = pmap;
+		pc->pc_map[0] = PC_FREE0;
+		pc->pc_map[1] = PC_FREE1;
+		pc->pc_map[2] = PC_FREE2;
+		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+
+		/*
+		 * The reclaim might have freed a chunk from the current pmap.
+		 * If that chunk contained available entries, we need to
+		 * re-count the number of available entries.
+		 */
+		if (reclaimed)
+			goto retry;
+	}
+	if (!TAILQ_EMPTY(&new_tail)) {
+		mtx_lock(&pv_chunks_mutex);
+		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+		mtx_unlock(&pv_chunks_mutex);
+	}
+}
+
+/*
  * First find and then remove the pv entry for the specified pmap and virtual
  * address from the specified pv list.  Returns the pv entry if found and NULL
  * otherwise.  This operation can be performed on pv lists for either 4KB or
@@ -1632,7 +1820,7 @@ pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_off
 
 	pv = pmap_pvh_remove(pvh, pmap, va);
 
-	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
+	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
 	free_pv_entry(pmap, pv);
 }
 
@@ -1660,6 +1848,222 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 
 }
 
 /*
+ * After demotion from a 2MB page mapping to 512 4KB page mappings,
+ * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
+ * entries for each of the 4KB page mappings.
+ */
+static void __unused
+pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	struct pv_chunk *pc;
+	pv_entry_t pv;
+	vm_page_t m;
+	vm_offset_t va_last;
+	int bit, field;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+	/*
+	 * Transfer the 2mpage's pv entry for this mapping to the first
+	 * page's pv list.  Once this transfer begins, the pv list lock
+	 * must not be released until the last pv entry is reinstantiated.
+	 */
+	pvh = pa_to_pvh(pa);
+	va &= ~L2_OFFSET;
+	pv = pmap_pvh_remove(pvh, pmap, va);
+	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
+	m = PHYS_TO_VM_PAGE(pa);
+	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+	m->md.pv_gen++;
+	/* Instantiate the remaining 511 pv entries. */
+	va_last = va + L2_SIZE - PAGE_SIZE;
+	for (;;) {
+		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
+		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
+		for (field = 0; field < _NPCM; field++) {
+			while (pc->pc_map[field] != 0) {
+				bit = ffsl(pc->pc_map[field]) - 1;
+				pc->pc_map[field] &= ~(1ul << bit);
+				pv = &pc->pc_pventry[field * 64 + bit];
+				va += PAGE_SIZE;
+				pv->pv_va = va;
+				m++;
+				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+			    ("pmap_pv_demote_l2: page %p is not managed", m));
+				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
+				m->md.pv_gen++;
+				if (va == va_last)
+					goto out;
+			}
+		}
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+out:
+	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+	/* XXX PV stats */
+}
+
+#if VM_NRESERVLEVEL > 0
+static void
+pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pv_entry_t pv;
+	vm_page_t m;
+	vm_offset_t va_last;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	KASSERT((va & L2_OFFSET) == 0,
+	    ("pmap_pv_promote_l2: misaligned va %#lx", va));
+
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+
+	m = PHYS_TO_VM_PAGE(pa);
+	pv = pmap_pvh_remove(&m->md, pmap, va);
+	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
+	pvh = pa_to_pvh(pa);
+	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+	pvh->pv_gen++;
+
+	va_last = va + L2_SIZE - PAGE_SIZE;
+	do {
+		m++;
+		va += PAGE_SIZE;
+		pmap_pvh_free(&m->md, pmap, va);
+	} while (va < va_last);
+}
+#endif /* VM_NRESERVLEVEL > 0 */
+
+/*
+ * Create the PV entry for a 2MB page mapping.  Always returns true unless the
+ * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
+ * false if the PV entry cannot be allocated without resorting to reclamation.
+ */
+static bool
+pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
+    struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pv_entry_t pv;
+	vm_paddr_t pa;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	/* Pass NULL instead of the lock pointer to disable reclamation. */
+	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
+	    NULL : lockp)) == NULL)
+		return (false);
+	pv->pv_va = va;
+	pa = PTE_TO_PHYS(l2e);
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
+	pvh = pa_to_pvh(pa);
+	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
+	pvh->pv_gen++;
+	return (true);
+}
+
+static void
+pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
+{
+	pt_entry_t newl2, oldl2;
+	vm_page_t ml3;
+	vm_paddr_t ml3pa;
+
+	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
+	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	ml3 = pmap_remove_pt_page(pmap, va);
+	if (ml3 == NULL)
+		panic("pmap_remove_kernel_l2: Missing pt page");
+
+	ml3pa = VM_PAGE_TO_PHYS(ml3);
+	newl2 = ml3pa | PTE_V;
+
+	/*
+	 * Initialize the page table page.
+	 */
+	pagezero((void *)PHYS_TO_DMAP(ml3pa));
+
+	/*
+	 * Demote the mapping.
+	 */
+	oldl2 = pmap_load_store(l2, newl2);
+	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
+	    __func__, l2, oldl2));
+}
+
+/*
+ * pmap_remove_l2: Do the things to unmap a level 2 superpage.
+ */
+static int
+pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
+    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
+{
+	struct md_page *pvh;
+	pt_entry_t oldl2;
+	vm_offset_t eva, va;
+	vm_page_t m, ml3;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
+	oldl2 = pmap_load_clear(l2);
+	KASSERT((oldl2 & PTE_RWX) != 0,
+	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
+
+	/*
+	 * The sfence.vma documentation states that it is sufficient to specify
+	 * a single address within a superpage mapping.  However, since we do
+	 * not perform any invalidation upon promotion, TLBs may still be
+	 * caching 4KB mappings within the superpage, so we must invalidate the
+	 * entire range.
+	 */
+	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
+	if ((oldl2 & PTE_SW_WIRED) != 0)
+		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
+	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
+	if ((oldl2 & PTE_SW_MANAGED) != 0) {
+		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
+		pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
+		pmap_pvh_free(pvh, pmap, sva);
+		eva = sva + L2_SIZE;
+		for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2));
+		    va < eva; va += PAGE_SIZE, m++) {
+			if ((oldl2 & PTE_D) != 0)
+				vm_page_dirty(m);
+			if ((oldl2 & PTE_A) != 0)
+				vm_page_aflag_set(m, PGA_REFERENCED);
+			if (TAILQ_EMPTY(&m->md.pv_list) &&
+			    TAILQ_EMPTY(&pvh->pv_list))
+				vm_page_aflag_clear(m, PGA_WRITEABLE);
+		}
+	}
+	if (pmap == kernel_pmap) {
+		pmap_remove_kernel_l2(pmap, l2, sva);
+	} else {
+		ml3 = pmap_remove_pt_page(pmap, sva);
+		if (ml3 != NULL) {
+			pmap_resident_count_dec(pmap, 1);
+			KASSERT(ml3->wire_count == Ln_ENTRIES,
+			    ("pmap_remove_l2: l3 page wire count error"));
+			ml3->wire_count = 1;
+			vm_page_unwire_noq(ml3);
+			pmap_add_delayed_free_list(ml3, free, FALSE);
+		}
+	}
+	return (pmap_unuse_pt(pmap, sva, l1e, free));
+}
+
+/*
  * pmap_remove_l3: do the things to unmap a page in a process
  */
 static int
@@ -1687,7 +2091,7 @@ pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_
 		pmap_pvh_free(&m->md, pmap, va);
 	}
 
-	return (pmap_unuse_l3(pmap, va, l2e, free));
+	return (pmap_unuse_pt(pmap, va, l2e, free));
 }
 
 /*
@@ -1699,11 +2103,11 @@ pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
+	struct spglist free;
 	struct rwlock *lock;
 	vm_offset_t va, va_next;
-	pd_entry_t *l1, *l2;
-	pt_entry_t l3_pte, *l3;
-	struct spglist free;
+	pd_entry_t *l1, *l2, l2e;
+	pt_entry_t *l3;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
@@ -1739,16 +2143,22 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (l2 == NULL)
 			continue;
-
-		l3_pte = pmap_load(l2);
-
-		/*
-		 * Weed out invalid mappings.
-		 */
-		if (l3_pte == 0)
+		if ((l2e = pmap_load(l2)) == 0)
 			continue;
-		if ((pmap_load(l2) & PTE_RX) != 0)
-			continue;
+		if ((l2e & PTE_RWX) != 0) {
+			if (sva + L2_SIZE == va_next && eva >= va_next) {
+				(void)pmap_remove_l2(pmap, l2, sva,
+				    pmap_load(l1), &free, &lock);
+				continue;
+			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
+			    &lock)) {
+				/*
+				 * The large page mapping was destroyed.
+				 */
+				continue;
+			}
+			l2e = pmap_load(l2);
+		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
@@ -1761,8 +2171,6 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 		va = va_next;
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
-			if (l3 == NULL)
-				panic("l3 == NULL");
 			if (pmap_load(l3) == 0) {
 				if (va != va_next) {
 					pmap_invalidate_range(pmap, va, sva);
@@ -1772,8 +2180,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 			}
 			if (va == va_next)
 				va = sva;
-			if (pmap_remove_l3(pmap, l3, sva, l3_pte, &free,
-			    &lock)) {
+			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
 				sva += L3_SIZE;
 				break;
 			}
@@ -1783,7 +2190,7 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
-	rw_runlock(&pvh_global_lock);	
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, false);
 }
@@ -1804,42 +2211,54 @@ pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t 
 void
 pmap_remove_all(vm_page_t m)
 {
-	pv_entry_t pv;
-	pmap_t pmap;
-	pt_entry_t *l3, tl3;
-	pd_entry_t *l2, tl2;
 	struct spglist free;
+	struct md_page *pvh;
+	pmap_t pmap;
+	pt_entry_t *l3, l3e;
+	pd_entry_t *l2, l2e;
+	pv_entry_t pv;
+	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
+	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
+	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
+
 	rw_wlock(&pvh_global_lock);
+	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
+		pmap = PV_PMAP(pv);
+		PMAP_LOCK(pmap);
+		va = pv->pv_va;
+		l2 = pmap_l2(pmap, va);
+		(void)pmap_demote_l2(pmap, l2, va);
+		PMAP_UNLOCK(pmap);
+	}
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pmap_resident_count_dec(pmap, 1);
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
-		tl2 = pmap_load(l2);
+		l2e = pmap_load(l2);
 
-		KASSERT((tl2 & PTE_RX) == 0,
-		    ("pmap_remove_all: found a table when expecting "
-		    "a block in %p's pv list", m));
+		KASSERT((l2e & PTE_RX) == 0,
+		    ("pmap_remove_all: found a superpage in %p's pv list", m));
 
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
-		tl3 = pmap_load_clear(l3);
+		l3e = pmap_load_clear(l3);
 		pmap_invalidate_page(pmap, pv->pv_va);
-		if (tl3 & PTE_SW_WIRED)
+		if (l3e & PTE_SW_WIRED)
 			pmap->pm_stats.wired_count--;
-		if ((tl3 & PTE_A) != 0)
+		if ((l3e & PTE_A) != 0)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
-		if ((tl3 & PTE_D) != 0)
+		if ((l3e & PTE_D) != 0)
 			vm_page_dirty(m);
-		pmap_unuse_l3(pmap, pv->pv_va, pmap_load(l2), &free);
+		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
@@ -1857,10 +2276,12 @@ pmap_remove_all(vm_page_t m)
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
-	pd_entry_t *l1, *l2;
+	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t *l3, l3e, mask;
 	vm_page_t m;
-	vm_offset_t va_next;
+	vm_paddr_t pa;
+	vm_offset_t va, va_next;
+	bool anychanged, pv_lists_locked;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
@@ -1871,12 +2292,14 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return;
 
+	anychanged = false;
+	pv_lists_locked = false;
 	mask = 0;
 	if ((prot & VM_PROT_WRITE) == 0)
 		mask |= PTE_W | PTE_D;
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		mask |= PTE_X;
-
+resume:
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		l1 = pmap_l1(pmap, sva);
@@ -1892,10 +2315,41 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
-		if (l2 == NULL || pmap_load(l2) == 0)
+		if (l2 == NULL || (l2e = pmap_load(l2)) == 0)
 			continue;
-		if ((pmap_load(l2) & PTE_RX) != 0)
-			continue;
+		if ((l2e & PTE_RWX) != 0) {
+			if (sva + L2_SIZE == va_next && eva >= va_next) {
+retryl2:
+				if ((l2e & (PTE_SW_MANAGED | PTE_D)) ==
+				    (PTE_SW_MANAGED | PTE_D)) {
+					pa = PTE_TO_PHYS(l2e);
+					for (va = sva, m = PHYS_TO_VM_PAGE(pa);
+					    va < va_next; m++, va += PAGE_SIZE)
+						vm_page_dirty(m);
+				}
+				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
+					goto retryl2;
+				anychanged = true;
+			} else {
+				if (!pv_lists_locked) {
+					pv_lists_locked = true;
+					if (!rw_try_rlock(&pvh_global_lock)) {
+						if (anychanged)
+							pmap_invalidate_all(
+							    pmap);
+						PMAP_UNLOCK(pmap);
+						rw_rlock(&pvh_global_lock);
+						goto resume;
+					}
+				}
+				if (!pmap_demote_l2(pmap, l2, sva)) {
+					/*
+					 * The large page mapping was destroyed.
+					 */
+					continue;
+				}
+			}
+		}
 
 		if (va_next > eva)
 			va_next = eva;
@@ -1903,7 +2357,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			l3e = pmap_load(l3);
-retry:
+retryl3:
 			if ((l3e & PTE_V) == 0)
 				continue;
 			if ((prot & VM_PROT_WRITE) == 0 &&
@@ -1913,60 +2367,236 @@ retry:
 				vm_page_dirty(m);
 			}
 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
-				goto retry;
-			/* XXX: Use pmap_invalidate_range */
-			pmap_invalidate_page(pmap, sva);
+				goto retryl3;
+			anychanged = true;
 		}
 	}
+	if (anychanged)
+		pmap_invalidate_all(pmap);
+	if (pv_lists_locked)
+		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 int
 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
 {
-	pt_entry_t orig_l3;
-	pt_entry_t new_l3;
-	pt_entry_t *l3;
+	pd_entry_t *l2, l2e;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

