svn commit: r204855 - in user/kmacy/releng_8_page_lock/sys: amd64/amd64 amd64/conf amd64/include cddl/contrib/opensolaris/uts/common/fs/zfs dev/md kern nfsclient ufs/ffs vm

Kip Macy <kmacy@FreeBSD.org>
Mon Mar 8 05:03:25 UTC 2010


Author: kmacy
Date: Mon Mar  8 05:03:24 2010
New Revision: 204855
URL: http://svn.freebsd.org/changeset/base/204855

Log:
  integrate page lock patch from HEAD

Modified:
  user/kmacy/releng_8_page_lock/sys/amd64/amd64/pmap.c
  user/kmacy/releng_8_page_lock/sys/amd64/conf/GENERIC
  user/kmacy/releng_8_page_lock/sys/amd64/include/pmap.h
  user/kmacy/releng_8_page_lock/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  user/kmacy/releng_8_page_lock/sys/dev/md/md.c
  user/kmacy/releng_8_page_lock/sys/kern/kern_exec.c
  user/kmacy/releng_8_page_lock/sys/kern/kern_subr.c
  user/kmacy/releng_8_page_lock/sys/kern/subr_witness.c
  user/kmacy/releng_8_page_lock/sys/kern/sys_pipe.c
  user/kmacy/releng_8_page_lock/sys/kern/sys_process.c
  user/kmacy/releng_8_page_lock/sys/kern/uipc_cow.c
  user/kmacy/releng_8_page_lock/sys/kern/uipc_shm.c
  user/kmacy/releng_8_page_lock/sys/kern/uipc_syscalls.c
  user/kmacy/releng_8_page_lock/sys/kern/vfs_bio.c
  user/kmacy/releng_8_page_lock/sys/nfsclient/nfs_bio.c
  user/kmacy/releng_8_page_lock/sys/ufs/ffs/ffs_vnops.c
  user/kmacy/releng_8_page_lock/sys/vm/device_pager.c
  user/kmacy/releng_8_page_lock/sys/vm/pmap.h
  user/kmacy/releng_8_page_lock/sys/vm/sg_pager.c
  user/kmacy/releng_8_page_lock/sys/vm/swap_pager.c
  user/kmacy/releng_8_page_lock/sys/vm/uma_core.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_contig.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_fault.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_glue.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_kern.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_map.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_mmap.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_object.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_page.c
  user/kmacy/releng_8_page_lock/sys/vm/vm_page.h
  user/kmacy/releng_8_page_lock/sys/vm/vm_pageout.c
  user/kmacy/releng_8_page_lock/sys/vm/vnode_pager.c

Modified: user/kmacy/releng_8_page_lock/sys/amd64/amd64/pmap.c
==============================================================================
--- user/kmacy/releng_8_page_lock/sys/amd64/amd64/pmap.c	Mon Mar  8 04:56:39 2010	(r204854)
+++ user/kmacy/releng_8_page_lock/sys/amd64/amd64/pmap.c	Mon Mar  8 05:03:24 2010	(r204855)
@@ -165,9 +165,29 @@ __FBSDID("$FreeBSD$");
 #define PV_STAT(x)	do { } while (0)
 #endif
 
+#define	CACHE_LINE_FETCH_SIZE	128
+#define	PA_LOCK_PAD		CACHE_LINE_FETCH_SIZE
+
+struct vp_lock {
+	struct mtx	vp_lock;
+	unsigned char	pad[(PA_LOCK_PAD - sizeof(struct mtx))];
+};
+
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
+#define	PA_LOCKPTR(pa)	&pa_lock[pa_index((pa)) % PA_LOCK_COUNT].vp_lock
+#define	PA_LOCK(pa)	mtx_lock(PA_LOCKPTR(pa))
+#define	PA_TRYLOCK(pa)	mtx_trylock(PA_LOCKPTR(pa))
+#define	PA_UNLOCK(pa)	mtx_unlock(PA_LOCKPTR(pa))
+#define	PA_LOCK_ASSERT(pa, a)	mtx_assert(PA_LOCKPTR(pa), (a))
+
+#define	PA_LOCK_COUNT	64
+
+struct mtx pv_lock __aligned(128);
+struct vp_lock pa_lock[PA_LOCK_COUNT] __aligned(128);
+
+
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
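
For illustration, a minimal sketch of how the macros introduced above serialize
access to a physical page by hashing its address into pa_lock[]; the helper
name is hypothetical and only shows the intended locking pattern:

static void
example_pa_lock_usage(vm_paddr_t pa)
{

	PA_LOCK(pa);
	/* ... inspect or modify pv state for the page at pa ... */
	PA_UNLOCK(pa);
}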
@@ -184,6 +204,15 @@ static int pg_ps_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
     "Are large page mappings enabled?");
 
+static uint64_t pmap_tryrelock_calls;
+SYSCTL_QUAD(_vm_pmap, OID_AUTO, tryrelock_calls, CTLFLAG_RD,
+    &pmap_tryrelock_calls, 0, "Number of tryrelock calls");
+
+static int pmap_tryrelock_restart;
+SYSCTL_INT(_vm_pmap, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
+    &pmap_tryrelock_restart, 0, "Number of tryrelock restarts");
+
+
 static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
 u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
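
Assuming the usual sysctl name mapping, these counters land under vm.pmap and
can be inspected on a running patched kernel with sysctl(8), for example:

	sysctl vm.pmap.tryrelock_calls vm.pmap.tryrelock_restart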
@@ -212,8 +241,9 @@ struct msgbuf *msgbufp = 0;
 static caddr_t crashdumpmap;
 
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
-static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
+static pv_entry_t get_pv_entry(pmap_t locked_pmap);
+static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+	struct pv_list_head *pv_list);
 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
@@ -222,7 +252,8 @@ static pv_entry_t pmap_pvh_remove(struct
 static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
 
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
-static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
+static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+	struct pv_list_head *pv_list);
 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
     vm_offset_t va);
 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
@@ -241,7 +272,7 @@ static boolean_t pmap_protect_pde(pmap_t
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-		vm_page_t *free);
+    vm_page_t *free, struct pv_list_head *pv_list);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
 		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
@@ -249,14 +280,14 @@ static void pmap_remove_page(pmap_t pmap
     vm_page_t *free);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
 		vm_offset_t va);
-static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m);
 
-static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
-static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
+static vm_page_t pmap_allocpde(pmap_t pmap, vm_paddr_t pa, vm_offset_t va, int flags);
+static vm_page_t pmap_allocpte(pmap_t pmap, vm_paddr_t pa, vm_offset_t va, int flags);
 
-static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
+static vm_page_t _pmap_allocpte(pmap_t pmap, vm_paddr_t pa, vm_pindex_t ptepindex,
+    int flags);
 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
                 vm_page_t* free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
@@ -265,6 +296,76 @@ static vm_offset_t pmap_kmem_choose(vm_o
 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 
+
+#define LS_MAX		4
+struct lock_stack {
+	struct mtx *ls_array[LS_MAX];
+	int ls_top;
+};
+
+static void
+ls_init(struct lock_stack *ls)
+{
+
+	ls->ls_top = 0;
+}
+
+static void
+ls_push(struct lock_stack *ls, struct mtx *lock)
+{
+
+	KASSERT(ls->ls_top < LS_MAX, ("lock stack overflow"));
+	
+	ls->ls_array[ls->ls_top] = lock;
+	ls->ls_top++;
+	mtx_lock(lock);
+}
+
+
+static int
+ls_trypush(struct lock_stack *ls, struct mtx *lock)
+{
+
+	KASSERT(ls->ls_top < LS_MAX, ("lock stack overflow"));
+
+	if (mtx_trylock(lock) == 0)
+		return (0);
+	
+	ls->ls_array[ls->ls_top] = lock;
+	ls->ls_top++;
+	return (1);
+}
+
+#ifdef notyet
+static void
+ls_pop(struct lock_stack *ls)
+{
+	struct mtx *lock;
+
+	KASSERT(ls->ls_top > 0, ("lock stack underflow"));
+
+	ls->ls_top--;
+	lock = ls->ls_array[ls->ls_top];
+	mtx_unlock(lock);
+}
+#endif
+
+static void
+ls_popa(struct lock_stack *ls)
+{
+	struct mtx *lock;
+
+	KASSERT(ls->ls_top > 0, ("lock stack underflow"));
+
+	while (ls->ls_top > 0) {
+		ls->ls_top--;
+		lock = ls->ls_array[ls->ls_top];
+		mtx_unlock(lock);
+	}
+}
+#ifdef INVARIANTS
+extern void kdb_backtrace(void);
+#endif
 /*
  * Move the kernel virtual free pointer to the next
  * 2MB.  This is used to help improve performance
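
To illustrate the intended use of the lock stack above, a hedged sketch with a
hypothetical caller: locks are pushed in acquisition order, ls_trypush() is
used where blocking is not allowed, and ls_popa() releases everything that was
acquired:

static void
example_lock_stack_usage(struct mtx *a, struct mtx *b)
{
	struct lock_stack ls;

	ls_init(&ls);
	ls_push(&ls, a);		/* blocking acquire */
	if (ls_trypush(&ls, b) == 0) {
		/* Could not take b without blocking; back out completely. */
		ls_popa(&ls);
		return;
	}
	/* ... both locks held here ... */
	ls_popa(&ls);			/* release everything acquired */
}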
@@ -414,6 +515,37 @@ vtopde(vm_offset_t va)
 	return (PDmap + ((va >> PDRSHIFT) & mask));
 }
 
+/*
+ * Try to acquire the lock for physical address pa while the pmap lock is
+ * held.  If the trylock fails, drop the pmap lock, block on the pa lock,
+ * and then reacquire the pmap lock.  In either case the pa now held is
+ * cached in *locked.  A non-zero return tells the caller to restart its
+ * loop, since the virtual-to-physical mapping may have changed while the
+ * pmap lock was dropped.
+ */
+static int
+pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
+{
+	vm_paddr_t lockpa;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	atomic_add_long((volatile long *)&pmap_tryrelock_calls, 1);
+	lockpa = *locked;
+	*locked = pa;
+	if (lockpa) {
+		PA_LOCK_ASSERT(lockpa, MA_OWNED);
+		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
+			return (0);
+		PA_UNLOCK(lockpa);
+	}
+	if (PA_TRYLOCK(pa))
+		return 0;
+	PMAP_UNLOCK(pmap);
+	PA_LOCK(pa);
+	PMAP_LOCK(pmap);
+	atomic_add_int((volatile int *)&pmap_tryrelock_restart, 1);
+
+	return (EAGAIN);
+}
+
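
A hedged sketch of the retry idiom pa_tryrelock() supports; it mirrors the
retry loop added to pmap_extract_and_hold() later in this diff, but the
function itself is hypothetical and assumes the existing pmap_pte() helper:

static vm_page_t
example_lookup(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *ptep;
	vm_paddr_t pa;
	vm_page_t m;

	pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	ptep = pmap_pte(pmap, va);
	if (ptep != NULL && (*ptep & PG_V) != 0) {
		/* Restart if the pmap lock was dropped to take the pa lock. */
		if (pa_tryrelock(pmap, *ptep & PG_FRAME, &pa))
			goto retry;
		m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME);
	}
	if (pa)
		PA_UNLOCK(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}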
 static u_int64_t
 allocpages(vm_paddr_t *firstaddr, int n)
 {
@@ -523,6 +655,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
+	int i;	
 
 	/*
 	 * Create an initial set of page tables to run the kernel in.
@@ -581,6 +714,13 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 
 	/* Initialize the PAT MSR. */
 	pmap_init_pat();
+
+	/* Setup page locks. */
+	for (i = 0; i < PA_LOCK_COUNT; i++)
+		mtx_init(&pa_lock[i].vp_lock, "page lock", NULL,
+		    MTX_DEF | MTX_RECURSE | MTX_DUPOK);
+	mtx_init(&pv_lock, "pv list lock", NULL, MTX_DEF);
+
 }
 
 /*
@@ -618,6 +758,14 @@ pmap_page_init(vm_page_t m)
 	m->md.pat_mode = PAT_WRITE_BACK;
 }
 
+struct mtx *
+pmap_page_lockptr(vm_page_t m)
+{
+
+	KASSERT(m != NULL, ("pmap_page_lockptr: NULL page"));
+	return (PA_LOCKPTR(VM_PAGE_TO_PHYS(m)));
+}
+
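
The exported lock pointer suggests that the machine-independent
vm_page_lock()/vm_page_unlock() calls used elsewhere in this patch ultimately
resolve to this per-page mutex.  A hedged sketch of one plausible layering;
the macro bodies are an assumption, not the branch's actual vm_page.h
definitions:

/* Assumed layering; the real definitions live in the VM headers. */
#define	vm_page_lockptr(m)		pmap_page_lockptr(m)
#define	vm_page_lock(m)			mtx_lock(vm_page_lockptr(m))
#define	vm_page_trylock(m)		mtx_trylock(vm_page_lockptr(m))
#define	vm_page_unlock(m)		mtx_unlock(vm_page_lockptr(m))
#define	vm_page_lock_assert(m, a)	mtx_assert(vm_page_lockptr(m), (a))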
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
@@ -1017,15 +1165,20 @@ pmap_extract_and_hold(pmap_t pmap, vm_of
 {
 	pd_entry_t pde, *pdep;
 	pt_entry_t pte;
+	vm_paddr_t pa;
 	vm_page_t m;
 
+	pa = 0;
 	m = NULL;
-	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
+retry:
 	pdep = pmap_pde(pmap, va);
 	if (pdep != NULL && (pde = *pdep)) {
 		if (pde & PG_PS) {
 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
+				if (pa_tryrelock(pmap, pde & PG_PS_FRAME, &pa))
+					goto retry;
+
 				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 				    (va & PDRMASK));
 				vm_page_hold(m);
@@ -1034,12 +1187,15 @@ pmap_extract_and_hold(pmap_t pmap, vm_of
 			pte = *pmap_pde_to_pte(pdep, va);
 			if ((pte & PG_V) &&
 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
+				if (pa_tryrelock(pmap, pte & PG_FRAME, &pa))
+					goto retry;
 				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 				vm_page_hold(m);
 			}
 		}
 	}
-	vm_page_unlock_queues();
+	if (pa)
+		PA_UNLOCK(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
@@ -1437,7 +1593,7 @@ pmap_pinit(pmap_t pmap)
  * race conditions.
  */
 static vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
+_pmap_allocpte(pmap_t pmap, vm_paddr_t pa, vm_pindex_t ptepindex, int flags)
 {
 	vm_page_t m, pdppg, pdpg;
 
@@ -1452,9 +1608,9 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t 
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (flags & M_WAITOK) {
 			PMAP_UNLOCK(pmap);
-			vm_page_unlock_queues();
+			PA_UNLOCK(pa);
 			VM_WAIT;
-			vm_page_lock_queues();
+			PA_LOCK(pa);
 			PMAP_LOCK(pmap);
 		}
 
@@ -1494,7 +1650,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t 
 		pml4 = &pmap->pm_pml4[pml4index];
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pdp, recurse */
-			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
+			if (_pmap_allocpte(pmap, pa, NUPDE + NUPDPE + pml4index,
 			    flags) == NULL) {
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
@@ -1527,7 +1683,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t 
 		pml4 = &pmap->pm_pml4[pml4index];
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pd, recurse */
-			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
+			if (_pmap_allocpte(pmap, pa, NUPDE + pdpindex,
 			    flags) == NULL) {
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
@@ -1541,7 +1697,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t 
 			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 			if ((*pdp & PG_V) == 0) {
 				/* Have to allocate a new pd, recurse */
-				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
+				if (_pmap_allocpte(pmap, pa, NUPDE + pdpindex,
 				    flags) == NULL) {
 					--m->wire_count;
 					atomic_subtract_int(&cnt.v_wire_count,
@@ -1568,7 +1724,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t 
 }
 
 static vm_page_t
-pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
+pmap_allocpde(pmap_t pmap, vm_paddr_t pa, vm_offset_t va, int flags)
 {
 	vm_pindex_t pdpindex, ptepindex;
 	pdp_entry_t *pdpe;
@@ -1587,7 +1743,7 @@ retry:
 		/* Allocate a pd page. */
 		ptepindex = pmap_pde_pindex(va);
 		pdpindex = ptepindex >> NPDPEPGSHIFT;
-		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
+		pdpg = _pmap_allocpte(pmap, pa, NUPDE + pdpindex, flags);
 		if (pdpg == NULL && (flags & M_WAITOK))
 			goto retry;
 	}
@@ -1595,11 +1751,12 @@ retry:
 }
 
 static vm_page_t
-pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
+pmap_allocpte(pmap_t pmap, vm_paddr_t pa, vm_offset_t va, int flags)
 {
 	vm_pindex_t ptepindex;
 	pd_entry_t *pd;
 	vm_page_t m;
+	struct pv_list_head pv_list;
 
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
@@ -1620,7 +1777,8 @@ retry:
 	 * normal 4K page.
 	 */
 	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
-		if (!pmap_demote_pde(pmap, pd, va)) {
+		TAILQ_INIT(&pv_list);
+		if (!pmap_demote_pde(pmap, pd, va, &pv_list)) {
 			/*
 			 * Invalidation of the 2MB page mapping may have caused
 			 * the deallocation of the underlying PD page.
@@ -1641,7 +1799,7 @@ retry:
 		 * Here if the pte page isn't mapped, or if it has been
 		 * deallocated.
 		 */
-		m = _pmap_allocpte(pmap, ptepindex, flags);
+		m = _pmap_allocpte(pmap, pa, ptepindex, flags);
 		if (m == NULL && (flags & M_WAITOK))
 			goto retry;
 	}
@@ -1847,6 +2005,7 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_coll
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  */
+#ifdef nomore
 static void
 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
 {
@@ -1862,8 +2021,8 @@ pmap_collect(pmap_t locked_pmap, struct 
 		if (m->hold_count || m->busy)
 			continue;
 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
 			pmap = PV_PMAP(pv);
+			va = pv->pv_va;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
@@ -1897,7 +2056,7 @@ pmap_collect(pmap_t locked_pmap, struct 
 		}
 	}
 }
-
+#endif
 
 /*
  * free the pv_entry back to the free list
@@ -1909,8 +2068,8 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	mtx_lock(&pv_lock);
 	PV_STAT(pv_entry_frees++);
 	PV_STAT(pv_entry_spare++);
 	pv_entry_count--;
@@ -1924,6 +2083,7 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv
 	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
 	    pc->pc_map[2] != PC_FREE2) {
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		mtx_unlock(&pv_lock);
 		return;
 	}
 	PV_STAT(pv_entry_spare -= _NPCPV);
@@ -1932,7 +2092,10 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv
 	/* entire chunk is free, return it */
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(m->phys_addr);
-	vm_page_unwire(m, 0);
+	mtx_unlock(&pv_lock);
+	KASSERT(m->wire_count == 1, ("wire_count == %d", m->wire_count));
+	m->wire_count--;
+	atomic_subtract_int(&cnt.v_wire_count, 1);
 	vm_page_free(m);
 }
 
@@ -1941,7 +2104,7 @@ free_pv_entry(pmap_t pmap, pv_entry_t pv
  * when needed.
  */
 static pv_entry_t
-get_pv_entry(pmap_t pmap, int try)
+get_pv_entry(pmap_t pmap)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
@@ -1953,7 +2116,7 @@ get_pv_entry(pmap_t pmap, int try)
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	mtx_lock(&pv_lock);
 	PV_STAT(pv_entry_allocs++);
 	pv_entry_count++;
 	if (pv_entry_count > pv_entry_high_water)
@@ -1962,7 +2125,6 @@ get_pv_entry(pmap_t pmap, int try)
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max sysctl.\n");
 	pq = NULL;
-retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		for (field = 0; field < _NPCM; field++) {
@@ -1981,6 +2143,7 @@ retry:
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 			}
 			PV_STAT(pv_entry_spare--);
+			mtx_unlock(&pv_lock);
 			return (pv);
 		}
 	}
@@ -1989,26 +2152,10 @@ retry:
 	    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
-		if (try) {
-			pv_entry_count--;
-			PV_STAT(pc_chunk_tryfail++);
-			return (NULL);
-		}
-		/*
-		 * Reclaim pv entries: At first, destroy mappings to inactive
-		 * pages.  After that, if a pv chunk entry is still needed,
-		 * destroy mappings to active pages.
-		 */
-		if (pq == NULL) {
-			PV_STAT(pmap_collect_inactive++);
-			pq = &vm_page_queues[PQ_INACTIVE];
-		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
-			PV_STAT(pmap_collect_active++);
-			pq = &vm_page_queues[PQ_ACTIVE];
-		} else
-			panic("get_pv_entry: increase vm.pmap.shpgperproc");
-		pmap_collect(pmap, pq);
-		goto retry;
+		pv_entry_count--;
+		PV_STAT(pc_chunk_tryfail++);
+		mtx_unlock(&pv_lock);
+		return (NULL);
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
@@ -2022,9 +2169,64 @@ retry:
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
+
+	mtx_unlock(&pv_lock);
 	return (pv);
 }
 
+static void
+pmap_pv_list_free(pmap_t pmap, struct pv_list_head *pv_list)
+{
+	pv_entry_t pv;
+
+	while (!TAILQ_EMPTY(pv_list)) {
+		pv = TAILQ_FIRST(pv_list);
+		TAILQ_REMOVE(pv_list, pv, pv_list);
+		free_pv_entry(pmap, pv);
+	}
+}
+
+static boolean_t
+pmap_pv_list_alloc(pmap_t pmap, int count, struct pv_list_head *pv_list)
+{
+	pv_entry_t pv;
+	int i;
+	boolean_t slept;
+
+	slept = FALSE;
+	for (i = 0; i < count; i++) {
+		while ((pv = get_pv_entry(pmap)) == NULL) {
+			PMAP_UNLOCK(pmap);
+			slept = TRUE;
+			VM_WAIT;
+			PMAP_LOCK(pmap);
+		}
+		TAILQ_INSERT_HEAD(pv_list, pv, pv_list);
+	}
+
+	return (slept);
+}
+
+static boolean_t
+pmap_pv_list_try_alloc(pmap_t pmap, int count, struct pv_list_head *pv_list)
+{
+	pv_entry_t pv;
+	int i;
+	boolean_t success;
+
+	success = TRUE;
+	for (i = 0; i < count; i++) {
+		if ((pv = get_pv_entry(pmap)) == NULL) {
+			success = FALSE;
+			pmap_pv_list_free(pmap, pv_list);
+			goto done;
+		}
+		TAILQ_INSERT_HEAD(pv_list, pv, pv_list);
+	}
+done:
+	return (success);
+}
+
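
The three helpers above pre-fill a pv list before a 2MB demotion, which needs
NPTEPG - 1 entries.  A hedged sketch of a hypothetical caller (compare the
pv_list plumbing in pmap_demote_pde() and pmap_remove() further down):

static void
example_prealloc_and_release(pmap_t pmap)
{
	struct pv_list_head pv_list;

	TAILQ_INIT(&pv_list);
	PMAP_LOCK(pmap);
	/* May drop and retake the pmap lock while sleeping in VM_WAIT. */
	(void)pmap_pv_list_alloc(pmap, NPTEPG - 1, &pv_list);
	/* ... consume entries, e.g. by passing pv_list to pmap_demote_pde() ... */
	pmap_pv_list_free(pmap, &pv_list);	/* return any leftovers */
	PMAP_UNLOCK(pmap);
}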
 /*
  * First find and then remove the pv entry for the specified pmap and virtual
  * address from the specified pv list.  Returns the pv entry if found and NULL
@@ -2036,7 +2238,8 @@ pmap_pvh_remove(struct md_page *pvh, pma
 {
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
@@ -2052,27 +2255,37 @@ pmap_pvh_remove(struct md_page *pvh, pma
  * entries for each of the 4KB page mappings.
  */
 static void
-pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
+pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+	struct pv_list_head *pv_list)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	PA_LOCK_ASSERT(pa, MA_OWNED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 
-	/*
-	 * Transfer the 2mpage's pv entry for this mapping to the first
-	 * page's pv list.
-	 */
+	 /* Transfer the 2mpage's pv entry for this mapping to the first
+	  *  page's pv list.
+	  */
 	pvh = pa_to_pvh(pa);
 	va = trunc_2mpage(va);
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
+#ifdef INVARIANTS
+		if (va == 0) {
+			printf("inserting va==0\n");
+			kdb_backtrace();
+		}
+#endif
+	vm_page_lock(m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+	vm_page_unlock(m);
+	
 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
 	va_last = va + NBPDR - PAGE_SIZE;
 	do {
@@ -2080,8 +2293,20 @@ pmap_pv_demote_pde(pmap_t pmap, vm_offse
 		KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
 		    ("pmap_pv_demote_pde: page %p is not managed", m));
 		va += PAGE_SIZE;
-		pmap_insert_entry(pmap, va, m);
+		pv = TAILQ_FIRST(pv_list);
+		TAILQ_REMOVE(pv_list, pv, pv_list);
+#ifdef INVARIANTS
+		if (va == 0) {
+			printf("inserting va==0\n");
+			kdb_backtrace();
+		}
+#endif		
+		pv->pv_va = va;
+		vm_page_lock(m);
+		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+		vm_page_unlock(m);
 	} while (va < va_last);
+
 }
 
 /*
@@ -2097,7 +2322,7 @@ pmap_pv_promote_pde(pmap_t pmap, vm_offs
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PA_LOCK_ASSERT(pa, MA_OWNED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 
@@ -2143,7 +2368,8 @@ pmap_remove_entry(pmap_t pmap, vm_page_t
 {
 	struct md_page *pvh;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	vm_page_lock_assert(m, MA_OWNED);
+
 	pmap_pvh_free(&m->md, pmap, va);
 	if (TAILQ_EMPTY(&m->md.pv_list)) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2153,22 +2379,6 @@ pmap_remove_entry(pmap_t pmap, vm_page_t
 }
 
 /*
- * Create a pv entry for page at pa for
- * (pmap, va).
- */
-static void
-pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
-{
-	pv_entry_t pv;
-
-	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	pv = get_pv_entry(pmap, FALSE);
-	pv->pv_va = va;
-	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-}
-
-/*
  * Conditionally create a pv entry.
  */
 static boolean_t
@@ -2177,9 +2387,15 @@ pmap_try_insert_pv_entry(pmap_t pmap, vm
 	pv_entry_t pv;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	vm_page_lock_assert(m, MA_OWNED);
 	if (pv_entry_count < pv_entry_high_water && 
-	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
+	    (pv = get_pv_entry(pmap)) != NULL) {
+#ifdef INVARIANTS
+		if (va == 0) {
+			printf("inserting va==0\n");
+			kdb_backtrace();
+		}
+#endif		
 		pv->pv_va = va;
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 		return (TRUE);
@@ -2196,9 +2412,16 @@ pmap_pv_insert_pde(pmap_t pmap, vm_offse
 	struct md_page *pvh;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	PA_LOCK_ASSERT(pa, MA_OWNED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pv_entry_count < pv_entry_high_water && 
-	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
+	    (pv = get_pv_entry(pmap)) != NULL) {
+#ifdef INVARIANTS
+		if (va == 0) {
+			printf("inserting va==0\n");
+			kdb_backtrace();
+		}
+#endif		
 		pv->pv_va = va;
 		pvh = pa_to_pvh(pa);
 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
@@ -2226,7 +2449,8 @@ pmap_fill_ptp(pt_entry_t *firstpte, pt_e
  * mapping is invalidated.
  */
 static boolean_t
-pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
+pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+	struct pv_list_head *pv_list)
 {
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
@@ -2262,7 +2486,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t 
 		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 			free = NULL;
-			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
+			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free, pv_list);
 			pmap_invalidate_page(pmap, trunc_2mpage(va));
 			pmap_free_zero_pages(free);
 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
@@ -2272,6 +2496,10 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t 
 		if (va < VM_MAXUSER_ADDRESS)
 			pmap->pm_stats.resident_count++;
 	}
+	if (TAILQ_EMPTY(pv_list) && ((oldpde & PG_MANAGED) != 0)) {
+		if (pmap_pv_list_try_alloc(pmap, NPTEPG-1, pv_list) == FALSE)
+			return (FALSE);
+	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
@@ -2326,7 +2554,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t 
 	 * the 2mpage to referencing the page table page.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
-		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
+		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, pv_list);
 
 	pmap_pde_demotions++;
 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
@@ -2339,7 +2567,7 @@ pmap_demote_pde(pmap_t pmap, pd_entry_t 
  */
 static int
 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-    vm_page_t *free)
+    vm_page_t *free, struct pv_list_head *pv_list)
 {
 	struct md_page *pvh;
 	pd_entry_t oldpde;
@@ -2366,6 +2594,10 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t 
 		eva = sva + NBPDR;
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
+			/*
+			 * XXX do we need to individually lock each page?
+			 */
 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(m);
 			if (oldpde & PG_A)
@@ -2376,7 +2608,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t 
 		}
 	}
 	if (pmap == kernel_pmap) {
-		if (!pmap_demote_pde(pmap, pdq, sva))
+		if (!pmap_demote_pde(pmap, pdq, sva, pv_list))
 			panic("pmap_remove_pde: failed demotion");
 	} else {
 		mpte = pmap_lookup_pt_page(pmap, sva);
@@ -2393,6 +2625,7 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t 
 	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 }
 
+
 /*
  * pmap_remove_pte: do the things to unmap a page in a process
  */
@@ -2416,6 +2649,7 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t 
 	pmap->pm_stats.resident_count -= 1;
 	if (oldpte & PG_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
+		vm_page_lock_assert(m, MA_OWNED);
 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if (oldpte & PG_A)
@@ -2432,6 +2666,7 @@ static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
 {
 	pt_entry_t *pte;
+	vm_page_t m = NULL;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((*pde & PG_V) == 0)
@@ -2439,10 +2674,94 @@ pmap_remove_page(pmap_t pmap, vm_offset_
 	pte = pmap_pde_to_pte(pde, va);
 	if ((*pte & PG_V) == 0)
 		return;
+	if  (*pte & PG_MANAGED) {
+		m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+		if (vm_page_trylock(m) == 0) {
+			PMAP_UNLOCK(pmap);
+			vm_page_lock(m);
+			PMAP_LOCK(pmap);
+		}
+	}
 	pmap_remove_pte(pmap, pte, va, *pde, free);
+	if (m != NULL)
+		vm_page_unlock(m);
 	pmap_invalidate_page(pmap, va);
 }
 
+static void
+pmap_prealloc_pv_list(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+	struct pv_list_head *pv_list)
+{
+	vm_offset_t va_next;
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t ptpaddr, *pde;
+	pt_entry_t *pte;
+	int i, alloc_count;
+
+	alloc_count = 0;
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = va_next) {
+
+		pml4e = pmap_pml4e(pmap, sva);
+		if ((*pml4e & PG_V) == 0) {
+			va_next = (sva + NBPML4) & ~PML4MASK;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
+		if ((*pdpe & PG_V) == 0) {
+			va_next = (sva + NBPDP) & ~PDPMASK;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+
+		/*
+		 * Calculate index for next page table.
+		 */
+		va_next = (sva + NBPDR) & ~PDRMASK;
+		if (va_next < sva)
+			va_next = eva;
+
+		pde = pmap_pdpe_to_pde(pdpe, sva);
+		ptpaddr = *pde;
+
+		/*
+		 * Weed out invalid mappings.
+		 */
+		if (ptpaddr == 0)
+			continue;
+
+		/*
+		 * Check for large page.
+		 */
+		if ((ptpaddr & PG_PS) != 0) {
+			alloc_count++;
+			continue;
+		}
+		/*
+		 * Limit our scan to either the end of the va represented
+		 * by the current page table page, or to the end of the
+		 * range being removed.
+		 */
+		if (va_next > eva)
+			va_next = eva;
+
+		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
+		    sva += PAGE_SIZE) {
+			if (*pte == 0)
+				continue;
+		}
+	}
+	for (i = 0; i < alloc_count; i++)
+		pmap_pv_list_alloc(pmap, NPTEPG-1, pv_list);
+
+	PMAP_UNLOCK(pmap);
+}
+
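
Both pmap_remove_page() above and pa_tryrelock() follow the same ordering
discipline: a page lock may not be acquired by blocking while the pmap lock is
held, so on trylock failure the pmap lock is dropped, the page lock is taken,
and the pmap lock is retaken, after which mappings must be revalidated.  A
hedged, generic sketch with a hypothetical helper name:

static void
example_lock_page_then_pmap(pmap_t pmap, vm_page_t m)
{

	PMAP_LOCK(pmap);
	if (vm_page_trylock(m) == 0) {
		/* Avoid a lock-order reversal: retry with the pmap dropped. */
		PMAP_UNLOCK(pmap);
		vm_page_lock(m);
		PMAP_LOCK(pmap);
		/* Mappings may have changed; callers revalidate here. */
	}
	/* ... both locks held here ... */
	vm_page_unlock(m);
	PMAP_UNLOCK(pmap);
}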
 /*
  *	Remove the given range of addresses from the specified map.
  *
@@ -2457,7 +2776,9 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 	pdp_entry_t *pdpe;
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t *pte;
+	vm_paddr_t pa;
 	vm_page_t free = NULL;
+	struct pv_list_head pv_list;
 	int anyvalid;
 
 	/*
@@ -2466,11 +2787,19 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
-	anyvalid = 0;
+	pa = anyvalid = 0;
+	TAILQ_INIT(&pv_list);
 
-	vm_page_lock_queues();
-	PMAP_LOCK(pmap);
+	/*
+	 * Pre-allocate pv entries for the range.
+	 */
+	if ((pmap == kernel_pmap) &&
+	    (sva + PAGE_SIZE != eva)) 
+		pmap_prealloc_pv_list(pmap, sva, eva, &pv_list);
 
+	PMAP_LOCK(pmap);
+restart:
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
@@ -2525,6 +2854,11 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
+			if (pa_tryrelock(pmap, ptpaddr & PG_FRAME, &pa)) {
+				va_next = sva;
+				continue;
+			}
+
 			/*
 			 * Are we removing the entire large page?  If not,
 			 * demote the mapping and fall through.
@@ -2536,9 +2870,9 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 				 */
 				if ((ptpaddr & PG_G) == 0)
 					anyvalid = 1;
-				pmap_remove_pde(pmap, pde, sva, &free);
+				pmap_remove_pde(pmap, pde, sva, &free, &pv_list);
 				continue;
-			} else if (!pmap_demote_pde(pmap, pde, sva)) {
+			} else if (!pmap_demote_pde(pmap, pde, sva, &pv_list)) {
 				/* The large page mapping was destroyed. */
 				continue;
 			} else
@@ -2555,23 +2889,39 @@ pmap_remove(pmap_t pmap, vm_offset_t sva
 
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
+			int ret;
+
 			if (*pte == 0)
 				continue;
 
+			if  ((*pte & PG_MANAGED) &&
+			    pa_tryrelock(pmap, *pte & PG_FRAME, &pa))
+				goto restart;
+
 			/*
 			 * The TLB entry for a PG_G mapping is invalidated
 			 * by pmap_remove_pte().
 			 */
 			if ((*pte & PG_G) == 0)
 				anyvalid = 1;
-			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
+			ret = pmap_remove_pte(pmap, pte, sva, ptpaddr, &free);
+
+			if (pa) {
+				PA_UNLOCK(pa);
+				pa = 0;
+			}

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

