svn commit: r254317 - in projects/bhyve_npt_pmap/sys/amd64: amd64 include vmm vmm/intel

Neel Natu neel at FreeBSD.org
Wed Aug 14 06:28:00 UTC 2013


Author: neel
Date: Wed Aug 14 06:27:58 2013
New Revision: 254317
URL: http://svnweb.freebsd.org/changeset/base/254317

Log:
  Add support for accessed/dirty bit emulation in amd64/pmap.
  
  This is motivated by nested page table implementations that do not keep
  track of accessed/dirty bits.
  
  Accessed bit emulation is done by enforcing that PG_A is always set
  concurrently with PG_V. Thus, the accessed bit is "cleared" by removing
  the mapping entirely from the pmap.
  
  Dirty bit emulation is done by temporarily mapping the page as readonly
  and then setting the (PG_RW|PG_M) bits on a write fault.
  
  Mappings that are truly readonly are identified with the PG_RO pseudo-flag.
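
  For reference, these invariants can be distilled into a small
  predicate. The helper below is an editor's sketch (the function name
  is made up; the PG_* flags and the rules come from this change):

	static __inline boolean_t
	pte_ad_emulation_consistent(pt_entry_t pte)
	{

		/* Accessed bit emulation: PG_V implies PG_A. */
		if ((pte & PG_V) == 0)
			return (TRUE);
		if ((pte & PG_A) == 0)
			return (FALSE);
		/* Dirty bit emulation: PG_RW and PG_M travel together. */
		if ((pte & (PG_RW | PG_M)) == (PG_RW | PG_M))
			return ((pte & PG_RO) == 0);
		return ((pte & (PG_RW | PG_M)) == 0);
	}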

Modified:
  projects/bhyve_npt_pmap/sys/amd64/amd64/pmap.c
  projects/bhyve_npt_pmap/sys/amd64/include/pmap.h
  projects/bhyve_npt_pmap/sys/amd64/vmm/intel/ept.c
  projects/bhyve_npt_pmap/sys/amd64/vmm/vmm.c

Modified: projects/bhyve_npt_pmap/sys/amd64/amd64/pmap.c
==============================================================================
--- projects/bhyve_npt_pmap/sys/amd64/amd64/pmap.c	Wed Aug 14 06:06:39 2013	(r254316)
+++ projects/bhyve_npt_pmap/sys/amd64/amd64/pmap.c	Wed Aug 14 06:27:58 2013	(r254317)
@@ -344,6 +344,8 @@ static struct md_page *pv_table;
 pt_entry_t *CMAP1 = 0;
 caddr_t CADDR1 = 0;
 
+static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
+
 /*
  * Crashdump maps.
  */
@@ -773,7 +775,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
-	kernel_pmap->pm_flags = PMAP_PDE_SUPERPAGE;
+	kernel_pmap->pm_flags = pmap_flags;
 
  	/*
 	 * Initialize the global pv list lock.
@@ -1089,6 +1091,13 @@ pmap_cache_mask(pmap_t pmap, boolean_t i
 }
 
 static __inline boolean_t
+pmap_emulate_ad_bits(pmap_t pmap)
+{
+
+	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
+}
+
+static __inline boolean_t
 pmap_ps_enabled(pmap_t pmap)
 {
 
@@ -1445,7 +1454,7 @@ pmap_invalidate_range(pmap_t pmap, vm_of
 				invlpg(addr);
 		break;
 	case PT_EPT:
-		pmap->eptgen++;
+		pmap->pm_eptgen++;
 		break;
 	default:
 		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
@@ -1977,7 +1986,7 @@ pmap_pinit0(pmap_t pmap)
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
-	pmap->pm_flags = PMAP_PDE_SUPERPAGE;
+	pmap->pm_flags = pmap_flags;
 }
 
 /*
@@ -2031,6 +2040,7 @@ pmap_pinit_type(pmap_t pmap, enum pmap_t
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_flags = flags;
+	pmap->pm_eptgen = 0;
 
 	return (1);
 }
@@ -2039,7 +2049,7 @@ int
 pmap_pinit(pmap_t pmap)
 {
 
-	return (pmap_pinit_type(pmap, PT_X86, PMAP_PDE_SUPERPAGE));
+	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
 }
 
 /*
@@ -2473,7 +2483,7 @@ reclaim_pv_chunk(pmap_t locked_pmap, str
 	vm_page_t free, m, m_pc;
 	uint64_t inuse;
 	int bit, field, freed;
-	
+
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
@@ -2527,8 +2537,11 @@ reclaim_pv_chunk(pmap_t locked_pmap, str
 				if ((tpte & PG_G) != 0)
 					pmap_invalidate_page(pmap, va);
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
-				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+					KASSERT((tpte & PG_RO) == 0,
+					  ("readonly modified PTE %#lx", tpte));
 					vm_page_dirty(m);
+				}
 				if ((tpte & PG_A) != 0)
 					vm_page_aflag_set(m, PGA_REFERENCED);
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
@@ -3187,8 +3200,11 @@ pmap_remove_pde(pmap_t pmap, pd_entry_t 
 		eva = sva + NBPDR;
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
-			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
+			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((oldpde & PG_RO) == 0,
+					("readonly modified PDE %#lx", oldpde));
 				vm_page_dirty(m);
+			}
 			if (oldpde & PG_A)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
@@ -3235,8 +3251,11 @@ pmap_remove_pte(pmap_t pmap, pt_entry_t 
 	pmap_resident_count_dec(pmap, 1);
 	if (oldpte & PG_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
-		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			KASSERT((oldpte & PG_RO) == 0,
+				("readonly modified PTE %#lx", oldpte));
 			vm_page_dirty(m);
+		}
 		if (oldpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
@@ -3480,8 +3499,11 @@ small_mappings:
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
-		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			KASSERT((tpte & PG_RO) == 0,
+				("readonly modified PTE %#lx", tpte));
 			vm_page_dirty(m);
+		}
 		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
@@ -3518,11 +3540,17 @@ retry:
 		eva = sva + NBPDR;
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++)
-			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
+			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((oldpde & PG_RO) == 0,
+					("readonly modified PDE %#lx", oldpde));
 				vm_page_dirty(m);
+			}
 	}
-	if ((prot & VM_PROT_WRITE) == 0)
+	if ((prot & VM_PROT_WRITE) == 0) {
 		newpde &= ~(PG_RW | PG_M);
+		if (pmap_emulate_ad_bits(pmap))
+			newpde |= PG_RO;
+	}
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpde |= pg_nx;
 	if (newpde != oldpde) {
@@ -3652,10 +3680,15 @@ retry:
 			if ((prot & VM_PROT_WRITE) == 0) {
 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 				    (PG_MANAGED | PG_M | PG_RW)) {
+					KASSERT((pbits & PG_RO) == 0,
+						("readonly modified PTE %#lx",
+						pbits));
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				pbits &= ~(PG_RW | PG_M);
+				if (pmap_emulate_ad_bits(pmap))
+					pbits |= PG_RO;
 			}
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pbits |= pg_nx;
@@ -3716,6 +3749,8 @@ setpde:
 		return;
 	}
 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
+		KASSERT(!pmap_emulate_ad_bits(pmap),
+		    ("invalid RW/M bits for dirty bit emulation %#lx", newpde));
 		/*
 		 * When PG_M is already clear, PG_RW can be cleared without
 		 * a TLB invalidation.
@@ -3741,6 +3776,9 @@ setpte:
 			return;
 		}
 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
+			KASSERT(!pmap_emulate_ad_bits(pmap),
+				("invalid RW/M bits for dirty bit "
+				 "emulation %#lx", oldpte));
 			/*
 			 * When PG_M is already clear, PG_RW can be cleared
 			 * without a TLB invalidation.
@@ -3799,6 +3837,14 @@ setpte:
 	    " in pmap %p", va, pmap);
 }
 
+static __inline boolean_t
+pmap_writeable_mapping(pmap_t pmap, pt_entry_t pte)
+{
+
+	return ((pte & PG_RW) != 0 ||
+		(pmap_emulate_ad_bits(pmap) && (pte & PG_RO) == 0));
+}
+
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
@@ -3855,6 +3901,38 @@ pmap_enter(pmap_t pmap, vm_offset_t va, 
 		newpte |= PG_G;
 	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0);
 
+	if (pmap_emulate_ad_bits(pmap)) {
+		/*
+		 * Set modified bit gratuitously for writeable mappings if
+		 * the page is unmanaged. We do not want to take a fault
+		 * to do the dirty bit accounting for these mappings.
+		 */
+		if ((m->oflags & VPO_UNMANAGED) != 0) {
+			if ((newpte & PG_RW) != 0)
+				newpte |= PG_M;
+		}
+
+		/*
+		 * Dirty bit emulation enforces the following PG_RW behavior:
+		 * - if PG_RW = 1 then PG_M = 1
+		 * - if PG_RW = 0 then PG_M = 0
+		 *
+		 * If PG_RW = 0 then there are two possibilities:
+		 * - the mapping is permanently readonly (PG_RO = 1)
+		 * - the mapping is temporarily readonly for dirty bit emulation
+		 */
+		if ((newpte & PG_RW) == 0)
+			newpte |= PG_RO;
+		else if ((newpte & PG_M) == 0)
+			newpte &= ~PG_RW;
+
+		if (((newpte & (PG_M | PG_RW)) != (PG_M | PG_RW)) &&
+		    ((newpte & (PG_M | PG_RW)) != 0)) {
+			panic("pmap_enter: invalid rw/modified bits for "
+			      "dirty bit emulation %#lx", newpte);
+		}
+	}
+
 	mpte = NULL;
 
 	lock = NULL;
@@ -3921,7 +3999,7 @@ retry:
 			 */
 			if ((origpte & PG_MANAGED) != 0) {
 				newpte |= PG_MANAGED;
-				if ((newpte & PG_RW) != 0)
+				if (pmap_writeable_mapping(pmap, newpte))
 					vm_page_aflag_set(m, PGA_WRITEABLE);
 			}
 			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
@@ -3946,7 +4024,7 @@ retry:
 		pv->pv_va = va;
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
-		if ((newpte & PG_RW) != 0)
+		if (pmap_writeable_mapping(pmap, newpte))
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
@@ -3961,8 +4039,12 @@ validate:
 			if ((origpte & PG_MANAGED) != 0) {
 				om = PHYS_TO_VM_PAGE(opa);
 				if ((origpte & (PG_M | PG_RW)) == (PG_M |
-				    PG_RW))
+				    PG_RW)) {
+					KASSERT((origpte & PG_RO) == 0,
+						("readonly modified PTE %#lx",
+						origpte));
 					vm_page_dirty(om);
+				}
 				if ((origpte & PG_A) != 0)
 					vm_page_aflag_set(om, PGA_REFERENCED);
 				CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
@@ -3975,8 +4057,11 @@ validate:
 			}
 		} else if ((newpte & PG_M) == 0 && (origpte & (PG_M |
 		    PG_RW)) == (PG_M | PG_RW)) {
-			if ((origpte & PG_MANAGED) != 0)
+			if ((origpte & PG_MANAGED) != 0) {
+				KASSERT((origpte & PG_RO) == 0,
+				    ("readonly modified PTE %#lx", origpte));
 				vm_page_dirty(m);
+			}
 
 			/*
 			 * Although the PTE may still have PG_RW set, TLB
@@ -4027,6 +4112,15 @@ pmap_enter_pde(pmap_t pmap, vm_offset_t 
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+	/*
+	 * Software emulation of the accessed bit requires that if PG_V is set
+	 * then PG_A is also set. Therefore we defer setting up the mapping
+	 * until the process actually tries to access it.
+	 */
+	if (pmap_emulate_ad_bits(pmap))
+		return (FALSE);
+
 	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
@@ -4170,6 +4264,14 @@ pmap_enter_quick_locked(pmap_t pmap, vm_
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
+	 * Software emulation of the accessed bit requires that if PG_V is set
+	 * then PG_A is also set. Therefore we defer setting up the mapping
+	 * until the process actually tries to access it.
+	 */
+	if (pmap_emulate_ad_bits(pmap))
+		return (NULL);
+
+	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
@@ -4444,6 +4546,9 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pm
 	if (dst_addr != src_addr)
 		return;
 
+	if (pmap_emulate_ad_bits(dst_pmap))
+		return;
+
 	lock = NULL;
 	rw_rlock(&pvh_global_lock);
 	if (dst_pmap < src_pmap) {
@@ -4868,6 +4973,9 @@ pmap_remove_pages(pmap_t pmap)
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+					KASSERT((tpte & PG_RO) == 0,
+						("readonly modified PTE %#lx",
+						tpte));
 					if ((tpte & PG_PS) != 0) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							vm_page_dirty(mt);
@@ -4992,7 +5100,7 @@ pmap_is_modified_pvh(struct md_page *pvh
 /*
  *	pmap_is_prefaultable:
  *
- *	Return whether or not the specified virtual address is elgible
+ *	Return whether or not the specified virtual address is eligible
  *	for prefault.
  */
 boolean_t
@@ -5071,7 +5179,7 @@ pmap_remove_write(vm_page_t m)
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pd_entry_t *pde;
-	pt_entry_t oldpte, *pte, PG_M;
+	pt_entry_t oldpte, newpte, *pte, PG_M;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
@@ -5111,12 +5219,17 @@ small_mappings:
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 retry:
 		oldpte = *pte;
-		if (oldpte & PG_RW) {
-			if (!atomic_cmpset_long(pte, oldpte, oldpte &
-			    ~(PG_RW | PG_M)))
+		newpte = oldpte & ~(PG_RW | PG_M);
+		if (pmap_emulate_ad_bits(pmap))
+			newpte |= PG_RO;
+		if (newpte != oldpte) {
+			if (!atomic_cmpset_long(pte, oldpte, newpte))
 				goto retry;
-			if ((oldpte & PG_M) != 0)
+			if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((oldpte & PG_RO) == 0,
+					("readonly modified PTE %#lx", oldpte));
 				vm_page_dirty(m);
+			}
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
@@ -5147,6 +5260,7 @@ pmap_ts_referenced(vm_page_t m)
 	pt_entry_t *pte, PG_A;
 	vm_offset_t va;
 	int rtval = 0;
+	vm_page_t free = NULL;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
@@ -5187,8 +5301,10 @@ pmap_ts_referenced(vm_page_t m)
 	}
 small_mappings:
 	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
-		pvf = pv;
+		pvf = NULL;
 		do {
+			if (pvf == NULL)
+				pvf = pv;
 			pvn = TAILQ_NEXT(pv, pv_next);
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
@@ -5200,8 +5316,23 @@ small_mappings:
 			    " found a 2mpage in page %p's pv list", m));
 			pte = pmap_pde_to_pte(pde, pv->pv_va);
 			if ((*pte & PG_A) != 0) {
-				atomic_clear_long(pte, PG_A);
-				pmap_invalidate_page(pmap, pv->pv_va);
+				if (pmap_emulate_ad_bits(pmap)) {
+					/*
+					 * Wired pages cannot be paged out so
+					 * doing accessed bit emulation for
+					 * them is wasted effort. We do the
+					 * hard work for unwired pages only.
+					 */
+					if ((*pte & PG_W) == 0) {
+						pmap_remove_page(pmap,
+							pv->pv_va, pde, &free);
+						if (pvf == pv)
+							pvf = NULL;
+					}
+				} else {
+					atomic_clear_long(pte, PG_A);
+					pmap_invalidate_page(pmap, pv->pv_va);
+				}
 				rtval++;
 				if (rtval > 4)
 					pvn = NULL;
@@ -5211,6 +5342,7 @@ small_mappings:
 	}
 out:
 	rw_wunlock(&pvh_global_lock);
+	pmap_free_zero_pages(free);
 	return (rtval);
 }
 
@@ -5226,6 +5358,7 @@ pmap_clear_modify(vm_page_t m)
 	pd_entry_t oldpde, *pde;
 	pt_entry_t oldpte, *pte, PG_M;
 	vm_offset_t va;
+	long clear_bits;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
@@ -5263,14 +5396,18 @@ pmap_clear_modify(vm_page_t m)
 					    PG_PS_FRAME);
 					pte = pmap_pde_to_pte(pde, va);
 					oldpte = *pte;
-					if ((oldpte & PG_V) != 0) {
-						while (!atomic_cmpset_long(pte,
-						    oldpte,
-						    oldpte & ~(PG_M | PG_RW)))
-							oldpte = *pte;
-						vm_page_dirty(m);
-						pmap_invalidate_page(pmap, va);
-					}
+
+					if ((oldpte & (PG_RO | PG_RW | PG_M)) !=
+						(PG_RW | PG_M))
+						panic("inconsistent pte %#lx "
+						    "after demotion from pde "
+						    "%#lx", oldpte, oldpde);
+
+					while (!atomic_cmpset_long(pte, oldpte,
+					    oldpte & ~(PG_M | PG_RW)))
+						oldpte = *pte;
+					vm_page_dirty(m);
+					pmap_invalidate_page(pmap, va);
 				}
 			}
 		}
@@ -5285,8 +5422,22 @@ small_mappings:
 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
-		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
-			atomic_clear_long(pte, PG_M);
+		oldpte = *pte;
+		if (pmap_emulate_ad_bits(pmap)) {
+			if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+				KASSERT((oldpte & PG_RO) == 0,
+				    ("modified readonly pte %#lx", oldpte));
+			} else {
+				KASSERT((oldpte & (PG_M | PG_RW)) == 0,
+					("invalid RW/M bits for dirty bit "
+					 "emulation %#lx", oldpte));
+			}
+		}
+		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			clear_bits = PG_M;
+			if (pmap_emulate_ad_bits(pmap))
+				clear_bits |= PG_RW;
+			atomic_clear_long(pte, clear_bits);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
@@ -5308,6 +5459,7 @@ pmap_clear_reference(vm_page_t m)
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte, PG_A;
 	vm_offset_t va;
+	vm_page_t free = NULL;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_reference: page %p is not managed", m));
@@ -5339,7 +5491,7 @@ pmap_clear_reference(vm_page_t m)
 		PMAP_UNLOCK(pmap);
 	}
 small_mappings:
-	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
+	TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		PG_A = pmap_accessed_bit(pmap);
@@ -5348,12 +5500,26 @@ small_mappings:
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		if (*pte & PG_A) {
-			atomic_clear_long(pte, PG_A);
-			pmap_invalidate_page(pmap, pv->pv_va);
+			if (pmap_emulate_ad_bits(pmap)) {
+				/*
+				 * Wired pages cannot be paged out so doing
+				 * accessed bit emulation for them is wasted
+				 * effort. We do the hard work for unwired
+				 * pages only.
+				 */
+				if ((*pte & PG_W) == 0) {
+					pmap_remove_page(pmap, pv->pv_va, pde,
+							 &free);
+				}
+			} else {
+				atomic_clear_long(pte, PG_A);
+				pmap_invalidate_page(pmap, pv->pv_va);
+			}
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(&pvh_global_lock);
+	pmap_free_zero_pages(free);
 }
 
 /*
@@ -5921,6 +6087,71 @@ pmap_align_superpage(vm_object_t object,
 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 }
 
+#ifdef INVARIANTS
+static unsigned long num_dirty_emulations;
+SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
+	     &num_dirty_emulations, 0, NULL);
+#endif
+int
+pmap_emulate_dirty(pmap_t pmap, vm_offset_t va)
+{
+	int rv = -1;
+	struct rwlock *lock;
+	vm_page_t m, mpte;
+	pd_entry_t *pde;
+	pt_entry_t *pte, PG_A, PG_M;
+
+	if (!pmap_emulate_ad_bits(pmap))
+		return (-1);
+
+	PG_A = pmap_accessed_bit(pmap);
+	PG_M = pmap_modified_bit(pmap);
+
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
+	PMAP_LOCK(pmap);
+
+	/*
+	 * Dirty bit emulation is done in the fast path if 'va' is
+	 * already mapped as a regular page and is writeable.
+	 */
+	pde = pmap_pde(pmap, va);
+	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
+		pte = pmap_pde_to_pte(pde, va);
+		if ((*pte & (PG_V | PG_RO)) == PG_V) {
+			KASSERT((*pte & PG_A) != 0,
+				("pmap_emulate_dirty: accessed and valid bits ",
+				 "mismatch %#lx", *pte));
+			atomic_set_long(pte, PG_M | PG_RW);
+			rv = 0;		/* success */
+
+#ifdef INVARIANTS
+			atomic_add_long(&num_dirty_emulations, 1);
+#endif
+
+			/* try to promote the mapping */
+			if (va < VM_MAXUSER_ADDRESS)
+				mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+			else
+				mpte = NULL;
+
+			m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
+
+			if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
+			    pg_ps_enabled && pmap_ps_enabled(pmap) &&
+			    (m->flags & PG_FICTITIOUS) == 0 &&
+			    vm_reserv_level_iffullpop(m) == 0)
+				pmap_promote_pde(pmap, pde, va, &lock);
+		}
+	}
+
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
+	PMAP_UNLOCK(pmap);
+	return (rv);
+}
+
 void
 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
 {

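Taken together, the pmap.c changes give an emulated mapping the
following lifecycle; this transition summary is an editor's paraphrase
of the diff above, not additional source:

	/*
	 * pmap_enter() w/o PG_M (managed):   PG_RW cleared, PG_M clear
	 * write fault, pmap_emulate_dirty(): PG_RW | PG_M set
	 * pmap_clear_modify():               PG_RW | PG_M cleared again
	 * pmap_remove_write(), or
	 * pmap_protect() w/o VM_PROT_WRITE:  PG_RW | PG_M cleared, PG_RO set
	 * pmap_ts_referenced(),
	 * pmap_clear_reference():            mapping removed entirely for
	 *                                    unwired pages (accessed bit
	 *                                    emulation)
	 */
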
Modified: projects/bhyve_npt_pmap/sys/amd64/include/pmap.h
==============================================================================
--- projects/bhyve_npt_pmap/sys/amd64/include/pmap.h	Wed Aug 14 06:06:39 2013	(r254316)
+++ projects/bhyve_npt_pmap/sys/amd64/include/pmap.h	Wed Aug 14 06:27:58 2013	(r254317)
@@ -79,6 +79,12 @@
 #define	PG_PROT		(PG_RW|PG_U)	/* all protection bits . */
 #define PG_N		(PG_NC_PWT|PG_NC_PCD)	/* Non-cacheable */
 
+/*
+ * "readonly" pseudo-flag used in pmap entries that require software emulation
+ * of accessed/dirty bits.
+ */
+#define	PG_RO		(1ul << 52)
+
 /* Page level cache control fields used to determine the PAT type */
 #define PG_PDE_CACHE	(PG_PDE_PAT | PG_NC_PWT | PG_NC_PCD)
 #define PG_PTE_CACHE	(PG_PTE_PAT | PG_NC_PWT | PG_NC_PCD)
@@ -88,7 +94,7 @@
  * (PTE) page mappings have identical settings for the following fields:
  */
 #define	PG_PTE_PROMOTE	(PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
-	    PG_M | PG_A | PG_U | PG_RW | PG_V)
+	    PG_M | PG_A | PG_U | PG_RW | PG_V | PG_RO)
 
 /*
  * Page Protection Exception bits
@@ -264,6 +270,7 @@ struct pmap {
 
 /* flags */
 #define	PMAP_PDE_SUPERPAGE	(1 << 0)	/* supports 2MB superpages */
+#define	PMAP_EMULATE_AD_BITS	(1 << 1)	/* needs A/D bits emulation */
 
 typedef struct pmap	*pmap_t;
 
@@ -283,6 +290,7 @@ extern struct pmap	kernel_pmap_store;
 #define	PMAP_UNLOCK(pmap)	mtx_unlock(&(pmap)->pm_mtx)
 
 int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags);
+int pmap_emulate_dirty(pmap_t pmap, vm_offset_t va);
 #endif
 
 /*

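PG_RO uses bit 52, the lowest bit above the 52-bit physical address
field, which is available to software in both regular x86-64 and EPT
page-table entries. A compile-time check along these lines (an editor's
addition, not part of the change) would document that the flag cannot
collide with the page frame:

	/* PG_RO must lie outside the physical page frame bits. */
	CTASSERT((PG_RO & PG_FRAME) == 0);
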
Modified: projects/bhyve_npt_pmap/sys/amd64/vmm/intel/ept.c
==============================================================================
--- projects/bhyve_npt_pmap/sys/amd64/vmm/intel/ept.c	Wed Aug 14 06:06:39 2013	(r254316)
+++ projects/bhyve_npt_pmap/sys/amd64/vmm/intel/ept.c	Wed Aug 14 06:27:58 2013	(r254317)
@@ -30,6 +30,7 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
@@ -69,6 +70,7 @@ static int ept_enable_ad_bits;
 int
 ept_init(void)
 {
+	int use_hw_ad_bits;
 	uint64_t cap;
 
 	cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
@@ -91,8 +93,12 @@ ept_init(void)
 	if (EPT_PDE_SUPERPAGE(cap))
 		ept_pmap_flags |= PMAP_PDE_SUPERPAGE;	/* 2MB superpage */
 
-	if (AD_BITS_SUPPORTED(cap))
+	use_hw_ad_bits = 1;
+	TUNABLE_INT_FETCH("vmx.ept.use_hw_ad_bits", &use_hw_ad_bits);
+	if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap))
 		ept_enable_ad_bits = 1;
+	else
+		ept_pmap_flags |= PMAP_EMULATE_AD_BITS;
 
 	return (0);
 }
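
With this change, the EPT code now falls back to software A/D bit
emulation whenever the hardware bits are unavailable or disabled. The
new knob is fetched with TUNABLE_INT_FETCH(), so it could be set from
loader.conf(5); hypothetical usage (only the tunable name comes from
the diff above):

	# /boot/loader.conf
	# Force software A/D bit emulation even on CPUs with EPT A/D support.
	vmx.ept.use_hw_ad_bits="0"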

Modified: projects/bhyve_npt_pmap/sys/amd64/vmm/vmm.c
==============================================================================
--- projects/bhyve_npt_pmap/sys/amd64/vmm/vmm.c	Wed Aug 14 06:06:39 2013	(r254316)
+++ projects/bhyve_npt_pmap/sys/amd64/vmm/vmm.c	Wed Aug 14 06:27:58 2013	(r254317)
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/vmparam.h>
 
 #include <machine/vmm.h>
+#include "vmm_ktr.h"
 #include "vmm_host.h"
 #include "vmm_mem.h"
 #include "vmm_util.h"
@@ -736,23 +737,41 @@ vm_handle_hlt(struct vm *vm, int vcpuid,
 static int
 vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
 {
-	int rv;
+	int rv, ftype, prot;
 	struct vm_map *map;
-	vm_prot_t ftype;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
 
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
 
-	map = &vm->vmspace->vm_map;
 	ftype = vme->u.paging.fault_type;
+	KASSERT(ftype == VM_PROT_WRITE ||
+		ftype == VM_PROT_EXECUTE ||
+		ftype == VM_PROT_READ,
+		("vm_handle_paging: invalid fault_type %d", ftype));
+
+	/*
+	 * If the guest mapping already exists then the write fault may
+	 * have been taken intentionally, for dirty bit emulation.
+	 */
+	prot = vme->u.paging.protection;
+	if ((prot & VM_PROT_READ) != 0 && ftype == VM_PROT_WRITE) {
+		rv = pmap_emulate_dirty(vmspace_pmap(vm->vmspace),
+					vme->u.paging.gpa);
+		if (rv == 0)
+			goto done;
+	}
 
+	map = &vm->vmspace->vm_map;
 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
 
+	VMM_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, ftype = %d",
+		 rv, vme->u.paging.gpa, ftype);
+
 	if (rv != KERN_SUCCESS)
 		return (EFAULT);
-
+done:
 	/* restart execution at the faulting instruction */
 	vme->inst_length = 0;
 

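To recap the end-to-end flow these changes implement (an editor's
summary of the diff, not additional source):

	/*
	 * 1. The guest writes to a page whose EPT PTE has PG_RW clear.
	 * 2. The CPU raises an EPT violation; the exit handler records
	 *    fault_type = VM_PROT_WRITE and the mapping's protection.
	 * 3. vm_handle_paging() offers the fault to pmap_emulate_dirty(),
	 *    which sets PG_M | PG_RW on the PTE if it is valid and not
	 *    PG_RO; any other fault falls back to vm_fault().
	 * 4. inst_length is zeroed and the guest re-executes the faulting
	 *    instruction, which now succeeds.
	 */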
