svn commit: r256072 - in head: lib/libvmmapi sys/amd64/amd64 sys/amd64/include sys/amd64/vmm sys/amd64/vmm/amd sys/amd64/vmm/intel sys/amd64/vmm/io usr.sbin/bhyve usr.sbin/bhyvectl usr.sbin/bhyveload

Neel Natu neel at FreeBSD.org
Sat Oct 5 21:22:36 UTC 2013


Author: neel
Date: Sat Oct  5 21:22:35 2013
New Revision: 256072
URL: http://svnweb.freebsd.org/changeset/base/256072

Log:
  Merge projects/bhyve_npt_pmap into head.
  
  Make the amd64/pmap code aware of nested page table mappings used by bhyve
  guests. This allows bhyve to associate each guest with its own vmspace and
  deal with nested page faults in the context of that vmspace. This also
  enables features like accessed/dirty bit tracking, swapping to disk and
  transparent superpage promotions of guest memory.
  
  Guest vmspace:
  Each bhyve guest has a unique vmspace to represent the physical memory
  allocated to the guest. Each memory segment allocated by the guest is
  mapped into the guest's address space via the 'vmspace->vm_map' and is
  backed by an object of type OBJT_DEFAULT.
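
  A hedged sketch of how a guest memory segment is backed under this model
  (variable names and the surrounding setup are placeholders, not the exact
  vmm_mem.c code):

	vm_object_t obj;

	/* pageable, swap-backed object covering a segment of 'len' bytes */
	obj = vm_object_allocate(OBJT_DEFAULT, atop(len));

	/* the object is then mapped into 'vmspace->vm_map' at the segment's gpa */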
  
  pmap types:
  The amd64/pmap code now understands two types of pmaps: PT_X86 and PT_EPT.
  
  The PT_X86 pmap type is used by the vmspace associated with the host kernel
  as well as user processes executing on the host. The PT_EPT pmap is used by
  the vmspace associated with a bhyve guest.
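
  The pmap type is chosen at pmap creation time. A short illustration (the
  flag combination passed for the EPT case is an assumption; the vmm/EPT code
  picks flags based on hardware capabilities):

	/* host kernel and user pmaps: unchanged behavior */
	pmap_pinit(pmap);	/* equivalent to pmap_pinit_type(pmap, PT_X86, pmap_flags) */

	/* nested pmap for a bhyve guest */
	pmap_pinit_type(pmap, PT_EPT, PMAP_PDE_SUPERPAGE | PMAP_EMULATE_AD_BITS);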
  
  Page Table Entries:
  The EPT page table entries are mostly similar in functionality to regular
  page table entries, although there are some differences in which bits are
  used to express that functionality. For example, the dirty bit is
  represented by bit 9 in the nested PTE as opposed to bit 6 in the regular
  x86 PTE. Therefore the bitmask representing the dirty bit is now computed
  at runtime based on the type of the pmap. Thus PG_M, which was previously a
  macro, becomes a local variable that is initialized at runtime using
  'pmap_modified_bit(pmap)'.
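
  A minimal sketch of that runtime pattern (it mirrors the new pmap_*_bit()
  helpers added to pmap.c below; 'pte' and 'm' are placeholders):

	pt_entry_t PG_M, PG_V;

	PG_M = pmap_modified_bit(pmap);
	PG_V = pmap_valid_bit(pmap);
	if ((*pte & (PG_V | PG_M)) == (PG_V | PG_M))
		vm_page_dirty(m);	/* dirty accounting works for PT_X86 and PT_EPT alike */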
  
  An additional wrinkle associated with EPT mappings is that older Intel
  processors don't have hardware support for tracking accessed/dirty bits in
  the PTE. This means that the amd64/pmap code needs to emulate these bits to
  provide proper accounting to the VM subsystem. This is achieved by using
  the following mapping for EPT entries that need emulation of A/D bits:
                 Bit Position           Interpreted By
  PG_V               52                 software (accessed bit emulation handler)
  PG_RW              53                 software (dirty bit emulation handler)
  PG_A               0                  hardware (aka EPT_PG_RD)
  PG_M               1                  hardware (aka EPT_PG_WR)
  
  The idea to use the mapping listed above for A/D bit emulation came from
  Alan Cox (alc@).
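
  For reference, the table above corresponds to masks along these lines (an
  illustration only; the authoritative definitions live in
  sys/amd64/include/pmap.h):

	#define	EPT_PG_READ	(1UL << 0)	/* hardware; doubles as PG_A under emulation */
	#define	EPT_PG_WRITE	(1UL << 1)	/* hardware; doubles as PG_M under emulation */
	#define	EPT_PG_EMUL_V	(1UL << 52)	/* software-emulated PG_V */
	#define	EPT_PG_EMUL_RW	(1UL << 53)	/* software-emulated PG_RW */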
  
  The final difference with respect to x86 PTEs is that some EPT implementations
  do not support superpage mappings. This is recorded in the 'pm_flags' field
  of the pmap.
  
  TLB invalidation:
  The amd64/pmap code has a number of ways to invalidate mappings that may be
  cached in the TLB: a single page, multiple pages in a range, or the entire
  TLB. For a PT_EPT pmap all of these funnel into a single EPT invalidation
  routine called 'pmap_invalidate_ept()'. This routine bumps up the EPT
  generation number and sends an IPI to the host cpus that are executing the
  guest's vcpus. On a subsequent entry into the guest the stale generation
  number is detected and the cached mappings are invalidated from the TLB.
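
  A hedged sketch of the check done just before resuming a vcpu (the local
  variable and helper names here are assumptions; the real check lives in the
  vmx backend):

	if (vcpu_eptgen != pmap->pm_eptgen) {
		/* flush stale guest mappings cached by this cpu */
		ept_invalidate_mappings(eptp);
		vcpu_eptgen = pmap->pm_eptgen;
	}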
  
  Guest memory access:
  Since the guest memory is no longer wired, we need to hold the host physical
  page that backs the guest physical page before we can access it. The helper
  functions 'vm_gpa_hold()/vm_gpa_release()' are available for this purpose.
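
  A hedged usage sketch ('vm', 'gpa', and the argument order are assumptions;
  the prototypes are in vmm.c):

	void *cookie, *hva;

	hva = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_READ | VM_PROT_WRITE, &cookie);
	if (hva != NULL) {
		/* the backing host page is held; safe to access through 'hva' */
		bzero(hva, PAGE_SIZE);
		vm_gpa_release(cookie);
	}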
  
  PCI passthru:
  Guests with PCI passthru devices will wire the entire guest physical address
  space. The MMIO BAR associated with the passthru device is backed by a
  vm_object of type OBJT_SG. An IOMMU domain is created only for guests that
  have one or more PCI passthru devices attached to them.
  
  Limitations:
  There isn't a way to map a guest physical page without execute permissions.
  This is because the amd64/pmap code interprets guest physical mappings as
  user mappings, since they are numerically below VM_MAXUSER_ADDRESS. Because
  PG_U shares the same bit position as EPT_PG_EXECUTE, all guest mappings
  become automatically executable.
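
  Concretely (bit positions shown for illustration; see sys/amd64/include/pmap.h
  for the real definitions):

	#define	X86_PG_U	(1UL << 2)	/* x86 PTE: user/supervisor */
	#define	EPT_PG_EXECUTE	(1UL << 2)	/* EPT PTE: execute permission */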
  
  Thanks to Alan Cox and Konstantin Belousov for their rigorous code reviews
  as well as their support and encouragement.
  
  Thanks to John Baldwin for reviewing the use of OBJT_SG as the backing
  object for PCI passthru MMIO regions.
  
  Special thanks to Peter Holm for testing the patch on short notice.
  
  Approved by:	re
  Discussed with:	grehan
  Reviewed by:	alc, kib
  Tested by:	pho

Modified:
  head/lib/libvmmapi/vmmapi.c
  head/lib/libvmmapi/vmmapi.h
  head/sys/amd64/amd64/machdep.c
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/amd64/trap.c
  head/sys/amd64/include/pmap.h
  head/sys/amd64/include/vmm.h
  head/sys/amd64/include/vmm_dev.h
  head/sys/amd64/include/vmm_instruction_emul.h
  head/sys/amd64/vmm/amd/amdv.c
  head/sys/amd64/vmm/intel/ept.c
  head/sys/amd64/vmm/intel/ept.h
  head/sys/amd64/vmm/intel/vmcs.c
  head/sys/amd64/vmm/intel/vmcs.h
  head/sys/amd64/vmm/intel/vmx.c
  head/sys/amd64/vmm/intel/vmx.h
  head/sys/amd64/vmm/intel/vmx_genassym.c
  head/sys/amd64/vmm/intel/vmx_support.S
  head/sys/amd64/vmm/io/ppt.c
  head/sys/amd64/vmm/io/ppt.h
  head/sys/amd64/vmm/vmm.c
  head/sys/amd64/vmm/vmm_dev.c
  head/sys/amd64/vmm/vmm_instruction_emul.c
  head/sys/amd64/vmm/vmm_mem.c
  head/sys/amd64/vmm/vmm_mem.h
  head/usr.sbin/bhyve/bhyverun.c
  head/usr.sbin/bhyve/pci_emul.c
  head/usr.sbin/bhyve/rtc.c
  head/usr.sbin/bhyvectl/bhyvectl.c
  head/usr.sbin/bhyveload/bhyveload.c

Modified: head/lib/libvmmapi/vmmapi.c
==============================================================================
--- head/lib/libvmmapi/vmmapi.c	Sat Oct  5 19:51:09 2013	(r256071)
+++ head/lib/libvmmapi/vmmapi.c	Sat Oct  5 21:22:35 2013	(r256072)
@@ -124,7 +124,8 @@ vm_destroy(struct vmctx *vm)
 }
 
 int
-vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len)
+vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
+		  int *wired)
 {
 	int error;
 	struct vm_memory_segment seg;
@@ -133,6 +134,8 @@ vm_get_memory_seg(struct vmctx *ctx, vm_
 	seg.gpa = gpa;
 	error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
 	*ret_len = seg.len;
+	if (wired != NULL)
+		*wired = seg.wired;
 	return (error);
 }
 
@@ -741,3 +744,23 @@ vcpu_reset(struct vmctx *vmctx, int vcpu
 done:
 	return (error);
 }
+
+int
+vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
+{
+	int error, i;
+	struct vm_gpa_pte gpapte;
+
+	bzero(&gpapte, sizeof(gpapte));
+	gpapte.gpa = gpa;
+
+	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
+
+	if (error == 0) {
+		*num = gpapte.ptenum;
+		for (i = 0; i < gpapte.ptenum; i++)
+			pte[i] = gpapte.pte[i];
+	}
+
+	return (error);
+}

Modified: head/lib/libvmmapi/vmmapi.h
==============================================================================
--- head/lib/libvmmapi/vmmapi.h	Sat Oct  5 19:51:09 2013	(r256071)
+++ head/lib/libvmmapi/vmmapi.h	Sat Oct  5 21:22:35 2013	(r256072)
@@ -45,9 +45,11 @@ enum vm_mmap_style {
 int	vm_create(const char *name);
 struct vmctx *vm_open(const char *name);
 void	vm_destroy(struct vmctx *ctx);
-int	vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len);
+int	vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
+			  int *wired);
 int	vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
 void	*vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
+int	vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
 uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
 void	vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
 int	vm_set_desc(struct vmctx *ctx, int vcpu, int reg,

Modified: head/sys/amd64/amd64/machdep.c
==============================================================================
--- head/sys/amd64/amd64/machdep.c	Sat Oct  5 19:51:09 2013	(r256071)
+++ head/sys/amd64/amd64/machdep.c	Sat Oct  5 21:22:35 2013	(r256072)
@@ -1574,7 +1574,7 @@ getmemsize(caddr_t kmdp, u_int64_t first
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
-			*pte = pa | PG_V | PG_RW | PG_N;
+			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
 			invltlb();
 
 			tmp = *(int *)ptr;

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Sat Oct  5 19:51:09 2013	(r256071)
+++ head/sys/amd64/amd64/pmap.c	Sat Oct  5 21:22:35 2013	(r256072)
@@ -76,6 +76,8 @@
  * SUCH DAMAGE.
  */
 
+#define	AMD64_NPT_AWARE
+
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
@@ -143,6 +145,120 @@ __FBSDID("$FreeBSD$");
 #include <machine/smp.h>
 #endif
 
+static __inline boolean_t
+pmap_emulate_ad_bits(pmap_t pmap)
+{
+
+	return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
+}
+
+static __inline pt_entry_t
+pmap_valid_bit(pmap_t pmap)
+{
+	pt_entry_t mask;
+
+	switch (pmap->pm_type) {
+	case PT_X86:
+		mask = X86_PG_V;
+		break;
+	case PT_EPT:
+		if (pmap_emulate_ad_bits(pmap))
+			mask = EPT_PG_EMUL_V;
+		else
+			mask = EPT_PG_READ;
+		break;
+	default:
+		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
+	}
+
+	return (mask);
+}
+
+static __inline pt_entry_t
+pmap_rw_bit(pmap_t pmap)
+{
+	pt_entry_t mask;
+
+	switch (pmap->pm_type) {
+	case PT_X86:
+		mask = X86_PG_RW;
+		break;
+	case PT_EPT:
+		if (pmap_emulate_ad_bits(pmap))
+			mask = EPT_PG_EMUL_RW;
+		else
+			mask = EPT_PG_WRITE;
+		break;
+	default:
+		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
+	}
+
+	return (mask);
+}
+
+static __inline pt_entry_t
+pmap_global_bit(pmap_t pmap)
+{
+	pt_entry_t mask;
+
+	switch (pmap->pm_type) {
+	case PT_X86:
+		mask = X86_PG_G;
+		break;
+	case PT_EPT:
+		mask = 0;
+		break;
+	default:
+		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
+	}
+
+	return (mask);
+}
+
+static __inline pt_entry_t
+pmap_accessed_bit(pmap_t pmap)
+{
+	pt_entry_t mask;
+
+	switch (pmap->pm_type) {
+	case PT_X86:
+		mask = X86_PG_A;
+		break;
+	case PT_EPT:
+		if (pmap_emulate_ad_bits(pmap))
+			mask = EPT_PG_READ;
+		else
+			mask = EPT_PG_A;
+		break;
+	default:
+		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
+	}
+
+	return (mask);
+}
+
+static __inline pt_entry_t
+pmap_modified_bit(pmap_t pmap)
+{
+	pt_entry_t mask;
+
+	switch (pmap->pm_type) {
+	case PT_X86:
+		mask = X86_PG_M;
+		break;
+	case PT_EPT:
+		if (pmap_emulate_ad_bits(pmap))
+			mask = EPT_PG_WRITE;
+		else
+			mask = EPT_PG_M;
+		break;
+	default:
+		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
+	}
+
+	return (mask);
+}
+
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
@@ -247,6 +363,8 @@ static struct md_page *pv_table;
 pt_entry_t *CMAP1 = 0;
 caddr_t CADDR1 = 0;
 
+static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
+
 static struct unrhdr pcid_unr;
 static struct mtx pcid_mtx;
 int pmap_pcid_enabled = 1;
@@ -306,12 +424,12 @@ static void pmap_fill_ptp(pt_entry_t *fi
 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
-static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
+static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp);
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
-static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
+static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
@@ -323,7 +441,7 @@ static boolean_t pmap_try_insert_pv_entr
     vm_page_t m, struct rwlock **lockp);
 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t newpde);
-static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
+static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp);
@@ -416,7 +534,9 @@ static __inline pdp_entry_t *
 pmap_pdpe(pmap_t pmap, vm_offset_t va)
 {
 	pml4_entry_t *pml4e;
+	pt_entry_t PG_V;
 
+	PG_V = pmap_valid_bit(pmap);
 	pml4e = pmap_pml4e(pmap, va);
 	if ((*pml4e & PG_V) == 0)
 		return (NULL);
@@ -438,7 +558,9 @@ static __inline pd_entry_t *
 pmap_pde(pmap_t pmap, vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
+	pt_entry_t PG_V;
 
+	PG_V = pmap_valid_bit(pmap);
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe == NULL || (*pdpe & PG_V) == 0)
 		return (NULL);
@@ -460,7 +582,9 @@ static __inline pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *pde;
+	pt_entry_t PG_V;
 
+	PG_V = pmap_valid_bit(pmap);
 	pde = pmap_pde(pmap, va);
 	if (pde == NULL || (*pde & PG_V) == 0)
 		return (NULL);
@@ -490,6 +614,8 @@ vtopte(vm_offset_t va)
 {
 	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
 
+	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
+
 	return (PTmap + ((va >> PAGE_SHIFT) & mask));
 }
 
@@ -498,6 +624,8 @@ vtopde(vm_offset_t va)
 {
 	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
 
+	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
+
 	return (PDmap + ((va >> PDRSHIFT) & mask));
 }
 
@@ -601,22 +729,24 @@ create_pagetables(vm_paddr_t *firstaddr)
 	/* XXX not fully used, underneath 2M pages */
 	pt_p = (pt_entry_t *)KPTphys;
 	for (i = 0; ptoa(i) < *firstaddr; i++)
-		pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G;
+		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
 
 	/* Now map the page tables at their location within PTmap */
 	pd_p = (pd_entry_t *)KPDphys;
 	for (i = 0; i < nkpt; i++)
-		pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
+		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
 	/* Map from zero to end of allocations under 2M pages */
 	/* This replaces some of the KPTphys entries above */
 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
-		pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G;
+		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
+		    X86_PG_G;
 
 	/* And connect up the PD to the PDP (leaving room for L4 pages) */
 	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
 	for (i = 0; i < nkpdpe; i++)
-		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U;
+		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V |
+		    PG_U;
 
 	/*
 	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
@@ -630,36 +760,36 @@ create_pagetables(vm_paddr_t *firstaddr)
 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
 		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G |
-		    PG_M | PG_A;
+		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+		    X86_PG_M | X86_PG_A;
 	}
 	pdp_p = (pdp_entry_t *)DMPDPphys;
 	for (i = 0; i < ndm1g; i++) {
 		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G |
-		    PG_M | PG_A;
+		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+		    X86_PG_M | X86_PG_A;
 	}
 	for (j = 0; i < ndmpdp; i++, j++) {
 		pdp_p[i] = DMPDphys + ptoa(j);
-		pdp_p[i] |= PG_RW | PG_V | PG_U;
+		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
 	p4_p = (pml4_entry_t *)KPML4phys;
 	p4_p[PML4PML4I] = KPML4phys;
-	p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U;
+	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U;
 
 	/* Connect the Direct Map slot(s) up to the PML4. */
 	for (i = 0; i < ndmpdpphys; i++) {
 		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
-		p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U;
 	}
 
 	/* Connect the KVA slots up to the PML4 */
 	for (i = 0; i < NKPML4E; i++) {
 		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
-		p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U;
+		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U;
 	}
 }
 
@@ -705,6 +835,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	CPU_ZERO(&kernel_pmap->pm_save);
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
+	kernel_pmap->pm_flags = pmap_flags;
 
  	/*
 	 * Initialize the global pv list lock.
@@ -948,35 +1079,131 @@ SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, de
  * Low level helper routines.....
  ***************************************************/
 
+static pt_entry_t
+pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
+{
+	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
+
+	switch (pmap->pm_type) {
+	case PT_X86:
+		/* Verify that both PAT bits are not set at the same time */
+		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
+		    ("Invalid PAT bits in entry %#lx", entry));
+
+		/* Swap the PAT bits if one of them is set */
+		if ((entry & x86_pat_bits) != 0)
+			entry ^= x86_pat_bits;
+		break;
+	case PT_EPT:
+		/*
+		 * Nothing to do - the memory attributes are represented
+		 * the same way for regular pages and superpages.
+		 */
+		break;
+	default:
+		panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
+	}
+
+	return (entry);
+}
+
 /*
  * Determine the appropriate bits to set in a PTE or PDE for a specified
  * caching mode.
  */
 static int
-pmap_cache_bits(int mode, boolean_t is_pde)
+pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
 {
 	int cache_bits, pat_flag, pat_idx;
 
 	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
 		panic("Unknown caching mode %d\n", mode);
 
-	/* The PAT bit is different for PTE's and PDE's. */
-	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
+	switch (pmap->pm_type) {
+	case PT_X86:
+		/* The PAT bit is different for PTE's and PDE's. */
+		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
+
+		/* Map the caching mode to a PAT index. */
+		pat_idx = pat_index[mode];
+
+		/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
+		cache_bits = 0;
+		if (pat_idx & 0x4)
+			cache_bits |= pat_flag;
+		if (pat_idx & 0x2)
+			cache_bits |= PG_NC_PCD;
+		if (pat_idx & 0x1)
+			cache_bits |= PG_NC_PWT;
+		break;
+
+	case PT_EPT:
+		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
+		break;
 
-	/* Map the caching mode to a PAT index. */
-	pat_idx = pat_index[mode];
+	default:
+		panic("unsupported pmap type %d", pmap->pm_type);
+	}
 
-	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
-	cache_bits = 0;
-	if (pat_idx & 0x4)
-		cache_bits |= pat_flag;
-	if (pat_idx & 0x2)
-		cache_bits |= PG_NC_PCD;
-	if (pat_idx & 0x1)
-		cache_bits |= PG_NC_PWT;
 	return (cache_bits);
 }
 
+static int
+pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
+{
+	int mask;
+
+	switch (pmap->pm_type) {
+	case PT_X86:
+		mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
+		break;
+	case PT_EPT:
+		mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
+		break;
+	default:
+		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
+	}
+
+	return (mask);
+}
+
+static __inline boolean_t
+pmap_ps_enabled(pmap_t pmap)
+{
+
+	return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
+}
+
+static void
+pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
+{
+
+	switch (pmap->pm_type) {
+	case PT_X86:
+		break;
+	case PT_EPT:
+		/*
+		 * XXX
+		 * This is a little bogus since the generation number is
+		 * supposed to be bumped up when a region of the address
+		 * space is invalidated in the page tables.
+		 *
+		 * In this case the old PDE entry is valid but yet we want
+		 * to make sure that any mappings using the old entry are
+		 * invalidated in the TLB.
+		 *
+		 * The reason this works as expected is because we rendezvous
+		 * "all" host cpus and force any vcpu context to exit as a
+		 * side-effect.
+		 */
+		atomic_add_acq_long(&pmap->pm_eptgen, 1);
+		break;
+	default:
+		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
+	}
+	pde_store(pde, newpde);
+}
+
 /*
  * After changing the page size for the specified virtual address in the page
  * table, flush the corresponding entries from the processor's TLB.  Only the
@@ -985,8 +1212,17 @@ pmap_cache_bits(int mode, boolean_t is_p
  * The calling thread must be pinned to a processor.
  */
 static void
-pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
+pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 {
+	pt_entry_t PG_G;
+
+	if (pmap->pm_type == PT_EPT)
+		return;
+
+	KASSERT(pmap->pm_type == PT_X86,
+	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
+
+	PG_G = pmap_global_bit(pmap);
 
 	if ((newpde & PG_PS) == 0)
 		/* Demotion: flush a specific 2MB page mapping. */
@@ -1048,12 +1284,61 @@ pmap_invalidate_page_pcid(pmap_t pmap, v
  * immutable.  The kernel page table is always active on every
  * processor.
  */
+
+/*
+ * Interrupt the cpus that are executing in the guest context.
+ * This will force the vcpu to exit and the cached EPT mappings
+ * will be invalidated by the host before the next vmresume.
+ */
+static __inline void
+pmap_invalidate_ept(pmap_t pmap)
+{
+
+	sched_pin();
+	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
+	    ("pmap_invalidate_ept: absurd pm_active"));
+
+	/*
+	 * The TLB mappings associated with a vcpu context are not
+	 * flushed each time a different vcpu is chosen to execute.
+	 *
+	 * This is in contrast with a process's vtop mappings that
+	 * are flushed from the TLB on each context switch.
+	 *
+	 * Therefore we need to do more than just a TLB shootdown on
+	 * the active cpus in 'pmap->pm_active'. To do this we keep
+	 * track of the number of invalidations performed on this pmap.
+	 *
+	 * Each vcpu keeps a cache of this counter and compares it
+	 * just before a vmresume. If the counter is out-of-date an
+	 * invept will be done to flush stale mappings from the TLB.
+	 */
+	atomic_add_acq_long(&pmap->pm_eptgen, 1);
+
+	/*
+	 * Force the vcpu to exit and trap back into the hypervisor.
+	 *
+	 * XXX this is not optimal because IPI_AST builds a trapframe
+	 * whereas all we need is an 'eoi' followed by 'iret'.
+	 */
+	ipi_selected(pmap->pm_active, IPI_AST);
+	sched_unpin();
+}
+
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	cpuset_t other_cpus;
 	u_int cpuid;
 
+	if (pmap->pm_type == PT_EPT) {
+		pmap_invalidate_ept(pmap);
+		return;
+	}
+
+	KASSERT(pmap->pm_type == PT_X86,
+	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
+
 	sched_pin();
 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 		if (!pmap_pcid_enabled) {
@@ -1124,6 +1409,14 @@ pmap_invalidate_range(pmap_t pmap, vm_of
 	vm_offset_t addr;
 	u_int cpuid;
 
+	if (pmap->pm_type == PT_EPT) {
+		pmap_invalidate_ept(pmap);
+		return;
+	}
+
+	KASSERT(pmap->pm_type == PT_X86,
+	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
+
 	sched_pin();
 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
 		if (!pmap_pcid_enabled) {
@@ -1175,6 +1468,14 @@ pmap_invalidate_all(pmap_t pmap)
 	uint64_t cr3;
 	u_int cpuid;
 
+	if (pmap->pm_type == PT_EPT) {
+		pmap_invalidate_ept(pmap);
+		return;
+	}
+
+	KASSERT(pmap->pm_type == PT_X86,
+	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
+
 	sched_pin();
 	cpuid = PCPU_GET(cpuid);
 	if (pmap == kernel_pmap ||
@@ -1243,6 +1544,7 @@ pmap_invalidate_cache(void)
 
 struct pde_action {
 	cpuset_t invalidate;	/* processors that invalidate their TLB */
+	pmap_t pmap;
 	vm_offset_t va;
 	pd_entry_t *pde;
 	pd_entry_t newpde;
@@ -1255,7 +1557,7 @@ pmap_update_pde_action(void *arg)
 	struct pde_action *act = arg;
 
 	if (act->store == PCPU_GET(cpuid))
-		pde_store(act->pde, act->newpde);
+		pmap_update_pde_store(act->pmap, act->pde, act->newpde);
 }
 
 static void
@@ -1264,7 +1566,7 @@ pmap_update_pde_teardown(void *arg)
 	struct pde_action *act = arg;
 
 	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
-		pmap_update_pde_invalidate(act->va, act->newpde);
+		pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
 }
 
 /*
@@ -1286,7 +1588,7 @@ pmap_update_pde(pmap_t pmap, vm_offset_t
 	cpuid = PCPU_GET(cpuid);
 	other_cpus = all_cpus;
 	CPU_CLR(cpuid, &other_cpus);
-	if (pmap == kernel_pmap)
+	if (pmap == kernel_pmap || pmap->pm_type == PT_EPT)
 		active = all_cpus;
 	else {
 		active = pmap->pm_active;
@@ -1296,6 +1598,7 @@ pmap_update_pde(pmap_t pmap, vm_offset_t
 		act.store = cpuid;
 		act.invalidate = active;
 		act.va = va;
+		act.pmap = pmap;
 		act.pde = pde;
 		act.newpde = newpde;
 		CPU_SET(cpuid, &active);
@@ -1303,9 +1606,9 @@ pmap_update_pde(pmap_t pmap, vm_offset_t
 		    smp_no_rendevous_barrier, pmap_update_pde_action,
 		    pmap_update_pde_teardown, &act);
 	} else {
-		pde_store(pde, newpde);
+		pmap_update_pde_store(pmap, pde, newpde);
 		if (CPU_ISSET(cpuid, &active))
-			pmap_update_pde_invalidate(va, newpde);
+			pmap_update_pde_invalidate(pmap, va, newpde);
 	}
 	sched_unpin();
 }
@@ -1318,8 +1621,17 @@ PMAP_INLINE void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
-	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
-		invlpg(va);
+	switch (pmap->pm_type) {
+	case PT_X86:
+		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+			invlpg(va);
+		break;
+	case PT_EPT:
+		pmap->pm_eptgen++;
+		break;
+	default:
+		panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type);
+	}
 }
 
 PMAP_INLINE void
@@ -1327,17 +1639,35 @@ pmap_invalidate_range(pmap_t pmap, vm_of
 {
 	vm_offset_t addr;
 
-	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
-		for (addr = sva; addr < eva; addr += PAGE_SIZE)
-			invlpg(addr);
+	switch (pmap->pm_type) {
+	case PT_X86:
+		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+			for (addr = sva; addr < eva; addr += PAGE_SIZE)
+				invlpg(addr);
+		break;
+	case PT_EPT:
+		pmap->pm_eptgen++;
+		break;
+	default:
+		panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type);
+	}
 }
 
 PMAP_INLINE void
 pmap_invalidate_all(pmap_t pmap)
 {
 
-	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
-		invltlb();
+	switch (pmap->pm_type) {
+	case PT_X86:
+		if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
+			invltlb();
+		break;
+	case PT_EPT:
+		pmap->pm_eptgen++;
+		break;
+	default:
+		panic("pmap_invalidate_all: unknown type %d", pmap->pm_type);
+	}
 }
 
 PMAP_INLINE void
@@ -1351,9 +1681,9 @@ static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 
-	pde_store(pde, newpde);
+	pmap_update_pde_store(pmap, pde, newpde);
 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
-		pmap_update_pde_invalidate(va, newpde);
+		pmap_update_pde_invalidate(pmap, va, newpde);
 	else
 		CPU_ZERO(&pmap->pm_save);
 }
@@ -1455,10 +1785,11 @@ pmap_extract(pmap_t pmap, vm_offset_t va
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
-	pt_entry_t *pte;
+	pt_entry_t *pte, PG_V;
 	vm_paddr_t pa;
 
 	pa = 0;
+	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -1493,12 +1824,14 @@ vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pd_entry_t pde, *pdep;
-	pt_entry_t pte;
+	pt_entry_t pte, PG_RW, PG_V;
 	vm_paddr_t pa;
 	vm_page_t m;
 
 	pa = 0;
 	m = NULL;
+	PG_RW = pmap_rw_bit(pmap);
+	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 retry:
 	pdep = pmap_pde(pmap, va);
@@ -1571,16 +1904,18 @@ pmap_kenter(vm_offset_t va, vm_paddr_t p
 	pt_entry_t *pte;
 
 	pte = vtopte(va);
-	pte_store(pte, pa | PG_RW | PG_V | PG_G);
+	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
 }
 
 static __inline void
 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 {
 	pt_entry_t *pte;
+	int cache_bits;
 
 	pte = vtopte(va);
-	pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
+	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
+	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
 }
 
 /*
@@ -1629,20 +1964,22 @@ pmap_qenter(vm_offset_t sva, vm_page_t *
 {
 	pt_entry_t *endpte, oldpte, pa, *pte;
 	vm_page_t m;
+	int cache_bits;
 
 	oldpte = 0;
 	pte = vtopte(sva);
 	endpte = pte + count;
 	while (pte < endpte) {
 		m = *ma++;
-		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
-		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
+		cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
+		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
+		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
 			oldpte |= *pte;
-			pte_store(pte, pa | PG_G | PG_RW | PG_V);
+			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
 		}
 		pte++;
 	}
-	if (__predict_false((oldpte & PG_V) != 0))
+	if (__predict_false((oldpte & X86_PG_V) != 0))
 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
 		    PAGE_SIZE);
 }
@@ -1841,6 +2178,7 @@ pmap_pinit0(pmap_t pmap)
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
+	pmap->pm_flags = pmap_flags;
 }
 
 /*
@@ -1848,9 +2186,10 @@ pmap_pinit0(pmap_t pmap)
  * such as one in a vmspace structure.
  */
 int
-pmap_pinit(pmap_t pmap)
+pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 {
 	vm_page_t pml4pg;
+	vm_paddr_t pml4phys;
 	int i;
 
 	/*
@@ -1860,41 +2199,61 @@ pmap_pinit(pmap_t pmap)
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
 		VM_WAIT;
 
-	pmap->pm_cr3 = VM_PAGE_TO_PHYS(pml4pg);
-	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pmap->pm_cr3);
+	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
+	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
+	pmap->pm_pcid = -1;
+	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
 
 	if ((pml4pg->flags & PG_ZERO) == 0)
 		pagezero(pmap->pm_pml4);
 
-	/* Wire in kernel global address entries. */
-	for (i = 0; i < NKPML4E; i++) {
-		pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) |
-		    PG_RW | PG_V | PG_U;
-	}
-	for (i = 0; i < ndmpdpphys; i++) {
-		pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
-		    PG_RW | PG_V | PG_U;
-	}
+	/*
+	 * Do not install the host kernel mappings in the nested page
+	 * tables. These mappings are meaningless in the guest physical
+	 * address space.
+	 */
+	if ((pmap->pm_type = pm_type) == PT_X86) {
+		pmap->pm_cr3 = pml4phys;
+
+		/* Wire in kernel global address entries. */
+		for (i = 0; i < NKPML4E; i++) {
+			pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) |
+			    X86_PG_RW | X86_PG_V | PG_U;
+		}
+		for (i = 0; i < ndmpdpphys; i++) {
+			pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) |
+			    X86_PG_RW | X86_PG_V | PG_U;
+		}
+
+		/* install self-referential address mapping entry(s) */
+		pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
+		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 
-	/* install self-referential address mapping entry(s) */
-	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
+		if (pmap_pcid_enabled) {
+			pmap->pm_pcid = alloc_unr(&pcid_unr);
+			if (pmap->pm_pcid != -1)
+				pmap->pm_cr3 |= pmap->pm_pcid;
+		}
+	}
 
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
-	if (pmap_pcid_enabled) {
-		pmap->pm_pcid = alloc_unr(&pcid_unr);
-		if (pmap->pm_pcid != -1)
-			pmap->pm_cr3 |= pmap->pm_pcid;
-	} else {
-		pmap->pm_pcid = -1;
-	}
+	pmap->pm_flags = flags;
+	pmap->pm_eptgen = 0;
 	CPU_ZERO(&pmap->pm_save);
 
 	return (1);
 }
 
+int
+pmap_pinit(pmap_t pmap)
+{
+
+	return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
+}
+
 /*
  * This routine is called if the desired page table page does not exist.
  *
@@ -1910,9 +2269,15 @@ static vm_page_t
 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, pdppg, pdpg;
+	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
+	PG_A = pmap_accessed_bit(pmap);
+	PG_M = pmap_modified_bit(pmap);
+	PG_V = pmap_valid_bit(pmap);
+	PG_RW = pmap_rw_bit(pmap);
+
 	/*
 	 * Allocate a page table page.
 	 */
@@ -2040,9 +2405,11 @@ static vm_page_t
 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t pdpindex, ptepindex;
-	pdp_entry_t *pdpe;
+	pdp_entry_t *pdpe, PG_V;
 	vm_page_t pdpg;
 
+	PG_V = pmap_valid_bit(pmap);
+
 retry:
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -2064,9 +2431,11 @@ static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t ptepindex;
-	pd_entry_t *pd;
+	pd_entry_t *pd, PG_V;
 	vm_page_t m;
 
+	PG_V = pmap_valid_bit(pmap);
+
 	/*
 	 * Calculate pagetable page index
 	 */
@@ -2140,7 +2509,7 @@ pmap_release(pmap_t pmap)
 		pmap_invalidate_all(pmap);
 	}
 
-	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
 
 	for (i = 0; i < NKPML4E; i++)	/* KVA */
 		pmap->pm_pml4[KPML4BASE + i] = 0;
@@ -2211,7 +2580,7 @@ pmap_growkernel(vm_offset_t addr)
 		addr = kernel_map->max_offset;
 	while (kernel_vm_end < addr) {
 		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
-		if ((*pdpe & PG_V) == 0) {
+		if ((*pdpe & X86_PG_V) == 0) {
 			/* We need a new PDP entry */
 			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
 			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
@@ -2221,12 +2590,12 @@ pmap_growkernel(vm_offset_t addr)
 			if ((nkpg->flags & PG_ZERO) == 0)
 				pmap_zero_page(nkpg);
 			paddr = VM_PAGE_TO_PHYS(nkpg);
-			*pdpe = (pdp_entry_t)
-				(paddr | PG_V | PG_RW | PG_A | PG_M);
+			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
+			    X86_PG_A | X86_PG_M);
 			continue; /* try again */
 		}
 		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

