svn commit: r328470 - in head/sys: amd64/amd64 amd64/include i386/i386 x86/include x86/x86

Konstantin Belousov kib at FreeBSD.org
Sat Jan 27 11:49:40 UTC 2018


Author: kib
Date: Sat Jan 27 11:49:37 2018
New Revision: 328470
URL: https://svnweb.freebsd.org/changeset/base/328470

Log:
  Use PCID to optimize PTI.
  
  Use PCID to avoid a complete TLB shootdown when switching between
  user and kernel mode with PTI enabled.
  
  I use a model close to what I read about KAISER: the user-mode PCID
  has a 1:1 correspondence to the kernel-mode PCID, obtained by setting
  bit 11 in the PCID.  A full kernel-mode TLB shootdown is performed on
  context switches, since KVA TLB invalidation only works in the
  current pmap.  The user-mode part of the TLB is flushed on pmap
  activations as well.
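
  As a rough sketch of that pairing (a hypothetical helper, built from
  the PMAP_PCID_USER_PT and PMAP_PCID_OVERMAX_KERN constants added to
  pmap.h below):

	#include <sys/types.h>

	#define	PMAP_PCID_OVERMAX_KERN	0x800	/* kernel PCIDs: [0, 0x800) */
	#define	PMAP_PCID_USER_PT	0x800	/* bit 11 marks the user PT */

	/* The user-mode PCID paired with a given kernel-mode PCID. */
	static inline uint32_t
	user_pcid(uint32_t kern_pcid)
	{

		return (kern_pcid | PMAP_PCID_USER_PT);
	}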
  
  Similarly, IPI TLB shootdowns must handle both the kernel and the
  user address space for each address.  Note that machines which
  implement PCID but lack the INVPCID instruction cause the usual
  complications in the IPI handlers, due to the need to switch to the
  target PCID temporarily.  This is racy, but because we disable
  interrupts in pmap_activate_sw() for the PCID/no-INVPCID case, an
  IPI handler cannot see an inconsistent state of the CPU PCID vs. the
  PCPU pmap/kcr3/ucr3 pointers.
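
  In outline, the no-INVPCID activation path keeps interrupts disabled
  across the PCID allocation, the CR3 load, and the PCPU updates, so a
  shootdown IPI can never observe them half-updated.  A simplified
  sketch of the ordering (not the exact pmap_activate_sw() code):

	rflags = intr_disable();	/* block TLB shootdown IPIs */
	cached = pmap_pcid_alloc(pmap, cpuid);
	load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached);
	PCPU_SET(curpmap, pmap);
	if (pti) {
		PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
		PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
	}
	intr_restore(rflags);	/* IPIs again see a consistent state */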
  
  On the other hand, on kernel/user switches the CR3_PCID_SAVE bit is
  set and we do not clear the TLB.
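
  Concretely, CR3_PCID_SAVE is bit 63 of the value loaded into CR3;
  with PCID enabled, the CPU then keeps the TLB entries tagged with the
  target PCID across the CR3 write.  The switch therefore uses values
  built as in this sketch:

	kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
	ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;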
  
  I can imagine an alternative use of PCID, where only one PCID is
  allocated for the kernel pmap.  Then there would be no need to shoot
  down kernel TLB entries on context switch.  But copyout(9) would need
  either to use a method similar to proc_rwmem() to access the
  userspace data, or (in reverse) to provide a temporary mapping of the
  kernel buffer into the user-mode PCID and use a trampoline for the
  copy.
  
  Reviewed by:	markj (previous version)
  Tested by:	pho
  Discussed with:	alc (some aspects)
  Sponsored by:	The FreeBSD Foundation
  MFC after:	3 weeks
  Differential revision:	https://reviews.freebsd.org/D13985

Modified:
  head/sys/amd64/amd64/apic_vector.S
  head/sys/amd64/amd64/mp_machdep.c
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/amd64/support.S
  head/sys/amd64/include/pmap.h
  head/sys/amd64/include/smp.h
  head/sys/i386/i386/pmap.c
  head/sys/i386/i386/vm_machdep.c
  head/sys/x86/include/x86_smp.h
  head/sys/x86/x86/mp_x86.c

Modified: head/sys/amd64/amd64/apic_vector.S
==============================================================================
--- head/sys/amd64/amd64/apic_vector.S	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/amd64/amd64/apic_vector.S	Sat Jan 27 11:49:37 2018	(r328470)
@@ -184,10 +184,14 @@ invltlb_ret:
 	call	invltlb_pcid_handler
 	jmp	invltlb_ret
 
-	INTR_HANDLER invltlb_invpcid
+	INTR_HANDLER invltlb_invpcid_nopti
 	call	invltlb_invpcid_handler
 	jmp	invltlb_ret
 
+	INTR_HANDLER invltlb_invpcid_pti
+	call	invltlb_invpcid_pti_handler
+	jmp	invltlb_ret
+
 /*
  * Single page TLB shootdown
  */
@@ -195,11 +199,27 @@ invltlb_ret:
 	call	invlpg_handler
 	jmp	invltlb_ret
 
+	INTR_HANDLER invlpg_invpcid
+	call	invlpg_invpcid_handler
+	jmp	invltlb_ret
+
+	INTR_HANDLER invlpg_pcid
+	call	invlpg_pcid_handler
+	jmp	invltlb_ret
+
 /*
  * Page range TLB shootdown.
  */
 	INTR_HANDLER invlrng
 	call	invlrng_handler
+	jmp	invltlb_ret
+
+	INTR_HANDLER invlrng_invpcid
+	call	invlrng_invpcid_handler
+	jmp	invltlb_ret
+
+	INTR_HANDLER invlrng_pcid
+	call	invlrng_pcid_handler
 	jmp	invltlb_ret
 
 /*

Modified: head/sys/amd64/amd64/mp_machdep.c
==============================================================================
--- head/sys/amd64/amd64/mp_machdep.c	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/amd64/amd64/mp_machdep.c	Sat Jan 27 11:49:37 2018	(r328470)
@@ -133,20 +133,30 @@ cpu_mp_start(void)
 	/* Install an inter-CPU IPI for TLB invalidation */
 	if (pmap_pcid_enabled) {
 		if (invpcid_works) {
-			setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_invpcid_pti) :
-			    IDTVEC(invltlb_invpcid), SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLTLB, pti ?
+			    IDTVEC(invltlb_invpcid_pti_pti) :
+			    IDTVEC(invltlb_invpcid_nopti), SDT_SYSIGT,
+			    SEL_KPL, 0);
+			setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_invpcid_pti) :
+			    IDTVEC(invlpg_invpcid), SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_invpcid_pti) :
+			    IDTVEC(invlrng_invpcid), SDT_SYSIGT, SEL_KPL, 0);
 		} else {
 			setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) :
 			    IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pcid_pti) :
+			    IDTVEC(invlpg_pcid), SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pcid_pti) :
+			    IDTVEC(invlrng_pcid), SDT_SYSIGT, SEL_KPL, 0);
 		}
 	} else {
 		setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb),
 		    SDT_SYSIGT, SEL_KPL, 0);
+		setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg),
+		    SDT_SYSIGT, SEL_KPL, 0);
+		setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng),
+		    SDT_SYSIGT, SEL_KPL, 0);
 	}
-	setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg),
-	    SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng),
-	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for cache invalidation. */
 	setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache),
@@ -440,9 +450,43 @@ invltlb_invpcid_handler(void)
 }
 
 void
-invltlb_pcid_handler(void)
+invltlb_invpcid_pti_handler(void)
 {
+	struct invpcid_descr d;
 	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;
+	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
+	d.pad = 0;
+	d.addr = 0;
+	if (smp_tlb_pmap == kernel_pmap) {
+		/*
+		 * This invalidation actually needs to clear kernel
+		 * mappings from the TLB in the current pmap, but
+		 * since we were asked for the flush in the kernel
+		 * pmap, achieve it by performing global flush.
+		 */
+		invpcid(&d, INVPCID_CTXGLOB);
+	} else {
+		invpcid(&d, INVPCID_CTX);
+		d.pcid |= PMAP_PCID_USER_PT;
+		invpcid(&d, INVPCID_CTX);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invltlb_pcid_handler(void)
+{
+	uint64_t kcr3, ucr3;
+	uint32_t generation, pcid;
   
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
@@ -463,9 +507,132 @@ invltlb_pcid_handler(void)
 		 * CPU.
 		 */
 		if (PCPU_GET(curpmap) == smp_tlb_pmap) {
-			load_cr3(smp_tlb_pmap->pm_cr3 |
-			    smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
+			pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
+			kcr3 = smp_tlb_pmap->pm_cr3 | pcid;
+			ucr3 = smp_tlb_pmap->pm_ucr3;
+			if (ucr3 != PMAP_NO_CR3) {
+				ucr3 |= PMAP_PCID_USER_PT | pcid;
+				pmap_pti_pcid_invalidate(ucr3, kcr3);
+			} else
+				load_cr3(kcr3);
 		}
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlpg_invpcid_handler(void)
+{
+	struct invpcid_descr d;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	invlpg(smp_tlb_addr1);
+	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
+		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
+		    PMAP_PCID_USER_PT;
+		d.pad = 0;
+		d.addr = smp_tlb_addr1;
+		invpcid(&d, INVPCID_ADDR);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlpg_pcid_handler(void)
+{
+	uint64_t kcr3, ucr3;
+	uint32_t generation;
+	uint32_t pcid;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	invlpg(smp_tlb_addr1);
+	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
+	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
+		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
+		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+		pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlrng_invpcid_handler(void)
+{
+	struct invpcid_descr d;
+	vm_offset_t addr, addr2;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	do {
+		invlpg(addr);
+		addr += PAGE_SIZE;
+	} while (addr < addr2);
+	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
+		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
+		    PMAP_PCID_USER_PT;
+		d.pad = 0;
+		d.addr = smp_tlb_addr1;
+		do {
+			invpcid(&d, INVPCID_ADDR);
+			d.addr += PAGE_SIZE;
+		} while (d.addr < addr2);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlrng_pcid_handler(void)
+{
+	vm_offset_t addr, addr2;
+	uint64_t kcr3, ucr3;
+	uint32_t generation;
+	uint32_t pcid;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	do {
+		invlpg(addr);
+		addr += PAGE_SIZE;
+	} while (addr < addr2);
+	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
+	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
+		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
+		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+		pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2);
 	}
 	PCPU_SET(smp_tlb_done, generation);
 }

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/amd64/amd64/pmap.c	Sat Jan 27 11:49:37 2018	(r328470)
@@ -1060,6 +1060,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	kernel_pmap->pm_cr3 = KPML4phys;
+	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	kernel_pmap->pm_flags = pmap_flags;
@@ -1097,8 +1098,6 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	pmap_init_pat();
 
 	/* Initialize TLB Context Id. */
-	if (pti)
-		pmap_pcid_enabled = 0;
 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
 		/* Check for INVPCID support */
@@ -1576,6 +1575,9 @@ void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	cpuset_t *mask;
+	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
 	u_int cpuid, i;
 
 	if (pmap_type_guest(pmap)) {
@@ -1592,9 +1594,32 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 		mask = &all_cpus;
 	} else {
 		cpuid = PCPU_GET(cpuid);
-		if (pmap == PCPU_GET(curpmap))
+		if (pmap == PCPU_GET(curpmap)) {
 			invlpg(va);
-		else if (pmap_pcid_enabled)
+			if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
+				/*
+				 * Disable context switching. pm_pcid
+				 * is recalculated on switch, which
+				 * might make us use wrong pcid below.
+				 */
+				critical_enter();
+				pcid = pmap->pm_pcids[cpuid].pm_pcid;
+
+				if (invpcid_works) {
+					d.pcid = pcid | PMAP_PCID_USER_PT;
+					d.pad = 0;
+					d.addr = va;
+					invpcid(&d, INVPCID_ADDR);
+				} else {
+					kcr3 = pmap->pm_cr3 | pcid |
+					    CR3_PCID_SAVE;
+					ucr3 = pmap->pm_ucr3 | pcid |
+					    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+					pmap_pti_pcid_invlpg(ucr3, kcr3, va);
+				}
+				critical_exit();
+			}
+		} else if (pmap_pcid_enabled)
 			pmap->pm_pcids[cpuid].pm_gen = 0;
 		if (pmap_pcid_enabled) {
 			CPU_FOREACH(i) {
@@ -1604,7 +1629,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 		}
 		mask = &pmap->pm_active;
 	}
-	smp_masked_invlpg(*mask, va);
+	smp_masked_invlpg(*mask, va, pmap);
 	sched_unpin();
 }
 
@@ -1615,7 +1640,10 @@ void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	cpuset_t *mask;
+	struct invpcid_descr d;
 	vm_offset_t addr;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
 	u_int cpuid, i;
 
 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
@@ -1641,6 +1669,26 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm
 		if (pmap == PCPU_GET(curpmap)) {
 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
 				invlpg(addr);
+			if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
+				critical_enter();
+				pcid = pmap->pm_pcids[cpuid].pm_pcid;
+				if (invpcid_works) {
+					d.pcid = pcid | PMAP_PCID_USER_PT;
+					d.pad = 0;
+					d.addr = sva;
+					for (; d.addr < eva; d.addr +=
+					    PAGE_SIZE)
+						invpcid(&d, INVPCID_ADDR);
+				} else {
+					kcr3 = pmap->pm_cr3 | pcid |
+					    CR3_PCID_SAVE;
+					ucr3 = pmap->pm_ucr3 | pcid |
+					    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+					pmap_pti_pcid_invlrng(ucr3, kcr3, sva,
+					    eva);
+				}
+				critical_exit();
+			}
 		} else if (pmap_pcid_enabled) {
 			pmap->pm_pcids[cpuid].pm_gen = 0;
 		}
@@ -1652,7 +1700,7 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm
 		}
 		mask = &pmap->pm_active;
 	}
-	smp_masked_invlpg_range(*mask, sva, eva);
+	smp_masked_invlpg_range(*mask, sva, eva, pmap);
 	sched_unpin();
 }
 
@@ -1661,6 +1709,8 @@ pmap_invalidate_all(pmap_t pmap)
 {
 	cpuset_t *mask;
 	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
 	u_int cpuid, i;
 
 	if (pmap_type_guest(pmap)) {
@@ -1684,15 +1734,29 @@ pmap_invalidate_all(pmap_t pmap)
 		cpuid = PCPU_GET(cpuid);
 		if (pmap == PCPU_GET(curpmap)) {
 			if (pmap_pcid_enabled) {
+				critical_enter();
+				pcid = pmap->pm_pcids[cpuid].pm_pcid;
 				if (invpcid_works) {
-					d.pcid = pmap->pm_pcids[cpuid].pm_pcid;
+					d.pcid = pcid;
 					d.pad = 0;
 					d.addr = 0;
 					invpcid(&d, INVPCID_CTX);
+					if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+						d.pcid |= PMAP_PCID_USER_PT;
+						invpcid(&d, INVPCID_CTX);
+					}
 				} else {
-					load_cr3(pmap->pm_cr3 | pmap->pm_pcids
-					    [PCPU_GET(cpuid)].pm_pcid);
+					kcr3 = pmap->pm_cr3 | pcid;
+					ucr3 = pmap->pm_ucr3;
+					if (ucr3 != PMAP_NO_CR3) {
+						ucr3 |= pcid | PMAP_PCID_USER_PT;
+						pmap_pti_pcid_invalidate(ucr3,
+						    kcr3);
+					} else {
+						load_cr3(kcr3);
+					}
 				}
+				critical_exit();
 			} else {
 				invltlb();
 			}
@@ -1797,6 +1861,9 @@ pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
+	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
@@ -1805,16 +1872,35 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
 
-	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
+	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 		invlpg(va);
-	else if (pmap_pcid_enabled)
+		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
+		    pmap->pm_ucr3 != PMAP_NO_CR3) {
+			critical_enter();
+			pcid = pmap->pm_pcids[0].pm_pcid;
+			if (invpcid_works) {
+				d.pcid = pcid | PMAP_PCID_USER_PT;
+				d.pad = 0;
+				d.addr = va;
+				invpcid(&d, INVPCID_ADDR);
+			} else {
+				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+				ucr3 = pmap->pm_ucr3 | pcid |
+				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
+			}
+			critical_exit();
+		}
+	} else if (pmap_pcid_enabled)
 		pmap->pm_pcids[0].pm_gen = 0;
 }
 
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
+	struct invpcid_descr d;
 	vm_offset_t addr;
+	uint64_t kcr3, ucr3;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
@@ -1826,6 +1912,25 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
+		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
+		    pmap->pm_ucr3 != PMAP_NO_CR3) {
+			critical_enter();
+			if (invpcid_works) {
+				d.pcid = pmap->pm_pcids[0].pm_pcid |
+				    PMAP_PCID_USER_PT;
+				d.pad = 0;
+				d.addr = sva;
+				for (; d.addr < eva; d.addr += PAGE_SIZE)
+					invpcid(&d, INVPCID_ADDR);
+			} else {
+				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
+				    pm_pcid | CR3_PCID_SAVE;
+				ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
+				    pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
+			}
+			critical_exit();
+		}
 	} else if (pmap_pcid_enabled) {
 		pmap->pm_pcids[0].pm_gen = 0;
 	}
@@ -1835,6 +1940,7 @@ void
 pmap_invalidate_all(pmap_t pmap)
 {
 	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
@@ -1852,15 +1958,26 @@ pmap_invalidate_all(pmap_t pmap)
 		}
 	} else if (pmap == PCPU_GET(curpmap)) {
 		if (pmap_pcid_enabled) {
+			critical_enter();
 			if (invpcid_works) {
 				d.pcid = pmap->pm_pcids[0].pm_pcid;
 				d.pad = 0;
 				d.addr = 0;
 				invpcid(&d, INVPCID_CTX);
+				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+					d.pcid |= PMAP_PCID_USER_PT;
+					invpcid(&d, INVPCID_CTX);
+				}
 			} else {
-				load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0].
-				    pm_pcid);
+				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
+				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+					ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
+					    0].pm_pcid | PMAP_PCID_USER_PT;
+					pmap_pti_pcid_invalidate(ucr3, kcr3);
+				} else
+					load_cr3(kcr3);
 			}
+			critical_exit();
 		} else {
 			invltlb();
 		}
@@ -2398,7 +2515,8 @@ pmap_pinit0(pmap_t pmap)
 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	pmap->pm_pml4u = NULL;
 	pmap->pm_cr3 = KPML4phys;
-	pmap->pm_ucr3 = ~0UL;
+	/* hack to keep pmap_pti_pcid_invalidate() alive */
+	pmap->pm_ucr3 = PMAP_NO_CR3;
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
@@ -2408,7 +2526,7 @@ pmap_pinit0(pmap_t pmap)
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
 		if (!pti)
-			__pcpu[i].pc_kcr3 = ~0ul;
+			__pcpu[i].pc_kcr3 = PMAP_NO_CR3;
 	}
 	PCPU_SET(curpmap, kernel_pmap);
 	pmap_activate(curthread);
@@ -2472,7 +2590,8 @@ pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, i
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
 	}
-	pmap->pm_cr3 = ~0l;	/* initialize to an invalid value */
+	pmap->pm_cr3 = PMAP_NO_CR3;	/* initialize to an invalid value */
+	pmap->pm_ucr3 = PMAP_NO_CR3;
 	pmap->pm_pml4u = NULL;
 
 	pmap->pm_type = pm_type;
@@ -7134,13 +7253,15 @@ pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
 
 	CRITICAL_ASSERT(curthread);
 	gen = PCPU_GET(pcid_gen);
-	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
-	    pmap->pm_pcids[cpuid].pm_gen == gen)
+	if (!pti && (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
+	    pmap->pm_pcids[cpuid].pm_gen == gen))
 		return (CR3_PCID_SAVE);
 	pcid_next = PCPU_GET(pcid_next);
-	KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x",
-	    cpuid, pcid_next));
-	if (pcid_next == PMAP_PCID_OVERMAX) {
+	KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
+	    (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
+	    ("cpu %d pcid_next %#x", cpuid, pcid_next));
+	if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
+	    (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
 		new_gen = gen + 1;
 		if (new_gen == 0)
 			new_gen = 1;
@@ -7159,7 +7280,8 @@ void
 pmap_activate_sw(struct thread *td)
 {
 	pmap_t oldpmap, pmap;
-	uint64_t cached, cr3;
+	struct invpcid_descr d;
+	uint64_t cached, cr3, kcr3, ucr3;
 	register_t rflags;
 	u_int cpuid;
 
@@ -7215,6 +7337,32 @@ pmap_activate_sw(struct thread *td)
 				PCPU_INC(pm_save_cnt);
 		}
 		PCPU_SET(curpmap, pmap);
+		if (pti) {
+			kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
+			ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
+			    PMAP_PCID_USER_PT;
+
+			/*
+			 * Manually invalidate translations cached
+			 * from the user page table, which are not
+			 * flushed by reload of cr3 with the kernel
+			 * page table pointer above.
+			 */
+			if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+				if (invpcid_works) {
+					d.pcid = PMAP_PCID_USER_PT |
+					    pmap->pm_pcids[cpuid].pm_pcid;
+					d.pad = 0;
+					d.addr = 0;
+					invpcid(&d, INVPCID_CTX);
+				} else {
+					pmap_pti_pcid_invalidate(ucr3, kcr3);
+				}
+			}
+
+			PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
+			PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
+		}
 		if (!invpcid_works)
 			intr_restore(rflags);
 	} else if (cr3 != pmap->pm_cr3) {

Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/amd64/amd64/support.S	Sat Jan 27 11:49:37 2018	(r328470)
@@ -802,3 +802,51 @@ msr_onfault:
 	movl	$EFAULT,%eax
 	POP_FRAME_POINTER
 	ret
+
+/*
+ * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
+ * Invalidates address space addressed by ucr3, then returns to kcr3.
+ * Done in assembler to ensure no other memory accesses happen while
+ * on ucr3.
+ */
+	ALIGN_TEXT
+ENTRY(pmap_pti_pcid_invalidate)
+	pushfq
+	cli
+	movq	%rdi,%cr3	/* to user page table */
+	movq	%rsi,%cr3	/* back to kernel */
+	popfq
+	retq
+
+/*
+ * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
+ * Invalidates virtual address va in address space ucr3, then returns to kcr3.
+ */
+	ALIGN_TEXT
+ENTRY(pmap_pti_pcid_invlpg)
+	pushfq
+	cli
+	movq	%rdi,%cr3	/* to user page table */
+	invlpg	(%rdx)
+	movq	%rsi,%cr3	/* back to kernel */
+	popfq
+	retq
+
+/*
+ * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
+ *     vm_offset_t eva);
+ * Invalidates virtual addresses between sva and eva in address space ucr3,
+ * then returns to kcr3.
+ */
+	ALIGN_TEXT
+ENTRY(pmap_pti_pcid_invlrng)
+	pushfq
+	cli
+	movq	%rdi,%cr3	/* to user page table */
+1:	invlpg	(%rdx)
+	addq	$PAGE_SIZE,%rdx
+	cmpq	%rdx,%rcx
+	ja	1b
+	movq	%rsi,%cr3	/* back to kernel */
+	popfq
+	retq
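
A C rendition of the range helper, for illustration only; the real
routine must stay in assembler so that no stray memory access can occur
while the user page table is active:

	/* Sketch: what pmap_pti_pcid_invlrng() does, expressed in C. */
	void
	pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
	    vm_offset_t eva)
	{
		register_t rflags;

		rflags = intr_disable();	/* pushfq; cli */
		load_cr3(ucr3);			/* to user page table */
		do {
			invlpg(sva);
			sva += PAGE_SIZE;
		} while (sva < eva);
		load_cr3(kcr3);			/* back to kernel */
		intr_restore(rflags);		/* popfq */
	}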

Modified: head/sys/amd64/include/pmap.h
==============================================================================
--- head/sys/amd64/include/pmap.h	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/amd64/include/pmap.h	Sat Jan 27 11:49:37 2018	(r328470)
@@ -225,7 +225,11 @@
 #define	PMAP_PCID_NONE		0xffffffff
 #define	PMAP_PCID_KERN		0
 #define	PMAP_PCID_OVERMAX	0x1000
+#define	PMAP_PCID_OVERMAX_KERN	0x800
+#define	PMAP_PCID_USER_PT	0x800
 
+#define	PMAP_NO_CR3		(~0UL)
+
 #ifndef LOCORE
 
 #include <sys/queue.h>
@@ -433,6 +437,10 @@ boolean_t pmap_map_io_transient(vm_page_t *, vm_offset
 void	pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
 void	pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec);
 void	pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva);
+void	pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
+void	pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
+void	pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
+	    vm_offset_t eva);
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */

Modified: head/sys/amd64/include/smp.h
==============================================================================
--- head/sys/amd64/include/smp.h	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/amd64/include/smp.h	Sat Jan 27 11:49:37 2018	(r328470)
@@ -28,15 +28,23 @@ extern u_int32_t		mptramp_pagetables;
 
 /* IPI handlers */
 inthand_t
-	IDTVEC(invltlb_pcid),	/* TLB shootdowns - global, pcid */
-	IDTVEC(invltlb_invpcid),/* TLB shootdowns - global, invpcid */
 	IDTVEC(justreturn),	/* interrupt CPU with minimum overhead */
-	IDTVEC(invltlb_pcid_pti),
-	IDTVEC(invltlb_invpcid_pti),
 	IDTVEC(justreturn1_pti),
 	IDTVEC(invltlb_pti),
+	IDTVEC(invltlb_pcid_pti),
+	IDTVEC(invltlb_pcid),	/* TLB shootdowns - global, pcid */
+	IDTVEC(invltlb_invpcid_pti_pti),
+	IDTVEC(invltlb_invpcid_nopti),
 	IDTVEC(invlpg_pti),
+	IDTVEC(invlpg_invpcid_pti),
+	IDTVEC(invlpg_invpcid),
+	IDTVEC(invlpg_pcid_pti),
+	IDTVEC(invlpg_pcid),
 	IDTVEC(invlrng_pti),
+	IDTVEC(invlrng_invpcid_pti),
+	IDTVEC(invlrng_invpcid),
+	IDTVEC(invlrng_pcid_pti),
+	IDTVEC(invlrng_pcid),
 	IDTVEC(invlcache_pti),
 	IDTVEC(ipi_intr_bitmap_handler_pti),
 	IDTVEC(cpustop_pti),
@@ -45,6 +53,11 @@ inthand_t
 
 void	invltlb_pcid_handler(void);
 void	invltlb_invpcid_handler(void);
+void	invltlb_invpcid_pti_handler(void);
+void	invlpg_invpcid_handler(void);
+void	invlpg_pcid_handler(void);
+void	invlrng_invpcid_handler(void);
+void	invlrng_pcid_handler(void);
 int	native_start_all_aps(void);
 
 #endif /* !LOCORE */

Modified: head/sys/i386/i386/pmap.c
==============================================================================
--- head/sys/i386/i386/pmap.c	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/i386/i386/pmap.c	Sat Jan 27 11:49:37 2018	(r328470)
@@ -1045,7 +1045,7 @@ pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
-	smp_masked_invlpg(*mask, va);
+	smp_masked_invlpg(*mask, va, pmap);
 	sched_unpin();
 }
 
@@ -1079,7 +1079,7 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
-	smp_masked_invlpg_range(*mask, sva, eva);
+	smp_masked_invlpg_range(*mask, sva, eva, pmap);
 	sched_unpin();
 }
 

Modified: head/sys/i386/i386/vm_machdep.c
==============================================================================
--- head/sys/i386/i386/vm_machdep.c	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/i386/i386/vm_machdep.c	Sat Jan 27 11:49:37 2018	(r328470)
@@ -768,7 +768,7 @@ sf_buf_shootdown(struct sf_buf *sf, int flags)
 		CPU_NAND(&other_cpus, &sf->cpumask);
 		if (!CPU_EMPTY(&other_cpus)) {
 			CPU_OR(&sf->cpumask, &other_cpus);
-			smp_masked_invlpg(other_cpus, sf->kva);
+			smp_masked_invlpg(other_cpus, sf->kva, kernel_pmap);
 		}
 	}
 	sched_unpin();

Modified: head/sys/x86/include/x86_smp.h
==============================================================================
--- head/sys/x86/include/x86_smp.h	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/x86/include/x86_smp.h	Sat Jan 27 11:49:37 2018	(r328470)
@@ -39,6 +39,7 @@ extern int cpu_logical;
 extern int cpu_cores;
 extern volatile uint32_t smp_tlb_generation;
 extern struct pmap *smp_tlb_pmap;
+extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 extern u_int xhits_gbl[];
 extern u_int xhits_pg[];
 extern u_int xhits_rng[];
@@ -97,9 +98,9 @@ void	ipi_selected(cpuset_t cpus, u_int ipi);
 u_int	mp_bootaddress(u_int);
 void	set_interrupt_apic_ids(void);
 void	smp_cache_flush(void);
-void	smp_masked_invlpg(cpuset_t mask, vm_offset_t addr);
+void	smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap);
 void	smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
-	    vm_offset_t endva);
+	    vm_offset_t endva, struct pmap *pmap);
 void	smp_masked_invltlb(cpuset_t mask, struct pmap *pmap);
 void	mem_range_AP_init(void);
 void	topo_probe(void);

Modified: head/sys/x86/x86/mp_x86.c
==============================================================================
--- head/sys/x86/x86/mp_x86.c	Sat Jan 27 11:40:46 2018	(r328469)
+++ head/sys/x86/x86/mp_x86.c	Sat Jan 27 11:49:37 2018	(r328470)
@@ -1506,7 +1506,7 @@ SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, 
  */
 
 /* Variables needed for SMP tlb shootdown. */
-static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 pmap_t smp_tlb_pmap;
 volatile uint32_t smp_tlb_generation;
 
@@ -1583,11 +1583,11 @@ smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
 }
 
 void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0);
+		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_page++;
 #endif
@@ -1595,11 +1595,12 @@ smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
 }
 
 void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+    pmap_t pmap)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL,
+		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
 		    addr1, addr2);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_range++;
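
With the added pmap parameter, callers now pass the pmap whose user
page-table translations may also need flushing.  For purely kernel
addresses kernel_pmap is passed, and since its pm_ucr3 is PMAP_NO_CR3
the handlers skip the user page-table invalidation.  For example:

	/* Shoot down a single kernel VA on the CPUs in 'mask'. */
	smp_masked_invlpg(mask, kva, kernel_pmap);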

