svn commit: r255060 - in head/sys/amd64: amd64 include

Konstantin Belousov kib at FreeBSD.org
Fri Aug 30 07:59:51 UTC 2013


Author: kib
Date: Fri Aug 30 07:59:49 2013
New Revision: 255060
URL: http://svnweb.freebsd.org/changeset/base/255060

Log:
  Implement support for the process-context identifiers ('PCID') on
  Intel CPUs.  The feature tags TLB entries with an identifier for the
  address space, which makes it possible to avoid TLB invalidation on
  context switches; it is available only in long mode.  In
  microbenchmarks, using PCID decreased context-switch latency by ~30%
  on SandyBridge-class desktop CPUs, as measured with the lat_ctx
  program from lmbench.
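
  To illustrate how the PCID is carried in %cr3 (a sketch of the
  scheme used by the patch, not code from it; pmap_cr3() is a
  hypothetical helper): the PCID occupies the low 12 bits of %cr3,
  and bit 63, called CR3_PCID_SAVE in the patch, tells the CPU to
  keep the TLB entries tagged with that PCID across the load.

	#include <stdbool.h>
	#include <stdint.h>

	#define	CR3_PCID_SAVE	(1UL << 63)	/* keep tagged TLB entries */

	static uint64_t
	pmap_cr3(uint64_t pml4_phys, uint32_t pcid, bool save)
	{

		/* pml4_phys is page-aligned; the PCID fits in bits 0..11. */
		return (pml4_phys | pcid | (save ? CR3_PCID_SAVE : 0));
	}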
  
  If available, the INVPCID instruction is used when a TLB entry in a
  non-current address space needs to be invalidated.  The instruction
  is typically available on Haswell CPUs.
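
  For reference, INVPCID takes the invalidation type in a register
  (0 - individual address, 1 - single PCID, 2 - all PCIDs including
  global entries, 3 - all PCIDs except global entries) and a 16-byte
  descriptor in memory.  The wrapper below is a sketch in the style
  of the patch (the actual cpufunc.h/pmap.h changes fall in the
  truncated part of the diff); the instruction is hand-encoded with
  .byte since older assemblers lack the mnemonic:

	#include <sys/cdefs.h>
	#include <stdint.h>

	struct invpcid_descr {
		uint64_t	pcid:12;
		uint64_t	pad:52;
		uint64_t	addr;
	};

	static __inline void
	invpcid(struct invpcid_descr *d, int type)
	{

		/* invpcid (%rdx),%rax */
		__asm __volatile(".byte 0x66,0x0f,0x38,0x82,0x02"
		    : : "d" (d), "a" ((uint64_t)type) : "memory");
	}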
  
  If needed, the use of PCID can be turned off by setting the
  vm.pmap.pcid_enabled loader tunable to 0.  The state of the feature
  is reported by the vm.pmap.pcid_enabled sysctl.  The sysctl
  vm.pmap.pcid_save_cnt reports the number of context switches which
  avoided invalidating the TLB; compare it with the total number of
  context switches, available as the sysctl vm.stats.sys.v_swtch.
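
  For example, to disable the feature at boot and inspect the
  counters at runtime:

	# echo 'vm.pmap.pcid_enabled="0"' >> /boot/loader.conf
	# sysctl vm.pmap.pcid_enabled vm.pmap.pcid_save_cnt
	# sysctl vm.stats.sys.v_swtch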
  
  Sponsored by:	The FreeBSD Foundation
  Reviewed by:	alc
  Tested by:	pho, bf

Modified:
  head/sys/amd64/amd64/apic_vector.S
  head/sys/amd64/amd64/cpu_switch.S
  head/sys/amd64/amd64/genassym.c
  head/sys/amd64/amd64/machdep.c
  head/sys/amd64/amd64/mp_machdep.c
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/amd64/vm_machdep.c
  head/sys/amd64/include/pcpu.h
  head/sys/amd64/include/pmap.h
  head/sys/amd64/include/smp.h

Modified: head/sys/amd64/amd64/apic_vector.S
==============================================================================
--- head/sys/amd64/amd64/apic_vector.S	Fri Aug 30 07:43:34 2013	(r255059)
+++ head/sys/amd64/amd64/apic_vector.S	Fri Aug 30 07:59:49 2013	(r255060)
@@ -43,6 +43,12 @@
 
 #include "assym.s"
 
+#ifdef SMP
+#define LK	lock ;
+#else
+#define LK
+#endif
+
 /*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
@@ -149,6 +155,38 @@ IDTVEC(xen_intr_upcall)
  * Global address space TLB shootdown.
  */
 	.text
+
+#define	NAKE_INTR_CS	24
+
+	SUPERALIGN_TEXT
+global_invltlb:
+	movl	%cr4,%eax
+	andl	$~0x80,%eax
+	movl	%eax,%cr4
+	orl	$0x80,%eax
+	movl	%eax,%cr4
+invltlb_ret_clear_pm_save:
+	movq	smp_tlb_pmap,%rdx
+	testq	%rdx,%rdx
+	jz	invltlb_ret
+	testb	$SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+	jz	1f
+	swapgs
+1:
+	movl	PCPU(CPUID),%eax
+	jz	2f
+	swapgs
+2:
+	LK btcl	%eax,PM_SAVE(%rdx)
+	SUPERALIGN_TEXT
+invltlb_ret:
+	movq	lapic, %rax
+	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
+	LK incl	smp_tlb_wait
+	popq	%rdx
+	popq	%rax
+	jmp	doreti_iret
+
 	SUPERALIGN_TEXT
 IDTVEC(invltlb)
 #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS)
@@ -165,18 +203,44 @@ IDTVEC(invltlb)
 #endif
 
 	pushq	%rax
+	pushq	%rdx
 
-	movq	%cr3, %rax		/* invalidate the TLB */
-	movq	%rax, %cr3
-
-	movq	lapic, %rax
-	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
-
-	lock
-	incl	smp_tlb_wait
-
-	popq	%rax
-	jmp	doreti_iret
+	movq	%cr3,%rax
+	cmpl	$0,pmap_pcid_enabled
+	je	2f
+
+	movq	$smp_tlb_invpcid,%rdx
+	cmpl	$0,(%rdx)
+	je	global_invltlb
+	cmpl	$-1,(%rdx)
+	je	global_invltlb
+
+	/*
+	 * Non-zero smp_tlb_invpcid, only invalidate TLB for entries with
+	 * current PCID.
+	 */
+	cmpl	$0,invpcid_works
+	je	1f
+	/* Use invpcid if available. */
+	movl	$1,%eax /* INVPCID_CTX */
+	/* invpcid (%rdx),%rax */
+	.byte 0x66,0x0f,0x38,0x82,0x02
+	jmp	invltlb_ret_clear_pm_save
+1:
+	/* Otherwise reload %cr3 twice. */
+	movq	pcid_cr3,%rdx
+	cmpq	%rax,%rdx
+	je	2f
+	movq	%rdx,%cr3	/* Invalidate, bit 63 is zero. */
+	btsq	$63,%rax
+
+	/*
+	 * Invalidate the TLB if PCID is not enabled.
+	 * Restore the old address space.
+	 */
+2:
+	movq	%rax,%cr3
+	jmp	invltlb_ret_clear_pm_save
 
 /*
  * Single page TLB shootdown
@@ -198,18 +262,54 @@ IDTVEC(invlpg)
 #endif
 
 	pushq	%rax
-
-	movq	smp_tlb_addr1, %rax
-	invlpg	(%rax)			/* invalidate single page */
-
-	movq	lapic, %rax
-	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
-
-	lock
-	incl	smp_tlb_wait
-
-	popq	%rax
-	jmp	doreti_iret
+	pushq	%rdx
+	movq	$smp_tlb_invpcid,%rdx
+	cmpl	$0,pmap_pcid_enabled
+	je	3f
+	cmpl	$0,invpcid_works
+	jne	2f
+
+	/* kernel pmap - use invlpg to invalidate global mapping */
+	cmpl	$0,(%rdx)
+	je	3f
+	cmpl	$-1,(%rdx)
+	je	global_invltlb
+
+	/*
+	 * PCID supported, but INVPCID is not.
+	 * Temporarily switch to the target address space and do INVLPG.
+	 */
+	pushq	%rcx
+	movq	%cr3,%rcx
+	movq	pcid_cr3,%rax
+	cmp	%rcx,%rax
+	je	1f
+	btsq	$63,%rax
+	movq	%rax,%cr3
+1:	movq	8(%rdx),%rax
+	invlpg	(%rax)
+	btsq	$63,%rcx
+	movq	%rcx,%cr3
+	popq	%rcx
+	jmp	invltlb_ret
+
+	/*
+	 * Invalidate the TLB entry using INVPCID_ADDR.
+	 */
+2:
+	xorl	%eax,%eax
+/*	invpcid	(%rdx),%rax */
+	.byte	0x66,0x0f,0x38,0x82,0x02
+	jmp	invltlb_ret
+
+	/*
+	 * PCID is not supported or kernel pmap.
+	 * Invalidate single page using INVLPG.
+	 */
+3:
+	movq	8(%rdx),%rax
+	invlpg	(%rax)
+	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
@@ -232,23 +332,76 @@ IDTVEC(invlrng)
 
 	pushq	%rax
 	pushq	%rdx
-
-	movq	smp_tlb_addr1, %rdx
-	movq	smp_tlb_addr2, %rax
+	movq	$smp_tlb_invpcid,%rdx
+	cmpl	$0,pmap_pcid_enabled
+	je	invlrng_single_page
+	cmpl	$0,invpcid_works
+	jne	invlrng_invpcid
+
+	/* kernel pmap - use invlpg to invalidate global mapping */
+	cmpl	$0,(%rdx)
+	je	invlrng_single_page
+	cmpl	$-1,(%rdx)
+	je	global_invltlb
+
+	pushq	%rcx
+	movq	%cr3,%rcx
+	movq	pcid_cr3,%rax
+	cmpq	%rcx,%rax
+	je	1f
+	btsq	$63,%rax
+	movq	%rax,%cr3
+1:
+	movq	8(%rdx),%rdx
+	movq	smp_tlb_addr2,%rax
+2:
+	invlpg	(%rdx)
+	addq	$PAGE_SIZE,%rdx
+	cmpq	%rax,%rdx
+	jb	2b
+	btsq	$63,%rcx
+	movq	%rcx,%cr3
+	popq	%rcx
+	jmp	invltlb_ret
+
+invlrng_invpcid:
+	testb	$SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+	jz	1f
+	swapgs
+1:
+	pushq	%rcx
+	movq	(%rdx),%rcx
+	movq	%rcx,PCPU(INVPCID_DESCR)
+	movq	8(%rdx),%rax
+	movq	%rax,PCPU(INVPCID_DESCR)+8
+	movq	smp_tlb_addr2,%rcx
+	xorl	%eax,%eax
+	movq	$PC_INVPCID_DESCR,%rdx
+	gs
+	subq	8(%rdx),%rcx
+	shrq	$PAGE_SHIFT,%rcx
+2:
+	gs
+//	invpcid	(%rdx),%rax
+	.byte	0x66,0x0f,0x38,0x82,0x02
+	gs
+	addq	$PAGE_SIZE,8(%rdx)
+	dec	%rcx
+	jne	2b
+	popq	%rcx
+	testb	$SEL_RPL_MASK,NAKE_INTR_CS(%rsp)
+	jz	invltlb_ret
+	swapgs
+	jmp	invltlb_ret
+
+invlrng_single_page:
+	movq	8(%rdx),%rdx
+	movq	smp_tlb_addr2,%rax
 1:	invlpg	(%rdx)			/* invalidate single page */
-	addq	$PAGE_SIZE, %rdx
-	cmpq	%rax, %rdx
+	addq	$PAGE_SIZE,%rdx
+	cmpq	%rax,%rdx
 	jb	1b
-
-	movq	lapic, %rax
-	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
-
-	lock
-	incl	smp_tlb_wait
-
-	popq	%rdx
-	popq	%rax
-	jmp	doreti_iret
+	jmp	invltlb_ret
 
 /*
  * Invalidate cache.
@@ -265,17 +418,9 @@ IDTVEC(invlcache)
 #endif
 
 	pushq	%rax
-
+	pushq	%rdx
 	wbinvd
-
-	movq	lapic, %rax
-	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
-
-	lock
-	incl	smp_tlb_wait
-
-	popq	%rax
-	jmp	doreti_iret
+	jmp	invltlb_ret
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.

Modified: head/sys/amd64/amd64/cpu_switch.S
==============================================================================
--- head/sys/amd64/amd64/cpu_switch.S	Fri Aug 30 07:43:34 2013	(r255059)
+++ head/sys/amd64/amd64/cpu_switch.S	Fri Aug 30 07:59:49 2013	(r255060)
@@ -77,8 +77,7 @@ ENTRY(cpu_throw)
 	LK btrl	%eax,PM_ACTIVE(%rdx)		/* clear old */
 1:
 	movq	TD_PCB(%rsi),%r8		/* newtd->td_pcb */
-	movq	PCB_CR3(%r8),%rdx
-	movq	%rdx,%cr3			/* new address space */
+	movq	PCB_CR3(%r8),%rcx		/* new address space */
 	jmp	swact
 END(cpu_throw)
 
@@ -145,20 +144,41 @@ ctx_switch_xsave:
 	SETLK	%rdx, TD_LOCK(%rdi)		/* Release the old thread */
 	jmp	sw1
 swinact:
-	movq	%rcx,%cr3			/* new address space */
-	movl	PCPU(CPUID), %eax
+	movl	PCPU(CPUID),%eax
 	/* Release bit from old pmap->pm_active */
-	movq	PCPU(CURPMAP),%rcx
-	LK btrl	%eax,PM_ACTIVE(%rcx)		/* clear old */
-	SETLK	%rdx, TD_LOCK(%rdi)		/* Release the old thread */
+	movq	PCPU(CURPMAP),%r12
+	LK btrl	%eax,PM_ACTIVE(%r12)		/* clear old */
+	SETLK	%rdx,TD_LOCK(%rdi)		/* Release the old thread */
 swact:
 	/* Set bit in new pmap->pm_active */
 	movq	TD_PROC(%rsi),%rdx		/* newproc */
 	movq	P_VMSPACE(%rdx), %rdx
 	addq	$VM_PMAP,%rdx
+	cmpl	$-1,PM_PCID(%rdx)
+	je	1f
+	LK btsl	%eax,PM_SAVE(%rdx)
+	jnc	1f
+	btsq	$63,%rcx			/* CR3_PCID_SAVE */
+	incq	PCPU(PM_SAVE_CNT)
+1:
+	movq	%rcx,%cr3			/* new address space */
 	LK btsl	%eax,PM_ACTIVE(%rdx)		/* set new */
 	movq	%rdx,PCPU(CURPMAP)
 
+	/*
+	 * We might lose the race and another CPU might have changed
+	 * the pmap after we set our bit in pmap->pm_save.  Recheck.
+	 * Reload %cr3 with CR3_PCID_SAVE bit cleared if pmap was
+	 * modified, causing TLB flush for this pcid.
+	 */
+	btrq	$63,%rcx
+	jnc	1f
+	LK btsl	%eax,PM_SAVE(%rdx)
+	jc	1f
+	decq	PCPU(PM_SAVE_CNT)
+	movq	%rcx,%cr3
+1:
+
 sw1:
 #if defined(SCHED_ULE) && defined(SMP)
 	/* Wait for the new thread to become unblocked */

Modified: head/sys/amd64/amd64/genassym.c
==============================================================================
--- head/sys/amd64/amd64/genassym.c	Fri Aug 30 07:43:34 2013	(r255059)
+++ head/sys/amd64/amd64/genassym.c	Fri Aug 30 07:59:49 2013	(r255060)
@@ -76,6 +76,8 @@ __FBSDID("$FreeBSD$");
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
 ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active));
+ASSYM(PM_SAVE, offsetof(struct pmap, pm_save));
+ASSYM(PM_PCID, offsetof(struct pmap, pm_pcid));
 
 ASSYM(P_MD, offsetof(struct proc, p_md));
 ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt));
@@ -225,6 +227,8 @@ ASSYM(PC_GS32P, offsetof(struct pcpu, pc
 ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
+ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
+ASSYM(PC_INVPCID_DESCR, offsetof(struct pcpu, pc_invpcid_descr));
  
 ASSYM(LA_VER, offsetof(struct LAPIC, version));
 ASSYM(LA_TPR, offsetof(struct LAPIC, tpr));

Modified: head/sys/amd64/amd64/machdep.c
==============================================================================
--- head/sys/amd64/amd64/machdep.c	Fri Aug 30 07:43:34 2013	(r255059)
+++ head/sys/amd64/amd64/machdep.c	Fri Aug 30 07:59:49 2013	(r255060)
@@ -1909,7 +1909,7 @@ hammer_time(u_int64_t modulep, u_int64_t
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
-	thread0.td_pcb->pcb_cr3 = KPML4phys;
+	thread0.td_pcb->pcb_cr3 = KPML4phys; /* PCID 0 is reserved for kernel */
 	thread0.td_frame = &proc0_tf;
 
         env = getenv("kernelname");

Modified: head/sys/amd64/amd64/mp_machdep.c
==============================================================================
--- head/sys/amd64/amd64/mp_machdep.c	Fri Aug 30 07:43:34 2013	(r255059)
+++ head/sys/amd64/amd64/mp_machdep.c	Fri Aug 30 07:59:49 2013	(r255060)
@@ -107,9 +107,11 @@ struct pcb stoppcbs[MAXCPU];
 struct pcb **susppcbs;
 
 /* Variables needed for SMP tlb shootdown. */
-vm_offset_t smp_tlb_addr1;
 vm_offset_t smp_tlb_addr2;
+struct invpcid_descr smp_tlb_invpcid;
 volatile int smp_tlb_wait;
+uint64_t pcid_cr3;
+pmap_t smp_tlb_pmap;
 
 #ifdef COUNT_IPIS
 /* Interrupt counts. */
@@ -603,6 +605,8 @@ cpu_mp_announce(void)
 	}
 }
 
+extern int pmap_pcid_enabled;
+
 /*
  * AP CPU's call this to initialize themselves.
  */
@@ -768,6 +772,8 @@ init_secondary(void)
 	 */
 
 	load_cr4(rcr4() | CR4_PGE);
+	if (pmap_pcid_enabled)
+		load_cr4(rcr4() | CR4_PCIDE);
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
@@ -1119,7 +1125,8 @@ ipi_send_cpu(int cpu, u_int ipi)
  * Flush the TLB on all other CPU's
  */
 static void
-smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1,
+    vm_offset_t addr2)
 {
 	u_int ncpu;
 
@@ -1129,8 +1136,16 @@ smp_tlb_shootdown(u_int vector, vm_offse
 	if (!(read_rflags() & PSL_I))
 		panic("%s: interrupts disabled", __func__);
 	mtx_lock_spin(&smp_ipi_mtx);
-	smp_tlb_addr1 = addr1;
+	smp_tlb_invpcid.addr = addr1;
+	if (pmap == NULL) {
+		smp_tlb_invpcid.pcid = 0;
+	} else {
+		smp_tlb_invpcid.pcid = pmap->pm_pcid;
+		pcid_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) |
+		    (pmap->pm_pcid == -1 ? 0 : pmap->pm_pcid);
+	}
 	smp_tlb_addr2 = addr2;
+	smp_tlb_pmap = pmap;
 	atomic_store_rel_int(&smp_tlb_wait, 0);
 	ipi_all_but_self(vector);
 	while (smp_tlb_wait < ncpu)
@@ -1139,7 +1154,8 @@ smp_tlb_shootdown(u_int vector, vm_offse
 }
 
 static void
-smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+    vm_offset_t addr1, vm_offset_t addr2)
 {
 	int cpu, ncpu, othercpus;
 
@@ -1155,8 +1171,16 @@ smp_targeted_tlb_shootdown(cpuset_t mask
 	if (!(read_rflags() & PSL_I))
 		panic("%s: interrupts disabled", __func__);
 	mtx_lock_spin(&smp_ipi_mtx);
-	smp_tlb_addr1 = addr1;
+	smp_tlb_invpcid.addr = addr1;
+	if (pmap == NULL) {
+		smp_tlb_invpcid.pcid = 0;
+	} else {
+		smp_tlb_invpcid.pcid = pmap->pm_pcid;
+		pcid_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) |
+		    (pmap->pm_pcid == -1 ? 0 : pmap->pm_pcid);
+	}
 	smp_tlb_addr2 = addr2;
+	smp_tlb_pmap = pmap;
 	atomic_store_rel_int(&smp_tlb_wait, 0);
 	if (CPU_ISFULLSET(&mask)) {
 		ncpu = othercpus;
@@ -1182,15 +1206,15 @@ smp_cache_flush(void)
 {
 
 	if (smp_started)
-		smp_tlb_shootdown(IPI_INVLCACHE, 0, 0);
+		smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0);
 }
 
 void
-smp_invltlb(void)
+smp_invltlb(pmap_t pmap)
 {
 
 	if (smp_started) {
-		smp_tlb_shootdown(IPI_INVLTLB, 0, 0);
+		smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_global++;
 #endif
@@ -1198,11 +1222,11 @@ smp_invltlb(void)
 }
 
 void
-smp_invlpg(vm_offset_t addr)
+smp_invlpg(pmap_t pmap, vm_offset_t addr)
 {
 
 	if (smp_started) {
-		smp_tlb_shootdown(IPI_INVLPG, addr, 0);
+		smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_page++;
 #endif
@@ -1210,11 +1234,11 @@ smp_invlpg(vm_offset_t addr)
 }
 
 void
-smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2)
+smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2)
 {
 
 	if (smp_started) {
-		smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2);
+		smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_range++;
 		ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
@@ -1223,11 +1247,11 @@ smp_invlpg_range(vm_offset_t addr1, vm_o
 }
 
 void
-smp_masked_invltlb(cpuset_t mask)
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0);
+		smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_global++;
 #endif
@@ -1235,11 +1259,11 @@ smp_masked_invltlb(cpuset_t mask)
 }
 
 void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0);
+		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_page++;
 #endif
@@ -1247,11 +1271,13 @@ smp_masked_invlpg(cpuset_t mask, vm_offs
 }
 
 void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
+    vm_offset_t addr2)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2);
+		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1,
+		    addr2);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_masked_range++;
 		ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE;

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Fri Aug 30 07:43:34 2013	(r255059)
+++ head/sys/amd64/amd64/pmap.c	Fri Aug 30 07:59:49 2013	(r255060)
@@ -116,11 +116,8 @@ __FBSDID("$FreeBSD$");
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
-#ifdef SMP
+#include <sys/_unrhdr.h>
 #include <sys/smp.h>
-#else
-#include <sys/cpuset.h>
-#endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -250,6 +247,53 @@ static struct md_page *pv_table;
 pt_entry_t *CMAP1 = 0;
 caddr_t CADDR1 = 0;
 
+static struct unrhdr pcid_unr;
+static struct mtx pcid_mtx;
+int pmap_pcid_enabled = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled,
+    0, "Is TLB Context ID enabled ?");
+int invpcid_works = 0;
+
+/*
+ * Perform the guaranteed invalidation of all TLB entries.  This
+ * includes the global entries, and entries in all PCIDs, not only the
+ * current context.  The function works both on non-PCID CPUs and CPUs
+ * with the PCID turned off or on.  See IA-32 SDM Vol. 3a 4.10.4.1
+ * Operations that Invalidate TLBs and Paging-Structure Caches.
+ */
+static __inline void
+invltlb_globpcid(void)
+{
+	uint64_t cr4;
+
+	cr4 = rcr4();
+	load_cr4(cr4 & ~CR4_PGE);
+	/*
+	 * Although preemption at this point could be detrimental to
+	 * performance, it would not lead to an error.  PG_G is simply
+	 * ignored if CR4.PGE is clear.  Moreover, in case this block
+	 * is re-entered, the load_cr4() either above or below will
+	 * modify CR4.PGE flushing the TLB.
+	 */
+	load_cr4(cr4 | CR4_PGE);
+}
+
+static int
+pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
+{
+	int i;
+	uint64_t res;
+
+	res = 0;
+	CPU_FOREACH(i) {
+		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
+	}
+	return (sysctl_handle_64(oidp, &res, 0, req));
+}
+SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
+    CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
+    "Count of saved TLB context on switch");
+
 /*
  * Crashdump maps.
  */
@@ -685,6 +729,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
+	CPU_ZERO(&kernel_pmap->pm_save);
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 
  	/*
@@ -716,6 +761,21 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 
 	/* Initialize the PAT MSR. */
 	pmap_init_pat();
+
+#ifdef SMP
+	/* Initialize TLB Context Id. */
+	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
+	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
+		load_cr4(rcr4() | CR4_PCIDE);
+		mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF);
+		init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx);
+		/* Check for INVPCID support */
+		invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID)
+		    != 0;
+		kernel_pmap->pm_pcid = 0;
+	} else
+#endif
+		pmap_pcid_enabled = 0;
 }
 
 /*
@@ -952,7 +1012,6 @@ pmap_cache_bits(int mode, boolean_t is_p
 static void
 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
 {
-	u_long cr4;
 
 	if ((newpde & PG_PS) == 0)
 		/* Demotion: flush a specific 2MB page mapping. */
@@ -968,19 +1027,34 @@ pmap_update_pde_invalidate(vm_offset_t v
 		 * Promotion: flush every 4KB page mapping from the TLB,
 		 * including any global (PG_G) mappings.
 		 */
-		cr4 = rcr4();
-		load_cr4(cr4 & ~CR4_PGE);
-		/*
-		 * Although preemption at this point could be detrimental to
-		 * performance, it would not lead to an error.  PG_G is simply
-		 * ignored if CR4.PGE is clear.  Moreover, in case this block
-		 * is re-entered, the load_cr4() either above or below will
-		 * modify CR4.PGE flushing the TLB.
-		 */
-		load_cr4(cr4 | CR4_PGE);
+		invltlb_globpcid();
 	}
 }
 #ifdef SMP
+
+static void
+pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va)
+{
+	struct invpcid_descr d;
+	uint64_t cr3;
+
+	if (invpcid_works) {
+		d.pcid = pmap->pm_pcid;
+		d.pad = 0;
+		d.addr = va;
+		invpcid(&d, INVPCID_ADDR);
+		return;
+	}
+
+	cr3 = rcr3();
+	critical_enter();
+	load_cr3(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | pmap->pm_pcid |
+	    CR3_PCID_SAVE);
+	invlpg(va);
+	load_cr3(cr3 | CR3_PCID_SAVE);
+	critical_exit();
+}
+
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
  *
@@ -1008,21 +1082,68 @@ pmap_invalidate_page(pmap_t pmap, vm_off
 
 	sched_pin();
 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
-		invlpg(va);
-		smp_invlpg(va);
+		if (!pmap_pcid_enabled) {
+			invlpg(va);
+		} else {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+				if (pmap == PCPU_GET(curpmap))
+					invlpg(va);
+				else
+					pmap_invalidate_page_pcid(pmap, va);
+			} else {
+				invltlb_globpcid();
+			}
+		}
+		smp_invlpg(pmap, va);
 	} else {
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		if (CPU_ISSET(cpuid, &pmap->pm_active))
 			invlpg(va);
-		CPU_AND(&other_cpus, &pmap->pm_active);
+		else if (pmap_pcid_enabled) {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+				pmap_invalidate_page_pcid(pmap, va);
+			else
+				invltlb_globpcid();
+		}
+		if (pmap_pcid_enabled)
+			CPU_AND(&other_cpus, &pmap->pm_save);
+		else
+			CPU_AND(&other_cpus, &pmap->pm_active);
 		if (!CPU_EMPTY(&other_cpus))
-			smp_masked_invlpg(other_cpus, va);
+			smp_masked_invlpg(other_cpus, pmap, va);
 	}
 	sched_unpin();
 }
 
+static void
+pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	struct invpcid_descr d;
+	uint64_t cr3;
+	vm_offset_t addr;
+
+	if (invpcid_works) {
+		d.pcid = pmap->pm_pcid;
+		d.pad = 0;
+		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
+			d.addr = addr;
+			invpcid(&d, INVPCID_ADDR);
+		}
+		return;
+	}
+
+	cr3 = rcr3();
+	critical_enter();
+	load_cr3(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4) | pmap->pm_pcid |
+	    CR3_PCID_SAVE);
+	for (addr = sva; addr < eva; addr += PAGE_SIZE)
+		invlpg(addr);
+	load_cr3(cr3 | CR3_PCID_SAVE);
+	critical_exit();
+}
+
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
@@ -1032,19 +1153,43 @@ pmap_invalidate_range(pmap_t pmap, vm_of
 
 	sched_pin();
 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
-		for (addr = sva; addr < eva; addr += PAGE_SIZE)
-			invlpg(addr);
-		smp_invlpg_range(sva, eva);
+		if (!pmap_pcid_enabled) {
+			for (addr = sva; addr < eva; addr += PAGE_SIZE)
+				invlpg(addr);
+		} else {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+				if (pmap == PCPU_GET(curpmap)) {
+					for (addr = sva; addr < eva;
+					    addr += PAGE_SIZE)
+						invlpg(addr);
+				} else {
+					pmap_invalidate_range_pcid(pmap,
+					    sva, eva);
+				}
+			} else {
+				invltlb_globpcid();
+			}
+		}
+		smp_invlpg_range(pmap, sva, eva);
 	} else {
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
-		if (CPU_ISSET(cpuid, &pmap->pm_active))
+		if (CPU_ISSET(cpuid, &pmap->pm_active)) {
 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
 				invlpg(addr);
-		CPU_AND(&other_cpus, &pmap->pm_active);
+		} else if (pmap_pcid_enabled) {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0)
+				pmap_invalidate_range_pcid(pmap, sva, eva);
+			else
+				invltlb_globpcid();
+		}
+		if (pmap_pcid_enabled)
+			CPU_AND(&other_cpus, &pmap->pm_save);
+		else
+			CPU_AND(&other_cpus, &pmap->pm_active);
 		if (!CPU_EMPTY(&other_cpus))
-			smp_masked_invlpg_range(other_cpus, sva, eva);
+			smp_masked_invlpg_range(other_cpus, pmap, sva, eva);
 	}
 	sched_unpin();
 }
@@ -1053,21 +1198,63 @@ void
 pmap_invalidate_all(pmap_t pmap)
 {
 	cpuset_t other_cpus;
+	struct invpcid_descr d;
+	uint64_t cr3;
 	u_int cpuid;
 
 	sched_pin();
-	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
-		invltlb();
-		smp_invltlb();
+	cpuid = PCPU_GET(cpuid);
+	if (pmap == kernel_pmap ||
+	    (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) ||
+	    !CPU_CMP(&pmap->pm_active, &all_cpus)) {
+		if (invpcid_works) {
+			bzero(&d, sizeof(d));
+			invpcid(&d, INVPCID_CTXGLOB);
+		} else {
+			invltlb_globpcid();
+		}
+		CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+		smp_invltlb(pmap);
 	} else {
-		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
-		if (CPU_ISSET(cpuid, &pmap->pm_active))
+
+		/*
+		 * This logic is duplicated in the Xinvltlb shootdown
+		 * IPI handler.
+		 */
+		if (pmap_pcid_enabled) {
+			if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) {
+				if (invpcid_works) {
+					d.pcid = pmap->pm_pcid;
+					d.pad = 0;
+					d.addr = 0;
+					invpcid(&d, INVPCID_CTX);
+				} else {
+					cr3 = rcr3();
+					critical_enter();
+
+					/*
+					 * Bit 63 is clear, pcid TLB
+					 * entries are invalidated.
+					 */
+					load_cr3(DMAP_TO_PHYS((vm_offset_t)
+					    pmap->pm_pml4) | pmap->pm_pcid);
+					load_cr3(cr3 | CR3_PCID_SAVE);
+					critical_exit();
+				}
+			} else {
+				invltlb_globpcid();
+			}
+		} else if (CPU_ISSET(cpuid, &pmap->pm_active))
 			invltlb();
-		CPU_AND(&other_cpus, &pmap->pm_active);
+		CPU_CLR_ATOMIC(cpuid, &pmap->pm_save);
+		if (pmap_pcid_enabled)
+			CPU_AND(&other_cpus, &pmap->pm_save);
+		else
+			CPU_AND(&other_cpus, &pmap->pm_active);
 		if (!CPU_EMPTY(&other_cpus))
-			smp_masked_invltlb(other_cpus);
+			smp_masked_invltlb(other_cpus, pmap);
 	}
 	sched_unpin();
 }
@@ -1129,8 +1316,10 @@ pmap_update_pde(pmap_t pmap, vm_offset_t
 	CPU_CLR(cpuid, &other_cpus);
 	if (pmap == kernel_pmap)
 		active = all_cpus;
-	else
+	else {
 		active = pmap->pm_active;
+		CPU_AND_ATOMIC(&pmap->pm_save, &active);
+	}
 	if (CPU_OVERLAP(&active, &other_cpus)) { 
 		act.store = cpuid;
 		act.invalidate = active;
@@ -1193,6 +1382,8 @@ pmap_update_pde(pmap_t pmap, vm_offset_t
 	pde_store(pde, newpde);
 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 		pmap_update_pde_invalidate(va, newpde);
+	else
+		CPU_ZERO(&pmap->pm_save);
 }
 #endif /* !SMP */
 
@@ -1675,6 +1866,8 @@ pmap_pinit0(pmap_t pmap)
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+	pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1;
+	CPU_ZERO(&pmap->pm_save);
 }
 
 /*
@@ -1716,6 +1909,8 @@ pmap_pinit(pmap_t pmap)
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+	pmap->pm_pcid = pmap_pcid_enabled ? alloc_unr(&pcid_unr) : -1;
+	CPU_ZERO(&pmap->pm_save);
 
 	return (1);
 }
@@ -1957,6 +2152,14 @@ pmap_release(pmap_t pmap)
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 
+	if (pmap_pcid_enabled) {
+		/*
+		 * Invalidate any leftover TLB entries, to allow the
+		 * reuse of the pcid.
+		 */
+		pmap_invalidate_all(pmap);
+	}
+
 	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
 
 	for (i = 0; i < NKPML4E; i++)	/* KVA */
@@ -1968,6 +2171,8 @@ pmap_release(pmap_t pmap)
 	m->wire_count--;
 	atomic_subtract_int(&cnt.v_wire_count, 1);
 	vm_page_free_zero(m);
+	if (pmap->pm_pcid != -1)
+		free_unr(&pcid_unr, pmap->pm_pcid);
 }
 

 static int
@@ -5734,15 +5939,20 @@ pmap_activate(struct thread *td)
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	oldpmap = PCPU_GET(curpmap);
+	CPU_ZERO(&pmap->pm_save);
 	cpuid = PCPU_GET(cpuid);
 #ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
+	CPU_SET_ATOMIC(cpuid, &pmap->pm_save);
 #else
 	CPU_CLR(cpuid, &oldpmap->pm_active);
 	CPU_SET(cpuid, &pmap->pm_active);
+	CPU_SET(cpuid, &pmap->pm_save);
 #endif
 	cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
+	if (pmap->pm_pcid != -1)
+		cr3 |= pmap->pm_pcid;
 	td->td_pcb->pcb_cr3 = cr3;
 	load_cr3(cr3);
 	PCPU_SET(curpmap, pmap);

Modified: head/sys/amd64/amd64/vm_machdep.c
==============================================================================
--- head/sys/amd64/amd64/vm_machdep.c	Fri Aug 30 07:43:34 2013	(r255059)
+++ head/sys/amd64/amd64/vm_machdep.c	Fri Aug 30 07:59:49 2013	(r255060)
@@ -221,6 +221,8 @@ cpu_fork(td1, p2, td2, flags)
 	 */
 	pmap2 = vmspace_pmap(p2->p_vmspace);
 	pcb2->pcb_cr3 = DMAP_TO_PHYS((vm_offset_t)pmap2->pm_pml4);
+	if (pmap2->pm_pcid != -1)
+		pcb2->pcb_cr3 |= pmap2->pm_pcid;
 	pcb2->pcb_r12 = (register_t)fork_return;	/* fork_trampoline argument */
 	pcb2->pcb_rbp = 0;
 	pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***