svn commit: r347695 - in head/sys: amd64/amd64 amd64/include kern

Konstantin Belousov kib at FreeBSD.org
Thu May 16 13:28:50 UTC 2019


Author: kib
Date: Thu May 16 13:28:48 2019
New Revision: 347695
URL: https://svnweb.freebsd.org/changeset/base/347695

Log:
  amd64 pmap: rework delayed invalidation, removing global mutex.
  
  For machines that have the cmpxchg16b instruction, i.e. everything
  but very early Athlons, provide a lockless implementation of
  delayed invalidation (DI).
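
  As a user-space illustration, the 16-byte compare-and-swap that
  cmpxchg16b provides maps onto the compiler's __int128 builtins when
  built with -mcx16.  This is a minimal sketch, not the kernel code;
  the node layout and the dcas name are hypothetical:

	/* cc -mcx16 -c dcas.c -- double-width CAS sketch. */
	#include <stdbool.h>
	#include <stdint.h>
	#include <string.h>

	struct node {
		uint64_t gen;		/* DI generation */
		uint64_t next;		/* successor pointer | mark bit */
	} __attribute__((aligned(16)));	/* cmpxchg16b needs 16-byte alignment */

	/* Atomically: if (*p == *oldv) { *p = *newv; return true; } */
	static bool
	dcas(struct node *p, const struct node *oldv, const struct node *newv)
	{
		unsigned __int128 o, n;

		memcpy(&o, oldv, sizeof(o));	/* both fields as one 128-bit value */
		memcpy(&n, newv, sizeof(n));
		return (__sync_bool_compare_and_swap((unsigned __int128 *)p, o, n));
	}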
  
  The implementation maintains a lockless singly-linked list, using
  the trick from the T.L. Harris article of putting a transient mark
  on the link of elements being removed.  A double-width CAS
  (cmpxchg16b) is used to atomically update both the link and the
  generation.  A thread starting DI appends itself to the end of the
  queue, setting its generation to the generation of the last element
  plus one.  On DI finish, the thread donates its generation to the
  previous element.  The generation of the fake head of the list is
  the last completed DI generation.  Basically, the implementation is
  a queued spinlock, but without the spinning.
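
  A compressed sketch of that queue discipline, reusing the
  hypothetical node and dcas() from above (restart pacing, memory
  fences, priority propagation and the turnstile slow path are all
  omitted):

	#define	MARK	0x1UL	/* Harris-style "being removed" mark */

	static struct node head = { .gen = 1, .next = 0 };

	/* Walk to the tail; NULL means a marked element was seen, retry. */
	static struct node *
	find_tail(void)
	{
		struct node *p;
		uint64_t nx;

		for (p = &head;; p = (struct node *)nx) {
			nx = __atomic_load_n(&p->next, __ATOMIC_ACQUIRE);
			if (nx & MARK)
				return (NULL);
			if (nx == 0)
				return (p);
		}
	}

	void
	di_start(struct node *self)
	{
		struct node *tail, old, new;

		for (;;) {
			if ((tail = find_tail()) == NULL)
				continue;
			old.gen = tail->gen;
			old.next = 0;			/* tail must still be the tail */
			self->gen = old.gen + 1;	/* our gen = last element's + 1 */
			self->next = MARK;		/* stay marked while half-inserted */
			new.gen = old.gen;
			new.next = (uint64_t)self;
			if (dcas(tail, &old, &new))	/* link in atomically with gen */
				break;
		}
		/* Clear the mark: other threads may now iterate past us. */
		__atomic_store_n(&self->next, 0, __ATOMIC_RELEASE);
	}

	/* prev is found by walking from head; the retry loop is omitted. */
	void
	di_finish(struct node *self, struct node *prev)
	{
		struct node old, new;

		__atomic_fetch_or(&self->next, MARK, __ATOMIC_SEQ_CST);
		old.gen = prev->gen;
		old.next = (uint64_t)self;	/* prev must still point at us */
		new.gen = self->gen;		/* donate our generation to prev */
		new.next = self->next & ~MARK;	/* unlink ourselves */
		(void)dcas(prev, &old, &new);	/* on failure the real code retries */
	}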
  
  Many thanks to both Peter Holm and Mark Johnston for bearing with me
  while I produced intermediate versions of the patch.
  
  Reviewed by:	markj
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	1 month
  MFC note:	td_md.md_invl_gen should go to the end of struct thread
  Differential revision:	https://reviews.freebsd.org/D19630

Modified:
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/amd64/trap.c
  head/sys/amd64/amd64/vm_machdep.c
  head/sys/amd64/include/pmap.h
  head/sys/amd64/include/proc.h
  head/sys/kern/kern_thread.c

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Thu May 16 13:17:57 2019	(r347694)
+++ head/sys/amd64/amd64/pmap.c	Thu May 16 13:28:48 2019	(r347695)
@@ -107,6 +107,7 @@ __FBSDID("$FreeBSD$");
  *	and to when physical maps must be made correct.
  */
 
+#include "opt_ddb.h"
 #include "opt_pmap.h"
 #include "opt_vm.h"
 
@@ -130,6 +131,10 @@ __FBSDID("$FreeBSD$");
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
+#ifdef DDB
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+#endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -468,22 +473,46 @@ SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE
 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
 static struct mtx invl_gen_mtx;
-static u_long pmap_invl_gen = 0;
 /* Fake lock object to satisfy turnstiles interface. */
 static struct lock_object invl_gen_ts = {
 	.lo_name = "invlts",
 };
+static struct pmap_invl_gen pmap_invl_gen_head = {
+	.gen = 1,
+	.next = NULL,
+};
+static u_long pmap_invl_gen = 1;
 
+#define	PMAP_ASSERT_NOT_IN_DI() \
+    KASSERT(pmap_not_in_di(), ("DI already started"))
+
+static bool pmap_not_in_di_l(void);
+static bool pmap_not_in_di_u(void);
+DEFINE_IFUNC(, bool, pmap_not_in_di, (void), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ? pmap_not_in_di_l :
+	    pmap_not_in_di_u);
+}
+
 static bool
-pmap_not_in_di(void)
+pmap_not_in_di_l(void)
 {
+	struct pmap_invl_gen *invl_gen;
 
-	return (curthread->td_md.md_invl_gen.gen == 0);
+	invl_gen = &curthread->td_md.md_invl_gen;
+	return (invl_gen->gen == 0);
 }
 
-#define	PMAP_ASSERT_NOT_IN_DI() \
-    KASSERT(pmap_not_in_di(), ("DI already started"))
+static void
+pmap_thread_init_invl_gen_l(struct thread *td)
+{
+	struct pmap_invl_gen *invl_gen;
 
+	invl_gen = &td->td_md.md_invl_gen;
+	invl_gen->gen = 0;
+}
+
 /*
  * Start a new Delayed Invalidation (DI) block of code, executed by
  * the current thread.  Within a DI block, the current thread may
@@ -493,7 +522,7 @@ pmap_not_in_di(void)
  * pmap active.
  */
 static void
-pmap_delayed_invl_started(void)
+pmap_delayed_invl_started_l(void)
 {
 	struct pmap_invl_gen *invl_gen;
 	u_long currgen;
@@ -525,7 +554,7 @@ pmap_delayed_invl_started(void)
  * current thread's DI.
  */
 static void
-pmap_delayed_invl_finished(void)
+pmap_delayed_invl_finished_l(void)
 {
 	struct pmap_invl_gen *invl_gen, *next;
 	struct turnstile *ts;
@@ -551,7 +580,285 @@ pmap_delayed_invl_finished(void)
 	invl_gen->gen = 0;
 }
 
+static bool
+pmap_not_in_di_u(void)
+{
+	struct pmap_invl_gen *invl_gen;
+
+	invl_gen = &curthread->td_md.md_invl_gen;
+	return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
+}
+
+static void
+pmap_thread_init_invl_gen_u(struct thread *td)
+{
+	struct pmap_invl_gen *invl_gen;
+
+	invl_gen = &td->td_md.md_invl_gen;
+	invl_gen->gen = 0;
+	invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
+}
+
+static bool
+pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
+{
+	uint64_t new_high, new_low, old_high, old_low;
+	char res;
+
+	old_low = new_low = 0;
+	old_high = new_high = (uintptr_t)0;
+
+	__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
+	    : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
+	    : "b"(new_low), "c" (new_high)
+	    : "memory", "cc");
+	if (res == 0) {
+		if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
+			return (false);
+		out->gen = old_low;
+		out->next = (void *)old_high;
+	} else {
+		out->gen = new_low;
+		out->next = (void *)new_high;
+	}
+	return (true);
+}
+
+static bool
+pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
+    struct pmap_invl_gen *new_val)
+{
+	uint64_t new_high, new_low, old_high, old_low;
+	char res;
+
+	new_low = new_val->gen;
+	new_high = (uintptr_t)new_val->next;
+	old_low = old_val->gen;
+	old_high = (uintptr_t)old_val->next;
+
+	__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
+	    : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
+	    : "b"(new_low), "c" (new_high)
+	    : "memory", "cc");
+	return (res);
+}
+
 #ifdef PV_STATS
+static long invl_start_restart;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
+    &invl_start_restart, 0,
+    "");
+static long invl_finish_restart;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
+    &invl_finish_restart, 0,
+    "");
+static int invl_max_qlen;
+SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
+    &invl_max_qlen, 0,
+    "");
+#endif
+
+static struct lock_delay_config __read_frequently di_delay;
+LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
+
+static void
+pmap_delayed_invl_started_u(void)
+{
+	struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
+	struct thread *td;
+	struct lock_delay_arg lda;
+	uintptr_t prevl;
+	u_char pri;
+#ifdef PV_STATS
+	int i, ii;
+#endif
+
+	td = curthread;
+	invl_gen = &td->td_md.md_invl_gen;
+	PMAP_ASSERT_NOT_IN_DI();
+	lock_delay_arg_init(&lda, &di_delay);
+	thread_lock(td);
+	pri = td->td_base_pri;
+	if (pri < PVM) {
+		invl_gen->saved_pri = 0;
+	} else {
+		invl_gen->saved_pri = pri;
+		sched_prio(td, PVM);
+	}
+	thread_unlock(td);
+
+again:
+	PV_STAT(i = 0);
+	for (p = &pmap_invl_gen_head;; p = prev.next) {
+		PV_STAT(i++);
+		prevl = atomic_load_ptr(&p->next);
+		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
+			PV_STAT(atomic_add_long(&invl_start_restart, 1));
+			lock_delay(&lda);
+			goto again;
+		}
+		if (prevl == 0)
+			break;
+		prev.next = (void *)prevl;
+	}
+#ifdef PV_STATS
+	if ((ii = invl_max_qlen) < i)
+		atomic_cmpset_int(&invl_max_qlen, ii, i);
+#endif
+
+	if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
+		PV_STAT(atomic_add_long(&invl_start_restart, 1));
+		lock_delay(&lda);
+		goto again;
+	}
+
+	new_prev.gen = prev.gen;
+	new_prev.next = invl_gen;
+	invl_gen->gen = prev.gen + 1;
+
+	/* Formal fence between store to invl_gen->gen and updating *p. */
+	atomic_thread_fence_rel();
+
+	/*
+	 * After inserting an invl_gen element with the invalid bit set,
+	 * this thread blocks any other thread trying to enter the
+	 * delayed invalidation block.  Do not allow us to be removed
+	 * from the CPU, since that causes starvation for other threads.
+	 */
+	critical_enter();
+
+	/*
+	 * ABA for *p is not possible here, since p->gen can only
+	 * increase.  So if the thread owning *p finished its DI, then
+	 * started a new one and got inserted into the list at the
+	 * same place, its gen will appear greater than the previously
+	 * read gen.
+	 */
+	if (!pmap_di_store_invl(p, &prev, &new_prev)) {
+		critical_exit();
+		PV_STAT(atomic_add_long(&invl_start_restart, 1));
+		lock_delay(&lda);
+		goto again;
+	}
+
+	/*
+	 * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
+	 * invl_gen->next, allowing other threads to iterate past us.
+	 * pmap_di_store_invl() provides a fence between the
+	 * generation write and the update of next.
+	 */
+	invl_gen->next = NULL;
+	critical_exit();
+}
+
+static bool
+pmap_delayed_invl_finished_u_crit(struct pmap_invl_gen *invl_gen,
+    struct pmap_invl_gen *p)
+{
+	struct pmap_invl_gen prev, new_prev;
+	u_long mygen;
+
+	/*
+	 * Load invl_gen->gen after setting PMAP_INVL_GEN_NEXT_INVALID
+	 * in invl_gen->next.  This prevents larger generations from
+	 * propagating to our invl_gen->gen.  The lock prefix in
+	 * atomic_set_ptr() acts as a seq_cst fence.
+	 */
+	mygen = atomic_load_long(&invl_gen->gen);
+
+	if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
+		return (false);
+
+	KASSERT(prev.gen < mygen,
+	    ("invalid di gen sequence %lu %lu", prev.gen, mygen));
+	new_prev.gen = mygen;
+	new_prev.next = (void *)((uintptr_t)invl_gen->next &
+	    ~PMAP_INVL_GEN_NEXT_INVALID);
+
+	/* Formal fence between the load of prev and the store updating it. */
+	atomic_thread_fence_rel();
+
+	return (pmap_di_store_invl(p, &prev, &new_prev));
+}
+
+static void
+pmap_delayed_invl_finished_u(void)
+{
+	struct pmap_invl_gen *invl_gen, *p;
+	struct thread *td;
+	struct lock_delay_arg lda;
+	uintptr_t prevl;
+
+	td = curthread;
+	invl_gen = &td->td_md.md_invl_gen;
+	KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
+	KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
+	    ("missed invl_start: INVALID"));
+	lock_delay_arg_init(&lda, &di_delay);
+
+again:
+	for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
+		prevl = atomic_load_ptr(&p->next);
+		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
+			PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+			lock_delay(&lda);
+			goto again;
+		}
+		if ((void *)prevl == invl_gen)
+			break;
+	}
+
+	/*
+	 * It is legitimate to not find ourselves on the list if a
+	 * thread before us finished its DI and started it again.
+	 */
+	if (__predict_false(p == NULL)) {
+		PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+		lock_delay(&lda);
+		goto again;
+	}
+
+	critical_enter();
+	atomic_set_ptr((uintptr_t *)&invl_gen->next,
+	    PMAP_INVL_GEN_NEXT_INVALID);
+	if (!pmap_delayed_invl_finished_u_crit(invl_gen, p)) {
+		atomic_clear_ptr((uintptr_t *)&invl_gen->next,
+		    PMAP_INVL_GEN_NEXT_INVALID);
+		critical_exit();
+		PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+		lock_delay(&lda);
+		goto again;
+	}
+	critical_exit();
+	if (invl_gen->saved_pri != 0) {
+		thread_lock(td);
+		sched_prio(td, invl_gen->saved_pri);
+		thread_unlock(td);
+	}
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(di_queue, pmap_di_queue)
+{
+	struct pmap_invl_gen *p, *pn;
+	struct thread *td;
+	uintptr_t nextl;
+	bool first;
+
+	for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
+	    first = false) {
+		nextl = atomic_load_ptr(&p->next);
+		pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
+		td = first ? NULL : __containerof(p, struct thread,
+		    td_md.md_invl_gen);
+		db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
+		    (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
+		    td != NULL ? td->td_tid : -1);
+	}
+}
+#endif
+
+#ifdef PV_STATS
 static long invl_wait;
 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
     "Number of times DI invalidation blocked pmap_remove_all/write");
@@ -579,7 +886,7 @@ pmap_delayed_invl_genp(vm_page_t m)
  * processor.
  */
 static void
-pmap_delayed_invl_wait(vm_page_t m)
+pmap_delayed_invl_wait_l(vm_page_t m)
 {
 	struct turnstile *ts;
 	u_long *m_gen;
@@ -603,6 +910,54 @@ pmap_delayed_invl_wait(vm_page_t m)
 	}
 }
 
+static void
+pmap_delayed_invl_wait_u(vm_page_t m)
+{
+	u_long *m_gen;
+#ifdef PV_STATS
+	bool accounted = false;
+#endif
+
+	m_gen = pmap_delayed_invl_genp(m);
+	while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
+#ifdef PV_STATS
+		if (!accounted) {
+			atomic_add_long(&invl_wait, 1);
+			accounted = true;
+		}
+#endif
+		kern_yield(PRI_USER);
+	}
+}
+
+DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+	    pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_started, (void), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+	    pmap_delayed_invl_started_l : pmap_delayed_invl_started_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_finished, (void), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+	    pmap_delayed_invl_finished_l : pmap_delayed_invl_finished_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+	    pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u);
+}
+
 /*
  * Mark the page m's PV list as participating in the current thread's
  * DI block.  Any threads concurrently using m's PV list to remove or
@@ -2854,6 +3209,7 @@ void
 pmap_pinit0(pmap_t pmap)
 {
 	struct proc *p;
+	struct thread *td;
 	int i;
 
 	PMAP_LOCK_INIT(pmap);
@@ -2872,12 +3228,14 @@ pmap_pinit0(pmap_t pmap)
 		pmap->pm_pcids[i].pm_gen = 1;
 	}
 	pmap_activate_boot(pmap);
+	td = curthread;
 	if (pti) {
-		p = curproc;
+		p = td->td_proc;
 		PROC_LOCK(p);
 		p->p_md.md_flags |= P_MD_KPTI;
 		PROC_UNLOCK(p);
 	}
+	pmap_thread_init_invl_gen(td);
 
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 		pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
@@ -9327,11 +9685,7 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offse
 	return (error);
 }
 
-#include "opt_ddb.h"
 #ifdef DDB
-#include <sys/kdb.h>
-#include <ddb/ddb.h>
-
 DB_SHOW_COMMAND(pte, pmap_print_pte)
 {
 	pmap_t pmap;

Modified: head/sys/amd64/amd64/trap.c
==============================================================================
--- head/sys/amd64/amd64/trap.c	Thu May 16 13:17:57 2019	(r347694)
+++ head/sys/amd64/amd64/trap.c	Thu May 16 13:28:48 2019	(r347695)
@@ -1183,7 +1183,7 @@ amd64_syscall(struct thread *td, int traced)
 	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
 	    ("System call %s returning with mangled pcb_save",
 	     syscallname(td->td_proc, td->td_sa.code)));
-	KASSERT(td->td_md.md_invl_gen.gen == 0,
+	KASSERT(pmap_not_in_di(),
 	    ("System call %s returning with leaked invl_gen %lu",
 	    syscallname(td->td_proc, td->td_sa.code),
 	    td->td_md.md_invl_gen.gen));

Modified: head/sys/amd64/amd64/vm_machdep.c
==============================================================================
--- head/sys/amd64/amd64/vm_machdep.c	Thu May 16 13:17:57 2019	(r347694)
+++ head/sys/amd64/amd64/vm_machdep.c	Thu May 16 13:28:48 2019	(r347695)
@@ -228,7 +228,7 @@ cpu_fork(struct thread *td1, struct proc *p2, struct t
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
-	td2->td_md.md_invl_gen.gen = 0;
+	pmap_thread_init_invl_gen(td2);
 
 	/* As an i386, do not copy io permission bitmap. */
 	pcb2->pcb_tssp = NULL;
@@ -544,6 +544,7 @@ cpu_copy_thread(struct thread *td, struct thread *td0)
 	/* Setup to release spin count in fork_exit(). */
 	td->td_md.md_spinlock_count = 1;
 	td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
+	pmap_thread_init_invl_gen(td);
 }
 
 /*

Modified: head/sys/amd64/include/pmap.h
==============================================================================
--- head/sys/amd64/include/pmap.h	Thu May 16 13:17:57 2019	(r347694)
+++ head/sys/amd64/include/pmap.h	Thu May 16 13:28:48 2019	(r347695)
@@ -441,6 +441,7 @@ void	*pmap_mapbios(vm_paddr_t, vm_size_t);
 void	*pmap_mapdev(vm_paddr_t, vm_size_t);
 void	*pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
 void	*pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size);
+bool	pmap_not_in_di(void);
 boolean_t pmap_page_is_mapped(vm_page_t m);
 void	pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
 void	pmap_pinit_pml4(vm_page_t);
@@ -465,6 +466,7 @@ void	pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr
 int	pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
 int	pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 	    u_int keyidx, int flags);
+void	pmap_thread_init_invl_gen(struct thread *td);
 int	pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
 #endif /* _KERNEL */
 

Modified: head/sys/amd64/include/proc.h
==============================================================================
--- head/sys/amd64/include/proc.h	Thu May 16 13:17:57 2019	(r347694)
+++ head/sys/amd64/include/proc.h	Thu May 16 13:28:48 2019	(r347695)
@@ -50,10 +50,17 @@ struct proc_ldt {
 	int     ldt_refcnt;
 };
 
+#define PMAP_INVL_GEN_NEXT_INVALID	0x1ULL
 struct pmap_invl_gen {
 	u_long gen;			/* (k) */
-	LIST_ENTRY(pmap_invl_gen) link;	/* (pp) */
-};
+	union {
+		LIST_ENTRY(pmap_invl_gen) link;	/* (pp) */
+		struct {
+			struct pmap_invl_gen *next;
+			u_char saved_pri;
+		};
+	};
+} __aligned(16);
 
 /*
  * Machine-dependent part of the proc structure for AMD64.

Modified: head/sys/kern/kern_thread.c
==============================================================================
--- head/sys/kern/kern_thread.c	Thu May 16 13:17:57 2019	(r347694)
+++ head/sys/kern/kern_thread.c	Thu May 16 13:28:48 2019	(r347695)
@@ -84,7 +84,7 @@ _Static_assert(offsetof(struct thread, td_pflags) == 0
     "struct thread KBI td_pflags");
 _Static_assert(offsetof(struct thread, td_frame) == 0x478,
     "struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x530,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x548,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0xb0,
     "struct proc KBI p_flag");

