svn commit: r221688 - in user/avg/xcpu/sys: amd64/amd64 kern

Andriy Gapon avg at FreeBSD.org
Mon May 9 07:14:16 UTC 2011


Author: avg
Date: Mon May  9 07:14:16 2011
New Revision: 221688
URL: http://svn.freebsd.org/changeset/base/221688

Log:
  re-implement the SMP rendezvous code
  
  - create one outgoing rendezvous mailbox per CPU, in which that CPU
    places a rendezvous request directed at other CPUs
  - create a CPU mask for each CPU, in which other CPUs can set a bit to
    indicate that they have sent a rendezvous request to the CPU in question
  - send an IPI only for the first rendezvous request; subsequent requests
    piggyback on it while the target CPU is still processing previous
    incoming requests (see the sketch after this list)
  - many-to-many rendezvous requests can now be sent without locking; the
    only limitation is that a CPU can have only a single outgoing request
    at a time
  - to avoid deadlocks, while a CPU waits for its request to be completed
    by the target CPUs, it also checks for and processes incoming requests
  - to avoid deadlock with the CPU stopping logic, CPUs also check for stop
    requests while waiting
  - only one CPU at a time may ask the other CPUs to stop; this is
    implemented via a hand-rolled spin mutex analogue; as above, to avoid
    deadlocks a CPU spinning on this lock also checks for an incoming stop
    request
  - implement TLB shootdowns via the SMP rendezvous mechanism; no special
    IPIs are needed any more; amd64 only (see if the code can be further
    simplified)
  - as a result, smp_ipi_mtx is no longer needed
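
A minimal stand-alone sketch of the mailbox and IPI-coalescing scheme described
above.  C11 atomics stand in for the kernel's atomic(9) operations; NCPU,
ipi_send() and process_request() are invented stubs for illustration, not
kernel interfaces.  The sender bumps the target's pending counter and sends an
IPI only when the counter was zero; the receiver drains its sender mask until
the counter drops back to zero.

/* Sketch only: per-CPU mailbox with IPI coalescing, user-space model. */
#include <stdatomic.h>
#include <stdio.h>
#include <strings.h>

#define NCPU 4

struct mailbox {
        atomic_uint senders;    /* one bit per CPU with a pending request */
        atomic_int  count;      /* requests posted but not yet processed */
};

static struct mailbox mbox[NCPU];
static unsigned int coalesced_ipi_count;

static void
ipi_send(int cpu)
{
        printf("IPI -> cpu%d\n", cpu);
}

static void
process_request(int me, int from)
{
        printf("cpu%d handles request from cpu%d\n", me, from);
}

/* Sender side: only the first pending request triggers an IPI. */
static void
post_request(int me, int target)
{
        if (atomic_fetch_add(&mbox[target].count, 1) == 0)
                ipi_send(target);
        else
                coalesced_ipi_count++;  /* piggyback on the IPI in flight */
        atomic_fetch_or(&mbox[target].senders, 1U << me);
}

/* Receiver side: drain every sender before leaving the handler. */
static void
rendezvous_handler(int me)
{
        unsigned int mask;
        int from, done, pending;

        pending = atomic_load(&mbox[me].count);
        while (pending != 0) {
                mask = atomic_exchange(&mbox[me].senders, 0);
                if (mask == 0)          /* sender not done publishing yet */
                        continue;
                done = 0;
                while (mask != 0) {
                        from = ffs((int)mask) - 1;
                        mask &= ~(1U << from);
                        process_request(me, from);
                        done++;
                }
                pending = atomic_fetch_sub(&mbox[me].count, done) - done;
        }
}

int
main(void)
{
        post_request(1, 0);     /* first request for cpu0: sends an IPI */
        post_request(2, 0);     /* second request: coalesced, no new IPI */
        rendezvous_handler(0);  /* cpu0 drains both requests */
        printf("coalesced IPIs: %u\n", coalesced_ipi_count);
        return (0);
}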

Modified:
  user/avg/xcpu/sys/amd64/amd64/mp_machdep.c
  user/avg/xcpu/sys/kern/kern_shutdown.c
  user/avg/xcpu/sys/kern/subr_smp.c

Modified: user/avg/xcpu/sys/amd64/amd64/mp_machdep.c
==============================================================================
--- user/avg/xcpu/sys/amd64/amd64/mp_machdep.c	Mon May  9 07:13:08 2011	(r221687)
+++ user/avg/xcpu/sys/amd64/amd64/mp_machdep.c	Mon May  9 07:14:16 2011	(r221688)
@@ -1087,67 +1087,66 @@ SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_
     &ipi_masked_range_size, 0, "");
 #endif /* COUNT_XINVLTLB_HITS */
 
-/*
- * Flush the TLB on all other CPU's
- */
+struct tlb_shootdown_params {
+	u_int type;
+	vm_offset_t addr1;
+	vm_offset_t addr2;
+};
+
 static void
-smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+tlb_shootdown_action(void *arg)
 {
-	u_int ncpu;
+	struct tlb_shootdown_params *params;
+	vm_offset_t addr;
 
-	ncpu = mp_ncpus - 1;	/* does not shootdown self */
-	if (ncpu < 1)
-		return;		/* no other cpus */
-	if (!(read_rflags() & PSL_I))
-		panic("%s: interrupts disabled", __func__);
-	mtx_lock_spin(&smp_ipi_mtx);
-	smp_tlb_addr1 = addr1;
-	smp_tlb_addr2 = addr2;
-	atomic_store_rel_int(&smp_tlb_wait, 0);
-	ipi_all_but_self(vector);
-	while (smp_tlb_wait < ncpu)
-		ia32_pause();
-	mtx_unlock_spin(&smp_ipi_mtx);
+	params = (struct tlb_shootdown_params *)arg;
+	switch (params->type) {
+	case IPI_INVLCACHE:
+		wbinvd();
+		break;
+	case IPI_INVLTLB:
+		invltlb();
+		break;
+	case IPI_INVLPG:
+		invlpg(params->addr1);
+		break;
+	case IPI_INVLRNG:
+		for (addr = params->addr1; addr < params->addr2;
+		    addr += PAGE_SIZE)
+			invlpg(addr);
+		break;
+	default:
+		panic("Unknown TLB shootdown type %u", params->type);
+	}
 }
 
 static void
-smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector,
+    vm_offset_t addr1, vm_offset_t addr2)
 {
-	int ncpu, othercpus;
+	struct tlb_shootdown_params params;
 
-	othercpus = mp_ncpus - 1;
-	if (mask == (cpumask_t)-1) {
-		ncpu = othercpus;
-		if (ncpu < 1)
-			return;
-	} else {
-		mask &= ~PCPU_GET(cpumask);
-		if (mask == 0)
-			return;
-		ncpu = bitcount32(mask);
-		if (ncpu > othercpus) {
-			/* XXX this should be a panic offence */
-			printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",
-			    ncpu, othercpus);
-			ncpu = othercpus;
-		}
-		/* XXX should be a panic, implied by mask == 0 above */
-		if (ncpu < 1)
-			return;
-	}
+#if 0
 	if (!(read_rflags() & PSL_I))
 		panic("%s: interrupts disabled", __func__);
-	mtx_lock_spin(&smp_ipi_mtx);
-	smp_tlb_addr1 = addr1;
-	smp_tlb_addr2 = addr2;
-	atomic_store_rel_int(&smp_tlb_wait, 0);
-	if (mask == (cpumask_t)-1)
-		ipi_all_but_self(vector);
-	else
-		ipi_selected(mask, vector);
-	while (smp_tlb_wait < ncpu)
-		ia32_pause();
-	mtx_unlock_spin(&smp_ipi_mtx);
+#endif
+	params.type = vector;
+	params.addr1 = addr1;
+	params.addr2 = addr2;
+	smp_rendezvous_cpus(mask & all_cpus & ~(1 << curcpu),
+	    smp_no_rendevous_barrier, tlb_shootdown_action,
+	    smp_no_rendevous_barrier, &params);
+}
+
+/*
+ * Flush the TLB on all other CPU's
+ */
+static void
+smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
+{
+
+	smp_targeted_tlb_shootdown(all_cpus & ~(1 << curcpu),
+	    vector, addr1, addr2);
 }
 
 /*

Modified: user/avg/xcpu/sys/kern/kern_shutdown.c
==============================================================================
--- user/avg/xcpu/sys/kern/kern_shutdown.c	Mon May  9 07:13:08 2011	(r221687)
+++ user/avg/xcpu/sys/kern/kern_shutdown.c	Mon May  9 07:14:16 2011	(r221688)
@@ -509,26 +509,9 @@ shutdown_reset(void *junk, int howto)
 	printf("Rebooting...\n");
 	DELAY(1000000);	/* wait 1 sec for printf's to complete and be read */
 
-	/*
-	 * Acquiring smp_ipi_mtx here has a double effect:
-	 * - it disables interrupts avoiding CPU0 preemption
-	 *   by fast handlers (thus deadlocking  against other CPUs)
-	 * - it avoids deadlocks against smp_rendezvous() or, more 
-	 *   generally, threads busy-waiting, with this spinlock held,
-	 *   and waiting for responses by threads on other CPUs
-	 *   (ie. smp_tlb_shootdown()).
-	 *
-	 * For the !SMP case it just needs to handle the former problem.
-	 */
-#ifdef SMP
-	mtx_lock_spin(&smp_ipi_mtx);
-#else
 	spinlock_enter();
-#endif
-
-	/* cpu_boot(howto); */ /* doesn't do anything at the moment */
 	cpu_reset();
-	/* NOTREACHED */ /* assuming reset worked */
+	/* NOTREACHED */
 }
 
 /*

Modified: user/avg/xcpu/sys/kern/subr_smp.c
==============================================================================
--- user/avg/xcpu/sys/kern/subr_smp.c	Mon May  9 07:13:08 2011	(r221687)
+++ user/avg/xcpu/sys/kern/subr_smp.c	Mon May  9 07:14:16 2011	(r221688)
@@ -101,6 +101,10 @@ SYSCTL_INT(_kern_smp, OID_AUTO, topology
     "Topology override setting; 0 is default provided by hardware.");
 TUNABLE_INT("kern.smp.topology", &smp_topology);
 
+unsigned int coalesced_ipi_count;
+SYSCTL_INT(_kern_smp, OID_AUTO, coalesced_ipi_count, CTLFLAG_RD,
+    &coalesced_ipi_count, 0, "Count of coalesced SMP rendezvous IPIs");
+
 #ifdef SMP
 /* Enable forwarding of a signal to a process running on a different CPU */
 static int forward_signal_enabled = 1;
@@ -109,14 +113,20 @@ SYSCTL_INT(_kern_smp, OID_AUTO, forward_
 	   "Forwarding of a signal to a process on a different CPU");
 
 /* Variables needed for SMP rendezvous. */
-static volatile int smp_rv_ncpus;
-static void (*volatile smp_rv_setup_func)(void *arg);
-static void (*volatile smp_rv_action_func)(void *arg);
-static void (*volatile smp_rv_teardown_func)(void *arg);
-static void *volatile smp_rv_func_arg;
-static volatile int smp_rv_waiters[3];
+struct smp_rendezvous_data {
+	void (*smp_rv_setup_func)(void *arg);
+	void (*smp_rv_action_func)(void *arg);
+	void (*smp_rv_teardown_func)(void *arg);
+	void *smp_rv_func_arg;
+	volatile int smp_rv_waiters[2];
+	int smp_rv_ncpus;
+};
+
+static DPCPU_DEFINE(struct smp_rendezvous_data, smp_rv_data);
+static volatile DPCPU_DEFINE(cpumask_t, smp_rv_senders);
+static volatile DPCPU_DEFINE(cpumask_t, smp_rv_count);
 
-/* 
+/*
  * Shared mutex to restrict busywaits between smp_rendezvous() and
  * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
  * functions trigger at once and cause multiple CPUs to busywait with
@@ -397,39 +407,44 @@ unstop_cpus_hard(void)
  * Note that the supplied external functions _must_ be reentrant and aware
  * that they are running in parallel and in an unknown lock context.
  */
-void
-smp_rendezvous_action(void)
+static void
+smp_rendezvous_action_body(int cpu)
 {
-	void* local_func_arg = smp_rv_func_arg;
-	void (*local_setup_func)(void*)   = smp_rv_setup_func;
-	void (*local_action_func)(void*)   = smp_rv_action_func;
-	void (*local_teardown_func)(void*) = smp_rv_teardown_func;
-
-	/* Ensure we have up-to-date values. */
-	atomic_add_acq_int(&smp_rv_waiters[0], 1);
-	while (smp_rv_waiters[0] < smp_rv_ncpus)
-		cpu_spinwait();
+	volatile struct smp_rendezvous_data *rv;
+	void *local_func_arg;
+	void (*local_setup_func)(void*);
+	void (*local_action_func)(void*);
+	void (*local_teardown_func)(void*);
+	int ncpus;
+
+	rv = DPCPU_ID_PTR(cpu, smp_rv_data);
+	local_func_arg = rv->smp_rv_func_arg;
+	local_setup_func = rv->smp_rv_setup_func;
+	local_action_func = rv->smp_rv_action_func;
+	local_teardown_func = rv->smp_rv_teardown_func;
+	ncpus = rv->smp_rv_ncpus;
 
 	/* setup function */
 	if (local_setup_func != smp_no_rendevous_barrier) {
-		if (smp_rv_setup_func != NULL)
-			smp_rv_setup_func(smp_rv_func_arg);
+		if (local_setup_func != NULL)
+			local_setup_func(local_func_arg);
 
 		/* spin on entry rendezvous */
-		atomic_add_int(&smp_rv_waiters[1], 1);
-		while (smp_rv_waiters[1] < smp_rv_ncpus)
-                	cpu_spinwait();
+		atomic_add_int(&rv->smp_rv_waiters[0], 1);
+		while (rv->smp_rv_waiters[0] < ncpus)
+			cpu_spinwait();
 	}
 
 	/* action function */
 	if (local_action_func != NULL)
 		local_action_func(local_func_arg);
 
-	/* spin on exit rendezvous */
-	atomic_add_int(&smp_rv_waiters[2], 1);
+	atomic_add_int(&rv->smp_rv_waiters[1], 1);
 	if (local_teardown_func == smp_no_rendevous_barrier)
                 return;
-	while (smp_rv_waiters[2] < smp_rv_ncpus)
+
+	/* spin on exit rendezvous */
+	while (rv->smp_rv_waiters[1] < ncpus)
 		cpu_spinwait();
 
 	/* teardown function */
@@ -438,13 +453,95 @@ smp_rendezvous_action(void)
 }
 
 void
+smp_rendezvous_action(void)
+{
+	cpumask_t mask;
+	int pending;
+	int count;
+	int cpu;
+
+	pending = DPCPU_GET(smp_rv_count);
+	while (pending != 0) {
+		KASSERT(pending > 0, ("negative pending rendezvous count"));
+		mask = DPCPU_GET(smp_rv_senders);
+		if (mask == 0) {
+			cpu_spinwait();
+			continue;
+		}
+
+		atomic_clear_acq_int(DPCPU_PTR(smp_rv_senders), mask);
+		count = 0;
+		do {
+			count++;
+			cpu = ffs(mask) - 1;
+			mask &= ~(1 << cpu);
+			smp_rendezvous_action_body(cpu);
+		} while (mask != 0);
+
+		pending = atomic_fetchadd_int(DPCPU_PTR(smp_rv_count), -count);
+		pending -= count;
+	}
+}
+
+static void
+smp_rendezvous_wait(void)
+{
+	volatile struct smp_rendezvous_data *rv;
+	int ncpus;
+
+	rv = DPCPU_PTR(smp_rv_data);
+	ncpus = rv->smp_rv_ncpus;
+
+	while (atomic_load_acq_int(&rv->smp_rv_waiters[1]) < ncpus) {
+		/* check for incoming events */
+		if ((stopping_cpus & (1 << curcpu)) != 0)
+			cpustop_handler();
+		else if (DPCPU_GET(smp_rv_senders) != 0)
+			smp_rendezvous_action();
+		else
+			cpu_spinwait();
+	}
+}
+
+/*
+ * Execute the action_func on the targeted CPUs.
+ *
+ * setup_func:
+ * - if a function pointer is given, then first execute the function;
+ *   only after the function is executed on all targeted can they proceed
+ *   to the next step;
+ * - if NULL is given, this is equivalent to specifying a pointer to an
+ *   empty function; as such there is no actual setup function, but all
+ *   targeted CPUs proceed to the next step at about the same time;
+ * - smp_no_rendevous_barrier is a special value that signifies that there
+ *   is no setup function nor the targeted CPUs should wait for anything
+ *   before proceeding to the next step.
+ *
+ * action_func:
+ * - a function to be executed on the targeted CPUs;
+ *   NULL is equivalent to specifying a pointer to an empty function.
+ *
+ * teardown_func:
+ * - if a function pointer is given, then first wait for all targeted CPUs
+ *   to complete execution of action_func, then execute this function;
+ * - if NULL is given, this is equivalent to specifying a pointer to an
+ *   empty function; as such there is no actual teardown action, but all
+ *   targeted CPUs wait for each other to complete execution of action_func;
+ * - smp_no_rendevous_barrier is a special value that signifies that there
+ *   is no teardown function nor the targeted CPUs should wait for anything
+ *   after completing action_func.
+ */
+void
 smp_rendezvous_cpus(cpumask_t map,
 	void (* setup_func)(void *), 
 	void (* action_func)(void *),
 	void (* teardown_func)(void *),
 	void *arg)
 {
-	int i, ncpus = 0;
+	volatile struct smp_rendezvous_data *rv;
+	cpumask_t tmp;
+	int ncpus;
+	int cpu;
 
 	if (!smp_started) {
 		if (setup_func != NULL)
@@ -456,39 +553,66 @@ smp_rendezvous_cpus(cpumask_t map,
 		return;
 	}
 
-	CPU_FOREACH(i) {
-		if (((1 << i) & map) != 0)
-			ncpus++;
+	map &= all_cpus;
+	tmp = map;
+	ncpus = 0;
+	while (tmp != 0) {
+		cpu = ffs(tmp) - 1;
+		tmp &= ~(1 << cpu);
+		ncpus++;
 	}
-	if (ncpus == 0)
-		panic("ncpus is 0 with map=0x%x", map);
 
-	/* obtain rendezvous lock */
-	mtx_lock_spin(&smp_ipi_mtx);
+	spinlock_enter();
+
+	/*
+	 * First wait for an event previously posted by us to complete (if any),
+	 * this is done in case the event was asynchronous.
+	 * In the future we could have a queue of outgoing events instead
+	 * of a single item.
+	 */
+	smp_rendezvous_wait();
 
 	/* set static function pointers */
-	smp_rv_ncpus = ncpus;
-	smp_rv_setup_func = setup_func;
-	smp_rv_action_func = action_func;
-	smp_rv_teardown_func = teardown_func;
-	smp_rv_func_arg = arg;
-	smp_rv_waiters[1] = 0;
-	smp_rv_waiters[2] = 0;
-	atomic_store_rel_int(&smp_rv_waiters[0], 0);
+	rv = DPCPU_PTR(smp_rv_data);
+	rv->smp_rv_ncpus = ncpus;
+	rv->smp_rv_setup_func = setup_func;
+	rv->smp_rv_action_func = action_func;
+	rv->smp_rv_teardown_func = teardown_func;
+	rv->smp_rv_func_arg = arg;
+	rv->smp_rv_waiters[1] = 0;
+	atomic_store_rel_int(&rv->smp_rv_waiters[0], 0);
+
+	/* signal other CPUs, which will enter the IPI with interrupts off */
+	tmp = map;
+	while (tmp != 0) {
+		cpu = ffs(tmp) - 1;
+		tmp &= ~(1 << cpu);
+
+		if (cpu == curcpu)
+			continue;
+
+		KASSERT(
+		    (DPCPU_ID_GET(cpu, smp_rv_senders) & (1 << curcpu)) == 0,
+		    ("curcpu bit is set in target cpu's senders map"));
+
+		/* if we are the first to send an event, then send an ipi */
+		if (atomic_fetchadd_int(DPCPU_ID_PTR(cpu, smp_rv_count), 1)
+		    == 0)
+			ipi_cpu(cpu, IPI_RENDEZVOUS);
+		else
+			coalesced_ipi_count++;
 
-	/* signal other processors, which will enter the IPI with interrupts off */
-	ipi_selected(map & ~(1 << curcpu), IPI_RENDEZVOUS);
+		atomic_set_rel_int(DPCPU_ID_PTR(cpu, smp_rv_senders),
+		    1 << curcpu);
+	}
 
 	/* Check if the current CPU is in the map */
 	if ((map & (1 << curcpu)) != 0)
-		smp_rendezvous_action();
-
+		smp_rendezvous_action_body(curcpu);
 	if (teardown_func == smp_no_rendevous_barrier)
-		while (atomic_load_acq_int(&smp_rv_waiters[2]) < ncpus)
-			cpu_spinwait();
+		smp_rendezvous_wait();
 
-	/* release lock */
-	mtx_unlock_spin(&smp_ipi_mtx);
+	spinlock_exit();
 }
 
 void

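
The comment block added above smp_rendezvous_cpus() spells out the
setup_func/action_func/teardown_func semantics.  A minimal usage sketch under
those semantics follows; it is not part of this commit, and struct rv_example
and mark_cpu_ran() are invented for illustration.

/* Usage sketch only: run an action on every CPU and wait for completion. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pcpu.h>
#include <sys/smp.h>

struct rv_example {
        int ran[MAXCPU];        /* set to 1 by each CPU that ran the action */
};

static void
mark_cpu_ran(void *arg)
{
        struct rv_example *ex = arg;

        ex->ran[curcpu] = 1;
}

static void
run_on_all_cpus(struct rv_example *ex)
{
        /*
         * smp_no_rendevous_barrier as setup_func: no setup work and no
         * entry barrier.  NULL as teardown_func: no teardown work, but
         * all targeted CPUs wait for each other to finish the action,
         * so ex->ran[] is fully populated once the call returns.
         */
        smp_rendezvous_cpus(all_cpus, smp_no_rendevous_barrier,
            mark_cpu_ran, NULL, ex);
}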
