Issue with epoch_drain_callbacks and unloading iavf(4) [using iflib]

Mark Johnston markj at freebsd.org
Thu Jan 30 03:09:21 UTC 2020


On Thu, Jan 30, 2020 at 02:12:05AM +0100, Hans Petter Selasky wrote:
> On 2020-01-29 22:44, Eric Joyner wrote:
> > On Wed, Jan 29, 2020 at 1:41 PM Hans Petter Selasky <hps at selasky.org> wrote:
> > 
> > > On 2020-01-29 22:30, Eric Joyner wrote:
> > > > Hi freebsd-net,
> > > > 
> > > > We've encountered an issue with unloading the iavf(4) driver on FreeBSD
> > > > 12.1 (and stable). On a VM with two iavf(4) interfaces, if we send heavy
> > > > traffic to iavf1 and try to kldunload the driver, the kldunload process
> > > > hangs on iavf0 until iavf1 stops receiving traffic.
> > > > 
> > > > After some debugging, it looks like epoch_drain_callbacks() [via
> > > > if_detach_internal()] tries to switch CPUs to run on one that iavf1 is
> > > > using for RX processing, but since iavf1 is busy, it can't make the
> > > > switch, so cpu_switch() just hangs and nothing happens until iavf1's
> > > > RX thread stops being busy.
> > > > 
> > > > I can work around this by inserting a kern_yield(PRI_USER) somewhere
> > > > in one of the iavf txrx functions that iflib calls into (e.g.
> > > > iavf_isc_rxd_available), but that's not a proper fix. Does anyone know
> > > > what to do to prevent this from happening?
> > > > 
> > > > Wildly guessing, does maybe epoch_drain_callbacks() need a higher
> > > > priority than the PI_SOFT used in the group taskqueues used in
> > > > iflib's RX processing?
> > > > 
> > > 
> > > Hi,
> > > 
> > > Which scheduler is this? ULE or 4BSD?
> > > 
> > > EPOCH(9) expects some level of round-robin scheduling on the same
> > > priority level. Setting a higher priority on EPOCH(9) might cause epoch
> > > to start spinning without letting the lower-priority thread that holds
> > > the EPOCH() section finish.
> > > 
> > > --HPS
> > > 
> > > 
> > Hi Hans,
> > 
> > kern.sched.name gives me "ULE"
> > 
> 
> Hi Eric,
> 
> epoch_drain_callbacks() depends on epoch_call_task() getting execution
> time; that task is run from a GTASKQUEUE at PI_SOFT. Also,
> epoch_drain_callbacks() runs at the priority of the calling thread, and if
> that is lower than PI_SOFT and a gtaskqueue is spinning heavily, then this
> won't work.
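
(As an aside, the kern_yield(PRI_USER) hack Eric describes above amounts to
something like the sketch below.  The method signature follows iflib's
ift_rxd_available hook from memory, iavf_count_clean_descriptors() is a
made-up placeholder for the driver's real descriptor scan, and the include
list is approximate, so treat it as illustrative only.)

#include <sys/param.h>
#include <sys/proc.h>		/* kern_yield() */
#include <sys/priority.h>	/* PRI_USER */
#include <net/iflib.h>		/* qidx_t */

static int
iavf_isc_rxd_available(void *arg, uint16_t rxqid, qidx_t idx, qidx_t budget)
{
	int count;

	/* Placeholder for the driver's real descriptor-count logic. */
	count = iavf_count_clean_descriptors(arg, rxqid, idx, budget);

	/*
	 * XXX hack: briefly give up the CPU so that a lower-priority
	 * thread stuck in kldunload -> epoch_drain_callbacks() can make
	 * progress on this CPU.
	 */
	kern_yield(PRI_USER);

	return (count);
}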

I think we can fix this so that the epoch_drain_callbacks() caller's
priority does not matter.  Eric, can you try the patch below?  It is a
bit hacky, but the idea is to ensure that all pending callbacks can be
invoked, and then wait for each cb task to run through at least once.  In
your case I think the callback task threads should still be able to run.

> For a single CPU system you will be toast in this situation regardless if
> there is no free time on a CPU for EPOCH().
>
> In general if epoch_call_task() doesn't get execution time, you will have a
> problem.

That's not the issue here, though there is at least one related problem:
if a thread in an epoch read section is preempted and the CPU is
consumed by a high-priority receive-processing thread, for example due
to a DoS, we have no mechanism to boost the priority of the preempted
thread except by calling epoch_synchronize_wait().  In other words,
nothing guarantees that deferred callbacks will eventually get executed
in such a scenario.

diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c
index 0a477e1d6f7b..4ed7e7e11a3e 100644
--- a/sys/kern/subr_epoch.c
+++ b/sys/kern/subr_epoch.c
@@ -74,15 +74,19 @@ typedef struct epoch_record {
 	volatile struct epoch_tdlist er_tdlist;
 	volatile uint32_t er_gen;
 	uint32_t er_cpuid;
+	int er_drain_state;
 } __aligned(EPOCH_ALIGN)     *epoch_record_t;
 
+#define	EPOCH_DRAIN_START	2
+#define	EPOCH_DRAIN_RUNNING	1
+#define	EPOCH_DRAIN_DONE	0
+
 struct epoch {
 	struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
 	epoch_record_t e_pcpu_record;
 	int	e_idx;
 	int	e_flags;
 	struct sx e_drain_sx;
-	struct mtx e_drain_mtx;
 	volatile int e_drain_count;
 	const char *e_name;
 };
@@ -335,7 +339,6 @@ epoch_alloc(const char *name, int flags)
 	epoch->e_idx = epoch_count;
 	epoch->e_name = name;
 	sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
-	mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
 	allepochs[epoch_count++] = epoch;
 	return (epoch);
 }
@@ -348,7 +351,6 @@ epoch_free(epoch_t epoch)
 	allepochs[epoch->e_idx] = NULL;
 	epoch_wait(global_epoch);
 	uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
-	mtx_destroy(&epoch->e_drain_mtx);
 	sx_destroy(&epoch->e_drain_sx);
 	free(epoch, M_EPOCH);
 }
@@ -699,14 +701,24 @@ epoch_call_task(void *arg __unused)
 	epoch_t epoch;
 	ck_stack_t cb_stack;
 	int i, npending, total;
+	bool draining;
+
+	KASSERT(curthread->td_pinned > 0,
+	    ("%s: callback task thread is not pinned", __func__));
 
 	ck_stack_init(&cb_stack);
 	critical_enter();
 	epoch_enter(global_epoch);
-	for (total = i = 0; i < epoch_count; i++) {
+	for (total = i = 0, draining = false; i < epoch_count; i++) {
 		if (__predict_false((epoch = allepochs[i]) == NULL))
 			continue;
 		er = epoch_currecord(epoch);
+		if (atomic_load_int(&er->er_drain_state) == EPOCH_DRAIN_START) {
+			atomic_store_int(&er->er_drain_state,
+			    EPOCH_DRAIN_RUNNING);
+			draining = true;
+		}
+
 		record = &er->er_record;
 		if ((npending = record->n_pending) == 0)
 			continue;
@@ -728,6 +740,20 @@ epoch_call_task(void *arg __unused)
 		next = CK_STACK_NEXT(cursor);
 		entry->function(entry);
 	}
+
+	if (__predict_false(draining)) {
+		epoch_enter(global_epoch);
+		for (i = 0; i < epoch_count; i++) {
+			if (__predict_false((epoch = allepochs[i]) == NULL))
+				continue;
+			er = epoch_currecord(epoch);
+			if (atomic_load_int(&er->er_drain_state) ==
+			    EPOCH_DRAIN_RUNNING)
+				atomic_store_int(&er->er_drain_state,
+				    EPOCH_DRAIN_DONE);
+		}
+		epoch_exit(global_epoch);
+	}
 }
 
 int
@@ -769,27 +795,18 @@ in_epoch(epoch_t epoch)
 }
 
 static void
-epoch_drain_cb(struct epoch_context *ctx)
+epoch_drain_handler(struct ck_epoch *global __unused,
+    ck_epoch_record_t *cr __unused, void *arg __unused)
 {
-	struct epoch *epoch =
-	    __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
-
-	if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
-		mtx_lock(&epoch->e_drain_mtx);
-		wakeup(epoch);
-		mtx_unlock(&epoch->e_drain_mtx);
-	}
+	maybe_yield();
 }
 
 void
 epoch_drain_callbacks(epoch_t epoch)
 {
 	epoch_record_t er;
-	struct thread *td;
-	int was_bound;
-	int old_pinned;
-	int old_cpu;
-	int cpu;
+	int cpu, state;
+	bool pending;
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "epoch_drain_callbacks() may sleep!");
@@ -802,45 +819,28 @@ epoch_drain_callbacks(epoch_t epoch)
 		return;
 #endif
 	DROP_GIANT();
-
 	sx_xlock(&epoch->e_drain_sx);
-	mtx_lock(&epoch->e_drain_mtx);
 
-	td = curthread;
-	thread_lock(td);
-	old_cpu = PCPU_GET(cpuid);
-	old_pinned = td->td_pinned;
-	was_bound = sched_is_bound(td);
-	sched_unbind(td);
-	td->td_pinned = 0;
+	/* Make sure that all pending callbacks are available. */
+	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_drain_handler, NULL);
 
-	CPU_FOREACH(cpu)
-		epoch->e_drain_count++;
 	CPU_FOREACH(cpu) {
 		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
-		sched_bind(td, cpu);
-		epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
+		atomic_store_int(&er->er_drain_state, EPOCH_DRAIN_START);
+		GROUPTASK_ENQUEUE(DPCPU_ID_PTR(cpu, epoch_cb_task));
 	}
 
-	/* restore CPU binding, if any */
-	if (was_bound != 0) {
-		sched_bind(td, old_cpu);
-	} else {
-		/* get thread back to initial CPU, if any */
-		if (old_pinned != 0)
-			sched_bind(td, old_cpu);
-		sched_unbind(td);
-	}
-	/* restore pinned after bind */
-	td->td_pinned = old_pinned;
-
-	thread_unlock(td);
-
-	while (epoch->e_drain_count != 0)
-		msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
+	do {
+		pending = false;
+		CPU_FOREACH(cpu) {
+			er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
+			state = atomic_load_int(&er->er_drain_state);
+			if (state != EPOCH_DRAIN_DONE)
+				pending = true;
+		}
+		pause("edrain", 1);
+	} while (pending);
 
-	mtx_unlock(&epoch->e_drain_mtx);
 	sx_xunlock(&epoch->e_drain_sx);
-
 	PICKUP_GIANT();
 }
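
For completeness, here is a rough sketch of the usage pattern this interface
exists for, i.e. why a module has to drain before it can be safely unloaded.
None of this is iavf code; the names are invented, the include list is
approximate, and the epoch_call() argument order simply follows the tree
this patch is against.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>
#include <sys/malloc.h>

static epoch_t foo_epoch;

struct foo_item {
	struct epoch_context fi_epoch_ctx;
	int fi_value;
};

static void
foo_init(void)
{
	foo_epoch = epoch_alloc("foo", EPOCH_PREEMPT);
}

static void
foo_item_free_cb(epoch_context_t ctx)
{
	struct foo_item *fi;

	fi = __containerof(ctx, struct foo_item, fi_epoch_ctx);
	free(fi, M_TEMP);
}

static void
foo_item_retire(struct foo_item *fi)
{
	/* Defer the free until all current epoch readers have left. */
	epoch_call(foo_epoch, &foo_item_free_cb, &fi->fi_epoch_ctx);
}

static void
foo_unload(void)
{
	/*
	 * All deferred foo_item_free_cb() calls must have run before the
	 * module text and foo_epoch can go away; this is the call whose
	 * behaviour the patch changes.
	 */
	epoch_drain_callbacks(foo_epoch);
	epoch_free(foo_epoch);
}

With the patch, the drain only requires the per-CPU callback tasks to get
CPU time; the thread calling epoch_drain_callbacks() no longer has to bind
itself to every CPU in turn.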

