git: 56dc95b249dc - stable/12 - Revert "if_epair: rework"

From: Kristof Provost <kp_at_FreeBSD.org>
Date: Thu, 24 Mar 2022 13:14:36 UTC
The branch stable/12 has been updated by kp:

URL: https://cgit.FreeBSD.org/src/commit/?id=56dc95b249dceb30367a77dccd0231cbb08dc1f7

commit 56dc95b249dceb30367a77dccd0231cbb08dc1f7
Author:     Kristof Provost <kp@FreeBSD.org>
AuthorDate: 2022-03-21 14:41:32 +0000
Commit:     Kristof Provost <kp@FreeBSD.org>
CommitDate: 2022-03-24 09:44:40 +0000

    Revert "if_epair: rework"
    
    Revert the recent performance rework of if_epair. It relies on functions like
    atomic_testandclear_long() which are not available on all platforms in
    stable/12.
    
    This reverts commits b1a3f8dccb6203036b7ee81201fd5b5a8de36f0d,
    fb3644ab2afe777fdd2539bc996a390443f052f1,
    ca7af63e88f8cc96865d45e020a57b3062631388,
    092da35a0d80af7a3e5c5c22cbeddb6cffbd9524,
    and 7c2b681b33fc78ed06c7e9e65eeebb2ab5420586.
    
    This is a direct commit to stable/12.
---
 sys/modules/if_epair/Makefile |   2 +-
 sys/net/if_epair.c            | 832 ++++++++++++++++++++++++++----------------
 2 files changed, 509 insertions(+), 325 deletions(-)

diff --git a/sys/modules/if_epair/Makefile b/sys/modules/if_epair/Makefile
index 8b063623f2e8..3e102413bfe2 100644
--- a/sys/modules/if_epair/Makefile
+++ b/sys/modules/if_epair/Makefile
@@ -3,6 +3,6 @@
 .PATH: ${SRCTOP}/sys/net
 
 KMOD=	if_epair
-SRCS=	bus_if.h device_if.h if_epair.c opt_rss.h opt_inet.h opt_inet6.h
+SRCS=	bus_if.h device_if.h if_epair.c
 
 .include <bsd.kmod.mk>
diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c
index 4b01e97c354d..cd11036ad028 100644
--- a/sys/net/if_epair.c
+++ b/sys/net/if_epair.c
@@ -2,8 +2,8 @@
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008 The FreeBSD Foundation
+ * Copyright (c) 2009-2010 Bjoern A. Zeeb <bz@FreeBSD.org>
  * All rights reserved.
- * Copyright (c) 2009-2021 Bjoern A. Zeeb <bz@FreeBSD.org>
  *
  * This software was developed by CK Software GmbH under sponsorship
  * from the FreeBSD Foundation.
@@ -37,14 +37,21 @@
  * This is mostly intended to be used to provide connectivity between
  * different virtual network stack instances.
  */
+/*
+ * Things to re-think once we have more experience:
+ * - ifp->if_reassign function once we can test with vimage. Depending on
+ *   how if_vmove() is going to be improved.
+ * - Real random etheraddrs that are checked to be uniquish; we would need
+ *   to re-do them in case we move the interface between network stacks
+ *   in a private if_reassign function.
+ *   In case we bridge to a real interface/network or between independent
+ *   epairs on multiple stacks/machines, we may need this.
+ *   For now let the user handle that case.
+ */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_rss.h"
-#include "opt_inet.h"
-#include "opt_inet6.h"
-
 #include <sys/param.h>
 #include <sys/hash.h>
 #include <sys/jail.h>
@@ -54,16 +61,13 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/proc.h>
+#include <sys/refcount.h>
 #include <sys/queue.h>
-#include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
-#include <sys/taskqueue.h>
+#include <sys/sysctl.h>
 #include <sys/types.h>
-#include <sys/buf_ring.h>
-#include <sys/bus.h>
-#include <sys/interrupt.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
@@ -74,66 +78,121 @@ __FBSDID("$FreeBSD$");
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
-#ifdef RSS
-#include <net/rss_config.h>
-#ifdef INET
-#include <netinet/in_rss.h>
-#endif
-#ifdef INET6
-#include <netinet6/in6_rss.h>
-#endif
-#endif
 #include <net/vnet.h>
 
+SYSCTL_DECL(_net_link);
+static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl");
+
+#ifdef EPAIR_DEBUG
+static int epair_debug = 0;
+SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW,
+    &epair_debug, 0, "if_epair(4) debugging.");
+#define	DPRINTF(fmt, arg...)						\
+	if (epair_debug)						\
+		printf("[%s:%d] " fmt, __func__, __LINE__, ##arg)
+#else
+#define	DPRINTF(fmt, arg...)
+#endif
+
+static void epair_nh_sintr(struct mbuf *);
+static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *);
+static void epair_nh_drainedcpu(u_int);
+
+static void epair_start_locked(struct ifnet *);
+static int epair_media_change(struct ifnet *);
+static void epair_media_status(struct ifnet *, struct ifmediareq *);
+
 static int epair_clone_match(struct if_clone *, const char *);
 static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static int epair_clone_destroy(struct if_clone *, struct ifnet *);
 
 static const char epairname[] = "epair";
-#define	RXRSIZE	4096	/* Probably overkill by 4-8x. */
+static unsigned int next_index = 0;
 
-static MALLOC_DEFINE(M_EPAIR, epairname,
-    "Pair of virtual cross-over connected Ethernet-like interfaces");
+/* Netisr related definitions and sysctl. */
+static struct netisr_handler epair_nh = {
+	.nh_name	= epairname,
+	.nh_proto	= NETISR_EPAIR,
+	.nh_policy	= NETISR_POLICY_CPU,
+	.nh_handler	= epair_nh_sintr,
+	.nh_m2cpuid	= epair_nh_m2cpuid,
+	.nh_drainedcpu	= epair_nh_drainedcpu,
+};
 
-VNET_DEFINE_STATIC(struct if_clone *, epair_cloner);
-#define	V_epair_cloner	VNET(epair_cloner)
+static int
+sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
+{
+	int error, qlimit;
 
-static unsigned int next_index = 0;
-#define	EPAIR_LOCK_INIT()		mtx_init(&epair_n_index_mtx, "epairidx", \
-					    NULL, MTX_DEF)
-#define	EPAIR_LOCK_DESTROY()		mtx_destroy(&epair_n_index_mtx)
-#define	EPAIR_LOCK()			mtx_lock(&epair_n_index_mtx)
-#define	EPAIR_UNLOCK()			mtx_unlock(&epair_n_index_mtx)
-
-#define BIT_QUEUE_TASK		0
-#define BIT_MBUF_QUEUED		1
-
-struct epair_softc;
-struct epair_queue {
-	int			 id;
-	struct buf_ring		*rxring[2];
-	volatile int		 ridx;		/* 0 || 1 */
-	volatile long		 state;		/* taskqueue coordination */
-	struct task		 tx_task;
-	struct epair_softc	*sc;
-};
+	netisr_getqlimit(&epair_nh, &qlimit);
+	error = sysctl_handle_int(oidp, &qlimit, 0, req);
+	if (error || !req->newptr)
+		return (error);
+	if (qlimit < 1)
+		return (EINVAL);
+	return (netisr_setqlimit(&epair_nh, qlimit));
+}
+SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
+    0, 0, sysctl_epair_netisr_maxqlen, "I",
+    "Maximum if_epair(4) netisr \"hw\" queue length");
 
-static struct mtx epair_n_index_mtx;
 struct epair_softc {
-	struct ifnet		*ifp;		/* This ifp. */
-	struct ifnet		*oifp;		/* other ifp of pair. */
-	int			 num_queues;
-	struct epair_queue	*queues;
-	struct ifmedia		 media;		/* Media config (fake). */
-	STAILQ_ENTRY(epair_softc) entry;
+	struct ifnet	*ifp;		/* This ifp. */
+	struct ifnet	*oifp;		/* other ifp of pair. */
+	struct ifmedia	media;		/* Media config (fake). */
+	u_int		refcount;	/* # of mbufs in flight. */
+	u_int		cpuid;		/* CPU ID assigned upon creation. */
+	void		(*if_qflush)(struct ifnet *);
+					/* Original if_qflush routine. */
 };
 
-struct epair_tasks_t {
-	int			 tasks;
-	struct taskqueue	 *tq[MAXCPU];
+/*
+ * Per-CPU list of ifps with data in the ifq that needs to be flushed
+ * to the netisr ``hw'' queue before we allow any further direct queuing
+ * to the ``hw'' queue.
+ */
+struct epair_ifp_drain {
+	STAILQ_ENTRY(epair_ifp_drain)	ifp_next;
+	struct ifnet			*ifp;
 };
+STAILQ_HEAD(eid_list, epair_ifp_drain);
+
+#define	EPAIR_LOCK_INIT(dpcpu)		mtx_init(&(dpcpu)->if_epair_mtx, \
+					    "if_epair", NULL, MTX_DEF)
+#define	EPAIR_LOCK_DESTROY(dpcpu)	mtx_destroy(&(dpcpu)->if_epair_mtx)
+#define	EPAIR_LOCK_ASSERT(dpcpu)	mtx_assert(&(dpcpu)->if_epair_mtx, \
+					    MA_OWNED)
+#define	EPAIR_LOCK(dpcpu)		mtx_lock(&(dpcpu)->if_epair_mtx)
+#define	EPAIR_UNLOCK(dpcpu)		mtx_unlock(&(dpcpu)->if_epair_mtx)
+
+#ifdef INVARIANTS
+#define	EPAIR_REFCOUNT_INIT(r, v)	refcount_init((r), (v))
+#define	EPAIR_REFCOUNT_AQUIRE(r)	refcount_acquire((r))
+#define	EPAIR_REFCOUNT_RELEASE(r)	refcount_release((r))
+#define	EPAIR_REFCOUNT_ASSERT(a, p)	KASSERT(a, p)
+#else
+#define	EPAIR_REFCOUNT_INIT(r, v)
+#define	EPAIR_REFCOUNT_AQUIRE(r)
+#define	EPAIR_REFCOUNT_RELEASE(r)
+#define	EPAIR_REFCOUNT_ASSERT(a, p)
+#endif
 
-static struct epair_tasks_t epair_tasks;
+static MALLOC_DEFINE(M_EPAIR, epairname,
+    "Pair of virtual cross-over connected Ethernet-like interfaces");
+
+VNET_DEFINE_STATIC(struct if_clone *, epair_cloner);
+#define	V_epair_cloner	VNET(epair_cloner)
+
+/*
+ * DPCPU area and functions.
+ */
+struct epair_dpcpu {
+	struct mtx	if_epair_mtx;		/* Per-CPU locking. */
+	int		epair_drv_flags;	/* Per-CPU ``hw'' drv flags. */
+	struct eid_list	epair_ifp_drain_list;	/* Per-CPU list of ifps with
+						 * data in the ifq. */
+};
+DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu);
 
 static void
 epair_clear_mbuf(struct mbuf *m)
@@ -142,199 +201,313 @@ epair_clear_mbuf(struct mbuf *m)
 }
 
 static void
-epair_if_input(struct epair_softc *sc, struct epair_queue *q, int ridx)
+epair_dpcpu_init(void)
 {
-	struct epoch_tracker et;
-	struct ifnet *ifp;
-	struct mbuf *m;
+	struct epair_dpcpu *epair_dpcpu;
+	struct eid_list *s;
+	u_int cpuid;
+
+	CPU_FOREACH(cpuid) {
+		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+
+		/* Initialize per-cpu lock. */
+		EPAIR_LOCK_INIT(epair_dpcpu);
+
+		/* Driver flags are per-cpu as are our netisr "hw" queues. */
+		epair_dpcpu->epair_drv_flags = 0;
+
+		/*
+		 * Initialize per-cpu drain list.
+		 * Manually do what STAILQ_HEAD_INITIALIZER would do.
+		 */
+		s = &epair_dpcpu->epair_ifp_drain_list;
+		s->stqh_first = NULL;
+		s->stqh_last = &s->stqh_first;
+	} 
+}
 
-	ifp = sc->ifp;
-	NET_EPOCH_ENTER_ET(et);
-	CURVNET_SET(ifp->if_vnet);
-	while (! buf_ring_empty(q->rxring[ridx])) {
-		m = buf_ring_dequeue_mc(q->rxring[ridx]);
-		if (m == NULL)
-			continue;
+static void
+epair_dpcpu_detach(void)
+{
+	struct epair_dpcpu *epair_dpcpu;
+	u_int cpuid;
 
-		MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
-		(*ifp->if_input)(ifp, m);
+	CPU_FOREACH(cpuid) {
+		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+
+		/* Destroy per-cpu lock. */
+		EPAIR_LOCK_DESTROY(epair_dpcpu);
 	}
-	CURVNET_RESTORE();
-	NET_EPOCH_EXIT_ET(et);
 }
 
-static void
-epair_tx_start_deferred(void *arg, int pending)
+/*
+ * Helper functions.
+ */
+static u_int
+cpuid_from_ifp(struct ifnet *ifp)
 {
-	struct epair_queue *q = (struct epair_queue *)arg;
-	struct epair_softc *sc = q->sc;
-	int ridx, nidx;
-
-	if_ref(sc->ifp);
-	ridx = atomic_load_int(&q->ridx);
-	do {
-		nidx = (ridx == 0) ? 1 : 0;
-	} while (!atomic_fcmpset_int(&q->ridx, &ridx, nidx));
-	epair_if_input(sc, q, ridx);
-
-	atomic_clear_long(&q->state, (1 << BIT_QUEUE_TASK));
-	if (atomic_testandclear_long(&q->state, BIT_MBUF_QUEUED))
-		taskqueue_enqueue(epair_tasks.tq[q->id], &q->tx_task);
-
-	if_rele(sc->ifp);
+	struct epair_softc *sc;
+
+	if (ifp == NULL)
+		return (0);
+	sc = ifp->if_softc;
+
+	return (sc->cpuid);
 }
 
-static int
-epair_menq(struct mbuf *m, struct epair_softc *osc)
+/*
+ * Netisr handler functions.
+ */
+static void
+epair_nh_sintr(struct mbuf *m)
 {
-	struct ifnet *ifp, *oifp;
-	int len, ret;
-	int ridx;
-	short mflags;
-	struct epair_queue *q = NULL;
-	uint32_t bucket;
-#ifdef RSS
-	struct ether_header *eh;
-#endif
+	struct ifnet *ifp;
+	struct epair_softc *sc __unused;
 
-	/*
-	 * I know this looks weird. We pass the "other sc" as we need that one
-	 * and can get both ifps from it as well.
-	 */
-	oifp = osc->ifp;
-	ifp = osc->oifp;
+	ifp = m->m_pkthdr.rcvif;
+	(*ifp->if_input)(ifp, m);
+	sc = ifp->if_softc;
+	EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+	EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+	    ("%s: ifp=%p sc->refcount not >= 1: %d",
+	    __func__, ifp, sc->refcount));
+	DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount);
+}
 
-	M_ASSERTPKTHDR(m);
-	epair_clear_mbuf(m);
-	if_setrcvif(m, oifp);
-	M_SETFIB(m, oifp->if_fib);
+static struct mbuf *
+epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
+{
 
-	/* Save values as once the mbuf is queued, it's not ours anymore. */
-	len = m->m_pkthdr.len;
-	mflags = m->m_flags;
+	*cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif);
 
-	MPASS(m->m_nextpkt == NULL);
-	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+	return (m);
+}
 
-#ifdef RSS
-	ret = rss_m2bucket(m, &bucket);
-	if (ret) {
-		/* Actually hash the packet. */
-		eh = mtod(m, struct ether_header *);
+static void
+epair_nh_drainedcpu(u_int cpuid)
+{
+	struct epair_dpcpu *epair_dpcpu;
+	struct epair_ifp_drain *elm, *tvar;
+	struct ifnet *ifp;
 
-		switch (ntohs(eh->ether_type)) {
-#ifdef INET
-		case ETHERTYPE_IP:
-			rss_soft_m2cpuid_v4(m, 0, &bucket);
-			break;
-#endif
-#ifdef INET6
-		case ETHERTYPE_IPV6:
-			rss_soft_m2cpuid_v6(m, 0, &bucket);
-			break;
-#endif
-		default:
-			bucket = 0;
+	epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+	EPAIR_LOCK(epair_dpcpu);
+	/*
+	 * Assume our "hw" queue and possibly ifq will be emptied
+	 * again. In case we will overflow the "hw" queue while
+	 * draining, epair_start_locked will set IFF_DRV_OACTIVE
+	 * again and we will stop and return.
+	 */
+	STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
+	    ifp_next, tvar) {
+		ifp = elm->ifp;
+		epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE;
+		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+		epair_start_locked(ifp);
+
+		IFQ_LOCK(&ifp->if_snd);
+		if (IFQ_IS_EMPTY(&ifp->if_snd)) {
+			struct epair_softc *sc __unused;
+
+			STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list,
+			    elm, epair_ifp_drain, ifp_next);
+			/* The cached ifp goes off the list. */
+			sc = ifp->if_softc;
+			EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+			EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+			    ("%s: ifp=%p sc->refcount not >= 1: %d",
+			    __func__, ifp, sc->refcount));
+			free(elm, M_EPAIR);
+		}
+		IFQ_UNLOCK(&ifp->if_snd);
+
+		if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) {
+			/* Our "hw"q overflowed again. */
+			epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
+			DPRINTF("hw queue length overflow at %u\n",
+			    epair_nh.nh_qlimit);
 			break;
 		}
 	}
-	bucket %= osc->num_queues;
-#else
-	bucket = 0;
-#endif
-	q = &osc->queues[bucket];
-
-	atomic_set_long(&q->state, (1 << BIT_MBUF_QUEUED));
-	ridx = atomic_load_int(&q->ridx);
-	ret = buf_ring_enqueue(q->rxring[ridx], m);
-	if (ret != 0) {
-		/* Ring is full. */
-		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
-		m_freem(m);
-		return (0);
+	EPAIR_UNLOCK(epair_dpcpu);
+}
+
+/*
+ * Network interface (`if') related functions.
+ */
+static void
+epair_remove_ifp_from_draining(struct ifnet *ifp)
+{
+	struct epair_dpcpu *epair_dpcpu;
+	struct epair_ifp_drain *elm, *tvar;
+	u_int cpuid;
+
+	CPU_FOREACH(cpuid) {
+		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+		EPAIR_LOCK(epair_dpcpu);
+		STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
+		    ifp_next, tvar) {
+			if (ifp == elm->ifp) {
+				struct epair_softc *sc __unused;
+
+				STAILQ_REMOVE(
+				    &epair_dpcpu->epair_ifp_drain_list, elm,
+				    epair_ifp_drain, ifp_next);
+				/* The cached ifp goes off the list. */
+				sc = ifp->if_softc;
+				EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+				EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+				    ("%s: ifp=%p sc->refcount not >= 1: %d",
+				    __func__, ifp, sc->refcount));
+				free(elm, M_EPAIR);
+			}
+		}
+		EPAIR_UNLOCK(epair_dpcpu);
 	}
+}
 
-	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
-	/*
-	 * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
-	 * but as we bypass all this we have to duplicate
-	 * the logic another time.
-	 */
-	if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
-	if (mflags & (M_BCAST|M_MCAST))
-		if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
-	/* Someone else received the packet. */
-	if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
+static int
+epair_add_ifp_for_draining(struct ifnet *ifp)
+{
+	struct epair_dpcpu *epair_dpcpu;
+	struct epair_softc *sc;
+	struct epair_ifp_drain *elm = NULL;
 
-	if (!atomic_testandset_long(&q->state, BIT_QUEUE_TASK))
-		taskqueue_enqueue(epair_tasks.tq[bucket], &q->tx_task);
+	sc = ifp->if_softc;
+	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
+	EPAIR_LOCK_ASSERT(epair_dpcpu);
+	STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next)
+		if (elm->ifp == ifp)
+			break;
+	/* If the ifp is there already, return success. */
+	if (elm != NULL)
+		return (0);
+
+	elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO);
+	if (elm == NULL)
+		return (ENOMEM);
+
+	elm->ifp = ifp;
+	/* Add a reference for the ifp pointer on the list. */
+	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
+	STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next);
 
 	return (0);
 }
 
 static void
-epair_start(struct ifnet *ifp)
+epair_start_locked(struct ifnet *ifp)
 {
+	struct epair_dpcpu *epair_dpcpu;
 	struct mbuf *m;
 	struct epair_softc *sc;
 	struct ifnet *oifp;
+	int error;
+
+	DPRINTF("ifp=%p\n", ifp);
+	sc = ifp->if_softc;
+	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
+	EPAIR_LOCK_ASSERT(epair_dpcpu);
+
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+		return;
+	if ((ifp->if_flags & IFF_UP) == 0)
+		return;
 
 	/*
 	 * We get packets here from ether_output via if_handoff()
 	 * and need to put them into the input queue of the oifp
-	 * and will put the packet into the receive-queue (rxq) of the
-	 * other interface (oifp) of our pair.
+	 * and call oifp->if_input() via netisr/epair_nh_sintr().
 	 */
-	sc = ifp->if_softc;
 	oifp = sc->oifp;
 	sc = oifp->if_softc;
 	for (;;) {
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL)
 			break;
-		M_ASSERTPKTHDR(m);
 		BPF_MTAP(ifp, m);
 
-		/* In case either interface is not usable drop the packet. */
-		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
-		    (ifp->if_flags & IFF_UP) == 0 ||
-		    (oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
-		    (oifp->if_flags & IFF_UP) == 0) {
+		/*
+		 * In case the outgoing interface is not usable,
+		 * drop the packet.
+		 */
+		if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
+		    (oifp->if_flags & IFF_UP) ==0) {
+			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			continue;
 		}
-
-		(void) epair_menq(m, sc);
+		DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
+
+		epair_clear_mbuf(m);
+
+		/*
+		 * Add a reference so the interface cannot go while the
+		 * packet is in transit as we rely on rcvif to stay valid.
+		 */
+		EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
+		m->m_pkthdr.rcvif = oifp;
+		CURVNET_SET_QUIET(oifp->if_vnet);
+		error = netisr_queue(NETISR_EPAIR, m);
+		CURVNET_RESTORE();
+		if (!error) {
+			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+			/* Someone else received the packet. */
+			if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
+		} else {
+			/* The packet was freed already. */
+			epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
+			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+			(void) epair_add_ifp_for_draining(ifp);
+			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+			EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+			EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+			    ("%s: ifp=%p sc->refcount not >= 1: %d",
+			    __func__, oifp, sc->refcount));
+		}
 	}
 }
 
+static void
+epair_start(struct ifnet *ifp)
+{
+	struct epair_dpcpu *epair_dpcpu;
+
+	epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
+	EPAIR_LOCK(epair_dpcpu);
+	epair_start_locked(ifp);
+	EPAIR_UNLOCK(epair_dpcpu);
+}
+
 static int
-epair_transmit(struct ifnet *ifp, struct mbuf *m)
+epair_transmit_locked(struct ifnet *ifp, struct mbuf *m)
 {
+	struct epair_dpcpu *epair_dpcpu;
 	struct epair_softc *sc;
 	struct ifnet *oifp;
 	int error, len;
 	short mflags;
 
+	DPRINTF("ifp=%p m=%p\n", ifp, m);
+	sc = ifp->if_softc;
+	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
+	EPAIR_LOCK_ASSERT(epair_dpcpu);
+
 	if (m == NULL)
 		return (0);
-
-	M_ASSERTPKTHDR(m);
-
+	
 	/*
 	 * We are not going to use the interface en/dequeue mechanism
 	 * on the TX side. We are called from ether_output_frame()
-	 * and will put the packet into the receive-queue (rxq) of the
-	 * other interface (oifp) of our pair.
+	 * and will put the packet into the incoming queue of the
+	 * other interface of our pair via the netisr.
 	 */
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		m_freem(m);
-		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENXIO);
 	}
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		m_freem(m);
-		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENETDOWN);
 	}
 
@@ -344,16 +517,16 @@ epair_transmit(struct ifnet *ifp, struct mbuf *m)
 	 * In case the outgoing interface is not usable,
 	 * drop the packet.
 	 */
-	sc = ifp->if_softc;
 	oifp = sc->oifp;
 	if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
-	    (oifp->if_flags & IFF_UP) == 0) {
+	    (oifp->if_flags & IFF_UP) ==0) {
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		return (0);
 	}
 	len = m->m_pkthdr.len;
 	mflags = m->m_flags;
+	DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
 
 #ifdef ALTQ
 	/* Support ALTQ via the classic if_start() path. */
@@ -367,17 +540,99 @@ epair_transmit(struct ifnet *ifp, struct mbuf *m)
 			if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
 			if (mflags & (M_BCAST|M_MCAST))
 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
-			epair_start(ifp);
+			
+			if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0)
+				epair_start_locked(ifp);
+			else
+				(void)epair_add_ifp_for_draining(ifp);
 		}
 		return (error);
 	}
 	IF_UNLOCK(&ifp->if_snd);
 #endif
 
-	error = epair_menq(m, oifp->if_softc);
+	if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) {
+		/*
+		 * Our hardware queue is full, try to fall back
+		 * queuing to the ifq but do not call ifp->if_start.
+		 * Either we are lucky or the packet is gone.
+		 */
+		IFQ_ENQUEUE(&ifp->if_snd, m, error);
+		if (!error)
+			(void)epair_add_ifp_for_draining(ifp);
+		return (error);
+	}
+
+	epair_clear_mbuf(m);
+
+	sc = oifp->if_softc;
+	/*
+	 * Add a reference so the interface cannot go while the
+	 * packet is in transit as we rely on rcvif to stay valid.
+	 */
+	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
+	m->m_pkthdr.rcvif = oifp;
+	CURVNET_SET_QUIET(oifp->if_vnet);
+	error = netisr_queue(NETISR_EPAIR, m);
+	CURVNET_RESTORE();
+	if (!error) {
+		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+		/*
+		 * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
+		 * but as we bypass all this we have to duplicate
+		 * the logic another time.
+		 */
+		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
+		if (mflags & (M_BCAST|M_MCAST))
+			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
+		/* Someone else received the packet. */
+		if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
+	} else {
+		/* The packet was freed already. */
+		epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
+		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+		EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+		EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+		    ("%s: ifp=%p sc->refcount not >= 1: %d",
+		    __func__, oifp, sc->refcount));
+	}
+
 	return (error);
 }
 
+static int
+epair_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+	struct epair_dpcpu *epair_dpcpu;
+	int error;
+
+	epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
+	EPAIR_LOCK(epair_dpcpu);
+	error = epair_transmit_locked(ifp, m);
+	EPAIR_UNLOCK(epair_dpcpu);
+	return (error);
+}
+
+static void
+epair_qflush(struct ifnet *ifp)
+{
+	struct epair_softc *sc;
+	
+	sc = ifp->if_softc;
+	KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n",
+	    __func__, ifp, sc));
+	/*
+	 * Remove this ifp from all backpointer lists. The interface will not
+	 * be usable for flushing anyway nor should it have anything to flush
+	 * after if_qflush().
+	 */
+	epair_remove_ifp_from_draining(ifp);
+
+	if (sc->if_qflush)
+		sc->if_qflush(ifp);
+}
+
 static int
 epair_media_change(struct ifnet *ifp __unused)
 {
@@ -446,6 +701,8 @@ epair_clone_match(struct if_clone *ifc, const char *name)
 {
 	const char *cp;
 
+	DPRINTF("name='%s'\n", name);
+
 	/*
 	 * Our base name is epair.
 	 * Our interfaces will be named epair<n>[ab].
@@ -534,29 +791,17 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 
 	/* Allocate memory for both [ab] interfaces */
 	sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
+	EPAIR_REFCOUNT_INIT(&sca->refcount, 1);
 	sca->ifp = if_alloc(IFT_ETHER);
-	sca->num_queues = epair_tasks.tasks;
 	if (sca->ifp == NULL) {
 		free(sca, M_EPAIR);
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
-	sca->queues = mallocarray(sca->num_queues, sizeof(struct epair_queue),
-	    M_EPAIR, M_WAITOK);
-	for (int i = 0; i < sca->num_queues; i++) {
-		struct epair_queue *q = &sca->queues[i];
-		q->id = i;
-		q->rxring[0] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
-		q->rxring[1] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
-		q->ridx = 0;
-		q->state = 0;
-		q->sc = sca;
-		TASK_INIT(&q->tx_task, 0, epair_tx_start_deferred, q);
-	}
 
 	scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
+	EPAIR_REFCOUNT_INIT(&scb->refcount, 1);
 	scb->ifp = if_alloc(IFT_ETHER);
-	scb->num_queues = epair_tasks.tasks;
 	if (scb->ifp == NULL) {
 		free(scb, M_EPAIR);
 		if_free(sca->ifp);
@@ -564,33 +809,23 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 		ifc_free_unit(ifc, unit);
 		return (ENOSPC);
 	}
-	scb->queues = mallocarray(scb->num_queues, sizeof(struct epair_queue),
-	    M_EPAIR, M_WAITOK);
-	for (int i = 0; i < scb->num_queues; i++) {
-		struct epair_queue *q = &scb->queues[i];
-		q->id = i;
-		q->rxring[0] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
-		q->rxring[1] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
-		q->ridx = 0;
-		q->state = 0;
-		q->sc = scb;
-		TASK_INIT(&q->tx_task, 0, epair_tx_start_deferred, q);
-	}
-
+	
 	/*
 	 * Cross-reference the interfaces so we will be able to free both.
 	 */
 	sca->oifp = scb->ifp;
 	scb->oifp = sca->ifp;
 
-	EPAIR_LOCK();
-#ifdef SMP
-	/* Get an approximate distribution. */
-	hash = next_index % mp_ncpus;
-#else
-	hash = 0;
-#endif
-	EPAIR_UNLOCK();
+	/*
+	 * Calculate the cpuid for netisr queueing based on the
+	 * ifIndex of the interfaces. As long as we cannot configure
+	 * this or use cpuset information easily we cannot guarantee
+	 * cache locality but we can at least allow parallelism.
+	 */
+	sca->cpuid =
+	    netisr_get_cpuid(sca->ifp->if_index);
+	scb->cpuid =
+	    netisr_get_cpuid(scb->ifp->if_index);
 
 	/* Initialise pseudo media types. */
 	ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status);
@@ -627,14 +862,12 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 	if (hostid == 0) 
 		arc4rand(&hostid, sizeof(hostid), 0);
 
-	EPAIR_LOCK();
 	if (ifp->if_index > next_index)
 		next_index = ifp->if_index;
 	else
 		next_index++;
 
 	key[0] = (uint32_t)next_index;
-	EPAIR_UNLOCK();
 	key[1] = (uint32_t)(hostid & 0xffffffff);
 	key[2] = (uint32_t)((hostid >> 32) & 0xfffffffff);
 	hash = jenkins_hash32(key, 3, 0);
@@ -643,8 +876,10 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 	memcpy(&eaddr[1], &hash, 4);
 	eaddr[5] = 0x0a;
 	ether_ifattach(ifp, eaddr);
-	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
+	sca->if_qflush = ifp->if_qflush;
+	ifp->if_qflush = epair_qflush;
 	ifp->if_transmit = epair_transmit;
+	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
 
 	/* Swap the name and finish initialization of interface <n>b. */
 	*dp = 'b';
@@ -669,43 +904,27 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 	strlcpy(name, scb->ifp->if_xname, len);
 	epair_clone_add(ifc, scb);
 
-	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
+	scb->if_qflush = ifp->if_qflush;
+	ifp->if_qflush = epair_qflush;
 	ifp->if_transmit = epair_transmit;
+	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
 
 	/*
 	 * Restore name to <n>a as the ifp for this will go into the
 	 * cloner list for the initial call.
 	 */
 	strlcpy(name, sca->ifp->if_xname, len);
+	DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb);
 
 	/* Tell the world, that we are ready to rock. */
 	sca->ifp->if_drv_flags |= IFF_DRV_RUNNING;
-	if_link_state_change(sca->ifp, LINK_STATE_UP);
 	scb->ifp->if_drv_flags |= IFF_DRV_RUNNING;
+	if_link_state_change(sca->ifp, LINK_STATE_UP);
 	if_link_state_change(scb->ifp, LINK_STATE_UP);
 
 	return (0);
 }
 
-static void
-epair_drain_rings(struct epair_softc *sc)
-{
-	int ridx;
-	struct mbuf *m;
-
-	for (ridx = 0; ridx < 2; ridx++) {
-		for (int i = 0; i < sc->num_queues; i++) {
-			struct epair_queue *q = &sc->queues[i];
-			do {
-				m = buf_ring_dequeue_sc(q->rxring[ridx]);
-				if (m == NULL)
-					break;
-				m_freem(m);
-			} while (1);
-		}
-	}
-}
-
 static int
 epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 {
@@ -713,6 +932,8 @@ epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
*** 198 LINES SKIPPED ***