git: 56dc95b249dc - stable/12 - Revert "if_epair: rework"
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 24 Mar 2022 13:14:36 UTC
The branch stable/12 has been updated by kp:
URL: https://cgit.FreeBSD.org/src/commit/?id=56dc95b249dceb30367a77dccd0231cbb08dc1f7
commit 56dc95b249dceb30367a77dccd0231cbb08dc1f7
Author: Kristof Provost <kp@FreeBSD.org>
AuthorDate: 2022-03-21 14:41:32 +0000
Commit: Kristof Provost <kp@FreeBSD.org>
CommitDate: 2022-03-24 09:44:40 +0000
Revert "if_epair: rework"
Revert the recent performance rework of if_epair. It relies on functions like
atomic_testandclear_long() which are not available on all platforms in
stable/12.
This reverts commits b1a3f8dccb6203036b7ee81201fd5b5a8de36f0d,
fb3644ab2afe777fdd2539bc996a390443f052f1,
ca7af63e88f8cc96865d45e020a57b3062631388,
092da35a0d80af7a3e5c5c22cbeddb6cffbd9524,
and 7c2b681b33fc78ed06c7e9e65eeebb2ab5420586.
This is a direct commit to stable/12.
---
sys/modules/if_epair/Makefile | 2 +-
sys/net/if_epair.c | 832 ++++++++++++++++++++++++++----------------
2 files changed, 509 insertions(+), 325 deletions(-)
diff --git a/sys/modules/if_epair/Makefile b/sys/modules/if_epair/Makefile
index 8b063623f2e8..3e102413bfe2 100644
--- a/sys/modules/if_epair/Makefile
+++ b/sys/modules/if_epair/Makefile
@@ -3,6 +3,6 @@
.PATH: ${SRCTOP}/sys/net
KMOD= if_epair
-SRCS= bus_if.h device_if.h if_epair.c opt_rss.h opt_inet.h opt_inet6.h
+SRCS= bus_if.h device_if.h if_epair.c
.include <bsd.kmod.mk>
diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c
index 4b01e97c354d..cd11036ad028 100644
--- a/sys/net/if_epair.c
+++ b/sys/net/if_epair.c
@@ -2,8 +2,8 @@
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2008 The FreeBSD Foundation
+ * Copyright (c) 2009-2010 Bjoern A. Zeeb <bz@FreeBSD.org>
* All rights reserved.
- * Copyright (c) 2009-2021 Bjoern A. Zeeb <bz@FreeBSD.org>
*
* This software was developed by CK Software GmbH under sponsorship
* from the FreeBSD Foundation.
@@ -37,14 +37,21 @@
* This is mostly intended to be used to provide connectivity between
* different virtual network stack instances.
*/
+/*
+ * Things to re-think once we have more experience:
+ * - ifp->if_reassign function once we can test with vimage. Depending on
+ * how if_vmove() is going to be improved.
+ * - Real random etheraddrs that are checked to be uniquish; we would need
+ * to re-do them in case we move the interface between network stacks
+ * in a private if_reassign function.
+ * In case we bridge to a real interface/network or between independent
+ * epairs on multiple stacks/machines, we may need this.
+ * For now let the user handle that case.
+ */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include "opt_rss.h"
-#include "opt_inet.h"
-#include "opt_inet6.h"
-
#include <sys/param.h>
#include <sys/hash.h>
#include <sys/jail.h>
@@ -54,16 +61,13 @@ __FBSDID("$FreeBSD$");
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/proc.h>
+#include <sys/refcount.h>
#include <sys/queue.h>
-#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
-#include <sys/taskqueue.h>
+#include <sys/sysctl.h>
#include <sys/types.h>
-#include <sys/buf_ring.h>
-#include <sys/bus.h>
-#include <sys/interrupt.h>
#include <net/bpf.h>
#include <net/ethernet.h>
@@ -74,66 +78,121 @@ __FBSDID("$FreeBSD$");
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/netisr.h>
-#ifdef RSS
-#include <net/rss_config.h>
-#ifdef INET
-#include <netinet/in_rss.h>
-#endif
-#ifdef INET6
-#include <netinet6/in6_rss.h>
-#endif
-#endif
#include <net/vnet.h>
+SYSCTL_DECL(_net_link);
+static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl");
+
+#ifdef EPAIR_DEBUG
+static int epair_debug = 0;
+SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW,
+ &epair_debug, 0, "if_epair(4) debugging.");
+#define DPRINTF(fmt, arg...) \
+ if (epair_debug) \
+ printf("[%s:%d] " fmt, __func__, __LINE__, ##arg)
+#else
+#define DPRINTF(fmt, arg...)
+#endif
+
+static void epair_nh_sintr(struct mbuf *);
+static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *);
+static void epair_nh_drainedcpu(u_int);
+
+static void epair_start_locked(struct ifnet *);
+static int epair_media_change(struct ifnet *);
+static void epair_media_status(struct ifnet *, struct ifmediareq *);
+
static int epair_clone_match(struct if_clone *, const char *);
static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t);
static int epair_clone_destroy(struct if_clone *, struct ifnet *);
static const char epairname[] = "epair";
-#define RXRSIZE 4096 /* Probably overkill by 4-8x. */
+static unsigned int next_index = 0;
-static MALLOC_DEFINE(M_EPAIR, epairname,
- "Pair of virtual cross-over connected Ethernet-like interfaces");
+/* Netisr related definitions and sysctl. */
+static struct netisr_handler epair_nh = {
+ .nh_name = epairname,
+ .nh_proto = NETISR_EPAIR,
+ .nh_policy = NETISR_POLICY_CPU,
+ .nh_handler = epair_nh_sintr,
+ .nh_m2cpuid = epair_nh_m2cpuid,
+ .nh_drainedcpu = epair_nh_drainedcpu,
+};
-VNET_DEFINE_STATIC(struct if_clone *, epair_cloner);
-#define V_epair_cloner VNET(epair_cloner)
+static int
+sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
+{
+ int error, qlimit;
-static unsigned int next_index = 0;
-#define EPAIR_LOCK_INIT() mtx_init(&epair_n_index_mtx, "epairidx", \
- NULL, MTX_DEF)
-#define EPAIR_LOCK_DESTROY() mtx_destroy(&epair_n_index_mtx)
-#define EPAIR_LOCK() mtx_lock(&epair_n_index_mtx)
-#define EPAIR_UNLOCK() mtx_unlock(&epair_n_index_mtx)
-
-#define BIT_QUEUE_TASK 0
-#define BIT_MBUF_QUEUED 1
-
-struct epair_softc;
-struct epair_queue {
- int id;
- struct buf_ring *rxring[2];
- volatile int ridx; /* 0 || 1 */
- volatile long state; /* taskqueue coordination */
- struct task tx_task;
- struct epair_softc *sc;
-};
+ netisr_getqlimit(&epair_nh, &qlimit);
+ error = sysctl_handle_int(oidp, &qlimit, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (qlimit < 1)
+ return (EINVAL);
+ return (netisr_setqlimit(&epair_nh, qlimit));
+}
+SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
+ 0, 0, sysctl_epair_netisr_maxqlen, "I",
+ "Maximum if_epair(4) netisr \"hw\" queue length");
-static struct mtx epair_n_index_mtx;
struct epair_softc {
- struct ifnet *ifp; /* This ifp. */
- struct ifnet *oifp; /* other ifp of pair. */
- int num_queues;
- struct epair_queue *queues;
- struct ifmedia media; /* Media config (fake). */
- STAILQ_ENTRY(epair_softc) entry;
+ struct ifnet *ifp; /* This ifp. */
+ struct ifnet *oifp; /* other ifp of pair. */
+ struct ifmedia media; /* Media config (fake). */
+ u_int refcount; /* # of mbufs in flight. */
+ u_int cpuid; /* CPU ID assigned upon creation. */
+ void (*if_qflush)(struct ifnet *);
+ /* Original if_qflush routine. */
};
-struct epair_tasks_t {
- int tasks;
- struct taskqueue *tq[MAXCPU];
+/*
+ * Per-CPU list of ifps with data in the ifq that needs to be flushed
+ * to the netisr ``hw'' queue before we allow any further direct queuing
+ * to the ``hw'' queue.
+ */
+struct epair_ifp_drain {
+ STAILQ_ENTRY(epair_ifp_drain) ifp_next;
+ struct ifnet *ifp;
};
+STAILQ_HEAD(eid_list, epair_ifp_drain);
+
+#define EPAIR_LOCK_INIT(dpcpu) mtx_init(&(dpcpu)->if_epair_mtx, \
+ "if_epair", NULL, MTX_DEF)
+#define EPAIR_LOCK_DESTROY(dpcpu) mtx_destroy(&(dpcpu)->if_epair_mtx)
+#define EPAIR_LOCK_ASSERT(dpcpu) mtx_assert(&(dpcpu)->if_epair_mtx, \
+ MA_OWNED)
+#define EPAIR_LOCK(dpcpu) mtx_lock(&(dpcpu)->if_epair_mtx)
+#define EPAIR_UNLOCK(dpcpu) mtx_unlock(&(dpcpu)->if_epair_mtx)
+
+#ifdef INVARIANTS
+#define EPAIR_REFCOUNT_INIT(r, v) refcount_init((r), (v))
+#define EPAIR_REFCOUNT_AQUIRE(r) refcount_acquire((r))
+#define EPAIR_REFCOUNT_RELEASE(r) refcount_release((r))
+#define EPAIR_REFCOUNT_ASSERT(a, p) KASSERT(a, p)
+#else
+#define EPAIR_REFCOUNT_INIT(r, v)
+#define EPAIR_REFCOUNT_AQUIRE(r)
+#define EPAIR_REFCOUNT_RELEASE(r)
+#define EPAIR_REFCOUNT_ASSERT(a, p)
+#endif
-static struct epair_tasks_t epair_tasks;
+static MALLOC_DEFINE(M_EPAIR, epairname,
+ "Pair of virtual cross-over connected Ethernet-like interfaces");
+
+VNET_DEFINE_STATIC(struct if_clone *, epair_cloner);
+#define V_epair_cloner VNET(epair_cloner)
+
+/*
+ * DPCPU area and functions.
+ */
+struct epair_dpcpu {
+ struct mtx if_epair_mtx; /* Per-CPU locking. */
+ int epair_drv_flags; /* Per-CPU ``hw'' drv flags. */
+ struct eid_list epair_ifp_drain_list; /* Per-CPU list of ifps with
+ * data in the ifq. */
+};
+DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu);
static void
epair_clear_mbuf(struct mbuf *m)
@@ -142,199 +201,313 @@ epair_clear_mbuf(struct mbuf *m)
}
static void
-epair_if_input(struct epair_softc *sc, struct epair_queue *q, int ridx)
+epair_dpcpu_init(void)
{
- struct epoch_tracker et;
- struct ifnet *ifp;
- struct mbuf *m;
+ struct epair_dpcpu *epair_dpcpu;
+ struct eid_list *s;
+ u_int cpuid;
+
+ CPU_FOREACH(cpuid) {
+ epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+
+ /* Initialize per-cpu lock. */
+ EPAIR_LOCK_INIT(epair_dpcpu);
+
+ /* Driver flags are per-cpu as are our netisr "hw" queues. */
+ epair_dpcpu->epair_drv_flags = 0;
+
+ /*
+ * Initialize per-cpu drain list.
+ * Manually do what STAILQ_HEAD_INITIALIZER would do.
+ */
+ s = &epair_dpcpu->epair_ifp_drain_list;
+ s->stqh_first = NULL;
+ s->stqh_last = &s->stqh_first;
+ }
+}
- ifp = sc->ifp;
- NET_EPOCH_ENTER_ET(et);
- CURVNET_SET(ifp->if_vnet);
- while (! buf_ring_empty(q->rxring[ridx])) {
- m = buf_ring_dequeue_mc(q->rxring[ridx]);
- if (m == NULL)
- continue;
+static void
+epair_dpcpu_detach(void)
+{
+ struct epair_dpcpu *epair_dpcpu;
+ u_int cpuid;
- MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
- (*ifp->if_input)(ifp, m);
+ CPU_FOREACH(cpuid) {
+ epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+
+ /* Destroy per-cpu lock. */
+ EPAIR_LOCK_DESTROY(epair_dpcpu);
}
- CURVNET_RESTORE();
- NET_EPOCH_EXIT_ET(et);
}
-static void
-epair_tx_start_deferred(void *arg, int pending)
+/*
+ * Helper functions.
+ */
+static u_int
+cpuid_from_ifp(struct ifnet *ifp)
{
- struct epair_queue *q = (struct epair_queue *)arg;
- struct epair_softc *sc = q->sc;
- int ridx, nidx;
-
- if_ref(sc->ifp);
- ridx = atomic_load_int(&q->ridx);
- do {
- nidx = (ridx == 0) ? 1 : 0;
- } while (!atomic_fcmpset_int(&q->ridx, &ridx, nidx));
- epair_if_input(sc, q, ridx);
-
- atomic_clear_long(&q->state, (1 << BIT_QUEUE_TASK));
- if (atomic_testandclear_long(&q->state, BIT_MBUF_QUEUED))
- taskqueue_enqueue(epair_tasks.tq[q->id], &q->tx_task);
-
- if_rele(sc->ifp);
+ struct epair_softc *sc;
+
+ if (ifp == NULL)
+ return (0);
+ sc = ifp->if_softc;
+
+ return (sc->cpuid);
}
-static int
-epair_menq(struct mbuf *m, struct epair_softc *osc)
+/*
+ * Netisr handler functions.
+ */
+static void
+epair_nh_sintr(struct mbuf *m)
{
- struct ifnet *ifp, *oifp;
- int len, ret;
- int ridx;
- short mflags;
- struct epair_queue *q = NULL;
- uint32_t bucket;
-#ifdef RSS
- struct ether_header *eh;
-#endif
+ struct ifnet *ifp;
+ struct epair_softc *sc __unused;
- /*
- * I know this looks weird. We pass the "other sc" as we need that one
- * and can get both ifps from it as well.
- */
- oifp = osc->ifp;
- ifp = osc->oifp;
+ ifp = m->m_pkthdr.rcvif;
+ (*ifp->if_input)(ifp, m);
+ sc = ifp->if_softc;
+ EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+ EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+ ("%s: ifp=%p sc->refcount not >= 1: %d",
+ __func__, ifp, sc->refcount));
+ DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount);
+}
- M_ASSERTPKTHDR(m);
- epair_clear_mbuf(m);
- if_setrcvif(m, oifp);
- M_SETFIB(m, oifp->if_fib);
+static struct mbuf *
+epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
+{
- /* Save values as once the mbuf is queued, it's not ours anymore. */
- len = m->m_pkthdr.len;
- mflags = m->m_flags;
+ *cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif);
- MPASS(m->m_nextpkt == NULL);
- MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+ return (m);
+}
-#ifdef RSS
- ret = rss_m2bucket(m, &bucket);
- if (ret) {
- /* Actually hash the packet. */
- eh = mtod(m, struct ether_header *);
+static void
+epair_nh_drainedcpu(u_int cpuid)
+{
+ struct epair_dpcpu *epair_dpcpu;
+ struct epair_ifp_drain *elm, *tvar;
+ struct ifnet *ifp;
- switch (ntohs(eh->ether_type)) {
-#ifdef INET
- case ETHERTYPE_IP:
- rss_soft_m2cpuid_v4(m, 0, &bucket);
- break;
-#endif
-#ifdef INET6
- case ETHERTYPE_IPV6:
- rss_soft_m2cpuid_v6(m, 0, &bucket);
- break;
-#endif
- default:
- bucket = 0;
+ epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+ EPAIR_LOCK(epair_dpcpu);
+ /*
+ * Assume our "hw" queue and possibly ifq will be emptied
+ * again. In case we will overflow the "hw" queue while
+ * draining, epair_start_locked will set IFF_DRV_OACTIVE
+ * again and we will stop and return.
+ */
+ STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
+ ifp_next, tvar) {
+ ifp = elm->ifp;
+ epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE;
+ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ epair_start_locked(ifp);
+
+ IFQ_LOCK(&ifp->if_snd);
+ if (IFQ_IS_EMPTY(&ifp->if_snd)) {
+ struct epair_softc *sc __unused;
+
+ STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list,
+ elm, epair_ifp_drain, ifp_next);
+ /* The cached ifp goes off the list. */
+ sc = ifp->if_softc;
+ EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+ EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+ ("%s: ifp=%p sc->refcount not >= 1: %d",
+ __func__, ifp, sc->refcount));
+ free(elm, M_EPAIR);
+ }
+ IFQ_UNLOCK(&ifp->if_snd);
+
+ if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) {
+ /* Our "hw"q overflowed again. */
+ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
+ DPRINTF("hw queue length overflow at %u\n",
+ epair_nh.nh_qlimit);
break;
}
}
- bucket %= osc->num_queues;
-#else
- bucket = 0;
-#endif
- q = &osc->queues[bucket];
-
- atomic_set_long(&q->state, (1 << BIT_MBUF_QUEUED));
- ridx = atomic_load_int(&q->ridx);
- ret = buf_ring_enqueue(q->rxring[ridx], m);
- if (ret != 0) {
- /* Ring is full. */
- if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
- m_freem(m);
- return (0);
+ EPAIR_UNLOCK(epair_dpcpu);
+}
+
+/*
+ * Network interface (`if') related functions.
+ */
+static void
+epair_remove_ifp_from_draining(struct ifnet *ifp)
+{
+ struct epair_dpcpu *epair_dpcpu;
+ struct epair_ifp_drain *elm, *tvar;
+ u_int cpuid;
+
+ CPU_FOREACH(cpuid) {
+ epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
+ EPAIR_LOCK(epair_dpcpu);
+ STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
+ ifp_next, tvar) {
+ if (ifp == elm->ifp) {
+ struct epair_softc *sc __unused;
+
+ STAILQ_REMOVE(
+ &epair_dpcpu->epair_ifp_drain_list, elm,
+ epair_ifp_drain, ifp_next);
+ /* The cached ifp goes off the list. */
+ sc = ifp->if_softc;
+ EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+ EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+ ("%s: ifp=%p sc->refcount not >= 1: %d",
+ __func__, ifp, sc->refcount));
+ free(elm, M_EPAIR);
+ }
+ }
+ EPAIR_UNLOCK(epair_dpcpu);
}
+}
- if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
- /*
- * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
- * but as we bypass all this we have to duplicate
- * the logic another time.
- */
- if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
- if (mflags & (M_BCAST|M_MCAST))
- if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
- /* Someone else received the packet. */
- if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
+static int
+epair_add_ifp_for_draining(struct ifnet *ifp)
+{
+ struct epair_dpcpu *epair_dpcpu;
+ struct epair_softc *sc;
+ struct epair_ifp_drain *elm = NULL;
- if (!atomic_testandset_long(&q->state, BIT_QUEUE_TASK))
- taskqueue_enqueue(epair_tasks.tq[bucket], &q->tx_task);
+ sc = ifp->if_softc;
+ epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
+ EPAIR_LOCK_ASSERT(epair_dpcpu);
+ STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next)
+ if (elm->ifp == ifp)
+ break;
+ /* If the ifp is there already, return success. */
+ if (elm != NULL)
+ return (0);
+
+ elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO);
+ if (elm == NULL)
+ return (ENOMEM);
+
+ elm->ifp = ifp;
+ /* Add a reference for the ifp pointer on the list. */
+ EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
+ STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next);
return (0);
}
static void
-epair_start(struct ifnet *ifp)
+epair_start_locked(struct ifnet *ifp)
{
+ struct epair_dpcpu *epair_dpcpu;
struct mbuf *m;
struct epair_softc *sc;
struct ifnet *oifp;
+ int error;
+
+ DPRINTF("ifp=%p\n", ifp);
+ sc = ifp->if_softc;
+ epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
+ EPAIR_LOCK_ASSERT(epair_dpcpu);
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ return;
+ if ((ifp->if_flags & IFF_UP) == 0)
+ return;
/*
* We get packets here from ether_output via if_handoff()
* and need to put them into the input queue of the oifp
- * and will put the packet into the receive-queue (rxq) of the
- * other interface (oifp) of our pair.
+ * and call oifp->if_input() via netisr/epair_nh_sintr().
*/
- sc = ifp->if_softc;
oifp = sc->oifp;
sc = oifp->if_softc;
for (;;) {
IFQ_DEQUEUE(&ifp->if_snd, m);
if (m == NULL)
break;
- M_ASSERTPKTHDR(m);
BPF_MTAP(ifp, m);
- /* In case either interface is not usable drop the packet. */
- if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
- (ifp->if_flags & IFF_UP) == 0 ||
- (oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
- (oifp->if_flags & IFF_UP) == 0) {
+ /*
+ * In case the outgoing interface is not usable,
+ * drop the packet.
+ */
+ if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
+ (oifp->if_flags & IFF_UP) ==0) {
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
m_freem(m);
continue;
}
-
- (void) epair_menq(m, sc);
+ DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
+
+ epair_clear_mbuf(m);
+
+ /*
+ * Add a reference so the interface cannot go while the
+ * packet is in transit as we rely on rcvif to stay valid.
+ */
+ EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
+ m->m_pkthdr.rcvif = oifp;
+ CURVNET_SET_QUIET(oifp->if_vnet);
+ error = netisr_queue(NETISR_EPAIR, m);
+ CURVNET_RESTORE();
+ if (!error) {
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ /* Someone else received the packet. */
+ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
+ } else {
+ /* The packet was freed already. */
+ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ (void) epair_add_ifp_for_draining(ifp);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+ EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+ ("%s: ifp=%p sc->refcount not >= 1: %d",
+ __func__, oifp, sc->refcount));
+ }
}
}
+static void
+epair_start(struct ifnet *ifp)
+{
+ struct epair_dpcpu *epair_dpcpu;
+
+ epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
+ EPAIR_LOCK(epair_dpcpu);
+ epair_start_locked(ifp);
+ EPAIR_UNLOCK(epair_dpcpu);
+}
+
static int
-epair_transmit(struct ifnet *ifp, struct mbuf *m)
+epair_transmit_locked(struct ifnet *ifp, struct mbuf *m)
{
+ struct epair_dpcpu *epair_dpcpu;
struct epair_softc *sc;
struct ifnet *oifp;
int error, len;
short mflags;
+ DPRINTF("ifp=%p m=%p\n", ifp, m);
+ sc = ifp->if_softc;
+ epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
+ EPAIR_LOCK_ASSERT(epair_dpcpu);
+
if (m == NULL)
return (0);
-
- M_ASSERTPKTHDR(m);
-
+
/*
* We are not going to use the interface en/dequeue mechanism
* on the TX side. We are called from ether_output_frame()
- * and will put the packet into the receive-queue (rxq) of the
- * other interface (oifp) of our pair.
+ * and will put the packet into the incoming queue of the
+ * other interface of our pair via the netisr.
*/
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
m_freem(m);
- if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return (ENXIO);
}
if ((ifp->if_flags & IFF_UP) == 0) {
m_freem(m);
- if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
return (ENETDOWN);
}
@@ -344,16 +517,16 @@ epair_transmit(struct ifnet *ifp, struct mbuf *m)
* In case the outgoing interface is not usable,
* drop the packet.
*/
- sc = ifp->if_softc;
oifp = sc->oifp;
if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
- (oifp->if_flags & IFF_UP) == 0) {
+ (oifp->if_flags & IFF_UP) ==0) {
if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
m_freem(m);
return (0);
}
len = m->m_pkthdr.len;
mflags = m->m_flags;
+ DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
#ifdef ALTQ
/* Support ALTQ via the classic if_start() path. */
@@ -367,17 +540,99 @@ epair_transmit(struct ifnet *ifp, struct mbuf *m)
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
if (mflags & (M_BCAST|M_MCAST))
if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
- epair_start(ifp);
+
+ if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0)
+ epair_start_locked(ifp);
+ else
+ (void)epair_add_ifp_for_draining(ifp);
}
return (error);
}
IF_UNLOCK(&ifp->if_snd);
#endif
- error = epair_menq(m, oifp->if_softc);
+ if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) {
+ /*
+ * Our hardware queue is full, try to fall back
+ * queuing to the ifq but do not call ifp->if_start.
+ * Either we are lucky or the packet is gone.
+ */
+ IFQ_ENQUEUE(&ifp->if_snd, m, error);
+ if (!error)
+ (void)epair_add_ifp_for_draining(ifp);
+ return (error);
+ }
+
+ epair_clear_mbuf(m);
+
+ sc = oifp->if_softc;
+ /*
+ * Add a reference so the interface cannot go while the
+ * packet is in transit as we rely on rcvif to stay valid.
+ */
+ EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
+ m->m_pkthdr.rcvif = oifp;
+ CURVNET_SET_QUIET(oifp->if_vnet);
+ error = netisr_queue(NETISR_EPAIR, m);
+ CURVNET_RESTORE();
+ if (!error) {
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ /*
+ * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
+ * but as we bypass all this we have to duplicate
+ * the logic another time.
+ */
+ if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
+ if (mflags & (M_BCAST|M_MCAST))
+ if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
+ /* Someone else received the packet. */
+ if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
+ } else {
+ /* The packet was freed already. */
+ epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ EPAIR_REFCOUNT_RELEASE(&sc->refcount);
+ EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
+ ("%s: ifp=%p sc->refcount not >= 1: %d",
+ __func__, oifp, sc->refcount));
+ }
+
return (error);
}
+static int
+epair_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+ struct epair_dpcpu *epair_dpcpu;
+ int error;
+
+ epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
+ EPAIR_LOCK(epair_dpcpu);
+ error = epair_transmit_locked(ifp, m);
+ EPAIR_UNLOCK(epair_dpcpu);
+ return (error);
+}
+
+static void
+epair_qflush(struct ifnet *ifp)
+{
+ struct epair_softc *sc;
+
+ sc = ifp->if_softc;
+ KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n",
+ __func__, ifp, sc));
+ /*
+ * Remove this ifp from all backpointer lists. The interface will not
+ * be usable for flushing anyway nor should it have anything to flush
+ * after if_qflush().
+ */
+ epair_remove_ifp_from_draining(ifp);
+
+ if (sc->if_qflush)
+ sc->if_qflush(ifp);
+}
+
static int
epair_media_change(struct ifnet *ifp __unused)
{
@@ -446,6 +701,8 @@ epair_clone_match(struct if_clone *ifc, const char *name)
{
const char *cp;
+ DPRINTF("name='%s'\n", name);
+
/*
* Our base name is epair.
* Our interfaces will be named epair<n>[ab].
@@ -534,29 +791,17 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
/* Allocate memory for both [ab] interfaces */
sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
+ EPAIR_REFCOUNT_INIT(&sca->refcount, 1);
sca->ifp = if_alloc(IFT_ETHER);
- sca->num_queues = epair_tasks.tasks;
if (sca->ifp == NULL) {
free(sca, M_EPAIR);
ifc_free_unit(ifc, unit);
return (ENOSPC);
}
- sca->queues = mallocarray(sca->num_queues, sizeof(struct epair_queue),
- M_EPAIR, M_WAITOK);
- for (int i = 0; i < sca->num_queues; i++) {
- struct epair_queue *q = &sca->queues[i];
- q->id = i;
- q->rxring[0] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
- q->rxring[1] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
- q->ridx = 0;
- q->state = 0;
- q->sc = sca;
- TASK_INIT(&q->tx_task, 0, epair_tx_start_deferred, q);
- }
scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
+ EPAIR_REFCOUNT_INIT(&scb->refcount, 1);
scb->ifp = if_alloc(IFT_ETHER);
- scb->num_queues = epair_tasks.tasks;
if (scb->ifp == NULL) {
free(scb, M_EPAIR);
if_free(sca->ifp);
@@ -564,33 +809,23 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
ifc_free_unit(ifc, unit);
return (ENOSPC);
}
- scb->queues = mallocarray(scb->num_queues, sizeof(struct epair_queue),
- M_EPAIR, M_WAITOK);
- for (int i = 0; i < scb->num_queues; i++) {
- struct epair_queue *q = &scb->queues[i];
- q->id = i;
- q->rxring[0] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
- q->rxring[1] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
- q->ridx = 0;
- q->state = 0;
- q->sc = scb;
- TASK_INIT(&q->tx_task, 0, epair_tx_start_deferred, q);
- }
-
+
/*
* Cross-reference the interfaces so we will be able to free both.
*/
sca->oifp = scb->ifp;
scb->oifp = sca->ifp;
- EPAIR_LOCK();
-#ifdef SMP
- /* Get an approximate distribution. */
- hash = next_index % mp_ncpus;
-#else
- hash = 0;
-#endif
- EPAIR_UNLOCK();
+ /*
+ * Calculate the cpuid for netisr queueing based on the
+ * ifIndex of the interfaces. As long as we cannot configure
+ * this or use cpuset information easily we cannot guarantee
+ * cache locality but we can at least allow parallelism.
+ */
+ sca->cpuid =
+ netisr_get_cpuid(sca->ifp->if_index);
+ scb->cpuid =
+ netisr_get_cpuid(scb->ifp->if_index);
/* Initialise pseudo media types. */
ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status);
@@ -627,14 +862,12 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
if (hostid == 0)
arc4rand(&hostid, sizeof(hostid), 0);
- EPAIR_LOCK();
if (ifp->if_index > next_index)
next_index = ifp->if_index;
else
next_index++;
key[0] = (uint32_t)next_index;
- EPAIR_UNLOCK();
key[1] = (uint32_t)(hostid & 0xffffffff);
key[2] = (uint32_t)((hostid >> 32) & 0xfffffffff);
hash = jenkins_hash32(key, 3, 0);
@@ -643,8 +876,10 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
memcpy(&eaddr[1], &hash, 4);
eaddr[5] = 0x0a;
ether_ifattach(ifp, eaddr);
- ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */
+ sca->if_qflush = ifp->if_qflush;
+ ifp->if_qflush = epair_qflush;
ifp->if_transmit = epair_transmit;
+ ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */
/* Swap the name and finish initialization of interface <n>b. */
*dp = 'b';
@@ -669,43 +904,27 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
strlcpy(name, scb->ifp->if_xname, len);
epair_clone_add(ifc, scb);
- ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */
+ scb->if_qflush = ifp->if_qflush;
+ ifp->if_qflush = epair_qflush;
ifp->if_transmit = epair_transmit;
+ ifp->if_baudrate = IF_Gbps(10); /* arbitrary maximum */
/*
* Restore name to <n>a as the ifp for this will go into the
* cloner list for the initial call.
*/
strlcpy(name, sca->ifp->if_xname, len);
+ DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb);
/* Tell the world, that we are ready to rock. */
sca->ifp->if_drv_flags |= IFF_DRV_RUNNING;
- if_link_state_change(sca->ifp, LINK_STATE_UP);
scb->ifp->if_drv_flags |= IFF_DRV_RUNNING;
+ if_link_state_change(sca->ifp, LINK_STATE_UP);
if_link_state_change(scb->ifp, LINK_STATE_UP);
return (0);
}
-static void
-epair_drain_rings(struct epair_softc *sc)
-{
- int ridx;
- struct mbuf *m;
-
- for (ridx = 0; ridx < 2; ridx++) {
- for (int i = 0; i < sc->num_queues; i++) {
- struct epair_queue *q = &sc->queues[i];
- do {
- m = buf_ring_dequeue_sc(q->rxring[ridx]);
- if (m == NULL)
- break;
- m_freem(m);
- } while (1);
- }
- }
-}
-
static int
epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
{
@@ -713,6 +932,8 @@ epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
*** 198 LINES SKIPPED ***