git: 84f8ca1bd11d - main - iflib: add a simple transmit routine

From: Andrew Gallatin <gallatin_at_FreeBSD.org>
Date: Wed, 20 Aug 2025 16:51:16 UTC
The branch main has been updated by gallatin:

URL: https://cgit.FreeBSD.org/src/commit/?id=84f8ca1bd11d97d8d254248da7c09507038be505

commit 84f8ca1bd11d97d8d254248da7c09507038be505
Author:     Andrew Gallatin <gallatin@FreeBSD.org>
AuthorDate: 2025-08-20 16:49:32 +0000
Commit:     Andrew Gallatin <gallatin@FreeBSD.org>
CommitDate: 2025-08-20 16:49:32 +0000

    iflib: add a simple transmit routine
    
    While mp_ring can provide amazing scalability in scenarios
    where the number of cores exceeds the number of NIC tx
    rings, it can also lead to greatly reduced performance in simpler,
    high packet rate scenarios due to extra CPU cycles and cache
    misses stemming from its complexity.
    
    This change implements a simple if_transmit routine, selected
    at driver load.  This routine does not queue anything, and uses
    a simple queue selection and ends up being far more cache
    friendly.
    
    In testing on a 400GbE NIC in an AMD 7502P EPYC server, this
    simple tx routine is roughly 2.5 times as fast as mp_ring
    (8Gb/s -> 20Gb/s), and 5x as fast as mp_ring with tx_abdicate=1
    (4Gb/s -> 20Gb/s) for a simple in-kernel packet generator, which
    is currently closed source.  It also shows a 50% speedup for
    a simple netperf -tUDP_STREAM test (5Gb/s -> 8Gb/s).
    
    This change is mostly a noop, as it is not enabled by default.
    The one exception is the change to iflib_encap(), which now
    immediately reclaims completed tx descriptors, and only fails the
    transmit and schedules a later reclaim if iflib_completed_tx_reclaim()
    didn't free enough descriptors.
    
    Reviewed by:    kbowling, sumit.saxena_broadcom.com, vmaffione
    Sponsored by:   Netflix
    Differential Revision:  https://reviews.freebsd.org/D51905
---
 share/man/man4/iflib.4 | 14 +++++++-
 sys/net/iflib.c        | 92 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 93 insertions(+), 13 deletions(-)

diff --git a/share/man/man4/iflib.4 b/share/man/man4/iflib.4
index 0114263e6ca2..2040698f0087 100644
--- a/share/man/man4/iflib.4
+++ b/share/man/man4/iflib.4
@@ -1,4 +1,4 @@
-.Dd September 27, 2018
+.Dd August 20, 2025
 .Dt IFLIB 4
 .Os
 .Sh NAME
@@ -64,6 +64,18 @@ If this is zero or not set, an RX and TX queue pair will be assigned to each
 core.
 When set to a non-zero value, TX queues are assigned to cores following the
 last RX queue.
+.It Va simple_tx
+When set to one, iflib uses a simple transmit routine with no queuing at all.
+By default, iflib uses a highly optimized, lockless, transmit queue called
+mp_ring.
+This performs well when there are more CPU cores than NIC
+queues and prevents lock contention for transmit resources.
+Unfortunately, mp_ring incurs unneeded overheads on workloads where
+resource contention is not a problem (well behaved applications on
+systems where there are as many NIC queues as CPU cores).
+Note that when this is enabled, the tx_abdicate sysctl is no longer
+applicable and is ignored.
+Defaults to zero.
 .El
 .Pp
 These
diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index 2b43f6f19051..98c59e5de988 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -142,6 +142,7 @@ struct iflib_ctx;
 static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
 static void iflib_timer(void *arg);
 static void iflib_tqg_detach(if_ctx_t ctx);
+static int  iflib_simple_transmit(if_t ifp, struct mbuf *m);
 
 typedef struct iflib_filter_info {
 	driver_filter_t *ifi_filter;
@@ -198,6 +199,7 @@ struct iflib_ctx {
 	uint8_t  ifc_sysctl_use_logical_cores;
 	uint16_t ifc_sysctl_extra_msix_vectors;
 	bool     ifc_cpus_are_physical_cores;
+	bool     ifc_sysctl_simple_tx;
 
 	qidx_t ifc_sysctl_ntxds[8];
 	qidx_t ifc_sysctl_nrxds[8];
@@ -725,6 +727,7 @@ static void iflib_free_intr_mem(if_ctx_t ctx);
 #ifndef __NO_STRICT_ALIGNMENT
 static struct mbuf *iflib_fixup_rx(struct mbuf *m);
 #endif
+static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh);
 
 static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
     SLIST_HEAD_INITIALIZER(cpu_offsets);
@@ -2624,8 +2627,10 @@ iflib_stop(if_ctx_t ctx)
 #endif /* DEV_NETMAP */
 		CALLOUT_UNLOCK(txq);
 
-		/* clean any enqueued buffers */
-		iflib_ifmp_purge(txq);
+		if (!ctx->ifc_sysctl_simple_tx) {
+			/* clean any enqueued buffers */
+			iflib_ifmp_purge(txq);
+		}
 		/* Free any existing tx buffers. */
 		for (j = 0; j < txq->ift_size; j++) {
 			iflib_txsd_free(ctx, txq, j);
@@ -3635,13 +3640,16 @@ defrag:
 	 *        cxgb
 	 */
 	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
-		txq->ift_no_desc_avail++;
-		bus_dmamap_unload(buf_tag, map);
-		DBG_COUNTER_INC(encap_txq_avail_fail);
-		DBG_COUNTER_INC(encap_txd_encap_fail);
-		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
-			GROUPTASK_ENQUEUE(&txq->ift_task);
-		return (ENOBUFS);
+		(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+		if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
+			txq->ift_no_desc_avail++;
+			bus_dmamap_unload(buf_tag, map);
+			DBG_COUNTER_INC(encap_txq_avail_fail);
+			DBG_COUNTER_INC(encap_txd_encap_fail);
+			if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
+				GROUPTASK_ENQUEUE(&txq->ift_task);
+			return (ENOBUFS);
+		}
 	}
 	/*
 	 * On Intel cards we can greatly reduce the number of TX interrupts
@@ -4014,6 +4022,12 @@ _task_fn_tx(void *context)
 	    netmap_tx_irq(ifp, txq->ift_id))
 		goto skip_ifmp;
 #endif
+        if (ctx->ifc_sysctl_simple_tx) {
+                mtx_lock(&txq->ift_mtx);
+                (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+                mtx_unlock(&txq->ift_mtx);
+                goto skip_ifmp;
+        }
 #ifdef ALTQ
 	if (if_altq_is_enabled(ifp))
 		iflib_altq_if_start(ifp);
@@ -4027,9 +4041,8 @@ _task_fn_tx(void *context)
 	 */
 	if (abdicate)
 		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
-#ifdef DEV_NETMAP
+
 skip_ifmp:
-#endif
 	if (ctx->ifc_flags & IFC_LEGACY)
 		IFDI_INTR_ENABLE(ctx);
 	else
@@ -5131,7 +5144,14 @@ iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ct
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
-
+	if (ctx->ifc_sysctl_simple_tx) {
+#ifndef ALTQ
+		if_settransmitfn(ifp, iflib_simple_transmit);
+		device_printf(dev, "using simple if_transmit\n");
+#else
+		device_printf(dev, "ALTQ prevents using simple if_transmit\n");
+#endif
+	}
 	iflib_reset_qvalues(ctx);
 	IFNET_WLOCK();
 	CTX_LOCK(ctx);
@@ -6766,6 +6786,9 @@ iflib_add_device_sysctl_pre(if_ctx_t ctx)
 	SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
 	    CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, "driver version");
 
+	SYSCTL_ADD_BOOL(ctx_list, oid_list, OID_AUTO, "simple_tx",
+	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_simple_tx, 0,
+	    "use simple tx ring");
 	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
 	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
 	    "# of txqs to use, 0 => use default #");
@@ -7088,3 +7111,48 @@ iflib_debugnet_poll(if_t ifp, int count)
 	return (0);
 }
 #endif /* DEBUGNET */
+
+
+static inline iflib_txq_t
+iflib_simple_select_queue(if_ctx_t ctx, struct mbuf *m)
+{
+	int qidx;
+
+	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m))
+		qidx = QIDX(ctx, m);
+	else
+		qidx = NTXQSETS(ctx) + FIRST_QSET(ctx) - 1;
+	return (&ctx->ifc_txqs[qidx]);
+}
+
+static int
+iflib_simple_transmit(if_t ifp, struct mbuf *m)
+{
+	if_ctx_t ctx;
+	iflib_txq_t txq;
+	int error;
+	int bytes_sent = 0, pkt_sent = 0, mcast_sent = 0;
+
+
+	ctx = if_getsoftc(ifp);
+	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+	    IFF_DRV_RUNNING)
+		return (EBUSY);
+	txq = iflib_simple_select_queue(ctx, m);
+	mtx_lock(&txq->ift_mtx);
+	error = iflib_encap(txq, &m);
+	if (error == 0) {
+		pkt_sent++;
+		bytes_sent += m->m_pkthdr.len;
+		mcast_sent += !!(m->m_flags & M_MCAST);
+		(void)iflib_txd_db_check(txq, true);
+	}
+	(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+	mtx_unlock(&txq->ift_mtx);
+	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
+	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
+	if (mcast_sent)
+		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
+
+	return (error);
+}