git: 84f8ca1bd11d - main - iflib: add a simple transmit routine
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 20 Aug 2025 16:51:16 UTC
The branch main has been updated by gallatin: URL: https://cgit.FreeBSD.org/src/commit/?id=84f8ca1bd11d97d8d254248da7c09507038be505 commit 84f8ca1bd11d97d8d254248da7c09507038be505 Author: Andrew Gallatin <gallatin@FreeBSD.org> AuthorDate: 2025-08-20 16:49:32 +0000 Commit: Andrew Gallatin <gallatin@FreeBSD.org> CommitDate: 2025-08-20 16:49:32 +0000 iflib: add a simple transmit routine While mp_ring can provide amazing scalability in scenarios where the number of cores exceeds the number of NIC tx rings, it can also lead to greatly reduced performance in simpler, high packet rate scenarios due to extra CPU cycles and cache misses stemming from its complexity. This change implements a simple if_transmit routine, selected at driver load. This routine does not queue anything, and uses a simple queue selection and ends up being far more cache friendly. In testing on a 400GbE NIC in an AMD 7502P EPYC server, this simple tx routine is roughly 2.5 times as fast as mp_ring (8Gbs -> 20Gb/s). and 5x as fast as mp_ring with tx_abdicate=1 (4Gbs -> 20Gb/s) for a simple in-kernel packet generator, which is closed source currently. It also shows a 50% speedup for a simple netperf -tUDP_STREAM test (5Gb/s -> 8Gbs). This change is mostly a noop, as it not enabled by default. The one exception is the change to iflib_encap() to immediately reclaim completed tx descriptors, and only failing the transmit and scheduling a later reclaim if iflib_completed_tx_reclaim() didn't free enough descriptors. Reviewed by: kbowling, sumit.saxena_broadcom.com, vmaffione Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D51905 --- share/man/man4/iflib.4 | 14 +++++++- sys/net/iflib.c | 92 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 93 insertions(+), 13 deletions(-) diff --git a/share/man/man4/iflib.4 b/share/man/man4/iflib.4 index 0114263e6ca2..2040698f0087 100644 --- a/share/man/man4/iflib.4 +++ b/share/man/man4/iflib.4 @@ -1,4 +1,4 @@ -.Dd September 27, 2018 +.Dd August 20, 2025 .Dt IFLIB 4 .Os .Sh NAME @@ -64,6 +64,18 @@ If this is zero or not set, an RX and TX queue pair will be assigned to each core. When set to a non-zero value, TX queues are assigned to cores following the last RX queue. +.It Va simple_tx +When set to one, iflib uses a simple transmit routine with no queuing at all. +By default, iflib uses a highly optimized, lockless, transmit queue called +mp_ring. +This performs well when there are more CPU cores than NIC +queues and prevents lock contention for transmit resources. +Unfortunately, mp_ring incurs unneeded overheads on workloads where +resource contention is not a problem (well behaved applications on +systems where there are as many NIC queues as CPU cores). +Note that when this is enabled, the tx_abdicate sysctl is no longer +applicable and is ignored. +Defaults to zero. .El .Pp These diff --git a/sys/net/iflib.c b/sys/net/iflib.c index 2b43f6f19051..98c59e5de988 100644 --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -142,6 +142,7 @@ struct iflib_ctx; static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid); static void iflib_timer(void *arg); static void iflib_tqg_detach(if_ctx_t ctx); +static int iflib_simple_transmit(if_t ifp, struct mbuf *m); typedef struct iflib_filter_info { driver_filter_t *ifi_filter; @@ -198,6 +199,7 @@ struct iflib_ctx { uint8_t ifc_sysctl_use_logical_cores; uint16_t ifc_sysctl_extra_msix_vectors; bool ifc_cpus_are_physical_cores; + bool ifc_sysctl_simple_tx; qidx_t ifc_sysctl_ntxds[8]; qidx_t ifc_sysctl_nrxds[8]; @@ -725,6 +727,7 @@ static void iflib_free_intr_mem(if_ctx_t ctx); #ifndef __NO_STRICT_ALIGNMENT static struct mbuf *iflib_fixup_rx(struct mbuf *m); #endif +static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh); static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets = SLIST_HEAD_INITIALIZER(cpu_offsets); @@ -2624,8 +2627,10 @@ iflib_stop(if_ctx_t ctx) #endif /* DEV_NETMAP */ CALLOUT_UNLOCK(txq); - /* clean any enqueued buffers */ - iflib_ifmp_purge(txq); + if (!ctx->ifc_sysctl_simple_tx) { + /* clean any enqueued buffers */ + iflib_ifmp_purge(txq); + } /* Free any existing tx buffers. */ for (j = 0; j < txq->ift_size; j++) { iflib_txsd_free(ctx, txq, j); @@ -3635,13 +3640,16 @@ defrag: * cxgb */ if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) { - txq->ift_no_desc_avail++; - bus_dmamap_unload(buf_tag, map); - DBG_COUNTER_INC(encap_txq_avail_fail); - DBG_COUNTER_INC(encap_txd_encap_fail); - if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0) - GROUPTASK_ENQUEUE(&txq->ift_task); - return (ENOBUFS); + (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) { + txq->ift_no_desc_avail++; + bus_dmamap_unload(buf_tag, map); + DBG_COUNTER_INC(encap_txq_avail_fail); + DBG_COUNTER_INC(encap_txd_encap_fail); + if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0) + GROUPTASK_ENQUEUE(&txq->ift_task); + return (ENOBUFS); + } } /* * On Intel cards we can greatly reduce the number of TX interrupts @@ -4014,6 +4022,12 @@ _task_fn_tx(void *context) netmap_tx_irq(ifp, txq->ift_id)) goto skip_ifmp; #endif + if (ctx->ifc_sysctl_simple_tx) { + mtx_lock(&txq->ift_mtx); + (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + mtx_unlock(&txq->ift_mtx); + goto skip_ifmp; + } #ifdef ALTQ if (if_altq_is_enabled(ifp)) iflib_altq_if_start(ifp); @@ -4027,9 +4041,8 @@ _task_fn_tx(void *context) */ if (abdicate) ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE); -#ifdef DEV_NETMAP + skip_ifmp: -#endif if (ctx->ifc_flags & IFC_LEGACY) IFDI_INTR_ENABLE(ctx); else @@ -5131,7 +5144,14 @@ iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ct scctx = &ctx->ifc_softc_ctx; ifp = ctx->ifc_ifp; - + if (ctx->ifc_sysctl_simple_tx) { +#ifndef ALTQ + if_settransmitfn(ifp, iflib_simple_transmit); + device_printf(dev, "using simple if_transmit\n"); +#else + device_printf(dev, "ALTQ prevents using simple if_transmit\n"); +#endif + } iflib_reset_qvalues(ctx); IFNET_WLOCK(); CTX_LOCK(ctx); @@ -6766,6 +6786,9 @@ iflib_add_device_sysctl_pre(if_ctx_t ctx) SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version", CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, "driver version"); + SYSCTL_ADD_BOOL(ctx_list, oid_list, OID_AUTO, "simple_tx", + CTLFLAG_RDTUN, &ctx->ifc_sysctl_simple_tx, 0, + "use simple tx ring"); SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs", CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0, "# of txqs to use, 0 => use default #"); @@ -7088,3 +7111,48 @@ iflib_debugnet_poll(if_t ifp, int count) return (0); } #endif /* DEBUGNET */ + + +static inline iflib_txq_t +iflib_simple_select_queue(if_ctx_t ctx, struct mbuf *m) +{ + int qidx; + + if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m)) + qidx = QIDX(ctx, m); + else + qidx = NTXQSETS(ctx) + FIRST_QSET(ctx) - 1; + return (&ctx->ifc_txqs[qidx]); +} + +static int +iflib_simple_transmit(if_t ifp, struct mbuf *m) +{ + if_ctx_t ctx; + iflib_txq_t txq; + int error; + int bytes_sent = 0, pkt_sent = 0, mcast_sent = 0; + + + ctx = if_getsoftc(ifp); + if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING) + return (EBUSY); + txq = iflib_simple_select_queue(ctx, m); + mtx_lock(&txq->ift_mtx); + error = iflib_encap(txq, &m); + if (error == 0) { + pkt_sent++; + bytes_sent += m->m_pkthdr.len; + mcast_sent += !!(m->m_flags & M_MCAST); + (void)iflib_txd_db_check(txq, true); + } + (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + mtx_unlock(&txq->ift_mtx); + if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); + if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); + if (mcast_sent) + if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent); + + return (error); +}