git: 9c9501390512 - main - iflib: Introduce v2 of TX Queue Select Functionality

From: Eric Joyner <erj_at_FreeBSD.org>
Date: Mon, 17 Oct 2022 22:01:22 UTC
The branch main has been updated by erj:

URL: https://cgit.FreeBSD.org/src/commit/?id=9c950139051298831ce19d01ea5fb33ec6ea7f89

commit 9c950139051298831ce19d01ea5fb33ec6ea7f89
Author:     Eric Joyner <erj@FreeBSD.org>
AuthorDate: 2022-10-17 21:52:20 +0000
Commit:     Eric Joyner <erj@FreeBSD.org>
CommitDate: 2022-10-17 21:59:55 +0000

    iflib: Introduce v2 of TX Queue Select Functionality
    
    For v2, iflib will parse packet headers before queueing a packet.
    
    This commit also adds a new field in the structure that holds parsed
    header information from packets; it stores the IP ToS/traffic class
    field found in the IPv4/IPv6 header.
    
    To help, it will only partially parse packet headers before queueing
    them by using a new header parsing function that does less than the
    current header parsing function; for our purposes we only need up to the
    minimal IP header in order to get the IP ToS information and don't need
    to pull up more data.
    
    For now, v1 and v2 co-exist in this patch; v1 still offers a
    less-invasive method where none of the packet is parsed in iflib before
    queueing.
    
    This also bumps the sys/param.h version.
    
    Signed-off-by:  Eric Joyner <erj@FreeBSD.org>
    Tested by:      IntelNetworking
    MFC after:      3 days
    Sponsored by:   Intel Corporation
    Differential Revision:  https://reviews.freebsd.org/D34742
---
 sys/net/iflib.c | 196 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 sys/net/iflib.h |  12 +++-
 sys/sys/param.h |   2 +-
 3 files changed, 185 insertions(+), 25 deletions(-)

diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index b7a96eb3dde5..b0701c1eb63c 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -210,6 +210,7 @@ struct iflib_ctx {
 #define isc_rxd_flush ifc_txrx.ift_rxd_flush
 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
 #define isc_txq_select ifc_txrx.ift_txq_select
+#define isc_txq_select_v2 ifc_txrx.ift_txq_select_v2
 	eventhandler_tag ifc_vlan_attach_event;
 	eventhandler_tag ifc_vlan_detach_event;
 	struct ether_addr ifc_mac;
@@ -3195,32 +3196,24 @@ print_pkt(if_pkt_info_t pi)
 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
 #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
 
+/**
+ * Parses out ethernet header information in the given mbuf.
+ * Returns in pi: ipi_etype (EtherType) and ipi_ehdrlen (Ethernet header length)
+ *
+ * This will account for the VLAN header if present.
+ *
+ * XXX: This doesn't handle QinQ, which could prevent TX offloads for those
+ * types of packets.
+ */
 static int
-iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
+iflib_parse_ether_header(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
 {
-	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
 	struct ether_vlan_header *eh;
 	struct mbuf *m;
 
 	m = *mp;
-	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
-	    M_WRITABLE(m) == 0) {
-		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
-			return (ENOMEM);
-		} else {
-			m_freem(*mp);
-			DBG_COUNTER_INC(tx_frees);
-			*mp = m;
-		}
-	}
-
-	/*
-	 * Determine where frame payload starts.
-	 * Jump over vlan headers if already present,
-	 * helpful for QinQ too.
-	 */
 	if (__predict_false(m->m_len < sizeof(*eh))) {
-		txq->ift_pullups++;
+		(*pullups)++;
 		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
 			return (ENOMEM);
 	}
@@ -3232,6 +3225,143 @@ iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 		pi->ipi_etype = ntohs(eh->evl_encap_proto);
 		pi->ipi_ehdrlen = ETHER_HDR_LEN;
 	}
+	*mp = m;
+
+	return (0);
+}
+
+/**
+ * Parse up to the L3 header and extract IPv4/IPv6 header information into pi.
+ * Currently this information includes: IP ToS value, IP header version/presence
+ *
+ * This is missing some checks and doesn't edit the packet content as it goes,
+ * unlike iflib_parse_header(), in order to keep the amount of code here minimal.
+ */
+static int
+iflib_parse_header_partial(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
+{
+	struct mbuf *m;
+	int err;
+
+	*pullups = 0;
+	m = *mp;
+	if (!M_WRITABLE(m)) {
+		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
+			return (ENOMEM);
+		} else {
+			m_freem(*mp);
+			DBG_COUNTER_INC(tx_frees);
+			*mp = m;
+		}
+	}
+
+	/* Fills out pi->ipi_etype */
+	err = iflib_parse_ether_header(pi, mp, pullups);
+	if (err)
+		return (err);
+	m = *mp;
+
+	switch (pi->ipi_etype) {
+#ifdef INET
+	case ETHERTYPE_IP:
+	{
+		struct mbuf *n;
+		struct ip *ip = NULL;
+		int miniplen;
+
+		miniplen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip));
+		if (__predict_false(m->m_len < miniplen)) {
+			/*
+			 * Check for common case where the first mbuf only contains
+			 * the Ethernet header
+			 */
+			if (m->m_len == pi->ipi_ehdrlen) {
+				n = m->m_next;
+				MPASS(n);
+				/* If next mbuf contains at least the minimal IP header, then stop */
+				if (n->m_len >= sizeof(*ip)) {
+					ip = (struct ip *)n->m_data;
+				} else {
+					(*pullups)++;
+					if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
+						return (ENOMEM);
+					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
+				}
+			} else {
+				(*pullups)++;
+				if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
+					return (ENOMEM);
+				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
+			}
+		} else {
+			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
+		}
+
+		/* Have the IPv4 header w/ no options here */
+		pi->ipi_ip_hlen = ip->ip_hl << 2;
+		pi->ipi_ipproto = ip->ip_p;
+		pi->ipi_ip_tos = ip->ip_tos;
+		pi->ipi_flags |= IPI_TX_IPV4;
+
+		break;
+	}
+#endif
+#ifdef INET6
+	case ETHERTYPE_IPV6:
+	{
+		struct ip6_hdr *ip6;
+
+		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
+			(*pullups)++;
+			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
+				return (ENOMEM);
+		}
+		ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
+
+		/* Have the IPv6 fixed header here */
+		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
+		pi->ipi_ipproto = ip6->ip6_nxt;
+		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
+		pi->ipi_flags |= IPI_TX_IPV6;
+
+		break;
+	}
+#endif
+	default:
+		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
+		pi->ipi_ip_hlen = 0;
+		break;
+	}
+	*mp = m;
+
+	return (0);
+
+}
+
+static int
+iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
+{
+	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
+	struct mbuf *m;
+	int err;
+
+	m = *mp;
+	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
+	    M_WRITABLE(m) == 0) {
+		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
+			return (ENOMEM);
+		} else {
+			m_freem(*mp);
+			DBG_COUNTER_INC(tx_frees);
+			*mp = m;
+		}
+	}
+
+	/* Fills out pi->ipi_etype */
+	err = iflib_parse_ether_header(pi, mp, &txq->ift_pullups);
+	if (__predict_false(err))
+		return (err);
+	m = *mp;
 
 	switch (pi->ipi_etype) {
 #ifdef INET
@@ -3276,6 +3406,7 @@ iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 		}
 		pi->ipi_ip_hlen = ip->ip_hl << 2;
 		pi->ipi_ipproto = ip->ip_p;
+		pi->ipi_ip_tos = ip->ip_tos;
 		pi->ipi_flags |= IPI_TX_IPV4;
 
 		/* TCP checksum offload may require TCP header length */
@@ -3329,6 +3460,7 @@ iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 
 		/* XXX-BZ this will go badly in case of ext hdrs. */
 		pi->ipi_ipproto = ip6->ip6_nxt;
+		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
 		pi->ipi_flags |= IPI_TX_IPV6;
 
 		/* TCP checksum offload may require TCP header length */
@@ -4146,11 +4278,10 @@ iflib_if_init(void *arg)
 static int
 iflib_if_transmit(if_t ifp, struct mbuf *m)
 {
-	if_ctx_t	ctx = if_getsoftc(ifp);
-
+	if_ctx_t ctx = if_getsoftc(ifp);
 	iflib_txq_t txq;
 	int err, qidx;
-	int abdicate = ctx->ifc_sysctl_tx_abdicate;
+	int abdicate;
 
 	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
 		DBG_COUNTER_INC(tx_frees);
@@ -4162,7 +4293,24 @@ iflib_if_transmit(if_t ifp, struct mbuf *m)
 	/* ALTQ-enabled interfaces always use queue 0. */
 	qidx = 0;
 	/* Use driver-supplied queue selection method if it exists */
-	if (ctx->isc_txq_select)
+	if (ctx->isc_txq_select_v2) {
+		struct if_pkt_info pi;
+		uint64_t early_pullups = 0;
+		pkt_info_zero(&pi);
+
+		err = iflib_parse_header_partial(&pi, &m, &early_pullups);
+		if (__predict_false(err != 0)) {
+			/* Assign pullups for bad pkts to default queue */
+			ctx->ifc_txqs[0].ift_pullups += early_pullups;
+			DBG_COUNTER_INC(encap_txd_encap_fail);
+			return (err);
+		}
+		/* Let driver make queueing decision */
+		qidx = ctx->isc_txq_select_v2(ctx->ifc_softc, m, &pi);
+		ctx->ifc_txqs[qidx].ift_pullups += early_pullups;
+	}
+	/* Backwards compatibility w/ simpler queue select */
+	else if (ctx->isc_txq_select)
 		qidx = ctx->isc_txq_select(ctx->ifc_softc, m);
 	/* If not, use iflib's standard method */
 	else if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd))
@@ -4207,6 +4355,8 @@ iflib_if_transmit(if_t ifp, struct mbuf *m)
 	}
 #endif
 	DBG_COUNTER_INC(tx_seen);
+	abdicate = ctx->ifc_sysctl_tx_abdicate;
+
 	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
 
 	if (abdicate)
diff --git a/sys/net/iflib.h b/sys/net/iflib.h
index be858a262706..9f60085ce4d6 100644
--- a/sys/net/iflib.h
+++ b/sys/net/iflib.h
@@ -131,7 +131,9 @@ typedef struct if_pkt_info {
 	uint8_t			ipi_mflags;	/* packet mbuf flags */
 
 	uint32_t		ipi_tcp_seq;	/* tcp seqno */
-	uint32_t		__spare0__;
+	uint8_t			ipi_ip_tos;	/* IP ToS field data */
+	uint8_t			__spare0__;
+	uint16_t		__spare1__;
 } *if_pkt_info_t;
 
 typedef struct if_irq {
@@ -188,6 +190,7 @@ typedef struct if_txrx {
 	void (*ift_rxd_flush) (void *, uint16_t qsidx, uint8_t flidx, qidx_t pidx);
 	int (*ift_legacy_intr) (void *);
 	qidx_t (*ift_txq_select) (void *, struct mbuf *);
+	qidx_t (*ift_txq_select_v2) (void *, struct mbuf *, if_pkt_info_t);
 } *if_txrx_t;
 
 typedef struct if_softc_ctx {
@@ -416,6 +419,13 @@ typedef enum {
  * as ift_txq_select in struct if_txrx
  */
 #define IFLIB_FEATURE_QUEUE_SELECT	1400050
+/*
+ * Driver can set its own TX queue selection function
+ * as ift_txq_select_v2 in struct if_txrx. This includes
+ * having iflib send L3+ extra header information to the
+ * function.
+ */
+#define IFLIB_FEATURE_QUEUE_SELECT_V2	1400073
 
 /*
  * These enum values are used in iflib_needs_restart to indicate to iflib
diff --git a/sys/sys/param.h b/sys/sys/param.h
index b7515a9116f5..1f0eac771c02 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -76,7 +76,7 @@
  * cannot include sys/param.h and should only be updated here.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1400072
+#define __FreeBSD_version 1400073
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,