svn commit: r364848 - in stable/12/sys/dev/cxgbe: . common

Navdeep Parhar <np at FreeBSD.org>
Wed Aug 26 23:21:28 UTC 2020


Author: np
Date: Wed Aug 26 23:21:26 2020
New Revision: 364848
URL: https://svnweb.freebsd.org/changeset/base/364848

Log:
  MFC r340023 (by jhb@), r362905, r362938, and r363167.
  
  r340023:
  Check cannot_use_txpkts() rather than needs_tso() in add_to_txpkts().
  
  Currently this is a no-op, but it will matter in the future when
  cannot_use_txpkts() starts checking conditions other than just
  needs_tso(), as sketched below.
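  
  A minimal sketch of the intent (a hypothetical simplification; the
  argument lists follow the old prototypes that this diff later removes
  from t4_sge.c, and the real helpers check more than this):
  
	static inline bool
	cannot_use_txpkts(struct mbuf *m)
	{
		/* Today this reduces to the TSO check alone. */
		return (needs_tso(m));
	}
  
	static int
	add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
	{
		if (cannot_use_txpkts(m))	/* was: needs_tso(m) */
			return (1);	/* caller emits a plain txpkt WR instead */
  
		/* 'available' (free hw descriptors) and the coalescing
		 * bookkeeping are consulted in the real code. */
		return (0);
	}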
  
  r362905:
  cxgbe(4): changes in the Tx path to help increase tx coalescing.
  
  - Ask the firmware for the number of frames that can be stuffed in one
    work request; a short sketch of how this limit is applied follows
    this list.
  
  - Modify mp_ring to increase the likelihood of tx coalescing when just
    one or two threads are doing most of the tx.  Add teeth to the
    abdication mechanism by pushing the consumer lock into mp_ring.  This
    reduces the likelihood that a consumer gets stuck with all the work
    even though it is over its budget.
  
  - Add support for coalesced tx WR to the VF driver.  This, with the
    changes above, results in a 7x improvement in the tx pps of the VF
    driver for some common cases.  The firmware vets the L2 headers
    submitted by the VF driver and it's a big win if the checks are
    performed for a batch of packets and not each one individually.
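  
  A hedged sketch of how the firmware-reported limit bounds the coalescing
  budget (txq, sc, and the exact clamp are assumptions; the field names
  come from the txpkts and adapter_params changes in this diff):
  
	struct txpkts *txp = &txq->txp;
  
	/*
	 * Never try to coalesce more frames than the firmware allows or
	 * than the mb[] array in struct txpkts can hold; the driver falls
	 * back to 15 when the firmware does not report the parameter.
	 */
	txp->max_npkt = min(nitems(txp->mb),
	    sc->params.max_pkts_per_eth_tx_pkts_wr);
  
  Frames held for coalescing are flushed by mp_ring_check_drainage() (see
  the COALESCING handling in the t4_mp_ring.c hunks below) or by a tx
  update, so an idle queue does not sit on them indefinitely.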
  
  r362938:
  cxgbe(4): Fix a bug (introduced in r362905) where some tx traffic wasn't
  being reported to BPF.
  
  r363167:
  cxgbev(4): Compare at most 16 bytes of the Ethernet header when trying
  to coalesce tx work requests.
  
  Note that Coverity will still treat this as an out-of-bounds access.  We
  do want to compare 16B starting from ethmacdst, but cmp_l2hdr was going
  beyond that by 2B.
  
  cmp_l2hdr was introduced in r362905.
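  
  For illustration, a hedged sketch of the intended comparison (close to,
  but not necessarily verbatim, the driver's cmp_l2hdr): it covers exactly
  the 16 bytes that struct txpkts keeps for the L2 header, i.e. 6 (dst
  MAC) + 6 (src MAC) + 2 (ethertype) + 2 (VLAN TCI), and nothing past
  them:
  
	static inline bool
	cmp_l2hdr(struct txpkts *txp, struct mbuf *m)
	{
		MPASS(m->m_len >= 16);	/* assumes a contiguous L2 header */
  
		/* true: headers differ, so this frame cannot be coalesced. */
		return (memcmp(&txp->ethmacdst[0], mtod(m, const void *),
		    16) != 0);
	}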
  
  Sponsored by:	Chelsio Communications

Modified:
  stable/12/sys/dev/cxgbe/adapter.h
  stable/12/sys/dev/cxgbe/common/common.h
  stable/12/sys/dev/cxgbe/t4_main.c
  stable/12/sys/dev/cxgbe/t4_mp_ring.c
  stable/12/sys/dev/cxgbe/t4_mp_ring.h
  stable/12/sys/dev/cxgbe/t4_sge.c
  stable/12/sys/dev/cxgbe/t4_vf.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/sys/dev/cxgbe/adapter.h
==============================================================================
--- stable/12/sys/dev/cxgbe/adapter.h	Wed Aug 26 22:52:07 2020	(r364847)
+++ stable/12/sys/dev/cxgbe/adapter.h	Wed Aug 26 23:21:26 2020	(r364848)
@@ -546,6 +546,23 @@ struct sge_fl {
 
 struct mp_ring;
 
+struct txpkts {
+	uint8_t wr_type;	/* type 0 or type 1 */
+	uint8_t npkt;		/* # of packets in this work request */
+	uint8_t len16;		/* # of 16B pieces used by this work request */
+	uint8_t score;		/* 1-10. coalescing attempted if score > 3 */
+	uint8_t max_npkt;	/* maximum number of packets allowed */
+	uint16_t plen;		/* total payload (sum of all packets) */
+
+	/* straight from fw_eth_tx_pkts_vm_wr. */
+	__u8   ethmacdst[6];
+	__u8   ethmacsrc[6];
+	__be16 ethtype;
+	__be16 vlantci;
+
+	struct mbuf *mb[15];
+};
+
 /* txq: SGE egress queue + what's needed for Ethernet NIC */
 struct sge_txq {
 	struct sge_eq eq;	/* MUST be first */
@@ -556,6 +573,7 @@ struct sge_txq {
 	struct sglist *gl;
 	__be32 cpl_ctrl0;	/* for convenience */
 	int tc_idx;		/* traffic class */
+	struct txpkts txp;
 
 	struct task tx_reclaim_task;
 	/* stats for common events first */

Modified: stable/12/sys/dev/cxgbe/common/common.h
==============================================================================
--- stable/12/sys/dev/cxgbe/common/common.h	Wed Aug 26 22:52:07 2020	(r364847)
+++ stable/12/sys/dev/cxgbe/common/common.h	Wed Aug 26 23:21:26 2020	(r364848)
@@ -393,6 +393,7 @@ struct adapter_params {
 	bool ulptx_memwrite_dsgl;	/* use of T5 DSGL allowed */
 	bool fr_nsmr_tpte_wr_support;	/* FW support for FR_NSMR_TPTE_WR */
 	bool viid_smt_extn_support;	/* FW returns vin, vfvld & smt index? */
+	unsigned int max_pkts_per_eth_tx_pkts_wr;
 };
 
 #define CHELSIO_T4		0x4

Modified: stable/12/sys/dev/cxgbe/t4_main.c
==============================================================================
--- stable/12/sys/dev/cxgbe/t4_main.c	Wed Aug 26 22:52:07 2020	(r364847)
+++ stable/12/sys/dev/cxgbe/t4_main.c	Wed Aug 26 23:21:26 2020	(r364848)
@@ -2107,7 +2107,7 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m)
 		    vi->rsrv_noflowq);
 
 	items[0] = m;
-	rc = mp_ring_enqueue(txq->r, items, 1, 4096);
+	rc = mp_ring_enqueue(txq->r, items, 1, 256);
 	if (__predict_false(rc != 0))
 		m_freem(m);
 
@@ -2128,7 +2128,7 @@ cxgbe_qflush(struct ifnet *ifp)
 			txq->eq.flags |= EQ_QFLUSH;
 			TXQ_UNLOCK(txq);
 			while (!mp_ring_is_idle(txq->r)) {
-				mp_ring_check_drainage(txq->r, 0);
+				mp_ring_check_drainage(txq->r, 4096);
 				pause("qflush", 1);
 			}
 			TXQ_LOCK(txq);
@@ -2177,7 +2177,7 @@ vi_get_counter(struct ifnet *ifp, ift_counter c)
 			struct sge_txq *txq;
 
 			for_each_txq(vi, i, txq)
-				drops += counter_u64_fetch(txq->r->drops);
+				drops += counter_u64_fetch(txq->r->dropped);
 		}
 
 		return (drops);
@@ -2242,7 +2242,7 @@ cxgbe_get_counter(struct ifnet *ifp, ift_counter c)
 			struct sge_txq *txq;
 
 			for_each_txq(vi, i, txq)
-				drops += counter_u64_fetch(txq->r->drops);
+				drops += counter_u64_fetch(txq->r->dropped);
 		}
 
 		return (drops);
@@ -4276,6 +4276,13 @@ get_params__post_init(struct adapter *sc)
 	else
 		sc->params.fr_nsmr_tpte_wr_support = false;
 
+	param[0] = FW_PARAM_PFVF(MAX_PKTS_PER_ETH_TX_PKTS_WR);
+	rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val);
+	if (rc == 0)
+		sc->params.max_pkts_per_eth_tx_pkts_wr = val[0];
+	else
+		sc->params.max_pkts_per_eth_tx_pkts_wr = 15;
+
 	/* get capabilites */
 	bzero(&caps, sizeof(caps));
 	caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) |
@@ -5687,7 +5694,7 @@ quiesce_txq(struct adapter *sc, struct sge_txq *txq)
 
 	/* Wait for the mp_ring to empty. */
 	while (!mp_ring_is_idle(txq->r)) {
-		mp_ring_check_drainage(txq->r, 0);
+		mp_ring_check_drainage(txq->r, 4096);
 		pause("rquiesce", 1);
 	}
 

Modified: stable/12/sys/dev/cxgbe/t4_mp_ring.c
==============================================================================
--- stable/12/sys/dev/cxgbe/t4_mp_ring.c	Wed Aug 26 22:52:07 2020	(r364847)
+++ stable/12/sys/dev/cxgbe/t4_mp_ring.c	Wed Aug 26 23:21:26 2020	(r364848)
@@ -34,6 +34,8 @@ __FBSDID("$FreeBSD$");
 #include <sys/counter.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
 #include <machine/cpu.h>
 
 #include "t4_mp_ring.h"
@@ -43,6 +45,23 @@ __FBSDID("$FreeBSD$");
 #define atomic_cmpset_rel_64 atomic_cmpset_64
 #endif
 
+/*
+ * mp_ring handles multiple threads (producers) enqueueing data to a tx queue.
+ * The thread that is writing the hardware descriptors is the consumer and it
+ * runs with the consumer lock held.  A producer becomes the consumer if there
+ * isn't one already.  The consumer runs with the flags sets to BUSY and
+ * consumes everything (IDLE or COALESCING) or gets STALLED.  If it is running
+ * over its budget it sets flags to TOO_BUSY.  A producer that observes a
+ * TOO_BUSY consumer will become the new consumer by setting flags to
+ * TAKING_OVER.  The original consumer stops and sets the flags back to BUSY for
+ * the new consumer.
+ *
+ * COALESCING is the same as IDLE except there are items being held in the hope
+ * that they can be coalesced with items that follow.  The driver must arrange
+ * for a tx update or some other event that transmits all the held items in a
+ * timely manner if nothing else is enqueued.
+ */
+
 union ring_state {
 	struct {
 		uint16_t pidx_head;
@@ -54,13 +73,21 @@ union ring_state {
 };
 
 enum {
-	IDLE = 0,	/* consumer ran to completion, nothing more to do. */
+	IDLE = 0,	/* tx is all caught up, nothing to do. */
+	COALESCING,	/* IDLE, but tx frames are being held for coalescing */
 	BUSY,		/* consumer is running already, or will be shortly. */
+	TOO_BUSY,	/* consumer is running and is beyond its budget */
+	TAKING_OVER,	/* new consumer taking over from a TOO_BUSY consumer */
 	STALLED,	/* consumer stopped due to lack of resources. */
-	ABDICATED,	/* consumer stopped even though there was work to be
-			   done because it wants another thread to take over. */
 };
 
+enum {
+	C_FAST = 0,
+	C_2,
+	C_3,
+	C_TAKEOVER,
+};
+
 static inline uint16_t
 space_available(struct mp_ring *r, union ring_state s)
 {
@@ -83,93 +110,104 @@ increment_idx(struct mp_ring *r, uint16_t idx, uint16_
 	return (x > n ? idx + n : n - x);
 }
 
-/* Consumer is about to update the ring's state to s */
-static inline uint16_t
-state_to_flags(union ring_state s, int abdicate)
-{
-
-	if (s.cidx == s.pidx_tail)
-		return (IDLE);
-	else if (abdicate && s.pidx_tail != s.pidx_head)
-		return (ABDICATED);
-
-	return (BUSY);
-}
-
 /*
- * Caller passes in a state, with a guarantee that there is work to do and that
- * all items up to the pidx_tail in the state are visible.
+ * Consumer.  Called with the consumer lock held and a guarantee that there is
+ * work to do.
  */
 static void
-drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget)
+drain_ring(struct mp_ring *r, int budget)
 {
-	union ring_state ns;
+	union ring_state os, ns;
 	int n, pending, total;
-	uint16_t cidx = os.cidx;
-	uint16_t pidx = os.pidx_tail;
+	uint16_t cidx;
+	uint16_t pidx;
+	bool coalescing;
 
+	mtx_assert(r->cons_lock, MA_OWNED);
+
+	os.state = atomic_load_acq_64(&r->state);
 	MPASS(os.flags == BUSY);
+
+	cidx = os.cidx;
+	pidx = os.pidx_tail;
 	MPASS(cidx != pidx);
 
-	if (prev == IDLE)
-		counter_u64_add(r->starts, 1);
 	pending = 0;
 	total = 0;
 
 	while (cidx != pidx) {
 
 		/* Items from cidx to pidx are available for consumption. */
-		n = r->drain(r, cidx, pidx);
+		n = r->drain(r, cidx, pidx, &coalescing);
 		if (n == 0) {
 			critical_enter();
-			os.state = r->state;
+			os.state = atomic_load_64(&r->state);
 			do {
 				ns.state = os.state;
 				ns.cidx = cidx;
-				ns.flags = STALLED;
+
+				MPASS(os.flags == BUSY ||
+				    os.flags == TOO_BUSY ||
+				    os.flags == TAKING_OVER);
+
+				if (os.flags == TAKING_OVER)
+					ns.flags = BUSY;
+				else
+					ns.flags = STALLED;
 			} while (atomic_fcmpset_64(&r->state, &os.state,
 			    ns.state) == 0);
 			critical_exit();
-			if (prev != STALLED)
+			if (os.flags == TAKING_OVER)
+				counter_u64_add(r->abdications, 1);
+			else if (ns.flags == STALLED)
 				counter_u64_add(r->stalls, 1);
-			else if (total > 0) {
-				counter_u64_add(r->restarts, 1);
-				counter_u64_add(r->stalls, 1);
-			}
 			break;
 		}
 		cidx = increment_idx(r, cidx, n);
 		pending += n;
 		total += n;
+		counter_u64_add(r->consumed, n);
 
-		/*
-		 * We update the cidx only if we've caught up with the pidx, the
-		 * real cidx is getting too far ahead of the one visible to
-		 * everyone else, or we have exceeded our budget.
-		 */
-		if (cidx != pidx && pending < 64 && total < budget)
-			continue;
-		critical_enter();
-		os.state = r->state;
+		os.state = atomic_load_64(&r->state);
 		do {
+			MPASS(os.flags == BUSY || os.flags == TOO_BUSY ||
+			    os.flags == TAKING_OVER);
+
 			ns.state = os.state;
 			ns.cidx = cidx;
-			ns.flags = state_to_flags(ns, total >= budget);
+			if (__predict_false(os.flags == TAKING_OVER)) {
+				MPASS(total >= budget);
+				ns.flags = BUSY;
+				continue;
+			}
+			if (cidx == os.pidx_tail) {
+				ns.flags = coalescing ? COALESCING : IDLE;
+				continue;
+			}
+			if (total >= budget) {
+				ns.flags = TOO_BUSY;
+				continue;
+			}
+			MPASS(os.flags == BUSY);
+			if (pending < 32)
+				break;
 		} while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0);
-		critical_exit();
 
-		if (ns.flags == ABDICATED)
+		if (__predict_false(os.flags == TAKING_OVER)) {
+			MPASS(ns.flags == BUSY);
 			counter_u64_add(r->abdications, 1);
-		if (ns.flags != BUSY) {
-			/* Wrong loop exit if we're going to stall. */
-			MPASS(ns.flags != STALLED);
-			if (prev == STALLED) {
-				MPASS(total > 0);
-				counter_u64_add(r->restarts, 1);
-			}
 			break;
 		}
 
+		if (ns.flags == IDLE || ns.flags == COALESCING) {
+			MPASS(ns.pidx_tail == cidx);
+			if (ns.pidx_head != ns.pidx_tail)
+				counter_u64_add(r->cons_idle2, 1);
+			else
+				counter_u64_add(r->cons_idle, 1);
+			break;
+		}
+
 		/*
 		 * The acquire style atomic above guarantees visibility of items
 		 * associated with any pidx change that we notice here.
@@ -177,13 +215,55 @@ drain_ring(struct mp_ring *r, union ring_state os, uin
 		pidx = ns.pidx_tail;
 		pending = 0;
 	}
+
+#ifdef INVARIANTS
+	if (os.flags == TAKING_OVER)
+		MPASS(ns.flags == BUSY);
+	else {
+		MPASS(ns.flags == IDLE || ns.flags == COALESCING ||
+		    ns.flags == STALLED);
+	}
+#endif
 }
 
+static void
+drain_txpkts(struct mp_ring *r, union ring_state os, int budget)
+{
+	union ring_state ns;
+	uint16_t cidx = os.cidx;
+	uint16_t pidx = os.pidx_tail;
+	bool coalescing;
+
+	mtx_assert(r->cons_lock, MA_OWNED);
+	MPASS(os.flags == BUSY);
+	MPASS(cidx == pidx);
+
+	r->drain(r, cidx, pidx, &coalescing);
+	MPASS(coalescing == false);
+	critical_enter();
+	os.state = atomic_load_64(&r->state);
+	do {
+		ns.state = os.state;
+		MPASS(os.flags == BUSY);
+		MPASS(os.cidx == cidx);
+		if (ns.cidx == ns.pidx_tail)
+			ns.flags = IDLE;
+		else
+			ns.flags = BUSY;
+	} while (atomic_fcmpset_acq_64(&r->state, &os.state, ns.state) == 0);
+	critical_exit();
+
+	if (ns.flags == BUSY)
+		drain_ring(r, budget);
+}
+
 int
 mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain,
-    ring_can_drain_t can_drain, struct malloc_type *mt, int flags)
+    ring_can_drain_t can_drain, struct malloc_type *mt, struct mtx *lck,
+    int flags)
 {
 	struct mp_ring *r;
+	int i;
 
 	/* All idx are 16b so size can be 65536 at most */
 	if (pr == NULL || size < 2 || size > 65536 || drain == NULL ||
@@ -201,43 +281,59 @@ mp_ring_alloc(struct mp_ring **pr, int size, void *coo
 	r->mt = mt;
 	r->drain = drain;
 	r->can_drain = can_drain;
-	r->enqueues = counter_u64_alloc(flags);
-	r->drops = counter_u64_alloc(flags);
-	r->starts = counter_u64_alloc(flags);
-	r->stalls = counter_u64_alloc(flags);
-	r->restarts = counter_u64_alloc(flags);
-	r->abdications = counter_u64_alloc(flags);
-	if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL ||
-	    r->stalls == NULL || r->restarts == NULL ||
-	    r->abdications == NULL) {
-		mp_ring_free(r);
-		return (ENOMEM);
+	r->cons_lock = lck;
+	if ((r->dropped = counter_u64_alloc(flags)) == NULL)
+		goto failed;
+	for (i = 0; i < nitems(r->consumer); i++) {
+		if ((r->consumer[i] = counter_u64_alloc(flags)) == NULL)
+			goto failed;
 	}
-
+	if ((r->not_consumer = counter_u64_alloc(flags)) == NULL)
+		goto failed;
+	if ((r->abdications = counter_u64_alloc(flags)) == NULL)
+		goto failed;
+	if ((r->stalls = counter_u64_alloc(flags)) == NULL)
+		goto failed;
+	if ((r->consumed = counter_u64_alloc(flags)) == NULL)
+		goto failed;
+	if ((r->cons_idle = counter_u64_alloc(flags)) == NULL)
+		goto failed;
+	if ((r->cons_idle2 = counter_u64_alloc(flags)) == NULL)
+		goto failed;
 	*pr = r;
 	return (0);
+failed:
+	mp_ring_free(r);
+	return (ENOMEM);
 }
 
 void
 mp_ring_free(struct mp_ring *r)
 {
+	int i;
 
 	if (r == NULL)
 		return;
 
-	if (r->enqueues != NULL)
-		counter_u64_free(r->enqueues);
-	if (r->drops != NULL)
-		counter_u64_free(r->drops);
-	if (r->starts != NULL)
-		counter_u64_free(r->starts);
-	if (r->stalls != NULL)
-		counter_u64_free(r->stalls);
-	if (r->restarts != NULL)
-		counter_u64_free(r->restarts);
+	if (r->dropped != NULL)
+		counter_u64_free(r->dropped);
+	for (i = 0; i < nitems(r->consumer); i++) {
+		if (r->consumer[i] != NULL)
+			counter_u64_free(r->consumer[i]);
+	}
+	if (r->not_consumer != NULL)
+		counter_u64_free(r->not_consumer);
 	if (r->abdications != NULL)
 		counter_u64_free(r->abdications);
+	if (r->stalls != NULL)
+		counter_u64_free(r->stalls);
+	if (r->consumed != NULL)
+		counter_u64_free(r->consumed);
+	if (r->cons_idle != NULL)
+		counter_u64_free(r->cons_idle);
+	if (r->cons_idle2 != NULL)
+		counter_u64_free(r->cons_idle2);
 
 	free(r, r->mt);
 }
@@ -252,7 +348,8 @@ mp_ring_enqueue(struct mp_ring *r, void **items, int n
 {
 	union ring_state os, ns;
 	uint16_t pidx_start, pidx_stop;
-	int i;
+	int i, nospc, cons;
+	bool consumer;
 
 	MPASS(items != NULL);
 	MPASS(n > 0);
@@ -261,26 +358,70 @@ mp_ring_enqueue(struct mp_ring *r, void **items, int n
 	 * Reserve room for the new items.  Our reservation, if successful, is
 	 * from 'pidx_start' to 'pidx_stop'.
 	 */
-	os.state = r->state;
+	nospc = 0;
+	os.state = atomic_load_64(&r->state);
 	for (;;) {
-		if (n >= space_available(r, os)) {
-			counter_u64_add(r->drops, n);
+		for (;;) {
+			if (__predict_true(space_available(r, os) >= n))
+				break;
+
+			/* Not enough room in the ring. */
+
 			MPASS(os.flags != IDLE);
+			MPASS(os.flags != COALESCING);
+			if (__predict_false(++nospc > 100)) {
+				counter_u64_add(r->dropped, n);
+				return (ENOBUFS);
+			}
 			if (os.flags == STALLED)
-				mp_ring_check_drainage(r, 0);
-			return (ENOBUFS);
+				mp_ring_check_drainage(r, 64);
+			else
+				cpu_spinwait();
+			os.state = atomic_load_64(&r->state);
 		}
+
+		/* There is room in the ring. */
+
+		cons = -1;
 		ns.state = os.state;
 		ns.pidx_head = increment_idx(r, os.pidx_head, n);
+		if (os.flags == IDLE || os.flags == COALESCING) {
+			MPASS(os.pidx_tail == os.cidx);
+			if (os.pidx_head == os.pidx_tail) {
+				cons = C_FAST;
+				ns.pidx_tail = increment_idx(r, os.pidx_tail, n);
+			} else
+				cons = C_2;
+			ns.flags = BUSY;
+		} else if (os.flags == TOO_BUSY) {
+			cons = C_TAKEOVER;
+			ns.flags = TAKING_OVER;
+		}
 		critical_enter();
 		if (atomic_fcmpset_64(&r->state, &os.state, ns.state))
 			break;
 		critical_exit();
 		cpu_spinwait();
-	}
+	};
+
 	pidx_start = os.pidx_head;
 	pidx_stop = ns.pidx_head;
 
+	if (cons == C_FAST) {
+		i = pidx_start;
+		do {
+			r->items[i] = *items++;
+			if (__predict_false(++i == r->size))
+				i = 0;
+		} while (i != pidx_stop);
+		critical_exit();
+		counter_u64_add(r->consumer[C_FAST], 1);
+		mtx_lock(r->cons_lock);
+		drain_ring(r, budget);
+		mtx_unlock(r->cons_lock);
+		return (0);
+	}
+
 	/*
 	 * Wait for other producers who got in ahead of us to enqueue their
 	 * items, one producer at a time.  It is our turn when the ring's
@@ -288,7 +429,7 @@ mp_ring_enqueue(struct mp_ring *r, void **items, int n
 	 */
 	while (ns.pidx_tail != pidx_start) {
 		cpu_spinwait();
-		ns.state = r->state;
+		ns.state = atomic_load_64(&r->state);
 	}
 
 	/* Now it is our turn to fill up the area we reserved earlier. */
@@ -303,21 +444,33 @@ mp_ring_enqueue(struct mp_ring *r, void **items, int n
 	 * Update the ring's pidx_tail.  The release style atomic guarantees
 	 * that the items are visible to any thread that sees the updated pidx.
 	 */
-	os.state = r->state;
+	os.state = atomic_load_64(&r->state);
 	do {
+		consumer = false;
 		ns.state = os.state;
 		ns.pidx_tail = pidx_stop;
-		ns.flags = BUSY;
+		if (os.flags == IDLE || os.flags == COALESCING ||
+		    (os.flags == STALLED && r->can_drain(r))) {
+			MPASS(cons == -1);
+			consumer = true;
+			ns.flags = BUSY;
+		}
 	} while (atomic_fcmpset_rel_64(&r->state, &os.state, ns.state) == 0);
 	critical_exit();
-	counter_u64_add(r->enqueues, n);
 
-	/*
-	 * Turn into a consumer if some other thread isn't active as a consumer
-	 * already.
-	 */
-	if (os.flags != BUSY)
-		drain_ring(r, ns, os.flags, budget);
+	if (cons == -1) {
+		if (consumer)
+			cons = C_3;
+		else {
+			counter_u64_add(r->not_consumer, 1);
+			return (0);
+		}
+	}
+	MPASS(cons > C_FAST && cons < nitems(r->consumer));
+	counter_u64_add(r->consumer[cons], 1);
+	mtx_lock(r->cons_lock);
+	drain_ring(r, budget);
+	mtx_unlock(r->cons_lock);
 
 	return (0);
 }
@@ -327,46 +480,96 @@ mp_ring_check_drainage(struct mp_ring *r, int budget)
 {
 	union ring_state os, ns;
 
-	os.state = r->state;
-	if (os.flags != STALLED || os.pidx_head != os.pidx_tail ||
-	    r->can_drain(r) == 0)
-		return;
-
-	MPASS(os.cidx != os.pidx_tail);	/* implied by STALLED */
-	ns.state = os.state;
-	ns.flags = BUSY;
-
-	/*
-	 * The acquire style atomic guarantees visibility of items associated
-	 * with the pidx that we read here.
-	 */
-	if (!atomic_cmpset_acq_64(&r->state, os.state, ns.state))
-		return;
-
-	drain_ring(r, ns, os.flags, budget);
+	os.state = atomic_load_64(&r->state);
+	if (os.flags == STALLED && r->can_drain(r)) {
+		MPASS(os.cidx != os.pidx_tail);	/* implied by STALLED */
+		ns.state = os.state;
+		ns.flags = BUSY;
+		if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) {
+			mtx_lock(r->cons_lock);
+			drain_ring(r, budget);
+			mtx_unlock(r->cons_lock);
+		}
+	} else if (os.flags == COALESCING) {
+		MPASS(os.cidx == os.pidx_tail);
+		ns.state = os.state;
+		ns.flags = BUSY;
+		if (atomic_cmpset_acq_64(&r->state, os.state, ns.state)) {
+			mtx_lock(r->cons_lock);
+			drain_txpkts(r, ns, budget);
+			mtx_unlock(r->cons_lock);
+		}
+	}
 }
 
 void
 mp_ring_reset_stats(struct mp_ring *r)
 {
+	int i;
 
-	counter_u64_zero(r->enqueues);
-	counter_u64_zero(r->drops);
-	counter_u64_zero(r->starts);
-	counter_u64_zero(r->stalls);
-	counter_u64_zero(r->restarts);
+	counter_u64_zero(r->dropped);
+	for (i = 0; i < nitems(r->consumer); i++)
+		counter_u64_zero(r->consumer[i]);
+	counter_u64_zero(r->not_consumer);
 	counter_u64_zero(r->abdications);
+	counter_u64_zero(r->stalls);
+	counter_u64_zero(r->consumed);
+	counter_u64_zero(r->cons_idle);
+	counter_u64_zero(r->cons_idle2);
 }
 
-int
+bool
 mp_ring_is_idle(struct mp_ring *r)
 {
 	union ring_state s;
 
-	s.state = r->state;
+	s.state = atomic_load_64(&r->state);
 	if (s.pidx_head == s.pidx_tail && s.pidx_tail == s.cidx &&
 	    s.flags == IDLE)
-		return (1);
+		return (true);
 
-	return (0);
+	return (false);
+}
+
+void
+mp_ring_sysctls(struct mp_ring *r, struct sysctl_ctx_list *ctx,
+    struct sysctl_oid_list *children)
+{
+	struct sysctl_oid *oid;
+
+	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "mp_ring", CTLFLAG_RD |
+	    CTLFLAG_MPSAFE, NULL, "mp_ring statistics");
+	children = SYSCTL_CHILDREN(oid);
+
+	SYSCTL_ADD_U64(ctx, children, OID_AUTO, "state", CTLFLAG_RD,
+	    __DEVOLATILE(uint64_t *, &r->state), 0, "ring state");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "dropped", CTLFLAG_RD,
+	    &r->dropped, "# of items dropped");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumed",
+	    CTLFLAG_RD, &r->consumed, "# of items consumed");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "fast_consumer",
+	    CTLFLAG_RD, &r->consumer[C_FAST],
+	    "# of times producer became consumer (fast)");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer2",
+	    CTLFLAG_RD, &r->consumer[C_2],
+	    "# of times producer became consumer (2)");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "consumer3",
+	    CTLFLAG_RD, &r->consumer[C_3],
+	    "# of times producer became consumer (3)");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "takeovers",
+	    CTLFLAG_RD, &r->consumer[C_TAKEOVER],
+	    "# of times producer took over from another consumer.");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "not_consumer",
+	    CTLFLAG_RD, &r->not_consumer,
+	    "# of times producer did not become consumer");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "abdications",
+	    CTLFLAG_RD, &r->abdications, "# of consumer abdications");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "stalls",
+	    CTLFLAG_RD, &r->stalls, "# of consumer stalls");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle",
+	    CTLFLAG_RD, &r->cons_idle,
+	    "# of times consumer ran fully to completion");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "cons_idle2",
+	    CTLFLAG_RD, &r->cons_idle2,
+	    "# of times consumer idled when another enqueue was in progress");
 }

Modified: stable/12/sys/dev/cxgbe/t4_mp_ring.h
==============================================================================
--- stable/12/sys/dev/cxgbe/t4_mp_ring.h	Wed Aug 26 22:52:07 2020	(r364847)
+++ stable/12/sys/dev/cxgbe/t4_mp_ring.h	Wed Aug 26 23:21:26 2020	(r364848)
@@ -36,33 +36,38 @@
 #endif
 
 struct mp_ring;
-typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int);
+typedef u_int (*ring_drain_t)(struct mp_ring *, u_int, u_int, bool *);
 typedef u_int (*ring_can_drain_t)(struct mp_ring *);
 
 struct mp_ring {
 	volatile uint64_t	state __aligned(CACHE_LINE_SIZE);
+	struct malloc_type *	mt;
 
 	int			size __aligned(CACHE_LINE_SIZE);
 	void *			cookie;
-	struct malloc_type *	mt;
 	ring_drain_t		drain;
 	ring_can_drain_t	can_drain;	/* cheap, may be unreliable */
-	counter_u64_t		enqueues;
-	counter_u64_t		drops;
-	counter_u64_t		starts;
-	counter_u64_t		stalls;
-	counter_u64_t		restarts;	/* recovered after stalling */
+	struct mtx *		cons_lock;
+	counter_u64_t		dropped;
+	counter_u64_t		consumer[4];
+	counter_u64_t		not_consumer;
 	counter_u64_t		abdications;
+	counter_u64_t		consumed;
+	counter_u64_t		cons_idle;
+	counter_u64_t		cons_idle2;
+	counter_u64_t		stalls;
 
 	void * volatile		items[] __aligned(CACHE_LINE_SIZE);
 };
 
 int mp_ring_alloc(struct mp_ring **, int, void *, ring_drain_t,
-    ring_can_drain_t, struct malloc_type *, int);
+    ring_can_drain_t, struct malloc_type *, struct mtx *, int);
 void mp_ring_free(struct mp_ring *);
 int mp_ring_enqueue(struct mp_ring *, void **, int, int);
 void mp_ring_check_drainage(struct mp_ring *, int);
 void mp_ring_reset_stats(struct mp_ring *);
-int mp_ring_is_idle(struct mp_ring *);
+bool mp_ring_is_idle(struct mp_ring *);
+void mp_ring_sysctls(struct mp_ring *, struct sysctl_ctx_list *,
+    struct sysctl_oid_list *);
 
 #endif

Modified: stable/12/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- stable/12/sys/dev/cxgbe/t4_sge.c	Wed Aug 26 22:52:07 2020	(r364847)
+++ stable/12/sys/dev/cxgbe/t4_sge.c	Wed Aug 26 23:21:26 2020	(r364848)
@@ -198,19 +198,6 @@ static int lro_mbufs = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0,
     "Enable presorting of LRO frames");
 
-struct txpkts {
-	u_int wr_type;		/* type 0 or type 1 */
-	u_int npkt;		/* # of packets in this work request */
-	u_int plen;		/* total payload (sum of all packets) */
-	u_int len16;		/* # of 16B pieces used by this work request */
-};
-
-/* A packet's SGL.  This + m_pkthdr has all info needed for tx */
-struct sgl {
-	struct sglist sg;
-	struct sglist_seg seg[TX_SGL_SEGS];
-};
-
 static int service_iq(struct sge_iq *, int);
 static int service_iq_fl(struct sge_iq *, int);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
@@ -279,14 +266,16 @@ static inline u_int txpkt_vm_len16(u_int, u_int);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
-static u_int write_txpkt_wr(struct adapter *, struct sge_txq *,
-    struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int);
+static u_int write_txpkt_wr(struct adapter *, struct sge_txq *, struct mbuf *,
+    u_int);
 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
-    struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
-static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
-static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
-static u_int write_txpkts_wr(struct adapter *, struct sge_txq *,
-    struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int);
+    struct mbuf *);
+static int add_to_txpkts_vf(struct adapter *, struct sge_txq *, struct mbuf *,
+    int, bool *);
+static int add_to_txpkts_pf(struct adapter *, struct sge_txq *, struct mbuf *,
+    int, bool *);
+static u_int write_txpkts_wr(struct adapter *, struct sge_txq *);
+static u_int write_txpkts_vm_wr(struct adapter *, struct sge_txq *);
 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
@@ -2653,7 +2642,7 @@ can_resume_eth_tx(struct mp_ring *r)
 	return (total_available_tx_desc(eq) > eq->sidx / 8);
 }
 
-static inline int
+static inline bool
 cannot_use_txpkts(struct mbuf *m)
 {
 	/* maybe put a GL limit too, to avoid silliness? */
@@ -2669,8 +2658,9 @@ discard_tx(struct sge_eq *eq)
 }
 
 static inline int
-wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
+wr_can_update_eq(void *p)
 {
+	struct fw_eth_tx_pkts_wr *wr = p;
 
 	switch (G_FW_WR_OP(be32toh(wr->op_pkd))) {
 	case FW_ULPTX_WR:
@@ -2678,149 +2668,226 @@ wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr)
 	case FW_ETH_TX_PKTS_WR:
 	case FW_ETH_TX_PKTS2_WR:
 	case FW_ETH_TX_PKT_VM_WR:
+	case FW_ETH_TX_PKTS_VM_WR:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
+static inline void
+set_txupdate_flags(struct sge_txq *txq, u_int avail,
+    struct fw_eth_tx_pkt_wr *wr)
+{
+	struct sge_eq *eq = &txq->eq;
+	struct txpkts *txp = &txq->txp;
+
+	if ((txp->npkt > 0 || avail < eq->sidx / 2) &&
+	    atomic_cmpset_int(&eq->equiq, 0, 1)) {
+		wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ);
+		eq->equeqidx = eq->pidx;
+	} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
+		wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
+		eq->equeqidx = eq->pidx;
+	}
+}
+
 /*
  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
  * be consumed.  Return the actual number consumed.  0 indicates a stall.
  */
 static u_int
-eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
+eth_tx(struct mp_ring *r, u_int cidx, u_int pidx, bool *coalescing)
 {
 	struct sge_txq *txq = r->cookie;
-	struct sge_eq *eq = &txq->eq;
 	struct ifnet *ifp = txq->ifp;
+	struct sge_eq *eq = &txq->eq;
+	struct txpkts *txp = &txq->txp;
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->adapter;
 	u_int total, remaining;		/* # of packets */
-	u_int available, dbdiff;	/* # of hardware descriptors */
-	u_int n, next_cidx;
-	struct mbuf *m0, *tail;
-	struct txpkts txp;
-	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
+	u_int n, avail, dbdiff;		/* # of hardware descriptors */
+	int i, rc;
+	struct mbuf *m0;
+	bool snd;
+	void *wr;	/* start of the last WR written to the ring */
 
-	remaining = IDXDIFF(pidx, cidx, r->size);
-	MPASS(remaining > 0);	/* Must not be called without work to do. */
-	total = 0;
+	TXQ_LOCK_ASSERT_OWNED(txq);
 
-	TXQ_LOCK(txq);
+	remaining = IDXDIFF(pidx, cidx, r->size);
 	if (__predict_false(discard_tx(eq))) {
+		for (i = 0; i < txp->npkt; i++)
+			m_freem(txp->mb[i]);
+		txp->npkt = 0;
 		while (cidx != pidx) {
 			m0 = r->items[cidx];
 			m_freem(m0);
 			if (++cidx == r->size)
 				cidx = 0;
 		}
-		reclaim_tx_descs(txq, 2048);
-		total = remaining;
-		goto done;
+		reclaim_tx_descs(txq, eq->sidx);
+		*coalescing = false;
+		return (remaining);	/* emptied */
 	}
 
 	/* How many hardware descriptors do we have readily available. */
-	if (eq->pidx == eq->cidx)
-		available = eq->sidx - 1;
-	else
-		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
-	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
+	if (eq->pidx == eq->cidx) {
+		avail = eq->sidx - 1;
+		if (txp->score++ >= 5)
+			txp->score = 5;	/* tx is completely idle, reset. */
+	} else
+		avail = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
-	while (remaining > 0) {
+	total = 0;
+	if (remaining == 0) {
+		if (txp->score-- == 1)	/* egr_update had to drain txpkts */
+			txp->score = 1;
+		goto send_txpkts;
+	}
 
+	dbdiff = 0;
+	MPASS(remaining > 0);
+	while (remaining > 0) {
 		m0 = r->items[cidx];
 		M_ASSERTPKTHDR(m0);
 		MPASS(m0->m_nextpkt == NULL);
 
-		if (available < tx_len16_to_desc(mbuf_len16(m0))) {
-			available += reclaim_tx_descs(txq, 64);
-			if (available < tx_len16_to_desc(mbuf_len16(m0)))
-				break;	/* out of descriptors */
-		}
+		if (avail < 2 * SGE_MAX_WR_NDESC)
+			avail += reclaim_tx_descs(txq, 64);
 
-		next_cidx = cidx + 1;
-		if (__predict_false(next_cidx == r->size))
-			next_cidx = 0;
-
-		wr = (void *)&eq->desc[eq->pidx];
-		if (sc->flags & IS_VF) {
-			total++;
-			remaining--;
-			ETHER_BPF_MTAP(ifp, m0);
-			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
-			    available);
-		} else if (remaining > 1 &&
-		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
-
-			/* pkts at cidx, next_cidx should both be in txp. */
-			MPASS(txp.npkt == 2);
-			tail = r->items[next_cidx];
-			MPASS(tail->m_nextpkt == NULL);
-			ETHER_BPF_MTAP(ifp, m0);
-			ETHER_BPF_MTAP(ifp, tail);
-			m0->m_nextpkt = tail;
-
-			if (__predict_false(++next_cidx == r->size))
-				next_cidx = 0;
-
-			while (next_cidx != pidx) {
-				if (add_to_txpkts(r->items[next_cidx], &txp,
-				    available) != 0)
-					break;
-				tail->m_nextpkt = r->items[next_cidx];
-				tail = tail->m_nextpkt;
-				ETHER_BPF_MTAP(ifp, tail);
-				if (__predict_false(++next_cidx == r->size))
-					next_cidx = 0;
+		if (txp->npkt > 0 || remaining > 1 || txp->score > 3 ||
+		    atomic_load_int(&txq->eq.equiq) != 0) {
+			if (sc->flags & IS_VF)
+				rc = add_to_txpkts_vf(sc, txq, m0, avail, &snd);
+			else
+				rc = add_to_txpkts_pf(sc, txq, m0, avail, &snd);
+		} else {
+			snd = false;
+			rc = EINVAL;
+		}
+		if (snd) {
+			MPASS(txp->npkt > 0);
+			for (i = 0; i < txp->npkt; i++)
+				ETHER_BPF_MTAP(ifp, txp->mb[i]);
+			if (txp->npkt > 1) {

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

