svn commit: r334143 - head/sys/dev/cxgbe

Navdeep Parhar <np@FreeBSD.org>
Thu May 24 10:18:16 UTC 2018


Author: np
Date: Thu May 24 10:18:14 2018
New Revision: 334143
URL: https://svnweb.freebsd.org/changeset/base/334143

Log:
  cxgbe(4): Data path for rate-limited tx.
  
  This is hardware support for the SO_MAX_PACING_RATE sockopt (see
  setsockopt(2)), which is available in kernels built with "options
  RATELIMIT".
  
  Relnotes:	Yes
  Sponsored by:	Chelsio Communications
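
A minimal userspace sketch of how an application requests pacing, for
illustration only: it assumes a kernel built with "options RATELIMIT" and an
interface, such as cxgbe(4) after this change, that implements the rate-limit
send-tag methods.  The helper name set_pacing_rate is illustrative; the rate
is a bytes-per-second cap, matching the driver's max_rate, and setsockopt(2)
has the authoritative semantics.

#include <sys/types.h>
#include <sys/socket.h>
#include <err.h>
#include <stdint.h>

static int
set_pacing_rate(int s, uint32_t bytes_per_sec)
{

	if (setsockopt(s, SOL_SOCKET, SO_MAX_PACING_RATE,
	    &bytes_per_sec, sizeof(bytes_per_sec)) == -1) {
		warn("setsockopt(SO_MAX_PACING_RATE)");
		return (-1);
	}
	return (0);
}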

Modified:
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/offload.h
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/t4_sched.c
  head/sys/dev/cxgbe/t4_sge.c

Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h	Thu May 24 10:17:49 2018	(r334142)
+++ head/sys/dev/cxgbe/adapter.h	Thu May 24 10:18:14 2018	(r334143)
@@ -1217,6 +1217,10 @@ void t4_register_an_handler(an_handler_t);
 void t4_register_fw_msg_handler(int, fw_msg_handler_t);
 void t4_register_cpl_handler(int, cpl_handler_t);
 void t4_register_shared_cpl_handler(int, cpl_handler_t, int);
+#ifdef RATELIMIT
+int ethofld_transmit(struct ifnet *, struct mbuf *);
+void send_etid_flush_wr(struct cxgbe_snd_tag *);
+#endif
 
 /* t4_tracer.c */
 struct t4_tracer;
@@ -1239,11 +1243,13 @@ void t4_release_cl_rl_kbps(struct adapter *, int, int)
 #ifdef RATELIMIT
 void t4_init_etid_table(struct adapter *);
 void t4_free_etid_table(struct adapter *);
+struct cxgbe_snd_tag *lookup_etid(struct adapter *, int);
 int cxgbe_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
 int cxgbe_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *);
 int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
 void cxgbe_snd_tag_free(struct m_snd_tag *);
+void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
 #endif
 
 /* t4_filter.c */

Modified: head/sys/dev/cxgbe/offload.h
==============================================================================
--- head/sys/dev/cxgbe/offload.h	Thu May 24 10:17:49 2018	(r334142)
+++ head/sys/dev/cxgbe/offload.h	Thu May 24 10:18:14 2018	(r334143)
@@ -79,6 +79,14 @@ union aopen_entry {
 	union aopen_entry *next;
 };
 
+/* cxgbe_snd_tag flags */
+enum {
+	EO_FLOWC_PENDING	= (1 << 0),	/* flowc needs to be sent */
+	EO_FLOWC_RPL_PENDING	= (1 << 1),	/* flowc credits due back */
+	EO_SND_TAG_REF		= (1 << 2),	/* kernel has a ref on us */
+	EO_FLUSH_RPL_PENDING	= (1 << 3),	/* credit flush rpl due back */
+};
+
 struct cxgbe_snd_tag {
 	struct m_snd_tag com;
 	struct adapter *adapter;
@@ -86,13 +94,13 @@ struct cxgbe_snd_tag {
 	struct mtx lock;
 	int port_id;
 	int etid;
+	struct mbufq pending_tx, pending_fwack;
+	int plen;
 	struct sge_wrq *eo_txq;
+	uint32_t ctrl0;
 	uint16_t iqid;
 	int8_t schedcl;
 	uint64_t max_rate;      /* in bytes/s */
-	int8_t next_credits;	/* need these many tx credits next */
-	uint8_t next_nsegs;	/* next WR will have these many GL segs total */
-	uint8_t next_msegs;	/* max segs for a single mbuf in next chain */
 	uint8_t tx_total;	/* total tx WR credits (in 16B units) */
 	uint8_t tx_credits;	/* tx WR credits (in 16B units) available */
 	uint8_t tx_nocompl;	/* tx WR credits since last compl request */

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c	Thu May 24 10:17:49 2018	(r334142)
+++ head/sys/dev/cxgbe/t4_main.c	Thu May 24 10:18:14 2018	(r334143)
@@ -1891,6 +1891,17 @@ cxgbe_transmit(struct ifnet *ifp, struct mbuf *m)
 		atomic_add_int(&pi->tx_parse_error, 1);	/* rare, atomic is ok */
 		return (rc);
 	}
+#ifdef RATELIMIT
+	if (m->m_pkthdr.snd_tag != NULL) {
+		/* EAGAIN tells the stack we are not the correct interface. */
+		if (__predict_false(ifp != m->m_pkthdr.snd_tag->ifp)) {
+			m_freem(m);
+			return (EAGAIN);
+		}
+
+		return (ethofld_transmit(ifp, m));
+	}
+#endif
 
 	/* Select a txq. */
 	txq = &sc->sge.txq[vi->first_txq];

Modified: head/sys/dev/cxgbe/t4_sched.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sched.c	Thu May 24 10:17:49 2018	(r334142)
+++ head/sys/dev/cxgbe/t4_sched.c	Thu May 24 10:18:14 2018	(r334143)
@@ -529,7 +529,6 @@ alloc_etid(struct adapter *sc, struct cxgbe_snd_tag *c
 	return (etid);
 }
 
-#ifdef notyet
 struct cxgbe_snd_tag *
 lookup_etid(struct adapter *sc, int etid)
 {
@@ -537,7 +536,6 @@ lookup_etid(struct adapter *sc, int etid)
 
 	return (t->etid_tab[etid - t->etid_base].cst);
 }
-#endif
 
 static void
 free_etid(struct adapter *sc, int etid)
@@ -585,14 +583,21 @@ failed:
 	}
 
 	mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
+	mbufq_init(&cst->pending_tx, INT_MAX);
+	mbufq_init(&cst->pending_fwack, INT_MAX);
 	cst->com.ifp = ifp;
+	cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
 	cst->adapter = sc;
 	cst->port_id = pi->port_id;
 	cst->schedcl = schedcl;
 	cst->max_rate = params->rate_limit.max_rate;
-	cst->next_credits = -1;
 	cst->tx_credits = sc->params.ofldq_wr_cred;
 	cst->tx_total = cst->tx_credits;
+	cst->plen = 0;
+	cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
+	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
+	    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
+	    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
 
 	/*
 	 * Queues will be selected later when the connection flowid is available.
@@ -616,6 +621,8 @@ cxgbe_snd_tag_modify(struct m_snd_tag *mst,
 	/* XXX: is schedcl -1 ok here? */
 	MPASS(cst->schedcl >= 0 && cst->schedcl < sc->chip_params->nsched_cls);
 
+	mtx_lock(&cst->lock);
+	MPASS(cst->flags & EO_SND_TAG_REF);
 	rc = t4_reserve_cl_rl_kbps(sc, cst->port_id,
 	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
 	if (rc != 0)
@@ -624,6 +631,7 @@ cxgbe_snd_tag_modify(struct m_snd_tag *mst,
 	t4_release_cl_rl_kbps(sc, cst->port_id, cst->schedcl);
 	cst->schedcl = schedcl;
 	cst->max_rate = params->rate_limit.max_rate;
+	mtx_unlock(&cst->lock);
 
 	return (0);
 }
@@ -643,18 +651,53 @@ cxgbe_snd_tag_query(struct m_snd_tag *mst,
 	return (0);
 }
 
+/*
+ * Unlocks cst and frees it.
+ */
 void
-cxgbe_snd_tag_free(struct m_snd_tag *mst)
+cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *cst)
 {
-	struct cxgbe_snd_tag *cst = mst_to_cst(mst);
 	struct adapter *sc = cst->adapter;
 
+	mtx_assert(&cst->lock, MA_OWNED);
+	MPASS((cst->flags & EO_SND_TAG_REF) == 0);
+	MPASS(cst->tx_credits == cst->tx_total);
+	MPASS(cst->plen == 0);
+	MPASS(mbufq_first(&cst->pending_tx) == NULL);
+	MPASS(mbufq_first(&cst->pending_fwack) == NULL);
+
 	if (cst->etid >= 0)
 		free_etid(sc, cst->etid);
 	if (cst->schedcl != -1)
 		t4_release_cl_rl_kbps(sc, cst->port_id, cst->schedcl);
-	if (mtx_initialized(&cst->lock))
-		mtx_destroy(&cst->lock);
+	mtx_unlock(&cst->lock);
+	mtx_destroy(&cst->lock);
 	free(cst, M_CXGBE);
+}
+
+void
+cxgbe_snd_tag_free(struct m_snd_tag *mst)
+{
+	struct cxgbe_snd_tag *cst = mst_to_cst(mst);
+
+	mtx_lock(&cst->lock);
+
+	/* The kernel is done with the snd_tag.  Remove its reference. */
+	MPASS(cst->flags & EO_SND_TAG_REF);
+	cst->flags &= ~EO_SND_TAG_REF;
+
+	if (cst->ncompl == 0) {
+		/*
+		 * No fw4_ack in flight.  Free the tag right away if there are
+		 * no outstanding credits.  Request the firmware to return all
+		 * credits for the etid otherwise.
+		 */
+		if (cst->tx_credits == cst->tx_total) {
+			cxgbe_snd_tag_free_locked(cst);
+			return;	/* cst is gone. */
+		}
+		send_etid_flush_wr(cst);
+	}
+	mtx_unlock(&cst->lock);
 }
 #endif

Modified: head/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sge.c	Thu May 24 10:17:49 2018	(r334142)
+++ head/sys/dev/cxgbe/t4_sge.c	Thu May 24 10:18:14 2018	(r334143)
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
+#include <netinet/udp.h>
 #include <machine/in_cksum.h>
 #include <machine/md_var.h>
 #include <vm/vm.h>
@@ -153,7 +154,24 @@ TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx
 static int safest_rx_cluster = PAGE_SIZE;
 TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
 
+#ifdef RATELIMIT
 /*
+ * Knob to control TCP timestamp rewriting, and the granularity of the tick used
+ * for rewriting.  -1 and 0-3 are all valid values.
+ * -1: hardware should leave the TCP timestamps alone.
+ * 0: 1ms
+ * 1: 100us
+ * 2: 10us
+ * 3: 1us
+ */
+static int tsclk = -1;
+TUNABLE_INT("hw.cxgbe.tsclk", &tsclk);
+
+static int eo_max_backlog = 1024 * 1024;
+TUNABLE_INT("hw.cxgbe.eo_max_backlog", &eo_max_backlog);
+#endif
+
+/*
  * The interrupt holdoff timers are multiplied by this value on T6+.
  * 1 and 3-17 (both inclusive) are legal values.
  */
@@ -279,6 +297,11 @@ static void drain_wrq_wr_list(struct adapter *, struct
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 static int sysctl_tc(SYSCTL_HANDLER_ARGS);
+#ifdef RATELIMIT
+static inline u_int txpkt_eo_len16(u_int, u_int, u_int);
+static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *,
+    struct mbuf *);
+#endif
 
 static counter_u64_t extfree_refs;
 static counter_u64_t extfree_rels;
@@ -515,6 +538,10 @@ t4_sge_modload(void)
 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 	t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx);
+#ifdef RATELIMIT
+	t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack,
+	    CPL_COOKIE_ETHOFLD);
+#endif
 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
 }
@@ -2078,7 +2105,68 @@ set_mbuf_len16(struct mbuf *m, uint8_t len16)
 	m->m_pkthdr.PH_loc.eight[0] = len16;
 }
 
+#ifdef RATELIMIT
 static inline int
+mbuf_eo_nsegs(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	return (m->m_pkthdr.PH_loc.eight[1]);
+}
+
+static inline void
+set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.PH_loc.eight[1] = nsegs;
+}
+
+static inline int
+mbuf_eo_len16(struct mbuf *m)
+{
+	int n;
+
+	M_ASSERTPKTHDR(m);
+	n = m->m_pkthdr.PH_loc.eight[2];
+	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
+
+	return (n);
+}
+
+static inline void
+set_mbuf_eo_len16(struct mbuf *m, uint8_t len16)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.PH_loc.eight[2] = len16;
+}
+
+static inline int
+mbuf_eo_tsclk_tsoff(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	return (m->m_pkthdr.PH_loc.eight[3]);
+}
+
+static inline void
+set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff;
+}
+
+static inline int
+needs_eo(struct mbuf *m)
+{
+
+	return (m->m_pkthdr.snd_tag != NULL);
+}
+#endif
+
+static inline int
 needs_tso(struct mbuf *m)
 {
 
@@ -2107,6 +2195,22 @@ needs_l4_csum(struct mbuf *m)
 }
 
 static inline int
+needs_tcp_csum(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
+}
+
+static inline int
+needs_udp_csum(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
+}
+
+static inline int
 needs_vlan_insertion(struct mbuf *m)
 {
 
@@ -2142,16 +2246,19 @@ m_advance(struct mbuf **pm, int *poffset, int len)
 
 /*
  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
- * must have at least one mbuf that's not empty.
+ * must have at least one mbuf that's not empty.  It is possible for this
+ * routine to return 0 if skip accounts for all the contents of the mbuf chain.
  */
 static inline int
-count_mbuf_nsegs(struct mbuf *m)
+count_mbuf_nsegs(struct mbuf *m, int skip)
 {
 	vm_paddr_t lastb, next;
 	vm_offset_t va;
 	int len, nsegs;
 
-	MPASS(m != NULL);
+	M_ASSERTPKTHDR(m);
+	MPASS(m->m_pkthdr.len > 0);
+	MPASS(m->m_pkthdr.len >= skip);
 
 	nsegs = 0;
 	lastb = 0;
@@ -2160,15 +2267,20 @@ count_mbuf_nsegs(struct mbuf *m)
 		len = m->m_len;
 		if (__predict_false(len == 0))
 			continue;
-		va = mtod(m, vm_offset_t);
+		if (skip >= len) {
+			skip -= len;
+			continue;
+		}
+		va = mtod(m, vm_offset_t) + skip;
+		len -= skip;
+		skip = 0;
 		next = pmap_kextract(va);
-		nsegs += sglist_count(m->m_data, len);
+		nsegs += sglist_count((void *)(uintptr_t)va, len);
 		if (lastb + 1 == next)
 			nsegs--;
 		lastb = pmap_kextract(va + len - 1);
 	}
 
-	MPASS(nsegs > 0);
 	return (nsegs);
 }
 
@@ -2204,7 +2316,7 @@ restart:
 	 */
 	M_ASSERTPKTHDR(m0);
 	MPASS(m0->m_pkthdr.len > 0);
-	nsegs = count_mbuf_nsegs(m0);
+	nsegs = count_mbuf_nsegs(m0, 0);
 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = EFBIG;
@@ -2230,7 +2342,20 @@ restart:
 	else
 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
 
+#ifdef RATELIMIT
+	/*
+	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
+	 * checksumming is enabled.  needs_l4_csum happens to check for all the
+	 * right things.
+	 */
+	if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0)))
+		m0->m_pkthdr.snd_tag = NULL;
+#endif
+
 	if (!needs_tso(m0) &&
+#ifdef RATELIMIT
+	    !needs_eo(m0) &&
+#endif
 	    !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0))))
 		return (0);
 
@@ -2276,11 +2401,34 @@ restart:
 	}
 
 #if defined(INET) || defined(INET6)
-	if (needs_tso(m0)) {
+	if (needs_tcp_csum(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
+#ifdef RATELIMIT
+		if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) {
+			set_mbuf_eo_tsclk_tsoff(m0,
+			    V_FW_ETH_TX_EO_WR_TSCLK(tsclk) |
+			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
+		} else
+			set_mbuf_eo_tsclk_tsoff(m0, 0);
+	} else if (needs_udp_csum(m)) {
+		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
+#endif
 	}
+#ifdef RATELIMIT
+	if (needs_eo(m0)) {
+		u_int immhdrs;
+
+		/* EO WRs have the headers in the WR and not the GL. */
+		immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
+		    m0->m_pkthdr.l4hlen;
+		nsegs = count_mbuf_nsegs(m0, immhdrs);
+		set_mbuf_eo_nsegs(m0, nsegs);
+		set_mbuf_eo_len16(m0,
+		    txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
+	}
 #endif
+#endif
 	MPASS(m0 == *mp);
 	return (0);
 }
@@ -5302,3 +5450,416 @@ done:
 	mtx_unlock(&sc->tc_lock);
 	return (rc);
 }
+
+#ifdef RATELIMIT
+/*
+ * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
+ */
+static inline u_int
+txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso)
+{
+	u_int n;
+
+	MPASS(immhdrs > 0);
+
+	n = roundup2(sizeof(struct fw_eth_tx_eo_wr) +
+	    sizeof(struct cpl_tx_pkt_core) + immhdrs, 16);
+	if (__predict_false(nsegs == 0))
+		goto done;
+
+	nsegs--; /* first segment is part of ulptx_sgl */
+	n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
+	if (tso)
+		n += sizeof(struct cpl_tx_pkt_lso_core);
+
+done:
+	return (howmany(n, 16));
+}
+
+#define ETID_FLOWC_NPARAMS 6
+#define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \
+    ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16))
+#define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16))
+
+static int
+send_etid_flowc_wr(struct cxgbe_snd_tag *cst, struct port_info *pi,
+    struct vi_info *vi)
+{
+	struct wrq_cookie cookie;
+	u_int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;
+	struct fw_flowc_wr *flowc;
+
+	mtx_assert(&cst->lock, MA_OWNED);
+	MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) ==
+	    EO_FLOWC_PENDING);
+
+	flowc = start_wrq_wr(cst->eo_txq, ETID_FLOWC_LEN16, &cookie);
+	if (__predict_false(flowc == NULL))
+		return (ENOMEM);
+
+	bzero(flowc, ETID_FLOWC_LEN);
+	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
+	    V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0));
+	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) |
+	    V_FW_WR_FLOWID(cst->etid));
+	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+	flowc->mnemval[0].val = htobe32(pfvf);
+	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+	flowc->mnemval[1].val = htobe32(pi->tx_chan);
+	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+	flowc->mnemval[2].val = htobe32(pi->tx_chan);
+	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+	flowc->mnemval[3].val = htobe32(cst->iqid);
+	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE;
+	flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED);
+	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
+	flowc->mnemval[5].val = htobe32(cst->schedcl);
+
+	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
+
+	cst->flags &= ~EO_FLOWC_PENDING;
+	cst->flags |= EO_FLOWC_RPL_PENDING;
+	MPASS(cst->tx_credits >= ETID_FLOWC_LEN16);	/* flowc is first WR. */
+	cst->tx_credits -= ETID_FLOWC_LEN16;
+
+	return (0);
+}
+
+#define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16))
+
+void
+send_etid_flush_wr(struct cxgbe_snd_tag *cst)
+{
+	struct fw_flowc_wr *flowc;
+	struct wrq_cookie cookie;
+
+	mtx_assert(&cst->lock, MA_OWNED);
+
+	flowc = start_wrq_wr(cst->eo_txq, ETID_FLUSH_LEN16, &cookie);
+	if (__predict_false(flowc == NULL))
+		CXGBE_UNIMPLEMENTED(__func__);
+
+	bzero(flowc, ETID_FLUSH_LEN16 * 16);
+	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
+	    V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL);
+	flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) |
+	    V_FW_WR_FLOWID(cst->etid));
+
+	commit_wrq_wr(cst->eo_txq, flowc, &cookie);
+
+	cst->flags |= EO_FLUSH_RPL_PENDING;
+	MPASS(cst->tx_credits >= ETID_FLUSH_LEN16);
+	cst->tx_credits -= ETID_FLUSH_LEN16;
+	cst->ncompl++;
+}
+
+static void
+write_ethofld_wr(struct cxgbe_snd_tag *cst, struct fw_eth_tx_eo_wr *wr,
+    struct mbuf *m0, int compl)
+{
+	struct cpl_tx_pkt_core *cpl;
+	uint64_t ctrl1;
+	uint32_t ctrl;	/* used in many unrelated places */
+	int len16, pktlen, nsegs, immhdrs;
+	caddr_t dst;
+	uintptr_t p;
+	struct ulptx_sgl *usgl;
+	struct sglist sg;
+	struct sglist_seg segs[38];	/* XXX: find real limit.  XXX: get off the stack */
+
+	mtx_assert(&cst->lock, MA_OWNED);
+	M_ASSERTPKTHDR(m0);
+	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
+	    m0->m_pkthdr.l4hlen > 0,
+	    ("%s: ethofld mbuf %p is missing header lengths", __func__, m0));
+
+	if (needs_udp_csum(m0)) {
+		CXGBE_UNIMPLEMENTED("UDP ethofld");
+	}
+
+	len16 = mbuf_eo_len16(m0);
+	nsegs = mbuf_eo_nsegs(m0);
+	pktlen = m0->m_pkthdr.len;
+	ctrl = sizeof(struct cpl_tx_pkt_core);
+	if (needs_tso(m0))
+		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
+	immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen;
+	ctrl += immhdrs;
+
+	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) |
+	    V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl));
+	wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
+	    V_FW_WR_FLOWID(cst->etid));
+	wr->r3 = 0;
+	wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
+	wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
+	wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
+	wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen;
+	wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0);
+	wr->u.tcpseg.r4 = 0;
+	wr->u.tcpseg.r5 = 0;
+	wr->u.tcpseg.plen = htobe32(pktlen - immhdrs);
+
+	if (needs_tso(m0)) {
+		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
+
+		wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz);
+
+		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
+		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
+		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
+		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
+			ctrl |= V_LSO_ETHHDR_LEN(1);
+		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
+			ctrl |= F_LSO_IPV6;
+		lso->lso_ctrl = htobe32(ctrl);
+		lso->ipid_ofst = htobe16(0);
+		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
+		lso->seqno_offset = htobe32(0);
+		lso->len = htobe32(pktlen);
+
+		cpl = (void *)(lso + 1);
+	} else {
+		wr->u.tcpseg.mss = htobe16(0xffff);
+		cpl = (void *)(wr + 1);
+	}
+
+	/* Checksum offload must be requested for ethofld. */
+	ctrl1 = 0;
+	MPASS(needs_l4_csum(m0));
+
+	/* VLAN tag insertion */
+	if (needs_vlan_insertion(m0)) {
+		ctrl1 |= F_TXPKT_VLAN_VLD |
+		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
+	}
+
+	/* CPL header */
+	cpl->ctrl0 = cst->ctrl0;
+	cpl->pack = 0;
+	cpl->len = htobe16(pktlen);
+	cpl->ctrl1 = htobe64(ctrl1);
+
+	/* Copy Ethernet, IP & TCP hdrs as immediate data */
+	p = (uintptr_t)(cpl + 1);
+	m_copydata(m0, 0, immhdrs, (void *)p);
+
+	/* SGL */
+	dst = (void *)(cpl + 1);
+	if (nsegs > 0) {
+		int i, pad;
+
+		/* zero-pad up to the next 16-byte boundary, if not 16-byte aligned */
+		p += immhdrs;
+		pad = 16 - (immhdrs & 0xf);
+		bzero((void *)p, pad);
+
+		usgl = (void *)(p + pad);
+		usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
+		    V_ULPTX_NSGE(nsegs));
+
+		sglist_init(&sg, nitems(segs), segs);
+		for (; m0 != NULL; m0 = m0->m_next) {
+			if (__predict_false(m0->m_len == 0))
+				continue;
+			if (immhdrs >= m0->m_len) {
+				immhdrs -= m0->m_len;
+				continue;
+			}
+
+			sglist_append(&sg, mtod(m0, char *) + immhdrs,
+			    m0->m_len - immhdrs);
+			immhdrs = 0;
+		}
+		MPASS(sg.sg_nseg == nsegs);
+
+		/*
+		 * Zero pad last 8B in case the WR doesn't end on a 16B
+		 * boundary.
+		 */
+		*(uint64_t *)((char *)wr + len16 * 16 - 8) = 0;
+
+		usgl->len0 = htobe32(segs[0].ss_len);
+		usgl->addr0 = htobe64(segs[0].ss_paddr);
+		for (i = 0; i < nsegs - 1; i++) {
+			usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len);
+			usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr);
+		}
+		if (i & 1)
+			usgl->sge[i / 2].len[1] = htobe32(0);
+	}
+
+}
+
+static void
+ethofld_tx(struct cxgbe_snd_tag *cst)
+{
+	struct mbuf *m;
+	struct wrq_cookie cookie;
+	int next_credits, compl;
+	struct fw_eth_tx_eo_wr *wr;
+
+	mtx_assert(&cst->lock, MA_OWNED);
+
+	while ((m = mbufq_first(&cst->pending_tx)) != NULL) {
+		M_ASSERTPKTHDR(m);
+
+		/* How many len16 credits do we need to send this mbuf. */
+		next_credits = mbuf_eo_len16(m);
+		MPASS(next_credits > 0);
+		if (next_credits > cst->tx_credits) {
+			/*
+			 * Tx will make progress eventually because there is at
+			 * least one outstanding fw4_ack that will return
+			 * credits and kick the tx.
+			 */
+			MPASS(cst->ncompl > 0);
+			return;
+		}
+		wr = start_wrq_wr(cst->eo_txq, next_credits, &cookie);
+		if (__predict_false(wr == NULL)) {
+			/* XXX: wishful thinking, not a real assertion. */
+			MPASS(cst->ncompl > 0);
+			return;
+		}
+		cst->tx_credits -= next_credits;
+		cst->tx_nocompl += next_credits;
+		compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2;
+		ETHER_BPF_MTAP(cst->com.ifp, m);
+		write_ethofld_wr(cst, wr, m, compl);
+		commit_wrq_wr(cst->eo_txq, wr, &cookie);
+		if (compl) {
+			cst->ncompl++;
+			cst->tx_nocompl	= 0;
+		}
+		(void) mbufq_dequeue(&cst->pending_tx);
+		mbufq_enqueue(&cst->pending_fwack, m);
+	}
+}
+
+int
+ethofld_transmit(struct ifnet *ifp, struct mbuf *m0)
+{
+	struct cxgbe_snd_tag *cst;
+	int rc;
+
+	MPASS(m0->m_nextpkt == NULL);
+	MPASS(m0->m_pkthdr.snd_tag != NULL);
+	cst = mst_to_cst(m0->m_pkthdr.snd_tag);
+
+	mtx_lock(&cst->lock);
+	MPASS(cst->flags & EO_SND_TAG_REF);
+
+	if (__predict_false(cst->flags & EO_FLOWC_PENDING)) {
+		struct vi_info *vi = ifp->if_softc;
+		struct port_info *pi = vi->pi;
+		struct adapter *sc = pi->adapter;
+		const uint32_t rss_mask = vi->rss_size - 1;
+		uint32_t rss_hash;
+
+		cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq];
+		if (M_HASHTYPE_ISHASH(m0))
+			rss_hash = m0->m_pkthdr.flowid;
+		else
+			rss_hash = arc4random();
+		/* We assume RSS hashing */
+		cst->iqid = vi->rss[rss_hash & rss_mask];
+		cst->eo_txq += rss_hash % vi->nofldtxq;
+		rc = send_etid_flowc_wr(cst, pi, vi);
+		if (rc != 0)
+			goto done;
+	}
+
+	if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) {
+		rc = ENOBUFS;
+		goto done;
+	}
+
+	mbufq_enqueue(&cst->pending_tx, m0);
+	cst->plen += m0->m_pkthdr.len;
+
+	ethofld_tx(cst);
+	rc = 0;
+done:
+	mtx_unlock(&cst->lock);
+	if (__predict_false(rc != 0))
+		m_freem(m0);
+	return (rc);
+}
+
+static int
+ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
+{
+	struct adapter *sc = iq->adapter;
+	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
+	struct mbuf *m;
+	u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
+	struct cxgbe_snd_tag *cst;
+	uint8_t credits = cpl->credits;
+
+	cst = lookup_etid(sc, etid);
+	mtx_lock(&cst->lock);
+	if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) {
+		MPASS(credits >= ETID_FLOWC_LEN16);
+		credits -= ETID_FLOWC_LEN16;
+		cst->flags &= ~EO_FLOWC_RPL_PENDING;
+	}
+
+	KASSERT(cst->ncompl > 0,
+	    ("%s: etid %u (%p) wasn't expecting completion.",
+	    __func__, etid, cst));
+	cst->ncompl--;
+
+	while (credits > 0) {
+		m = mbufq_dequeue(&cst->pending_fwack);
+		if (__predict_false(m == NULL)) {
+			/*
+			 * The remaining credits are for the final flush that
+			 * was issued when the tag was freed by the kernel.
+			 */
+			MPASS((cst->flags &
+			    (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) ==
+			    EO_FLUSH_RPL_PENDING);
+			MPASS(credits == ETID_FLUSH_LEN16);
+			MPASS(cst->tx_credits + cpl->credits == cst->tx_total);
+			MPASS(cst->ncompl == 0);
+
+			cst->flags &= ~EO_FLUSH_RPL_PENDING;
+			cst->tx_credits += cpl->credits;
+freetag:
+			cxgbe_snd_tag_free_locked(cst);
+			return (0);	/* cst is gone. */
+		}
+		KASSERT(m != NULL,
+		    ("%s: too many credits (%u, %u)", __func__, cpl->credits,
+		    credits));
+		KASSERT(credits >= mbuf_eo_len16(m),
+		    ("%s: too few credits (%u, %u, %u)", __func__,
+		    cpl->credits, credits, mbuf_eo_len16(m)));
+		credits -= mbuf_eo_len16(m);
+		cst->plen -= m->m_pkthdr.len;
+		m_freem(m);
+	}
+
+	cst->tx_credits += cpl->credits;
+	MPASS(cst->tx_credits <= cst->tx_total);
+
+	m = mbufq_first(&cst->pending_tx);
+	if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m))
+		ethofld_tx(cst);
+
+	if (__predict_false((cst->flags & EO_SND_TAG_REF) == 0) &&
+	    cst->ncompl == 0) {
+		if (cst->tx_credits == cst->tx_total)
+			goto freetag;
+		else {
+			MPASS((cst->flags & EO_FLUSH_RPL_PENDING) == 0);
+			send_etid_flush_wr(cst);
+		}
+	}
+
+	mtx_unlock(&cst->lock);
+
+	return (0);
+}
+#endif
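
For context, a hypothetical kernel-side sketch of how a RATELIMIT-aware stack
drives the data path above: it allocates a rate-limit send tag from the ifnet
(which resolves to cxgbe_snd_tag_alloc() for this driver) and stamps outbound
mbufs with it, so cxgbe_transmit() diverts them to ethofld_transmit().  The
function names pace_flow and stamp_pkt are illustrative, and the exact layout
of if_snd_tag_alloc_params should be taken from net/if_var.h.

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>

static int
pace_flow(struct ifnet *ifp, uint32_t flowid, uint64_t bytes_per_sec,
    struct m_snd_tag **pmst)
{
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		/* Same bytes/s unit that ends up in cst->max_rate. */
		.rate_limit.max_rate = bytes_per_sec,
	};

	/* For cxgbe this calls cxgbe_snd_tag_alloc(). */
	return (ifp->if_snd_tag_alloc(ifp, &params, pmst));
}

static void
stamp_pkt(struct mbuf *m, struct m_snd_tag *mst)
{

	/*
	 * cxgbe_transmit() checks m_pkthdr.snd_tag and hands such mbufs
	 * to ethofld_transmit().
	 */
	m->m_pkthdr.snd_tag = mst;
}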

