git: 132894ca4beb - stable/13 - cxgbei: Support for ISO (iSCSI segmentation offload).

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 29 Oct 2021 23:58:31 UTC
The branch stable/13 has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=132894ca4bebe54854c414e46d3d1312b47a8f46

commit 132894ca4bebe54854c414e46d3d1312b47a8f46
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2021-08-06 21:21:37 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2021-10-29 23:27:10 +0000

    cxgbei: Support for ISO (iSCSI segmentation offload).
    
    ISO can be disabled before establishing a connection by setting
    dev.tNnex.N.toe.iso to 0.
    
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D31223
    
    (cherry picked from commit 5b27e4b27caae840bd79ccc5cb7811a0c9acc656)
---
 sys/dev/cxgbe/adapter.h           |   1 +
 sys/dev/cxgbe/cxgbei/cxgbei.h     |   2 +
 sys/dev/cxgbe/cxgbei/icl_cxgbei.c |  62 +++++++++++--
 sys/dev/cxgbe/offload.h           |   1 +
 sys/dev/cxgbe/t4_main.c           |   5 ++
 sys/dev/cxgbe/t4_sge.c            |   5 ++
 sys/dev/cxgbe/tom/t4_cpl_io.c     | 184 ++++++++++++++++++++++++++++----------
 sys/dev/cxgbe/tom/t4_tom.h        |  52 +++++++++++
 8 files changed, 254 insertions(+), 58 deletions(-)

diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
index 7909e0b108b2..c5a10c563e87 100644
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -739,6 +739,7 @@ struct sge_ofld_txq {
 	struct sge_wrq wrq;
 	counter_u64_t tx_iscsi_pdus;
 	counter_u64_t tx_iscsi_octets;
+	counter_u64_t tx_iscsi_iso_wrs;
 	counter_u64_t tx_toe_tls_records;
 	counter_u64_t tx_toe_tls_octets;
 } __aligned(CACHE_LINE_SIZE);
diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.h b/sys/dev/cxgbe/cxgbei/cxgbei.h
index 45d3398d545c..433f15b743fe 100644
--- a/sys/dev/cxgbe/cxgbei/cxgbei.h
+++ b/sys/dev/cxgbe/cxgbei/cxgbei.h
@@ -134,6 +134,8 @@ struct cxgbei_data {
 	struct sysctl_ctx_list ctx;	/* from uld_activate to deactivate */
 };
 
+#define CXGBEI_MAX_ISO_PAYLOAD	65535
+
 /* cxgbei.c */
 u_int cxgbei_select_worker_thread(struct icl_cxgbei_conn *);
 
diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
index d5b13fb5c3ea..a57d26ae21b8 100644
--- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -335,13 +335,14 @@ finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
 	uint8_t ulp_submode, padding;
 	struct mbuf *m, *last;
 	struct iscsi_bhs *bhs;
+	int data_len;
 
 	/*
 	 * Fix up the data segment mbuf first.
 	 */
 	m = ip->ip_data_mbuf;
 	ulp_submode = icc->ulp_submode;
-	if (m) {
+	if (m != NULL) {
 		last = m_last(m);
 
 		/*
@@ -349,7 +350,8 @@ finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
 		 * necessary.  There will definitely be room in the mbuf.
 		 */
 		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
-		if (padding) {
+		if (padding != 0) {
+			MPASS(padding <= M_TRAILINGSPACE(last));
 			bzero(mtod(last, uint8_t *) + last->m_len, padding);
 			last->m_len += padding;
 		}
@@ -367,9 +369,41 @@ finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
 	MPASS(m->m_len == sizeof(struct iscsi_bhs));
 
 	bhs = ip->ip_bhs;
-	bhs->bhs_data_segment_len[2] = ip->ip_data_len;
-	bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8;
-	bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16;
+	data_len = ip->ip_data_len;
+	if (data_len > icc->ic.ic_max_send_data_segment_length) {
+		struct iscsi_bhs_data_in *bhsdi;
+		int flags;
+
+		KASSERT(padding == 0, ("%s: ISO with padding %d for icp %p",
+		    __func__, padding, icp));
+		switch (bhs->bhs_opcode) {
+		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
+			flags = 1;
+			break;
+		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
+			flags = 2;
+			break;
+		default:
+			panic("invalid opcode %#x for ISO", bhs->bhs_opcode);
+		}
+		data_len = icc->ic.ic_max_send_data_segment_length;
+		bhsdi = (struct iscsi_bhs_data_in *)bhs;
+		if (bhsdi->bhsdi_flags & BHSDI_FLAGS_F) {
+			/*
+			 * Firmware will set F on the final PDU in the
+			 * burst.
+			 */
+			flags |= CXGBE_ISO_F;
+			bhsdi->bhsdi_flags &= ~BHSDI_FLAGS_F;
+		}
+		set_mbuf_iscsi_iso(m, true);
+		set_mbuf_iscsi_iso_flags(m, flags);
+		set_mbuf_iscsi_iso_mss(m, data_len);
+	}
+
+	bhs->bhs_data_segment_len[2] = data_len;
+	bhs->bhs_data_segment_len[1] = data_len >> 8;
+	bhs->bhs_data_segment_len[0] = data_len >> 16;
 
 	/*
 	 * Extract mbuf chain from PDU.
@@ -477,7 +511,8 @@ icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
 		}
 		MPASS(len == 0);
 	}
-	MPASS(ip->ip_data_len <= ic->ic_max_send_data_segment_length);
+	MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length,
+	    ic->ic_hw_isomax));
 
 	return (0);
 }
@@ -748,7 +783,7 @@ icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	cap_rights_t rights;
-	int error;
+	int error, max_iso_pdus;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
@@ -815,12 +850,21 @@ icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
 			icc->ulp_submode |= ULP_CRC_HEADER;
 		if (ic->ic_data_crc32c)
 			icc->ulp_submode |= ULP_CRC_DATA;
+
+		if (icc->sc->tt.iso && chip_id(icc->sc) >= CHELSIO_T5) {
+			max_iso_pdus = CXGBEI_MAX_ISO_PAYLOAD /
+			    ci->max_tx_pdu_len;
+			ic->ic_hw_isomax = max_iso_pdus *
+			    ic->ic_max_send_data_segment_length;
+		} else
+			max_iso_pdus = 1;
+
 		so->so_options |= SO_NO_DDP;
 		toep->params.ulp_mode = ULP_MODE_ISCSI;
 		toep->ulpcb = icc;
 
-		send_iscsi_flowc_wr(icc->sc, toep, roundup(ci->max_tx_pdu_len,
-		    tp->t_maxseg));
+		send_iscsi_flowc_wr(icc->sc, toep,
+		    roundup(max_iso_pdus * ci->max_tx_pdu_len, tp->t_maxseg));
 		set_ulp_mode_iscsi(icc->sc, toep, icc->ulp_submode);
 		error = 0;
 	}
diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h
index e264882fb5b4..81ed08f2e412 100644
--- a/sys/dev/cxgbe/offload.h
+++ b/sys/dev/cxgbe/offload.h
@@ -232,6 +232,7 @@ struct tom_tunables {
 	int tx_zcopy;
 	int cop_managed_offloading;
 	int autorcvbuf_inc;
+	int iso;
 };
 
 /* iWARP driver tunables */
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
index bb8b9a4d78fb..71877a571982 100644
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -7585,6 +7585,10 @@ t4_sysctls(struct adapter *sc)
 		    CTLFLAG_RW, &sc->tt.autorcvbuf_inc, 0,
 		    "autorcvbuf increment");
 
+		sc->tt.iso = 1;
+		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "iso", CTLFLAG_RW,
+		    &sc->tt.iso, 0, "Enable iSCSI segmentation offload");
+
 		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick",
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
 		    sysctl_tp_tick, "A", "TP timer tick (us)");
@@ -11900,6 +11904,7 @@ clear_stats(struct adapter *sc, u_int port_id)
 				ofld_txq->wrq.tx_wrs_copied = 0;
 				counter_u64_zero(ofld_txq->tx_iscsi_pdus);
 				counter_u64_zero(ofld_txq->tx_iscsi_octets);
+				counter_u64_zero(ofld_txq->tx_iscsi_iso_wrs);
 				counter_u64_zero(ofld_txq->tx_toe_tls_records);
 				counter_u64_zero(ofld_txq->tx_toe_tls_octets);
 			}
diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c
index 3679bbf84f2f..6b40a23516d7 100644
--- a/sys/dev/cxgbe/t4_sge.c
+++ b/sys/dev/cxgbe/t4_sge.c
@@ -4786,6 +4786,7 @@ alloc_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq, int idx)
 
 		ofld_txq->tx_iscsi_pdus = counter_u64_alloc(M_WAITOK);
 		ofld_txq->tx_iscsi_octets = counter_u64_alloc(M_WAITOK);
+		ofld_txq->tx_iscsi_iso_wrs = counter_u64_alloc(M_WAITOK);
 		ofld_txq->tx_toe_tls_records = counter_u64_alloc(M_WAITOK);
 		ofld_txq->tx_toe_tls_octets = counter_u64_alloc(M_WAITOK);
 		add_ofld_txq_sysctls(&vi->ctx, oid, ofld_txq);
@@ -4823,6 +4824,7 @@ free_ofld_txq(struct vi_info *vi, struct sge_ofld_txq *ofld_txq)
 		MPASS(!(eq->flags & EQ_HW_ALLOCATED));
 		counter_u64_free(ofld_txq->tx_iscsi_pdus);
 		counter_u64_free(ofld_txq->tx_iscsi_octets);
+		counter_u64_free(ofld_txq->tx_iscsi_iso_wrs);
 		counter_u64_free(ofld_txq->tx_toe_tls_records);
 		counter_u64_free(ofld_txq->tx_toe_tls_octets);
 		free_wrq(sc, &ofld_txq->wrq);
@@ -4847,6 +4849,9 @@ add_ofld_txq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
 	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_octets",
 	    CTLFLAG_RD, &ofld_txq->tx_iscsi_octets,
 	    "# of payload octets in transmitted iSCSI PDUs");
+	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_iscsi_iso_wrs",
+	    CTLFLAG_RD, &ofld_txq->tx_iscsi_iso_wrs,
+	    "# of iSCSI segmentation offload work requests");
 	SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, "tx_toe_tls_records",
 	    CTLFLAG_RD, &ofld_txq->tx_toe_tls_records,
 	    "# of TOE TLS records transmitted");
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index 66e3955445f9..f61fe557635c 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -67,6 +67,8 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
+#include <dev/iscsi/iscsi_proto.h>
+
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
@@ -516,38 +518,44 @@ t4_close_conn(struct adapter *sc, struct toepcb *toep)
 
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
+#define MIN_ISO_TX_CREDITS  (howmany(sizeof(struct cpl_tx_data_iso), 16))
+#define MIN_TX_CREDITS(iso)						\
+	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
 
 /* Maximum amount of immediate data we could stuff in a WR */
 static inline int
-max_imm_payload(int tx_credits)
+max_imm_payload(int tx_credits, int iso)
 {
+	const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
 	const int n = 1;	/* Use no more than one desc for imm. data WR */
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
-	if (tx_credits < MIN_OFLD_TX_CREDITS)
+	if (tx_credits < MIN_TX_CREDITS(iso))
 		return (0);
 
 	if (tx_credits >= (n * EQ_ESIZE) / 16)
-		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr));
+		return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) -
+		    iso_cpl_size);
 	else
-		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr));
+		return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) -
+		    iso_cpl_size);
 }
 
 /* Maximum number of SGL entries we could stuff in a WR */
 static inline int
-max_dsgl_nsegs(int tx_credits)
+max_dsgl_nsegs(int tx_credits, int iso)
 {
 	int nseg = 1;	/* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
-	int sge_pair_credits = tx_credits - MIN_OFLD_TX_CREDITS;
+	int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso);
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
-	if (tx_credits < MIN_OFLD_TX_CREDITS)
+	if (tx_credits < MIN_TX_CREDITS(iso))
 		return (0);
 
 	nseg += 2 * (sge_pair_credits * 16 / 24);
@@ -558,12 +566,13 @@ max_dsgl_nsegs(int tx_credits)
 }
 
 static inline void
-write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
-    unsigned int plen, uint8_t credits, int shove, int ulp_submode)
+write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
+    unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
+    int ulp_submode)
 {
 	struct fw_ofld_tx_data_wr *txwr = dst;
 
-	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_OFLD_TX_DATA_WR) |
+	txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
 	    V_FW_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
@@ -707,8 +716,8 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 	txsd = &toep->txsd[toep->txsd_pidx];
 	do {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
-		max_imm = max_imm_payload(tx_credits);
-		max_nsegs = max_dsgl_nsegs(tx_credits);
+		max_imm = max_imm_payload(tx_credits, 0);
+		max_nsegs = max_dsgl_nsegs(tx_credits, 0);
 
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
@@ -832,7 +841,8 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
-			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0);
+			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen,
+			    credits, shove, 0);
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
@@ -851,7 +861,8 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
-			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0);
+			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen,
+			    credits, shove, 0);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			if (wr_len & 0xf) {
@@ -927,18 +938,71 @@ rqdrop_locked(struct mbufq *q, int plen)
 	}
 }
 
+/*
+ * Not a bit in the TCB, but is a bit in the ulp_submode field of the
+ * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR.
+ */
+#define	ULP_ISO		G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)
+
+static void
+write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss,
+    int len, int npdu)
+{
+	struct cpl_tx_data_iso *cpl;
+	unsigned int burst_size;
+	unsigned int last;
+
+	/*
+	 * The firmware will set the 'F' bit on the last PDU when
+	 * either condition is true:
+	 *
+	 * - this large PDU is marked as the "last" slice
+	 *
+	 * - the amount of data payload bytes equals the burst_size
+	 *
+	 * The strategy used here is to always set the burst_size
+	 * artificially high (len includes the size of the template
+	 * BHS) and only set the "last" flag if the original PDU had
+	 * 'F' set.
+	 */
+	burst_size = len;
+	last = !!(flags & CXGBE_ISO_F);
+
+	cpl = (struct cpl_tx_data_iso *)dst;
+	cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) |
+	    V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) |
+	    V_CPL_TX_DATA_ISO_CPLHDRLEN(0) |
+	    V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) |
+	    V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) |
+	    V_CPL_TX_DATA_ISO_IMMEDIATE(0) |
+	    V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags)));
+
+	cpl->ahs_len = 0;
+	cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
+	cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4));
+	cpl->len = htonl(len);
+	cpl->reserved2_seglen_offset = htonl(0);
+	cpl->datasn_offset = htonl(0);
+	cpl->buffer_offset = htonl(0);
+	cpl->reserved3 = 0;
+}
+
 static struct wrqe *
 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 {
 	struct mbuf *m;
 	struct fw_ofld_tx_data_wr *txwr;
+	struct cpl_tx_data_iso *cpl_iso;
+	void *p;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
-	u_int adjusted_plen, ulp_submode;
+	u_int adjusted_plen, imm_data, ulp_submode;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
-	int tx_credits, shove;
+	int tx_credits, shove, npdu, wr_len;
+	uint16_t iso_mss;
 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
+	bool iso;
 
 	M_ASSERTPKTHDR(sndptr);
 
@@ -958,8 +1022,10 @@ write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 		return (wr);
 	}
 
-	max_imm = max_imm_payload(tx_credits);
-	max_nsegs = max_dsgl_nsegs(tx_credits);
+	iso = mbuf_iscsi_iso(sndptr);
+	max_imm = max_imm_payload(tx_credits, iso);
+	max_nsegs = max_dsgl_nsegs(tx_credits, iso);
+	iso_mss = mbuf_iscsi_iso_mss(sndptr);
 
 	plen = 0;
 	nsegs = 0;
@@ -993,8 +1059,6 @@ write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 	MPASS(sndptr->m_pkthdr.len == plen);
 
 	shove = !(tp->t_flags & TF_MORETOCOME);
-	ulp_submode = mbuf_ulp_submode(sndptr);
-	MPASS(ulp_submode < nitems(ulp_extra_len));
 
 	/*
 	 * plen doesn't include header and data digests, which are
@@ -1002,51 +1066,73 @@ write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 	 * they do occupy TCP sequence space and need to be accounted
 	 * for.
 	 */
-	adjusted_plen = plen + ulp_extra_len[ulp_submode];
+	ulp_submode = mbuf_ulp_submode(sndptr);
+	MPASS(ulp_submode < nitems(ulp_extra_len));
+	npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1;
+	adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
+	if (iso)
+		adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1);
+	wr_len = sizeof(*txwr);
+	if (iso)
+		wr_len += sizeof(struct cpl_tx_data_iso);
 	if (plen <= max_imm) {
-
 		/* Immediate data tx */
-
-		wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
-				&toep->ofld_txq->wrq);
-		if (wr == NULL) {
-			/* XXX: how will we recover from this? */
-			return (NULL);
-		}
-		txwr = wrtod(wr);
-		credits = howmany(wr->wr_len, 16);
-		write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
-		    shove, ulp_submode);
-		m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
+		imm_data = plen;
+		wr_len += plen;
 		nsegs = 0;
 	} else {
-		int wr_len;
-
 		/* DSGL tx */
-		wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
+		imm_data = 0;
+		wr_len += sizeof(struct ulptx_sgl) +
 		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
-		wr = alloc_wrqe(roundup2(wr_len, 16),
-		    &toep->ofld_txq->wrq);
-		if (wr == NULL) {
-			/* XXX: how will we recover from this? */
-			return (NULL);
-		}
-		txwr = wrtod(wr);
-		credits = howmany(wr_len, 16);
-		write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
-		    shove, ulp_submode);
-		write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf);
+	}
+
+	wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
+	if (wr == NULL) {
+		/* XXX: how will we recover from this? */
+		return (NULL);
+	}
+	txwr = wrtod(wr);
+	credits = howmany(wr->wr_len, 16);
+
+	if (iso) {
+		write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR,
+		    imm_data + sizeof(struct cpl_tx_data_iso),
+		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
+		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
+		MPASS(plen == sndptr->m_pkthdr.len);
+		write_tx_data_iso(cpl_iso, ulp_submode,
+		    mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
+		p = cpl_iso + 1;
+	} else {
+		write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data,
+		    adjusted_plen, credits, shove, ulp_submode);
+		p = txwr + 1;
+	}
+
+	if (imm_data != 0) {
+		m_copydata(sndptr, 0, plen, p);
+	} else {
+		write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf);
 		if (wr_len & 0xf) {
 			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
 			*pad = 0;
 		}
 	}
 
+	KASSERT(toep->tx_credits >= credits,
+	    ("%s: not enough credits: credits %u "
+		"toep->tx_credits %u tx_credits %u nsegs %u "
+		"max_nsegs %u iso %d", __func__, credits,
+		toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));
+
 	tp->snd_nxt += adjusted_plen;
 	tp->snd_max += adjusted_plen;
 
-	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, 1);
+	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu);
 	counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
+	if (iso)
+		counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1);
 
 	return (wr);
 }
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
index 21cfb1df6e16..89c38f8c988a 100644
--- a/sys/dev/cxgbe/tom/t4_tom.h
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -363,6 +363,58 @@ mbuf_ulp_submode(struct mbuf *m)
 	return (m->m_pkthdr.PH_per.eight[0]);
 }
 
+static inline void
+set_mbuf_iscsi_iso(struct mbuf *m, bool iso)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.PH_per.eight[1] = iso;
+}
+
+static inline bool
+mbuf_iscsi_iso(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	return (m->m_pkthdr.PH_per.eight[1]);
+}
+
+/* Flags for iSCSI segmentation offload. */
+#define	CXGBE_ISO_TYPE(flags)	((flags) & 0x3)
+#define	CXGBE_ISO_F		0x4
+
+static inline void
+set_mbuf_iscsi_iso_flags(struct mbuf *m, uint8_t flags)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.PH_per.eight[2] = flags;
+}
+
+static inline uint8_t
+mbuf_iscsi_iso_flags(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	return (m->m_pkthdr.PH_per.eight[2]);
+}
+
+static inline void
+set_mbuf_iscsi_iso_mss(struct mbuf *m, uint16_t mss)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.PH_per.sixteen[2] = mss;
+}
+
+static inline uint16_t
+mbuf_iscsi_iso_mss(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	return (m->m_pkthdr.PH_per.sixteen[2]);
+}
+
 /* t4_tom.c */
 struct toepcb *alloc_toepcb(struct vi_info *, int);
 int init_toepcb(struct vi_info *, struct toepcb *);