git: 4427ac3675f9 - main - cxgbe tom: Set the tid in the work requests to program page pods for iSCSI.

John Baldwin <jhb@FreeBSD.org>
Fri May 14 19:21:50 UTC 2021


The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=4427ac3675f91df039d54a23518132e0e0fede86

commit 4427ac3675f91df039d54a23518132e0e0fede86
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2021-05-14 19:16:40 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2021-05-14 19:16:40 +0000

    cxgbe tom: Set the tid in the work requests to program page pods for iSCSI.
    
    As a result, CPL_FW4_ACK now returns credits for these work requests.
    To support this, page pod work requests are now constructed in special
    mbufs similar to "raw" mbufs used for NIC TLS in plain TX queues.
    These special mbufs are stored in the ulp_pduq and dispatched in order
    with PDU work requests.
    
    Sponsored by:   Chelsio Communications
    Discussed with: np
    Differential Revision:  https://reviews.freebsd.org/D29904
---
 sys/dev/cxgbe/cxgbei/icl_cxgbei.c |   7 +-
 sys/dev/cxgbe/tom/t4_cpl_io.c     | 241 ++++++++++++++++++++++----------------
 sys/dev/cxgbe/tom/t4_ddp.c        |  48 ++++++--
 sys/dev/cxgbe/tom/t4_tom.h        |  18 ++-
 4 files changed, 196 insertions(+), 118 deletions(-)
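
[Note: the sketch below is illustrative only and is not part of the patch.
It summarizes the raw-WR mbuf flow described in the commit message: a page
pod work request is built directly in an mbuf, flagged as "raw", and queued
on the connection's ulp_pduq so that t4_push_pdus() later copies it into a
work request entry in order with the iSCSI PDU work requests.
alloc_raw_wr_mbuf() and set_mbuf_raw_wr() are the helpers added by this
commit; build_ppod_ulptx_write() is a hypothetical stand-in for the
ULP_TX_MEM_WRITE construction done in t4_write_page_pods_for_buf(), and
error handling is trimmed.]

	/* Illustrative sketch; assumes the cxgbe TOM headers. */
	struct mbufq wrq;
	struct mbuf *m;

	mbufq_init(&wrq, INT_MAX);

	/* Build each ULP_TX_MEM_WRITE work request in a "raw" mbuf. */
	m = alloc_raw_wr_mbuf(len);	/* sets the raw-WR flag on the pkthdr */
	if (m == NULL) {
		mbufq_drain(&wrq);
		return (ENOMEM);
	}
	build_ppod_ulptx_write(mtod(m, struct ulp_mem_io *), toep->tid);
	mbufq_enqueue(&wrq, m);

	/* Hand the batch to the PDU queue so tx stays ordered. */
	INP_WLOCK(toep->inp);
	mbufq_concat(&toep->ulp_pduq, &wrq);
	INP_WUNLOCK(toep->inp);

	/*
	 * Later, write_iscsi_mbuf_wr() sees mbuf_raw_wr(m) == true and
	 * copies the pre-built WR verbatim instead of wrapping it in a
	 * FW_OFLD_TX_DATA_WR.
	 */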

diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
index 961acfb31987..4e168a33e2ca 100644
--- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -844,8 +844,8 @@ no_ddp:
 		goto no_ddp;
 	}
 
-	rc = t4_write_page_pods_for_buf(sc, &toep->ofld_txq->wrq, toep->tid,
-	    prsv, (vm_offset_t)csio->data_ptr, csio->dxfer_len);
+	rc = t4_write_page_pods_for_buf(sc, toep, prsv,
+	    (vm_offset_t)csio->data_ptr, csio->dxfer_len);
 	if (rc != 0) {
 		t4_free_page_pods(prsv);
 		uma_zfree(prsv_zone, prsv);
@@ -959,8 +959,7 @@ no_ddp:
 			goto no_ddp;
 		}
 
-		rc = t4_write_page_pods_for_buf(sc, &toep->ofld_txq->wrq,
-		    toep->tid, prsv, buf, xferlen);
+		rc = t4_write_page_pods_for_buf(sc, toep, prsv, buf, xferlen);
 		if (rc != 0) {
 			t4_free_page_pods(prsv);
 			uma_zfree(prsv_zone, prsv);
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index d8eb0e091218..b4d84d3e5c55 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -927,10 +927,10 @@ rqdrop_locked(struct mbufq *q, int plen)
 	}
 }
 
-void
-t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
+static struct wrqe *
+write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 {
-	struct mbuf *sndptr, *m;
+	struct mbuf *m;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
@@ -938,9 +938,129 @@ t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	int tx_credits, shove;
+	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
+
+	M_ASSERTPKTHDR(sndptr);
+
+	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
+	if (mbuf_raw_wr(sndptr)) {
+		plen = sndptr->m_pkthdr.len;
+		KASSERT(plen <= SGE_MAX_WR_LEN,
+		    ("raw WR len %u is greater than max WR len", plen));
+		if (plen > tx_credits * 16)
+			return (NULL);
+
+		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
+		if (__predict_false(wr == NULL))
+			return (NULL);
+
+		m_copydata(sndptr, 0, plen, wrtod(wr));
+		return (wr);
+	}
+
+	max_imm = max_imm_payload(tx_credits);
+	max_nsegs = max_dsgl_nsegs(tx_credits);
+
+	plen = 0;
+	nsegs = 0;
+	max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
+	for (m = sndptr; m != NULL; m = m->m_next) {
+		int n = sglist_count(mtod(m, void *), m->m_len);
+
+		nsegs += n;
+		plen += m->m_len;
+
+		/*
+		 * This mbuf would send us _over_ the nsegs limit.
+		 * Suspend tx because the PDU can't be sent out.
+		 */
+		if (plen > max_imm && nsegs > max_nsegs)
+			return (NULL);
+
+		if (max_nsegs_1mbuf < n)
+			max_nsegs_1mbuf = n;
+	}
+
+	if (__predict_false(toep->flags & TPF_FIN_SENT))
+		panic("%s: excess tx.", __func__);
+
+	/*
+	 * We have a PDU to send.  All of it goes out in one WR so 'm'
+	 * is NULL.  A PDU's length is always a multiple of 4.
+	 */
+	MPASS(m == NULL);
+	MPASS((plen & 3) == 0);
+	MPASS(sndptr->m_pkthdr.len == plen);
+
+	shove = !(tp->t_flags & TF_MORETOCOME);
+	ulp_submode = mbuf_ulp_submode(sndptr);
+	MPASS(ulp_submode < nitems(ulp_extra_len));
+
+	/*
+	 * plen doesn't include header and data digests, which are
+	 * generated and inserted in the right places by the TOE, but
+	 * they do occupy TCP sequence space and need to be accounted
+	 * for.
+	 */
+	adjusted_plen = plen + ulp_extra_len[ulp_submode];
+	if (plen <= max_imm) {
+
+		/* Immediate data tx */
+
+		wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
+				&toep->ofld_txq->wrq);
+		if (wr == NULL) {
+			/* XXX: how will we recover from this? */
+			return (NULL);
+		}
+		txwr = wrtod(wr);
+		credits = howmany(wr->wr_len, 16);
+		write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
+		    shove, ulp_submode);
+		m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
+		nsegs = 0;
+	} else {
+		int wr_len;
+
+		/* DSGL tx */
+		wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
+		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
+		wr = alloc_wrqe(roundup2(wr_len, 16),
+		    &toep->ofld_txq->wrq);
+		if (wr == NULL) {
+			/* XXX: how will we recover from this? */
+			return (NULL);
+		}
+		txwr = wrtod(wr);
+		credits = howmany(wr_len, 16);
+		write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
+		    shove, ulp_submode);
+		write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf);
+		if (wr_len & 0xf) {
+			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
+			*pad = 0;
+		}
+	}
+
+	tp->snd_nxt += adjusted_plen;
+	tp->snd_max += adjusted_plen;
+
+	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, 1);
+	counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
+
+	return (wr);
+}
+
+void
+t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
+{
+	struct mbuf *sndptr, *m;
+	struct fw_wr_hdr *wrhdr;
+	struct wrqe *wr;
+	u_int plen, credits;
+	struct inpcb *inp = toep->inp;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
-	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
@@ -965,99 +1085,14 @@ t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
-		M_ASSERTPKTHDR(sndptr);
-
-		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
-		max_imm = max_imm_payload(tx_credits);
-		max_nsegs = max_dsgl_nsegs(tx_credits);
-
-		plen = 0;
-		nsegs = 0;
-		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
-		for (m = sndptr; m != NULL; m = m->m_next) {
-			int n = sglist_count(mtod(m, void *), m->m_len);
-
-			nsegs += n;
-			plen += m->m_len;
-
-			/*
-			 * This mbuf would send us _over_ the nsegs limit.
-			 * Suspend tx because the PDU can't be sent out.
-			 */
-			if (plen > max_imm && nsegs > max_nsegs) {
-				toep->flags |= TPF_TX_SUSPENDED;
-				return;
-			}
-
-			if (max_nsegs_1mbuf < n)
-				max_nsegs_1mbuf = n;
-		}
-
-		if (__predict_false(toep->flags & TPF_FIN_SENT))
-			panic("%s: excess tx.", __func__);
-
-		/*
-		 * We have a PDU to send.  All of it goes out in one WR so 'm'
-		 * is NULL.  A PDU's length is always a multiple of 4.
-		 */
-		MPASS(m == NULL);
-		MPASS((plen & 3) == 0);
-		MPASS(sndptr->m_pkthdr.len == plen);
-
-		shove = !(tp->t_flags & TF_MORETOCOME);
-		ulp_submode = mbuf_ulp_submode(sndptr);
-		MPASS(ulp_submode < nitems(ulp_extra_len));
-
-		/*
-		 * plen doesn't include header and data digests, which are
-		 * generated and inserted in the right places by the TOE, but
-		 * they do occupy TCP sequence space and need to be accounted
-		 * for.
-		 */
-		adjusted_plen = plen + ulp_extra_len[ulp_submode];
-		if (plen <= max_imm) {
-
-			/* Immediate data tx */
-
-			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
-					&toep->ofld_txq->wrq);
-			if (wr == NULL) {
-				/* XXX: how will we recover from this? */
-				toep->flags |= TPF_TX_SUSPENDED;
-				return;
-			}
-			txwr = wrtod(wr);
-			credits = howmany(wr->wr_len, 16);
-			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
-			    shove, ulp_submode);
-			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
-			nsegs = 0;
-		} else {
-			int wr_len;
-
-			/* DSGL tx */
-			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
-			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
-			wr = alloc_wrqe(roundup2(wr_len, 16),
-			    &toep->ofld_txq->wrq);
-			if (wr == NULL) {
-				/* XXX: how will we recover from this? */
-				toep->flags |= TPF_TX_SUSPENDED;
-				return;
-			}
-			txwr = wrtod(wr);
-			credits = howmany(wr_len, 16);
-			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
-			    shove, ulp_submode);
-			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
-			    max_nsegs_1mbuf);
-			if (wr_len & 0xf) {
-				uint64_t *pad = (uint64_t *)
-				    ((uintptr_t)txwr + wr_len);
-				*pad = 0;
-			}
+		wr = write_iscsi_mbuf_wr(toep, sndptr);
+		if (wr == NULL) {
+			toep->flags |= TPF_TX_SUSPENDED;
+			return;
 		}
 
+		plen = sndptr->m_pkthdr.len;
+		credits = howmany(wr->wr_len, 16);
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
@@ -1068,16 +1103,19 @@ t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
-		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
+
+		/*
+		 * Ensure there are enough credits for a full-sized WR
+		 * as page pod WRs can be full-sized.
+		 */
+		if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
 		    toep->tx_nocompl >= toep->tx_total / 4) {
-			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
+			wrhdr = wrtod(wr);
+			wrhdr->hi |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
-		tp->snd_nxt += adjusted_plen;
-		tp->snd_max += adjusted_plen;
-
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
@@ -1092,9 +1130,6 @@ t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 		}
 		toep->txsd_avail--;
 
-		counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, 1);
-		counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
-
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 
diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c
index b0d53dd63997..e87d013a0453 100644
--- a/sys/dev/cxgbe/tom/t4_ddp.c
+++ b/sys/dev/cxgbe/tom/t4_ddp.c
@@ -1081,11 +1081,30 @@ t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
 	return (0);
 }
 
+static struct mbuf *
+alloc_raw_wr_mbuf(int len)
+{
+	struct mbuf *m;
+
+	if (len <= MHLEN)
+		m = m_gethdr(M_NOWAIT, MT_DATA);
+	else if (len <= MCLBYTES)
+		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+	else
+		m = NULL;
+	if (m == NULL)
+		return (NULL);
+	m->m_pkthdr.len = len;
+	m->m_len = len;
+	set_mbuf_raw_wr(m, true);
+	return (m);
+}
+
 int
-t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
+t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
     struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
 {
-	struct wrqe *wr;
+	struct inpcb *inp = toep->inp;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
@@ -1094,6 +1113,8 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
 	uint32_t cmd;
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t end_pva, pva, pa;
+	struct mbuf *m;
+	struct mbufq wrq;
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
@@ -1105,6 +1126,7 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	pva = trunc_page(buf);
 	end_pva = trunc_page(buf + buflen - 1);
+	mbufq_init(&wrq, INT_MAX);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
@@ -1113,12 +1135,14 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
-		wr = alloc_wrqe(len, wrq);
-		if (wr == NULL)
-			return (ENOMEM);	/* ok to just bail out */
-		ulpmc = wrtod(wr);
+		m = alloc_raw_wr_mbuf(len);
+		if (m == NULL) {
+			mbufq_drain(&wrq);
+			return (ENOMEM);
+		}
+		ulpmc = mtod(m, struct ulp_mem_io *);
 
-		INIT_ULPTX_WR(ulpmc, len, 0, 0);
+		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
@@ -1131,7 +1155,7 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
-			    V_PPOD_TID(tid) |
+			    V_PPOD_TID(toep->tid) |
 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
 			    V_PPOD_OFST(offset));
@@ -1148,7 +1172,7 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
-				    __func__, tid, i, k,
+				    __func__, toep->tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
@@ -1161,9 +1185,13 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
 			pva -= ddp_pgsz;
 		}
 
-		t4_wrq_tx(sc, wr);
+		mbufq_enqueue(&wrq, m);
 	}
 
+	INP_WLOCK(inp);
+	mbufq_concat(&toep->ulp_pduq, &wrq);
+	INP_WUNLOCK(inp);
+
 	MPASS(pva <= end_pva);
 
 	return (0);
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
index 68b3d29295f8..f1129b47cbcf 100644
--- a/sys/dev/cxgbe/tom/t4_tom.h
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -330,6 +330,22 @@ td_adapter(struct tom_data *td)
 	return (td->tod.tod_softc);
 }
 
+static inline void
+set_mbuf_raw_wr(struct mbuf *m, bool raw)
+{
+
+	M_ASSERTPKTHDR(m);
+	m->m_pkthdr.PH_per.eight[6] = raw;
+}
+
+static inline bool
+mbuf_raw_wr(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+	return (m->m_pkthdr.PH_per.eight[6]);
+}
+
 static inline void
 set_mbuf_ulp_submode(struct mbuf *m, uint8_t ulp_submode)
 {
@@ -423,7 +439,7 @@ int t4_alloc_page_pods_for_buf(struct ppod_region *, vm_offset_t, int,
     struct ppod_reservation *);
 int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int,
     struct pageset *);
-int t4_write_page_pods_for_buf(struct adapter *, struct sge_wrq *, int,
+int t4_write_page_pods_for_buf(struct adapter *, struct toepcb *,
     struct ppod_reservation *, vm_offset_t, int);
 void t4_free_page_pods(struct ppod_reservation *);
 int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *,

