git: ec0cd287f55f - main - nvmf_che: NVMe-TCP offload support for Chelsio T7 adapters
Date: Mon, 10 Nov 2025 15:51:31 UTC
The branch main has been updated by jhb:
URL: https://cgit.FreeBSD.org/src/commit/?id=ec0cd287f55f7ea93ff4ccfa4de0f70eca5fef75
commit ec0cd287f55f7ea93ff4ccfa4de0f70eca5fef75
Author: John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2025-11-10 15:50:48 +0000
Commit: John Baldwin <jhb@FreeBSD.org>
CommitDate: 2025-11-10 15:50:48 +0000
nvmf_che: NVMe-TCP offload support for Chelsio T7 adapters
This provides an alternative NVMe over TCP transport which uses PDU
offload for TOE connections on a T7.
Similar to iSCSI offload via cxgbei.ko, nvmf_che uses DDP when
possible to enable the NIC to DMA received data directly into I/O data
buffers (pages from a struct bio on the host side, pages from a CTL
I/O request on the controller side) to avoid copying data on the host
CPU. When using DDP, nvmf_che is also able to receive a stream of C2H
or H2C PDUs for a single data transfer without processing the header
of each PDU.
Unlike cxgbei, nvmf_che aims to be mostly transparent to end users.
Notably, neither nvmecontrol nor ctld has to be explicitly asked to
use an offload. Instead, TCP queue pairs are claimed by this driver
whenever they are eligible (e.g., using TOE).
The main restriction of nvmf_che compared to the software TCP
transport is that Chelsio adapters limit the size of the PDUs that
can be sent and received. When sending data, nvmf_che is
able to split large C2H or H2C data requests across multiple PDUs
without affecting nvmf(4) or nvmft(4).
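
As a minimal sketch of that transmit-side splitting (hypothetical
names, not code from this commit), a large transfer is simply walked
in chunks of at most the per-PDU payload limit:

    #include <stdint.h>

    /*
     * Sketch: carve one C2H or H2C data transfer into PDUs whose
     * payload never exceeds max_pdu_data.  emit_pdu() stands in for
     * building and queueing a single DATA PDU.
     */
    static void
    split_into_pdus(uint32_t offset, uint32_t len, uint32_t max_pdu_data,
        void (*emit_pdu)(uint32_t off, uint32_t todo, int last_pdu))
    {
            while (len > 0) {
                    uint32_t todo = len < max_pdu_data ? len : max_pdu_data;

                    emit_pdu(offset, todo, len == todo);
                    offset += todo;
                    len -= todo;
            }
    }
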
To avoid overly large PDUs when using nvmf(4), nvmf_che reports a data
transfer limit that is honored by nvmf(4). This ensures that the
remote controller's PDUs will never be too large (since the command
transfer size is limited to one PDU) and also ensures that nvmf(4)
will never try to send a command PDU with ICD that is too large.
For nvmft(4), overly large command PDUs due to ICD are avoided by
clamping the size of the reported IOCCSZ in the controller data.
However, to ensure that H2C PDUs are sufficiently small, nvmf_che will
only claim queue pairs which advertised a suitable MAXH2CDATA
parameter during queue negotiation. For ctld(8), this can be achieved
by setting the MAXH2CDATA option in a transport-group, e.g. for T7:
transport-group tg0 {
        discovery-auth-group no-authentication
        listen tcp 0.0.0.0
        listen tcp [::]
        listen discovery-tcp 0.0.0.0
        listen discovery-tcp [::]
        option MAXH2CDATA 32488
}
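
The size bounding above is plain arithmetic. A minimal sketch
(hypothetical helpers, with an assumed pdu_overhead parameter covering
the PDU header, padding, and digests; not code from this commit):
IOCCSZ is reported in 16-byte units and covers the 64-byte SQE plus
any in-capsule data, and MAXH2CDATA bounds the payload of each H2C
DATA PDU, so both only need to fit within a single received PDU:

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Sketch: clamp the advertised IOCCSZ so a command capsule with
     * in-capsule data fits in one received PDU.
     */
    static uint32_t
    clamp_ioccsz(uint32_t ioccsz, uint32_t max_rx_pdu, uint32_t pdu_overhead)
    {
            uint32_t max_capsule = max_rx_pdu - pdu_overhead;

            if (ioccsz * 16 > max_capsule)
                    ioccsz = max_capsule / 16;
            return (ioccsz);
    }

    /*
     * Sketch: accept a negotiated MAXH2CDATA only if every H2C DATA
     * PDU will fit within the adapter's receive limit.
     */
    static bool
    h2c_data_fits(uint32_t maxh2cdata, uint32_t max_rx_pdu,
        uint32_t pdu_overhead)
    {
            return (maxh2cdata <= max_rx_pdu - pdu_overhead);
    }

With the default 32 KiB maximum receive PDU, the MAXH2CDATA of 32488
in the example above sits below that limit with room to spare for the
PDU header and digests.
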
Sponsored by: Chelsio Communications
---
sys/dev/cxgbe/nvmf/nvmf_che.c | 3330 +++++++++++++++++++++++++++++++++++
sys/modules/cxgbe/Makefile | 2 +
sys/modules/cxgbe/nvmf_che/Makefile | 12 +
3 files changed, 3344 insertions(+)
diff --git a/sys/dev/cxgbe/nvmf/nvmf_che.c b/sys/dev/cxgbe/nvmf/nvmf_che.c
new file mode 100644
index 000000000000..88d59b5e75aa
--- /dev/null
+++ b/sys/dev/cxgbe/nvmf/nvmf_che.c
@@ -0,0 +1,3330 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+
+#ifdef TCP_OFFLOAD
+#include <sys/bitset.h>
+#include <sys/capsicum.h>
+#include <sys/file.h>
+#include <sys/kthread.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/nv.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#include <netinet/toecore.h>
+
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_proto.h>
+#include <dev/nvmf/nvmf_tcp.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/nvmf_transport_internal.h>
+
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+
+#include "common/common.h"
+#include "common/t4_regs.h"
+#include "common/t4_tcb.h"
+#include "tom/t4_tom.h"
+
+/* Status code values in CPL_NVMT_CMP. */
+#define CMP_STATUS_ERROR_MASK 0x7f
+#define CMP_STATUS_NO_ERROR 0
+#define CMP_STATUS_HEADER_DIGEST 1
+#define CMP_STATUS_DIRECTION_MISMATCH 2
+#define CMP_STATUS_DIGEST_FLAG_MISMATCH 3
+#define CMP_STATUS_SUCCESS_NOT_LAST 4
+#define CMP_STATUS_BAD_DATA_LENGTH 5
+#define CMP_STATUS_USER_MODE_UNALLOCATED 6
+#define CMP_STATUS_RQT_LIMIT 7
+#define CMP_STATUS_RQT_WRAP 8
+#define CMP_STATUS_RQT_BOUND 9
+#define CMP_STATUS_TPT_LIMIT 16
+#define CMP_STATUS_TPT_INVALID 17
+#define CMP_STATUS_TPT_COLOUR_MISMATCH 18
+#define CMP_STATUS_TPT_MISC 19
+#define CMP_STATUS_TPT_WRAP 20
+#define CMP_STATUS_TPT_BOUND 21
+#define CMP_STATUS_TPT_LAST_PDU_UNALIGNED 22
+#define CMP_STATUS_PBL_LIMIT 24
+#define CMP_STATUS_DATA_DIGEST 25
+#define CMP_STATUS_DDP 0x80
+
+/*
+ * Transfer tags and CIDs with the MSB set are "unallocated" tags that
+ * pass data through to the freelist without using DDP.
+ */
+#define CHE_FL_TAG_MASK 0x8000
+#define CHE_MAX_FL_TAG 0x7fff
+#define CHE_NUM_FL_TAGS (CHE_MAX_FL_TAG + 1)
+
+#define CHE_TAG_IS_FL(ttag) (((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK)
+#define CHE_RAW_FL_TAG(ttag) ((ttag) & ~CHE_FL_TAG_MASK)
+#define CHE_DDP_TAG(stag_idx, color) ((stag_idx) << 4 | (color))
+#define CHE_STAG_COLOR(stag) ((stag) & 0xf)
+#define CHE_STAG_IDX(stag) ((stag) >> 4)
+#define CHE_DDP_MAX_COLOR 0xf
+
+#define CHE_DDP_NO_TAG 0xffff
+
+/*
+ * A bitmap of non-DDP CIDs in use on the host. Since there is no
+ * _BIT_FFC (find first clear), the bitset is inverted so that a clear
+ * bit indicates an in-use CID.
+ */
+BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS);
+#define FL_CID_INIT(p) __BIT_FILL(CHE_NUM_FL_TAGS, p)
+#define FL_CID_BUSY(n, p) __BIT_CLR(CHE_NUM_FL_TAGS, n, p)
+#define FL_CID_ISACTIVE(n, p) !__BIT_ISSET(CHE_NUM_FL_TAGS, n, p)
+#define FL_CID_FREE(n, p) __BIT_SET(CHE_NUM_FL_TAGS, n, p)
+#define FL_CID_FINDFREE_AT(p, start) __BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start)
+
+/*
+ * The TCP sequence number of both CPL_NVMT_DATA and CPL_NVMT_CMP
+ * mbufs are saved here while the mbuf is in qp->rx_data and qp->rx_pdus.
+ */
+#define nvmf_tcp_seq PH_loc.thirtytwo[0]
+
+/*
+ * The CPL status of CPL_NVMT_CMP mbufs are saved here while the mbuf
+ * is in qp->rx_pdus.
+ */
+#define nvmf_cpl_status PH_loc.eight[4]
+
+struct nvmf_che_capsule;
+struct nvmf_che_qpair;
+
+struct nvmf_che_adapter {
+ struct adapter *sc;
+
+ u_int ddp_threshold;
+ u_int max_transmit_pdu;
+ u_int max_receive_pdu;
+ bool nvmt_data_iqe;
+
+ struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */
+};
+
+struct nvmf_che_command_buffer {
+ struct nvmf_che_qpair *qp;
+
+ struct nvmf_io_request io;
+ size_t data_len;
+ size_t data_xfered;
+ uint32_t data_offset;
+
+ u_int refs;
+ int error;
+
+ bool ddp_ok;
+ uint16_t cid;
+ uint16_t ttag;
+ uint16_t original_cid; /* Host only */
+
+ TAILQ_ENTRY(nvmf_che_command_buffer) link;
+
+ /* Fields used for DDP. */
+ struct fw_ri_tpte tpte;
+ uint64_t *pbl;
+ uint32_t pbl_addr;
+ uint32_t pbl_len;
+
+ /* Controller only */
+ struct nvmf_che_capsule *cc;
+};
+
+struct nvmf_che_command_buffer_list {
+ TAILQ_HEAD(, nvmf_che_command_buffer) head;
+ struct mtx lock;
+};
+
+struct nvmf_che_qpair {
+ struct nvmf_qpair qp;
+
+ struct socket *so;
+ struct toepcb *toep;
+ struct nvmf_che_adapter *nca;
+
+ volatile u_int refs; /* Every allocated capsule holds a reference */
+ uint8_t txpda;
+ uint8_t rxpda;
+ bool header_digests;
+ bool data_digests;
+ uint32_t maxr2t;
+ uint32_t maxh2cdata; /* Controller only */
+ uint32_t max_rx_data;
+ uint32_t max_tx_data;
+ uint32_t max_icd; /* Host only */
+ uint32_t max_ioccsz; /* Controller only */
+ union {
+ uint16_t next_fl_ttag; /* Controller only */
+ uint16_t next_cid; /* Host only */
+ };
+ uint16_t next_ddp_tag;
+ u_int num_fl_ttags; /* Controller only */
+ u_int active_fl_ttags; /* Controller only */
+ u_int num_ddp_tags;
+ u_int active_ddp_tags;
+ bool send_success; /* Controller only */
+ uint8_t ddp_color;
+ uint32_t tpt_offset;
+
+ /* Receive state. */
+ struct thread *rx_thread;
+ struct cv rx_cv;
+ bool rx_shutdown;
+ int rx_error;
+ struct mbufq rx_data; /* Data received via CPL_NVMT_DATA. */
+ struct mbufq rx_pdus; /* PDU headers received via CPL_NVMT_CMP. */
+
+ /* Transmit state. */
+ struct thread *tx_thread;
+ struct cv tx_cv;
+ bool tx_shutdown;
+ STAILQ_HEAD(, nvmf_che_capsule) tx_capsules;
+
+ struct nvmf_che_command_buffer_list tx_buffers;
+ struct nvmf_che_command_buffer_list rx_buffers;
+
+ /*
+ * For the controller, an RX command buffer can be in one of
+ * three locations, all protected by the rx_buffers.lock. If
+ * a receive request is waiting for either an R2T slot for its
+ * command (due to exceeding MAXR2T), or a transfer tag it is
+ * placed on the rx_buffers list. When a request is allocated
+ * an active transfer tag, it moves to either the
+ * open_ddp_tags[] or open_fl_ttags[] array (indexed by the
+ * tag) until it completes.
+ *
+ * For the host, an RX command buffer using DDP is in
+ * open_ddp_tags[], otherwise it is in rx_buffers.
+ */
+ struct nvmf_che_command_buffer **open_ddp_tags;
+ struct nvmf_che_command_buffer **open_fl_ttags; /* Controller only */
+
+ /*
+ * For the host, CIDs submitted by nvmf(4) must be rewritten
+ * to either use DDP or not use DDP. The CID in response
+ * capsules must be restored to their original value. For
+ * DDP, the original CID is stored in the command buffer.
+ * These variables manage non-DDP CIDs.
+ */
+ uint16_t *fl_cids; /* Host only */
+ struct fl_cid_set *fl_cid_set; /* Host only */
+ struct mtx fl_cid_lock; /* Host only */
+};
+
+struct nvmf_che_rxpdu {
+ struct mbuf *m;
+ const struct nvme_tcp_common_pdu_hdr *hdr;
+ uint32_t data_len;
+ bool data_digest_mismatch;
+ bool ddp;
+};
+
+struct nvmf_che_capsule {
+ struct nvmf_capsule nc;
+
+ volatile u_int refs;
+
+ struct nvmf_che_rxpdu rx_pdu;
+
+ uint32_t active_r2ts; /* Controller only */
+#ifdef INVARIANTS
+ uint32_t tx_data_offset; /* Controller only */
+ u_int pending_r2ts; /* Controller only */
+#endif
+
+ STAILQ_ENTRY(nvmf_che_capsule) link;
+};
+
+#define CCAP(nc) ((struct nvmf_che_capsule *)(nc))
+#define CQP(qp) ((struct nvmf_che_qpair *)(qp))
+
+static void che_release_capsule(struct nvmf_che_capsule *cc);
+static void che_free_qpair(struct nvmf_qpair *nq);
+
+SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "Chelsio TCP offload transport");
+
+static u_int che_max_transmit_pdu = 32 * 1024;
+SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN,
+ &che_max_transmit_pdu, 0,
+ "Maximum size of a transmitted PDU");
+
+static u_int che_max_receive_pdu = 32 * 1024;
+SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN,
+ &che_max_receive_pdu, 0,
+ "Maximum size of a received PDU");
+
+static int use_dsgl = 1;
+SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0,
+ "Use DSGL for PBL/FastReg (default=1)");
+
+static int inline_threshold = 256;
+SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN,
+ &inline_threshold, 0,
+ "inline vs dsgl threshold (default=256)");
+
+static int ddp_tags_per_qp = 128;
+SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN,
+ &ddp_tags_per_qp, 0,
+ "Number of DDP tags to reserve for each queue pair");
+
+static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload");
+
+/*
+ * PBL regions consist of N full-sized pages. TPT entries support an
+ * initial offset into the first page (FBO) and can handle a partial
+ * length on the last page.
+ */
+static bool
+che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io)
+{
+ const struct memdesc *mem = &io->io_mem;
+ struct bus_dma_segment *ds;
+ int i;
+
+ if (io->io_len < qp->nca->ddp_threshold) {
+ return (false);
+ }
+
+ switch (mem->md_type) {
+ case MEMDESC_VADDR:
+ case MEMDESC_PADDR:
+ case MEMDESC_VMPAGES:
+ return (true);
+ case MEMDESC_VLIST:
+ case MEMDESC_PLIST:
+ /*
+ * Require all but the first segment to start on a
+ * page boundary. Require all but the last segment to
+ * end on a page boundary.
+ */
+ ds = mem->u.md_list;
+ for (i = 0; i < mem->md_nseg; i++, ds++) {
+ if (i != 0 && ds->ds_addr % PAGE_SIZE != 0)
+ return (false);
+ if (i != mem->md_nseg - 1 &&
+ (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0)
+ return (false);
+ }
+ return (true);
+ default:
+ /*
+ * Other types could be validated with more work, but
+ * they aren't used currently by nvmf(4) or nvmft(4).
+ */
+ return (false);
+ }
+}
+
+static u_int
+che_fbo(struct nvmf_che_command_buffer *cb)
+{
+ struct memdesc *mem = &cb->io.io_mem;
+
+ switch (mem->md_type) {
+ case MEMDESC_VADDR:
+ return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK);
+ case MEMDESC_PADDR:
+ return (mem->u.md_paddr & PAGE_MASK);
+ case MEMDESC_VMPAGES:
+ return (mem->md_offset);
+ case MEMDESC_VLIST:
+ case MEMDESC_PLIST:
+ return (mem->u.md_list[0].ds_addr & PAGE_MASK);
+ default:
+ __assert_unreachable();
+ }
+}
+
+static u_int
+che_npages(struct nvmf_che_command_buffer *cb)
+{
+ return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE));
+}
+
+static struct nvmf_che_command_buffer *
+che_alloc_command_buffer(struct nvmf_che_qpair *qp,
+ const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
+ uint16_t cid)
+{
+ struct nvmf_che_command_buffer *cb;
+
+ cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK);
+ cb->qp = qp;
+ cb->io = *io;
+ cb->data_offset = data_offset;
+ cb->data_len = data_len;
+ cb->data_xfered = 0;
+ refcount_init(&cb->refs, 1);
+ cb->error = 0;
+ cb->ddp_ok = che_ddp_io_check(qp, io);
+ cb->cid = cid;
+ cb->ttag = 0;
+ cb->original_cid = 0;
+ cb->cc = NULL;
+ cb->pbl = NULL;
+
+ return (cb);
+}
+
+static void
+che_hold_command_buffer(struct nvmf_che_command_buffer *cb)
+{
+ refcount_acquire(&cb->refs);
+}
+
+static void
+che_free_command_buffer(struct nvmf_che_command_buffer *cb)
+{
+ nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
+ if (cb->cc != NULL)
+ che_release_capsule(cb->cc);
+ MPASS(cb->pbl == NULL);
+ free(cb, M_NVMF_CHE);
+}
+
+static void
+che_release_command_buffer(struct nvmf_che_command_buffer *cb)
+{
+ if (refcount_release(&cb->refs))
+ che_free_command_buffer(cb);
+}
+
+static void
+che_add_command_buffer(struct nvmf_che_command_buffer_list *list,
+ struct nvmf_che_command_buffer *cb)
+{
+ mtx_assert(&list->lock, MA_OWNED);
+ TAILQ_INSERT_HEAD(&list->head, cb, link);
+}
+
+static struct nvmf_che_command_buffer *
+che_find_command_buffer(struct nvmf_che_command_buffer_list *list,
+ uint16_t cid)
+{
+ struct nvmf_che_command_buffer *cb;
+
+ mtx_assert(&list->lock, MA_OWNED);
+ TAILQ_FOREACH(cb, &list->head, link) {
+ if (cb->cid == cid)
+ return (cb);
+ }
+ return (NULL);
+}
+
+static void
+che_remove_command_buffer(struct nvmf_che_command_buffer_list *list,
+ struct nvmf_che_command_buffer *cb)
+{
+ mtx_assert(&list->lock, MA_OWNED);
+ TAILQ_REMOVE(&list->head, cb, link);
+}
+
+static void
+che_purge_command_buffer(struct nvmf_che_command_buffer_list *list,
+ uint16_t cid)
+{
+ struct nvmf_che_command_buffer *cb;
+
+ mtx_lock(&list->lock);
+ cb = che_find_command_buffer(list, cid);
+ if (cb != NULL) {
+ che_remove_command_buffer(list, cb);
+ mtx_unlock(&list->lock);
+ che_release_command_buffer(cb);
+ } else
+ mtx_unlock(&list->lock);
+}
+
+static int
+che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr,
+ uint32_t len, void *data, struct mbufq *wrq)
+{
+ struct mbuf *m;
+ char *cp;
+ int copy_len, i, num_wqe, wr_len;
+
+#ifdef VERBOSE_TRACES
+ CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len);
+#endif
+ num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE);
+ cp = data;
+ for (i = 0; i < num_wqe; i++) {
+ copy_len = min(len, T4_MAX_INLINE_SIZE);
+ wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len);
+
+ m = alloc_raw_wr_mbuf(wr_len);
+ if (m == NULL)
+ return (ENOMEM);
+ t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid,
+ addr, copy_len, cp, 0);
+ if (cp != NULL)
+ cp += T4_MAX_INLINE_SIZE;
+ addr += T4_MAX_INLINE_SIZE >> 5;
+ len -= T4_MAX_INLINE_SIZE;
+
+ mbufq_enqueue(wrq, m);
+ }
+ return (0);
+}
+
+static int
+che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep,
+ uint32_t addr, uint32_t len, void *data, struct mbufq *wrq)
+{
+ struct mbuf *m;
+ vm_offset_t va;
+ u_int todo;
+ int wr_len;
+
+ /* First page. */
+ va = (vm_offset_t)data;
+ todo = min(PAGE_SIZE - (va % PAGE_SIZE), len);
+ wr_len = T4_WRITE_MEM_DMA_LEN;
+ m = alloc_raw_wr_mbuf(wr_len);
+ if (m == NULL)
+ return (ENOMEM);
+ t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr,
+ todo, pmap_kextract(va), 0);
+ mbufq_enqueue(wrq, m);
+ len -= todo;
+ addr += todo >> 5;
+ va += todo;
+
+ while (len > 0) {
+ MPASS(va == trunc_page(va));
+ todo = min(PAGE_SIZE, len);
+ m = alloc_raw_wr_mbuf(wr_len);
+ if (m == NULL)
+ return (ENOMEM);
+ t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid,
+ addr, todo, pmap_kextract(va), 0);
+ mbufq_enqueue(wrq, m);
+ len -= todo;
+ addr += todo >> 5;
+ va += todo;
+ }
+ return (0);
+}
+
+static int
+che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len,
+ void *data)
+{
+ struct adapter *sc = qp->nca->sc;
+ struct toepcb *toep = qp->toep;
+ struct socket *so = qp->so;
+ struct inpcb *inp = sotoinpcb(so);
+ struct mbufq mq;
+ int error;
+
+ mbufq_init(&mq, INT_MAX);
+ if (!use_dsgl || len < inline_threshold || data == NULL)
+ error = che_write_mem_inline(sc, toep, addr, len, data, &mq);
+ else
+ error = che_write_mem_dma_aligned(sc, toep, addr, len, data,
+ &mq);
+ if (__predict_false(error != 0))
+ goto error;
+
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & INP_DROPPED) != 0) {
+ INP_WUNLOCK(inp);
+ error = ECONNRESET;
+ goto error;
+ }
+ mbufq_concat(&toep->ulp_pduq, &mq);
+ INP_WUNLOCK(inp);
+ return (0);
+
+error:
+ mbufq_drain(&mq);
+ return (error);
+}
+
+static bool
+che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
+{
+ struct adapter *sc = qp->nca->sc;
+ struct memdesc *mem = &cb->io.io_mem;
+ uint64_t *pbl;
+ uint32_t addr, len;
+ u_int i, npages;
+ int error;
+
+ MPASS(cb->pbl == NULL);
+ MPASS(cb->ddp_ok);
+
+ /* Hardware limit? iWARP only enforces this for T5. */
+ if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL))
+ return (false);
+
+ npages = che_npages(cb);
+ len = roundup2(npages, 4) * sizeof(*cb->pbl);
+ addr = t4_pblpool_alloc(sc, len);
+ if (addr == 0)
+ return (false);
+
+ pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO);
+ if (pbl == NULL) {
+ t4_pblpool_free(sc, addr, len);
+ return (false);
+ }
+
+ switch (mem->md_type) {
+ case MEMDESC_VADDR:
+ {
+ vm_offset_t va;
+
+ va = trunc_page((uintptr_t)mem->u.md_vaddr);
+ for (i = 0; i < npages; i++)
+ pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE));
+ break;
+ }
+ case MEMDESC_PADDR:
+ {
+ vm_paddr_t pa;
+
+ pa = trunc_page(mem->u.md_paddr);
+ for (i = 0; i < npages; i++)
+ pbl[i] = htobe64(pa + i * PAGE_SIZE);
+ break;
+ }
+ case MEMDESC_VMPAGES:
+ for (i = 0; i < npages; i++)
+ pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i]));
+ break;
+ case MEMDESC_VLIST:
+ {
+ struct bus_dma_segment *ds;
+ vm_offset_t va;
+ vm_size_t len;
+ u_int j, k;
+
+ i = 0;
+ ds = mem->u.md_list;
+ for (j = 0; j < mem->md_nseg; j++, ds++) {
+ va = trunc_page((uintptr_t)ds->ds_addr);
+ len = ds->ds_len;
+ if (ds->ds_addr % PAGE_SIZE != 0)
+ len += ds->ds_addr % PAGE_SIZE;
+ for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
+ pbl[i] = htobe64(pmap_kextract(va +
+ k * PAGE_SIZE));
+ i++;
+ }
+ }
+ MPASS(i == npages);
+ break;
+ }
+ case MEMDESC_PLIST:
+ {
+ struct bus_dma_segment *ds;
+ vm_paddr_t pa;
+ vm_size_t len;
+ u_int j, k;
+
+ i = 0;
+ ds = mem->u.md_list;
+ for (j = 0; j < mem->md_nseg; j++, ds++) {
+ pa = trunc_page((vm_paddr_t)ds->ds_addr);
+ len = ds->ds_len;
+ if (ds->ds_addr % PAGE_SIZE != 0)
+ len += ds->ds_addr % PAGE_SIZE;
+ for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
+ pbl[i] = htobe64(pa + k * PAGE_SIZE);
+ i++;
+ }
+ }
+ MPASS(i == npages);
+ break;
+ }
+ default:
+ __assert_unreachable();
+ }
+
+ error = che_write_adapter_mem(qp, addr >> 5, len, pbl);
+ if (error != 0) {
+ t4_pblpool_free(sc, addr, len);
+ free(pbl, M_NVMF_CHE);
+ return (false);
+ }
+
+ cb->pbl = pbl;
+ cb->pbl_addr = addr;
+ cb->pbl_len = len;
+
+ return (true);
+}
+
+static void
+che_free_pbl(struct nvmf_che_command_buffer *cb)
+{
+ free(cb->pbl, M_NVMF_CHE);
+ t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len);
+ cb->pbl = NULL;
+ cb->pbl_addr = 0;
+ cb->pbl_len = 0;
+}
+
+static bool
+che_write_tpt_entry(struct nvmf_che_qpair *qp,
+ struct nvmf_che_command_buffer *cb, uint16_t stag)
+{
+ uint32_t tpt_addr;
+ int error;
+
+ cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID |
+ V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) |
+ F_FW_RI_TPTE_STAGSTATE |
+ V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) |
+ V_FW_RI_TPTE_PDID(0));
+ cb->tpte.locread_to_qpid = htobe32(
+ V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) |
+ V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) |
+ V_FW_RI_TPTE_PS(PAGE_SIZE) |
+ V_FW_RI_TPTE_QPID(qp->toep->tid));
+#define PBL_OFF(qp, a) ((a) - (qp)->nca->sc->vres.pbl.start)
+ cb->tpte.nosnoop_pbladdr =
+ htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3));
+ cb->tpte.len_lo = htobe32(cb->data_len);
+ cb->tpte.va_hi = 0;
+ cb->tpte.va_lo_fbo = htobe32(che_fbo(cb));
+ cb->tpte.dca_mwbcnt_pstag = 0;
+ cb->tpte.len_hi = htobe32(cb->data_offset);
+
+ tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
+ (qp->nca->sc->vres.stag.start >> 5);
+
+ error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte),
+ &cb->tpte);
+ return (error == 0);
+}
+
+static void
+che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag)
+{
+ uint32_t tpt_addr;
+
+ tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
+ (qp->nca->sc->vres.stag.start >> 5);
+
+ (void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte),
+ NULL);
+}
+
+static uint16_t
+che_alloc_ddp_stag(struct nvmf_che_qpair *qp,
+ struct nvmf_che_command_buffer *cb)
+{
+ uint16_t stag_idx;
+
+ mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+ MPASS(cb->ddp_ok);
+
+ if (qp->active_ddp_tags == qp->num_ddp_tags)
+ return (CHE_DDP_NO_TAG);
+
+ MPASS(qp->num_ddp_tags != 0);
+
+ stag_idx = qp->next_ddp_tag;
+ for (;;) {
+ if (qp->open_ddp_tags[stag_idx] == NULL)
+ break;
+ if (stag_idx == qp->num_ddp_tags - 1) {
+ stag_idx = 0;
+ if (qp->ddp_color == CHE_DDP_MAX_COLOR)
+ qp->ddp_color = 0;
+ else
+ qp->ddp_color++;
+ } else
+ stag_idx++;
+ MPASS(stag_idx != qp->next_ddp_tag);
+ }
+ if (stag_idx == qp->num_ddp_tags - 1)
+ qp->next_ddp_tag = 0;
+ else
+ qp->next_ddp_tag = stag_idx + 1;
+
+ qp->active_ddp_tags++;
+ qp->open_ddp_tags[stag_idx] = cb;
+
+ return (CHE_DDP_TAG(stag_idx, qp->ddp_color));
+}
+
+static void
+che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
+ uint16_t stag)
+{
+ MPASS(!CHE_TAG_IS_FL(stag));
+
+ mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+
+ MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
+
+ qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL;
+ qp->active_ddp_tags--;
+}
+
+static uint16_t
+che_alloc_ddp_tag(struct nvmf_che_qpair *qp,
+ struct nvmf_che_command_buffer *cb)
+{
+ uint16_t stag;
+
+ mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+
+ if (!cb->ddp_ok)
+ return (CHE_DDP_NO_TAG);
+
+ stag = che_alloc_ddp_stag(qp, cb);
+ if (stag == CHE_DDP_NO_TAG) {
+ counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag,
+ 1);
+ return (CHE_DDP_NO_TAG);
+ }
+
+ if (!che_alloc_pbl(qp, cb)) {
+ che_free_ddp_stag(qp, cb, stag);
+ counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
+ return (CHE_DDP_NO_TAG);
+ }
+
+ if (!che_write_tpt_entry(qp, cb, stag)) {
+ che_free_pbl(cb);
+ che_free_ddp_stag(qp, cb, stag);
+ counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
+ return (CHE_DDP_NO_TAG);
+ }
+
+ counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1);
+ return (stag);
+}
+
+static void
+che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
+ uint16_t stag)
+{
+ MPASS(!CHE_TAG_IS_FL(stag));
+
+ mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+
+ MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
+
+ che_clear_tpt_entry(qp, stag);
+ che_free_pbl(cb);
+ che_free_ddp_stag(qp, cb, stag);
+}
+
+static void
+nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m)
+{
+ struct epoch_tracker et;
+ struct socket *so = qp->so;
+ struct inpcb *inp = sotoinpcb(so);
+ struct toepcb *toep = qp->toep;
+
+ CURVNET_SET(so->so_vnet);
+ NET_EPOCH_ENTER(et);
+ INP_WLOCK(inp);
+ if (__predict_false(inp->inp_flags & INP_DROPPED) ||
+ __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
+ m_freem(m);
+ } else {
+ mbufq_enqueue(&toep->ulp_pduq, m);
+ t4_push_pdus(toep->vi->adapter, toep, 0);
+ }
+ INP_WUNLOCK(inp);
+ NET_EPOCH_EXIT(et);
+ CURVNET_RESTORE();
+}
+
+static void
+nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei,
+ struct mbuf *rx_pdu, u_int hlen)
+{
+ struct nvme_tcp_term_req_hdr *hdr;
+ struct mbuf *m;
+
+ if (hlen != 0) {
+ hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
+ hlen = min(hlen, m_length(rx_pdu, NULL));
+ }
+
+ m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR);
+ m->m_len = sizeof(*hdr) + hlen;
+ m->m_pkthdr.len = m->m_len;
+ hdr = mtod(m, void *);
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->common.pdu_type = qp->qp.nq_controller ?
+ NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
+ hdr->common.hlen = sizeof(*hdr);
+ hdr->common.plen = sizeof(*hdr) + hlen;
+ hdr->fes = htole16(fes);
+ le32enc(hdr->fei, fei);
+ if (hlen != 0)
+ m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
+
+ nvmf_che_write_pdu(qp, m);
+}
+
+static int
+nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
+{
+ const struct nvme_tcp_common_pdu_hdr *ch;
+ struct mbuf *m = pdu->m;
+ uint32_t data_len, fei, plen, rx_digest;
+ u_int hlen, cpl_error;
+ int error;
*** 2448 LINES SKIPPED ***