git: a15f7c96a276 - main - nvmft: The in-kernel NVMe over Fabrics controller

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 03 May 2024 00:16:09 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=a15f7c96a27644de3ed3bfcf5feee285ebc1cc91

commit a15f7c96a27644de3ed3bfcf5feee285ebc1cc91
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2024-05-02 23:34:45 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2024-05-02 23:38:30 +0000

    nvmft: The in-kernel NVMe over Fabrics controller
    
    This is the server (target in SCSI terms) for NVMe over Fabrics.
    Userland is responsible for accepting a new queue pair and receiving
    the initial Connect command before handing the queue pair off via an
    ioctl to this CTL frontend.
    
    This frontend exposes CTL LUNs as NVMe namespaces to remote hosts.
    Users can add LUNs to CTL that can be shared via either iSCSI or
    NVMeoF.
    
    Reviewed by:    imp
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D44726
---
 share/man/man4/Makefile                     |    1 +
 share/man/man4/nvmft.4                      |   85 ++
 sys/conf/NOTES                              |    2 +
 sys/conf/files                              |    6 +-
 sys/dev/nvmf/controller/ctl_frontend_nvmf.c | 1123 ++++++++++++++++++++++++++
 sys/dev/nvmf/controller/nvmft_controller.c  | 1130 +++++++++++++++++++++++++++
 sys/dev/nvmf/controller/nvmft_qpair.c       |  361 +++++++++
 sys/dev/nvmf/controller/nvmft_var.h         |  174 +++++
 sys/modules/nvmf/Makefile                   |    3 +-
 sys/modules/nvmf/nvmft/Makefile             |   10 +
 10 files changed, 2893 insertions(+), 2 deletions(-)

diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index 7b6f8849be59..32ea3a1b6991 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -410,6 +410,7 @@ MAN=	aac.4 \
 	nvme.4 \
 	nvmf.4 \
 	nvmf_tcp.4 \
+	nvmft.4 \
 	${_nvram.4} \
 	oce.4 \
 	ocs_fc.4\
diff --git a/share/man/man4/nvmft.4 b/share/man/man4/nvmft.4
new file mode 100644
index 000000000000..d121fb97b514
--- /dev/null
+++ b/share/man/man4/nvmft.4
@@ -0,0 +1,85 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMFT 4
+.Os
+.Sh NAME
+.Nm nvmft
+.Nd "NVM Express over Fabrics CAM Target Layer frontend"
+.Sh SYNOPSIS
+To compile the subsystem into the kernel,
+place the following lines in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmft"
+.Cd "device ctl"
+.Ed
+.Pp
+Alternatively, to load the subsystem as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmft_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+controller.
+The NVMeoF controller is the server exporting namespaces backed by
+local files and volumes to remote hosts.
+.Nm
+follows the dynamic controller model and creates a new dynamic controller
+for each association.
+.Pp
+.Nm
+is implemented as a
+.Xr ctl 4
+frontend and exports CAM Target Layer LUNs as namespaces to remote hosts.
+LUNs can be configured via
+.Xr ctladm 8 .
+.Pp
+Associations between the local controller and remote hosts are managed
+using both the
+.Xr nvmfd 8
+daemon and the
+.Xr ctladm 8
+utility.
+The
+.Xr nvmfd 8
+daemon listens for new associations and handles transport-specific
+negotiation before handing off connected queue pairs to
+.Nm
+which associates queue pairs with a suitable controller instance.
+The
+.Cm nvlist
+.Xr ctladm 8
+command lists active controllers.
+The
+.Cm nvterminate
+command terminates one or more associations between a local controller
+and a remote host.
+.Pp
+Associations require a supported transport such as
+.Xr nvmf_tcp 4
+for associations using TCP/IP.
+.Sh SEE ALSO
+.Xr ctl 4 ,
+.Xr nvmf 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr ctladm 8 ,
+.Xr nvmfd 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+subsystem was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index ffb4b43f4efc..5819eeb57b2d 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1677,6 +1677,7 @@ device		mrsas		# LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
 #
 # nvme:	PCI-express NVM Express host controllers
 # nvmf:	NVM Express over Fabrics host
+# nvmft: NVM Express over Fabrics CAM Target Layer frontend
 # nvmf_tcp: TCP transport for NVM Express over Fabrics
 # nda:	CAM NVMe disk driver
 # nvd:	non-CAM NVMe disk driver
@@ -1684,6 +1685,7 @@ device		mrsas		# LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
 device		nvme		# PCI-express NVMe host driver
 options 	NVME_USE_NVD=1	# Use nvd(4) instead of the CAM nda(4) driver
 device		nvmf		# NVMeoF host driver
+device		nvmft		# NVMeoF ctl(4) frontend
 device		nvmf_tcp	# NVMeoF TCP transport
 device		nda		# NVMe direct access devices (aka disks)
 device		nvd		# expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
index b23ec357a302..f68567aa9023 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2535,6 +2535,10 @@ dev/nvme/nvme_test.c		optional nvme
 dev/nvme/nvme_util.c		optional nvme
 dev/nvmem/nvmem.c		optional nvmem fdt
 dev/nvmem/nvmem_if.m		optional nvmem
+dev/nvmf/controller/ctl_frontend_nvmf.c		optional nvmft
+dev/nvmf/controller/nvmft_controller.c		optional nvmft
+dev/nvmf/controller/nvmft_subr.c		optional nvmft
+dev/nvmf/controller/nvmft_qpair.c		optional nvmft
 dev/nvmf/host/nvmf.c		optional nvmf
 dev/nvmf/host/nvmf_aer.c	optional nvmf
 dev/nvmf/host/nvmf_cmd.c	optional nvmf
@@ -2543,7 +2547,7 @@ dev/nvmf/host/nvmf_ns.c		optional nvmf
 dev/nvmf/host/nvmf_qpair.c	optional nvmf
 dev/nvmf/host/nvmf_sim.c	optional nvmf
 dev/nvmf/nvmf_tcp.c		optional nvmf_tcp
-dev/nvmf/nvmf_transport.c	optional nvmf
+dev/nvmf/nvmf_transport.c	optional nvmf | optional nvmft
 dev/oce/oce_hw.c		optional oce pci
 dev/oce/oce_if.c		optional oce pci
 dev/oce/oce_mbox.c		optional oce pci
diff --git a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c
new file mode 100644
index 000000000000..a203bb1c90a6
--- /dev/null
+++ b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c
@@ -0,0 +1,1123 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/dnv.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+
+#include <machine/bus.h>
+#include <machine/bus_dma.h>
+
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/controller/nvmft_subr.h>
+#include <dev/nvmf/controller/nvmft_var.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_error.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_frontend.h>
+
+/*
+ * Store pointers to the capsule and qpair in the two pointer members
+ * of CTL_PRIV_FRONTEND.
+ */
+#define	NVMFT_NC(io)	((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[0])
+#define	NVMFT_QP(io)	((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[1])
+
+static void	nvmft_done(union ctl_io *io);
+static int	nvmft_init(void);
+static int	nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data,
+    int flag, struct thread *td);
+static int	nvmft_shutdown(void);
+
+static TAILQ_HEAD(, nvmft_port) nvmft_ports;
+static struct sx nvmft_ports_lock;
+
+MALLOC_DEFINE(M_NVMFT, "nvmft", "NVMe over Fabrics controller");
+
+static struct ctl_frontend nvmft_frontend = {
+	.name = "nvmf",
+	.init = nvmft_init,
+	.ioctl = nvmft_ioctl,
+	.fe_dump = NULL,
+	.shutdown = nvmft_shutdown,
+};
+
+static void
+nvmft_online(void *arg)
+{
+	struct nvmft_port *np = arg;
+
+	sx_xlock(&np->lock);
+	np->online = true;
+	sx_xunlock(&np->lock);
+}
+
+static void
+nvmft_offline(void *arg)
+{
+	struct nvmft_port *np = arg;
+	struct nvmft_controller *ctrlr;
+
+	sx_xlock(&np->lock);
+	np->online = false;
+
+	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+		nvmft_printf(ctrlr,
+		    "shutting down due to port going offline\n");
+		nvmft_controller_error(ctrlr, NULL, ENODEV);
+	}
+
+	while (!TAILQ_EMPTY(&np->controllers))
+		sx_sleep(np, &np->lock, 0, "nvmfoff", 0);
+	sx_xunlock(&np->lock);
+}
+
+static int
+nvmft_lun_enable(void *arg, int lun_id)
+{
+	struct nvmft_port *np = arg;
+	struct nvmft_controller *ctrlr;
+	uint32_t *old_ns, *new_ns;
+	uint32_t nsid;
+	u_int i;
+
+	if (lun_id >= le32toh(np->cdata.nn)) {
+		printf("NVMFT: %s lun %d larger than maximum nsid %u\n",
+		    np->cdata.subnqn, lun_id, le32toh(np->cdata.nn));
+		return (EOPNOTSUPP);
+	}
+	nsid = lun_id + 1;
+
+	sx_xlock(&np->lock);
+	new_ns = mallocarray(np->num_ns + 1, sizeof(*new_ns), M_NVMFT,
+	    M_WAITOK);
+	for (i = 0; i < np->num_ns; i++) {
+		if (np->active_ns[i] < nsid)
+			continue;
+		if (np->active_ns[i] == nsid) {
+			sx_xunlock(&np->lock);
+			free(new_ns, M_NVMFT);
+			printf("NVMFT: %s duplicate lun %d\n",
+			    np->cdata.subnqn, lun_id);
+			return (EINVAL);
+		}
+		break;
+	}
+
+	/* Copy over IDs smaller than nsid. */
+	memcpy(new_ns, np->active_ns, i * sizeof(*np->active_ns));
+
+	/* Insert nsid. */
+	new_ns[i] = nsid;
+
+	/* Copy over IDs greater than nsid. */
+	memcpy(new_ns + i + 1, np->active_ns + i, (np->num_ns - i) *
+	    sizeof(*np->active_ns));
+
+	np->num_ns++;
+	old_ns = np->active_ns;
+	np->active_ns = new_ns;
+
+	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+		nvmft_controller_lun_changed(ctrlr, lun_id);
+	}
+
+	sx_xunlock(&np->lock);
+	free(old_ns, M_NVMFT);
+
+	return (0);
+}
+
+static int
+nvmft_lun_disable(void *arg, int lun_id)
+{
+	struct nvmft_port *np = arg;
+	struct nvmft_controller *ctrlr;
+	uint32_t nsid;
+	u_int i;
+
+	if (lun_id >= le32toh(np->cdata.nn))
+		return (0);
+	nsid = lun_id + 1;
+
+	sx_xlock(&np->lock);
+	for (i = 0; i < np->num_ns; i++) {
+		if (np->active_ns[i] == nsid)
+			goto found;
+	}
+	sx_xunlock(&np->lock);
+	printf("NVMFT: %s request to disable nonexistent lun %d\n",
+	    np->cdata.subnqn, lun_id);
+	return (EINVAL);
+
+found:
+	/* Move down IDs greater than nsid. */
+	memmove(np->active_ns + i, np->active_ns + i + 1,
+	    (np->num_ns - (i + 1)) * sizeof(*np->active_ns));
+	np->num_ns--;
+
+	/* NB: Don't bother freeing the old active_ns array. */
+
+	TAILQ_FOREACH(ctrlr, &np->controllers, link) {
+		nvmft_controller_lun_changed(ctrlr, lun_id);
+	}
+
+	sx_xunlock(&np->lock);
+
+	return (0);
+}
+
+void
+nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
+    struct nvme_ns_list *nslist)
+{
+	u_int i, count;
+
+	sx_slock(&np->lock);
+	count = 0;
+	for (i = 0; i < np->num_ns; i++) {
+		if (np->active_ns[i] <= nsid)
+			continue;
+		nslist->ns[count] = htole32(np->active_ns[i]);
+		count++;
+		if (count == nitems(nslist->ns))
+			break;
+	}
+	sx_sunlock(&np->lock);
+}
+
+void
+nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
+    bool admin)
+{
+	struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp);
+	const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
+	struct nvmft_port *np = ctrlr->np;
+	union ctl_io *io;
+	int error;
+
+	if (cmd->nsid == htole32(0)) {
+		nvmft_send_generic_error(qp, nc,
+		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+		nvmf_free_capsule(nc);
+		return;
+	}
+
+	mtx_lock(&ctrlr->lock);
+	if (ctrlr->pending_commands == 0)
+		ctrlr->start_busy = sbinuptime();
+	ctrlr->pending_commands++;
+	mtx_unlock(&ctrlr->lock);
+	io = ctl_alloc_io(np->port.ctl_pool_ref);
+	ctl_zero_io(io);
+	NVMFT_NC(io) = nc;
+	NVMFT_QP(io) = qp;
+	io->io_hdr.io_type = admin ? CTL_IO_NVME_ADMIN : CTL_IO_NVME;
+	io->io_hdr.nexus.initid = ctrlr->cntlid;
+	io->io_hdr.nexus.targ_port = np->port.targ_port;
+	io->io_hdr.nexus.targ_lun = le32toh(cmd->nsid) - 1;
+	io->nvmeio.cmd = *cmd;
+	error = ctl_run(io);
+	if (error != 0) {
+		nvmft_printf(ctrlr, "ctl_run failed for command on %s: %d\n",
+		    nvmft_qpair_name(qp), error);
+		ctl_nvme_set_generic_error(&io->nvmeio,
+		    NVME_SC_INTERNAL_DEVICE_ERROR);
+		nvmft_done(io);
+
+		nvmft_controller_error(ctrlr, qp, ENXIO);
+	}
+}
+
+void
+nvmft_terminate_commands(struct nvmft_controller *ctrlr)
+{
+	struct nvmft_port *np = ctrlr->np;
+	union ctl_io *io;
+	int error;
+
+	mtx_lock(&ctrlr->lock);
+	if (ctrlr->pending_commands == 0)
+		ctrlr->start_busy = sbinuptime();
+	ctrlr->pending_commands++;
+	mtx_unlock(&ctrlr->lock);
+	io = ctl_alloc_io(np->port.ctl_pool_ref);
+	ctl_zero_io(io);
+	NVMFT_QP(io) = ctrlr->admin;
+	io->io_hdr.io_type = CTL_IO_TASK;
+	io->io_hdr.nexus.initid = ctrlr->cntlid;
+	io->io_hdr.nexus.targ_port = np->port.targ_port;
+	io->io_hdr.nexus.targ_lun = 0;
+	io->taskio.tag_type = CTL_TAG_SIMPLE; /* XXX: unused? */
+	io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET;
+	error = ctl_run(io);
+	if (error != CTL_RETVAL_COMPLETE) {
+		nvmft_printf(ctrlr, "failed to terminate tasks: %d\n", error);
+#ifdef INVARIANTS
+		io->io_hdr.status = CTL_SUCCESS;
+#endif
+		nvmft_done(io);
+	}
+}
+
+static void
+nvmft_datamove_out_cb(void *arg, size_t xfered, int error)
+{
+	struct ctl_nvmeio *ctnio = arg;
+
+	if (error != 0) {
+		ctl_nvme_set_data_transfer_error(ctnio);
+	} else {
+		MPASS(xfered == ctnio->kern_data_len);
+		ctnio->kern_data_resid -= xfered;
+	}
+
+	if (ctnio->kern_sg_entries) {
+		free(ctnio->ext_data_ptr, M_NVMFT);
+		ctnio->ext_data_ptr = NULL;
+	} else
+		MPASS(ctnio->ext_data_ptr == NULL);
+	ctl_datamove_done((union ctl_io *)ctnio, false);
+}
+
+static void
+nvmft_datamove_out(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
+    struct nvmf_capsule *nc)
+{
+	struct memdesc mem;
+	int error;
+
+	MPASS(ctnio->ext_data_ptr == NULL);
+	if (ctnio->kern_sg_entries > 0) {
+		struct ctl_sg_entry *sgl;
+		struct bus_dma_segment *vlist;
+
+		vlist = mallocarray(ctnio->kern_sg_entries, sizeof(*vlist),
+		    M_NVMFT, M_WAITOK);
+		ctnio->ext_data_ptr = (void *)vlist;
+		sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+		for (u_int i = 0; i < ctnio->kern_sg_entries; i++) {
+			vlist[i].ds_addr = (uintptr_t)sgl[i].addr;
+			vlist[i].ds_len = sgl[i].len;
+		}
+		mem = memdesc_vlist(vlist, ctnio->kern_sg_entries);
+	} else
+		mem = memdesc_vaddr(ctnio->kern_data_ptr, ctnio->kern_data_len);
+
+	error = nvmf_receive_controller_data(nc, ctnio->kern_rel_offset, &mem,
+	    ctnio->kern_data_len, nvmft_datamove_out_cb, ctnio);
+	if (error == 0)
+		return;
+
+	nvmft_printf(nvmft_qpair_ctrlr(qp),
+	    "Failed to request capsule data: %d\n", error);
+	ctl_nvme_set_data_transfer_error(ctnio);
+
+	if (ctnio->kern_sg_entries) {
+		free(ctnio->ext_data_ptr, M_NVMFT);
+		ctnio->ext_data_ptr = NULL;
+	} else
+		MPASS(ctnio->ext_data_ptr == NULL);
+	ctl_datamove_done((union ctl_io *)ctnio, true);
+}
+
+static struct mbuf *
+nvmft_copy_data(struct ctl_nvmeio *ctnio)
+{
+	struct ctl_sg_entry *sgl;
+	struct mbuf *m0, *m;
+	uint32_t resid, off, todo;
+	int mlen;
+
+	MPASS(ctnio->kern_data_len != 0);
+
+	m0 = m_getm2(NULL, ctnio->kern_data_len, M_WAITOK, MT_DATA, 0);
+
+	if (ctnio->kern_sg_entries == 0) {
+		m_copyback(m0, 0, ctnio->kern_data_len, ctnio->kern_data_ptr);
+		return (m0);
+	}
+
+	resid = ctnio->kern_data_len;
+	sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+	off = 0;
+	m = m0;
+	mlen = M_TRAILINGSPACE(m);
+	for (;;) {
+		todo = MIN(mlen, sgl->len - off);
+		memcpy(mtod(m, char *) + m->m_len, (char *)sgl->addr + off,
+		    todo);
+		m->m_len += todo;
+		resid -= todo;
+		if (resid == 0) {
+			MPASS(m->m_next == NULL);
+			break;
+		}
+
+		off += todo;
+		if (off == sgl->len) {
+			sgl++;
+			off = 0;
+		}
+		mlen -= todo;
+		if (mlen == 0) {
+			m = m->m_next;
+			mlen = M_TRAILINGSPACE(m);
+		}
+	}
+
+	return (m0);
+}
+
+static void
+m_free_ref_data(struct mbuf *m)
+{
+	ctl_ref kern_data_ref = m->m_ext.ext_arg1;
+
+	kern_data_ref(m->m_ext.ext_arg2, -1);
+}
+
+static struct mbuf *
+m_get_ref_data(struct ctl_nvmeio *ctnio, void *buf, u_int size)
+{
+	struct mbuf *m;
+
+	m = m_get(M_WAITOK, MT_DATA);
+	m_extadd(m, buf, size, m_free_ref_data, ctnio->kern_data_ref,
+	    ctnio->kern_data_arg, M_RDONLY, EXT_CTL);
+	m->m_len = size;
+	ctnio->kern_data_ref(ctnio->kern_data_arg, 1);
+	return (m);
+}
+
+static struct mbuf *
+nvmft_ref_data(struct ctl_nvmeio *ctnio)
+{
+	struct ctl_sg_entry *sgl;
+	struct mbuf *m0, *m;
+
+	MPASS(ctnio->kern_data_len != 0);
+
+	if (ctnio->kern_sg_entries == 0)
+		return (m_get_ref_data(ctnio, ctnio->kern_data_ptr,
+		    ctnio->kern_data_len));
+
+	sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr;
+	m0 = m_get_ref_data(ctnio, sgl[0].addr, sgl[0].len);
+	m = m0;
+	for (u_int i = 1; i < ctnio->kern_sg_entries; i++) {
+		m->m_next = m_get_ref_data(ctnio, sgl[i].addr, sgl[i].len);
+		m = m->m_next;
+	}
+	return (m0);
+}
+
+static void
+nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp,
+    struct nvmf_capsule *nc)
+{
+	struct mbuf *m;
+	u_int status;
+
+	if (ctnio->kern_data_ref != NULL)
+		m = nvmft_ref_data(ctnio);
+	else
+		m = nvmft_copy_data(ctnio);
+	status = nvmf_send_controller_data(nc, ctnio->kern_rel_offset, m,
+	    ctnio->kern_data_len);
+	switch (status) {
+	case NVMF_SUCCESS_SENT:
+		ctnio->success_sent = true;
+		nvmft_command_completed(qp, nc);
+		/* FALLTHROUGH */
+	case NVMF_MORE:
+	case NVME_SC_SUCCESS:
+		break;
+	default:
+		ctl_nvme_set_generic_error(ctnio, status);
+		break;
+	}
+	ctl_datamove_done((union ctl_io *)ctnio, true);
+}
+
+static void
+nvmft_datamove(union ctl_io *io)
+{
+	struct nvmf_capsule *nc;
+	struct nvmft_qpair *qp;
+
+	/* Some CTL commands preemptively set a success status. */
+	MPASS(io->io_hdr.status == CTL_STATUS_NONE ||
+	    io->io_hdr.status == CTL_SUCCESS);
+	MPASS(!io->nvmeio.success_sent);
+
+	nc = NVMFT_NC(io);
+	qp = NVMFT_QP(io);
+
+	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN)
+		nvmft_datamove_in(&io->nvmeio, qp, nc);
+	else
+		nvmft_datamove_out(&io->nvmeio, qp, nc);
+}
+
+static void
+hip_add(uint64_t pair[2], uint64_t addend)
+{
+	uint64_t old, new;
+
+	old = le64toh(pair[0]);
+	new = old + addend;
+	pair[0] = htole64(new);
+	if (new < old)
+		pair[1] += htole64(1);
+}
+
+static void
+nvmft_done(union ctl_io *io)
+{
+	struct nvmft_controller *ctrlr;
+	const struct nvme_command *cmd;
+	struct nvmft_qpair *qp;
+	struct nvmf_capsule *nc;
+	size_t len;
+
+	KASSERT(io->io_hdr.status == CTL_SUCCESS ||
+	    io->io_hdr.status == CTL_NVME_ERROR,
+	    ("%s: bad status %u", __func__, io->io_hdr.status));
+
+	nc = NVMFT_NC(io);
+	qp = NVMFT_QP(io);
+	ctrlr = nvmft_qpair_ctrlr(qp);
+
+	if (nc == NULL) {
+		/* Completion of nvmft_terminate_commands. */
+		goto end;
+	}
+
+	cmd = nvmf_capsule_sqe(nc);
+
+	if (io->io_hdr.status == CTL_SUCCESS)
+		len = nvmf_capsule_data_len(nc) / 512;
+	else
+		len = 0;
+	switch (cmd->opc) {
+	case NVME_OPC_WRITE:
+		mtx_lock(&ctrlr->lock);
+		hip_add(ctrlr->hip.host_write_commands, 1);
+		len += ctrlr->partial_duw;
+		if (len > 1000)
+			hip_add(ctrlr->hip.data_units_written, len / 1000);
+		ctrlr->partial_duw = len % 1000;
+		mtx_unlock(&ctrlr->lock);
+		break;
+	case NVME_OPC_READ:
+	case NVME_OPC_COMPARE:
+	case NVME_OPC_VERIFY:
+		mtx_lock(&ctrlr->lock);
+		if (cmd->opc != NVME_OPC_VERIFY)
+			hip_add(ctrlr->hip.host_read_commands, 1);
+		len += ctrlr->partial_dur;
+		if (len > 1000)
+			hip_add(ctrlr->hip.data_units_read, len / 1000);
+		ctrlr->partial_dur = len % 1000;
+		mtx_unlock(&ctrlr->lock);
+		break;
+	}
+
+	if (io->nvmeio.success_sent) {
+		MPASS(io->io_hdr.status == CTL_SUCCESS);
+	} else {
+		io->nvmeio.cpl.cid = cmd->cid;
+		nvmft_send_response(qp, &io->nvmeio.cpl);
+	}
+	nvmf_free_capsule(nc);
+end:
+	ctl_free_io(io);
+	mtx_lock(&ctrlr->lock);
+	ctrlr->pending_commands--;
+	if (ctrlr->pending_commands == 0)
+		ctrlr->busy_total += sbinuptime() - ctrlr->start_busy;
+	mtx_unlock(&ctrlr->lock);
+}
+
+static int
+nvmft_init(void)
+{
+	TAILQ_INIT(&nvmft_ports);
+	sx_init(&nvmft_ports_lock, "nvmft ports");
+	return (0);
+}
+
+void
+nvmft_port_free(struct nvmft_port *np)
+{
+	KASSERT(TAILQ_EMPTY(&np->controllers),
+	    ("%s(%p): active controllers", __func__, np));
+
+	if (np->port.targ_port != -1) {
+		if (ctl_port_deregister(&np->port) != 0)
+			printf("%s: ctl_port_deregister() failed\n", __func__);
+	}
+
+	free(np->active_ns, M_NVMFT);
+	clean_unrhdr(np->ids);
+	delete_unrhdr(np->ids);
+	sx_destroy(&np->lock);
+	free(np, M_NVMFT);
+}
+
+static struct nvmft_port *
+nvmft_port_find(const char *subnqn)
+{
+	struct nvmft_port *np;
+
+	KASSERT(nvmf_nqn_valid(subnqn), ("%s: invalid nqn", __func__));
+
+	sx_assert(&nvmft_ports_lock, SA_LOCKED);
+	TAILQ_FOREACH(np, &nvmft_ports, link) {
+		if (strcmp(np->cdata.subnqn, subnqn) == 0)
+			break;
+	}
+	return (np);
+}
+
+static struct nvmft_port *
+nvmft_port_find_by_id(int port_id)
+{
+	struct nvmft_port *np;
+
+	sx_assert(&nvmft_ports_lock, SA_LOCKED);
+	TAILQ_FOREACH(np, &nvmft_ports, link) {
+		if (np->port.targ_port == port_id)
+			break;
+	}
+	return (np);
+}
+
+/*
+ * Helper function to fetch a number stored as a string in an nv_list.
+ * Returns false if the string was not a valid number.
+ */
+static bool
+dnvlist_get_strnum(nvlist_t *nvl, const char *name, u_long default_value,
+	u_long *value)
+{
+	const char *str;
+	char *cp;
+
+	str = dnvlist_get_string(nvl, name, NULL);
+	if (str == NULL) {
+		*value = default_value;
+		return (true);
+	}
+	if (*str == '\0')
+		return (false);
+	*value = strtoul(str, &cp, 0);
+	if (*cp != '\0')
+		return (false);
+	return (true);
+}
+
+/*
+ * NVMeoF ports support the following parameters:
+ *
+ * Mandatory:
+ *
+ * subnqn: subsystem NVMe Qualified Name
+ * portid: integer port ID from Discovery Log Page entry
+ *
+ * Optional:
+ * serial: Serial Number string
+ * max_io_qsize: Maximum number of I/O queue entries
+ * enable_timeout: Timeout for controller enable in milliseconds
+ * ioccsz: Maximum command capsule size
+ * iorcsz: Maximum response capsule size
+ * nn: Number of namespaces
+ */
+static void
+nvmft_port_create(struct ctl_req *req)
+{
+	struct nvmft_port *np;
+	struct ctl_port *port;
+	const char *serial, *subnqn;
+	char serial_buf[NVME_SERIAL_NUMBER_LENGTH];
+	u_long enable_timeout, hostid, ioccsz, iorcsz, max_io_qsize, nn, portid;
+	int error;
+
+	/* Required parameters. */
+	subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL);
+	if (subnqn == NULL || !nvlist_exists_string(req->args_nvl, "portid")) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "Missing required argument");
+		return;
+	}
+	if (!nvmf_nqn_valid(subnqn)) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "Invalid SubNQN");
+		return;
+	}
+	if (!dnvlist_get_strnum(req->args_nvl, "portid", UINT16_MAX, &portid) ||
+	    portid > UINT16_MAX) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "Invalid port ID");
+		return;
+	}
+
+	/* Optional parameters. */
+	if (!dnvlist_get_strnum(req->args_nvl, "max_io_qsize",
+	    NVMF_MAX_IO_ENTRIES, &max_io_qsize) ||
+	    max_io_qsize < NVME_MIN_IO_ENTRIES ||
+	    max_io_qsize > NVME_MAX_IO_ENTRIES) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "Invalid maximum I/O queue size");
+		return;
+	}
+
+	if (!dnvlist_get_strnum(req->args_nvl, "enable_timeout",
+	    NVMF_CC_EN_TIMEOUT * 500, &enable_timeout) ||
+	    (enable_timeout % 500) != 0 || (enable_timeout / 500) > 255) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "Invalid enable timeout");
+		return;
+	}
+
+	if (!dnvlist_get_strnum(req->args_nvl, "ioccsz", NVMF_IOCCSZ,
+	    &ioccsz) || ioccsz < sizeof(struct nvme_command) ||
+	    (ioccsz % 16) != 0) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "Invalid Command Capsule size");
+		return;
+	}
+
+	if (!dnvlist_get_strnum(req->args_nvl, "iorcsz", NVMF_IORCSZ,
+	    &iorcsz) || iorcsz < sizeof(struct nvme_completion) ||
+	    (iorcsz % 16) != 0) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "Invalid Response Capsule size");
+		return;
+	}
+
+	if (!dnvlist_get_strnum(req->args_nvl, "nn", NVMF_NN, &nn) ||
+	    nn < 1 || nn > UINT32_MAX) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "Invalid number of namespaces");
+		return;
+	}
+
+	serial = dnvlist_get_string(req->args_nvl, "serial", NULL);
+	if (serial == NULL) {
+		getcredhostid(curthread->td_ucred, &hostid);
+		nvmf_controller_serial(serial_buf, sizeof(serial_buf), hostid);
+		serial = serial_buf;
+	}
+
+	sx_xlock(&nvmft_ports_lock);
+
+	np = nvmft_port_find(subnqn);
+	if (np != NULL) {
+		req->status = CTL_LUN_ERROR;
+		snprintf(req->error_str, sizeof(req->error_str),
+		    "SubNQN \"%s\" already exists", subnqn);
+		sx_xunlock(&nvmft_ports_lock);
+		return;
+	}
+
+	np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO);
+	refcount_init(&np->refs, 1);
+	np->max_io_qsize = max_io_qsize;
+	np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500);
+	sx_init(&np->lock, "nvmft port");
+	np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1,
+	    NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX);
+	TAILQ_INIT(&np->controllers);
+
+	/* The controller ID is set later for individual controllers. */
+	_nvmf_init_io_controller_data(0, max_io_qsize, serial, ostype,
+	    osrelease, subnqn, nn, ioccsz, iorcsz, &np->cdata);
+	np->cdata.aerl = NVMFT_NUM_AER - 1;
+	np->cdata.oaes = htole32(NVME_ASYNC_EVENT_NS_ATTRIBUTE);
+	np->cdata.oncs = htole16(NVMEF(NVME_CTRLR_DATA_ONCS_VERIFY, 1) |
+	    NVMEF(NVME_CTRLR_DATA_ONCS_WRZERO, 1) |
+	    NVMEF(NVME_CTRLR_DATA_ONCS_DSM, 1) |
+	    NVMEF(NVME_CTRLR_DATA_ONCS_COMPARE, 1));
+	np->cdata.fuses = NVMEF(NVME_CTRLR_DATA_FUSES_CNW, 1);
+
+	np->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
+	memcpy(np->fp.revision[0], np->cdata.fr, sizeof(np->cdata.fr));
+
+	port = &np->port;
+
+	port->frontend = &nvmft_frontend;
+	port->port_type = CTL_PORT_NVMF;
+	port->num_requested_ctl_io = max_io_qsize;
+	port->port_name = "nvmf";
+	port->physical_port = portid;
+	port->virtual_port = 0;
+	port->port_online = nvmft_online;
+	port->port_offline = nvmft_offline;
+	port->onoff_arg = np;
+	port->lun_enable = nvmft_lun_enable;
+	port->lun_disable = nvmft_lun_disable;
+	port->targ_lun_arg = np;
+	port->fe_datamove = nvmft_datamove;
+	port->fe_done = nvmft_done;
+	port->targ_port = -1;
+	port->options = nvlist_clone(req->args_nvl);
+
+	error = ctl_port_register(port);
+	if (error != 0) {
+		sx_xunlock(&nvmft_ports_lock);
+		nvlist_destroy(port->options);
*** 2038 LINES SKIPPED ***