git: a1eda74167b5 - main - nvmf: The in-kernel NVMe over Fabrics host
Date: Fri, 03 May 2024 00:15:49 UTC
The branch main has been updated by jhb:
URL: https://cgit.FreeBSD.org/src/commit/?id=a1eda74167b5edb99fd31d507d8a3f7d7e14ae2b
commit a1eda74167b5edb99fd31d507d8a3f7d7e14ae2b
Author: John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2024-05-02 23:29:37 +0000
Commit: John Baldwin <jhb@FreeBSD.org>
CommitDate: 2024-05-02 23:29:37 +0000
nvmf: The in-kernel NVMe over Fabrics host
This is the client (initiator in SCSI terms) for NVMe over Fabrics.
Userland is responsible for creating a set of queue pairs and then
handing them off via an ioctl to this driver, e.g. via the 'connect'
command from nvmecontrol(8). An nvmeX new-bus device is created
at the top level to represent the remote controller, similar to the
PCI nvmeX devices for PCI-express controllers.
As with nvme(4), namespace devices named /dev/nvmeXnsY are created, and
pass-through commands can be submitted to either the namespace devices
or the controller device. For example, 'nvmecontrol identify nvmeX'
works for a remote Fabrics controller the same as for a PCI-express
controller.
nvmf exports remote namespaces via nda(4) devices using the new NVMF
CAM transport. nvmf does not support nvd(4), only nda(4).
Sponsored by: Chelsio Communications
Differential Revision: https://reviews.freebsd.org/D44714
---
 share/man/man4/Makefile         |   1 +
 share/man/man4/nvmf.4           |  87 ++++
 sys/conf/NOTES                  |   4 +-
 sys/conf/files                  |   8 +
 sys/dev/nvmf/host/nvmf.c        | 939 ++++++++++++++++++++++++++++++++++++++++
 sys/dev/nvmf/host/nvmf_aer.c    | 290 +++++++++++++
 sys/dev/nvmf/host/nvmf_cmd.c    | 171 ++++++++
 sys/dev/nvmf/host/nvmf_ctldev.c | 159 +++++++
 sys/dev/nvmf/host/nvmf_ns.c     | 483 +++++++++++++++++++++
 sys/dev/nvmf/host/nvmf_qpair.c  | 386 +++++++++++++++++
 sys/dev/nvmf/host/nvmf_sim.c    | 332 ++++++++++++++
 sys/dev/nvmf/host/nvmf_var.h    | 208 +++++++++
 sys/modules/nvmf/Makefile       |   3 +-
 sys/modules/nvmf/nvmf/Makefile  |  13 +
 14 files changed, 3082 insertions(+), 2 deletions(-)
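
For illustration, the userland half of the handoff described in the
commit message looks roughly like the sketch below. nvmecontrol(8)'s
'connect' command creates the queue-pair sockets and completes the
Fabrics CONNECT exchange before handing the result to the kernel. The
control-device path and ioctl request name are assumptions made for
this sketch (the authoritative definitions live in sys/dev/nvmf/nvmf.h
and nvmf_ctldev.c below); only fields actually consumed by
nvmf_init_ivars() are referenced.

/*
 * Hypothetical userland sketch of the queue-pair handoff; not part
 * of the commit.  "/dev/nvmf" and NVMF_HANDOFF_HOST are assumed names.
 */
#include <sys/ioctl.h>
#include <dev/nvmf/nvmf.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

static void
handoff_to_kernel(struct nvmf_handoff_host *hh)
{
        int fd;

        /*
         * hh->trtype, hh->kato, hh->admin, hh->num_io_queues, hh->io,
         * and hh->cdata were populated while building the association.
         */
        fd = open("/dev/nvmf", O_RDWR);
        if (fd == -1)
                err(1, "open(/dev/nvmf)");
        if (ioctl(fd, NVMF_HANDOFF_HOST, hh) == -1)
                err(1, "NVMF_HANDOFF_HOST");
        close(fd);
}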
diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index aab55d9b90b5..7b6f8849be59 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -408,6 +408,7 @@ MAN= aac.4 \
nvd.4 \
${_nvdimm.4} \
nvme.4 \
+ nvmf.4 \
nvmf_tcp.4 \
${_nvram.4} \
oce.4 \
diff --git a/share/man/man4/nvmf.4 b/share/man/man4/nvmf.4
new file mode 100644
index 000000000000..8afbb4d9daaf
--- /dev/null
+++ b/share/man/man4/nvmf.4
@@ -0,0 +1,87 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMF 4
+.Os
+.Sh NAME
+.Nm nvmf
+.Nd "NVM Express over Fabrics host driver"
+.Sh SYNOPSIS
+To compile the driver into the kernel,
+place the following line in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmf"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmf_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+host.
+The NVMeoF host is the client which provides local access to
+namespaces exported by a remote controller.
+.Pp
+Associations between the local host and remote controllers are managed
+using
+.Xr nvmecontrol 8 .
+New associations are created via the
+.Cm connect
+command and destroyed via the
+.Cm disconnect
+command.
+If an association's connection is interrupted,
+the
+.Cm reconnect
+command creates a new association to replace the interrupted one.
+.Pp
+Similar to
+.Xr nvme 4 ,
+.Nm
+creates controller device nodes using the format
+.Pa /dev/nvmeX
+and namespace device nodes using the format
+.Pa /dev/nvmeXnsY .
+.Nm
+also exports remote namespaces via the CAM
+.Xr nda 4
+peripheral driver.
+Unlike
+.Xr nvme 4 ,
+.Nm
+does not support the
+.Xr nvd 4
+disk driver.
+.Pp
+Associations require a supported transport such as
+.Xr nvmf_tcp 4
+for associations using TCP/IP.
+.Sh SEE ALSO
+.Xr nda 4 ,
+.Xr nvme 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr nvmft 4 ,
+.Xr nvmecontrol 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+driver was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
+.Sh BUGS
+.Nm
+only supports a single I/O queue pair per association.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 1f52af4c99d8..ffb4b43f4efc 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1676,12 +1676,14 @@ device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
# NVM Express
#
# nvme: PCI-express NVM Express host controllers
+# nvmf: NVM Express over Fabrics host
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
-device nvme # base NVMe driver
+device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
+device nvmf # NVMeoF host driver
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
index 143814301c20..4a631d979c78 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2533,7 +2533,15 @@ dev/nvme/nvme_test.c optional nvme
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
+dev/nvmf/host/nvmf.c optional nvmf
+dev/nvmf/host/nvmf_aer.c optional nvmf
+dev/nvmf/host/nvmf_cmd.c optional nvmf
+dev/nvmf/host/nvmf_ctldev.c optional nvmf
+dev/nvmf/host/nvmf_ns.c optional nvmf
+dev/nvmf/host/nvmf_qpair.c optional nvmf
+dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
+dev/nvmf/nvmf_transport.c optional nvmf
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
new file mode 100644
index 000000000000..0902bc78a7b5
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -0,0 +1,939 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+static struct cdevsw nvmf_cdevsw;
+
+MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
+
+static void nvmf_disconnect_task(void *arg, int pending);
+
+void
+nvmf_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_completion_status *status = arg;
+ struct mtx *mtx;
+
+ status->cqe = *cqe;
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ status->done = true;
+ mtx_unlock(mtx);
+ wakeup(status);
+}
+
+void
+nvmf_io_complete(void *arg, size_t xfered, int error)
+{
+ struct nvmf_completion_status *status = arg;
+ struct mtx *mtx;
+
+ status->io_error = error;
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ status->io_done = true;
+ mtx_unlock(mtx);
+ wakeup(status);
+}
+
+void
+nvmf_wait_for_reply(struct nvmf_completion_status *status)
+{
+ struct mtx *mtx;
+
+ mtx = mtx_pool_find(mtxpool_sleep, status);
+ mtx_lock(mtx);
+ while (!status->done || !status->io_done)
+ mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
+ mtx_unlock(mtx);
+}
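
The three helpers above implement the driver's synchronous-wait
pattern: rather than embedding and initializing a dedicated mutex in
every short-lived nvmf_completion_status, they borrow a sleep mutex
from the global pool, keyed on the status object's address. A minimal
self-contained sketch of the same pattern (illustrative only, not part
of the commit):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct waiter {
        bool    done;
};

/* Completion context: mark done and wake any sleeper. */
static void
waiter_signal(struct waiter *w)
{
        struct mtx *mtx = mtx_pool_find(mtxpool_sleep, w);

        mtx_lock(mtx);
        w->done = true;
        mtx_unlock(mtx);
        wakeup(w);
}

/* Waiting thread: holding the pool mutex across the check and the
 * sleep closes the lost-wakeup race with waiter_signal(). */
static void
waiter_wait(struct waiter *w)
{
        struct mtx *mtx = mtx_pool_find(mtxpool_sleep, w);

        mtx_lock(mtx);
        while (!w->done)
                mtx_sleep(w, mtx, 0, "waiter", 0);
        mtx_unlock(mtx);
}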
+
+static int
+nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ uint64_t *value)
+{
+ const struct nvmf_fabric_prop_get_rsp *rsp;
+ struct nvmf_completion_status status;
+
+ nvmf_status_init(&status);
+ if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
+ M_WAITOK))
+ return (ECONNABORTED);
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (EIO);
+ }
+
+ rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
+ if (size == 8)
+ *value = le64toh(rsp->value.u64);
+ else
+ *value = le32toh(rsp->value.u32.low);
+ return (0);
+}
+
+static int
+nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+ uint64_t value)
+{
+ struct nvmf_completion_status status;
+
+ nvmf_status_init(&status);
+ if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
+ M_WAITOK))
+ return (ECONNABORTED);
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (EIO);
+ }
+ return (0);
+}
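
On Fabrics there is no memory-mapped register window, so the
controller properties a PCIe host reads as registers (CAP, VS, CC,
CSTS) are reached through the Fabrics PROPERTY_GET and PROPERTY_SET
commands instead. The NVMe-oF specification defines only 4- and 8-byte
property accesses, which is why the two helpers above take a size of 4
or 8 and decode either value.u32.low or value.u64 from the response
capsule.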
+
+static void
+nvmf_shutdown_controller(struct nvmf_softc *sc)
+{
+ uint64_t cc;
+ int error;
+
+ error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
+ if (error != 0) {
+ device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
+ return;
+ }
+
+ cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
+
+ error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
+ if (error != 0)
+ device_printf(sc->dev,
+ "Failed to set CC to trigger shutdown\n");
+}
+
+static void
+nvmf_check_keep_alive(void *arg)
+{
+ struct nvmf_softc *sc = arg;
+ int traffic;
+
+ traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
+ if (traffic == 0) {
+ device_printf(sc->dev,
+ "disconnecting due to KeepAlive timeout\n");
+ nvmf_disconnect(sc);
+ return;
+ }
+
+ callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
+}
+
+static void
+nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
+{
+ struct nvmf_softc *sc = arg;
+
+ atomic_store_int(&sc->ka_active_rx_traffic, 1);
+ if (cqe->status != 0) {
+ device_printf(sc->dev,
+ "KeepAlive response reported status %#x\n",
+ le16toh(cqe->status));
+ }
+}
+
+static void
+nvmf_send_keep_alive(void *arg)
+{
+ struct nvmf_softc *sc = arg;
+ int traffic;
+
+ /*
+ * Don't bother sending a KeepAlive command if TKAS is active
+ * and another command has been sent during the interval.
+ */
+ traffic = atomic_load_int(&sc->ka_active_tx_traffic);
+ if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
+ sc, M_NOWAIT))
+ device_printf(sc->dev,
+ "Failed to allocate KeepAlive command\n");
+
+ /* Clear ka_active_tx_traffic after sending the keep alive command. */
+ atomic_store_int(&sc->ka_active_tx_traffic, 0);
+
+ callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
+}
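
These two callouts split the Keep Alive Timeout (KATO) negotiated at
connect time: the transmit timer fires at KATO/2 and sends an explicit
KeepAlive command only when no other command went out during the
interval (the traffic-based keep alive, TBKAS, optimization advertised
via CTRATT), while the receive timer fires at the full KATO and tears
down the association if nothing at all arrived from the controller.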
+
+int
+nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
+{
+ size_t len;
+ u_int i;
+ int error;
+
+ memset(ivars, 0, sizeof(*ivars));
+
+ if (!hh->admin.admin || hh->num_io_queues < 1)
+ return (EINVAL);
+
+ ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
+ error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
+ if (error != 0)
+ goto out;
+ nvme_controller_data_swapbytes(ivars->cdata);
+
+ len = hh->num_io_queues * sizeof(*ivars->io_params);
+ ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
+ error = copyin(hh->io, ivars->io_params, len);
+ if (error != 0)
+ goto out;
+ for (i = 0; i < hh->num_io_queues; i++) {
+ if (ivars->io_params[i].admin) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* Require all I/O queues to be the same size. */
+ if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ ivars->hh = hh;
+ return (0);
+
+out:
+ free(ivars->io_params, M_NVMF);
+ free(ivars->cdata, M_NVMF);
+ return (error);
+}
+
+void
+nvmf_free_ivars(struct nvmf_ivars *ivars)
+{
+ free(ivars->io_params, M_NVMF);
+ free(ivars->cdata, M_NVMF);
+}
+
+static int
+nvmf_probe(device_t dev)
+{
+ struct nvmf_ivars *ivars = device_get_ivars(dev);
+ char desc[260];
+
+ if (ivars == NULL)
+ return (ENXIO);
+
+ snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
+ device_set_desc_copy(dev, desc);
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
+{
+ char name[16];
+
+ /* Setup the admin queue. */
+ sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
+ "admin queue");
+ if (sc->admin == NULL) {
+ device_printf(sc->dev, "Failed to setup admin queue\n");
+ return (ENXIO);
+ }
+
+ /* Setup I/O queues. */
+ sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
+ M_WAITOK | M_ZERO);
+ sc->num_io_queues = ivars->hh->num_io_queues;
+ for (u_int i = 0; i < sc->num_io_queues; i++) {
+ snprintf(name, sizeof(name), "I/O queue %u", i);
+ sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
+ &ivars->io_params[i], name);
+ if (sc->io[i] == NULL) {
+ device_printf(sc->dev, "Failed to setup I/O queue %u\n",
+ i + 1);
+ return (ENXIO);
+ }
+ }
+
+ /* Start KeepAlive timers. */
+ if (ivars->hh->kato != 0) {
+ sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
+ sc->cdata->ctratt) != 0;
+ sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
+ sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
+ callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
+ nvmf_check_keep_alive, sc, C_HARDCLOCK);
+ callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
+ nvmf_send_keep_alive, sc, C_HARDCLOCK);
+ }
+
+ return (0);
+}
+
+static bool
+nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
+ struct nvme_namespace_data *data, uint32_t *nsidp)
+{
+ struct nvmf_completion_status status;
+ uint32_t nsid;
+
+ nvmf_status_init(&status);
+ nvmf_status_wait_io(&status);
+ if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
+ nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
+ device_printf(sc->dev,
+ "failed to send IDENTIFY active namespaces command\n");
+ return (false);
+ }
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY active namespaces failed, status %#x\n",
+ le16toh(status.cqe.status));
+ return (false);
+ }
+
+ if (status.io_error != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY active namespaces failed with I/O error %d\n",
+ status.io_error);
+ return (false);
+ }
+
+ for (u_int i = 0; i < nitems(nslist->ns); i++) {
+ nsid = nslist->ns[i];
+ if (nsid == 0) {
+ *nsidp = 0;
+ return (true);
+ }
+
+ if (sc->ns[nsid - 1] != NULL) {
+ device_printf(sc->dev,
+ "duplicate namespace %u in active namespace list\n",
+ nsid);
+ return (false);
+ }
+
+ nvmf_status_init(&status);
+ nvmf_status_wait_io(&status);
+ if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+ &status, nvmf_io_complete, &status, M_WAITOK)) {
+ device_printf(sc->dev,
+ "failed to send IDENTIFY namespace %u command\n",
+ nsid);
+ return (false);
+ }
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY namespace %u failed, status %#x\n", nsid,
+ le16toh(status.cqe.status));
+ return (false);
+ }
+
+ if (status.io_error != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY namespace %u failed with I/O error %d\n",
+ nsid, status.io_error);
+ return (false);
+ }
+
+ /*
+ * As in nvme_ns_construct, a size of zero indicates an
+ * invalid namespace.
+ */
+ nvme_namespace_data_swapbytes(data);
+ if (data->nsze == 0) {
+ device_printf(sc->dev,
+ "ignoring active namespace %u with zero size\n",
+ nsid);
+ continue;
+ }
+
+ sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+
+ nvmf_sim_rescan_ns(sc, nsid);
+ }
+
+ MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
+
+ if (nsid >= 0xfffffffd)
+ *nsidp = 0;
+ else
+ *nsidp = nsid + 1;
+ return (true);
+}
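
Each pass of this function retrieves one page of up to 1024 active
NSIDs greater than the cursor passed in via *nsidp, so the caller
below loops, resuming from the last NSID seen plus one, until a page
ends in a zero entry. The 0xfffffffd bound stops the walk before the
cursor could reach 0xFFFFFFFE and 0xFFFFFFFF, which the NVMe
specification reserves (the latter is the broadcast NSID).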
+
+static bool
+nvmf_add_namespaces(struct nvmf_softc *sc)
+{
+ struct nvme_namespace_data *data;
+ struct nvme_ns_list *nslist;
+ uint32_t nsid;
+ bool retval;
+
+ sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
+ M_WAITOK | M_ZERO);
+ nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
+ data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
+
+ nsid = 0;
+ retval = true;
+ for (;;) {
+ if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) {
+ retval = false;
+ break;
+ }
+ if (nsid == 0)
+ break;
+ }
+
+ free(data, M_NVMF);
+ free(nslist, M_NVMF);
+ return (retval);
+}
+
+static int
+nvmf_attach(device_t dev)
+{
+ struct make_dev_args mda;
+ struct nvmf_softc *sc = device_get_softc(dev);
+ struct nvmf_ivars *ivars = device_get_ivars(dev);
+ uint64_t val;
+ u_int i;
+ int error;
+
+ if (ivars == NULL)
+ return (ENXIO);
+
+ sc->dev = dev;
+ sc->trtype = ivars->hh->trtype;
+ callout_init(&sc->ka_rx_timer, 1);
+ callout_init(&sc->ka_tx_timer, 1);
+ sx_init(&sc->connection_lock, "nvmf connection");
+ TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+
+ /* Claim the cdata pointer from ivars. */
+ sc->cdata = ivars->cdata;
+ ivars->cdata = NULL;
+
+ nvmf_init_aer(sc);
+
+ /* TODO: Multiqueue support. */
+ sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;
+
+ error = nvmf_establish_connection(sc, ivars);
+ if (error != 0)
+ goto out;
+
+ error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
+ if (error != 0) {
+ device_printf(sc->dev, "Failed to fetch CAP\n");
+ error = ENXIO;
+ goto out;
+ }
+
+ error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
+ if (error != 0) {
+ device_printf(sc->dev, "Failed to fetch VS\n");
+ error = ENXIO;
+ goto out;
+ }
+ sc->vs = val;
+
+ /* Honor MDTS if it is set. */
+ sc->max_xfer_size = maxphys;
+ if (sc->cdata->mdts != 0) {
+ sc->max_xfer_size = ulmin(sc->max_xfer_size,
+ 1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
+ NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
+ }
+
+ error = nvmf_init_sim(sc);
+ if (error != 0)
+ goto out;
+
+ error = nvmf_start_aer(sc);
+ if (error != 0) {
+ nvmf_destroy_sim(sc);
+ goto out;
+ }
+
+ if (!nvmf_add_namespaces(sc)) {
+ nvmf_destroy_sim(sc);
+ goto out;
+ }
+
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &nvmf_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0600;
+ mda.mda_si_drv1 = sc;
+ error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
+ if (error != 0) {
+ nvmf_destroy_sim(sc);
+ goto out;
+ }
+
+ return (0);
+out:
+ if (sc->ns != NULL) {
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_destroy_ns(sc->ns[i]);
+ }
+ free(sc->ns, M_NVMF);
+ }
+
+ callout_drain(&sc->ka_tx_timer);
+ callout_drain(&sc->ka_rx_timer);
+
+ if (sc->admin != NULL)
+ nvmf_shutdown_controller(sc);
+
+ for (i = 0; i < sc->num_io_queues; i++) {
+ if (sc->io[i] != NULL)
+ nvmf_destroy_qp(sc->io[i]);
+ }
+ free(sc->io, M_NVMF);
+ if (sc->admin != NULL)
+ nvmf_destroy_qp(sc->admin);
+
+ nvmf_destroy_aer(sc);
+
+ taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+ sx_destroy(&sc->connection_lock);
+ free(sc->cdata, M_NVMF);
+ return (error);
+}
+
+void
+nvmf_disconnect(struct nvmf_softc *sc)
+{
+ taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
+}
+
+static void
+nvmf_disconnect_task(void *arg, int pending __unused)
+{
+ struct nvmf_softc *sc = arg;
+ u_int i;
+
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin == NULL) {
+ /*
+ * Ignore transport errors if there is no active
+ * association.
+ */
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ if (sc->detaching) {
+ if (sc->admin != NULL) {
+ /*
+ * This unsticks the detach process if a
+ * transport error occurs during detach.
+ */
+ nvmf_shutdown_qp(sc->admin);
+ }
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ if (sc->cdev == NULL) {
+ /*
+ * Transport error occurred during attach (nvmf_add_namespaces).
+ * Shutdown the admin queue.
+ */
+ nvmf_shutdown_qp(sc->admin);
+ sx_xunlock(&sc->connection_lock);
+ return;
+ }
+
+ callout_drain(&sc->ka_tx_timer);
+ callout_drain(&sc->ka_rx_timer);
+ sc->ka_traffic = false;
+
+ /* Quiesce namespace consumers. */
+ nvmf_disconnect_sim(sc);
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_disconnect_ns(sc->ns[i]);
+ }
+
+ /* Shutdown the existing qpairs. */
+ for (i = 0; i < sc->num_io_queues; i++) {
+ nvmf_destroy_qp(sc->io[i]);
+ }
+ free(sc->io, M_NVMF);
+ sc->io = NULL;
+ sc->num_io_queues = 0;
+ nvmf_destroy_qp(sc->admin);
+ sc->admin = NULL;
+
+ sx_xunlock(&sc->connection_lock);
+}
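
The net effect of a disconnect is that every queue pair is destroyed
while the nvmeX device and its namespaces remain, quiesced, so
consumers are paused rather than detached. A later 'nvmecontrol
reconnect' feeds a fresh set of queue pairs to nvmf_reconnect_host()
below, which verifies that the new association reaches the same
subsystem NQN before resuming the nda(4) consumers.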
+
+static int
+nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
+{
+ struct nvmf_ivars ivars;
+ u_int i;
+ int error;
+
+ /* XXX: Should we permit changing the transport type? */
+ if (sc->trtype != hh->trtype) {
+ device_printf(sc->dev,
+ "transport type mismatch on reconnect\n");
+ return (EINVAL);
+ }
+
+ error = nvmf_init_ivars(&ivars, hh);
+ if (error != 0)
+ return (error);
+
+ sx_xlock(&sc->connection_lock);
+ if (sc->admin != NULL || sc->detaching) {
+ error = EBUSY;
+ goto out;
+ }
+
+ /*
+ * Ensure this is for the same controller. Note that the
+ * controller ID can vary across associations if the remote
+ * system is using the dynamic controller model. This merely
+ * ensures the new association is connected to the same NVMe
+ * subsystem.
+ */
+ if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
+ sizeof(ivars.cdata->subnqn)) != 0) {
+ device_printf(sc->dev,
+ "controller subsystem NQN mismatch on reconnect\n");
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * XXX: Require same number and size of I/O queues so that
+ * max_pending_io is still correct?
+ */
+
+ error = nvmf_establish_connection(sc, &ivars);
+ if (error != 0)
+ goto out;
+
+ error = nvmf_start_aer(sc);
+ if (error != 0)
+ goto out;
+
+ device_printf(sc->dev,
+ "established new association with %u I/O queues\n",
+ sc->num_io_queues);
+
+ /* Restart namespace consumers. */
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_reconnect_ns(sc->ns[i]);
+ }
+ nvmf_reconnect_sim(sc);
+out:
+ sx_xunlock(&sc->connection_lock);
+ nvmf_free_ivars(&ivars);
+ return (error);
+}
+
+static int
+nvmf_detach(device_t dev)
+{
+ struct nvmf_softc *sc = device_get_softc(dev);
+ u_int i;
+
+ destroy_dev(sc->cdev);
+
+ sx_xlock(&sc->connection_lock);
+ sc->detaching = true;
+ sx_xunlock(&sc->connection_lock);
+
+ nvmf_destroy_sim(sc);
+ for (i = 0; i < sc->cdata->nn; i++) {
+ if (sc->ns[i] != NULL)
+ nvmf_destroy_ns(sc->ns[i]);
+ }
+ free(sc->ns, M_NVMF);
+
+ callout_drain(&sc->ka_tx_timer);
+ callout_drain(&sc->ka_rx_timer);
+
+ if (sc->admin != NULL)
+ nvmf_shutdown_controller(sc);
+
+ for (i = 0; i < sc->num_io_queues; i++) {
+ nvmf_destroy_qp(sc->io[i]);
+ }
+ free(sc->io, M_NVMF);
+
+ taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+
+ if (sc->admin != NULL)
+ nvmf_destroy_qp(sc->admin);
+
+ nvmf_destroy_aer(sc);
+
+ sx_destroy(&sc->connection_lock);
+ free(sc->cdata, M_NVMF);
+ return (0);
+}
+
+void
+nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
+{
+ struct nvmf_completion_status status;
+ struct nvme_namespace_data *data;
+ struct nvmf_namespace *ns;
+
+ data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
+
+ nvmf_status_init(&status);
+ nvmf_status_wait_io(&status);
+ if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+ &status, nvmf_io_complete, &status, M_WAITOK)) {
+ device_printf(sc->dev,
+ "failed to send IDENTIFY namespace %u command\n", nsid);
+ free(data, M_NVMF);
+ return;
+ }
+ nvmf_wait_for_reply(&status);
+
+ if (status.cqe.status != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY namespace %u failed, status %#x\n", nsid,
+ le16toh(status.cqe.status));
+ free(data, M_NVMF);
+ return;
+ }
+
+ if (status.io_error != 0) {
+ device_printf(sc->dev,
+ "IDENTIFY namespace %u failed with I/O error %d\n",
+ nsid, status.io_error);
+ free(data, M_NVMF);
+ return;
+ }
+
+ nvme_namespace_data_swapbytes(data);
+
+ /* XXX: Needs locking around sc->ns[]. */
+ ns = sc->ns[nsid - 1];
+ if (data->nsze == 0) {
+ /* XXX: Needs locking */
+ if (ns != NULL) {
+ nvmf_destroy_ns(ns);
+ sc->ns[nsid - 1] = NULL;
+ }
+ } else {
+ /* XXX: Needs locking */
+ if (ns == NULL) {
+ sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+ } else {
+ if (!nvmf_update_ns(ns, data)) {
+ nvmf_destroy_ns(ns);
+ sc->ns[nsid - 1] = NULL;
+ }
+ }
+ }
+
+ free(data, M_NVMF);
+
+ nvmf_sim_rescan_ns(sc, nsid);
+}
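
nvmf_passthrough_cmd() below backs the same passthrough ioctl
interface that nvme(4) already provides, which is what lets
'nvmecontrol identify nvmeX' work unchanged against a Fabrics
controller. A sketch of one such passthrough from userland, an admin
IDENTIFY CONTROLLER, using the existing nvme_pt_command interface from
<dev/nvme/nvme.h> (only its reuse for Fabrics is new in this commit):

/* Illustrative only: how nvmecontrol(8) issues IDENTIFY CONTROLLER. */
#include <sys/ioctl.h>
#include <sys/endian.h>
#include <dev/nvme/nvme.h>
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void
identify_controller(const char *dev, struct nvme_controller_data *cdata)
{
        struct nvme_pt_command pt;
        int fd;

        fd = open(dev, O_RDWR);         /* e.g. "/dev/nvme0" */
        if (fd == -1)
                err(1, "open(%s)", dev);
        memset(&pt, 0, sizeof(pt));
        pt.cmd.opc = NVME_OPC_IDENTIFY;
        pt.cmd.cdw10 = htole32(1);      /* CNS 0x01: controller data */
        pt.buf = cdata;
        pt.len = sizeof(*cdata);
        pt.is_read = 1;                 /* controller-to-host transfer */
        if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) == -1)
                err(1, "NVME_PASSTHROUGH_CMD");
        close(fd);
}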
+
+int
+nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
+ bool admin)
+{
+ struct nvmf_completion_status status;
+ struct nvme_command cmd;
+ struct memdesc mem;
+ struct nvmf_host_qpair *qp;
+ struct nvmf_request *req;
+ void *buf;
*** 2252 LINES SKIPPED ***