git: 1bce7cd885e7 - main - nvme: Add Linux compatible ioctls

From: Warner Losh <imp_at_FreeBSD.org>
Date: Fri, 14 Jun 2024 22:40:09 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=1bce7cd885e7e5b376a60367629a0f76ff7f0167

commit 1bce7cd885e7e5b376a60367629a0f76ff7f0167
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2024-06-14 22:40:08 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2024-06-14 22:40:08 +0000

    nvme: Add Linux compatible ioctls
    
    Add the NVME_IOCTL_ID, NVME_IOCTL_ADMIN_CMD, and NVME_IOCTL_IO_CMD Linux
    compatible ioctls. These may be run on either an I/O (ns) dev or a nvme
    (admin) dev. Linux allows both on either device, and programs use this
    and aren't careful about having the right device open. Emulate this
    feature, and implement these ioctls. The data is passed into the
    kernel in host byte order (not converted to le). Results are returned in
    host order.
    
    The timeout field is ignored, and the metadata and metadata_len fields
    must be zero.
    
    The addr field can be null, even when the data_len is nonzero (FreeBSD's
    ioctl interface prohibits this, Linux's just ignores the inconsistency).
    
    Only cdw0 of the completion is returned from the command: the status is
    not returned in the 'result' field. XXX need to verify that this is what
    Linux does on an error signaled from the drive.
    
    No external include file is yet available for this: most programs that
    call this interface either use a Linux-specific path <linux/nvme.h> or
    have their own private copy of the data. It's unclear what the best
    thing to do is.
    
    Also, create a /dev/nvmeXnY as an alias for /dev/nvmeXnsY.
    
    These changes allow a native build of nvme-cli to work for everything
    that doesn't depend on sysfs entries in /sys, calls that use metadata,
    send / receive drive data and sed functionality not in our nvme driver.
    
    Sponsored by:           Netflix
    Co-Authored-by:         Chuck Tuffli <chuck@freebsd.org>
    Reviewed by:            chuck
    Differential Revision:  https://reviews.freebsd.org/D45415
---
 sys/dev/nvme/nvme.h       |   6 +++
 sys/dev/nvme/nvme_ctrlr.c | 114 +++++++++++++++++++++++++++++++++++++++++++++-
 sys/dev/nvme/nvme_linux.h |  58 +++++++++++++++++++++++
 sys/dev/nvme/nvme_ns.c    |  14 +++++-
 4 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index a389fc443743..1db50d24c259 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -1902,6 +1902,7 @@ struct thread;
 struct nvme_namespace;
 struct nvme_controller;
 struct nvme_consumer;
+struct nvme_passthru_cmd;
 
 typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
 
@@ -1921,6 +1922,11 @@ int	nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 				   uint32_t nsid, int is_user_buffer,
 				   int is_admin_cmd);
 
+int	nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
+				      struct nvme_passthru_cmd *npc,
+				      uint32_t nsid, bool is_user,
+				      bool is_admin);
+
 /* Admin functions */
 void	nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
 				   uint8_t feature, uint32_t cdw11,
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 155aedf2f31a..f058a4e33b9f 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -43,6 +43,7 @@
 #include <vm/vm.h>
 
 #include "nvme_private.h"
+#include "nvme_linux.h"
 
 #define B4_CHK_RDY_DELAY_MS	2300		/* work around controller bug */
 
@@ -1269,7 +1270,7 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 				ret = EFAULT;
 				goto err;
 			}
-			req = nvme_allocate_request_vaddr(buf->b_data, pt->len, 
+			req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
 			    nvme_pt_done, pt);
 		} else
 			req = nvme_allocate_request_vaddr(pt->buf, pt->len,
@@ -1314,6 +1315,103 @@ err:
 	return (ret);
 }
 
+static void
+nvme_npc_done(void *arg, const struct nvme_completion *cpl)
+{
+	struct nvme_passthru_cmd *npc = arg;
+	struct mtx *mtx = (void *)(uintptr_t)npc->metadata;	/* mutex pointer stashed by the submitter */
+	/* Completion callback: hand cdw0 back and wake the thread sleeping on npc. */
+	npc->result = cpl->cdw0;	/* cpl in host order by now */
+	mtx_lock(mtx);
+	npc->metadata = 0;	/* the submitter sleeps until this reads as zero */
+	wakeup(npc);
+	mtx_unlock(mtx);
+}
+
+/*
+ * Run a Linux-style passthrough command (see nvme_linux.h) on the admin or
+ * I/O queue and sleep until it completes.  npc->result gets completion dword
+ * 0; the completion status is not reported.  Returns 0 after completion, or
+ * an errno if the request is rejected before submission.  XXX refactor?
+ */
+int
+nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
+    struct nvme_passthru_cmd *npc, uint32_t nsid, bool is_user, bool is_admin)
+{
+	struct nvme_request	*req;
+	struct mtx		*mtx;
+	struct buf		*buf = NULL;
+	int			ret = 0;
+
+	/* We don't support metadata. */
+	if (npc->metadata != 0 || npc->metadata_len != 0)
+		return (EIO);
+
+	if (npc->data_len > 0 && npc->addr != 0) {
+		if (npc->data_len > ctrlr->max_xfer_size) {
+			nvme_printf(ctrlr,
+			    "npc->data_len (%d) exceeds max_xfer_size (%d)\n",
+			    npc->data_len, ctrlr->max_xfer_size);
+			return (EIO);
+		}
+		/* We only support data out or data in commands, but not both at once. */
+		if ((npc->opcode & 0x3) == 0 || (npc->opcode & 0x3) == 3)
+			return (EINVAL);
+		if (is_user) {
+			/* Keep the user buffer wired for the whole command. */
+			/* addr is 64-bit: cast via uintptr_t for 32-bit kernels. */
+			PHOLD(curproc);
+			buf = uma_zalloc(pbuf_zone, M_WAITOK);
+			buf->b_iocmd = npc->opcode & 1 ? BIO_WRITE : BIO_READ;
+			if (vmapbuf(buf, (void *)(uintptr_t)npc->addr, npc->data_len, 1) < 0) {
+				ret = EFAULT;
+				goto err;
+			}
+			req = nvme_allocate_request_vaddr(buf->b_data, npc->data_len,
+			    nvme_npc_done, npc);
+		} else
+			req = nvme_allocate_request_vaddr((void *)(uintptr_t)npc->addr,
+			    npc->data_len, nvme_npc_done, npc);
+	} else
+		req = nvme_allocate_request_null(nvme_npc_done, npc);
+	/* cdw2/cdw3 are 32-bit dwords: htole16() would drop their upper halves. */
+	req->cmd.opc = npc->opcode;
+	req->cmd.fuse = npc->flags;
+	req->cmd.rsvd2 = htole32(npc->cdw2);
+	req->cmd.rsvd3 = htole32(npc->cdw3);
+	req->cmd.cdw10 = htole32(npc->cdw10);
+	req->cmd.cdw11 = htole32(npc->cdw11);
+	req->cmd.cdw12 = htole32(npc->cdw12);
+	req->cmd.cdw13 = htole32(npc->cdw13);
+	req->cmd.cdw14 = htole32(npc->cdw14);
+	req->cmd.cdw15 = htole32(npc->cdw15);
+
+	req->cmd.nsid = htole32(nsid);
+	/* metadata is known to be 0 here; borrow it as the "in flight" flag. */
+	mtx = mtx_pool_find(mtxpool_sleep, npc);
+	npc->metadata = (uintptr_t) mtx;
+
+	/* XXX no timeout passed down */
+	if (is_admin)
+		nvme_ctrlr_submit_admin_request(ctrlr, req);
+	else
+		nvme_ctrlr_submit_io_request(ctrlr, req);
+
+	mtx_lock(mtx);
+	while (npc->metadata != 0)
+		mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0);
+	mtx_unlock(mtx);
+
+	if (buf != NULL) {
+		vunmapbuf(buf);
+err:
+		uma_zfree(pbuf_zone, buf);
+		PRELE(curproc);
+	}
+
+	return (ret);
+}
+
 static int
 nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
     struct thread *td)
@@ -1324,6 +1422,7 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
 	ctrlr = cdev->si_drv1;
 
 	switch (cmd) {
+	case NVME_IOCTL_RESET: /* Linux compat */
 	case NVME_RESET_CONTROLLER:
 		nvme_ctrlr_reset(ctrlr);
 		break;
@@ -1342,6 +1441,19 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
 	case NVME_GET_MAX_XFER_SIZE:
 		*(uint64_t *)arg = ctrlr->max_xfer_size;
 		break;
+	/* Linux Compatible (see nvme_linux.h) */
+	case NVME_IOCTL_ID:
+		td->td_retval[0] = 0xfffffffful;
+		return (0);
+
+	case NVME_IOCTL_ADMIN_CMD:
+	case NVME_IOCTL_IO_CMD: {
+		struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
+
+		return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, npc->nsid, true,
+		    cmd == NVME_IOCTL_ADMIN_CMD));
+	}
+
 	default:
 		return (ENOTTY);
 	}
diff --git a/sys/dev/nvme/nvme_linux.h b/sys/dev/nvme/nvme_linux.h
new file mode 100644
index 000000000000..aaa68e1d34f8
--- /dev/null
+++ b/sys/dev/nvme/nvme_linux.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2024, Netflix Inc.
+ * Written by Warner Losh
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * Linux compatible NVME ioctls. So far we just support ID, ADMIN_CMD, IO_CMD
+ * and RESET. The rest are not supported.
+ */
+
+
+#include <sys/ioccom.h>
+#include <sys/_types.h>
+
+struct nvme_passthru_cmd {
+	__uint8_t	opcode;		/* command opcode */
+	__uint8_t	flags;		/* copied into the command's fuse field */
+	__uint16_t	rsvd1;		/* not referenced by the passthrough code */
+	__uint32_t	nsid;		/* namespace ID; ns devices substitute their own ID */
+	__uint32_t	cdw2;		/* command dword 2 */
+	__uint32_t	cdw3;		/* command dword 3 */
+	__uint64_t	metadata;	/* must be 0: metadata is not supported */
+	__uint64_t	addr;		/* data buffer address; may be 0 for no data */
+	__uint32_t	metadata_len;	/* must be 0: metadata is not supported */
+	__uint32_t	data_len;	/* length of the buffer at addr */
+	__uint32_t	cdw10;		/* command dwords 10-15, in host byte order */
+	__uint32_t	cdw11;
+	__uint32_t	cdw12;
+	__uint32_t	cdw13;
+	__uint32_t	cdw14;
+	__uint32_t	cdw15;
+	__uint32_t	timeout_ms;	/* currently ignored */
+	__uint32_t	result;		/* out: completion dword 0 */
+};
+
+#define nvme_admin_cmd nvme_passthru_cmd
+
+/*
+ * Linux nvme ioctls, commented out ones are not supported
+ */
+#define NVME_IOCTL_ID		_IO('N', 0x40)
+#define NVME_IOCTL_ADMIN_CMD	_IOWR('N', 0x41, struct nvme_admin_cmd)
+/* #define NVME_IOCTL_SUBMIT_IO	_IOW('N', 0x42, struct nvme_user_io) */
+#define NVME_IOCTL_IO_CMD	_IOWR('N', 0x43, struct nvme_passthru_cmd)
+#define NVME_IOCTL_RESET	_IO('N', 0x44)
+/* #define NVME_IOCTL_SUBSYS_RESET	_IO('N', 0x45) */
+/* #define NVME_IOCTL_RESCAN	_IO('N', 0x46) */
+/* #define NVME_IOCTL_ADMIN64_CMD	_IOWR('N', 0x47, struct nvme_passthru_cmd64) */
+/* #define NVME_IOCTL_IO64_CMD	_IOWR('N', 0x48, struct nvme_passthru_cmd64) */
+/* #define NVME_IOCTL_IO64_CMD_VEC	_IOWR('N', 0x49, struct nvme_passthru_cmd64) */
+
+/* io_uring async commands: */
+/* #define NVME_URING_CMD_IO	_IOWR('N', 0x80, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_IO_VEC	_IOWR('N', 0x81, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_ADMIN	_IOWR('N', 0x82, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) */
diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c
index 4c65e2c49e64..3f29382fe42f 100644
--- a/sys/dev/nvme/nvme_ns.c
+++ b/sys/dev/nvme/nvme_ns.c
@@ -43,6 +43,7 @@
 #include <geom/geom.h>
 
 #include "nvme_private.h"
+#include "nvme_linux.h"
 
 static void		nvme_bio_child_inbed(struct bio *parent, int bio_error);
 static void		nvme_bio_child_done(void *arg,
@@ -93,6 +94,18 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
 	case DIOCGSECTORSIZE:
 		*(u_int *)arg = nvme_ns_get_sector_size(ns);
 		break;
+	/* Linux Compatible (see nvme_linux.h) */
+	case NVME_IOCTL_ID:
+		td->td_retval[0] = ns->id;
+		return (0);
+
+	case NVME_IOCTL_ADMIN_CMD:
+	case NVME_IOCTL_IO_CMD: {
+		struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
+
+		return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, ns->id, true,
+		    cmd == NVME_IOCTL_ADMIN_CMD));
+	}
 	default:
 		return (ENOTTY);
 	}
@@ -610,7 +623,6 @@ nvme_ns_construct(struct nvme_namespace *ns, uint32_t id,
 		return (ENXIO);
 	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%d",
 	    device_get_nameunit(ctrlr->dev), ns->id);
-
 	ns->cdev->si_flags |= SI_UNMAPPED;
 
 	return (0);