git: 1bce7cd885e7 - main - nvme: Add Linux compatible ioctls
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Fri, 14 Jun 2024 22:40:09 UTC
The branch main has been updated by imp: URL: https://cgit.FreeBSD.org/src/commit/?id=1bce7cd885e7e5b376a60367629a0f76ff7f0167 commit 1bce7cd885e7e5b376a60367629a0f76ff7f0167 Author: Warner Losh <imp@FreeBSD.org> AuthorDate: 2024-06-14 22:40:08 +0000 Commit: Warner Losh <imp@FreeBSD.org> CommitDate: 2024-06-14 22:40:08 +0000 nvme: Add Linux compatible ioctls Add the NVME_IOCTL_ID, NVME_IOCTL_ADMIN_CMD, and NVME_IOCTL_IO_CMD Linux compatible ioctls. These may be run on either an I/O (ns) dev or a nvme (admin) dev. Linux allows both on either device, and programs use this and aren't careful about having the right device open. Emulate this feature, and implement these ioctls. The data is passed into the kernel in host byte order (not converted to le). Results are returned in host order. The timeout field is ignored, and the metadata and metadata_len fields must be zero. The addr field can be null, even when the data_len is nonzero (FreeBSD's ioctl interface prohibits this, Linux's just ignores the inconsistency). Only the cdw0 is returned from the command: the status is not returned in 'result' field. XXX need to verify that this is what Linux does on an error signaled from the drive. No external include file is yet available for this: most programs that call this interface either use a linux-specific path <linux/nvme.h> or have their own private copy of the data. It's unclear what the best thing to do is. Also, create a /dev/nvmeXnY as an alias for /dev/nvmeXnsY. These changes allow a native build of nvme-cli to work for everything that doesn't depend on sysfs entries in /sys, calls that use metadata, send / receive drive data and sed functionality not in our nvme driver. 
Sponsored by: Netflix Co-Authored-by: Chuck Tuffli <chuck@freebsd.org> Reviewed by: chuck Differential Revision: https://reviews.freebsd.org/D45415 --- sys/dev/nvme/nvme.h | 6 +++ sys/dev/nvme/nvme_ctrlr.c | 114 +++++++++++++++++++++++++++++++++++++++++++++- sys/dev/nvme/nvme_linux.h | 58 +++++++++++++++++++++++ sys/dev/nvme/nvme_ns.c | 14 +++++- 4 files changed, 190 insertions(+), 2 deletions(-) diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h index a389fc443743..1db50d24c259 100644 --- a/sys/dev/nvme/nvme.h +++ b/sys/dev/nvme/nvme.h @@ -1902,6 +1902,7 @@ struct thread; struct nvme_namespace; struct nvme_controller; struct nvme_consumer; +struct nvme_passthru_cmd; typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); @@ -1921,6 +1922,11 @@ int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, uint32_t nsid, int is_user_buffer, int is_admin_cmd); +int nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr, + struct nvme_passthru_cmd *npc, + uint32_t nsid, bool is_user, + bool is_admin); + /* Admin functions */ void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, uint8_t feature, uint32_t cdw11, diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index 155aedf2f31a..f058a4e33b9f 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -43,6 +43,7 @@ #include <vm/vm.h> #include "nvme_private.h" +#include "nvme_linux.h" #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */ @@ -1269,7 +1270,7 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, ret = EFAULT; goto err; } - req = nvme_allocate_request_vaddr(buf->b_data, pt->len, + req = nvme_allocate_request_vaddr(buf->b_data, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_vaddr(pt->buf, pt->len, @@ -1314,6 +1315,103 @@ err: return (ret); } +static void +nvme_npc_done(void *arg, const struct nvme_completion *cpl) +{ + struct nvme_passthru_cmd *npc = arg; + struct mtx *mtx = (void *)(uintptr_t)npc->metadata; + 
+ npc->result = cpl->cdw0; /* cpl in host order by now */ + mtx_lock(mtx); + npc->metadata = 0; + wakeup(npc); + mtx_unlock(mtx); +} + +/* XXX refactor? */ + +int +nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr, + struct nvme_passthru_cmd *npc, uint32_t nsid, bool is_user, bool is_admin) +{ + struct nvme_request *req; + struct mtx *mtx; + struct buf *buf = NULL; + int ret = 0; + + /* + * We don't support metadata. + */ + if (npc->metadata != 0 || npc->metadata_len != 0) + return (EIO); + + if (npc->data_len > 0 && npc->addr != 0) { + if (npc->data_len > ctrlr->max_xfer_size) { + nvme_printf(ctrlr, + "npc->data_len (%d) exceeds max_xfer_size (%d)\n", + npc->data_len, ctrlr->max_xfer_size); + return (EIO); + } + /* We only support data out or data in commands, but not both at once. */ + if ((npc->opcode & 0x3) == 0 || (npc->opcode & 0x3) == 3) + return (EINVAL); + if (is_user) { + /* + * Ensure the user buffer is wired for the duration of + * this pass-through command. + */ + PHOLD(curproc); + buf = uma_zalloc(pbuf_zone, M_WAITOK); + buf->b_iocmd = npc->opcode & 1 ? 
BIO_WRITE : BIO_READ; + if (vmapbuf(buf, (void *)npc->addr, npc->data_len, 1) < 0) { + ret = EFAULT; + goto err; + } + req = nvme_allocate_request_vaddr(buf->b_data, npc->data_len, + nvme_npc_done, npc); + } else + req = nvme_allocate_request_vaddr((void *)npc->addr, npc->data_len, + nvme_npc_done, npc); + } else + req = nvme_allocate_request_null(nvme_npc_done, npc); + + req->cmd.opc = npc->opcode; + req->cmd.fuse = npc->flags; + req->cmd.rsvd2 = htole16(npc->cdw2); + req->cmd.rsvd3 = htole16(npc->cdw3); + req->cmd.cdw10 = htole32(npc->cdw10); + req->cmd.cdw11 = htole32(npc->cdw11); + req->cmd.cdw12 = htole32(npc->cdw12); + req->cmd.cdw13 = htole32(npc->cdw13); + req->cmd.cdw14 = htole32(npc->cdw14); + req->cmd.cdw15 = htole32(npc->cdw15); + + req->cmd.nsid = htole32(nsid); + + mtx = mtx_pool_find(mtxpool_sleep, npc); + npc->metadata = (uintptr_t) mtx; + + /* XXX no timeout passed down */ + if (is_admin) + nvme_ctrlr_submit_admin_request(ctrlr, req); + else + nvme_ctrlr_submit_io_request(ctrlr, req); + + mtx_lock(mtx); + while (npc->metadata != 0) + mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0); + mtx_unlock(mtx); + + if (buf != NULL) { + vunmapbuf(buf); +err: + uma_zfree(pbuf_zone, buf); + PRELE(curproc); + } + + return (ret); +} + static int nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) @@ -1324,6 +1422,7 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, ctrlr = cdev->si_drv1; switch (cmd) { + case NVME_IOCTL_RESET: /* Linux compat */ case NVME_RESET_CONTROLLER: nvme_ctrlr_reset(ctrlr); break; @@ -1342,6 +1441,19 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = ctrlr->max_xfer_size; break; + /* Linux Compatible (see nvme_linux.h) */ + case NVME_IOCTL_ID: + td->td_retval[0] = 0xfffffffful; + return (0); + + case NVME_IOCTL_ADMIN_CMD: + case NVME_IOCTL_IO_CMD: { + struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg; 
+ + return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, npc->nsid, true, + cmd == NVME_IOCTL_ADMIN_CMD)); + } + default: return (ENOTTY); } diff --git a/sys/dev/nvme/nvme_linux.h b/sys/dev/nvme/nvme_linux.h new file mode 100644 index 000000000000..aaa68e1d34f8 --- /dev/null +++ b/sys/dev/nvme/nvme_linux.h @@ -0,0 +1,58 @@ +/*- + * Copyright (c) 2024, Netflix Inc. + * Written by Warner Losh + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +/* + * Linux compatible NVME ioctls. So far we just support ID, ADMIN_CMD and + * IO_CMD. The rest are not supported. + */ + + +#include <sys/ioccom.h> +#include <sys/_types.h> + +struct nvme_passthru_cmd { + __uint8_t opcode; + __uint8_t flags; + __uint16_t rsvd1; + __uint32_t nsid; + __uint32_t cdw2; + __uint32_t cdw3; + __uint64_t metadata; + __uint64_t addr; + __uint32_t metadata_len; + __uint32_t data_len; + __uint32_t cdw10; + __uint32_t cdw11; + __uint32_t cdw12; + __uint32_t cdw13; + __uint32_t cdw14; + __uint32_t cdw15; + __uint32_t timeout_ms; + __uint32_t result; +}; + +#define nvme_admin_cmd nvme_passthru_cmd + +/* + * Linux nvme ioctls, commented out ones are not supported + */ +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +/* #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) */ +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) +#define NVME_IOCTL_RESET _IO('N', 0x44) +/* #define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) */ +/* #define NVME_IOCTL_RESCAN _IO('N', 0x46) */ +/* #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) */ +/* #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) */ +/* #define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) */ + +/* io_uring async commands: */ +/* #define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd) */ +/* #define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd) */ +/* #define 
NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd) */ +/* #define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) */ diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c index 4c65e2c49e64..3f29382fe42f 100644 --- a/sys/dev/nvme/nvme_ns.c +++ b/sys/dev/nvme/nvme_ns.c @@ -43,6 +43,7 @@ #include <geom/geom.h> #include "nvme_private.h" +#include "nvme_linux.h" static void nvme_bio_child_inbed(struct bio *parent, int bio_error); static void nvme_bio_child_done(void *arg, @@ -93,6 +94,18 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, case DIOCGSECTORSIZE: *(u_int *)arg = nvme_ns_get_sector_size(ns); break; + /* Linux Compatible (see nvme_linux.h) */ + case NVME_IOCTL_ID: + td->td_retval[0] = ns->id; + return (0); + + case NVME_IOCTL_ADMIN_CMD: + case NVME_IOCTL_IO_CMD: { + struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg; + + return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, ns->id, true, + cmd == NVME_IOCTL_ADMIN_CMD)); + } default: return (ENOTTY); } @@ -610,7 +623,6 @@ nvme_ns_construct(struct nvme_namespace *ns, uint32_t id, return (ENXIO); ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%d", device_get_nameunit(ctrlr->dev), ns->id); - ns->cdev->si_flags |= SI_UNMAPPED; return (0);