git: 82ff1c334b97 - main - nvme: Allow larger user request sizes

From: Warner Losh <imp_at_FreeBSD.org>
Date: Thu, 09 Apr 2026 23:38:09 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=82ff1c334b97e5b68e6330e90f0aa6ae0f6af9aa

commit 82ff1c334b97e5b68e6330e90f0aa6ae0f6af9aa
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2026-04-09 16:18:31 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2026-04-09 23:37:21 +0000

    nvme: Allow larger user request sizes
    
    We have a small buffer for pages on the stack, but if the user wants to
    do an I/O larger than this we currently fail w/o a way for the user to
    know the max size. It's not hard to allocate an array for the uncommon
    case of very large I/Os, and the performance advantage of the array is
    small in that case anyway. In addition, this allows firmware upgrades
    using the full transfer size of the device as a happy accident too.
    
    Sponsored by:           Netflix
    Reviewed by:            chs, chuck
    Differential Revision:  https://reviews.freebsd.org/D55638
---
 sys/dev/nvme/nvme_ctrlr.c | 48 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index b75033300061..e5094e909a24 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -1345,30 +1345,52 @@ nvme_ctrlr_shared_handler(void *arg)
 
 #define NVME_MAX_PAGES  (int)(1024 / sizeof(vm_page_t))
 
+static int
+nvme_page_count(vm_offset_t start, size_t len)
+{
+	return atop(round_page(start + len) - trunc_page(start));
+}
+
 static int
 nvme_user_ioctl_req(vm_offset_t addr, size_t len, bool is_read,
-    vm_page_t *upages, int max_pages, int *npagesp, struct nvme_request **req,
+    vm_page_t **upages, int max_pages, int *npagesp, struct nvme_request **req,
     nvme_cb_fn_t cb_fn, void *cb_arg)
 {
 	vm_prot_t prot = VM_PROT_READ;
-	int err;
+	int err, npages;
+	vm_page_t *upages_us;
+
+	upages_us = *upages;
+	npages = nvme_page_count(addr, len);
+	if (npages > atop(maxphys))
+		return (EINVAL);
+	if (npages > NVME_MAX_PAGES)
+		upages_us = malloc(npages * sizeof(vm_page_t), M_NVME,
+		    M_ZERO | M_WAITOK);
 
 	if (is_read)
 		prot |= VM_PROT_WRITE;	/* Device will write to host memory */
 	err = vm_fault_hold_pages(&curproc->p_vmspace->vm_map,
-	    addr, len, prot, upages, max_pages, npagesp);
-	if (err != 0)
+	    addr, len, prot, upages_us, npages, npagesp);
+	if (err != 0) {
+		if (*upages != upages_us)
+			free(upages_us, M_NVME);
 		return (err);
+	}
 	*req = nvme_allocate_request_null(M_WAITOK, cb_fn, cb_arg);
-	(*req)->payload = memdesc_vmpages(upages, len, addr & PAGE_MASK);
+	(*req)->payload = memdesc_vmpages(upages_us, len, addr & PAGE_MASK);
 	(*req)->payload_valid = true;
+	if (*upages != upages_us)
+		*upages = upages_us;
 	return (0);
 }
 
 static void
-nvme_user_ioctl_free(vm_page_t *pages, int npage)
+nvme_user_ioctl_free(vm_page_t *pages, int npage, bool freeit)
 {
 	vm_page_unhold_pages(pages, npage);
+	if (freeit)
+		free(pages, M_NVME);
 }
 
 static void
@@ -1400,7 +1422,8 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 	struct mtx *mtx;
 	int ret = 0;
 	int npages = 0;
-	vm_page_t upages[NVME_MAX_PAGES];
+	vm_page_t upages_small[NVME_MAX_PAGES];
+	vm_page_t *upages = upages_small;
 
 	if (pt->len > 0) {
 		if (pt->len > ctrlr->max_xfer_size) {
@@ -1411,7 +1434,7 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 		}
 		if (is_user) {
 			ret = nvme_user_ioctl_req((vm_offset_t)pt->buf, pt->len,
-			    pt->is_read, upages, nitems(upages), &npages, &req,
+			    pt->is_read, &upages, nitems(upages_small), &npages, &req,
 			    nvme_pt_done, pt);
 			if (ret != 0)
 				return (ret);
@@ -1449,7 +1472,7 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 	mtx_unlock(mtx);
 
 	if (npages > 0)
-		nvme_user_ioctl_free(upages, npages);
+		nvme_user_ioctl_free(upages, npages, upages != upages_small);
 
 	return (ret);
 }
@@ -1477,7 +1500,8 @@ nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
 	struct mtx		*mtx;
 	int			ret = 0;
 	int			npages = 0;
-	vm_page_t		upages[NVME_MAX_PAGES];
+	vm_page_t		upages_small[NVME_MAX_PAGES];
+	vm_page_t		*upages = upages_small;
 
 	/*
 	 * We don't support metadata.
@@ -1494,7 +1518,7 @@ nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
 		}
 		if (is_user) {
 			ret = nvme_user_ioctl_req(npc->addr, npc->data_len,
-			    npc->opcode & 0x1, upages, nitems(upages), &npages,
+			    npc->opcode & 0x1, &upages, nitems(upages), &npages,
 			    &req, nvme_npc_done, npc);
 			if (ret != 0)
 				return (ret);
@@ -1533,7 +1557,7 @@ nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
 	mtx_unlock(mtx);
 
 	if (npages > 0)
-		nvme_user_ioctl_free(upages, npages);
+		nvme_user_ioctl_free(upages, npages, upages != upages_small);
 
 	return (ret);
 }