git: 99c14fb99ffc - main - cam: Drop periph lock when completing I/O with ENOMEM status

From: Warner Losh <imp_at_FreeBSD.org>
Date: Fri, 24 May 2024 15:40:43 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=99c14fb99ffc8fd601ad15bbe54a54fac8f4f203

commit 99c14fb99ffc8fd601ad15bbe54a54fac8f4f203
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2024-05-24 14:32:04 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2024-05-24 14:32:04 +0000

    cam: Drop periph lock when completing I/O with ENOMEM status
    
    When biofinish calls g_io_deliver with an error of ENOMEM, that kicks
    off the slowdown protocol, forcing I/O to go through g_down rather than
    be directly dispatch. One of the side effects is that the I/O is
    resubmitted, so the start routines get called recursively, leading to a
    recursive lock panic. Rather than make the periph lock recursive, drop
    and reacquire the lock around such calls to biofinish.
    
    For nda, this happens only when we can't allocate space to construct a
    TRIM. For ada and da, this is only for certain ZONE operations.
    
    Sponsored by:           Netflix
    Reviewed by:            gallatin
    Differential Revision:  https://reviews.freebsd.org/D45310
---
 sys/cam/ata/ata_da.c   |  9 +++++++++
 sys/cam/nvme/nvme_da.c | 10 ++++++++++
 sys/cam/scsi/scsi_da.c | 11 ++++++++++-
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c
index d4a591943307..6e008cfc8d22 100644
--- a/sys/cam/ata/ata_da.c
+++ b/sys/cam/ata/ata_da.c
@@ -2518,7 +2518,16 @@ adastart(struct cam_periph *periph, union ccb *start_ccb)
 			error = ada_zone_cmd(periph, start_ccb, bp, &queue_ccb);
 			if ((error != 0)
 			 || (queue_ccb == 0)) {
+				/*
+				 * g_io_deliver will recurisvely call start
+				 * routine for ENOMEM, so drop the periph
+				 * lock to allow that recursion.
+				 */
+				if (error == ENOMEM)
+					cam_periph_unlock(periph);
 				biofinish(bp, NULL, error);
+				if (error == ENOMEM)
+					cam_periph_lock(periph);
 				xpt_release_ccb(start_ccb);
 				return;
 			}
diff --git a/sys/cam/nvme/nvme_da.c b/sys/cam/nvme/nvme_da.c
index 3f6cf8702870..41c552e2780a 100644
--- a/sys/cam/nvme/nvme_da.c
+++ b/sys/cam/nvme/nvme_da.c
@@ -1077,7 +1077,17 @@ ndastart(struct cam_periph *periph, union ccb *start_ccb)
 
 			trim = malloc(sizeof(*trim), M_NVMEDA, M_ZERO | M_NOWAIT);
 			if (trim == NULL) {
+				/*
+				 * We have to drop the periph lock when
+				 * returning ENOMEM. g_io_deliver treats these
+				 * request differently and will recursively call
+				 * the start routine which causes us to get into
+				 * ndastrategy with the periph lock held,
+				 * leading to a panic when its acquired again.
+				 */
+				cam_periph_unlock(periph);
 				biofinish(bp, NULL, ENOMEM);
+				cam_periph_lock(periph);
 				xpt_release_ccb(start_ccb);
 				ndaschedule(periph);
 				return;
diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c
index 0daaff9229b0..59745231bca5 100644
--- a/sys/cam/scsi/scsi_da.c
+++ b/sys/cam/scsi/scsi_da.c
@@ -3455,10 +3455,19 @@ more:
 
 			queue_ccb = 0;
 
-			error = da_zone_cmd(periph, start_ccb, bp,&queue_ccb);
+			error = da_zone_cmd(periph, start_ccb, bp, &queue_ccb);
 			if ((error != 0)
 			 || (queue_ccb == 0)) {
+				/*
+				 * g_io_deliver will recurisvely call start
+				 * routine for ENOMEM, so drop the periph
+				 * lock to allow that recursion.
+				 */
+				if (error == ENOMEM)
+					cam_periph_unlock(periph);
 				biofinish(bp, NULL, error);
+				if (error == ENOMEM)
+					cam_periph_lock(periph);
 				xpt_release_ccb(start_ccb);
 				return;
 			}