git: 3d89acf59098 - main - nvme: Separate total failures from I/O failures

From: Warner Losh <imp@FreeBSD.org>
Date: Fri, 16 Aug 2024 03:33:22 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=3d89acf59098cc7a7a118c8aed89e562f686d8ed

commit 3d89acf59098cc7a7a118c8aed89e562f686d8ed
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2024-08-14 22:55:49 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2024-08-16 03:31:20 +0000

    nvme: Separate total failures from I/O failures
    
    When it's an I/O failure, we can still send admin commands. Separate out
    the admin failures and flag them as such so that we can still send admin
    commands on half-failed drives.
    
    Fixes: 9229b3105d88 (nvme: Fail passthrough commands right away in failed state)
    Sponsored by: Netflix
---
 sys/dev/nvme/nvme_ctrlr.c   | 46 ++++++++++++++++++++++++++-------------------
 sys/dev/nvme/nvme_private.h |  1 +
 sys/dev/nvme/nvme_qpair.c   | 23 +++++++++++++++--------
 sys/dev/nvme/nvme_sim.c     | 13 ++++++++++++-
 4 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 270be61e2d36..4c1a3830aac0 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -232,7 +232,7 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
 }
 
 static void
-nvme_ctrlr_fail(struct nvme_controller *ctrlr)
+nvme_ctrlr_fail(struct nvme_controller *ctrlr, bool admin_also)
 {
 	int i;
 
@@ -242,7 +242,10 @@ nvme_ctrlr_fail(struct nvme_controller *ctrlr)
 	 * a different error, though when we fail, that hardly matters).
 	 */
 	ctrlr->is_failed = true;
-	nvme_qpair_fail(&ctrlr->adminq);
+	if (admin_also) {
+		ctrlr->is_failed_admin = true;
+		nvme_qpair_fail(&ctrlr->adminq);
+	}
 	if (ctrlr->ioq != NULL) {
 		for (i = 0; i < ctrlr->num_io_queues; i++) {
 			nvme_qpair_fail(&ctrlr->ioq[i]);
@@ -415,6 +418,7 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 
 	TSENTER();
 
+	ctrlr->is_failed_admin = true;
 	nvme_ctrlr_disable_qpairs(ctrlr);
 
 	err = nvme_ctrlr_disable(ctrlr);
@@ -423,6 +427,8 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 
 	err = nvme_ctrlr_enable(ctrlr);
 out:
+	if (err == 0)
+		ctrlr->is_failed_admin = false;
 
 	TSEXIT();
 	return (err);
@@ -435,11 +441,10 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
 
 	cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
 
-	if (cmpset == 0 || ctrlr->is_failed)
+	if (cmpset == 0)
 		/*
-		 * Controller is already resetting or has failed.  Return
-		 *  immediately since there is no need to kick off another
-		 *  reset in these cases.
+		 * Controller is already resetting.  Return immediately since
+		 * there is no need to kick off another reset.
 		 */
 		return;
 
@@ -1090,7 +1095,7 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
 		return;
 
 	if (resetting && nvme_ctrlr_identify(ctrlr) != 0) {
-		nvme_ctrlr_fail(ctrlr);
+		nvme_ctrlr_fail(ctrlr, false);
 		return;
 	}
 
@@ -1105,7 +1110,7 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
 	if (resetting) {
 		old_num_io_queues = ctrlr->num_io_queues;
 		if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
-			nvme_ctrlr_fail(ctrlr);
+			nvme_ctrlr_fail(ctrlr, false);
 			return;
 		}
 
@@ -1123,12 +1128,12 @@ nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
 		nvme_ctrlr_hmb_enable(ctrlr, true, true);
 
 	if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
-		nvme_ctrlr_fail(ctrlr);
+		nvme_ctrlr_fail(ctrlr, false);
 		return;
 	}
 
 	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
-		nvme_ctrlr_fail(ctrlr);
+		nvme_ctrlr_fail(ctrlr, false);
 		return;
 	}
 
@@ -1148,8 +1153,7 @@ nvme_ctrlr_start_config_hook(void *arg)
 	TSENTER();
 
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
-fail:
-		nvme_ctrlr_fail(ctrlr);
+		nvme_ctrlr_fail(ctrlr, true);
 		config_intrhook_disestablish(&ctrlr->config_hook);
 		return;
 	}
@@ -1162,13 +1166,15 @@ fail:
 	    nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
 		nvme_ctrlr_start(ctrlr, false);
 	else
-		goto fail;
+		nvme_ctrlr_fail(ctrlr, false);
 
 	nvme_sysctl_initialize_ctrlr(ctrlr);
 	config_intrhook_disestablish(&ctrlr->config_hook);
 
-	ctrlr->is_initialized = true;
-	nvme_notify_new_controller(ctrlr);
+	if (!ctrlr->is_failed) {
+		ctrlr->is_initialized = true;
+		nvme_notify_new_controller(ctrlr);
+	}
 	TSEXIT();
 }
 
@@ -1185,7 +1191,7 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 		nvme_ctrlr_start(ctrlr, true);
 	} else {
 		nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"timed_out\"");
-		nvme_ctrlr_fail(ctrlr);
+		nvme_ctrlr_fail(ctrlr, true);
 	}
 
 	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
@@ -1612,7 +1618,7 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 	 */
 	gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
 	if (gone)
-		nvme_ctrlr_fail(ctrlr);
+		nvme_ctrlr_fail(ctrlr, true);
 	else
 		nvme_notify_fail_consumers(ctrlr);
 
@@ -1742,7 +1748,9 @@ nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
 	int to = hz;
 
 	/*
-	 * Can't touch failed controllers, so it's already suspended.
+	 * Can't touch failed controllers, so it's already suspended. User will
+	 * need to do an explicit reset to bring it back, if that's even
+	 * possible.
 	 */
 	if (ctrlr->is_failed)
 		return (0);
@@ -1809,7 +1817,7 @@ fail:
 	 * itself, due to questionable APIs.
 	 */
 	nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
-	nvme_ctrlr_fail(ctrlr);
+	nvme_ctrlr_fail(ctrlr, true);
 	(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 	return (0);
 }
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 57613242ea84..029c2ff97bff 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -301,6 +301,7 @@ struct nvme_controller {
 	uint32_t			notification_sent;
 
 	bool				is_failed;
+	bool				is_failed_admin;
 	bool				is_dying;
 	bool				isr_warned;
 	bool				is_initialized;
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index 0c3a36d4d76f..fcc95bf854b9 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -1046,6 +1046,7 @@ nvme_qpair_timeout(void *arg)
 	struct nvme_tracker	*tr;
 	sbintime_t		now;
 	bool			idle = true;
+	bool			is_admin = qpair == &ctrlr->adminq;
 	bool			fast;
 	uint32_t		csts;
 	uint8_t			cfs;
@@ -1057,9 +1058,10 @@ nvme_qpair_timeout(void *arg)
 	 * failure processing that races with the qpair timeout will fail
 	 * safely.
 	 */
-	if (qpair->ctrlr->is_failed) {
+	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
 		nvme_printf(qpair->ctrlr,
-		    "Failed controller, stopping watchdog timeout.\n");
+		    "%sFailed controller, stopping watchdog timeout.\n",
+		    is_admin ? "Complete " : "");
 		qpair->timer_armed = false;
 		return;
 	}
@@ -1329,6 +1331,7 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 {
 	struct nvme_tracker	*tr;
 	int			err = 0;
+	bool			is_admin = qpair == &qpair->ctrlr->adminq;
 
 	mtx_assert(&qpair->lock, MA_OWNED);
 
@@ -1338,12 +1341,14 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 	/*
 	 * The controller has failed, so fail the request. Note, that this races
 	 * the recovery / timeout code. Since we hold the qpair lock, we know
-	 * it's safe to fail directly. is_failed is set when we fail the controller.
-	 * It is only ever reset in the ioctl reset controller path, which is safe
-	 * to race (for failed controllers, we make no guarantees about bringing
-	 * it out of failed state relative to other commands).
+	 * it's safe to fail directly. is_failed is set when we fail the
+	 * controller.  It is only ever reset in the ioctl reset controller
+	 * path, which is safe to race (for failed controllers, we make no
+	 * guarantees about bringing it out of failed state relative to other
+	 * commands). We try hard to allow admin commands when the entire
+	 * controller hasn't failed, only something related to I/O queues.
 	 */
-	if (qpair->ctrlr->is_failed) {
+	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
 		nvme_qpair_manual_complete_request(qpair, req,
 		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
 		    ERROR_PRINT_NONE);
@@ -1410,11 +1415,13 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
+	bool is_admin __unused = qpair == &qpair->ctrlr->adminq;
+
 	if (mtx_initialized(&qpair->recovery))
 		mtx_assert(&qpair->recovery, MA_OWNED);
 	if (mtx_initialized(&qpair->lock))
 		mtx_assert(&qpair->lock, MA_OWNED);
-	KASSERT(!qpair->ctrlr->is_failed,
+	KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
 	    ("Enabling a failed qpair\n"));
 
 	qpair->recovery_state = RECOVERY_NONE;
diff --git a/sys/dev/nvme/nvme_sim.c b/sys/dev/nvme/nvme_sim.c
index 2ba3df9ea6e8..8bdeb4be49f3 100644
--- a/sys/dev/nvme/nvme_sim.c
+++ b/sys/dev/nvme/nvme_sim.c
@@ -268,7 +268,6 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
 		ccb->ccb_h.status = CAM_REQ_CMP;
 		break;
 	case XPT_NVME_IO:		/* Execute the requested I/O operation */
-	case XPT_NVME_ADMIN:		/* or Admin operation */
 		if (ctrlr->is_failed) {
 			/*
 			 * I/O came in while we were failing the drive, so drop
@@ -279,6 +278,18 @@ nvme_sim_action(struct cam_sim *sim, union ccb *ccb)
 		}
 		nvme_sim_nvmeio(sim, ccb);
 		return;			/* no done */
+	case XPT_NVME_ADMIN:		/* or Admin operation */
+		if (ctrlr->is_failed_admin) {
+			/*
+			 * Admin request came in when we can't send admin
+			 * commands, so drop it. Once failure is complete, we'll
+			 * be destroyed.
+			 */
+			ccb->ccb_h.status = CAM_DEV_NOT_THERE;
+			break;
+		}
+		nvme_sim_nvmeio(sim, ccb);
+		return;			/* no done */
 	default:
 		ccb->ccb_h.status = CAM_REQ_INVALID;
 		break;
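
For readers following the change above, here is a minimal, self-contained sketch of the
gating it introduces: I/O submissions are refused as soon as the I/O path has failed,
while admin submissions are refused only when the whole controller, admin queue included,
has failed. This is an illustration only, not driver code; the names sketch_ctrlr,
sketch_fail and sketch_submit are invented for this example and do not exist in
sys/dev/nvme.

/*
 * Simplified userland model of the is_failed / is_failed_admin split.
 * Names are hypothetical; the real driver gates requests in
 * _nvme_qpair_submit_request(), nvme_qpair_timeout() and nvme_sim_action().
 */
#include <stdbool.h>
#include <stdio.h>

struct sketch_ctrlr {
	bool	is_failed;		/* I/O queues can no longer be used */
	bool	is_failed_admin;	/* admin queue has failed as well */
};

/* Mirrors nvme_ctrlr_fail(ctrlr, admin_also): always fail I/O, optionally admin too. */
static void
sketch_fail(struct sketch_ctrlr *ctrlr, bool admin_also)
{
	ctrlr->is_failed = true;
	if (admin_also)
		ctrlr->is_failed_admin = true;
}

/*
 * Mirrors the submit-path check: an admin command is rejected only when the
 * admin queue has failed; an I/O command is rejected on any failure.
 */
static bool
sketch_submit(const struct sketch_ctrlr *ctrlr, bool is_admin)
{
	if (is_admin ? ctrlr->is_failed_admin : ctrlr->is_failed) {
		printf("%s command rejected (controller %sfailed)\n",
		    is_admin ? "admin" : "I/O",
		    is_admin ? "completely " : "");
		return (false);
	}
	printf("%s command accepted\n", is_admin ? "admin" : "I/O");
	return (true);
}

int
main(void)
{
	struct sketch_ctrlr ctrlr = { false, false };

	sketch_submit(&ctrlr, false);	/* I/O accepted */
	sketch_submit(&ctrlr, true);	/* admin accepted */

	sketch_fail(&ctrlr, false);	/* only the I/O path fails */
	sketch_submit(&ctrlr, false);	/* I/O rejected */
	sketch_submit(&ctrlr, true);	/* admin still accepted */

	sketch_fail(&ctrlr, true);	/* total failure */
	sketch_submit(&ctrlr, true);	/* admin rejected too */
	return (0);
}

In the driver itself the same test appears as
"is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed" in
_nvme_qpair_submit_request() and nvme_qpair_timeout(), and as the separate
XPT_NVME_ADMIN case in nvme_sim_action(), which is what lets admin commands
keep flowing to half-failed drives as described in the commit message.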