git: 9cd7b624732c - main - nvme: Eliminate RECOVERY_FAILED state

From: Warner Losh <imp_at_FreeBSD.org>
Date: Tue, 10 Oct 2023 22:26:19 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=9cd7b624732c3b675178b02b7447272f67a3203d

commit 9cd7b624732c3b675178b02b7447272f67a3203d
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2023-10-10 17:13:16 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2023-10-10 22:13:57 +0000

    nvme: Eliminate RECOVERY_FAILED state
    
    While it seemed like a good idea to have this state, we can do
    everything we wanted with the state by checking ctrlr->is_failed since
    that's set before we start failing the qpairs. Add some comments about
    racing when we're failing the controller, though in practice I'm not
    sure that kind of race could even be lost.
    
    Sponsored by:           Netflix
    Reviewed by:            chuck, gallatin, jhb
    Differential Revision:  https://reviews.freebsd.org/D42051
---
 sys/dev/nvme/nvme_private.h |  1 -
 sys/dev/nvme/nvme_qpair.c   | 43 ++++++++++++++++++++++++++++---------------
 2 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index c573fbfd572f..f6ad51795adb 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -150,7 +150,6 @@ struct nvme_tracker {
 enum nvme_recovery {
 	RECOVERY_NONE = 0,		/* Normal operations */
 	RECOVERY_WAITING,		/* waiting for the reset to complete */
-	RECOVERY_FAILED,		/* We have failed, no more I/O */
 };
 struct nvme_qpair {
 	struct nvme_controller	*ctrlr;
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index cd0057f444b8..6d9d337e76a5 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -1027,6 +1027,18 @@ nvme_qpair_timeout(void *arg)
 
 	mtx_assert(&qpair->recovery, MA_OWNED);
 
+	/*
+	 * If the controller is failed, then stop polling. This ensures that any
+	 * failure processing that races with the qpair timeout will fail
+	 * safely.
+	 */
+	if (qpair->ctrlr->is_failed) {
+		nvme_printf(qpair->ctrlr,
+		    "Failed controller, stopping watchdog timeout.\n");
+		qpair->timer_armed = false;
+		return;
+	}
+
 	switch (qpair->recovery_state) {
 	case RECOVERY_NONE:
 		/*
@@ -1120,11 +1132,6 @@ nvme_qpair_timeout(void *arg)
 			nvme_printf(ctrlr, "Waiting for reset to complete\n");
 		idle = false;		/* We want to keep polling */
 		break;
-	case RECOVERY_FAILED:
-		KASSERT(qpair->ctrlr->is_failed,
-		    ("Recovery state failed w/o failed controller\n"));
-		idle = true;			/* nothing to monitor */
-		break;
 	}
 
 	/*
@@ -1244,11 +1251,21 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
 		/*
 		 * No tracker is available, or the qpair is disabled due to an
-		 * in-progress controller-level reset or controller failure. If
-		 * we lose the race with recovery_state, then we may add an
-		 * extra request to the queue which will be resubmitted later.
-		 * We only set recovery_state to NONE with qpair->lock also
-		 * held.
+		 * in-progress controller-level reset. If we lose the race with
+		 * recovery_state, then we may add an extra request to the queue
+		 * which will be resubmitted later.  We only set recovery_state
+		 * to NONE with qpair->lock also held, so if we observe that the
+		 * state is not NONE, we know it can't transition to NONE below
+		 * when we've submitted the request to hardware.
+		 *
+		 * Also, as part of the failure process, we set recovery_state
+		 * to RECOVERY_WAITING, so we check here to see if we've failed
+		 * the controller. We set it before we call the qpair_fail
+		 * functions, which take out the lock before messing with
+		 * queued_req. Since we hold that lock, we know it's safe to
+		 * either fail directly, or queue the failure should is_failed
+		 * be stale. If we lose the race reading is_failed, then
+		 * nvme_qpair_fail will fail the queued request.
 		 */
 
 		if (qpair->ctrlr->is_failed) {
@@ -1314,7 +1331,7 @@ nvme_qpair_enable(struct nvme_qpair *qpair)
 		mtx_assert(&qpair->recovery, MA_OWNED);
 	if (mtx_initialized(&qpair->lock))
 		mtx_assert(&qpair->lock, MA_OWNED);
-	KASSERT(qpair->recovery_state != RECOVERY_FAILED,
+	KASSERT(!qpair->ctrlr->is_failed,
 	    ("Enabling a failed qpair\n"));
 
 	qpair->recovery_state = RECOVERY_NONE;
@@ -1471,10 +1488,6 @@ nvme_qpair_fail(struct nvme_qpair *qpair)
 	if (!mtx_initialized(&qpair->lock))
 		return;
 
-	mtx_lock(&qpair->recovery);
-	qpair->recovery_state = RECOVERY_FAILED;
-	mtx_unlock(&qpair->recovery);
-
 	mtx_lock(&qpair->lock);
 
 	if (!STAILQ_EMPTY(&qpair->queued_req)) {