git: 9bbd0a7ca999 - stable/13 - nvme: Use shared timeout rather than timeout per transaction

From: Alexander Motin <mav_at_FreeBSD.org>
Date: Fri, 21 Jan 2022 02:26:50 UTC
The branch stable/13 has been updated by mav:

URL: https://cgit.FreeBSD.org/src/commit/?id=9bbd0a7ca9996d3eee2162ab01ad81d4b15071d8

commit 9bbd0a7ca9996d3eee2162ab01ad81d4b15071d8
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2021-09-23 22:31:32 +0000
Commit:     Alexander Motin <mav@FreeBSD.org>
CommitDate: 2022-01-21 02:07:30 +0000

    nvme: Use shared timeout rather than timeout per transaction
    
    Keep track of the approximate time commands are 'due' and the next
    deadline for a command. Twice a second, wake up to see if any commands
    have entered timeout. If so, quiesce and then enter a recovery mode
    half the timeout further in the future to allow the ISR to
    complete. Once we exit recovery mode, we go back to operations as
    normal.
    
    Sponsored by:           Netflix
    Differential Revision:  https://reviews.freebsd.org/D28583
    
    (cherry picked from commit 502dc84a8b6703e7c0626739179a3cdffdd22d81)
---
 sys/dev/nvme/nvme_ctrlr.c   |   8 +-
 sys/dev/nvme/nvme_private.h |  16 +++-
 sys/dev/nvme/nvme_qpair.c   | 173 ++++++++++++++++++++++++++++----------------
 3 files changed, 131 insertions(+), 66 deletions(-)

diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 833bf328584a..39cde3f065dd 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -232,7 +232,8 @@ nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
 	mtx_lock(&ctrlr->lock);
 	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
 	mtx_unlock(&ctrlr->lock);
-	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
+	if (!ctrlr->is_dying)
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
 }
 
 static void
@@ -435,7 +436,8 @@ nvme_ctrlr_reset(struct nvme_controller *ctrlr)
 		 */
 		return;
 
-	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
+	if (!ctrlr->is_dying)
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
 }
 
 static int
@@ -1481,6 +1483,8 @@ nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 {
 	int	gone, i;
 
+	ctrlr->is_dying = true;
+
 	if (ctrlr->resource == NULL)
 		goto nores;
 	if (!mtx_initialized(&ctrlr->adminq.lock))
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index fba1b406e9ce..02cecaf03a97 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -151,7 +151,7 @@ struct nvme_tracker {
 	TAILQ_ENTRY(nvme_tracker)	tailq;
 	struct nvme_request		*req;
 	struct nvme_qpair		*qpair;
-	struct callout			timer;
+	sbintime_t			deadline;
 	bus_dmamap_t			payload_dma_map;
 	uint16_t			cid;
 
@@ -159,6 +159,12 @@ struct nvme_tracker {
 	bus_addr_t			prp_bus_addr;
 };
 
+enum nvme_recovery {
+	RECOVERY_NONE = 0,		/* Normal operations */
+	RECOVERY_START,			/* Deadline has passed, start recovering */
+	RECOVERY_RESET,			/* This pass, initiate reset of controller */
+	RECOVERY_WAITING,		/* waiting for the reset to complete */
+};
 struct nvme_qpair {
 	struct nvme_controller	*ctrlr;
 	uint32_t		id;
@@ -170,6 +176,11 @@ struct nvme_qpair {
 	struct resource		*res;
 	void 			*tag;
 
+	struct callout		timer;
+	sbintime_t		deadline;
+	bool			timer_armed;
+	enum nvme_recovery	recovery_state;
+
 	uint32_t		num_entries;
 	uint32_t		num_trackers;
 	uint32_t		sq_tdbl_off;
@@ -201,8 +212,6 @@ struct nvme_qpair {
 
 	struct nvme_tracker	**act_tr;
 
-	bool			is_enabled;
-
 	struct mtx		lock __aligned(CACHE_LINE_SIZE);
 
 } __aligned(CACHE_LINE_SIZE);
@@ -305,6 +314,7 @@ struct nvme_controller {
 	uint32_t			notification_sent;
 
 	bool				is_failed;
+	bool				is_dying;
 	STAILQ_HEAD(, nvme_request)	fail_req;
 
 	/* Host Memory Buffer */
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index 71718acff66f..7a17a057f319 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -452,7 +452,6 @@ nvme_qpair_complete_tracker(struct nvme_tracker *tr,
 	}
 
 	mtx_lock(&qpair->lock);
-	callout_stop(&tr->timer);
 
 	if (retry) {
 		req->retries++;
@@ -544,7 +543,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 	 * progress.  Ignore the interrupt - any I/O that was associated with
 	 * this interrupt will get retried when the reset is complete.
 	 */
-	if (!qpair->is_enabled)
+	if (qpair->recovery_state != RECOVERY_NONE)
 		return (false);
 
 	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
@@ -739,6 +738,10 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
 	prpmem_phys = queuemem_phys + cmdsz + cplsz;
 
+	callout_init(&qpair->timer, 1);
+	qpair->timer_armed = false;
+	qpair->recovery_state = RECOVERY_NONE;
+
 	/*
 	 * Calcuate the stride of the doorbell register. Many emulators set this
 	 * value to correspond to a cache line. However, some hardware has set
@@ -776,7 +779,6 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
 		bus_dmamap_create(qpair->dma_tag_payload, 0,
 		    &tr->payload_dma_map);
-		callout_init(&tr->timer, 1);
 		tr->cid = i;
 		tr->qpair = qpair;
 		tr->prp = (uint64_t *)prp_list;
@@ -835,6 +837,8 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
 {
 	struct nvme_tracker	*tr;
 
+	callout_drain(&qpair->timer);
+
 	if (qpair->tag) {
 		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
 		qpair->tag = NULL;
@@ -914,65 +918,101 @@ nvme_io_qpair_destroy(struct nvme_qpair *qpair)
 }
 
 static void
-nvme_abort_complete(void *arg, const struct nvme_completion *status)
+nvme_qpair_timeout(void *arg)
 {
-	struct nvme_tracker	*tr = arg;
+	struct nvme_qpair	*qpair = arg;
+	struct nvme_controller	*ctrlr = qpair->ctrlr;
+	struct nvme_tracker	*tr;
+	struct nvme_tracker	*tr_temp;
+	sbintime_t		now;
+	bool			idle;
+	uint32_t		csts;
+	uint8_t			cfs;
 
-	/*
-	 * If cdw0 == 1, the controller was not able to abort the command
-	 *  we requested.  We still need to check the active tracker array,
-	 *  to cover race where I/O timed out at same time controller was
-	 *  completing the I/O.
-	 */
-	if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
+	mtx_lock(&qpair->lock);
+	idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+again:
+	switch (qpair->recovery_state) {
+	case RECOVERY_NONE:
+		if (idle)
+			break;
+		now = getsbinuptime();
+		TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
+			if (now > tr->deadline && tr->deadline != 0) {
+				/*
+				 * We're now passed our earliest deadline. We
+				 * need to do expensive things to cope, but next
+				 * time. Flag that and close the door to any
+				 * further processing.
+				 */
+				qpair->recovery_state = RECOVERY_START;
+				nvme_printf(ctrlr, "RECOVERY_START %jd vs %jd\n",
+				    (uintmax_t)now, (uintmax_t)tr->deadline);
+				break;
+			}
+		}
+		break;
+	case RECOVERY_START:
 		/*
-		 * An I/O has timed out, and the controller was unable to
-		 *  abort it for some reason.  Construct a fake completion
-		 *  status, and then complete the I/O's tracker manually.
+		 * Read csts to get value of cfs - controller fatal status.
+		 * If no fatal status, try to call the completion routine, and
+		 * if completes transactions, report a missed interrupt and
+		 * return (this may need to be rate limited). Otherwise, if
+		 * aborts are enabled and the controller is not reporting
+		 * fatal status, abort the command. Otherwise, just reset the
+		 * controller and hope for the best.
 		 */
-		nvme_printf(tr->qpair->ctrlr,
-		    "abort command failed, aborting command manually\n");
-		nvme_qpair_manual_complete_tracker(tr,
-		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
+		csts = nvme_mmio_read_4(ctrlr, csts);
+		cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
+		if (cfs) {
+			nvme_printf(ctrlr, "Controller in fatal status, resetting\n");
+			qpair->recovery_state = RECOVERY_RESET;
+			goto again;
+		}
+		mtx_unlock(&qpair->lock);
+		if (nvme_qpair_process_completions(qpair)) {
+			nvme_printf(ctrlr, "Completions present in output without an interrupt\n");
+			qpair->recovery_state = RECOVERY_NONE;
+		} else {
+			nvme_printf(ctrlr, "timeout with nothing complete, resetting\n");
+			qpair->recovery_state = RECOVERY_RESET;
+			mtx_lock(&qpair->lock);
+			goto again;
+		}
+		mtx_lock(&qpair->lock);
+		break;
+	case RECOVERY_RESET:
+		/*
+		 * If we get here due to a possible surprise hot-unplug event,
+		 * then we let nvme_ctrlr_reset confirm and fail the
+		 * controller.
+		 */
+		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
+		    cfs ? " and fatal error status" : "");
+		nvme_printf(ctrlr, "RECOVERY_WAITING\n");
+		qpair->recovery_state = RECOVERY_WAITING;
+		nvme_ctrlr_reset(ctrlr);
+		break;
+	case RECOVERY_WAITING:
+		nvme_printf(ctrlr, "waiting\n");
+		break;
 	}
-}
-
-static void
-nvme_timeout(void *arg)
-{
-	struct nvme_tracker	*tr = arg;
-	struct nvme_qpair	*qpair = tr->qpair;
-	struct nvme_controller	*ctrlr = qpair->ctrlr;
-	uint32_t		csts;
-	uint8_t			cfs;
 
 	/*
-	 * Read csts to get value of cfs - controller fatal status.
-	 * If no fatal status, try to call the completion routine, and
-	 * if completes transactions, report a missed interrupt and
-	 * return (this may need to be rate limited). Otherwise, if
-	 * aborts are enabled and the controller is not reporting
-	 * fatal status, abort the command. Otherwise, just reset the
-	 * controller and hope for the best.
+	 * Rearm the timeout.
 	 */
-	csts = nvme_mmio_read_4(ctrlr, csts);
-	cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
-	if (cfs == 0 && nvme_qpair_process_completions(qpair)) {
-		nvme_printf(ctrlr, "Missing interrupt\n");
-		return;
-	}
-	if (ctrlr->enable_aborts && cfs == 0) {
-		nvme_printf(ctrlr, "Aborting command due to a timeout.\n");
-		nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
-		    nvme_abort_complete, tr);
+	if (!idle) {
+		callout_schedule(&qpair->timer, hz / 2);
 	} else {
-		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
-		    (csts == NVME_GONE) ? " and possible hot unplug" :
-		    (cfs ? " and fatal error status" : ""));
-		nvme_ctrlr_reset(ctrlr);
+		qpair->timer_armed = false;
 	}
+	mtx_unlock(&qpair->lock);
 }
 
+/*
+ * Submit the tracker to the hardware. Must already be in the
+ * outstanding queue when called.
+ */
 void
 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
 {
@@ -989,12 +1029,17 @@ nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
 
 	if (req->timeout) {
 		if (req->cb_fn == nvme_completion_poll_cb)
-			timeout = hz;
+			timeout = 1;
 		else
-			timeout = ctrlr->timeout_period * hz;
-		callout_reset_on(&tr->timer, timeout, nvme_timeout, tr,
-		    qpair->cpu);
-	}
+			timeout = ctrlr->timeout_period;
+		tr->deadline = getsbinuptime() + timeout * SBT_1S;
+		if (!qpair->timer_armed) {
+			qpair->timer_armed = true;
+			callout_reset_on(&qpair->timer, hz / 2,
+			    nvme_qpair_timeout, qpair, qpair->cpu);
+		}
+	} else
+		tr->deadline = SBT_MAX;
 
 	/* Copy the command from the tracker to the submission queue. */
 	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
@@ -1069,7 +1114,7 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 	tr = TAILQ_FIRST(&qpair->free_tr);
 	req->qpair = qpair;
 
-	if (tr == NULL || !qpair->is_enabled) {
+	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
 		/*
 		 * No tracker is available, or the qpair is disabled due to
 		 *  an in-progress controller-level reset or controller
@@ -1096,6 +1141,8 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 
 	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
 	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
+	if (!qpair->timer_armed)
+		tr->deadline = SBT_MAX;
 	tr->req = req;
 
 	switch (req->type) {
@@ -1164,8 +1211,9 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
+	mtx_assert(&qpair->lock, MA_OWNED);
 
-	qpair->is_enabled = true;
+	qpair->recovery_state = RECOVERY_NONE;
 }
 
 void
@@ -1208,7 +1256,9 @@ nvme_admin_qpair_enable(struct nvme_qpair *qpair)
 		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
 	}
 
+	mtx_lock(&qpair->lock);
 	nvme_qpair_enable(qpair);
+	mtx_unlock(&qpair->lock);
 }
 
 void
@@ -1251,12 +1301,13 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 static void
 nvme_qpair_disable(struct nvme_qpair *qpair)
 {
-	struct nvme_tracker *tr;
+	struct nvme_tracker	*tr, *tr_temp;
 
-	qpair->is_enabled = false;
 	mtx_lock(&qpair->lock);
-	TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq)
-		callout_stop(&tr->timer);
+	qpair->recovery_state = RECOVERY_WAITING;
+	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
+		tr->deadline = SBT_MAX;
+	}
 	mtx_unlock(&qpair->lock);
 }