svn commit: r335150 - stable/11/sys/dev/nvme

Alexander Motin mav at FreeBSD.org
Thu Jun 14 16:51:40 UTC 2018


Author: mav
Date: Thu Jun 14 16:51:39 2018
New Revision: 335150
URL: https://svnweb.freebsd.org/changeset/base/335150

Log:
  MFC r331046 (by imp): Try polling the qpairs on timeout.
  
  On some systems, we're getting timeouts when we use multiple queues on
  drives that work perfectly well on other systems. On a hunch, Jim
  Harris suggested I poll the completion queue when we get a timeout.
  This patch polls the completion queue if no fatal status is
  indicated. If the queue had pending I/O, we complete those requests
  and return. Otherwise, if aborts are enabled and there is no fatal
  status, we abort the command and return. Otherwise, we reset the card.
  
  This may clear up the problem, or we may see it result in lots of
  timeouts and a performance problem. Either way, we'll know the next
  step. We may also need to pay attention to the fatal status bit
  of the controller.

Modified:
  stable/11/sys/dev/nvme/nvme_private.h
  stable/11/sys/dev/nvme/nvme_qpair.c

Modified: stable/11/sys/dev/nvme/nvme_private.h
==============================================================================
--- stable/11/sys/dev/nvme/nvme_private.h	Thu Jun 14 16:19:05 2018	(r335149)
+++ stable/11/sys/dev/nvme/nvme_private.h	Thu Jun 14 16:51:39 2018	(r335150)
@@ -424,7 +424,7 @@ int	nvme_qpair_construct(struct nvme_qpair *qpair, uin
 			     struct nvme_controller *ctrlr);
 void	nvme_qpair_submit_tracker(struct nvme_qpair *qpair,
 				  struct nvme_tracker *tr);
-void	nvme_qpair_process_completions(struct nvme_qpair *qpair);
+bool	nvme_qpair_process_completions(struct nvme_qpair *qpair);
 void	nvme_qpair_submit_request(struct nvme_qpair *qpair,
 				  struct nvme_request *req);
 void	nvme_qpair_reset(struct nvme_qpair *qpair);

Modified: stable/11/sys/dev/nvme/nvme_qpair.c
==============================================================================
--- stable/11/sys/dev/nvme/nvme_qpair.c	Thu Jun 14 16:19:05 2018	(r335149)
+++ stable/11/sys/dev/nvme/nvme_qpair.c	Thu Jun 14 16:51:39 2018	(r335150)
@@ -389,11 +389,12 @@ nvme_qpair_manual_complete_request(struct nvme_qpair *
 	nvme_free_request(req);
 }
 
-void
+bool
 nvme_qpair_process_completions(struct nvme_qpair *qpair)
 {
 	struct nvme_tracker	*tr;
 	struct nvme_completion	*cpl;
+	int done = 0;
 
 	qpair->num_intr_handler_calls++;
 
@@ -404,7 +405,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpai
 		 *  associated with this interrupt will get retried when the
 		 *  reset is complete.
 		 */
-		return;
+		return (false);
 
 	while (1) {
 		cpl = &qpair->cpl[qpair->cq_head];
@@ -417,6 +418,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpai
 		if (tr != NULL) {
 			nvme_qpair_complete_tracker(qpair, tr, cpl, TRUE);
 			qpair->sq_head = cpl->sqhd;
+			done++;
 		} else {
 			nvme_printf(qpair->ctrlr, 
 			    "cpl does not map to outstanding cmd\n");
@@ -432,6 +434,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpai
 		nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl,
 		    qpair->cq_head);
 	}
+	return (done != 0);
 }
 
 static void
@@ -685,18 +688,29 @@ nvme_timeout(void *arg)
 	struct nvme_controller	*ctrlr = qpair->ctrlr;
 	union csts_register	csts;
 
-	/* Read csts to get value of cfs - controller fatal status. */
+	/*
+	 * Read csts to get value of cfs - controller fatal status.
+	 * If no fatal status, try to call the completion routine, and
+	 * if completes transactions, report a missed interrupt and
+	 * return (this may need to be rate limited). Otherwise, if
+	 * aborts are enabled and the controller is not reporting
+	 * fatal status, abort the command. Otherwise, just reset the
+	 * controller and hope for the best.
+	 */
 	csts.raw = nvme_mmio_read_4(ctrlr, csts);
-
+	if (csts.bits.cfs == 0 && nvme_qpair_process_completions(qpair)) {
+		nvme_printf(ctrlr, "Missing interrupt\n");
+		return;
+	}
 	if (ctrlr->enable_aborts && csts.bits.cfs == 0) {
-		/*
-		 * If aborts are enabled, only use them if the controller is
-		 *  not reporting fatal status.
-		 */
+		nvme_printf(ctrlr, "Aborting command due to a timeout.\n");
 		nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
 		    nvme_abort_complete, tr);
-	} else
+	} else {
+		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
+		    csts.bits.cfs ? " and fatal error status" : "");
 		nvme_ctrlr_reset(ctrlr);
+	}
 }
 
 void


More information about the svn-src-all mailing list