svn commit: r349849 - in head/sys/dev: mpr mps

Warner Losh imp at FreeBSD.org
Mon Jul 8 20:20:04 UTC 2019


Author: imp
Date: Mon Jul  8 20:20:01 2019
New Revision: 349849
URL: https://svnweb.freebsd.org/changeset/base/349849

Log:
  Fix bugs in recovery path and improve cm tracking
  
  Eliminate the TIMEDOUT state. This state really conveyed two different
  concepts: I timed out during recovery (and my command got put on the
  recovery queue), and I timed out diring discovery (which doesn't).
  Separate those two concepts into two flags. Use the TIMEDOUT flag to
  fail requests as timed out. Use the on queue flag to remove them from
  the queue.
  
  In mps_intr_locked for MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY message
  type, when completing commands, ignore the ones that are not in state
  INQUEUE. They were already completed as part of the recovery
  process. When we complete them twice, we wind up with entries on the
  free queue that are marked as busy, trigging asserts.
  
  Reviewed by: scottl (earlier version, just for mpr)
  Differential Revision: https://reviews.freebsd.org/D20785

Modified:
  head/sys/dev/mpr/mpr.c
  head/sys/dev/mpr/mpr_sas.c
  head/sys/dev/mpr/mpr_sas_lsi.c
  head/sys/dev/mpr/mprvar.h
  head/sys/dev/mps/mps.c
  head/sys/dev/mps/mps_sas.c
  head/sys/dev/mps/mps_sas_lsi.c
  head/sys/dev/mps/mpsvar.h

Modified: head/sys/dev/mpr/mpr.c
==============================================================================
--- head/sys/dev/mpr/mpr.c	Mon Jul  8 20:01:28 2019	(r349848)
+++ head/sys/dev/mpr/mpr.c	Mon Jul  8 20:20:01 2019	(r349849)
@@ -2617,12 +2617,17 @@ mpr_intr_locked(void *data)
 			} else {
 				cm = &sc->commands[
 				    le16toh(desc->AddressReply.SMID)];
-				if (cm->cm_state != MPR_CM_STATE_TIMEDOUT)
-					cm->cm_state = MPR_CM_STATE_BUSY;
-				cm->cm_reply = reply;
-				cm->cm_reply_data =
-				    le32toh(desc->AddressReply.
-				    ReplyFrameAddress);
+				if (cm->cm_state == MPR_CM_STATE_INQUEUE) {
+					cm->cm_reply = reply;
+					cm->cm_reply_data =
+					    le32toh(desc->AddressReply.
+						ReplyFrameAddress);
+				} else {
+					mpr_dprint(sc, MPR_RECOVERY,
+					    "Bad state for ADDRESS_REPLY status,"
+					    " ignoring state %d cm %p\n",
+					    cm->cm_state, cm);
+				}
 			}
 			break;
 		}

Modified: head/sys/dev/mpr/mpr_sas.c
==============================================================================
--- head/sys/dev/mpr/mpr_sas.c	Mon Jul  8 20:01:28 2019	(r349848)
+++ head/sys/dev/mpr/mpr_sas.c	Mon Jul  8 20:20:01 2019	(r349849)
@@ -1692,7 +1692,7 @@ mprsas_scsiio_timeout(void *data)
 	 * and been re-used, though this is unlikely.
 	 */
 	mpr_intr_locked(sc);
-	if (cm->cm_state != MPR_CM_STATE_INQUEUE) {
+	if (cm->cm_flags & MPR_CM_FLAGS_ON_RECOVERY) {
 		mprsas_log_command(cm, MPR_XINFO,
 		    "SCSI command %p almost timed out\n", cm);
 		return;
@@ -1721,7 +1721,7 @@ mprsas_scsiio_timeout(void *data)
 	 * operational.  if not, do a diag reset.
 	 */
 	mprsas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT);
-	cm->cm_state = MPR_CM_STATE_TIMEDOUT;
+	cm->cm_flags |= MPR_CM_FLAGS_ON_RECOVERY | MPR_CM_FLAGS_TIMEDOUT;
 	TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery);
 
 	if (targ->tm != NULL) {
@@ -2529,9 +2529,11 @@ mprsas_scsiio_complete(struct mpr_softc *sc, struct mp
 	TAILQ_REMOVE(&cm->cm_targ->commands, cm, cm_link);
 	ccb->ccb_h.status &= ~(CAM_STATUS_MASK | CAM_SIM_QUEUED);
 
-	if (cm->cm_state == MPR_CM_STATE_TIMEDOUT) {
+	if (cm->cm_flags & MPR_CM_FLAGS_ON_RECOVERY) {
 		TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery);
-		cm->cm_state = MPR_CM_STATE_BUSY;
+		KASSERT(cm->cm_state == MPR_CM_STATE_BUSY,
+		    ("Not busy for CM_FLAGS_TIMEDOUT: %d\n", cm->cm_state));
+		cm->cm_flags &= ~MPR_CM_FLAGS_ON_RECOVERY;
 		if (cm->cm_reply != NULL)
 			mprsas_log_command(cm, MPR_RECOVERY,
 			    "completed timedout cm %p ccb %p during recovery "
@@ -2817,7 +2819,7 @@ mprsas_scsiio_complete(struct mpr_softc *sc, struct mp
 		 * retry counter), the only difference is what gets printed
 		 * on the console.
 		 */
-		if (cm->cm_state == MPR_CM_STATE_TIMEDOUT)
+		if (cm->cm_flags & MPR_CM_FLAGS_TIMEDOUT)
 			mprsas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT);
 		else
 			mprsas_set_ccbstatus(ccb, CAM_REQ_ABORTED);

Modified: head/sys/dev/mpr/mpr_sas_lsi.c
==============================================================================
--- head/sys/dev/mpr/mpr_sas_lsi.c	Mon Jul  8 20:01:28 2019	(r349848)
+++ head/sys/dev/mpr/mpr_sas_lsi.c	Mon Jul  8 20:20:01 2019	(r349849)
@@ -1017,7 +1017,7 @@ mprsas_add_device(struct mpr_softc *sc, u16 handle, u8
 		cm = &sc->commands[i];
 		if (cm->cm_flags & MPR_CM_FLAGS_SATA_ID_TIMEOUT) {
 			targ->timeouts++;
-			cm->cm_state = MPR_CM_STATE_TIMEDOUT;
+			cm->cm_flags |= MPR_CM_FLAGS_TIMEDOUT;
 
 			if ((targ->tm = mprsas_alloc_tm(sc)) != NULL) {
 				mpr_dprint(sc, MPR_INFO, "%s: sending Target "
@@ -1244,9 +1244,11 @@ mprsas_ata_id_timeout(struct mpr_softc *sc, struct mpr
 	/*
 	 * The Abort Task cannot be sent from here because the driver has not
 	 * completed setting up targets.  Instead, the command is flagged so
-	 * that special handling will be used to send the abort.
+	 * that special handling will be used to send the abort. Now that
+	 * this command has timed out, it's no longer in the queue.
 	 */
 	cm->cm_flags |= MPR_CM_FLAGS_SATA_ID_TIMEOUT;
+	cm->cm_state = MPR_CM_STATE_BUSY;
 }
 
 static int

Modified: head/sys/dev/mpr/mprvar.h
==============================================================================
--- head/sys/dev/mpr/mprvar.h	Mon Jul  8 20:01:28 2019	(r349848)
+++ head/sys/dev/mpr/mprvar.h	Mon Jul  8 20:20:01 2019	(r349849)
@@ -275,11 +275,12 @@ struct mpr_command {
 #define	MPR_CM_FLAGS_ERROR_MASK		MPR_CM_FLAGS_CHAIN_FAILED
 #define	MPR_CM_FLAGS_USE_CCB		(1 << 9)
 #define	MPR_CM_FLAGS_SATA_ID_TIMEOUT	(1 << 10)
+#define MPR_CM_FLAGS_ON_RECOVERY	(1 << 12)
+#define MPR_CM_FLAGS_TIMEDOUT		(1 << 13)
 	u_int				cm_state;
 #define MPR_CM_STATE_FREE		0
 #define MPR_CM_STATE_BUSY		1
-#define MPR_CM_STATE_TIMEDOUT		2
-#define MPR_CM_STATE_INQUEUE		3
+#define MPR_CM_STATE_INQUEUE		2
 	bus_dmamap_t			cm_dmamap;
 	struct scsi_sense_data		*cm_sense;
 	uint64_t			*nvme_error_response;

Modified: head/sys/dev/mps/mps.c
==============================================================================
--- head/sys/dev/mps/mps.c	Mon Jul  8 20:01:28 2019	(r349848)
+++ head/sys/dev/mps/mps.c	Mon Jul  8 20:20:01 2019	(r349849)
@@ -2479,13 +2479,24 @@ mps_intr_locked(void *data)
 					    (MPI2_EVENT_NOTIFICATION_REPLY *)
 					    reply);
 			} else {
+				/*
+				 * Ignore commands not in INQUEUE state
+				 * since they've already been completed
+				 * via another path.
+				 */
 				cm = &sc->commands[
 				    le16toh(desc->AddressReply.SMID)];
-				if (cm->cm_state != MPS_CM_STATE_TIMEDOUT)
+				if (cm->cm_state == MPS_CM_STATE_INQUEUE) {
 					cm->cm_state = MPS_CM_STATE_BUSY;
-				cm->cm_reply = reply;
-				cm->cm_reply_data = le32toh(
-				    desc->AddressReply.ReplyFrameAddress);
+					cm->cm_reply = reply;
+					cm->cm_reply_data = le32toh(
+					    desc->AddressReply.ReplyFrameAddress);
+				} else {
+					mps_dprint(sc, MPS_RECOVERY,
+					    "Bad state for ADDRESS_REPLY status,"
+					    " ignoring state %d cm %p\n",
+					    cm->cm_state, cm);
+				}
 			}
 			break;
 		}

Modified: head/sys/dev/mps/mps_sas.c
==============================================================================
--- head/sys/dev/mps/mps_sas.c	Mon Jul  8 20:01:28 2019	(r349848)
+++ head/sys/dev/mps/mps_sas.c	Mon Jul  8 20:20:01 2019	(r349849)
@@ -1602,7 +1602,7 @@ mpssas_scsiio_timeout(void *data)
 	 * and been re-used, though this is unlikely.
 	 */
 	mps_intr_locked(sc);
-	if (cm->cm_state != MPS_CM_STATE_INQUEUE) {
+	if (cm->cm_flags & MPS_CM_FLAGS_ON_RECOVERY) {
 		mpssas_log_command(cm, MPS_XINFO,
 		    "SCSI command %p almost timed out\n", cm);
 		return;
@@ -1626,7 +1626,7 @@ mpssas_scsiio_timeout(void *data)
 	 * operational.  if not, do a diag reset.
 	 */
 	mpssas_set_ccbstatus(cm->cm_ccb, CAM_CMD_TIMEOUT);
-	cm->cm_state = MPS_CM_STATE_TIMEDOUT;
+	cm->cm_flags |= MPS_CM_FLAGS_ON_RECOVERY | MPS_CM_FLAGS_TIMEDOUT;
 	TAILQ_INSERT_TAIL(&targ->timedout_commands, cm, cm_recovery);
 
 	if (targ->tm != NULL) {
@@ -2040,9 +2040,11 @@ mpssas_scsiio_complete(struct mps_softc *sc, struct mp
 		biotrack(ccb->csio.bio, __func__);
 #endif
 
-	if (cm->cm_state == MPS_CM_STATE_TIMEDOUT) {
+	if (cm->cm_flags & MPS_CM_FLAGS_ON_RECOVERY) {
 		TAILQ_REMOVE(&cm->cm_targ->timedout_commands, cm, cm_recovery);
-		cm->cm_state = MPS_CM_STATE_BUSY;
+		KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
+		    ("Not busy for CM_FLAGS_TIMEDOUT: %d\n", cm->cm_state));
+		cm->cm_flags &= ~MPS_CM_FLAGS_ON_RECOVERY;
 		if (cm->cm_reply != NULL)
 			mpssas_log_command(cm, MPS_RECOVERY,
 			    "completed timedout cm %p ccb %p during recovery "
@@ -2325,7 +2327,7 @@ mpssas_scsiio_complete(struct mps_softc *sc, struct mp
 		 * retry counter), the only difference is what gets printed
 		 * on the console.
 		 */
-		if (cm->cm_state == MPS_CM_STATE_TIMEDOUT)
+		if (cm->cm_flags & MPS_CM_FLAGS_TIMEDOUT)
 			mpssas_set_ccbstatus(ccb, CAM_CMD_TIMEOUT);
 		else
 			mpssas_set_ccbstatus(ccb, CAM_REQ_ABORTED);

Modified: head/sys/dev/mps/mps_sas_lsi.c
==============================================================================
--- head/sys/dev/mps/mps_sas_lsi.c	Mon Jul  8 20:01:28 2019	(r349848)
+++ head/sys/dev/mps/mps_sas_lsi.c	Mon Jul  8 20:20:01 2019	(r349849)
@@ -790,7 +790,7 @@ mpssas_add_device(struct mps_softc *sc, u16 handle, u8
 		cm = &sc->commands[i];
 		if (cm->cm_flags & MPS_CM_FLAGS_SATA_ID_TIMEOUT) {
 			targ->timeouts++;
-			cm->cm_state = MPS_CM_STATE_TIMEDOUT;
+			cm->cm_flags |= MPS_CM_FLAGS_TIMEDOUT;
 
 			if ((targ->tm = mpssas_alloc_tm(sc)) != NULL) {
 				mps_dprint(sc, MPS_INFO, "%s: sending Target "
@@ -1017,9 +1017,11 @@ mpssas_ata_id_timeout(struct mps_softc *sc, struct mps
 	/*
 	 * The Abort Task cannot be sent from here because the driver has not
 	 * completed setting up targets.  Instead, the command is flagged so
-	 * that special handling will be used to send the abort.
+	 * that special handling will be used to send the abort. Now that
+	 * this command has timed out, it's no longer in the queue.
 	 */
 	cm->cm_flags |= MPS_CM_FLAGS_SATA_ID_TIMEOUT;
+	cm->cm_state = MPS_CM_STATE_BUSY;
 }
 
 static int

Modified: head/sys/dev/mps/mpsvar.h
==============================================================================
--- head/sys/dev/mps/mpsvar.h	Mon Jul  8 20:01:28 2019	(r349848)
+++ head/sys/dev/mps/mpsvar.h	Mon Jul  8 20:20:01 2019	(r349849)
@@ -242,11 +242,12 @@ struct mps_command {
 #define	MPS_CM_FLAGS_ERROR_MASK		MPS_CM_FLAGS_CHAIN_FAILED
 #define	MPS_CM_FLAGS_USE_CCB		(1 << 10)
 #define	MPS_CM_FLAGS_SATA_ID_TIMEOUT	(1 << 11)
+#define MPS_CM_FLAGS_ON_RECOVERY	(1 << 12)
+#define MPS_CM_FLAGS_TIMEDOUT		(1 << 13)
 	u_int				cm_state;
 #define MPS_CM_STATE_FREE		0
 #define MPS_CM_STATE_BUSY		1
-#define MPS_CM_STATE_TIMEDOUT		2
-#define MPS_CM_STATE_INQUEUE		3
+#define MPS_CM_STATE_INQUEUE		2
 	bus_dmamap_t			cm_dmamap;
 	struct scsi_sense_data		*cm_sense;
 	TAILQ_HEAD(, mps_chain)		cm_chain_list;
@@ -545,7 +546,8 @@ mps_free_command(struct mps_softc *sc, struct mps_comm
 {
 	struct mps_chain *chain, *chain_temp;
 
-	KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, ("state not busy\n"));
+	KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
+	    ("state not busy: %d\n", cm->cm_state));
 
 	if (cm->cm_reply != NULL)
 		mps_free_reply(sc, cm->cm_reply_data);
@@ -581,7 +583,7 @@ mps_alloc_command(struct mps_softc *sc)
 		return (NULL);
 
 	KASSERT(cm->cm_state == MPS_CM_STATE_FREE,
-	    ("mps: Allocating busy command\n"));
+	    ("mps: Allocating busy command: %d\n", cm->cm_state));
 
 	TAILQ_REMOVE(&sc->req_list, cm, cm_link);
 	cm->cm_state = MPS_CM_STATE_BUSY;
@@ -594,7 +596,8 @@ mps_free_high_priority_command(struct mps_softc *sc, s
 {
 	struct mps_chain *chain, *chain_temp;
 
-	KASSERT(cm->cm_state == MPS_CM_STATE_BUSY, ("state not busy\n"));
+	KASSERT(cm->cm_state == MPS_CM_STATE_BUSY,
+	    ("state not busy: %d\n", cm->cm_state));
 
 	if (cm->cm_reply != NULL)
 		mps_free_reply(sc, cm->cm_reply_data);
@@ -623,7 +626,7 @@ mps_alloc_high_priority_command(struct mps_softc *sc)
 		return (NULL);
 
 	KASSERT(cm->cm_state == MPS_CM_STATE_FREE,
-	    ("mps: Allocating busy command\n"));
+	    ("mps: Allocating high priority busy command: %d\n", cm->cm_state));
 
 	TAILQ_REMOVE(&sc->high_priority_req_list, cm, cm_link);
 	cm->cm_state = MPS_CM_STATE_BUSY;


More information about the svn-src-all mailing list