ahci.ko / geom_mirror / zfs hangs up system when one of the HDDs faults.

Alexander Motin mav at FreeBSD.org
Tue Jul 26 20:17:35 UTC 2011


Lev Serebryakov wrote:
> You wrote 24 июля 2011 г., 2:29:12:
>>>   I'm not sure that it is possible to update the firmware on these
>>> drives. And the MoBo BIOS looks like the latest one.
>> Then I have no idea what to do about the cause of the errors. As for the
>> consequences, I've tried to simulate a similar problem (device detected, but
>> doesn't respond). Recovery (dropping the failed device) took a lot of time,
>> but finally (after about 10 minutes) it succeeded and ZFS continued
>> operation without that drive. After that I've just committed one patch
>> to the HEAD and sent another one to freebsd-scsi@ for review. That, I
>> hope, should significantly (down to 1-2 minutes) speed up that process.
> 
>> How long have you waited before and after making that screenshot?
>   About an hour and a half -- the server stopped responding to
>  HTTP/SSH/SMTP/POP3 (but still responded to pings and traceroute). I
>  requested access to the remote console, tech support provided such access,
>  and this whole process took more than an hour.

Not sure whether it is related to your case, but the attached patch fixes a
timeout handling problem I found while testing a Marvell 88SE912x controller.
In my test scenario, without this patch some commands could get stuck inside
the controller indefinitely.

-- 
Alexander Motin
-------------- next part --------------
Index: dev/ahci/ahci.c
===================================================================
--- dev/ahci/ahci.c	(revision 224305)
+++ dev/ahci/ahci.c	(working copy)
@@ -1879,12 +1879,13 @@
 			device_printf(dev, "Poll timeout on slot %d port %d\n",
 			    slot->slot, port);
 			device_printf(dev, "is %08x cs %08x ss %08x "
-			    "rs %08x tfd %02x serr %08x\n",
+			    "rs %08x tfd %02x serr %08x cmd %08x\n",
 			    ATA_INL(ch->r_mem, AHCI_P_IS),
 			    ATA_INL(ch->r_mem, AHCI_P_CI),
 			    ATA_INL(ch->r_mem, AHCI_P_SACT), ch->rslots,
 			    ATA_INL(ch->r_mem, AHCI_P_TFD),
-			    ATA_INL(ch->r_mem, AHCI_P_SERR));
+			    ATA_INL(ch->r_mem, AHCI_P_SERR),
+			    ATA_INL(ch->r_mem, AHCI_P_CMD));
 			et = AHCI_ERR_TIMEOUT;
 		}
 
@@ -1960,8 +1961,12 @@
 		ccs = (ATA_INL(ch->r_mem, AHCI_P_CMD) & AHCI_P_CMD_CCS_MASK)
 		    >> AHCI_P_CMD_CCS_SHIFT;
 		if ((sstatus & (1 << slot->slot)) != 0 || ccs == slot->slot ||
-		    ch->fbs_enabled)
+		    ch->fbs_enabled || ch->wrongccs)
 			slot->state = AHCI_SLOT_EXECUTING;
+		else if ((ch->rslots & (1 << ccs)) == 0) {
+			ch->wrongccs = 1;
+			slot->state = AHCI_SLOT_EXECUTING;
+		}
 
 		callout_reset(&slot->timeout,
 		    (int)slot->ccb->ccb_h.timeout * hz / 2000,
@@ -1971,10 +1976,12 @@
 
 	device_printf(dev, "Timeout on slot %d port %d\n",
 	    slot->slot, slot->ccb->ccb_h.target_id & 0x0f);
-	device_printf(dev, "is %08x cs %08x ss %08x rs %08x tfd %02x serr %08x\n",
+	device_printf(dev, "is %08x cs %08x ss %08x rs %08x tfd %02x "
+	    "serr %08x cmd %08x\n",
 	    ATA_INL(ch->r_mem, AHCI_P_IS), ATA_INL(ch->r_mem, AHCI_P_CI),
 	    ATA_INL(ch->r_mem, AHCI_P_SACT), ch->rslots,
-	    ATA_INL(ch->r_mem, AHCI_P_TFD), ATA_INL(ch->r_mem, AHCI_P_SERR));
+	    ATA_INL(ch->r_mem, AHCI_P_TFD), ATA_INL(ch->r_mem, AHCI_P_SERR),
+	    ATA_INL(ch->r_mem, AHCI_P_CMD));
 
 	/* Handle frozen command. */
 	if (ch->frozen) {
@@ -1987,7 +1994,7 @@
 		}
 		xpt_done(fccb);
 	}
-	if (!ch->fbs_enabled) {
+	if (!ch->fbs_enabled && !ch->wrongccs) {
 		/* Without FBS we know real timeout source. */
 		ch->fatalerr = 1;
 		/* Handle command with timeout. */
@@ -2585,6 +2592,7 @@
 		xpt_release_simq(ch->sim, TRUE);
 	ch->eslots = 0;
 	ch->toslots = 0;
+	ch->wrongccs = 0;
 	ch->fatalerr = 0;
 	/* Tell the XPT about the event */
 	xpt_async(AC_BUS_RESET, ch->path, NULL);
Index: dev/ahci/ahci.h
===================================================================
--- dev/ahci/ahci.h	(revision 224305)
+++ dev/ahci/ahci.h	(working copy)
@@ -426,6 +426,7 @@
 	int			resetting;	/* Hard-reset in progress. */
 	int			resetpolldiv;	/* Hard-reset poll divider. */
 	int			listening;	/* SUD bit is cleared. */
+	int			wrongccs;	/* CCS field in CMD was wrong */
 	union ccb		*frozen;	/* Frozen command */
 	struct callout		pm_timer;	/* Power management events */
 	struct callout		reset_timer;	/* Hard-reset timeout */


More information about the freebsd-hardware mailing list