Any news about "msk0 watchdog timeout" regression in 10-RELEASE?

Curtis Villamizar curtis at ipv6.occnc.com
Tue Jan 21 19:56:21 UTC 2014


In message <52DE4A69.9090304 at gmail.com>
Vitaly Magerya writes:
 
> Hi, folks. I've just upgraded to 10.0-RELEASE, and my msk(4) card
> stopped working; it would work for a few minutes, and then it will
> start printing "msk0 watchdog timeout" messages with an interrupt
> storm accompanying it. I think this problem was described earlier
> this month in [1].
>  
> My question is: was there a workaround found, or should I just
> downgrade back to 9.2?
>  
> [1] https://lists.freebsd.org/pipermail/freebsd-stable/2014-January/076676.html


I have mine working but I haven't done a lot of reboots to see if it
is a "fix" or luck.

There is a lot of junk that you won't need in the code that is running
well for me.  But here it is, as-is, warts and all.

I've been swamped lately and haven't had time to look at this further.

Curtis


Notes-

  1.  The change in the watchdog code has no effect (it is not hit when
      things are working, and does not fix things when they are not).

  2.  There are lots of debugging printf calls in there that you can
      delete if you like.  If things work you don't hit this code.

  3.  Changes to the interrupt handler also seem to do nothing (good or
      bad) if things are working.

  4.  Why this is working for me is, at this point, a mystery, but
      whether it works for you gives us another data point.


Index: if_msk.c
===================================================================
--- if_msk.c	(revision 260441)
+++ if_msk.c	(working copy)
@@ -2161,6 +2161,10 @@
 	count = imin(4096, roundup2(count, 1024));
 	sc->msk_stat_count = count;
 	stat_sz = count * sizeof(struct msk_stat_desc);
+#if 1
+	device_printf(sc->msk_dev,
+		      "msk_status_dma_alloc: %d %lu\n", count, stat_sz);
+#endif
 	error = bus_dma_tag_create(
 		    bus_get_dma_tag(sc->msk_dev),	/* parent */
 		    MSK_STAT_ALIGN, 0,		/* alignment, boundary */
@@ -2975,6 +2979,14 @@
 	}
 }
 
+#if 1
+static uint32_t msk_last_status = 0;
+static uint16_t last_stat_put_idx = 0;
+static uint32_t last_msk_control = 0;
+static uint16_t last_good_stat_put_idx = 0;
+static uint32_t last_good_msk_control = 0;
+#endif
+
 static void
 msk_watchdog(struct msk_if_softc *sc_if)
 {
@@ -2995,7 +3007,70 @@
 		return;
 	}
 
-	if_printf(ifp, "watchdog timeout\n");
+#if 1
+	if_printf(ifp,
+"watchdog timeout: 0x%08x\n  (0x%04x 0x%08x) (0x%04x 0x%08x) 0x%08x 0x%08x\n",
+		  msk_last_status,
+		  sc_if->msk_softc->msk_stat_cons, last_msk_control,
+		  last_good_stat_put_idx, last_good_msk_control,
+		  last_stat_put_idx, sc_if->msk_softc->msk_stat_count);
+	{
+	  struct msk_softc *sc = sc_if->msk_softc;
+	  uint16_t cons, count;
+	  struct msk_stat_desc *sd;
+	  uint32_t control;
+#if 0
+	  char linebuf[8192];
+	  char *pt = linebuf;
+	  size_t bytes = 8192;
+	  size_t used;
+
+	  count = sc->msk_stat_count;
+	  for (cons = 0; cons < count; ++cons) {
+	    if ((cons > 0) && ((cons & 0xff) == 0)) {
+	      if_printf(ifp, "%s\n", linebuf);
+	      pt = linebuf;
+	      bytes = sizeof(linebuf);
+	    }
+	    if ((cons & 7) == 0) {
+	      snprintf(pt, bytes - 1, "\n%03x ", cons);
+	      used = strlen(pt); pt += used; bytes -= used;
+	    } else if ((cons & 3) == 0) {
+	      snprintf(pt, bytes - 1, " ");
+	      used = strlen(pt); pt += used; bytes -= used;
+	    }
+	    sd = &sc->msk_stat_ring[cons];
+	    control = le32toh(sd->msk_control);
+	    snprintf(pt, bytes - 1, " %08x", control);
+	    used = strlen(pt); pt += used; bytes -= used;
+	  }
+	  if_printf(ifp, "%s\n\n", linebuf);
+#endif
+	  /* bump the count if we got stuck on HW_OWNER */
+	  if (((msk_last_status & Y2_IS_STAT_BMU) != 0)
+	      && (sc->msk_stat_cons != last_stat_put_idx)
+	      && ((last_msk_control & HW_OWNER) == 0)) {
+	    /* Sync status LEs. */
+	    bus_dmamap_sync(sc->msk_stat_tag, sc->msk_stat_map,
+			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
+	    cons = sc->msk_stat_cons;
+	    count = sc->msk_stat_count;
+	    do {
+	      MSK_INC(cons, count);
+	      sd = &sc->msk_stat_ring[cons];
+	      control = le32toh(sd->msk_control);
+	    } while ((cons != sc->msk_stat_cons)
+		     && ((control & HW_OWNER) == 0));
+	    if (cons != sc->msk_stat_cons) {
+	      if_printf(ifp, "msk_stat_cons changed 0x%04x -> 0x%04x\n",
+			sc->msk_stat_cons, cons);
+	      sc->msk_stat_cons = cons;
+	    }
+	  }
+	}
+#else
+	if_printf(ifp, "watchdog timeout: status\n");
+#endif
 	ifp->if_oerrors++;
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	msk_init_locked(sc_if);
@@ -3599,8 +3674,12 @@
 	int rxput[2];
 	struct msk_stat_desc *sd;
 	uint32_t control, status;
-	int cons, len, port, rxprog;
+	int len, port, rxprog;
+	uint16_t cons;
 
+#if 1
+	last_stat_put_idx = CSR_READ_2(sc, STAT_PUT_IDX);
+#endif
 	if (sc->msk_stat_cons == CSR_READ_2(sc, STAT_PUT_IDX))
 		return (0);
 
@@ -3614,8 +3693,15 @@
 	for (;;) {
 		sd = &sc->msk_stat_ring[cons];
 		control = le32toh(sd->msk_control);
+#if 1
+		last_msk_control = control;
+#endif
 		if ((control & HW_OWNER) == 0)
 			break;
+#if 1
+		last_good_stat_put_idx = cons;
+		last_good_msk_control = control;
+#endif
 		control &= ~HW_OWNER;
 		sd->msk_control = htole32(control);
 		status = le32toh(sd->msk_status);
@@ -3689,6 +3775,11 @@
 	if (rxput[MSK_PORT_B] > 0)
 		msk_rxput(sc->msk_if[MSK_PORT_B]);
 
+#if 1
+	last_stat_put_idx = CSR_READ_2(sc, STAT_PUT_IDX);
+#endif
+	if ((control & HW_OWNER) == 0)
+		return 1;
 	return (sc->msk_stat_cons != CSR_READ_2(sc, STAT_PUT_IDX));
 }
 
@@ -3742,8 +3833,11 @@
 		CSR_WRITE_4(sc, B0_IMSK, sc->msk_intrmask);
 		CSR_READ_4(sc, B0_IMSK);
 	}
-	if ((status & Y2_IS_HW_ERR) != 0)
+	if ((status & Y2_IS_HW_ERR) != 0) {
 		msk_intr_hwerr(sc);
+		device_printf(sc->msk_dev,
+			      "Y2_IS_HW_ERR is set: status 0x%x\n", status);
+	}
 
 	domore = msk_handle_events(sc);
 	if ((status & Y2_IS_STAT_BMU) != 0 && domore == 0)
@@ -3762,6 +3856,17 @@
 	    !IFQ_DRV_IS_EMPTY(&ifp1->if_snd))
 		msk_start_locked(ifp1);
 
+#if 1
+#define Y2_IS_OTHER_INTR \
+	(Y2_IS_ASF | Y2_IS_POLL_CHK | Y2_IS_IRQ_SW | Y2_IS_TIMINT | \
+	 Y2_IS_CHK_TXS2 | Y2_IS_PSM_ACK | Y2_IS_PTP_TIST | Y2_IS_CHK_TXS1)
+	if ((status & (Y2_IS_OTHER_INTR)) != 0) {
+	    device_printf(sc->msk_dev, "unknown interupt bits 0x%x\n",
+			  status & (Y2_IS_OTHER_INTR));
+	}
+	msk_last_status = status;
+#endif
+
 	MSK_UNLOCK(sc);
 }
 
Index: if_mskreg.h
===================================================================
--- if_mskreg.h	(revision 260441)
+++ if_mskreg.h	(working copy)
@@ -156,7 +156,7 @@
 #define DEVICEID_DLINK_DGE560SX	0x4002
 #define DEVICEID_DLINK_DGE560T	0x4b00
 
-#define BIT_31		(1 << 31)
+#define BIT_31		(1U << 31)
 #define BIT_30		(1 << 30)
 #define BIT_29		(1 << 29)
 #define BIT_28		(1 << 28)
@@ -2329,8 +2329,13 @@
  */
 #if (BUS_SPACE_MAXADDR > 0xFFFFFFFF)
 #define	MSK_64BIT_DMA
+#if 1
+#define MSK_TX_RING_CNT		256
+#define MSK_RX_RING_CNT		256
+#else
 #define MSK_TX_RING_CNT		384
 #define MSK_RX_RING_CNT		512
+#endif
 #else
 #undef	MSK_64BIT_DMA
 #define MSK_TX_RING_CNT		256
@@ -2539,8 +2544,8 @@
 	bus_addr_t		msk_stat_ring_paddr;
 	int			msk_int_holdoff;
 	int			msk_process_limit;
-	int			msk_stat_cons;
-	int			msk_stat_count;
+	uint16_t		msk_stat_cons;
+	uint16_t		msk_stat_count;
 	struct mtx		msk_mtx;
 };
 


More information about the freebsd-stable mailing list