svn commit: r219111 - in stable/7/sys: dev/re pci

Pyun YongHyeon yongari at FreeBSD.org
Mon Feb 28 23:46:59 UTC 2011


Author: yongari
Date: Mon Feb 28 23:46:59 2011
New Revision: 219111
URL: http://svn.freebsd.org/changeset/base/219111

Log:
  MFC r217902:
    Do not use interrupt taskqueue on controllers with MSI/MSI-X
    capability. One of the reasons for using interrupt taskqueue in
    re(4) was to reduce the number of TX/RX interrupts under load,
    because re(4) controllers have no good TX/RX interrupt moderation
    mechanism. Basic TX interrupt moderation is done by hardware for
    most controllers, but RX interrupt moderation through an
    undocumented register showed poor RX performance, so it was
    disabled in r215025. Using taskqueue to handle RX interrupts
    greatly reduced the number of interrupts, but re(4) consumed all
    available CPU cycles to run the taskqueue under high TX/RX network
    load.  This can happen even with an RTL810x fast ethernet
    controller and I believe this is not acceptable for most systems.
  
    To mitigate the issue, use the one-shot timer register to moderate
    RX interrupts. The timer register provides a programmable one-shot
    timer and can be used to suppress interrupt generation. The timer
    runs at 125MHz on PCIe controllers, so the minimum time allowed for
    the timer is 8ns. The data sheet says the register is 32 bits wide,
    but experimentation shows only the lower 13 bits are valid, so the
    maximum time that can be programmed is 65.528us. This yields a
    theoretical maximum of about 15260 RX interrupts per second.
    Combined with TX completion interrupts, re(4) should generate fewer
    than 20k interrupts per second. This number is still slightly high
    compared to other intelligent ethernet controllers, but the system
    is very responsive even under high network load.
  
    Introduce sysctl variable dev.re.%d.int_rx_mod that controls the
    amount of time to delay RX interrupt processing, in units of us.
    A value of 0 completely disables RX interrupt moderation. To
    provide the old behavior for controllers that have MSI/MSI-X
    capability, introduce a new tunable hw.re.intr_filter. If the
    tunable is set to a non-zero value, the driver will use interrupt
    taskqueue. The default value of the tunable is 0. This tunable has
    no effect on controllers that have no MSI/MSI-X capability or if
    MSI/MSI-X is explicitly disabled by the administrator.
  
    While I'm here, clean up interrupt setup/teardown since re(4) uses
    a single MSI/MSI-X message at this moment.

Modified:
  stable/7/sys/dev/re/if_re.c
  stable/7/sys/pci/if_rlreg.h
Directory Properties:
  stable/7/sys/   (props changed)
  stable/7/sys/cddl/contrib/opensolaris/   (props changed)
  stable/7/sys/contrib/dev/acpica/   (props changed)
  stable/7/sys/contrib/pf/   (props changed)

Modified: stable/7/sys/dev/re/if_re.c
==============================================================================
--- stable/7/sys/dev/re/if_re.c	Mon Feb 28 23:41:27 2011	(r219110)
+++ stable/7/sys/dev/re/if_re.c	Mon Feb 28 23:46:59 2011	(r219111)
@@ -157,6 +157,8 @@ MODULE_DEPEND(re, miibus, 1, 1, 1);
 #include "miibus_if.h"
 
 /* Tunables. */
+static int intr_filter = 0;
+TUNABLE_INT("hw.re.intr_filter", &intr_filter);
 static int msi_disable = 0;
 TUNABLE_INT("hw.re.msi_disable", &msi_disable);
 static int msix_disable = 0;
@@ -253,6 +255,7 @@ static void re_poll		(struct ifnet *, en
 static void re_poll_locked	(struct ifnet *, enum poll_cmd, int);
 #endif
 static int re_intr		(void *);
+static void re_intr_msi		(void *);
 static void re_tick		(void *);
 static void re_int_task		(void *, int);
 static void re_start		(struct ifnet *);
@@ -290,6 +293,8 @@ static int re_diag		(struct rl_softc *);
 
 static void re_add_sysctls	(struct rl_softc *);
 static int re_sysctl_stats	(SYSCTL_HANDLER_ARGS);
+static int sysctl_int_range	(SYSCTL_HANDLER_ARGS, int, int);
+static int sysctl_hw_re_int_mod	(SYSCTL_HANDLER_ARGS);
 
 static device_method_t re_methods[] = {
 	/* Device interface */
@@ -1575,19 +1580,19 @@ re_attach(device_t dev)
 	}
 #endif
 
+#ifdef RE_TX_MODERATION
+	intr_filter = 1;
+#endif
 	/* Hook interrupt last to avoid having to lock softc */
-	if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
+	if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0 &&
+	    intr_filter == 0) {
+		error = bus_setup_intr(dev, sc->rl_irq[0],
+		    INTR_TYPE_NET | INTR_MPSAFE, NULL, re_intr_msi, sc,
+		    &sc->rl_intrhand[0]);
+	} else {
 		error = bus_setup_intr(dev, sc->rl_irq[0],
 		    INTR_TYPE_NET | INTR_MPSAFE, re_intr, NULL, sc,
 		    &sc->rl_intrhand[0]);
-	else {
-		for (i = 0; i < RL_MSI_MESSAGES; i++) {
-			error = bus_setup_intr(dev, sc->rl_irq[i],
-			    INTR_TYPE_NET | INTR_MPSAFE, re_intr, NULL, sc,
-		    	    &sc->rl_intrhand[i]);
-			if (error != 0)
-				break;
-		}
 	}
 	if (error) {
 		device_printf(dev, "couldn't set up irq\n");
@@ -1658,31 +1663,22 @@ re_detach(device_t dev)
 	 * stopped here.
 	 */
 
-	for (i = 0; i < RL_MSI_MESSAGES; i++) {
-		if (sc->rl_intrhand[i] != NULL) {
-			bus_teardown_intr(dev, sc->rl_irq[i],
-			    sc->rl_intrhand[i]);
-			sc->rl_intrhand[i] = NULL;
-		}
+	if (sc->rl_intrhand[0] != NULL) {
+		bus_teardown_intr(dev, sc->rl_irq[0], sc->rl_intrhand[0]);
+		sc->rl_intrhand[0] = NULL;
 	}
 	if (ifp != NULL)
 		if_free(ifp);
-	if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0) {
-		if (sc->rl_irq[0] != NULL) {
-			bus_release_resource(dev, SYS_RES_IRQ, 0,
-			    sc->rl_irq[0]);
-			sc->rl_irq[0] = NULL;
-		}
-	} else {
-		for (i = 0, rid = 1; i < RL_MSI_MESSAGES; i++, rid++) {
-			if (sc->rl_irq[i] != NULL) {
-				bus_release_resource(dev, SYS_RES_IRQ, rid,
-				    sc->rl_irq[i]);
-				sc->rl_irq[i] = NULL;
-			}
-		}
-		pci_release_msi(dev);
+	if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
+		rid = 0;
+	else
+		rid = 1;
+	if (sc->rl_irq[0] != NULL) {
+		bus_release_resource(dev, SYS_RES_IRQ, rid, sc->rl_irq[0]);
+		sc->rl_irq[0] = NULL;
 	}
+	if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0)
+		pci_release_msi(dev);
 	if (sc->rl_res_pba) {
 		rid = PCIR_BAR(4);
 		bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->rl_res_pba);
@@ -1971,6 +1967,7 @@ re_rx_list_init(struct rl_softc *sc)
 
 	sc->rl_ldata.rl_rx_prodidx = 0;
 	sc->rl_head = sc->rl_tail = NULL;
+	sc->rl_int_rx_act = 0;
 
 	return (0);
 }
@@ -1994,6 +1991,7 @@ re_jrx_list_init(struct rl_softc *sc)
 
 	sc->rl_ldata.rl_rx_prodidx = 0;
 	sc->rl_head = sc->rl_tail = NULL;
+	sc->rl_int_rx_act = 0;
 
 	return (0);
 }
@@ -2472,6 +2470,87 @@ re_int_task(void *arg, int npending)
 	CSR_WRITE_2(sc, RL_IMR, RL_INTRS_CPLUS);
 }
 
+static void
+re_intr_msi(void *xsc)
+{
+	struct rl_softc		*sc;
+	struct ifnet		*ifp;
+	uint16_t		intrs, status;
+
+	sc = xsc;
+	RL_LOCK(sc);
+
+	ifp = sc->rl_ifp;
+#ifdef DEVICE_POLLING
+	if (ifp->if_capenable & IFCAP_POLLING) {
+		RL_UNLOCK(sc);
+		return;
+	}
+#endif
+	/* Disable interrupts. */
+	CSR_WRITE_2(sc, RL_IMR, 0);
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+		RL_UNLOCK(sc);
+		return;
+	}
+
+	intrs = RL_INTRS_CPLUS;
+	status = CSR_READ_2(sc, RL_ISR);
+        CSR_WRITE_2(sc, RL_ISR, status);
+	if (sc->rl_int_rx_act > 0) {
+		intrs &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW |
+		    RL_ISR_RX_OVERRUN);
+		status &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW |
+		    RL_ISR_RX_OVERRUN);
+	}
+
+	if (status & (RL_ISR_TIMEOUT_EXPIRED | RL_ISR_RX_OK | RL_ISR_RX_ERR |
+	    RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN)) {
+		re_rxeof(sc);
+		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
+			if (sc->rl_int_rx_mod != 0 &&
+			    (status & (RL_ISR_RX_OK | RL_ISR_RX_ERR |
+			    RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN)) != 0) {
+				/* Rearm one-shot timer. */
+				CSR_WRITE_4(sc, RL_TIMERCNT, 1);
+				intrs &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR |
+				    RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN);
+				sc->rl_int_rx_act = 1;
+			} else {
+				intrs |= RL_ISR_RX_OK | RL_ISR_RX_ERR |
+				    RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN;
+				sc->rl_int_rx_act = 0;
+			}
+		}
+	}
+
+	/*
+	 * Some chips will ignore a second TX request issued
+	 * while an existing transmission is in progress. If
+	 * the transmitter goes idle but there are still
+	 * packets waiting to be sent, we need to restart the
+	 * channel here to flush them out. This only seems to
+	 * be required with the PCIe devices.
+	 */
+	if ((status & (RL_ISR_TX_OK | RL_ISR_TX_DESC_UNAVAIL)) &&
+	    (sc->rl_flags & RL_FLAG_PCIE))
+		CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START);
+	if (status & (RL_ISR_TX_OK | RL_ISR_TX_ERR | RL_ISR_TX_DESC_UNAVAIL))
+		re_txeof(sc);
+
+	if (status & RL_ISR_SYSTEM_ERR) {
+		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+		re_init_locked(sc);
+	}
+
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
+		if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+			re_start_locked(ifp);
+		CSR_WRITE_2(sc, RL_IMR, intrs);
+	}
+	RL_UNLOCK(sc);
+}
+
 static int
 re_encap(struct rl_softc *sc, struct mbuf **m_head)
 {
@@ -3001,18 +3080,35 @@ re_init_locked(struct rl_softc *sc)
 	CSR_WRITE_1(sc, RL_COMMAND, RL_CMD_TX_ENB|RL_CMD_RX_ENB);
 #endif
 
-#ifdef RE_TX_MODERATION
 	/*
 	 * Initialize the timer interrupt register so that
 	 * a timer interrupt will be generated once the timer
 	 * reaches a certain number of ticks. The timer is
-	 * reloaded on each transmit. This gives us TX interrupt
+	 * reloaded on each transmit.
+	 */
+#ifdef RE_TX_MODERATION
+	/*
+	 * Use timer interrupt register to moderate TX interrupt
 	 * moderation, which dramatically improves TX frame rate.
 	 */
 	if (sc->rl_type == RL_8169)
 		CSR_WRITE_4(sc, RL_TIMERINT_8169, 0x800);
 	else
 		CSR_WRITE_4(sc, RL_TIMERINT, 0x400);
+#else
+	/*
+	 * Use timer interrupt register to moderate RX interrupt
+	 * moderation.
+	 */
+	if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0 &&
+	    intr_filter == 0) {
+		if (sc->rl_type == RL_8169)
+			CSR_WRITE_4(sc, RL_TIMERINT_8169,
+			    RL_USECS(sc->rl_int_rx_mod));
+	} else {
+		if (sc->rl_type == RL_8169)
+			CSR_WRITE_4(sc, RL_TIMERINT_8169, RL_USECS(0));
+	}
 #endif
 
 	/*
@@ -3529,6 +3625,7 @@ re_add_sysctls(struct rl_softc *sc)
 {
 	struct sysctl_ctx_list	*ctx;
 	struct sysctl_oid_list	*children;
+	int			error;
 
 	ctx = device_get_sysctl_ctx(sc->rl_dev);
 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->rl_dev));
@@ -3536,6 +3633,26 @@ re_add_sysctls(struct rl_softc *sc)
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "stats",
 	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, re_sysctl_stats, "I",
 	    "Statistics Information");
+	if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
+		return;
+
+	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "int_rx_mod",
+	    CTLTYPE_INT | CTLFLAG_RW, &sc->rl_int_rx_mod, 0,
+	    sysctl_hw_re_int_mod, "I", "re RX interrupt moderation");
+	/* Pull in device tunables. */
+	sc->rl_int_rx_mod = RL_TIMER_DEFAULT;
+	error = resource_int_value(device_get_name(sc->rl_dev),
+	    device_get_unit(sc->rl_dev), "int_rx_mod", &sc->rl_int_rx_mod);
+	if (error == 0) {
+		if (sc->rl_int_rx_mod < RL_TIMER_MIN ||
+		    sc->rl_int_rx_mod > RL_TIMER_MAX) {
+			device_printf(sc->rl_dev, "int_rx_mod value out of "
+			    "range; using default: %d\n",
+			    RL_TIMER_DEFAULT);
+			sc->rl_int_rx_mod = RL_TIMER_DEFAULT;
+		}
+	}
+
 }
 
 static int
@@ -3613,3 +3730,29 @@ done:
 
 	return (error);
 }
+
+static int
+sysctl_int_range(SYSCTL_HANDLER_ARGS, int low, int high)
+{
+	int error, value;
+
+	if (arg1 == NULL)
+		return (EINVAL);
+	value = *(int *)arg1;
+	error = sysctl_handle_int(oidp, &value, 0, req);
+	if (error || req->newptr == NULL)
+		return (error);
+	if (value < low || value > high)
+		return (EINVAL);
+	*(int *)arg1 = value;
+
+	return (0);
+}
+
+static int
+sysctl_hw_re_int_mod(SYSCTL_HANDLER_ARGS)
+{
+
+	return (sysctl_int_range(oidp, arg1, arg2, req, RL_TIMER_MIN,
+	    RL_TIMER_MAX));
+}

Modified: stable/7/sys/pci/if_rlreg.h
==============================================================================
--- stable/7/sys/pci/if_rlreg.h	Mon Feb 28 23:41:27 2011	(r219110)
+++ stable/7/sys/pci/if_rlreg.h	Mon Feb 28 23:46:59 2011	(r219111)
@@ -497,6 +497,14 @@
 
 #define	RL_EARLYTXTHRESH_CNT	0x003F	/* byte count times 8 */
 
+/* Timer interrupt register */
+#define	RL_TIMERINT_8169_VAL	0x00001FFF
+#define	RL_TIMER_MIN		0
+#define	RL_TIMER_MAX		65	/* 65.528us */
+#define	RL_TIMER_DEFAULT	RL_TIMER_MAX
+#define	RL_TIMER_PCIE_CLK	125	/* 125MHZ */
+#define	RL_USECS(x)		((x) * RL_TIMER_PCIE_CLK)
+
 /*
  * Gigabit PHY access register (8169 only)
  */
@@ -896,6 +904,8 @@ struct rl_softc {
 	struct task		rl_inttask;
 
 	int			rl_txstart;
+	int			rl_int_rx_act;
+	int			rl_int_rx_mod;
 	uint32_t		rl_flags;
 #define	RL_FLAG_MSI		0x0001
 #define	RL_FLAG_AUTOPAD		0x0002


More information about the svn-src-stable mailing list