svn commit: r230572 - in head/sys/dev: ixgbe netmap

Luigi Rizzo <luigi at FreeBSD.org>
Thu Jan 26 09:55:16 UTC 2012


Author: luigi
Date: Thu Jan 26 09:55:16 2012
New Revision: 230572
URL: http://svn.freebsd.org/changeset/base/230572

Log:
  ixgbe changes:
  - remove the experimental code for disabling CRC stripping
  - use the correct constant for the conversion between interrupt rate
    and EITR values (the previous values were off by a factor of 2;
    see the worked example after this list)
  - make dev.ix.N.queueM.interrupt_rate a read/write (RW) sysctl variable.
    Changing an individual value affects that queue immediately,
    and propagates to all interfaces at the next reinit.
  - add a read-only dev.ix.N.queueM.irqs sysctl that exports the actual
    per-queue interrupt counts
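
  To make the conversion concrete, here is a small standalone sketch
  (illustrative helper names, not the driver's; it assumes, as the
  arithmetic in the diff implies, that EITR bits [11:3] hold the
  inter-interrupt interval in 2us units, hence the 500000 intr/s
  ceiling):

	#include <stdint.h>

	/* rate <-> EITR conversion, mirroring the driver's arithmetic */
	static uint32_t
	rate_to_eitr(uint32_t rate)	/* interrupts/sec -> register */
	{
		/* 4000000/rate == 8 * interval_2us, i.e. pre-shifted */
		return (4000000 / rate) & 0x0FF8;
	}

	static uint32_t
	eitr_to_rate(uint32_t reg)	/* register -> interrupts/sec */
	{
		uint32_t itr = (reg & 0x0FF8) >> 3; /* 2us units */
		return itr ? 500000 / itr : 0;
	}

	/* e.g. 8000 intr/s -> reg 0x1f0 (interval 62, i.e. 124us),
	 * which reads back as 500000 / 62 = ~8064 intr/s. */

  At runtime the same conversion can be driven through the new sysctl,
  e.g. "sysctl dev.ix.0.queue0.interrupt_rate=8000".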
  
  Netmap-related changes for ixgbe:
  - use the "new" format for TX descriptors in netmap mode.
  - pass interrupt mitigation delays to the user process doing poll()
    on a netmap file descriptor (a sketch of such a consumer follows
    this list).
    On the RX side this means we will not check the ring more than once
    per interrupt. This gives the process a chance to sleep and process
    packets in larger batches, thus reducing CPU usage.
    On the TX side we take this even further: completed transmissions are
    reclaimed every half ring even if the NIC interrupts more often.
    This saves even more CPU without adding any tx delay.
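
  As an illustration of the consumer side, a minimal netmap RX loop of
  the kind that benefits from this batching might look as follows (a
  sketch against the netmap API of this era; error handling omitted):

	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/poll.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <net/netmap.h>
	#include <net/netmap_user.h>
	#include <fcntl.h>
	#include <string.h>

	int
	main(void)
	{
		struct nmreq req;
		struct pollfd fds;
		int fd = open("/dev/netmap", O_RDWR);

		memset(&req, 0, sizeof(req));
		strlcpy(req.nr_name, "ix0", sizeof(req.nr_name));
		req.nr_version = NETMAP_API;
		ioctl(fd, NIOCREGIF, &req);	/* attach to ix0 */
		char *mem = mmap(NULL, req.nr_memsize,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
		struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);

		fds.fd = fd;
		fds.events = POLLIN;
		for (;;) {
			poll(&fds, 1, -1); /* sleep until packets arrive */
			while (ring->avail > 0) { /* drain the whole batch */
				struct netmap_slot *slot = &ring->slot[ring->cur];
				char *buf = NETMAP_BUF(ring, slot->buf_idx);
				/* ... process slot->len bytes at buf ... */
				(void)buf;
				ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
				ring->avail--;
			}
		}
	}

  For comparison, the mitigation can be turned off globally with the
  new dev.netmap.mitigate sysctl (see the netmap.c change below).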
  
  Generic Netmap-related changes:
  - align the netmap_kring to cache lines to avoid false sharing
    (possibly useful for multiqueue NICs with MSI-X interrupts, which
    are handled by different cores; see the sketch after this list).
    It's a minor improvement, but it costs nothing.
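
  A sketch of the pattern (illustrative structure, not the driver's):
  without the alignment, two adjacent per-ring structures can share a
  64-byte cache line, so cores handling different rings invalidate
  each other's line on every counter update.

	/* pad each per-ring state out to its own cache line */
	struct ring_state {
		unsigned int cur;
		unsigned int avail;
		/* ... */
	} __attribute__((__aligned__(64)));

	struct ring_state rings[8];	/* one per queue / MSI-X vector */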
  
  Reviewed by:	Jack Vogel
  Approved by:	Jack Vogel

Modified:
  head/sys/dev/ixgbe/ixgbe.c
  head/sys/dev/netmap/ixgbe_netmap.h
  head/sys/dev/netmap/netmap.c
  head/sys/dev/netmap/netmap_kern.h

Modified: head/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- head/sys/dev/ixgbe/ixgbe.c	Thu Jan 26 09:45:14 2012	(r230571)
+++ head/sys/dev/ixgbe/ixgbe.c	Thu Jan 26 09:55:16 2012	(r230572)
@@ -232,7 +232,7 @@ MODULE_DEPEND(ixgbe, ether, 1, 1, 1);
 static int ixgbe_enable_aim = TRUE;
 TUNABLE_INT("hw.ixgbe.enable_aim", &ixgbe_enable_aim);
 
-static int ixgbe_max_interrupt_rate = (8000000 / IXGBE_LOW_LATENCY);
+static int ixgbe_max_interrupt_rate = (4000000 / IXGBE_LOW_LATENCY);
 TUNABLE_INT("hw.ixgbe.max_interrupt_rate", &ixgbe_max_interrupt_rate);
 
 /* How many packets rxeof tries to clean at a time */
@@ -3385,22 +3385,41 @@ ixgbe_txeof(struct tx_ring *txr)
 #ifdef DEV_NETMAP
 	if (ifp->if_capenable & IFCAP_NETMAP) {
 		struct netmap_adapter *na = NA(ifp);
+		struct netmap_kring *kring = &na->tx_rings[txr->me];
 
+		tx_desc = (struct ixgbe_legacy_tx_desc *)txr->tx_base;
+
+		bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
+		    BUS_DMASYNC_POSTREAD);
 		/*
 		 * In netmap mode, all the work is done in the context
 		 * of the client thread. Interrupt handlers only wake up
 		 * clients, which may be sleeping on individual rings
 		 * or on a global resource for all rings.
+		 * To implement tx interrupt mitigation, we wake up the client
+		 * thread roughly every half ring, even if the NIC interrupts
+		 * more frequently. This is implemented as follows:
+		 * - ixgbe_txsync() sets kring->nr_kflags with the index of
+		 *   the slot that should wake up the thread (nkr_num_slots
+		 *   means the user thread should not be woken up);
+		 * - the driver ignores tx interrupts unless netmap_mitigate=0
+		 *   or the slot has the DD bit set.
+		 *
 		 * When the driver has separate locks, we need to
 		 * release and re-acquire txlock to avoid deadlocks.
 		 * XXX see if we can find a better way.
 		 */
-		selwakeuppri(&na->tx_rings[txr->me].si, PI_NET);
-		IXGBE_TX_UNLOCK(txr);
-		IXGBE_CORE_LOCK(adapter);
-		selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET);
-		IXGBE_CORE_UNLOCK(adapter);
-		IXGBE_TX_LOCK(txr);
+		if (!netmap_mitigate ||
+		    (kring->nr_kflags < kring->nkr_num_slots &&
+		     tx_desc[kring->nr_kflags].upper.fields.status & IXGBE_TXD_STAT_DD)) {
+			kring->nr_kflags = kring->nkr_num_slots;
+			selwakeuppri(&na->tx_rings[txr->me].si, PI_NET);
+			IXGBE_TX_UNLOCK(txr);
+			IXGBE_CORE_LOCK(adapter);
+			selwakeuppri(&na->tx_rings[na->num_queues + 1].si, PI_NET);
+			IXGBE_CORE_UNLOCK(adapter);
+			IXGBE_TX_LOCK(txr);
+		}
 		return FALSE;
 	}
 #endif /* DEV_NETMAP */
@@ -3928,21 +3947,6 @@ skip_head:
 		lro->ifp = adapter->ifp;
 	}
 
-#ifdef DEV_NETMAP1	/* XXX experimental CRC strip */
-	{
-		struct  ixgbe_hw	*hw = &adapter->hw;
-		u32			rdrxctl;
-
-		rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
-		rdrxctl &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
-		if (slot)
-			rdrxctl &= ~IXGBE_RDRXCTL_CRCSTRIP;
-		else
-			rdrxctl |= IXGBE_RDRXCTL_CRCSTRIP;
-		rdrxctl |= IXGBE_RDRXCTL_RSCACKC;
-		IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
-	}
-#endif /* DEV_NETMAP1 */
 	IXGBE_RX_UNLOCK(rxr);
 	return (0);
 
@@ -4022,12 +4026,6 @@ ixgbe_initialize_receive_units(struct ad
 		hlreg |= IXGBE_HLREG0_JUMBOEN;
 	else
 		hlreg &= ~IXGBE_HLREG0_JUMBOEN;
-#ifdef DEV_NETMAP1	/* XXX experimental CRCSTRIP */
-        if (ifp->if_capenable & IFCAP_NETMAP)
-		hlreg &= ~IXGBE_HLREG0_RXCRCSTRP;
-	else
-		hlreg |= IXGBE_HLREG0_RXCRCSTRP;
-#endif /* DEV_NETMAP1 */
 	IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg);
 
 	bufsz = (adapter->rx_mbuf_sz + BSIZEPKT_ROUNDUP) >> IXGBE_SRRCTL_BSIZEPKT_SHIFT;
@@ -4297,11 +4295,14 @@ ixgbe_rxeof(struct ix_queue *que, int co
 #ifdef DEV_NETMAP
 	if (ifp->if_capenable & IFCAP_NETMAP) {
 		/*
-		 * Same as the txeof routine, only wakeup clients
-		 * and make sure there are no deadlocks.
+		 * Same as the txeof routine: only wakeup clients on intr.
+		 * NKR_PENDINTR in nr_kflags is used to implement interrupt
+		 * mitigation (ixgbe_rxsync() will not look for new packets
+		 * unless NKR_PENDINTR is set).
 		 */
 		struct netmap_adapter *na = NA(ifp);
 
+		na->rx_rings[rxr->me].nr_kflags |= NKR_PENDINTR;
 		selwakeuppri(&na->rx_rings[rxr->me].si, PI_NET);
 		IXGBE_RX_UNLOCK(rxr);
 		IXGBE_CORE_LOCK(adapter);
@@ -4830,7 +4831,7 @@ ixgbe_configure_ivars(struct adapter *ad
 	u32 newitr;
 
 	if (ixgbe_max_interrupt_rate > 0)
-		newitr = (8000000 / ixgbe_max_interrupt_rate) & 0x0FF8;
+		newitr = (4000000 / ixgbe_max_interrupt_rate) & 0x0FF8;
 	else
 		newitr = 0;
 
@@ -5193,12 +5194,21 @@ ixgbe_sysctl_interrupt_rate_handler(SYSC
 	reg = IXGBE_READ_REG(&que->adapter->hw, IXGBE_EITR(que->msix));
 	usec = ((reg & 0x0FF8) >> 3);
 	if (usec > 0)
-		rate = 1000000 / usec;
+		rate = 500000 / usec;
 	else
 		rate = 0;
 	error = sysctl_handle_int(oidp, &rate, 0, req);
 	if (error || !req->newptr)
 		return error;
+	reg &= ~0xfff; /* default, no limitation */
+	ixgbe_max_interrupt_rate = 0;
+	if (rate > 0 && rate < 500000) {
+		if (rate < 1000)
+			rate = 1000;
+		ixgbe_max_interrupt_rate = rate;
+		reg |= ((4000000/rate) & 0xff8 );
+	}
+	IXGBE_WRITE_REG(&que->adapter->hw, IXGBE_EITR(que->msix), reg);
 	return 0;
 }
 
@@ -5252,10 +5262,13 @@ ixgbe_add_hw_stats(struct adapter *adapt
 		queue_list = SYSCTL_CHILDREN(queue_node);
 
 		SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "interrupt_rate",
-				CTLTYPE_UINT | CTLFLAG_RD, &adapter->queues[i],
+				CTLTYPE_UINT | CTLFLAG_RW, &adapter->queues[i],
 				sizeof(&adapter->queues[i]),
 				ixgbe_sysctl_interrupt_rate_handler, "IU",
 				"Interrupt Rate");
+		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "irqs",
+				CTLFLAG_RD, &(adapter->queues[i].irqs),
+				"irqs on this queue");
 		SYSCTL_ADD_PROC(ctx, queue_list, OID_AUTO, "txd_head", 
 				CTLTYPE_UINT | CTLFLAG_RD, txr, sizeof(txr),
 				ixgbe_sysctl_tdh_handler, "IU",

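The two sysctls added above are easy to exercise from userland; a
sketch using sysctlbyname(3), assuming the dev.ix.N.queueM node names
from the log:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t irqs;
		size_t len = sizeof(irqs);
		int rate = 8000;	/* interrupts/sec */

		/* read the per-queue interrupt counter */
		if (sysctlbyname("dev.ix.0.queue0.irqs", &irqs, &len,
		    NULL, 0) == 0)
			printf("queue0 irqs: %ju\n", (uintmax_t)irqs);
		/* retune the rate; it takes effect on the queue at once */
		sysctlbyname("dev.ix.0.queue0.interrupt_rate", NULL, NULL,
		    &rate, sizeof(rate));
		return (0);
	}
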
Modified: head/sys/dev/netmap/ixgbe_netmap.h
==============================================================================
--- head/sys/dev/netmap/ixgbe_netmap.h	Thu Jan 26 09:45:14 2012	(r230571)
+++ head/sys/dev/netmap/ixgbe_netmap.h	Thu Jan 26 09:55:16 2012	(r230572)
@@ -191,6 +191,10 @@ fail:
  * (this is also true for every use of ring in the kernel).
  *
  * ring->avail is never used, only checked for bogus values.
+ *
+ * do_lock is set iff the function is called from the ioctl handler.
+ * In this case, grab a lock around the body, and also reclaim transmitted
+ * buffers irrespective of interrupt mitigation.
  */
 static int
 ixgbe_netmap_txsync(void *a, u_int ring_nr, int do_lock)
@@ -292,10 +296,11 @@ ring_reset:
 			 * need this.
 			 */
 			curr->read.buffer_addr = htole64(paddr);
-			curr->read.olinfo_status = 0;
+			curr->read.olinfo_status = htole32(len << IXGBE_ADVTXD_PAYLEN_SHIFT);
 			curr->read.cmd_type_len =
 			    htole32(txr->txd_cmd | len |
 				(IXGBE_ADVTXD_DTYP_DATA |
+				    IXGBE_ADVTXD_DCMD_DEXT |
 				    IXGBE_ADVTXD_DCMD_IFCS |
 				    IXGBE_TXD_CMD_EOP | flags) );
 			/* If the buffer has changed, unload and reload map
@@ -328,15 +333,41 @@ ring_reset:
 	}
 
 	/*
-	 * If no packets are sent, or there is no room in the tx ring,
-	 * Check whether there are completed transmissions.
-	 * Because this is expensive (we need a register etc.)
-	 * we only do it if absolutely necessary, i.e. there is no room
-	 * in the tx ring, or where were no completed transmissions
-	 * (meaning that probably the caller really wanted to check
-	 * for completed transmissions).
+	 * Reclaim buffers for completed transmissions.
+	 * Because this is expensive (we read a NIC register etc.)
+	 * we only do it in specific cases (see below).
+	 * In all cases kring->nr_kflags indicates which slot will be
+	 * checked upon a tx interrupt (nkr_num_slots means none).
 	 */
-	if (n == 0 || kring->nr_hwavail < 1) {
+	if (do_lock) {
+		j = 1; /* forced reclaim, ignore interrupts */
+		kring->nr_kflags = kring->nkr_num_slots;
+	} else if (kring->nr_hwavail > 0) {
+		j = 0; /* buffers still available: no reclaim, ignore intr. */
+		kring->nr_kflags = kring->nkr_num_slots;
+	} else {
+		/*
+		 * no buffers available, locate a slot for which we request
+		 * ReportStatus (approximately half ring after next_to_clean)
+		 * and record it in kring->nr_kflags.
+		 * If the slot has DD set, do the reclaim looking at TDH,
+		 * otherwise we go to sleep (in netmap_poll()) and will be
+		 * woken up when slot nr_kflags will be ready.
+		 */
+		struct ixgbe_legacy_tx_desc *txd = (struct ixgbe_legacy_tx_desc *)txr->tx_base;
+
+		j = txr->next_to_clean + kring->nkr_num_slots/2;
+		if (j >= kring->nkr_num_slots)
+			j -= kring->nkr_num_slots;
+		// round to the closest with dd set
+		j= (j < kring->nkr_num_slots / 4 || j >= kring->nkr_num_slots*3/4) ?
+			0 : report_frequency;
+		kring->nr_kflags = j; /* the slot to check */
+		j = txd[j].upper.fields.status & IXGBE_TXD_STAT_DD;
+	}
+	if (!j) {
+		netmap_skip_txsync++;
+	} else {
 		int delta;
 
 		/*
@@ -391,6 +422,8 @@ ring_reset:
  * We must subtract the newly consumed slots (cur - nr_hwcur)
  * from nr_hwavail, make the descriptors available for the next reads,
  * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail.
+ *
+ * do_lock has a special meaning: please refer to txsync.
  */
 static int
 ixgbe_netmap_rxsync(void *a, u_int ring_nr, int do_lock)
@@ -401,6 +434,7 @@ ixgbe_netmap_rxsync(void *a, u_int ring_
 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	int j, k, l, n, lim = kring->nkr_num_slots - 1;
+	int force_update = do_lock || kring->nr_kflags & NKR_PENDINTR;
 
 	k = ring->cur;	/* cache and check value, same as in txsync */
 	n = k - kring->nr_hwcur;
@@ -437,6 +471,7 @@ ixgbe_netmap_rxsync(void *a, u_int ring_
 	if (j > lim)
 		j -= lim + 1;
 
+    if (force_update) {
 	for (n = 0; ; n++) {
 		union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l];
 		uint32_t staterr = le32toh(curr->wb.upper.status_error);
@@ -453,6 +488,8 @@ ixgbe_netmap_rxsync(void *a, u_int ring_
 		rxr->next_to_check = l;
 		kring->nr_hwavail += n;
 	}
+	kring->nr_kflags &= ~NKR_PENDINTR;
+    }
 
 	/*
 	 * Skip past packets that userspace has already processed

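The slot-selection logic in the txsync hunk above is compact, so here
is a standalone restatement (illustrative name; it assumes, as in the
driver's txsync loop, that ReportStatus, and hence DD, is only
requested on slots 0 and nkr_num_slots/2):

	/* choose the slot whose DD bit will wake the thread */
	static unsigned int
	pick_report_slot(unsigned int next_to_clean, unsigned int num_slots)
	{
		unsigned int j = next_to_clean + num_slots / 2;

		if (j >= num_slots)
			j -= num_slots;
		/* round to the closer of the two slots that carry DD */
		return (j < num_slots / 4 || j >= num_slots * 3 / 4) ?
		    0 : num_slots / 2;
	}

	/* e.g. num_slots = 1024, next_to_clean = 100: j = 612, which
	 * falls in [256, 768), so we wait for DD on slot 512. */
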
Modified: head/sys/dev/netmap/netmap.c
==============================================================================
--- head/sys/dev/netmap/netmap.c	Thu Jan 26 09:45:14 2012	(r230571)
+++ head/sys/dev/netmap/netmap.c	Thu Jan 26 09:55:16 2012	(r230572)
@@ -146,6 +146,12 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, total_
     CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers");
 SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers,
     CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers");
+int netmap_mitigate = 1;
+SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
+int netmap_skip_txsync;
+SYSCTL_INT(_dev_netmap, OID_AUTO, skip_txsync, CTLFLAG_RW, &netmap_skip_txsync, 0, "");
+int netmap_skip_rxsync;
+SYSCTL_INT(_dev_netmap, OID_AUTO, skip_rxsync, CTLFLAG_RW, &netmap_skip_rxsync, 0, "");
 
 /*
  * Allocate n buffers from the ring, and fill the slot.

Modified: head/sys/dev/netmap/netmap_kern.h
==============================================================================
--- head/sys/dev/netmap/netmap_kern.h	Thu Jan 26 09:45:14 2012	(r230571)
+++ head/sys/dev/netmap/netmap_kern.h	Thu Jan 26 09:55:16 2012	(r230572)
@@ -65,13 +65,14 @@ struct netmap_kring {
 	struct netmap_ring *ring;
 	u_int nr_hwcur;
 	int nr_hwavail;
-	u_int nr_kflags;
+	u_int nr_kflags;	/* private driver flags */
+#define NKR_PENDINTR   0x1     // Pending interrupt.
 	u_int nkr_num_slots;
 
 	int	nkr_hwofs;	/* offset between NIC and netmap ring */
 	struct netmap_adapter *na;	 // debugging
 	struct selinfo si; /* poll/select wait queue */
-};
+} __attribute__((__aligned__(64)));
 
 /*
  * This struct is part of and extends the 'struct adapter' (or
@@ -171,6 +172,8 @@ struct netmap_slot *netmap_reset(struct 
 	enum txrx tx, int n, u_int new_cur);
 int netmap_ring_reinit(struct netmap_kring *);
 
+extern int netmap_mitigate;
+extern int netmap_skip_txsync, netmap_skip_rxsync;
 extern u_int netmap_total_buffers;
 extern char *netmap_buffer_base;
 extern int netmap_verbose;	// XXX debugging

