svn commit: r268028 - head/sys/dev/e1000

Adrian Chadd adrian at FreeBSD.org
Mon Jun 30 04:35:00 UTC 2014


Author: adrian
Date: Mon Jun 30 04:34:59 2014
New Revision: 268028
URL: http://svnweb.freebsd.org/changeset/base/268028

Log:
  Add initial RSS awareness to the igb(4) driver.
  
  The igb(4) hardware is capable of RSS hashing RX packets and doing RSS
  queue selection for up to 8 queues.  (I believe some hardware is limited
  to 4 queues, but I haven't tested on that.)
  
  However, even if multi-queue is enabled for igb(4), the RX path doesn't use
  the RSS flowid from the received descriptor.  It just uses the MSIX queue id.
  
  This patch does a handful of things if RSS is enabled:
  
  * Instead of using a random key at boot, fetch the RSS key from the RSS code
    and program that in to the RSS redirection table.
  
    That whole chunk of code should be double checked for endian correctness.
  
  * Use the RSS queue mapping to CPU ID to figure out where to thread pin
    the RX swi thread and the taskqueue threads for each queue.
  
  * The software queue is now really an "RSS bucket".
  
  * When programming the RSS indirection table, use the RSS code to
    figure out which RSS bucket each slot in the indirection table maps
    to.
  
  * When transmitting, use the flowid RSS mapping if the mbuf has
    an RSS aware hash.  The existing method wasn't guaranteed to align
    correctly with the destination RSS bucket (and thus CPU ID.)
  
  This code warns if the number of RSS buckets isn't the same as the
  automatically configured number of hardware queues.  The administrator
  will have to tweak one of them for better performance.
  
  There's currently no way to re-balance the RSS indirection table after
  startup.  I'll worry about that later.
  
  Additionally, it may be worthwhile to always use the full 32 bit flowid if
  multi-queue is enabled.  It'll make things like lagg(4) behave better with
  respect to traffic distribution.

Modified:
  head/sys/dev/e1000/if_igb.c

Modified: head/sys/dev/e1000/if_igb.c
==============================================================================
--- head/sys/dev/e1000/if_igb.c	Mon Jun 30 04:26:29 2014	(r268027)
+++ head/sys/dev/e1000/if_igb.c	Mon Jun 30 04:34:59 2014	(r268028)
@@ -35,6 +35,7 @@
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_rss.h"
 
 #ifdef HAVE_KERNEL_OPTION_HEADERS
 #include "opt_device_polling.h"
@@ -84,6 +85,9 @@
 #include <netinet/tcp.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/udp.h>
+#ifdef	RSS
+#include <netinet/in_rss.h>
+#endif
 
 #include <machine/in_cksum.h>
 #include <dev/led/led.h>
@@ -967,12 +971,33 @@ igb_mq_start(struct ifnet *ifp, struct m
 	struct igb_queue	*que;
 	struct tx_ring		*txr;
 	int 			i, err = 0;
+#ifdef	RSS
+	uint32_t		bucket_id;
+#endif
 
 	/* Which queue to use */
-	if ((m->m_flags & M_FLOWID) != 0)
-		i = m->m_pkthdr.flowid % adapter->num_queues;
-	else
+	/*
+	 * When doing RSS, map it to the same outbound queue
+	 * as the incoming flow would be mapped to.
+	 *
+	 * If everything is setup correctly, it should be the
+	 * same bucket that the current CPU we're on is.
+	 */
+	if ((m->m_flags & M_FLOWID) != 0) {
+#ifdef	RSS
+		if (rss_hash2bucket(m->m_pkthdr.flowid,
+		    M_HASHTYPE_GET(m), &bucket_id) == 0) {
+			/* XXX TODO: spit out something if bucket_id > num_queues? */
+			i = bucket_id % adapter->num_queues;
+		} else {
+#endif
+			i = m->m_pkthdr.flowid % adapter->num_queues;
+#ifdef	RSS
+		}
+#endif
+	} else {
 		i = curcpu % adapter->num_queues;
+	}
 	txr = &adapter->tx_rings[i];
 	que = &adapter->queues[i];
 
@@ -2433,11 +2458,34 @@ igb_allocate_msix(struct adapter *adapte
 	device_t		dev = adapter->dev;
 	struct igb_queue	*que = adapter->queues;
 	int			error, rid, vector = 0;
+	int			cpu_id;
 
 	/* Be sure to start with all interrupts disabled */
 	E1000_WRITE_REG(&adapter->hw, E1000_IMC, ~0);
 	E1000_WRITE_FLUSH(&adapter->hw);
 
+#ifdef	RSS
+	/*
+	 * If we're doing RSS, the number of queues needs to
+	 * match the number of RSS buckets that are configured.
+	 *
+	 * + If there's more queues than RSS buckets, we'll end
+	 *   up with queues that get no traffic.
+	 *
+	 * + If there's more RSS buckets than queues, we'll end
+	 *   up having multiple RSS buckets map to the same queue,
+	 *   so there'll be some contention.
+	 */
+	if (adapter->num_queues != rss_getnumbuckets()) {
+		device_printf(dev,
+		    "%s: number of queues (%d) != number of RSS buckets (%d)"
+		    "; performance will be impacted.\n",
+		    __func__,
+		    adapter->num_queues,
+		    rss_getnumbuckets());
+	}
+#endif
+
 	for (int i = 0; i < adapter->num_queues; i++, vector++, que++) {
 		rid = vector +1;
 		que->res = bus_alloc_resource_any(dev,
@@ -2464,19 +2512,42 @@ igb_allocate_msix(struct adapter *adapte
 			que->eims = E1000_EICR_TX_QUEUE0 << i;
 		else
 			que->eims = 1 << vector;
+
+#ifdef	RSS
 		/*
-		** Bind the msix vector, and thus the
-		** rings to the corresponding cpu.
-		*/
+		 * The queue ID is used as the RSS layer bucket ID.
+		 * We look up the queue ID -> RSS CPU ID and select
+		 * that.
+		 */
+		cpu_id = rss_getcpu(i % rss_getnumbuckets());
+#else
+		/*
+		 * Bind the msix vector, and thus the
+		 * rings to the corresponding cpu.
+		 *
+		 * This just happens to match the default RSS round-robin
+		 * bucket -> queue -> CPU allocation.
+		 */
 		if (adapter->num_queues > 1) {
 			if (igb_last_bind_cpu < 0)
 				igb_last_bind_cpu = CPU_FIRST();
-			bus_bind_intr(dev, que->res, igb_last_bind_cpu);
+			cpu_id = igb_last_bind_cpu;
+		}
+#endif
+
+		if (adapter->num_queues > 1) {
+			bus_bind_intr(dev, que->res, cpu_id);
+#ifdef	RSS
+			device_printf(dev,
+				"Bound queue %d to RSS bucket %d\n",
+				i, cpu_id);
+#else
 			device_printf(dev,
 				"Bound queue %d to cpu %d\n",
-				i,igb_last_bind_cpu);
-			igb_last_bind_cpu = CPU_NEXT(igb_last_bind_cpu);
+				i, cpu_id);
+#endif
 		}
+
 #ifndef IGB_LEGACY_TX
 		TASK_INIT(&que->txr->txq_task, 0, igb_deferred_mq_start,
 		    que->txr);
@@ -2485,8 +2556,34 @@ igb_allocate_msix(struct adapter *adapte
 		TASK_INIT(&que->que_task, 0, igb_handle_que, que);
 		que->tq = taskqueue_create("igb_que", M_NOWAIT,
 		    taskqueue_thread_enqueue, &que->tq);
-		taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que",
-		    device_get_nameunit(adapter->dev));
+		if (adapter->num_queues > 1) {
+			/*
+			 * Only pin the taskqueue thread to a CPU if
+			 * RSS is in use.
+			 *
+			 * This again just happens to match the default RSS
+			 * round-robin bucket -> queue -> CPU allocation.
+			 */
+#ifdef	RSS
+			taskqueue_start_threads_pinned(&que->tq, 1, PI_NET,
+			    cpu_id,
+			    "%s que (bucket %d)",
+			    device_get_nameunit(adapter->dev),
+			    cpu_id);
+#else
+			taskqueue_start_threads(&que->tq, 1, PI_NET,
+			    "%s que (qid %d)",
+			    device_get_nameunit(adapter->dev),
+			    cpu_id);
+#endif
+		} else {
+			taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que",
+			    device_get_nameunit(adapter->dev));
+		}
+
+		/* Finally update the last bound CPU id */
+		if (adapter->num_queues > 1)
+			igb_last_bind_cpu = CPU_NEXT(igb_last_bind_cpu);
 	}
 
 	/* And Link */
@@ -2763,6 +2860,13 @@ igb_setup_msix(struct adapter *adapter)
 	/* Figure out a reasonable auto config value */
 	queues = (mp_ncpus > (msgs-1)) ? (msgs-1) : mp_ncpus;
 
+#ifdef	RSS
+	/* If we're doing RSS, clamp at the number of RSS buckets */
+	if (queues > rss_getnumbuckets())
+		queues = rss_getnumbuckets();
+#endif
+
+
 	/* Manual override */
 	if (igb_num_queues != 0)
 		queues = igb_num_queues;
@@ -4455,6 +4559,99 @@ fail:
 	return (ENOBUFS);
 }
 
+/*
+ * Initialise the RSS mapping for NICs that support multiple transmit/
+ * receive rings.
+ */
+static void
+igb_initialise_rss_mapping(struct adapter *adapter)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	int i;
+	int queue_id;
+
+	u32 rss_key[10], mrqc, shift = 0;
+	union igb_reta {
+		u32 dword;
+		u8  bytes[4];
+	} reta;
+
+	/* XXX? */
+	if (adapter->hw.mac.type == e1000_82575)
+		shift = 6;
+
+	/*
+	 * The redirection table controls which destination
+	 * queue each bucket redirects traffic to.
+	 * Each DWORD represents four queues, with the LSB
+	 * being the first queue in the DWORD.
+	 *
+	 * This just allocates buckets to queues using round-robin
+	 * allocation.
+	 *
+	 * NOTE: It Just Happens to line up with the default
+	 * RSS allocation method.
+	 */
+
+	/* Warning FM follows */
+	for (i = 0; i < 128; i++) {
+#ifdef	RSS
+		queue_id = rss_get_indirection_to_bucket(i);
+		/*
+		 * If we have more queues than buckets, we'll
+		 * end up mapping buckets to a subset of the
+		 * queues.
+		 *
+		 * If we have more buckets than queues, we'll
+		 * end up instead assigning multiple buckets
+		 * to queues.
+		 *
+		 * Both are suboptimal, but we need to handle
+		 * the case so we don't go out of bounds
+		 * indexing arrays and such.
+		 */
+		queue_id = queue_id % adapter->num_queues;
+#else
+		queue_id = (i % adapter->num_queues);
+#endif
+		reta.bytes[i & 3] = queue_id << shift;
+
+		if ((i & 3) == 3)
+			E1000_WRITE_REG(hw,
+			    E1000_RETA(i >> 2), reta.dword);
+	}
+
+
+	/* Now fill in hash table */
+
+	/* XXX This means RSS enable + 8 queues for my igb (82580.) */
+	mrqc = E1000_MRQC_ENABLE_RSS_4Q;
+
+#ifdef	RSS
+	/* XXX ew typecasting */
+	rss_getkey((uint8_t *) &rss_key);
+#else
+	arc4rand(&rss_key, sizeof(rss_key), 0);
+#endif
+	for (i = 0; i < 10; i++)
+		E1000_WRITE_REG_ARRAY(hw,
+		    E1000_RSSRK(0), i, rss_key[i]);
+
+	/*
+	 * Configure the RSS fields to hash upon.
+	 */
+	mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 |
+	    E1000_MRQC_RSS_FIELD_IPV4_TCP);
+	mrqc |= (E1000_MRQC_RSS_FIELD_IPV6 |
+	    E1000_MRQC_RSS_FIELD_IPV6_TCP);
+	mrqc |=( E1000_MRQC_RSS_FIELD_IPV4_UDP |
+	    E1000_MRQC_RSS_FIELD_IPV6_UDP);
+	mrqc |=( E1000_MRQC_RSS_FIELD_IPV6_UDP_EX |
+	    E1000_MRQC_RSS_FIELD_IPV6_TCP_EX);
+
+	E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
+}
+
 /*********************************************************************
  *
  *  Enable receive unit.
@@ -4538,39 +4735,9 @@ igb_initialize_receive_units(struct adap
 	*/
 	rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
 	if (adapter->num_queues >1) {
-		u32 random[10], mrqc, shift = 0;
-		union igb_reta {
-			u32 dword;
-			u8  bytes[4];
-		} reta;
-
-		arc4rand(&random, sizeof(random), 0);
-		if (adapter->hw.mac.type == e1000_82575)
-			shift = 6;
-		/* Warning FM follows */
-		for (int i = 0; i < 128; i++) {
-			reta.bytes[i & 3] =
-			    (i % adapter->num_queues) << shift;
-			if ((i & 3) == 3)
-				E1000_WRITE_REG(hw,
-				    E1000_RETA(i >> 2), reta.dword);
-		}
-		/* Now fill in hash table */
-		mrqc = E1000_MRQC_ENABLE_RSS_4Q;
-		for (int i = 0; i < 10; i++)
-			E1000_WRITE_REG_ARRAY(hw,
-			    E1000_RSSRK(0), i, random[i]);
-
-		mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 |
-		    E1000_MRQC_RSS_FIELD_IPV4_TCP);
-		mrqc |= (E1000_MRQC_RSS_FIELD_IPV6 |
-		    E1000_MRQC_RSS_FIELD_IPV6_TCP);
-		mrqc |=( E1000_MRQC_RSS_FIELD_IPV4_UDP |
-		    E1000_MRQC_RSS_FIELD_IPV6_UDP);
-		mrqc |=( E1000_MRQC_RSS_FIELD_IPV6_UDP_EX |
-		    E1000_MRQC_RSS_FIELD_IPV6_TCP_EX);
 
-		E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
+		/* rss setup */
+		igb_initialise_rss_mapping(adapter);
 
 		/*
 		** NOTE: Receive Full-Packet Checksum Offload 
@@ -4831,7 +4998,7 @@ igb_rxeof(struct igb_queue *que, int cou
 	for (i = rxr->next_to_check; count != 0;) {
 		struct mbuf		*sendmp, *mh, *mp;
 		struct igb_rx_buf	*rxbuf;
-		u16			hlen, plen, hdr, vtag;
+		u16			hlen, plen, hdr, vtag, pkt_info;
 		bool			eop = FALSE;
  
 		cur = &rxr->rx_base[i];
@@ -4853,6 +5020,7 @@ igb_rxeof(struct igb_queue *que, int cou
 		else
 			vtag = le16toh(cur->wb.upper.vlan);
 		hdr = le16toh(cur->wb.lower.lo_dword.hs_rss.hdr_info);
+		pkt_info = le16toh(cur->wb.lower.lo_dword.hs_rss.pkt_info);
 		eop = ((staterr & E1000_RXD_STAT_EOP) == E1000_RXD_STAT_EOP);
 
 		/* Make sure all segments of a bad packet are discarded */
@@ -4952,7 +5120,43 @@ igb_rxeof(struct igb_queue *que, int cou
 				rxr->fmp->m_pkthdr.ether_vtag = vtag;
 				rxr->fmp->m_flags |= M_VLANTAG;
 			}
-#ifndef IGB_LEGACY_TX
+#ifdef	RSS
+			/* XXX set flowtype once this works right */
+			rxr->fmp->m_pkthdr.flowid = 
+			    le32toh(cur->wb.lower.hi_dword.rss);
+			rxr->fmp->m_flags |= M_FLOWID;
+			switch (pkt_info & E1000_RXDADV_RSSTYPE_MASK) {
+			case E1000_RXDADV_RSSTYPE_IPV4_TCP:
+				M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV4);
+				break;
+			case E1000_RXDADV_RSSTYPE_IPV4:
+				M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV4);
+				break;
+			case E1000_RXDADV_RSSTYPE_IPV6_TCP:
+				M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV6);
+				break;
+			case E1000_RXDADV_RSSTYPE_IPV6_EX:
+				M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV6_EX);
+				break;
+			case E1000_RXDADV_RSSTYPE_IPV6:
+				M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_IPV6);
+				break;
+			case E1000_RXDADV_RSSTYPE_IPV6_TCP_EX:
+				M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_RSS_TCP_IPV6_EX);
+				break;
+
+			/* XXX no UDP support in RSS just yet */
+#ifdef notyet
+			case E1000_RXDADV_RSSTYPE_IPV4_UDP:
+			case E1000_RXDADV_RSSTYPE_IPV6_UDP:
+			case E1000_RXDADV_RSSTYPE_IPV6_UDP_EX:
+#endif
+			
+			default:
+				/* XXX fallthrough */
+				M_HASHTYPE_SET(rxr->fmp, M_HASHTYPE_NONE);
+			}
+#elif !defined(IGB_LEGACY_TX)
 			rxr->fmp->m_pkthdr.flowid = que->msix;
 			rxr->fmp->m_flags |= M_FLOWID;
 #endif


More information about the svn-src-all mailing list