svn commit: r270252 - in stable/10: sys/conf sys/dev/e1000 sys/dev/ixgbe sys/dev/netmap tools/tools/netmap

Luigi Rizzo luigi at FreeBSD.org
Wed Aug 20 23:34:38 UTC 2014


Author: luigi
Date: Wed Aug 20 23:34:36 2014
New Revision: 270252
URL: http://svnweb.freebsd.org/changeset/base/270252

Log:
  MFC 270063: update of netmap code
  (vtnet and cxgbe not merged yet because we need some other mfc first)

Added:
  stable/10/sys/dev/netmap/if_vtnet_netmap.h   (contents, props changed)
  stable/10/sys/dev/netmap/netmap_monitor.c   (contents, props changed)
Modified:
  stable/10/sys/conf/files
  stable/10/sys/dev/e1000/if_em.c
  stable/10/sys/dev/e1000/if_igb.c
  stable/10/sys/dev/e1000/if_lem.c
  stable/10/sys/dev/ixgbe/ixgbe.c
  stable/10/sys/dev/netmap/if_em_netmap.h
  stable/10/sys/dev/netmap/if_igb_netmap.h
  stable/10/sys/dev/netmap/if_lem_netmap.h
  stable/10/sys/dev/netmap/if_re_netmap.h
  stable/10/sys/dev/netmap/ixgbe_netmap.h
  stable/10/sys/dev/netmap/netmap.c
  stable/10/sys/dev/netmap/netmap_freebsd.c
  stable/10/sys/dev/netmap/netmap_generic.c
  stable/10/sys/dev/netmap/netmap_kern.h
  stable/10/sys/dev/netmap/netmap_mbq.h
  stable/10/sys/dev/netmap/netmap_mem2.c
  stable/10/sys/dev/netmap/netmap_mem2.h
  stable/10/sys/dev/netmap/netmap_offloadings.c
  stable/10/sys/dev/netmap/netmap_pipe.c
  stable/10/sys/dev/netmap/netmap_vale.c
  stable/10/tools/tools/netmap/pkt-gen.c
  stable/10/tools/tools/netmap/vale-ctl.c

Modified: stable/10/sys/conf/files
==============================================================================
--- stable/10/sys/conf/files	Wed Aug 20 23:29:34 2014	(r270251)
+++ stable/10/sys/conf/files	Wed Aug 20 23:34:36 2014	(r270252)
@@ -1933,6 +1933,7 @@ dev/netmap/netmap_freebsd.c	optional net
 dev/netmap/netmap_generic.c	optional netmap
 dev/netmap/netmap_mbq.c		optional netmap
 dev/netmap/netmap_mem2.c	optional netmap
+dev/netmap/netmap_monitor.c	optional netmap
 dev/netmap/netmap_offloadings.c	optional netmap
 dev/netmap/netmap_pipe.c	optional netmap
 dev/netmap/netmap_vale.c	optional netmap

Modified: stable/10/sys/dev/e1000/if_em.c
==============================================================================
--- stable/10/sys/dev/e1000/if_em.c	Wed Aug 20 23:29:34 2014	(r270251)
+++ stable/10/sys/dev/e1000/if_em.c	Wed Aug 20 23:34:36 2014	(r270252)
@@ -3389,10 +3389,10 @@ em_setup_transmit_ring(struct tx_ring *t
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + si, &paddr);
+			addr = PNMB(na, slot + si, &paddr);
 			txr->tx_base[i].buffer_addr = htole64(paddr);
 			/* reload the map for netmap mode */
-			netmap_load_map(txr->txtag, txbuf->map, addr);
+			netmap_load_map(na, txr->txtag, txbuf->map, addr);
 		}
 #endif /* DEV_NETMAP */
 
@@ -4131,8 +4131,8 @@ em_setup_receive_ring(struct rx_ring *rx
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + si, &paddr);
-			netmap_load_map(rxr->rxtag, rxbuf->map, addr);
+			addr = PNMB(na, slot + si, &paddr);
+			netmap_load_map(na, rxr->rxtag, rxbuf->map, addr);
 			/* Update descriptor */
 			rxr->rx_base[j].buffer_addr = htole64(paddr);
 			continue;

Modified: stable/10/sys/dev/e1000/if_igb.c
==============================================================================
--- stable/10/sys/dev/e1000/if_igb.c	Wed Aug 20 23:29:34 2014	(r270251)
+++ stable/10/sys/dev/e1000/if_igb.c	Wed Aug 20 23:34:36 2014	(r270252)
@@ -3531,7 +3531,7 @@ igb_setup_transmit_ring(struct tx_ring *
 		if (slot) {
 			int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
 			/* no need to set the address */
-			netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+			netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
 		}
 #endif /* DEV_NETMAP */
 		/* clear the watch index */
@@ -4335,8 +4335,8 @@ igb_setup_receive_ring(struct rx_ring *r
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + sj, &paddr);
-			netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+			addr = PNMB(na, slot + sj, &paddr);
+			netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
 			/* Update descriptor */
 			rxr->rx_base[j].read.pkt_addr = htole64(paddr);
 			continue;

Modified: stable/10/sys/dev/e1000/if_lem.c
==============================================================================
--- stable/10/sys/dev/e1000/if_lem.c	Wed Aug 20 23:29:34 2014	(r270251)
+++ stable/10/sys/dev/e1000/if_lem.c	Wed Aug 20 23:34:36 2014	(r270252)
@@ -32,6 +32,15 @@
 ******************************************************************************/
 /*$FreeBSD$*/
 
+/*
+ * Uncomment the following extensions for better performance in a VM,
+ * especially if you have support in the hypervisor.
+ * See http://info.iet.unipi.it/~luigi/netmap/
+ */
+// #define BATCH_DISPATCH
+// #define NIC_SEND_COMBINING
+// #define NIC_PARAVIRT	/* enable virtio-like synchronization */
+
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
@@ -289,6 +298,10 @@ static int lem_tx_int_delay_dflt = EM_TI
 static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
 static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV);
 static int lem_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV);
+/*
+ * increase lem_rxd and lem_txd to at least 2048 in netmap mode
+ * for better performance.
+ */
 static int lem_rxd = EM_DEFAULT_RXD;
 static int lem_txd = EM_DEFAULT_TXD;
 static int lem_smart_pwr_down = FALSE;
@@ -458,6 +471,20 @@ lem_attach(device_t dev)
 	    "max number of rx packets to process", &adapter->rx_process_limit,
 	    lem_rx_process_limit);
 
+#ifdef NIC_SEND_COMBINING
+	/* Sysctls to control mitigation */
+	lem_add_rx_process_limit(adapter, "sc_enable",
+	    "driver TDT mitigation", &adapter->sc_enable, 0);
+#endif /* NIC_SEND_COMBINING */
+#ifdef BATCH_DISPATCH
+	lem_add_rx_process_limit(adapter, "batch_enable",
+	    "driver rx batch", &adapter->batch_enable, 0);
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+	lem_add_rx_process_limit(adapter, "rx_retries",
+	    "driver rx retries", &adapter->rx_retries, 0);
+#endif /* NIC_PARAVIRT */
+
         /* Sysctl for setting the interface flow control */
 	lem_set_flow_cntrl(adapter, "flow_control",
 	    "flow control setting",
@@ -515,6 +542,49 @@ lem_attach(device_t dev)
 	 */
 	adapter->hw.mac.report_tx_early = 1;
 
+#ifdef NIC_PARAVIRT
+	device_printf(dev, "driver supports paravirt, subdev 0x%x\n",
+		adapter->hw.subsystem_device_id);
+	if (adapter->hw.subsystem_device_id == E1000_PARA_SUBDEV) {
+		uint64_t bus_addr;
+
+		device_printf(dev, "paravirt support on dev %p\n", adapter);
+		tsize = 4096; // XXX one page for the csb
+		if (lem_dma_malloc(adapter, tsize, &adapter->csb_mem, BUS_DMA_NOWAIT)) {
+			device_printf(dev, "Unable to allocate csb memory\n");
+			error = ENOMEM;
+			goto err_csb;
+		}
+		/* Setup the Base of the CSB */
+		adapter->csb = (struct paravirt_csb *)adapter->csb_mem.dma_vaddr;
+		/* force the first kick */
+		adapter->csb->host_need_txkick = 1; /* txring empty */
+		adapter->csb->guest_need_rxkick = 1; /* no rx packets */
+		bus_addr = adapter->csb_mem.dma_paddr;
+		lem_add_rx_process_limit(adapter, "csb_on",
+		    "enable paravirt.", &adapter->csb->guest_csb_on, 0);
+		lem_add_rx_process_limit(adapter, "txc_lim",
+		    "txc_lim", &adapter->csb->host_txcycles_lim, 1);
+
+		/* some stats */
+#define PA_SC(name, var, val)		\
+	lem_add_rx_process_limit(adapter, name, name, var, val)
+		PA_SC("host_need_txkick",&adapter->csb->host_need_txkick, 1);
+		PA_SC("host_rxkick_at",&adapter->csb->host_rxkick_at, ~0);
+		PA_SC("guest_need_txkick",&adapter->csb->guest_need_txkick, 0);
+		PA_SC("guest_need_rxkick",&adapter->csb->guest_need_rxkick, 1);
+		PA_SC("tdt_reg_count",&adapter->tdt_reg_count, 0);
+		PA_SC("tdt_csb_count",&adapter->tdt_csb_count, 0);
+		PA_SC("tdt_int_count",&adapter->tdt_int_count, 0);
+		PA_SC("guest_need_kick_count",&adapter->guest_need_kick_count, 0);
+		/* tell the host where the block is */
+		E1000_WRITE_REG(&adapter->hw, E1000_CSBAH,
+			(u32)(bus_addr >> 32));
+		E1000_WRITE_REG(&adapter->hw, E1000_CSBAL,
+			(u32)bus_addr);
+	}
+#endif /* NIC_PARAVIRT */
+
 	tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc),
 	    EM_DBA_ALIGN);
 
@@ -673,6 +743,11 @@ err_hw_init:
 err_rx_desc:
 	lem_dma_free(adapter, &adapter->txdma);
 err_tx_desc:
+#ifdef NIC_PARAVIRT
+	lem_dma_free(adapter, &adapter->csb_mem);
+err_csb:
+#endif /* NIC_PARAVIRT */
+
 err_pci:
 	if (adapter->ifp != NULL)
 		if_free(adapter->ifp);
@@ -760,6 +835,12 @@ lem_detach(device_t dev)
 		adapter->rx_desc_base = NULL;
 	}
 
+#ifdef NIC_PARAVIRT
+	if (adapter->csb) {
+		lem_dma_free(adapter, &adapter->csb_mem);
+		adapter->csb = NULL;
+	}
+#endif /* NIC_PARAVIRT */
 	lem_release_hw_control(adapter);
 	free(adapter->mta, M_DEVBUF);
 	EM_TX_LOCK_DESTROY(adapter);
@@ -869,6 +950,16 @@ lem_start_locked(struct ifnet *ifp)
 	}
 	if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD)
 		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+	if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE && adapter->csb &&
+	    adapter->csb->guest_csb_on &&
+	    !(adapter->csb->guest_need_txkick & 1))  {
+		adapter->csb->guest_need_txkick = 1;
+		adapter->guest_need_kick_count++;
+		// XXX memory barrier
+		lem_txeof(adapter); // XXX possibly clear IFF_DRV_OACTIVE
+	}
+#endif /* NIC_PARAVIRT */
 
 	return;
 }
@@ -1715,6 +1806,37 @@ lem_xmit(struct adapter *adapter, struct
 	 */
 	bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+#ifdef NIC_PARAVIRT
+	if (adapter->csb) {
+		adapter->csb->guest_tdt = i;
+		/* XXX memory barrier ? */
+ 		if (adapter->csb->guest_csb_on &&
+		    !(adapter->csb->host_need_txkick & 1)) {
+			/* XXX maybe useless
+			 * clean the ring. maybe do it before ?
+			 * maybe a little bit of hysteresis ?
+			 */
+			if (adapter->num_tx_desc_avail <= 64) {// XXX
+				lem_txeof(adapter);
+			}
+			return (0);
+		}
+	}
+#endif /* NIC_PARAVIRT */
+
+#ifdef NIC_SEND_COMBINING
+	if (adapter->sc_enable) {
+		if (adapter->shadow_tdt & MIT_PENDING_INT) {
+			/* signal intr and data pending */
+			adapter->shadow_tdt = MIT_PENDING_TDT | (i & 0xffff);
+			return (0);
+		} else {
+			adapter->shadow_tdt = MIT_PENDING_INT;
+		}
+	}
+#endif /* NIC_SEND_COMBINING */
+
 	if (adapter->hw.mac.type == e1000_82547 &&
 	    adapter->link_duplex == HALF_DUPLEX)
 		lem_82547_move_tail(adapter);
@@ -1995,6 +2117,20 @@ lem_local_timer(void *arg)
 
 	lem_smartspeed(adapter);
 
+#ifdef NIC_PARAVIRT
+	/* recover space if needed */
+	if (adapter->csb && adapter->csb->guest_csb_on &&
+	    (adapter->watchdog_check == TRUE) &&
+	    (ticks - adapter->watchdog_time > EM_WATCHDOG) &&
+	    (adapter->num_tx_desc_avail != adapter->num_tx_desc) ) {
+		lem_txeof(adapter);
+		/*
+		 * lem_txeof() normally (except when space in the queue
+		 * runs low XXX) clears watchdog_check so that
+		 * we do not hang.
+		 */
+	}
+#endif /* NIC_PARAVIRT */
 	/*
 	 * We check the watchdog: the time since
 	 * the last TX descriptor was cleaned.
@@ -2677,10 +2813,10 @@ lem_setup_transmit_structures(struct ada
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + si, &paddr);
+			addr = PNMB(na, slot + si, &paddr);
 			adapter->tx_desc_base[i].buffer_addr = htole64(paddr);
 			/* reload the map for netmap mode */
-			netmap_load_map(adapter->txtag, tx_buffer->map, addr);
+			netmap_load_map(na, adapter->txtag, tx_buffer->map, addr);
 		}
 #endif /* DEV_NETMAP */
 		tx_buffer->next_eop = -1;
@@ -3055,6 +3191,16 @@ lem_txeof(struct adapter *adapter)
         adapter->next_tx_to_clean = first;
         adapter->num_tx_desc_avail = num_avail;
 
+#ifdef NIC_SEND_COMBINING
+	if ((adapter->shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) {
+		/* a tdt write is pending, do it */
+		E1000_WRITE_REG(&adapter->hw, E1000_TDT(0),
+			0xffff & adapter->shadow_tdt);
+		adapter->shadow_tdt = MIT_PENDING_INT;
+	} else {
+		adapter->shadow_tdt = 0; // disable
+	}
+#endif /* NIC_SEND_COMBINING */
         /*
          * If we have enough room, clear IFF_DRV_OACTIVE to
          * tell the stack that it is OK to send packets.
@@ -3062,6 +3208,12 @@ lem_txeof(struct adapter *adapter)
          */
         if (adapter->num_tx_desc_avail > EM_TX_CLEANUP_THRESHOLD) {                
                 ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+		if (adapter->csb) { // XXX also csb_on ?
+			adapter->csb->guest_need_txkick = 2; /* acked */
+			// XXX memory barrier
+		}
+#endif /* NIC_PARAVIRT */
                 if (adapter->num_tx_desc_avail == adapter->num_tx_desc) {
 			adapter->watchdog_check = FALSE;
 			return;
@@ -3247,8 +3399,8 @@ lem_setup_receive_structures(struct adap
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + si, &paddr);
-			netmap_load_map(adapter->rxtag, rx_buffer->map, addr);
+			addr = PNMB(na, slot + si, &paddr);
+			netmap_load_map(na, adapter->rxtag, rx_buffer->map, addr);
 			/* Update descriptor */
 			adapter->rx_desc_base[i].buffer_addr = htole64(paddr);
 			continue;
@@ -3445,7 +3597,23 @@ lem_rxeof(struct adapter *adapter, int c
 	int		i, rx_sent = 0;
 	struct e1000_rx_desc   *current_desc;
 
+#ifdef BATCH_DISPATCH
+	struct mbuf *mh = NULL, *mt = NULL;
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+	int retries = 0;
+	struct paravirt_csb* csb = adapter->csb;
+	int csb_mode = csb && csb->guest_csb_on;
+
+	//ND("clear guest_rxkick at %d", adapter->next_rx_desc_to_check);
+	if (csb_mode && csb->guest_need_rxkick)
+		csb->guest_need_rxkick = 0;
+#endif /* NIC_PARAVIRT */
 	EM_RX_LOCK(adapter);
+
+#ifdef BATCH_DISPATCH
+    batch_again:
+#endif /* BATCH_DISPATCH */
 	i = adapter->next_rx_desc_to_check;
 	current_desc = &adapter->rx_desc_base[i];
 	bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
@@ -3458,19 +3626,45 @@ lem_rxeof(struct adapter *adapter, int c
 	}
 #endif /* DEV_NETMAP */
 
+#if 1 // XXX optimization ?
 	if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
 		if (done != NULL)
 			*done = rx_sent;
 		EM_RX_UNLOCK(adapter);
 		return (FALSE);
 	}
+#endif /* 1 */
 
 	while (count != 0 && ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		struct mbuf *m = NULL;
 
 		status = current_desc->status;
-		if ((status & E1000_RXD_STAT_DD) == 0)
+		if ((status & E1000_RXD_STAT_DD) == 0) {
+#ifdef NIC_PARAVIRT
+		    if (csb_mode) {
+			/* buffer not ready yet. Retry a few times before giving up */
+			if (++retries <= adapter->rx_retries) {
+				continue;
+			}
+			if (csb->guest_need_rxkick == 0) {
+				// ND("set guest_rxkick at %d", adapter->next_rx_desc_to_check);
+				csb->guest_need_rxkick = 1;
+				// XXX memory barrier, status volatile ?
+				continue; /* double check */
+			}
+		    }
+		    /* no buffer ready, give up */
+#endif /* NIC_PARAVIRT */
 			break;
+		}
+#ifdef NIC_PARAVIRT
+		if (csb_mode) {
+			if (csb->guest_need_rxkick)
+				// ND("clear again guest_rxkick at %d", adapter->next_rx_desc_to_check);
+			csb->guest_need_rxkick = 0;
+			retries = 0;
+		}
+#endif /* NIC_PARAVIRT */
 
 		mp = adapter->rx_buffer_area[i].m_head;
 		/*
@@ -3595,11 +3789,36 @@ discard:
 		bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
 		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
+#ifdef NIC_PARAVIRT
+		if (csb_mode) {
+			/* the buffer at i has been already replaced by lem_get_buf()
+			 * so it is safe to set guest_rdt = i and possibly send a kick.
+			 * XXX see if we can optimize it later.
+			 */
+			csb->guest_rdt = i;
+			// XXX memory barrier
+			if (i == csb->host_rxkick_at)
+				E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
+		}
+#endif /* NIC_PARAVIRT */
 		/* Advance our pointers to the next descriptor. */
 		if (++i == adapter->num_rx_desc)
 			i = 0;
 		/* Call into the stack */
 		if (m != NULL) {
+#ifdef BATCH_DISPATCH
+		    if (adapter->batch_enable) {
+			if (mh == NULL)
+				mh = mt = m;
+			else
+				mt->m_nextpkt = m;
+			mt = m;
+			m->m_nextpkt = NULL;
+			rx_sent++;
+			current_desc = &adapter->rx_desc_base[i];
+			continue;
+		    }
+#endif /* BATCH_DISPATCH */
 			adapter->next_rx_desc_to_check = i;
 			EM_RX_UNLOCK(adapter);
 			(*ifp->if_input)(ifp, m);
@@ -3610,10 +3829,27 @@ discard:
 		current_desc = &adapter->rx_desc_base[i];
 	}
 	adapter->next_rx_desc_to_check = i;
+#ifdef BATCH_DISPATCH
+	if (mh) {
+		EM_RX_UNLOCK(adapter);
+		while ( (mt = mh) != NULL) {
+			mh = mh->m_nextpkt;
+			mt->m_nextpkt = NULL;
+			if_input(ifp, mt);
+		}
+		EM_RX_LOCK(adapter);
+		i = adapter->next_rx_desc_to_check; /* in case of interrupts */
+		if (count > 0)
+			goto batch_again;
+	}
+#endif /* BATCH_DISPATCH */
 
 	/* Advance the E1000's Receive Queue #0  "Tail Pointer". */
 	if (--i < 0)
 		i = adapter->num_rx_desc - 1;
+#ifdef NIC_PARAVIRT
+	if (!csb_mode) /* filter out writes */
+#endif /* NIC_PARAVIRT */
 	E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
 	if (done != NULL)
 		*done = rx_sent;

Modified: stable/10/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- stable/10/sys/dev/ixgbe/ixgbe.c	Wed Aug 20 23:29:34 2014	(r270251)
+++ stable/10/sys/dev/ixgbe/ixgbe.c	Wed Aug 20 23:34:36 2014	(r270252)
@@ -3079,7 +3079,7 @@ ixgbe_setup_transmit_ring(struct tx_ring
 		 */
 		if (slot) {
 			int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
-			netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+			netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
 		}
 #endif /* DEV_NETMAP */
 		/* Clear the EOP descriptor pointer */
@@ -4025,8 +4025,8 @@ ixgbe_setup_receive_ring(struct rx_ring 
 			uint64_t paddr;
 			void *addr;
 
-			addr = PNMB(slot + sj, &paddr);
-			netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+			addr = PNMB(na, slot + sj, &paddr);
+			netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
 			/* Update descriptor and the cached value */
 			rxr->rx_base[j].read.pkt_addr = htole64(paddr);
 			rxbuf->addr = htole64(paddr);

Modified: stable/10/sys/dev/netmap/if_em_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_em_netmap.h	Wed Aug 20 23:29:34 2014	(r270251)
+++ stable/10/sys/dev/netmap/if_em_netmap.h	Wed Aug 20 23:34:36 2014	(r270252)
@@ -113,10 +113,10 @@ em_netmap_reg(struct netmap_adapter *na,
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->tx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -128,7 +128,7 @@ em_netmap_txsync(struct netmap_adapter *
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
-	struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+	struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
 
 	bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
 			BUS_DMASYNC_POSTREAD);
@@ -144,7 +144,7 @@ em_netmap_txsync(struct netmap_adapter *
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			/* device-specific */
 			struct e1000_tx_desc *curr = &txr->tx_base[nic_i];
@@ -153,12 +153,12 @@ em_netmap_txsync(struct netmap_adapter *
 				nic_i == 0 || nic_i == report_frequency) ?
 				E1000_TXD_CMD_RS : 0;
 
-			NM_CHECK_ADDR_LEN(addr, len);
+			NM_CHECK_ADDR_LEN(na, addr, len);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				curr->buffer_addr = htole64(paddr);
 				/* buffer has changed, reload map */
-				netmap_reload_map(txr->txtag, txbuf->map, addr);
+				netmap_reload_map(na, txr->txtag, txbuf->map, addr);
 			}
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -187,7 +187,7 @@ em_netmap_txsync(struct netmap_adapter *
 	 */
 	if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
 		/* record completed transmissions using TDH */
-		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
 		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
 			D("TDH wrap %d", nic_i);
 			nic_i -= kring->nkr_num_slots;
@@ -208,10 +208,10 @@ em_netmap_txsync(struct netmap_adapter *
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -222,7 +222,7 @@ em_netmap_rxsync(struct netmap_adapter *
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
-	struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+	struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
 
 	if (head > lim)
 		return netmap_ring_reinit(kring);
@@ -271,18 +271,18 @@ em_netmap_rxsync(struct netmap_adapter *
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			struct e1000_rx_desc *curr = &rxr->rx_base[nic_i];
 			struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i];
 
-			if (addr == netmap_buffer_base) /* bad buf */
+			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 				goto ring_reset;
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				curr->buffer_addr = htole64(paddr);
-				netmap_reload_map(rxr->rxtag, rxbuf->map, addr);
+				netmap_reload_map(na, rxr->rxtag, rxbuf->map, addr);
 				slot->flags &= ~NS_BUF_CHANGED;
 			}
 			curr->status = 0;

Modified: stable/10/sys/dev/netmap/if_igb_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_igb_netmap.h	Wed Aug 20 23:29:34 2014	(r270251)
+++ stable/10/sys/dev/netmap/if_igb_netmap.h	Wed Aug 20 23:34:36 2014	(r270252)
@@ -81,10 +81,10 @@ igb_netmap_reg(struct netmap_adapter *na
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->tx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -96,7 +96,7 @@ igb_netmap_txsync(struct netmap_adapter 
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
-	struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+	struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
 	/* 82575 needs the queue index added */
 	u32 olinfo_status =
 	    (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0;
@@ -115,7 +115,7 @@ igb_netmap_txsync(struct netmap_adapter 
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			/* device-specific */
 			union e1000_adv_tx_desc *curr =
@@ -125,11 +125,11 @@ igb_netmap_txsync(struct netmap_adapter 
 				nic_i == 0 || nic_i == report_frequency) ?
 				E1000_ADVTXD_DCMD_RS : 0;
 
-			NM_CHECK_ADDR_LEN(addr, len);
+			NM_CHECK_ADDR_LEN(na, addr, len);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
-				netmap_reload_map(txr->txtag, txbuf->map, addr);
+				netmap_reload_map(na, txr->txtag, txbuf->map, addr);
 			}
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -171,7 +171,7 @@ igb_netmap_txsync(struct netmap_adapter 
 	 */
 	if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
 		/* record completed transmissions using TDH */
-		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
 		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
 			D("TDH wrap %d", nic_i);
 			nic_i -= kring->nkr_num_slots;
@@ -190,10 +190,10 @@ igb_netmap_txsync(struct netmap_adapter 
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -204,7 +204,7 @@ igb_netmap_rxsync(struct netmap_adapter 
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
-	struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+	struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
 
 	if (head > lim)
 		return netmap_ring_reinit(kring);
@@ -251,17 +251,17 @@ igb_netmap_rxsync(struct netmap_adapter 
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i];
 			struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];
 
-			if (addr == netmap_buffer_base) /* bad buf */
+			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 				goto ring_reset;
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
-				netmap_reload_map(rxr->ptag, rxbuf->pmap, addr);
+				netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
 				slot->flags &= ~NS_BUF_CHANGED;
 			}
 			curr->wb.upper.status_error = 0;

Modified: stable/10/sys/dev/netmap/if_lem_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_lem_netmap.h	Wed Aug 20 23:29:34 2014	(r270251)
+++ stable/10/sys/dev/netmap/if_lem_netmap.h	Wed Aug 20 23:34:36 2014	(r270252)
@@ -39,6 +39,7 @@
 #include <vm/pmap.h>    /* vtophys ? */
 #include <dev/netmap/netmap_kern.h>
 
+extern int netmap_adaptive_io;
 
 /*
  * Register/unregister. We are already under netmap lock.
@@ -84,10 +85,10 @@ lem_netmap_reg(struct netmap_adapter *na
  * Reconcile kernel and user view of the transmit ring.
  */
 static int
-lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_txsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->tx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -98,6 +99,10 @@ lem_netmap_txsync(struct netmap_adapter 
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+	struct paravirt_csb *csb = adapter->csb;
+	uint64_t *csbd = (uint64_t *)(csb + 1);
+#endif /* NIC_PARAVIRT */
 
 	bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
 			BUS_DMASYNC_POSTREAD);
@@ -108,12 +113,25 @@ lem_netmap_txsync(struct netmap_adapter 
 
 	nm_i = kring->nr_hwcur;
 	if (nm_i != head) {	/* we have new packets to send */
+#ifdef NIC_PARAVIRT
+		int do_kick = 0;
+		uint64_t t = 0; // timestamp
+		int n = head - nm_i;
+		if (n < 0)
+			n += lim + 1;
+		if (csb) {
+			t = rdtsc(); /* last timestamp */
+			csbd[16] += t - csbd[0]; /* total Wg */
+			csbd[17] += n;		/* Wg count */
+			csbd[0] = t;
+		}
+#endif /* NIC_PARAVIRT */
 		nic_i = netmap_idx_k2n(kring, nm_i);
 		while (nm_i != head) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			u_int len = slot->len;
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			/* device-specific */
 			struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i];
@@ -122,12 +140,12 @@ lem_netmap_txsync(struct netmap_adapter 
 				nic_i == 0 || nic_i == report_frequency) ?
 				E1000_TXD_CMD_RS : 0;
 
-			NM_CHECK_ADDR_LEN(addr, len);
+			NM_CHECK_ADDR_LEN(na, addr, len);
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				curr->buffer_addr = htole64(paddr);
-				netmap_reload_map(adapter->txtag, txbuf->map, addr);
+				netmap_reload_map(na, adapter->txtag, txbuf->map, addr);
 			}
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 
@@ -140,6 +158,7 @@ lem_netmap_txsync(struct netmap_adapter 
 
 			nm_i = nm_next(nm_i, lim);
 			nic_i = nm_next(nic_i, lim);
+			// XXX might try an early kick
 		}
 		kring->nr_hwcur = head;
 
@@ -147,8 +166,38 @@ lem_netmap_txsync(struct netmap_adapter 
 		bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
 			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
+#ifdef NIC_PARAVIRT
+		/* set unconditionally, then also kick if needed */
+		if (csb) {
+			t = rdtsc();
+			if (csb->host_need_txkick == 2) {
+				/* can compute an update of delta */
+				int64_t delta = t - csbd[3];
+				if (delta < 0)
+					delta = -delta;
+				if (csbd[8] == 0 || delta < csbd[8]) {
+					csbd[8] = delta;
+					csbd[9]++;
+				}
+				csbd[10]++;
+			}
+			csb->guest_tdt = nic_i;
+			csbd[18] += t - csbd[0]; // total wp
+			csbd[19] += n;
+		}
+		if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1))
+			do_kick = 1;
+		if (do_kick)
+#endif /* NIC_PARAVIRT */
 		/* (re)start the tx unit up to slot nic_i (excluded) */
 		E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i);
+#ifdef NIC_PARAVIRT
+		if (do_kick) {
+			uint64_t t1 = rdtsc();
+			csbd[20] += t1 - t; // total Np
+			csbd[21]++;
+		}
+#endif /* NIC_PARAVIRT */
 	}
 
 	/*
@@ -157,6 +206,93 @@ lem_netmap_txsync(struct netmap_adapter 
 	if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
 		kring->last_reclaim = ticks;
 		/* record completed transmissions using TDH */
+#ifdef NIC_PARAVIRT
+		/* host updates tdh unconditionally, and we have
+		 * no side effects on reads, so we can read from there
+		 * instead of exiting.
+		 */
+		if (csb) {
+		    static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0;
+		    u_int x = adapter->next_tx_to_clean;
+		    csbd[19]++; // XXX count reclaims
+		    nic_i = csb->host_tdh;
+		    if (csb->guest_csb_on) {
+			if (nic_i == x) {
+			    bad++;
+		    	    csbd[24]++; // failed reclaims
+			    /* no progress, request kick and retry */
+			    csb->guest_need_txkick = 1;
+			    mb(); // XXX barrier
+		    	    nic_i = csb->host_tdh;
+			} else {
+			    good++;
+			}
+			if (nic_i != x) {
+			    csb->guest_need_txkick = 2;
+			    if (nic_i == csb->guest_tdt)
+				drain++;
+			    else
+				nodrain++;
+#if 1
+			if (netmap_adaptive_io) {
+			    /* new mechanism: last half ring (or so)
+			     * released one slot at a time.
+			     * This effectively makes the system spin.
+			     *
+			     * Take next_to_clean + 1 as a reference.
+			     * tdh must be ahead or equal
+			     * On entry, the logical order is
+			     *		x < tdh = nic_i
+			     * We first push tdh up to avoid wraps.
+			     * The limit is tdh-ll (half ring).
+			     * if tdh-ll <= x we report x;
+			     * else we report tdh-ll
+			     */
+			    u_int tdh = nic_i;
+			    u_int ll = csbd[15];
+			    u_int delta = lim/8;
+			    if (netmap_adaptive_io == 2 || ll > delta)
+				csbd[15] = ll = delta;
+			    else if (netmap_adaptive_io == 1 && ll > 1) {
+				csbd[15]--;
+			    }
+
+			    if (nic_i >= kring->nkr_num_slots) {
+				RD(5, "bad nic_i %d on input", nic_i);
+			    }
+			    x = nm_next(x, lim);
+			    if (tdh < x)
+				tdh += lim + 1;
+			    if (tdh <= x + ll) {
+				nic_i = x;
+				csbd[25]++; //report n + 1;
+			    } else {
+				tdh = nic_i;
+				if (tdh < ll)
+				    tdh += lim + 1;
+				nic_i = tdh - ll;
+				csbd[26]++; // report tdh - ll
+			    }
+			}
+#endif
+			} else {
+			    /* we stop, count whether we are idle or not */
+			    int bh_active = csb->host_need_txkick & 2 ? 4 : 0;
+			    csbd[27+ csb->host_need_txkick]++;
+			    if (netmap_adaptive_io == 1) {
+				if (bh_active && csbd[15] > 1)
+				    csbd[15]--;
+				else if (!bh_active && csbd[15] < lim/2)
+				    csbd[15]++;
+			    }
+			    bad--;
+			    fail++;
+			}
+		    }
+		    RD(1, "drain %d nodrain %d good %d retry %d fail %d",
+			drain, nodrain, good, bad, fail);
+		} else
+#endif /* NIC_PARAVIRT */
 		nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
 		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
 			D("TDH wrap %d", nic_i);
@@ -176,10 +312,10 @@ lem_netmap_txsync(struct netmap_adapter 
  * Reconcile kernel and user view of the receive ring.
  */
 static int
-lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
+	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
-	struct netmap_kring *kring = &na->rx_rings[ring_nr];
 	struct netmap_ring *ring = kring->ring;
 	u_int nm_i;	/* index into the netmap ring */
 	u_int nic_i;	/* index into the NIC ring */
@@ -190,10 +326,21 @@ lem_netmap_rxsync(struct netmap_adapter 
 
 	/* device-specific */
 	struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+	struct paravirt_csb *csb = adapter->csb;
+	uint32_t csb_mode = csb && csb->guest_csb_on;
+	uint32_t do_host_rxkick = 0;
+#endif /* NIC_PARAVIRT */
 
 	if (head > lim)
 		return netmap_ring_reinit(kring);
 
+#ifdef NIC_PARAVIRT
+	if (csb_mode) {
+		force_update = 1;
+		csb->guest_need_rxkick = 0;
+	}
+#endif /* NIC_PARAVIRT */
 	/* XXX check sync modes */
 	bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
 			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
@@ -212,11 +359,28 @@ lem_netmap_rxsync(struct netmap_adapter 
 			uint32_t staterr = le32toh(curr->status);
 			int len;
 
+#ifdef NIC_PARAVIRT
+			if (csb_mode) {
+			    if ((staterr & E1000_RXD_STAT_DD) == 0) {
+				/* don't bother to retry if more than 1 pkt */
+				if (n > 1)
+				    break;
+				csb->guest_need_rxkick = 1;
+				wmb();
+				staterr = le32toh(curr->status);
+				if ((staterr & E1000_RXD_STAT_DD) == 0) {
+				    break;
+				} else { /* we are good */
+				   csb->guest_need_rxkick = 0;
+				}
+			    }
+			} else
+#endif /* NIC_PARAVIRT */
 			if ((staterr & E1000_RXD_STAT_DD) == 0)
 				break;
 			len = le16toh(curr->length) - 4; // CRC
 			if (len < 0) {
-				D("bogus pkt size %d nic idx %d", len, nic_i);
+				RD(5, "bogus pkt (%d) size %d nic idx %d", n, len, nic_i);
 				len = 0;
 			}
 			ring->slot[nm_i].len = len;
@@ -228,6 +392,18 @@ lem_netmap_rxsync(struct netmap_adapter 
 			nic_i = nm_next(nic_i, lim);
 		}
 		if (n) { /* update the state variables */
+#ifdef NIC_PARAVIRT
+			if (csb_mode) {
+			    if (n > 1) {
+				/* leave one spare buffer so we avoid rxkicks */
+				nm_i = nm_prev(nm_i, lim);
+				nic_i = nm_prev(nic_i, lim);
+				n--;
+			    } else {
+				csb->guest_need_rxkick = 1;
+			    }
+			}
+#endif /* NIC_PARAVIRT */
 			ND("%d new packets at nic %d nm %d tail %d",
 				n,
 				adapter->next_rx_desc_to_check,
@@ -249,23 +425,27 @@ lem_netmap_rxsync(struct netmap_adapter 
 		for (n = 0; nm_i != head; n++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			uint64_t paddr;
-			void *addr = PNMB(slot, &paddr);
+			void *addr = PNMB(na, slot, &paddr);
 
 			struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i];
 			struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i];
 
-			if (addr == netmap_buffer_base) /* bad buf */
+			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
 				goto ring_reset;
 
 			if (slot->flags & NS_BUF_CHANGED) {
 				/* buffer has changed, reload map */
 				curr->buffer_addr = htole64(paddr);
-				netmap_reload_map(adapter->rxtag, rxbuf->map, addr);
+				netmap_reload_map(na, adapter->rxtag, rxbuf->map, addr);
 				slot->flags &= ~NS_BUF_CHANGED;
 			}
 			curr->status = 0;
 			bus_dmamap_sync(adapter->rxtag, rxbuf->map,
 			    BUS_DMASYNC_PREREAD);
+#ifdef NIC_PARAVIRT
+			if (csb_mode && csb->host_rxkick_at == nic_i)
+				do_host_rxkick = 1;
+#endif /* NIC_PARAVIRT */

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list