svn commit: r270252 - in stable/10: sys/conf sys/dev/e1000 sys/dev/ixgbe sys/dev/netmap tools/tools/netmap
Luigi Rizzo
luigi at FreeBSD.org
Wed Aug 20 23:34:38 UTC 2014
Author: luigi
Date: Wed Aug 20 23:34:36 2014
New Revision: 270252
URL: http://svnweb.freebsd.org/changeset/base/270252
Log:
MFC 270063: update of netmap code
(vtnet and cxgbe not merged yet because we need some other mfc first)
Added:
stable/10/sys/dev/netmap/if_vtnet_netmap.h (contents, props changed)
stable/10/sys/dev/netmap/netmap_monitor.c (contents, props changed)
Modified:
stable/10/sys/conf/files
stable/10/sys/dev/e1000/if_em.c
stable/10/sys/dev/e1000/if_igb.c
stable/10/sys/dev/e1000/if_lem.c
stable/10/sys/dev/ixgbe/ixgbe.c
stable/10/sys/dev/netmap/if_em_netmap.h
stable/10/sys/dev/netmap/if_igb_netmap.h
stable/10/sys/dev/netmap/if_lem_netmap.h
stable/10/sys/dev/netmap/if_re_netmap.h
stable/10/sys/dev/netmap/ixgbe_netmap.h
stable/10/sys/dev/netmap/netmap.c
stable/10/sys/dev/netmap/netmap_freebsd.c
stable/10/sys/dev/netmap/netmap_generic.c
stable/10/sys/dev/netmap/netmap_kern.h
stable/10/sys/dev/netmap/netmap_mbq.h
stable/10/sys/dev/netmap/netmap_mem2.c
stable/10/sys/dev/netmap/netmap_mem2.h
stable/10/sys/dev/netmap/netmap_offloadings.c
stable/10/sys/dev/netmap/netmap_pipe.c
stable/10/sys/dev/netmap/netmap_vale.c
stable/10/tools/tools/netmap/pkt-gen.c
stable/10/tools/tools/netmap/vale-ctl.c
Modified: stable/10/sys/conf/files
==============================================================================
--- stable/10/sys/conf/files Wed Aug 20 23:29:34 2014 (r270251)
+++ stable/10/sys/conf/files Wed Aug 20 23:34:36 2014 (r270252)
@@ -1933,6 +1933,7 @@ dev/netmap/netmap_freebsd.c optional net
dev/netmap/netmap_generic.c optional netmap
dev/netmap/netmap_mbq.c optional netmap
dev/netmap/netmap_mem2.c optional netmap
+dev/netmap/netmap_monitor.c optional netmap
dev/netmap/netmap_offloadings.c optional netmap
dev/netmap/netmap_pipe.c optional netmap
dev/netmap/netmap_vale.c optional netmap
Modified: stable/10/sys/dev/e1000/if_em.c
==============================================================================
--- stable/10/sys/dev/e1000/if_em.c Wed Aug 20 23:29:34 2014 (r270251)
+++ stable/10/sys/dev/e1000/if_em.c Wed Aug 20 23:34:36 2014 (r270252)
@@ -3389,10 +3389,10 @@ em_setup_transmit_ring(struct tx_ring *t
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
+ addr = PNMB(na, slot + si, &paddr);
txr->tx_base[i].buffer_addr = htole64(paddr);
/* reload the map for netmap mode */
- netmap_load_map(txr->txtag, txbuf->map, addr);
+ netmap_load_map(na, txr->txtag, txbuf->map, addr);
}
#endif /* DEV_NETMAP */
@@ -4131,8 +4131,8 @@ em_setup_receive_ring(struct rx_ring *rx
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
- netmap_load_map(rxr->rxtag, rxbuf->map, addr);
+ addr = PNMB(na, slot + si, &paddr);
+ netmap_load_map(na, rxr->rxtag, rxbuf->map, addr);
/* Update descriptor */
rxr->rx_base[j].buffer_addr = htole64(paddr);
continue;
Modified: stable/10/sys/dev/e1000/if_igb.c
==============================================================================
--- stable/10/sys/dev/e1000/if_igb.c Wed Aug 20 23:29:34 2014 (r270251)
+++ stable/10/sys/dev/e1000/if_igb.c Wed Aug 20 23:34:36 2014 (r270252)
@@ -3531,7 +3531,7 @@ igb_setup_transmit_ring(struct tx_ring *
if (slot) {
int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
/* no need to set the address */
- netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+ netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
}
#endif /* DEV_NETMAP */
/* clear the watch index */
@@ -4335,8 +4335,8 @@ igb_setup_receive_ring(struct rx_ring *r
uint64_t paddr;
void *addr;
- addr = PNMB(slot + sj, &paddr);
- netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+ addr = PNMB(na, slot + sj, &paddr);
+ netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
/* Update descriptor */
rxr->rx_base[j].read.pkt_addr = htole64(paddr);
continue;
Modified: stable/10/sys/dev/e1000/if_lem.c
==============================================================================
--- stable/10/sys/dev/e1000/if_lem.c Wed Aug 20 23:29:34 2014 (r270251)
+++ stable/10/sys/dev/e1000/if_lem.c Wed Aug 20 23:34:36 2014 (r270252)
@@ -32,6 +32,15 @@
******************************************************************************/
/*$FreeBSD$*/
+/*
+ * Uncomment the following extensions for better performance in a VM,
+ * especially if you have support in the hypervisor.
+ * See http://info.iet.unipi.it/~luigi/netmap/
+ */
+// #define BATCH_DISPATCH
+// #define NIC_SEND_COMBINING
+// #define NIC_PARAVIRT /* enable virtio-like synchronization */
+
#include "opt_inet.h"
#include "opt_inet6.h"
@@ -289,6 +298,10 @@ static int lem_tx_int_delay_dflt = EM_TI
static int lem_rx_int_delay_dflt = EM_TICKS_TO_USECS(EM_RDTR);
static int lem_tx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_TADV);
static int lem_rx_abs_int_delay_dflt = EM_TICKS_TO_USECS(EM_RADV);
+/*
+ * increase lem_rxd and lem_txd to at least 2048 in netmap mode
+ * for better performance.
+ */
static int lem_rxd = EM_DEFAULT_RXD;
static int lem_txd = EM_DEFAULT_TXD;
static int lem_smart_pwr_down = FALSE;
@@ -458,6 +471,20 @@ lem_attach(device_t dev)
"max number of rx packets to process", &adapter->rx_process_limit,
lem_rx_process_limit);
+#ifdef NIC_SEND_COMBINING
+ /* Sysctls to control mitigation */
+ lem_add_rx_process_limit(adapter, "sc_enable",
+ "driver TDT mitigation", &adapter->sc_enable, 0);
+#endif /* NIC_SEND_COMBINING */
+#ifdef BATCH_DISPATCH
+ lem_add_rx_process_limit(adapter, "batch_enable",
+ "driver rx batch", &adapter->batch_enable, 0);
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+ lem_add_rx_process_limit(adapter, "rx_retries",
+ "driver rx retries", &adapter->rx_retries, 0);
+#endif /* NIC_PARAVIRT */
+
/* Sysctl for setting the interface flow control */
lem_set_flow_cntrl(adapter, "flow_control",
"flow control setting",
@@ -515,6 +542,49 @@ lem_attach(device_t dev)
*/
adapter->hw.mac.report_tx_early = 1;
+#ifdef NIC_PARAVIRT
+ device_printf(dev, "driver supports paravirt, subdev 0x%x\n",
+ adapter->hw.subsystem_device_id);
+ if (adapter->hw.subsystem_device_id == E1000_PARA_SUBDEV) {
+ uint64_t bus_addr;
+
+ device_printf(dev, "paravirt support on dev %p\n", adapter);
+ tsize = 4096; // XXX one page for the csb
+ if (lem_dma_malloc(adapter, tsize, &adapter->csb_mem, BUS_DMA_NOWAIT)) {
+ device_printf(dev, "Unable to allocate csb memory\n");
+ error = ENOMEM;
+ goto err_csb;
+ }
+ /* Setup the Base of the CSB */
+ adapter->csb = (struct paravirt_csb *)adapter->csb_mem.dma_vaddr;
+ /* force the first kick */
+ adapter->csb->host_need_txkick = 1; /* txring empty */
+ adapter->csb->guest_need_rxkick = 1; /* no rx packets */
+ bus_addr = adapter->csb_mem.dma_paddr;
+ lem_add_rx_process_limit(adapter, "csb_on",
+ "enable paravirt.", &adapter->csb->guest_csb_on, 0);
+ lem_add_rx_process_limit(adapter, "txc_lim",
+ "txc_lim", &adapter->csb->host_txcycles_lim, 1);
+
+ /* some stats */
+#define PA_SC(name, var, val) \
+ lem_add_rx_process_limit(adapter, name, name, var, val)
+ PA_SC("host_need_txkick",&adapter->csb->host_need_txkick, 1);
+ PA_SC("host_rxkick_at",&adapter->csb->host_rxkick_at, ~0);
+ PA_SC("guest_need_txkick",&adapter->csb->guest_need_txkick, 0);
+ PA_SC("guest_need_rxkick",&adapter->csb->guest_need_rxkick, 1);
+ PA_SC("tdt_reg_count",&adapter->tdt_reg_count, 0);
+ PA_SC("tdt_csb_count",&adapter->tdt_csb_count, 0);
+ PA_SC("tdt_int_count",&adapter->tdt_int_count, 0);
+ PA_SC("guest_need_kick_count",&adapter->guest_need_kick_count, 0);
+ /* tell the host where the block is */
+ E1000_WRITE_REG(&adapter->hw, E1000_CSBAH,
+ (u32)(bus_addr >> 32));
+ E1000_WRITE_REG(&adapter->hw, E1000_CSBAL,
+ (u32)bus_addr);
+ }
+#endif /* NIC_PARAVIRT */
+
tsize = roundup2(adapter->num_tx_desc * sizeof(struct e1000_tx_desc),
EM_DBA_ALIGN);
@@ -673,6 +743,11 @@ err_hw_init:
err_rx_desc:
lem_dma_free(adapter, &adapter->txdma);
err_tx_desc:
+#ifdef NIC_PARAVIRT
+ lem_dma_free(adapter, &adapter->csb_mem);
+err_csb:
+#endif /* NIC_PARAVIRT */
+
err_pci:
if (adapter->ifp != NULL)
if_free(adapter->ifp);
@@ -760,6 +835,12 @@ lem_detach(device_t dev)
adapter->rx_desc_base = NULL;
}
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) {
+ lem_dma_free(adapter, &adapter->csb_mem);
+ adapter->csb = NULL;
+ }
+#endif /* NIC_PARAVIRT */
lem_release_hw_control(adapter);
free(adapter->mta, M_DEVBUF);
EM_TX_LOCK_DESTROY(adapter);
@@ -869,6 +950,16 @@ lem_start_locked(struct ifnet *ifp)
}
if (adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD)
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+ if (if_getdrvflags(ifp) & IFF_DRV_OACTIVE && adapter->csb &&
+ adapter->csb->guest_csb_on &&
+ !(adapter->csb->guest_need_txkick & 1)) {
+ adapter->csb->guest_need_txkick = 1;
+ adapter->guest_need_kick_count++;
+ // XXX memory barrier
+ lem_txeof(adapter); // XXX possibly clear IFF_DRV_OACTIVE
+ }
+#endif /* NIC_PARAVIRT */
return;
}
@@ -1715,6 +1806,37 @@ lem_xmit(struct adapter *adapter, struct
*/
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) {
+ adapter->csb->guest_tdt = i;
+ /* XXX memory barrier ? */
+ if (adapter->csb->guest_csb_on &&
+ !(adapter->csb->host_need_txkick & 1)) {
+ /* XXX maybe useless
+ * clean the ring. maybe do it before ?
+ * maybe a little bit of hysteresis ?
+ */
+ if (adapter->num_tx_desc_avail <= 64) {// XXX
+ lem_txeof(adapter);
+ }
+ return (0);
+ }
+ }
+#endif /* NIC_PARAVIRT */
+
+#ifdef NIC_SEND_COMBINING
+ if (adapter->sc_enable) {
+ if (adapter->shadow_tdt & MIT_PENDING_INT) {
+ /* signal intr and data pending */
+ adapter->shadow_tdt = MIT_PENDING_TDT | (i & 0xffff);
+ return (0);
+ } else {
+ adapter->shadow_tdt = MIT_PENDING_INT;
+ }
+ }
+#endif /* NIC_SEND_COMBINING */
+
if (adapter->hw.mac.type == e1000_82547 &&
adapter->link_duplex == HALF_DUPLEX)
lem_82547_move_tail(adapter);
@@ -1995,6 +2117,20 @@ lem_local_timer(void *arg)
lem_smartspeed(adapter);
+#ifdef NIC_PARAVIRT
+ /* recover space if needed */
+ if (adapter->csb && adapter->csb->guest_csb_on &&
+ (adapter->watchdog_check == TRUE) &&
+ (ticks - adapter->watchdog_time > EM_WATCHDOG) &&
+ (adapter->num_tx_desc_avail != adapter->num_tx_desc) ) {
+ lem_txeof(adapter);
+ /*
+ * lem_txeof() normally (except when space in the queue
+ * runs low XXX) cleans watchdog_check so that
+ * we do not hang.
+ */
+ }
+#endif /* NIC_PARAVIRT */
/*
* We check the watchdog: the time since
* the last TX descriptor was cleaned.
@@ -2677,10 +2813,10 @@ lem_setup_transmit_structures(struct ada
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
+ addr = PNMB(na, slot + si, &paddr);
adapter->tx_desc_base[i].buffer_addr = htole64(paddr);
/* reload the map for netmap mode */
- netmap_load_map(adapter->txtag, tx_buffer->map, addr);
+ netmap_load_map(na, adapter->txtag, tx_buffer->map, addr);
}
#endif /* DEV_NETMAP */
tx_buffer->next_eop = -1;
@@ -3055,6 +3191,16 @@ lem_txeof(struct adapter *adapter)
adapter->next_tx_to_clean = first;
adapter->num_tx_desc_avail = num_avail;
+#ifdef NIC_SEND_COMBINING
+ if ((adapter->shadow_tdt & MIT_PENDING_TDT) == MIT_PENDING_TDT) {
+ /* a tdt write is pending, do it */
+ E1000_WRITE_REG(&adapter->hw, E1000_TDT(0),
+ 0xffff & adapter->shadow_tdt);
+ adapter->shadow_tdt = MIT_PENDING_INT;
+ } else {
+ adapter->shadow_tdt = 0; // disable
+ }
+#endif /* NIC_SEND_COMBINING */
/*
* If we have enough room, clear IFF_DRV_OACTIVE to
* tell the stack that it is OK to send packets.
@@ -3062,6 +3208,12 @@ lem_txeof(struct adapter *adapter)
*/
if (adapter->num_tx_desc_avail > EM_TX_CLEANUP_THRESHOLD) {
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+#ifdef NIC_PARAVIRT
+ if (adapter->csb) { // XXX also csb_on ?
+ adapter->csb->guest_need_txkick = 2; /* acked */
+ // XXX memory barrier
+ }
+#endif /* NIC_PARAVIRT */
if (adapter->num_tx_desc_avail == adapter->num_tx_desc) {
adapter->watchdog_check = FALSE;
return;
@@ -3247,8 +3399,8 @@ lem_setup_receive_structures(struct adap
uint64_t paddr;
void *addr;
- addr = PNMB(slot + si, &paddr);
- netmap_load_map(adapter->rxtag, rx_buffer->map, addr);
+ addr = PNMB(na, slot + si, &paddr);
+ netmap_load_map(na, adapter->rxtag, rx_buffer->map, addr);
/* Update descriptor */
adapter->rx_desc_base[i].buffer_addr = htole64(paddr);
continue;
@@ -3445,7 +3597,23 @@ lem_rxeof(struct adapter *adapter, int c
int i, rx_sent = 0;
struct e1000_rx_desc *current_desc;
+#ifdef BATCH_DISPATCH
+ struct mbuf *mh = NULL, *mt = NULL;
+#endif /* BATCH_DISPATCH */
+#ifdef NIC_PARAVIRT
+ int retries = 0;
+ struct paravirt_csb* csb = adapter->csb;
+ int csb_mode = csb && csb->guest_csb_on;
+
+ //ND("clear guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ if (csb_mode && csb->guest_need_rxkick)
+ csb->guest_need_rxkick = 0;
+#endif /* NIC_PARAVIRT */
EM_RX_LOCK(adapter);
+
+#ifdef BATCH_DISPATCH
+ batch_again:
+#endif /* BATCH_DISPATCH */
i = adapter->next_rx_desc_to_check;
current_desc = &adapter->rx_desc_base[i];
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
@@ -3458,19 +3626,45 @@ lem_rxeof(struct adapter *adapter, int c
}
#endif /* DEV_NETMAP */
+#if 1 // XXX optimization ?
if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
if (done != NULL)
*done = rx_sent;
EM_RX_UNLOCK(adapter);
return (FALSE);
}
+#endif /* 1 */
while (count != 0 && ifp->if_drv_flags & IFF_DRV_RUNNING) {
struct mbuf *m = NULL;
status = current_desc->status;
- if ((status & E1000_RXD_STAT_DD) == 0)
+ if ((status & E1000_RXD_STAT_DD) == 0) {
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ /* buffer not ready yet. Retry a few times before giving up */
+ if (++retries <= adapter->rx_retries) {
+ continue;
+ }
+ if (csb->guest_need_rxkick == 0) {
+ // ND("set guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ csb->guest_need_rxkick = 1;
+ // XXX memory barrier, status volatile ?
+ continue; /* double check */
+ }
+ }
+ /* no buffer ready, give up */
+#endif /* NIC_PARAVIRT */
break;
+ }
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ if (csb->guest_need_rxkick)
+ // ND("clear again guest_rxkick at %d", adapter->next_rx_desc_to_check);
+ csb->guest_need_rxkick = 0;
+ retries = 0;
+ }
+#endif /* NIC_PARAVIRT */
mp = adapter->rx_buffer_area[i].m_head;
/*
@@ -3595,11 +3789,36 @@ discard:
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ /* the buffer at i has been already replaced by lem_get_buf()
+ * so it is safe to set guest_rdt = i and possibly send a kick.
+ * XXX see if we can optimize it later.
+ */
+ csb->guest_rdt = i;
+ // XXX memory barrier
+ if (i == csb->host_rxkick_at)
+ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
+ }
+#endif /* NIC_PARAVIRT */
/* Advance our pointers to the next descriptor. */
if (++i == adapter->num_rx_desc)
i = 0;
/* Call into the stack */
if (m != NULL) {
+#ifdef BATCH_DISPATCH
+ if (adapter->batch_enable) {
+ if (mh == NULL)
+ mh = mt = m;
+ else
+ mt->m_nextpkt = m;
+ mt = m;
+ m->m_nextpkt = NULL;
+ rx_sent++;
+ current_desc = &adapter->rx_desc_base[i];
+ continue;
+ }
+#endif /* BATCH_DISPATCH */
adapter->next_rx_desc_to_check = i;
EM_RX_UNLOCK(adapter);
(*ifp->if_input)(ifp, m);
@@ -3610,10 +3829,27 @@ discard:
current_desc = &adapter->rx_desc_base[i];
}
adapter->next_rx_desc_to_check = i;
+#ifdef BATCH_DISPATCH
+ if (mh) {
+ EM_RX_UNLOCK(adapter);
+ while ( (mt = mh) != NULL) {
+ mh = mh->m_nextpkt;
+ mt->m_nextpkt = NULL;
+ if_input(ifp, mt);
+ }
+ EM_RX_LOCK(adapter);
+ i = adapter->next_rx_desc_to_check; /* in case of interrupts */
+ if (count > 0)
+ goto batch_again;
+ }
+#endif /* BATCH_DISPATCH */
/* Advance the E1000's Receive Queue #0 "Tail Pointer". */
if (--i < 0)
i = adapter->num_rx_desc - 1;
+#ifdef NIC_PARAVIRT
+ if (!csb_mode) /* filter out writes */
+#endif /* NIC_PARAVIRT */
E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), i);
if (done != NULL)
*done = rx_sent;
Modified: stable/10/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- stable/10/sys/dev/ixgbe/ixgbe.c Wed Aug 20 23:29:34 2014 (r270251)
+++ stable/10/sys/dev/ixgbe/ixgbe.c Wed Aug 20 23:34:36 2014 (r270252)
@@ -3079,7 +3079,7 @@ ixgbe_setup_transmit_ring(struct tx_ring
*/
if (slot) {
int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
- netmap_load_map(txr->txtag, txbuf->map, NMB(slot + si));
+ netmap_load_map(na, txr->txtag, txbuf->map, NMB(na, slot + si));
}
#endif /* DEV_NETMAP */
/* Clear the EOP descriptor pointer */
@@ -4025,8 +4025,8 @@ ixgbe_setup_receive_ring(struct rx_ring
uint64_t paddr;
void *addr;
- addr = PNMB(slot + sj, &paddr);
- netmap_load_map(rxr->ptag, rxbuf->pmap, addr);
+ addr = PNMB(na, slot + sj, &paddr);
+ netmap_load_map(na, rxr->ptag, rxbuf->pmap, addr);
/* Update descriptor and the cached value */
rxr->rx_base[j].read.pkt_addr = htole64(paddr);
rxbuf->addr = htole64(paddr);
Modified: stable/10/sys/dev/netmap/if_em_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_em_netmap.h Wed Aug 20 23:29:34 2014 (r270251)
+++ stable/10/sys/dev/netmap/if_em_netmap.h Wed Aug 20 23:34:36 2014 (r270252)
@@ -113,10 +113,10 @@ em_netmap_reg(struct netmap_adapter *na,
* Reconcile kernel and user view of the transmit ring.
*/
static int
-em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -128,7 +128,7 @@ em_netmap_txsync(struct netmap_adapter *
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+ struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -144,7 +144,7 @@ em_netmap_txsync(struct netmap_adapter *
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
/* device-specific */
struct e1000_tx_desc *curr = &txr->tx_base[nic_i];
@@ -153,12 +153,12 @@ em_netmap_txsync(struct netmap_adapter *
nic_i == 0 || nic_i == report_frequency) ?
E1000_TXD_CMD_RS : 0;
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
if (slot->flags & NS_BUF_CHANGED) {
curr->buffer_addr = htole64(paddr);
/* buffer has changed, reload map */
- netmap_reload_map(txr->txtag, txbuf->map, addr);
+ netmap_reload_map(na, txr->txtag, txbuf->map, addr);
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
@@ -187,7 +187,7 @@ em_netmap_txsync(struct netmap_adapter *
*/
if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
/* record completed transmissions using TDH */
- nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
@@ -208,10 +208,10 @@ em_netmap_txsync(struct netmap_adapter *
* Reconcile kernel and user view of the receive ring.
*/
static int
-em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+em_netmap_rxsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -222,7 +222,7 @@ em_netmap_rxsync(struct netmap_adapter *
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+ struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
if (head > lim)
return netmap_ring_reinit(kring);
@@ -271,18 +271,18 @@ em_netmap_rxsync(struct netmap_adapter *
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
struct e1000_rx_desc *curr = &rxr->rx_base[nic_i];
struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i];
- if (addr == netmap_buffer_base) /* bad buf */
+ if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
goto ring_reset;
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
curr->buffer_addr = htole64(paddr);
- netmap_reload_map(rxr->rxtag, rxbuf->map, addr);
+ netmap_reload_map(na, rxr->rxtag, rxbuf->map, addr);
slot->flags &= ~NS_BUF_CHANGED;
}
curr->status = 0;
Modified: stable/10/sys/dev/netmap/if_igb_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_igb_netmap.h Wed Aug 20 23:29:34 2014 (r270251)
+++ stable/10/sys/dev/netmap/if_igb_netmap.h Wed Aug 20 23:34:36 2014 (r270252)
@@ -81,10 +81,10 @@ igb_netmap_reg(struct netmap_adapter *na
* Reconcile kernel and user view of the transmit ring.
*/
static int
-igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -96,7 +96,7 @@ igb_netmap_txsync(struct netmap_adapter
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct tx_ring *txr = &adapter->tx_rings[ring_nr];
+ struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
/* 82575 needs the queue index added */
u32 olinfo_status =
(adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0;
@@ -115,7 +115,7 @@ igb_netmap_txsync(struct netmap_adapter
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
/* device-specific */
union e1000_adv_tx_desc *curr =
@@ -125,11 +125,11 @@ igb_netmap_txsync(struct netmap_adapter
nic_i == 0 || nic_i == report_frequency) ?
E1000_ADVTXD_DCMD_RS : 0;
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
- netmap_reload_map(txr->txtag, txbuf->map, addr);
+ netmap_reload_map(na, txr->txtag, txbuf->map, addr);
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
@@ -171,7 +171,7 @@ igb_netmap_txsync(struct netmap_adapter
*/
if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
/* record completed transmissions using TDH */
- nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr));
+ nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(kring->ring_id));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
nic_i -= kring->nkr_num_slots;
@@ -190,10 +190,10 @@ igb_netmap_txsync(struct netmap_adapter
* Reconcile kernel and user view of the receive ring.
*/
static int
-igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+igb_netmap_rxsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -204,7 +204,7 @@ igb_netmap_rxsync(struct netmap_adapter
/* device-specific */
struct adapter *adapter = ifp->if_softc;
- struct rx_ring *rxr = &adapter->rx_rings[ring_nr];
+ struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];
if (head > lim)
return netmap_ring_reinit(kring);
@@ -251,17 +251,17 @@ igb_netmap_rxsync(struct netmap_adapter
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i];
struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];
- if (addr == netmap_buffer_base) /* bad buf */
+ if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
goto ring_reset;
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
- netmap_reload_map(rxr->ptag, rxbuf->pmap, addr);
+ netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
slot->flags &= ~NS_BUF_CHANGED;
}
curr->wb.upper.status_error = 0;
Modified: stable/10/sys/dev/netmap/if_lem_netmap.h
==============================================================================
--- stable/10/sys/dev/netmap/if_lem_netmap.h Wed Aug 20 23:29:34 2014 (r270251)
+++ stable/10/sys/dev/netmap/if_lem_netmap.h Wed Aug 20 23:34:36 2014 (r270252)
@@ -39,6 +39,7 @@
#include <vm/pmap.h> /* vtophys ? */
#include <dev/netmap/netmap_kern.h>
+extern int netmap_adaptive_io;
/*
* Register/unregister. We are already under netmap lock.
@@ -84,10 +85,10 @@ lem_netmap_reg(struct netmap_adapter *na
* Reconcile kernel and user view of the transmit ring.
*/
static int
-lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_txsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->tx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -98,6 +99,10 @@ lem_netmap_txsync(struct netmap_adapter
/* device-specific */
struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+ struct paravirt_csb *csb = adapter->csb;
+ uint64_t *csbd = (uint64_t *)(csb + 1);
+#endif /* NIC_PARAVIRT */
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_POSTREAD);
@@ -108,12 +113,25 @@ lem_netmap_txsync(struct netmap_adapter
nm_i = kring->nr_hwcur;
if (nm_i != head) { /* we have new packets to send */
+#ifdef NIC_PARAVIRT
+ int do_kick = 0;
+ uint64_t t = 0; // timestamp
+ int n = head - nm_i;
+ if (n < 0)
+ n += lim + 1;
+ if (csb) {
+ t = rdtsc(); /* last timestamp */
+ csbd[16] += t - csbd[0]; /* total Wg */
+ csbd[17] += n; /* Wg count */
+ csbd[0] = t;
+ }
+#endif /* NIC_PARAVIRT */
nic_i = netmap_idx_k2n(kring, nm_i);
while (nm_i != head) {
struct netmap_slot *slot = &ring->slot[nm_i];
u_int len = slot->len;
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
/* device-specific */
struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i];
@@ -122,12 +140,12 @@ lem_netmap_txsync(struct netmap_adapter
nic_i == 0 || nic_i == report_frequency) ?
E1000_TXD_CMD_RS : 0;
- NM_CHECK_ADDR_LEN(addr, len);
+ NM_CHECK_ADDR_LEN(na, addr, len);
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
curr->buffer_addr = htole64(paddr);
- netmap_reload_map(adapter->txtag, txbuf->map, addr);
+ netmap_reload_map(na, adapter->txtag, txbuf->map, addr);
}
slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
@@ -140,6 +158,7 @@ lem_netmap_txsync(struct netmap_adapter
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
+ // XXX might try an early kick
}
kring->nr_hwcur = head;
@@ -147,8 +166,38 @@ lem_netmap_txsync(struct netmap_adapter
bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map,
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+#ifdef NIC_PARAVIRT
+ /* set unconditionally, then also kick if needed */
+ if (csb) {
+ t = rdtsc();
+ if (csb->host_need_txkick == 2) {
+ /* can compute an update of delta */
+ int64_t delta = t - csbd[3];
+ if (delta < 0)
+ delta = -delta;
+ if (csbd[8] == 0 || delta < csbd[8]) {
+ csbd[8] = delta;
+ csbd[9]++;
+ }
+ csbd[10]++;
+ }
+ csb->guest_tdt = nic_i;
+ csbd[18] += t - csbd[0]; // total wp
+ csbd[19] += n;
+ }
+ if (!csb || !csb->guest_csb_on || (csb->host_need_txkick & 1))
+ do_kick = 1;
+ if (do_kick)
+#endif /* NIC_PARAVIRT */
/* (re)start the tx unit up to slot nic_i (excluded) */
E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i);
+#ifdef NIC_PARAVIRT
+ if (do_kick) {
+ uint64_t t1 = rdtsc();
+ csbd[20] += t1 - t; // total Np
+ csbd[21]++;
+ }
+#endif /* NIC_PARAVIRT */
}
/*
@@ -157,6 +206,93 @@ lem_netmap_txsync(struct netmap_adapter
if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
kring->last_reclaim = ticks;
/* record completed transmissions using TDH */
+#ifdef NIC_PARAVIRT
+ /* host updates tdh unconditionally, and we have
+ * no side effects on reads, so we can read from there
+ * instead of exiting.
+ */
+ if (csb) {
+ static int drain = 0, nodrain=0, good = 0, bad = 0, fail = 0;
+ u_int x = adapter->next_tx_to_clean;
+ csbd[19]++; // XXX count reclaims
+ nic_i = csb->host_tdh;
+ if (csb->guest_csb_on) {
+ if (nic_i == x) {
+ bad++;
+ csbd[24]++; // failed reclaims
+ /* no progress, request kick and retry */
+ csb->guest_need_txkick = 1;
+ mb(); // XXX barrier
+ nic_i = csb->host_tdh;
+ } else {
+ good++;
+ }
+ if (nic_i != x) {
+ csb->guest_need_txkick = 2;
+ if (nic_i == csb->guest_tdt)
+ drain++;
+ else
+ nodrain++;
+#if 1
+ if (netmap_adaptive_io) {
+ /* new mechanism: last half ring (or so)
+ * released one slot at a time.
+ * This effectively makes the system spin.
+ *
+ * Take next_to_clean + 1 as a reference.
+ * tdh must be ahead or equal
+ * On entry, the logical order is
+ * x < tdh = nic_i
+ * We first push tdh up to avoid wraps.
+ * The limit is tdh-ll (half ring).
+ * if tdh-256 < x we report x;
+ * else we report tdh-256
+ */
+ u_int tdh = nic_i;
+ u_int ll = csbd[15];
+ u_int delta = lim/8;
+ if (netmap_adaptive_io == 2 || ll > delta)
+ csbd[15] = ll = delta;
+ else if (netmap_adaptive_io == 1 && ll > 1) {
+ csbd[15]--;
+ }
+
+ if (nic_i >= kring->nkr_num_slots) {
+ RD(5, "bad nic_i %d on input", nic_i);
+ }
+ x = nm_next(x, lim);
+ if (tdh < x)
+ tdh += lim + 1;
+ if (tdh <= x + ll) {
+ nic_i = x;
+ csbd[25]++; //report n + 1;
+ } else {
+ tdh = nic_i;
+ if (tdh < ll)
+ tdh += lim + 1;
+ nic_i = tdh - ll;
+ csbd[26]++; // report tdh - ll
+ }
+ }
+#endif
+ } else {
+ /* we stop, count whether we are idle or not */
+ int bh_active = csb->host_need_txkick & 2 ? 4 : 0;
+ csbd[27+ csb->host_need_txkick]++;
+ if (netmap_adaptive_io == 1) {
+ if (bh_active && csbd[15] > 1)
+ csbd[15]--;
+ else if (!bh_active && csbd[15] < lim/2)
+ csbd[15]++;
+ }
+ bad--;
+ fail++;
+ }
+ }
+ RD(1, "drain %d nodrain %d good %d retry %d fail %d",
+ drain, nodrain, good, bad, fail);
+ } else
+#endif /* !NIC_PARAVIRT */
nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0));
if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
D("TDH wrap %d", nic_i);
@@ -176,10 +312,10 @@ lem_netmap_txsync(struct netmap_adapter
* Reconcile kernel and user view of the receive ring.
*/
static int
-lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
+lem_netmap_rxsync(struct netmap_kring *kring, int flags)
{
+ struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
- struct netmap_kring *kring = &na->rx_rings[ring_nr];
struct netmap_ring *ring = kring->ring;
u_int nm_i; /* index into the netmap ring */
u_int nic_i; /* index into the NIC ring */
@@ -190,10 +326,21 @@ lem_netmap_rxsync(struct netmap_adapter
/* device-specific */
struct adapter *adapter = ifp->if_softc;
+#ifdef NIC_PARAVIRT
+ struct paravirt_csb *csb = adapter->csb;
+ uint32_t csb_mode = csb && csb->guest_csb_on;
+ uint32_t do_host_rxkick = 0;
+#endif /* NIC_PARAVIRT */
if (head > lim)
return netmap_ring_reinit(kring);
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ force_update = 1;
+ csb->guest_need_rxkick = 0;
+ }
+#endif /* NIC_PARAVIRT */
/* XXX check sync modes */
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
@@ -212,11 +359,28 @@ lem_netmap_rxsync(struct netmap_adapter
uint32_t staterr = le32toh(curr->status);
int len;
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ if ((staterr & E1000_RXD_STAT_DD) == 0) {
+ /* don't bother to retry if more than 1 pkt */
+ if (n > 1)
+ break;
+ csb->guest_need_rxkick = 1;
+ wmb();
+ staterr = le32toh(curr->status);
+ if ((staterr & E1000_RXD_STAT_DD) == 0) {
+ break;
+ } else { /* we are good */
+ csb->guest_need_rxkick = 0;
+ }
+ }
+ } else
+#endif /* NIC_PARAVIRT */
if ((staterr & E1000_RXD_STAT_DD) == 0)
break;
len = le16toh(curr->length) - 4; // CRC
if (len < 0) {
- D("bogus pkt size %d nic idx %d", len, nic_i);
+ RD(5, "bogus pkt (%d) size %d nic idx %d", n, len, nic_i);
len = 0;
}
ring->slot[nm_i].len = len;
@@ -228,6 +392,18 @@ lem_netmap_rxsync(struct netmap_adapter
nic_i = nm_next(nic_i, lim);
}
if (n) { /* update the state variables */
+#ifdef NIC_PARAVIRT
+ if (csb_mode) {
+ if (n > 1) {
+ /* leave one spare buffer so we avoid rxkicks */
+ nm_i = nm_prev(nm_i, lim);
+ nic_i = nm_prev(nic_i, lim);
+ n--;
+ } else {
+ csb->guest_need_rxkick = 1;
+ }
+ }
+#endif /* NIC_PARAVIRT */
ND("%d new packets at nic %d nm %d tail %d",
n,
adapter->next_rx_desc_to_check,
@@ -249,23 +425,27 @@ lem_netmap_rxsync(struct netmap_adapter
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
- void *addr = PNMB(slot, &paddr);
+ void *addr = PNMB(na, slot, &paddr);
struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i];
struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i];
- if (addr == netmap_buffer_base) /* bad buf */
+ if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
goto ring_reset;
if (slot->flags & NS_BUF_CHANGED) {
/* buffer has changed, reload map */
curr->buffer_addr = htole64(paddr);
- netmap_reload_map(adapter->rxtag, rxbuf->map, addr);
+ netmap_reload_map(na, adapter->rxtag, rxbuf->map, addr);
slot->flags &= ~NS_BUF_CHANGED;
}
curr->status = 0;
bus_dmamap_sync(adapter->rxtag, rxbuf->map,
BUS_DMASYNC_PREREAD);
+#ifdef NIC_PARAVIRT
+ if (csb_mode && csb->host_rxkick_at == nic_i)
+ do_host_rxkick = 1;
+#endif /* NIC_PARAVIRT */
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-stable-10
mailing list