svn commit: r276485 - in head/sys: conf dev/cxgbe modules/cxgbe/if_cxgbe

Navdeep Parhar <np@FreeBSD.org>
Wed Dec 31 23:19:19 UTC 2014


Author: np
Date: Wed Dec 31 23:19:16 2014
New Revision: 276485
URL: https://svnweb.freebsd.org/changeset/base/276485

Log:
  cxgbe(4): major tx rework.
  
  a) Front-load as much work as possible in if_transmit, before any driver
  lock or software queue gets involved.
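  
  This is visible in the new cxgbe_transmit() (t4_main.c hunk below): the
  mbuf is parsed and validated up front, before any lock is taken or any
  queue is selected.
  
	rc = parse_pkt(&m);
	if (__predict_false(rc != 0)) {
		MPASS(m == NULL);	/* parse_pkt frees the mbuf on error */
		atomic_add_int(&pi->tx_parse_error, 1);
		return (rc);
	}
	/* Only now is a txq selected and the mbuf enqueued. */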
  
  b) Replace buf_ring with a brand new mp_ring (multiproducer ring).  This
  is specifically for the tx multiqueue model where one of the if_transmit
  producer threads becomes the consumer and other producers carry on as
  usual.  mp_ring is implemented as standalone code and it should be
  possible to use it in any driver with tx multiqueue (see the usage
  sketch after this list).  It also has:
  - the ability to enqueue/dequeue multiple items.  This might become
    significant if packet batching is ever implemented.
  - an abdication mechanism to allow a thread to give up writing tx
    descriptors and have another if_transmit thread take over.  A thread
    that's writing tx descriptors can end up doing so for an unbounded
    time period if a) there are other if_transmit threads continuously
    feeding the software queue, and b) the chip keeps up with whatever the
    thread is throwing at it.
  - accurate statistics about interesting events, even when collecting
    them costs additional branches/conditional code.
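  
  A minimal usage sketch (callback signatures inferred from the calls in
  t4_mp_ring.c below; eth_tx, eth_can_tx, and ring_size are illustrative
  placeholders, not necessarily the names used by this commit):
  
	/* Consume items in [cidx, pidx); return the number consumed. */
	static u_int eth_tx(struct mp_ring *r, u_int cidx, u_int pidx);
	/* Return nonzero if the consumer can make progress right now. */
	static u_int eth_can_tx(struct mp_ring *r);
  
	/* Queue setup: one mp_ring per NIC txq. */
	rc = mp_ring_alloc(&txq->r, ring_size, txq, eth_tx, eth_can_tx,
	    M_CXGBE, M_WAITOK);
  
	/* if_transmit: the enqueuing thread may become the consumer, drain
	 * up to the budget, then abdicate to another producer thread. */
	items[0] = m;
	rc = mp_ring_enqueue(txq->r, items, 1, 4096 /* budget */);
	if (__predict_false(rc != 0))
		m_freem(m);	/* enqueue does not free the item on failure */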
  
  The NIC txq lock is uncontested on the fast path at this point.  I've
  left it there for synchronization with the control events (interface
  up/down, modload/unload).
  
  c) Add support for "type 1" coalesced work requests in the normal NIC tx
  path.  This work request is optimized for frames with a single item in
  the DMA gather list.  These are very common when forwarding packets.
  Note that netmap tx in cxgbe already uses these "type 1" work requests.
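  
  A rough sketch of the decision (illustrative only; mbuf_nsegs() stands
  for however the driver recovers the segment count that was computed up
  front in parse_pkt()):
  
	if (mbuf_nsegs(m) == 1) {
		/* "type 1": compact per-frame header + {length, address} */
	} else {
		/* "type 0": per-frame header + a full ULP TX SGL */
	}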
  
  d) Do not request automatic cidx updates every 32 descriptors.  Instead,
  request updates via bits in individual work requests (still approximately
  every 32 descriptors).  Also, request an automatic final update
  when the queue idles after activity.  This means NIC tx reclaim is still
  performed lazily but it will catch up quickly as soon as the queue
  idles.  This seems to be the best middle ground and I'll probably do
  something similar for netmap tx as well.
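  
  The bookkeeping amounts to roughly this (a sketch; equeqidx is the new
  struct sge_eq field in the adapter.h hunk below, and the flag/field
  names are assumed from the shared firmware header):
  
	/* After writing a WR that ends at the new eq->pidx. */
	int ndesc = (int)eq->pidx - eq->equeqidx;
  
	if (ndesc < 0)
		ndesc += eq->sidx;	/* pidx wrapped around the ring */
	if (ndesc >= 32) {
		wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
		eq->equeqidx = eq->pidx;
	}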
  
  e) Implement a faster tx path for WRQs (used by TOE tx and control
  queues, _not_ by the normal NIC tx).  Allow work requests to be written
  directly to the hardware descriptor ring if room is available.  I will
  convert t4_tom and iw_cxgbe modules to this faster style gradually.
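  
  The new API in brief (see the converted t4_write_l2e() and filter-WR
  code below for real uses; w may point directly into the hardware
  descriptor ring, or into the wrq's scratch space if the WR would wrap
  past the status page):
  
	struct wrq_cookie cookie;
	void *w;
  
	w = start_wrq_wr(wrq, howmany(len, 16), &cookie);
	if (w == NULL)
		return (ENOMEM);	/* no room in the descriptor ring */
	/* ... build the work request at w ... */
	commit_wrq_wr(wrq, w, &cookie);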
  
  MFC after:	2 months

Added:
  head/sys/dev/cxgbe/t4_mp_ring.c   (contents, props changed)
  head/sys/dev/cxgbe/t4_mp_ring.h   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/t4_l2t.c
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/t4_sge.c
  head/sys/modules/cxgbe/if_cxgbe/Makefile

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Wed Dec 31 22:52:43 2014	(r276484)
+++ head/sys/conf/files	Wed Dec 31 23:19:16 2014	(r276485)
@@ -1142,6 +1142,8 @@ dev/cxgb/sys/uipc_mvec.c	optional cxgb p
 	compile-with "${NORMAL_C} -I$S/dev/cxgb"
 dev/cxgb/cxgb_t3fw.c		optional cxgb cxgb_t3fw \
 	compile-with "${NORMAL_C} -I$S/dev/cxgb"
+dev/cxgbe/t4_mp_ring.c		optional cxgbe pci \
+	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
 dev/cxgbe/t4_main.c		optional cxgbe pci \
 	compile-with "${NORMAL_C} -I$S/dev/cxgbe"
 dev/cxgbe/t4_netmap.c		optional cxgbe pci \

Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h	Wed Dec 31 22:52:43 2014	(r276484)
+++ head/sys/dev/cxgbe/adapter.h	Wed Dec 31 23:19:16 2014	(r276485)
@@ -152,7 +152,8 @@ enum {
 	CL_METADATA_SIZE = CACHE_LINE_SIZE,
 
 	SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */
-	TX_SGL_SEGS = 36,
+	TX_SGL_SEGS = 39,
+	TX_SGL_SEGS_TSO = 38,
 	TX_WR_FLITS = SGE_MAX_WR_LEN / 8
 };
 
@@ -273,6 +274,7 @@ struct port_info {
 	struct timeval last_refreshed;
  	struct port_stats stats;
 	u_int tnl_cong_drops;
+	u_int tx_parse_error;
 
 	eventhandler_tag vlan_c;
 
@@ -308,23 +310,9 @@ struct tx_desc {
 	__be64 flit[8];
 };
 
-struct tx_map {
-	struct mbuf *m;
-	bus_dmamap_t map;
-};
-
-/* DMA maps used for tx */
-struct tx_maps {
-	struct tx_map *maps;
-	uint32_t map_total;	/* # of DMA maps */
-	uint32_t map_pidx;	/* next map to be used */
-	uint32_t map_cidx;	/* reclaimed up to this index */
-	uint32_t map_avail;	/* # of available maps */
-};
-
 struct tx_sdesc {
+	struct mbuf *m;		/* m_nextpkt linked chain of frames */
 	uint8_t desc_used;	/* # of hardware descriptors used by the WR */
-	uint8_t credits;	/* NIC txq: # of frames sent out in the WR */
 };
 
 
@@ -378,16 +366,12 @@ struct sge_iq {
 enum {
 	EQ_CTRL		= 1,
 	EQ_ETH		= 2,
-#ifdef TCP_OFFLOAD
 	EQ_OFLD		= 3,
-#endif
 
 	/* eq flags */
-	EQ_TYPEMASK	= 7,		/* 3 lsbits hold the type */
-	EQ_ALLOCATED	= (1 << 3),	/* firmware resources allocated */
-	EQ_DOOMED	= (1 << 4),	/* about to be destroyed */
-	EQ_CRFLUSHED	= (1 << 5),	/* expecting an update from SGE */
-	EQ_STALLED	= (1 << 6),	/* out of hw descriptors or dmamaps */
+	EQ_TYPEMASK	= 0x3,		/* 2 lsbits hold the type (see above) */
+	EQ_ALLOCATED	= (1 << 2),	/* firmware resources allocated */
+	EQ_ENABLED	= (1 << 3),	/* open for business */
 };
 
 /* Listed in order of preference.  Update t4_sysctls too if you change these */
@@ -402,32 +386,25 @@ enum {DOORBELL_UDB, DOORBELL_WCWR, DOORB
 struct sge_eq {
 	unsigned int flags;	/* MUST be first */
 	unsigned int cntxt_id;	/* SGE context id for the eq */
-	bus_dma_tag_t desc_tag;
-	bus_dmamap_t desc_map;
-	char lockname[16];
 	struct mtx eq_lock;
 
 	struct tx_desc *desc;	/* KVA of descriptor ring */
-	bus_addr_t ba;		/* bus address of descriptor ring */
-	struct sge_qstat *spg;	/* status page, for convenience */
 	uint16_t doorbells;
 	volatile uint32_t *udb;	/* KVA of doorbell (lies within BAR2) */
 	u_int udb_qid;		/* relative qid within the doorbell page */
-	uint16_t cap;		/* max # of desc, for convenience */
-	uint16_t avail;		/* available descriptors, for convenience */
-	uint16_t qsize;		/* size (# of entries) of the queue */
+	uint16_t sidx;		/* index of the entry with the status page */
 	uint16_t cidx;		/* consumer idx (desc idx) */
 	uint16_t pidx;		/* producer idx (desc idx) */
-	uint16_t pending;	/* # of descriptors used since last doorbell */
+	uint16_t equeqidx;	/* EQUEQ last requested at this pidx */
+	uint16_t dbidx;		/* pidx of the most recent doorbell */
 	uint16_t iqid;		/* iq that gets egr_update for the eq */
 	uint8_t tx_chan;	/* tx channel used by the eq */
-	struct task tx_task;
-	struct callout tx_callout;
+	volatile u_int equiq;	/* EQUIQ outstanding */
 
-	/* stats */
-
-	uint32_t egr_update;	/* # of SGE_EGR_UPDATE notifications for eq */
-	uint32_t unstalled;	/* recovered from stall */
+	bus_dma_tag_t desc_tag;
+	bus_dmamap_t desc_map;
+	bus_addr_t ba;		/* bus address of descriptor ring */
+	char lockname[16];
 };
 
 struct sw_zone_info {
@@ -499,18 +476,19 @@ struct sge_fl {
 	struct cluster_layout cll_alt;	/* alternate refill zone, layout */
 };
 
+struct mp_ring;
+
 /* txq: SGE egress queue + what's needed for Ethernet NIC */
 struct sge_txq {
 	struct sge_eq eq;	/* MUST be first */
 
 	struct ifnet *ifp;	/* the interface this txq belongs to */
-	bus_dma_tag_t tx_tag;	/* tag for transmit buffers */
-	struct buf_ring *br;	/* tx buffer ring */
+	struct mp_ring *r;	/* tx software ring */
 	struct tx_sdesc *sdesc;	/* KVA of software descriptor ring */
-	struct mbuf *m;		/* held up due to temporary resource shortage */
-
-	struct tx_maps txmaps;
+	struct sglist *gl;
+	__be32 cpl_ctrl0;	/* for convenience */
 
+	struct task tx_reclaim_task;
 	/* stats for common events first */
 
 	uint64_t txcsum;	/* # of times hardware assisted with checksum */
@@ -519,13 +497,12 @@ struct sge_txq {
 	uint64_t imm_wrs;	/* # of work requests with immediate data */
 	uint64_t sgl_wrs;	/* # of work requests with direct SGL */
 	uint64_t txpkt_wrs;	/* # of txpkt work requests (not coalesced) */
-	uint64_t txpkts_wrs;	/* # of coalesced tx work requests */
-	uint64_t txpkts_pkts;	/* # of frames in coalesced tx work requests */
+	uint64_t txpkts0_wrs;	/* # of type0 coalesced tx work requests */
+	uint64_t txpkts1_wrs;	/* # of type1 coalesced tx work requests */
+	uint64_t txpkts0_pkts;	/* # of frames in type0 coalesced tx WRs */
+	uint64_t txpkts1_pkts;	/* # of frames in type1 coalesced tx WRs */
 
 	/* stats for not-that-common events */
-
-	uint32_t no_dmamap;	/* no DMA map to load the mbuf */
-	uint32_t no_desc;	/* out of hardware descriptors */
 } __aligned(CACHE_LINE_SIZE);
 
 /* rxq: SGE ingress queue + SGE free list + miscellaneous items */
@@ -574,7 +551,13 @@ struct wrqe {
 	STAILQ_ENTRY(wrqe) link;
 	struct sge_wrq *wrq;
 	int wr_len;
-	uint64_t wr[] __aligned(16);
+	char wr[] __aligned(16);
+};
+
+struct wrq_cookie {
+	TAILQ_ENTRY(wrq_cookie) link;
+	int ndesc;
+	int pidx;
 };
 
 /*
@@ -585,17 +568,32 @@ struct sge_wrq {
 	struct sge_eq eq;	/* MUST be first */
 
 	struct adapter *adapter;
+	struct task wrq_tx_task;
+
+	/* Tx desc reserved but WR not "committed" yet. */
+	TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs;
 
-	/* List of WRs held up due to lack of tx descriptors */
+	/* List of WRs ready to go out as soon as descriptors are available. */
 	STAILQ_HEAD(, wrqe) wr_list;
+	u_int nwr_pending;
+	u_int ndesc_needed;
 
 	/* stats for common events first */
 
-	uint64_t tx_wrs;	/* # of tx work requests */
+	uint64_t tx_wrs_direct;	/* # of WRs written directly to desc ring. */
+	uint64_t tx_wrs_ss;	/* # of WRs copied from scratch space. */
+	uint64_t tx_wrs_copied;	/* # of WRs queued and copied to desc ring. */
 
 	/* stats for not-that-common events */
 
-	uint32_t no_desc;	/* out of hardware descriptors */
+	/*
+	 * Scratch space for work requests that wrap around after reaching the
+	 * status page, and some information about the last WR that used it.
+	 */
+	uint16_t ss_pidx;
+	uint16_t ss_len;
+	uint8_t ss[SGE_MAX_WR_LEN];
+
 } __aligned(CACHE_LINE_SIZE);
 
 
@@ -744,7 +742,7 @@ struct adapter {
 	struct sge sge;
 	int lro_timeout;
 
-	struct taskqueue *tq[NCHAN];	/* taskqueues that flush data out */
+	struct taskqueue *tq[NCHAN];	/* General purpose taskqueues */
 	struct port_info *port[MAX_NPORTS];
 	uint8_t chan_map[NCHAN];
 
@@ -978,12 +976,11 @@ static inline int
 tx_resume_threshold(struct sge_eq *eq)
 {
 
-	return (eq->qsize / 4);
+	/* not quite the same as qsize / 4, but this will do. */
+	return (eq->sidx / 4);
 }
 
 /* t4_main.c */
-void t4_tx_task(void *, int);
-void t4_tx_callout(void *);
 int t4_os_find_pci_capability(struct adapter *, int);
 int t4_os_pci_save_state(struct adapter *);
 int t4_os_pci_restore_state(struct adapter *);
@@ -1024,16 +1021,15 @@ int t4_setup_adapter_queues(struct adapt
 int t4_teardown_adapter_queues(struct adapter *);
 int t4_setup_port_queues(struct port_info *);
 int t4_teardown_port_queues(struct port_info *);
-int t4_alloc_tx_maps(struct tx_maps *, bus_dma_tag_t, int, int);
-void t4_free_tx_maps(struct tx_maps *, bus_dma_tag_t);
 void t4_intr_all(void *);
 void t4_intr(void *);
 void t4_intr_err(void *);
 void t4_intr_evt(void *);
 void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *);
-int t4_eth_tx(struct ifnet *, struct sge_txq *, struct mbuf *);
 void t4_update_fl_bufsize(struct ifnet *);
-int can_resume_tx(struct sge_eq *);
+int parse_pkt(struct mbuf **);
+void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *);
+void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *);
 
 /* t4_tracer.c */
 struct t4_tracer;

Modified: head/sys/dev/cxgbe/t4_l2t.c
==============================================================================
--- head/sys/dev/cxgbe/t4_l2t.c	Wed Dec 31 22:52:43 2014	(r276484)
+++ head/sys/dev/cxgbe/t4_l2t.c	Wed Dec 31 23:19:16 2014	(r276485)
@@ -113,16 +113,15 @@ found:
 int
 t4_write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
 {
-	struct wrqe *wr;
+	struct wrq_cookie cookie;
 	struct cpl_l2t_write_req *req;
 	int idx = e->idx + sc->vres.l2t.start;
 
 	mtx_assert(&e->lock, MA_OWNED);
 
-	wr = alloc_wrqe(sizeof(*req), &sc->sge.mgmtq);
-	if (wr == NULL)
+	req = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*req), 16), &cookie);
+	if (req == NULL)
 		return (ENOMEM);
-	req = wrtod(wr);
 
 	INIT_TP_WR(req, 0);
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, idx |
@@ -132,7 +131,7 @@ t4_write_l2e(struct adapter *sc, struct 
 	req->vlan = htons(e->vlan);
 	memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
 
-	t4_wrq_tx(sc, wr);
+	commit_wrq_wr(&sc->sge.mgmtq, req, &cookie);
 
 	if (sync && e->state != L2T_STATE_SWITCHING)
 		e->state = L2T_STATE_SYNC_WRITE;

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c	Wed Dec 31 22:52:43 2014	(r276484)
+++ head/sys/dev/cxgbe/t4_main.c	Wed Dec 31 23:19:16 2014	(r276485)
@@ -66,6 +66,7 @@ __FBSDID("$FreeBSD$");
 #include "common/t4_regs_values.h"
 #include "t4_ioctl.h"
 #include "t4_l2t.h"
+#include "t4_mp_ring.h"
 
 /* T4 bus driver interface */
 static int t4_probe(device_t);
@@ -378,7 +379,8 @@ static void build_medialist(struct port_
 static int cxgbe_init_synchronized(struct port_info *);
 static int cxgbe_uninit_synchronized(struct port_info *);
 static int setup_intr_handlers(struct adapter *);
-static void quiesce_eq(struct adapter *, struct sge_eq *);
+static void quiesce_txq(struct adapter *, struct sge_txq *);
+static void quiesce_wrq(struct adapter *, struct sge_wrq *);
 static void quiesce_iq(struct adapter *, struct sge_iq *);
 static void quiesce_fl(struct adapter *, struct sge_fl *);
 static int t4_alloc_irq(struct adapter *, struct irq *, int rid,
@@ -434,7 +436,6 @@ static int sysctl_tx_rate(SYSCTL_HANDLER
 static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS);
 #endif
-static inline void txq_start(struct ifnet *, struct sge_txq *);
 static uint32_t fconf_to_mode(uint32_t);
 static uint32_t mode_to_fconf(uint32_t);
 static uint32_t fspec_to_fconf(struct t4_filter_specification *);
@@ -1429,67 +1430,36 @@ cxgbe_transmit(struct ifnet *ifp, struct
 {
 	struct port_info *pi = ifp->if_softc;
 	struct adapter *sc = pi->adapter;
-	struct sge_txq *txq = &sc->sge.txq[pi->first_txq];
-	struct buf_ring *br;
+	struct sge_txq *txq;
+	void *items[1];
 	int rc;
 
 	M_ASSERTPKTHDR(m);
+	MPASS(m->m_nextpkt == NULL);	/* not quite ready for this yet */
 
 	if (__predict_false(pi->link_cfg.link_ok == 0)) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
-	/* check if flowid is set */
-	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
-		txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq))
-		    + pi->rsrv_noflowq);
-	br = txq->br;
-
-	if (TXQ_TRYLOCK(txq) == 0) {
-		struct sge_eq *eq = &txq->eq;
-
-		/*
-		 * It is possible that t4_eth_tx finishes up and releases the
-		 * lock between the TRYLOCK above and the drbr_enqueue here.  We
-		 * need to make sure that this mbuf doesn't just sit there in
-		 * the drbr.
-		 */
-
-		rc = drbr_enqueue(ifp, br, m);
-		if (rc == 0 && callout_pending(&eq->tx_callout) == 0 &&
-		    !(eq->flags & EQ_DOOMED))
-			callout_reset(&eq->tx_callout, 1, t4_tx_callout, eq);
+	rc = parse_pkt(&m);
+	if (__predict_false(rc != 0)) {
+		MPASS(m == NULL);			/* was freed already */
+		atomic_add_int(&pi->tx_parse_error, 1);	/* rare, atomic is ok */
 		return (rc);
 	}
 
-	/*
-	 * txq->m is the mbuf that is held up due to a temporary shortage of
-	 * resources and it should be put on the wire first.  Then what's in
-	 * drbr and finally the mbuf that was just passed in to us.
-	 *
-	 * Return code should indicate the fate of the mbuf that was passed in
-	 * this time.
-	 */
-
-	TXQ_LOCK_ASSERT_OWNED(txq);
-	if (drbr_needs_enqueue(ifp, br) || txq->m) {
-
-		/* Queued for transmission. */
-
-		rc = drbr_enqueue(ifp, br, m);
-		m = txq->m ? txq->m : drbr_dequeue(ifp, br);
-		(void) t4_eth_tx(ifp, txq, m);
-		TXQ_UNLOCK(txq);
-		return (rc);
-	}
+	/* Select a txq. */
+	txq = &sc->sge.txq[pi->first_txq];
+	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
+		txq += ((m->m_pkthdr.flowid % (pi->ntxq - pi->rsrv_noflowq)) +
+		    pi->rsrv_noflowq);
 
-	/* Direct transmission. */
-	rc = t4_eth_tx(ifp, txq, m);
-	if (rc != 0 && txq->m)
-		rc = 0;	/* held, will be transmitted soon (hopefully) */
+	items[0] = m;
+	rc = mp_ring_enqueue(txq->r, items, 1, 4096);
+	if (__predict_false(rc != 0))
+		m_freem(m);
 
-	TXQ_UNLOCK(txq);
 	return (rc);
 }
 
@@ -1499,17 +1469,17 @@ cxgbe_qflush(struct ifnet *ifp)
 	struct port_info *pi = ifp->if_softc;
 	struct sge_txq *txq;
 	int i;
-	struct mbuf *m;
 
 	/* queues do not exist if !PORT_INIT_DONE. */
 	if (pi->flags & PORT_INIT_DONE) {
 		for_each_txq(pi, i, txq) {
 			TXQ_LOCK(txq);
-			m_freem(txq->m);
-			txq->m = NULL;
-			while ((m = buf_ring_dequeue_sc(txq->br)) != NULL)
-				m_freem(m);
+			txq->eq.flags &= ~EQ_ENABLED;
 			TXQ_UNLOCK(txq);
+			while (!mp_ring_is_idle(txq->r)) {
+				mp_ring_check_drainage(txq->r, 0);
+				pause("qflush", 1);
+			}
 		}
 	}
 	if_qflush(ifp);
@@ -1564,7 +1534,7 @@ cxgbe_get_counter(struct ifnet *ifp, ift
 			struct sge_txq *txq;
 
 			for_each_txq(pi, i, txq)
-				drops += txq->br->br_drops;
+				drops += counter_u64_fetch(txq->r->drops);
 		}
 
 		return (drops);
@@ -3236,7 +3206,8 @@ cxgbe_init_synchronized(struct port_info
 {
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
-	int rc = 0;
+	int rc = 0, i;
+	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
@@ -3265,6 +3236,17 @@ cxgbe_init_synchronized(struct port_info
 	}
 
 	/*
+	 * Can't fail from this point onwards.  Review cxgbe_uninit_synchronized
+	 * if this changes.
+	 */
+
+	for_each_txq(pi, i, txq) {
+		TXQ_LOCK(txq);
+		txq->eq.flags |= EQ_ENABLED;
+		TXQ_UNLOCK(txq);
+	}
+
+	/*
 	 * The first iq of the first port to come up is used for tracing.
 	 */
 	if (sc->traceq < 0) {
@@ -3297,7 +3279,8 @@ cxgbe_uninit_synchronized(struct port_in
 {
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = pi->ifp;
-	int rc;
+	int rc, i;
+	struct sge_txq *txq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
@@ -3314,6 +3297,12 @@ cxgbe_uninit_synchronized(struct port_in
 		return (rc);
 	}
 
+	for_each_txq(pi, i, txq) {
+		TXQ_LOCK(txq);
+		txq->eq.flags &= ~EQ_ENABLED;
+		TXQ_UNLOCK(txq);
+	}
+
 	clrbit(&sc->open_device_map, pi->port_id);
 	PORT_LOCK(pi);
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
@@ -3543,15 +3532,17 @@ port_full_uninit(struct port_info *pi)
 
 	if (pi->flags & PORT_INIT_DONE) {
 
-		/* Need to quiesce queues.  XXX: ctrl queues? */
+		/* Need to quiesce queues.  */
+
+		quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
 
 		for_each_txq(pi, i, txq) {
-			quiesce_eq(sc, &txq->eq);
+			quiesce_txq(sc, txq);
 		}
 
 #ifdef TCP_OFFLOAD
 		for_each_ofld_txq(pi, i, ofld_txq) {
-			quiesce_eq(sc, &ofld_txq->eq);
+			quiesce_wrq(sc, ofld_txq);
 		}
 #endif
 
@@ -3576,23 +3567,39 @@ port_full_uninit(struct port_info *pi)
 }
 
 static void
-quiesce_eq(struct adapter *sc, struct sge_eq *eq)
+quiesce_txq(struct adapter *sc, struct sge_txq *txq)
 {
-	EQ_LOCK(eq);
-	eq->flags |= EQ_DOOMED;
+	struct sge_eq *eq = &txq->eq;
+	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
 
-	/*
-	 * Wait for the response to a credit flush if one's
-	 * pending.
-	 */
-	while (eq->flags & EQ_CRFLUSHED)
-		mtx_sleep(eq, &eq->eq_lock, 0, "crflush", 0);
-	EQ_UNLOCK(eq);
+	(void) sc;	/* unused */
 
-	callout_drain(&eq->tx_callout);	/* XXX: iffy */
-	pause("callout", 10);		/* Still iffy */
+#ifdef INVARIANTS
+	TXQ_LOCK(txq);
+	MPASS((eq->flags & EQ_ENABLED) == 0);
+	TXQ_UNLOCK(txq);
+#endif
 
-	taskqueue_drain(sc->tq[eq->tx_chan], &eq->tx_task);
+	/* Wait for the mp_ring to empty. */
+	while (!mp_ring_is_idle(txq->r)) {
+		mp_ring_check_drainage(txq->r, 0);
+		pause("rquiesce", 1);
+	}
+
+	/* Then wait for the hardware to finish. */
+	while (spg->cidx != htobe16(eq->pidx))
+		pause("equiesce", 1);
+
+	/* Finally, wait for the driver to reclaim all descriptors. */
+	while (eq->cidx != eq->pidx)
+		pause("dquiesce", 1);
+}
+
+static void
+quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq)
+{
+
+	/* XXXTX */
 }
 
 static void
@@ -4892,6 +4899,9 @@ cxgbe_sysctls(struct port_info *pi)
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD,
 	    NULL, "port statistics");
 	children = SYSCTL_CHILDREN(oid);
+	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD,
+	    &pi->tx_parse_error, 0,
+	    "# of tx packets with invalid length or # of segments");
 
 #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \
 	SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \
@@ -6947,74 +6957,6 @@ sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS)
 }
 #endif
 
-static inline void
-txq_start(struct ifnet *ifp, struct sge_txq *txq)
-{
-	struct buf_ring *br;
-	struct mbuf *m;
-
-	TXQ_LOCK_ASSERT_OWNED(txq);
-
-	br = txq->br;
-	m = txq->m ? txq->m : drbr_dequeue(ifp, br);
-	if (m)
-		t4_eth_tx(ifp, txq, m);
-}
-
-void
-t4_tx_callout(void *arg)
-{
-	struct sge_eq *eq = arg;
-	struct adapter *sc;
-
-	if (EQ_TRYLOCK(eq) == 0)
-		goto reschedule;
-
-	if (eq->flags & EQ_STALLED && !can_resume_tx(eq)) {
-		EQ_UNLOCK(eq);
-reschedule:
-		if (__predict_true(!(eq->flags && EQ_DOOMED)))
-			callout_schedule(&eq->tx_callout, 1);
-		return;
-	}
-
-	EQ_LOCK_ASSERT_OWNED(eq);
-
-	if (__predict_true((eq->flags & EQ_DOOMED) == 0)) {
-
-		if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) {
-			struct sge_txq *txq = arg;
-			struct port_info *pi = txq->ifp->if_softc;
-
-			sc = pi->adapter;
-		} else {
-			struct sge_wrq *wrq = arg;
-
-			sc = wrq->adapter;
-		}
-
-		taskqueue_enqueue(sc->tq[eq->tx_chan], &eq->tx_task);
-	}
-
-	EQ_UNLOCK(eq);
-}
-
-void
-t4_tx_task(void *arg, int count)
-{
-	struct sge_eq *eq = arg;
-
-	EQ_LOCK(eq);
-	if ((eq->flags & EQ_TYPEMASK) == EQ_ETH) {
-		struct sge_txq *txq = arg;
-		txq_start(txq->ifp, txq);
-	} else {
-		struct sge_wrq *wrq = arg;
-		t4_wrq_tx_locked(wrq->adapter, wrq, NULL);
-	}
-	EQ_UNLOCK(eq);
-}
-
 static uint32_t
 fconf_to_mode(uint32_t fconf)
 {
@@ -7452,9 +7394,9 @@ static int
 set_filter_wr(struct adapter *sc, int fidx)
 {
 	struct filter_entry *f = &sc->tids.ftid_tab[fidx];
-	struct wrqe *wr;
 	struct fw_filter_wr *fwr;
 	unsigned int ftid;
+	struct wrq_cookie cookie;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
@@ -7473,12 +7415,10 @@ set_filter_wr(struct adapter *sc, int fi
 
 	ftid = sc->tids.ftid_base + fidx;
 
-	wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
-	if (wr == NULL)
+	fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie);
+	if (fwr == NULL)
 		return (ENOMEM);
-
-	fwr = wrtod(wr);
-	bzero(fwr, sizeof (*fwr));
+	bzero(fwr, sizeof(*fwr));
 
 	fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR));
 	fwr->len16_pkd = htobe32(FW_LEN16(*fwr));
@@ -7547,7 +7487,7 @@ set_filter_wr(struct adapter *sc, int fi
 	f->pending = 1;
 	sc->tids.ftids_in_use++;
 
-	t4_wrq_tx(sc, wr);
+	commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie);
 	return (0);
 }
 
@@ -7555,22 +7495,21 @@ static int
 del_filter_wr(struct adapter *sc, int fidx)
 {
 	struct filter_entry *f = &sc->tids.ftid_tab[fidx];
-	struct wrqe *wr;
 	struct fw_filter_wr *fwr;
 	unsigned int ftid;
+	struct wrq_cookie cookie;
 
 	ftid = sc->tids.ftid_base + fidx;
 
-	wr = alloc_wrqe(sizeof(*fwr), &sc->sge.mgmtq);
-	if (wr == NULL)
+	fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie);
+	if (fwr == NULL)
 		return (ENOMEM);
-	fwr = wrtod(wr);
 	bzero(fwr, sizeof (*fwr));
 
 	t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id);
 
 	f->pending = 1;
-	t4_wrq_tx(sc, wr);
+	commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie);
 	return (0);
 }
 
@@ -8170,6 +8109,7 @@ t4_ioctl(struct cdev *dev, unsigned long
 
 		/* MAC stats */
 		t4_clr_port_stats(sc, pi->tx_chan);
+		pi->tx_parse_error = 0;
 
 		if (pi->flags & PORT_INIT_DONE) {
 			struct sge_rxq *rxq;
@@ -8192,24 +8132,24 @@ t4_ioctl(struct cdev *dev, unsigned long
 				txq->imm_wrs = 0;
 				txq->sgl_wrs = 0;
 				txq->txpkt_wrs = 0;
-				txq->txpkts_wrs = 0;
-				txq->txpkts_pkts = 0;
-				txq->br->br_drops = 0;
-				txq->no_dmamap = 0;
-				txq->no_desc = 0;
+				txq->txpkts0_wrs = 0;
+				txq->txpkts1_wrs = 0;
+				txq->txpkts0_pkts = 0;
+				txq->txpkts1_pkts = 0;
+				mp_ring_reset_stats(txq->r);
 			}
 
 #ifdef TCP_OFFLOAD
 			/* nothing to clear for each ofld_rxq */
 
 			for_each_ofld_txq(pi, i, wrq) {
-				wrq->tx_wrs = 0;
-				wrq->no_desc = 0;
+				wrq->tx_wrs_direct = 0;
+				wrq->tx_wrs_copied = 0;
 			}
 #endif
 			wrq = &sc->sge.ctrlq[pi->port_id];
-			wrq->tx_wrs = 0;
-			wrq->no_desc = 0;
+			wrq->tx_wrs_direct = 0;
+			wrq->tx_wrs_copied = 0;
 		}
 		break;
 	}

Added: head/sys/dev/cxgbe/t4_mp_ring.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/dev/cxgbe/t4_mp_ring.c	Wed Dec 31 23:19:16 2014	(r276485)
@@ -0,0 +1,364 @@
+/*-
+ * Copyright (c) 2014 Chelsio Communications, Inc.
+ * All rights reserved.
+ * Written by: Navdeep Parhar <np@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <machine/cpu.h>
+
+#include "t4_mp_ring.h"
+
+union ring_state {
+	struct {
+		uint16_t pidx_head;
+		uint16_t pidx_tail;
+		uint16_t cidx;
+		uint16_t flags;
+	};
+	uint64_t state;
+};
+
+enum {
+	IDLE = 0,	/* consumer ran to completion, nothing more to do. */
+	BUSY,		/* consumer is running already, or will be shortly. */
+	STALLED,	/* consumer stopped due to lack of resources. */
+	ABDICATED,	/* consumer stopped even though there was work to be
+			   done because it wants another thread to take over. */
+};
+
+static inline uint16_t
+space_available(struct mp_ring *r, union ring_state s)
+{
+	uint16_t x = r->size - 1;
+
+	if (s.cidx == s.pidx_head)
+		return (x);
+	else if (s.cidx > s.pidx_head)
+		return (s.cidx - s.pidx_head - 1);
+	else
+		return (x - s.pidx_head + s.cidx);
+}
+
+static inline uint16_t
+increment_idx(struct mp_ring *r, uint16_t idx, uint16_t n)
+{
+	int x = r->size - idx;
+
+	MPASS(x > 0);
+	return (x > n ? idx + n : n - x);
+}
+
+/* Consumer is about to update the ring's state to s */
+static inline uint16_t
+state_to_flags(union ring_state s, int abdicate)
+{
+
+	if (s.cidx == s.pidx_tail)
+		return (IDLE);
+	else if (abdicate && s.pidx_tail != s.pidx_head)
+		return (ABDICATED);
+
+	return (BUSY);
+}
+
+/*
+ * Caller passes in a state, with a guarantee that there is work to do and that
+ * all items up to the pidx_tail in the state are visible.
+ */
+static void
+drain_ring(struct mp_ring *r, union ring_state os, uint16_t prev, int budget)
+{
+	union ring_state ns;
+	int n, pending, total;
+	uint16_t cidx = os.cidx;
+	uint16_t pidx = os.pidx_tail;
+
+	MPASS(os.flags == BUSY);
+	MPASS(cidx != pidx);
+
+	if (prev == IDLE)
+		counter_u64_add(r->starts, 1);
+	pending = 0;
+	total = 0;
+
+	while (cidx != pidx) {
+
+		/* Items from cidx to pidx are available for consumption. */
+		n = r->drain(r, cidx, pidx);
+		if (n == 0) {
+			critical_enter();
+			do {
+				os.state = ns.state = r->state;
+				ns.cidx = cidx;
+				ns.flags = STALLED;
+			} while (atomic_cmpset_64(&r->state, os.state,
+			    ns.state) == 0);
+			critical_exit();
+			if (prev != STALLED)
+				counter_u64_add(r->stalls, 1);
+			else if (total > 0) {
+				counter_u64_add(r->restarts, 1);
+				counter_u64_add(r->stalls, 1);
+			}
+			break;
+		}
+		cidx = increment_idx(r, cidx, n);
+		pending += n;
+		total += n;
+
+		/*
+		 * We update the cidx only if we've caught up with the pidx, the
+		 * real cidx is getting too far ahead of the one visible to
+		 * everyone else, or we have exceeded our budget.
+		 */
+		if (cidx != pidx && pending < 64 && total < budget)
+			continue;
+		critical_enter();
+		do {
+			os.state = ns.state = r->state;
+			ns.cidx = cidx;
+			ns.flags = state_to_flags(ns, total >= budget);
+		} while (atomic_cmpset_acq_64(&r->state, os.state, ns.state) == 0);
+		critical_exit();
+
+		if (ns.flags == ABDICATED)
+			counter_u64_add(r->abdications, 1);
+		if (ns.flags != BUSY) {
+			/* Wrong loop exit if we're going to stall. */
+			MPASS(ns.flags != STALLED);
+			if (prev == STALLED) {
+				MPASS(total > 0);
+				counter_u64_add(r->restarts, 1);
+			}
+			break;
+		}
+
+		/*
+		 * The acquire style atomic above guarantees visibility of items
+		 * associated with any pidx change that we notice here.
+		 */
+		pidx = ns.pidx_tail;
+		pending = 0;
+	}
+}
+
+int
+mp_ring_alloc(struct mp_ring **pr, int size, void *cookie, ring_drain_t drain,
+    ring_can_drain_t can_drain, struct malloc_type *mt, int flags)
+{
+	struct mp_ring *r;
+
+	/* All idx are 16b so size can be 65536 at most */
+	if (pr == NULL || size < 2 || size > 65536 || drain == NULL ||
+	    can_drain == NULL)
+		return (EINVAL);
+	*pr = NULL;
+	flags &= M_NOWAIT | M_WAITOK;
+	MPASS(flags != 0);
+
+	r = malloc(__offsetof(struct mp_ring, items[size]), mt, flags | M_ZERO);
+	if (r == NULL)
+		return (ENOMEM);
+	r->size = size;
+	r->cookie = cookie;
+	r->mt = mt;
+	r->drain = drain;
+	r->can_drain = can_drain;
+	r->enqueues = counter_u64_alloc(flags);
+	r->drops = counter_u64_alloc(flags);
+	r->starts = counter_u64_alloc(flags);
+	r->stalls = counter_u64_alloc(flags);
+	r->restarts = counter_u64_alloc(flags);
+	r->abdications = counter_u64_alloc(flags);
+	if (r->enqueues == NULL || r->drops == NULL || r->starts == NULL ||
+	    r->stalls == NULL || r->restarts == NULL ||
+	    r->abdications == NULL) {
+		mp_ring_free(r);
+		return (ENOMEM);
+	}
+
+	*pr = r;
+	return (0);
+}
+
+void
+mp_ring_free(struct mp_ring *r)
+{
+
+	if (r == NULL)
+		return;
+
+	if (r->enqueues != NULL)
+		counter_u64_free(r->enqueues);
+	if (r->drops != NULL)
+		counter_u64_free(r->drops);
+	if (r->starts != NULL)
+		counter_u64_free(r->starts);
+	if (r->stalls != NULL)
+		counter_u64_free(r->stalls);
+	if (r->restarts != NULL)
+		counter_u64_free(r->restarts);
+	if (r->abdications != NULL)
+		counter_u64_free(r->abdications);
+
+	free(r, r->mt);
+}
+
+/*
+ * Enqueue n items and maybe drain the ring for some time.
+ *
+ * Returns an errno.
+ */
+int
+mp_ring_enqueue(struct mp_ring *r, void **items, int n, int budget)
+{
+	union ring_state os, ns;
+	uint16_t pidx_start, pidx_stop;
+	int i;
+
+	MPASS(items != NULL);
+	MPASS(n > 0);
+

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

