PERFORCE change 109499 for review

Sam Leffler sam at FreeBSD.org
Wed Nov 8 00:40:57 UTC 2006


http://perforce.freebsd.org/chv.cgi?CH=109499

Change 109499 by sam at sam_ebb on 2006/11/08 00:39:22

	Add multi-segment tx:
	o change ix_ne in npebuf to an NPE_MAXSEG array of
	  descriptors (3 for now, based on tracing traffic for
	  NFS root mount and normal traffic patterns running
	  diskless)
	o bring in defrag code from ath to handle the case
	  where the mbuf chain doesn't fit in NPE_MAXSEG
	  segments (a condensed sketch of its use follows)
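	
	A condensed sketch of how the tx start path below uses the
	defrag code; error reporting is trimmed and the surrounding
	locking/queueing is omitted:
	
	  error = bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
	      m, segs, &nseg, 0);
	  if (error == EFBIG) {
	          /* chain needs more than NPE_MAXSEG segments */
	          n = npe_defrag(m, M_DONTWAIT, NPE_MAXSEG);
	          if (n == NULL) {
	                  m_freem(m);     /* cannot collapse; drop frame */
	                  return;
	          }
	          m = n;
	          error = bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
	              m, segs, &nseg, 0);
	  }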
	
	Gets us a >20% improvement for upstream TCP netperf on a
	WITNESS+INVARIANTS kernel.
	
	Note: each rx buffer gets NPE_MAXSEG-1 unused descriptors
	(~8 Kbytes total right now); this space can easily be reclaimed.
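	(For reference: with the layout below each ix_ne entry is
	8 * sizeof(uint32_t) = 32 bytes, so the unused space is
	NPE_MAX_RX_BUFFERS * (NPE_MAXSEG - 1) * 32 bytes, i.e.
	64 bytes per rx buffer.)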
	
	Note: npebuf setup in the tx path can be optimized a bit
	by unrolling the loop and eliminating an extraneous write
	to the uncached npebuf.
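	
	The loop in question, condensed from the tx start changes
	below; only the first segment's length word carries the
	total frame length in its low 16 bits:
	
	  len = m->m_pkthdr.len;
	  next = npe->ix_neaddr + sizeof(npe->ix_ne[0]);
	  for (i = 0; i < nseg; i++) {
	          npe->ix_ne[i].data = htobe32(segs[i].ds_addr);
	          npe->ix_ne[i].len = htobe32((segs[i].ds_len << 16) | len);
	          npe->ix_ne[i].next = htobe32(next);
	          len = 0;                /* zero for segments > 1 */
	          next += sizeof(npe->ix_ne[0]);
	  }
	  npe->ix_ne[i-1].next = 0;       /* zero last in chain */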

Affected files ...

.. //depot/projects/arm/src/sys/arm/xscale/ixp425/if_npe.c#15 edit
.. //depot/projects/arm/src/sys/arm/xscale/ixp425/if_npereg.h#3 edit

Differences ...

==== //depot/projects/arm/src/sys/arm/xscale/ixp425/if_npe.c#15 (text+ko) ====

@@ -380,7 +380,7 @@
 
 static int
 npe_dma_setup(struct npe_softc *sc, struct npedma *dma,
-	const char *name, int nbuf)
+	const char *name, int nbuf, int maxseg)
 {
 	int error, i;
 
@@ -391,7 +391,7 @@
 
 	/* DMA tag for mapped mbufs  */
 	error = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR_32BIT,
-	    BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, 0,
+	    BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, maxseg, MCLBYTES, 0,
 	    busdma_lock_mutex, &sc->sc_mtx, &dma->mtag);
 	if (error != 0) {
 		device_printf(sc->sc_dev, "unable to create %s mbuf dma tag, "
@@ -508,10 +508,11 @@
 		}
 	} else
 		sc->sc_miih = sc->sc_ioh;
-	error = npe_dma_setup(sc, &sc->txdma, "tx", NPE_MAX_TX_BUFFERS);
+	error = npe_dma_setup(sc, &sc->txdma, "tx", NPE_MAX_TX_BUFFERS,
+			NPE_MAXSEG);
 	if (error != 0)
 		return error;
-	error = npe_dma_setup(sc, &sc->rxdma, "rx", NPE_MAX_RX_BUFFERS);
+	error = npe_dma_setup(sc, &sc->rxdma, "rx", NPE_MAX_RX_BUFFERS, 1);
 	if (error != 0)
 		return error;
 
@@ -753,6 +754,7 @@
 	uint32_t entry;
 
 	NPE_LOCK(sc);
+	/* XXX max # at a time? */
 	while (ixpqmgr_qread(qid, &entry) == 0) {
 		struct npebuf *npe = P2V(NPE_QM_Q_ADDR(entry));
 
@@ -786,7 +788,6 @@
 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			return ENOBUFS;
-		m->m_len = MCLBYTES;
 	}
 	KASSERT(m->m_ext.ext_size >= 1536 + ETHER_ALIGN,
 		("ext_size %d", m->m_ext.ext_size));
@@ -799,11 +800,11 @@
 		m_freem(m);
 		return error;
 	}
-	npe->ix_ne_data = htobe32(segs[0].ds_addr);
+	npe->ix_ne[0].data = htobe32(segs[0].ds_addr);
 	/* NB: NPE requires length be a multiple of 64 */
 	/* NB: buffer length is shifted in word */
-	npe->ix_ne_len = htobe32(segs[0].ds_len << 16);
-	npe->ix_ne_next = 0;
+	npe->ix_ne[0].len = htobe32(segs[0].ds_len << 16);
+	npe->ix_ne[0].next = 0;
 	npe->ix_m = m;
 	/* Flush the memory in the mbuf */
 	bus_dmamap_sync(dma->mtag, npe->ix_map, BUS_DMASYNC_PREREAD);
@@ -830,7 +831,7 @@
 		struct mbuf *m;
 
 		DPRINTF(sc, "%s: entry 0x%x neaddr 0x%x ne_len 0x%x\n",
-		    __func__, entry, npe->ix_neaddr, npe->ix_ne_len);/*XXX*/
+		    __func__, entry, npe->ix_neaddr, npe->ix_ne[0].len);
 		/*
 		 * Allocate a new mbuf to replenish the rx buffer.
 		 * If doing so fails we drop the rx'd frame so we
@@ -848,7 +849,7 @@
 			    BUS_DMASYNC_POSTREAD);
 
 			/* set m_len etc. per rx frame size */
-			mrx->m_len = be32toh(npe->ix_ne_len) & 0xffff;
+			mrx->m_len = be32toh(npe->ix_ne[0].len) & 0xffff;
 			mrx->m_pkthdr.len = mrx->m_len;
 			mrx->m_pkthdr.rcvif = ifp;
 			mrx->m_flags |= M_HASFCS;
@@ -862,8 +863,8 @@
 			}
 		} else {
 			m = npe->ix_m;
-			npe->ix_ne_len = htobe32(m->m_len << 16);
-			npe->ix_ne_next = 0;
+			npe->ix_ne[0].len = htobe32(m->m_len << 16);
+			npe->ix_ne[0].next = 0;
 			/* XXX? sync? */
 		}
 		bus_dmamap_sync(dma->buf_tag, dma->buf_map,
@@ -1000,27 +1001,88 @@
 	NPE_UNLOCK(sc);
 }
 
+/*
+ * Defragment an mbuf chain, returning at most maxfrags separate
+ * mbufs+clusters.  If this is not possible NULL is returned and
+ * the original mbuf chain is left in its present (potentially
+ * modified) state.  We use two techniques: collapsing consecutive
+ * mbufs and replacing consecutive mbufs by a cluster.
+ */
 static struct mbuf *
-npe_linearize(struct mbuf *m0, int how)
+npe_defrag(struct mbuf *m0, int how, int maxfrags)
 {
-	struct mbuf *m, *n;
+	struct mbuf *m, *n, *n2, **prev;
+	u_int curfrags;
 
-	if (m0->m_pkthdr.len > MHLEN)
-		n = m_getcl(how, MT_DATA, M_PKTHDR);
-	else
-		n = m_gethdr(how, MT_DATA);
-	if (n != NULL) {
-		n->m_len = 0;		/* NB: not initialized on alloc */
-		for (m = m0; m != NULL; m = m->m_next) {
-			bcopy(mtod(m, void *), mtod(n, char *) + n->m_len,
-				m->m_len);
-			n->m_len += m->m_len;
+	/*
+	 * Calculate the current number of frags.
+	 */
+	curfrags = 0;
+	for (m = m0; m != NULL; m = m->m_next)
+		curfrags++;
+	/*
+	 * First, try to collapse mbufs.  Note that we always collapse
+	 * towards the front so we don't need to deal with moving the
+	 * pkthdr.  This may be suboptimal if the first mbuf has much
+	 * less data than the following.
+	 */
+	m = m0;
+again:
+	for (;;) {
+		n = m->m_next;
+		if (n == NULL)
+			break;
+		if ((m->m_flags & M_RDONLY) == 0 &&
+		    n->m_len < M_TRAILINGSPACE(m)) {
+			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
+				n->m_len);
+			m->m_len += n->m_len;
+			m->m_next = n->m_next;
+			m_free(n);
+			if (--curfrags <= maxfrags)
+				return m0;
+		} else
+			m = n;
+	}
+	KASSERT(maxfrags > 1,
+		("maxfrags %u, but normal collapse failed", maxfrags));
+	/*
+	 * Collapse consecutive mbufs to a cluster.
+	 */
+	prev = &m0->m_next;		/* NB: not the first mbuf */
+	while ((n = *prev) != NULL) {
+		if ((n2 = n->m_next) != NULL &&
+		    n->m_len + n2->m_len < MCLBYTES) {
+			m = m_getcl(how, MT_DATA, 0);
+			if (m == NULL)
+				goto bad;
+			bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
+			bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
+				n2->m_len);
+			m->m_len = n->m_len + n2->m_len;
+			m->m_next = n2->m_next;
+			*prev = m;
+			m_free(n);
+			m_free(n2);
+			if (--curfrags <= maxfrags)	/* +1 cl -2 mbufs */
+				return m0;
+			/*
+			 * Still not there, try the normal collapse
+			 * again before we allocate another cluster.
+			 */
+			goto again;
 		}
-		/* NB: this works because we never change m_final->m_data */
-		m_move_pkthdr(n, m0);
+		prev = &n->m_next;
 	}
-	m_freem(m0);
-	return n;
+	/*
+	 * No place where we can collapse to a cluster; punt.
+	 * This can occur if, for example, you request 2 frags
+	 * but the packet requires that both be clusters (we
+	 * never reallocate the first mbuf to avoid moving the
+	 * packet header).
+	 */
+bad:
+	return NULL;
 }
 
 /*
@@ -1031,10 +1093,11 @@
 {
 	struct npe_softc *sc = ifp->if_softc;
 	struct npebuf *npe;
-	struct mbuf *m;
+	struct mbuf *m, *n;
 	struct npedma *dma = &sc->txdma;
-	bus_dma_segment_t segs[1];
-	int nseg, len;
+	bus_dma_segment_t segs[NPE_MAXSEG];
+	int nseg, len, error, i;
+	uint32_t next;
 
 	NPE_ASSERT_LOCKED(sc);
 	/* XXX can this happen? */
@@ -1049,19 +1112,27 @@
 			return;
 		}
 		npe = sc->tx_free;
-		if (m->m_next != NULL) {
-			m = npe_linearize(m, M_DONTWAIT);
-			if (m == NULL)
-				return;
+		error = bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
+		    m, segs, &nseg, 0);
+		if (error == EFBIG) {
+			n = npe_defrag(m, M_DONTWAIT, NPE_MAXSEG);
+			if (n == NULL) {
+				if_printf(ifp, "%s: too many fragments %u\n",
+				    __func__, nseg);
+				m_freem(m);
+				return;	/* XXX? */
+			}
+			m = n;
+			error = bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
+			    m, segs, &nseg, 0);
 		}
-		if (bus_dmamap_load_mbuf_sg(dma->mtag, npe->ix_map,
-		    m, segs, &nseg, 0) != 0) {
+		if (error != 0 || nseg == 0) {
+			if_printf(ifp, "%s: error %u nseg %u\n",
+			    __func__, error, nseg);
 			m_freem(m);
-			continue;
+			return;	/* XXX? */
 		}
 		sc->tx_free = npe->ix_next;
-		if (sc->tx_free == NULL)
-			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 		bus_dmamap_sync(dma->mtag, npe->ix_map, BUS_DMASYNC_PREWRITE);
 	
@@ -1071,22 +1142,30 @@
 		BPF_MTAP(ifp, m);
 
 		npe->ix_m = m;
-		npe->ix_ne_data = htobe32(segs[0].ds_addr);
-		len = segs[0].ds_len;
-		/* NB: this sets both frame and buffer lengths */
-		npe->ix_ne_len = htobe32((len<<16) | len);
-		npe->ix_ne_next = 0;		/* NB: no chaining (yet) */
+		len = m->m_pkthdr.len;
+		next = npe->ix_neaddr + sizeof(npe->ix_ne[0]);
+		for (i = 0; i < nseg; i++) {
+			npe->ix_ne[i].data = htobe32(segs[i].ds_addr);
+			npe->ix_ne[i].len = htobe32((segs[i].ds_len<<16) | len);
+			npe->ix_ne[i].next = htobe32(next);
+
+			len = 0;		/* zero for segments > 1 */
+			next += sizeof(npe->ix_ne[0]);
+		}
+		npe->ix_ne[i-1].next = 0;	/* zero last in chain */
 		/* XXX flush descriptor instead of using uncached memory */
 
 		DPRINTF(sc, "%s: qwrite(%u, 0x%x) ne_data %x ne_len 0x%x\n",
 		    __func__, sc->tx_qid, npe->ix_neaddr,
-		    npe->ix_ne_data, npe->ix_ne_len);
+		    npe->ix_ne[0].data, npe->ix_ne[0].len);
 		/* stick it on the tx q */
 		/* XXX add vlan priority */
 		ixpqmgr_qwrite(sc->tx_qid, npe->ix_neaddr);
 
 		ifp->if_timer = 5;
 	}
+	if (sc->tx_free == NULL)
+		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 }
 
 void

==== //depot/projects/arm/src/sys/arm/xscale/ixp425/if_npereg.h#3 (text+ko) ====

@@ -67,16 +67,20 @@
  * of the Intel code all the s/w area is free for us to use as we
  * choose--only the npe area layout and alignment must be honored.
  */
+#define	NPE_MAXSEG	3		/* empirically selected */
+
 struct npebuf {
 	struct npebuf	*ix_next;	/* chain to next buffer */
 	void		*ix_m;		/* backpointer to mbuf */
 	uint32_t	ix_neaddr;	/* phys address of ix_ne */
 	bus_dmamap_t	ix_map;		/* bus dma map for associated data */
 	uint32_t	ix_reserved[4];
-	uint32_t	ix_ne[8];	/* NPE shared area, cacheline aligned */
-#define	ix_ne_next	ix_ne[0]	/* phys addr of next buffer */
-#define	ix_ne_len	ix_ne[1]	/* buffer length (bytes) */
-#define	ix_ne_data	ix_ne[2]	/* phys addr of data buffer */
+	struct {			/* NPE shared area, cacheline aligned */
+		uint32_t next;		/* phys addr of next segment */
+		uint32_t len;		/* buffer/segment length (bytes) */
+		uint32_t data;		/* phys addr of data segment */
+		uint32_t pad[5];	/* pad to cacheline */
+	} ix_ne[NPE_MAXSEG];
 };
 
 #define NPE_PORTS_MAX		3

