svn commit: r263317 - in head/sys/dev/cxgbe: . common

Navdeep Parhar <np at FreeBSD.org>
Tue Mar 18 20:14:14 UTC 2014


Author: np
Date: Tue Mar 18 20:14:13 2014
New Revision: 263317
URL: http://svnweb.freebsd.org/changeset/base/263317

Log:
  cxgbe(4): significant rx rework.
  
  - More flexible cluster size selection, including the ability to fall
    back to a safe cluster size (PAGE_SIZE from zone_jumbop by default) in
    case an allocation of a larger size fails (see the sketch at the end of
    this log message).
  - A single get_fl_payload() function that assembles the payload into an
    mbuf chain for any kind of freelist.  This replaces two variants: one
    for freelists with buffer packing enabled and another for those without.
  - Buffer packing with clusters of any size.  Previously this was limited
    to 4K clusters.
  - Enable buffer packing for TOE rx queues as well.
  - Statistics and tunables to go with all these changes.  The driver's
    man page will be updated separately.
  
  MFC after:	5 weeks
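
  The fallback described in the first item works roughly like the sketch
  below.  This is a minimal, hypothetical helper for illustration only: the
  actual logic is in the driver's refill path (refill_fl() and the new
  find_safe_refill_source()), whose bodies fall outside the visible part of
  this diff.  Only cll_def, cll_alt and sw_zone_info are taken from the
  patch; the function name is made up.

	/*
	 * Try the freelist's preferred cluster zone first; if that
	 * allocation fails, fall back to the safe zone/layout (PAGE_SIZE
	 * clusters from zone_jumbop by default, see the
	 * hw.cxgbe.safest_rx_cluster tunable added in t4_sge.c).
	 */
	static caddr_t
	alloc_cluster_with_fallback(struct adapter *sc, struct sge_fl *fl,
	    struct cluster_layout *cll)
	{
		caddr_t cl;

		*cll = fl->cll_def;	/* default refill zone and layout */
		cl = uma_zalloc(sc->sge.sw_zone_info[cll->zidx].zone, M_NOWAIT);
		if (cl == NULL) {
			*cll = fl->cll_alt;	/* safe fallback zone */
			cl = uma_zalloc(sc->sge.sw_zone_info[cll->zidx].zone,
			    M_NOWAIT);
		}
		return (cl);
	}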

Modified:
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/common/t4_hw.h
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/t4_sge.c

Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h	Tue Mar 18 20:05:55 2014	(r263316)
+++ head/sys/dev/cxgbe/adapter.h	Tue Mar 18 20:14:13 2014	(r263317)
@@ -134,10 +134,11 @@ enum {
 
 	RX_FL_ESIZE = EQ_ESIZE,	/* 8 64bit addresses */
 #if MJUMPAGESIZE != MCLBYTES
-	FL_BUF_SIZES_MAX = 5,	/* cluster, jumbop, jumbo9k, jumbo16k, extra */
+	SW_ZONE_SIZES = 4,	/* cluster, jumbop, jumbo9k, jumbo16k */
 #else
-	FL_BUF_SIZES_MAX = 4,	/* cluster, jumbo9k, jumbo16k, extra */
+	SW_ZONE_SIZES = 3,	/* cluster, jumbo9k, jumbo16k */
 #endif
+	CL_METADATA_SIZE = CACHE_LINE_SIZE,
 
 	CTRL_EQ_QSIZE = 128,
 
@@ -241,15 +242,28 @@ struct port_info {
 	uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */
 };
 
-struct fl_sdesc {
-	bus_dmamap_t map;
-	caddr_t cl;
-	uint8_t tag_idx;	/* the fl->tag entry this map comes from */
+/* Where the cluster came from, how it has been carved up. */
+struct cluster_layout {
+	int8_t zidx;
+	int8_t hwidx;
+	uint16_t region1;	/* mbufs laid out within this region */
+				/* region2 is the DMA region */
+	uint16_t region3;	/* cluster_metadata within this region */
+};
+
+struct cluster_metadata {
+	u_int refcount;
 #ifdef INVARIANTS
-	__be64 ba_hwtag;
+	struct fl_sdesc *sd;	/* For debug only.  Could easily be stale */
 #endif
 };
 
+struct fl_sdesc {
+	caddr_t cl;
+	uint8_t nmbuf;
+	struct cluster_layout cll;
+};
+
 struct tx_desc {
 	__be64 flit[8];
 };
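
The three regions named in cluster_layout carve up a single rx cluster.  As
an illustration only (the driver's real layout selection is done by
find_best_refill_source() in t4_sge.c, outside the visible part of this
diff; the helper below is hypothetical):

	/*
	 *  |<- region1 ->|<------- region2 (DMA) -------->|<- region3 ->|
	 *  | inline mbufs|  hw buffer size given to chip  |  metadata   |
	 *
	 * region1 + region2 + region3 add up to the cluster size, and
	 * region3 must be able to hold a struct cluster_metadata (i.e. be
	 * at least CL_METADATA_SIZE) whenever metadata is in use.
	 */
	static void
	layout_cluster(int8_t zidx, int8_t hwidx, int cluster_size,
	    int hw_size, int nmbufs, struct cluster_layout *cll)
	{
		cll->zidx = zidx;		/* sw_zone_info[] index */
		cll->hwidx = hwidx;		/* hw_buf_info[] index */
		cll->region1 = nmbufs * MSIZE;	/* inline mbufs up front */
		/* region2 is the hw_size bytes of payload after region1 */
		cll->region3 = cluster_size - cll->region1 - hw_size;
	}
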
@@ -368,17 +382,19 @@ struct sge_eq {
 	uint32_t unstalled;	/* recovered from stall */
 };
 
-struct fl_buf_info {
-	u_int size;
-	int type;
-	int hwtag:4;	/* tag in low 4 bits of the pa. */
-	uma_zone_t zone;
-};
-#define FL_BUF_SIZES(sc)	(sc->sge.fl_buf_sizes)
-#define FL_BUF_SIZE(sc, x)	(sc->sge.fl_buf_info[x].size)
-#define FL_BUF_TYPE(sc, x)	(sc->sge.fl_buf_info[x].type)
-#define FL_BUF_HWTAG(sc, x)	(sc->sge.fl_buf_info[x].hwtag)
-#define FL_BUF_ZONE(sc, x)	(sc->sge.fl_buf_info[x].zone)
+struct sw_zone_info {
+	uma_zone_t zone;	/* zone that this cluster comes from */
+	int size;		/* size of cluster: 2K, 4K, 9K, 16K, etc. */
+	int type;		/* EXT_xxx type of the cluster */
+	int8_t head_hwidx;
+	int8_t tail_hwidx;
+};
+
+struct hw_buf_info {
+	int8_t zidx;		/* backpointer to zone; -ve means unused */
+	int8_t next;		/* next hwidx for this zone; -1 means no more */
+	int size;
+};
 
 enum {
 	FL_STARVING	= (1 << 0), /* on the adapter's list of starving fl's */
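
Each sw_zone_info keeps a chain of hardware buffer sizes that fit in that
zone's clusters: head_hwidx points at the largest usable size and each
hw_buf_info's next field leads to the next smaller one, i.e. to a layout
with more spare room.  A sketch of how such a chain can be walked (the
helper below is hypothetical; the committed consumers are
find_best_refill_source() and find_safe_refill_source() in t4_sge.c):

	static int
	first_hwidx_with_room(struct sge *s, struct sw_zone_info *swz,
	    int spare_needed)
	{
		int8_t idx;

		for (idx = swz->head_hwidx; idx != -1;
		    idx = s->hw_buf_info[idx].next) {
			/* spare = part of the cluster the chip won't DMA into */
			if (swz->size - s->hw_buf_info[idx].size >= spare_needed)
				return (idx);
		}
		return (-1);	/* no layout leaves enough spare room */
	}
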
@@ -392,9 +408,8 @@ enum {
 struct sge_fl {
 	bus_dma_tag_t desc_tag;
 	bus_dmamap_t desc_map;
-	bus_dma_tag_t tag[FL_BUF_SIZES_MAX]; /* only first FL_BUF_SIZES(sc) are
-						valid */
-	uint8_t tag_idx;
+	struct cluster_layout cll_def;	/* default refill zone, layout */
+	struct cluster_layout cll_alt;	/* alternate refill zone, layout */
 	struct mtx fl_lock;
 	char lockname[16];
 	int flags;
@@ -411,9 +426,17 @@ struct sge_fl {
 	uint32_t needed;	/* # of buffers needed to fill up fl. */
 	uint32_t lowat;		/* # of buffers <= this means fl needs help */
 	uint32_t pending;	/* # of bufs allocated since last doorbell */
-	u_int dmamap_failed;
-	struct mbuf *mstash[8];
 	TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
+
+	struct mbuf *m0;
+	struct mbuf **pnext;
+	u_int remaining;
+
+	uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */
+	uint64_t mbuf_inlined;	/* # of mbuf created within clusters */
+	uint64_t cl_allocated;	/* # of clusters allocated */
+	uint64_t cl_recycled;	/* # of clusters recycled */
+	uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */
 };
 
 /* txq: SGE egress queue + what's needed for Ethernet NIC */
@@ -547,8 +570,11 @@ struct sge {
 	struct sge_iq **iqmap;	/* iq->cntxt_id to iq mapping */
 	struct sge_eq **eqmap;	/* eq->cntxt_id to eq mapping */
 
-	u_int fl_buf_sizes __aligned(CACHE_LINE_SIZE);
-	struct fl_buf_info fl_buf_info[FL_BUF_SIZES_MAX];
+	int pack_boundary;
+	int8_t safe_hwidx1;	/* may not have room for metadata */
+	int8_t safe_hwidx2;	/* with room for metadata and maybe more */
+	struct sw_zone_info sw_zone_info[SW_ZONE_SIZES];
+	struct hw_buf_info hw_buf_info[SGE_FLBUF_SIZES];
 };
 
 struct rss_header;

Modified: head/sys/dev/cxgbe/common/t4_hw.h
==============================================================================
--- head/sys/dev/cxgbe/common/t4_hw.h	Tue Mar 18 20:05:55 2014	(r263316)
+++ head/sys/dev/cxgbe/common/t4_hw.h	Tue Mar 18 20:14:13 2014	(r263317)
@@ -87,6 +87,7 @@ enum {
 	SGE_NTIMERS = 6,          /* # of interrupt holdoff timer values */
 	SGE_NCOUNTERS = 4,        /* # of interrupt packet counter values */
 	SGE_MAX_IQ_SIZE = 65520,
+	SGE_FLBUF_SIZES = 16,
 };
 
 struct sge_qstat {                /* data written to SGE queue status entries */

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c	Tue Mar 18 20:05:55 2014	(r263316)
+++ head/sys/dev/cxgbe/t4_main.c	Tue Mar 18 20:14:13 2014	(r263317)
@@ -494,6 +494,8 @@ CTASSERT(offsetof(struct sge_ofld_rxq, f
 CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS);
 CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES);
 
+CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE);
+
 static int
 t4_probe(device_t dev)
 {

Modified: head/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sge.c	Tue Mar 18 20:05:55 2014	(r263316)
+++ head/sys/dev/cxgbe/t4_sge.c	Tue Mar 18 20:14:13 2014	(r263317)
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/kdb.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
+#include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
@@ -52,6 +53,8 @@ __FBSDID("$FreeBSD$");
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <machine/md_var.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
@@ -124,6 +127,27 @@ static int t4_fl_pack;
 static int t5_fl_pack;
 TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
 
+/*
+ * Allow the driver to create mbuf(s) in a cluster allocated for rx.
+ * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
+ * 1: ok to create mbuf(s) within a cluster if there is room.
+ */
+static int allow_mbufs_in_cluster = 1;
+TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
+
+/*
+ * Largest rx cluster size that the driver is allowed to allocate.
+ */
+static int largest_rx_cluster = MJUM16BYTES;
+TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
+
+/*
+ * Size of cluster allocation that's most likely to succeed.  The driver will
+ * fall back to this size if it fails to allocate clusters larger than this.
+ */
+static int safest_rx_cluster = PAGE_SIZE;
+TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
+
 /* Used to track coalesced tx work request */
 struct txpkts {
 	uint64_t *flitp;	/* ptr to flit where next pkt should start */
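
The three new knobs above are boot-time tunables (TUNABLE_INT), so on a
stock FreeBSD system they would be set from /boot/loader.conf.  The names
are taken verbatim from the declarations above; the values are only an
illustration, not a recommendation:

	hw.cxgbe.allow_mbufs_in_cluster=0  # always take mbufs from zone_mbuf
	hw.cxgbe.largest_rx_cluster=4096   # cap rx cluster allocations at 4KB
	hw.cxgbe.safest_rx_cluster=4096    # fallback size when larger allocations fail
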
@@ -140,9 +164,7 @@ struct sgl {
 };
 
 static int service_iq(struct sge_iq *, int);
-static struct mbuf *get_fl_payload1(struct adapter *, struct sge_fl *, uint32_t,
-    int *);
-static struct mbuf *get_fl_payload2(struct adapter *, struct sge_fl *, uint32_t,
+static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t,
     int *);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int,
@@ -158,6 +180,8 @@ static int free_ring(struct adapter *, b
 static int alloc_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *,
     int, int);
 static int free_iq_fl(struct port_info *, struct sge_iq *, struct sge_fl *);
+static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
+    struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static int free_fwq(struct adapter *);
 static int alloc_mgmtq(struct adapter *);
@@ -191,7 +215,8 @@ static int refill_fl(struct adapter *, s
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
-static void set_fl_tag_idx(struct adapter *, struct sge_fl *, int);
+static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
+static void find_safe_refill_source(struct adapter *, struct sge_fl *);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static int get_pkt_sgl(struct sge_txq *, struct mbuf **, struct sgl *, int);
@@ -216,6 +241,7 @@ static int handle_fw_msg(struct sge_iq *
     struct mbuf *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
+static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 
 /*
  * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
@@ -264,7 +290,7 @@ t4_sge_modload(void)
 	/* T5's pack boundary is independent of the pad boundary. */
 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
 	    !powerof2(fl_pack))
-	       t5_fl_pack = max(pad, 64);
+	       t5_fl_pack = max(pad, CACHE_LINE_SIZE);
 	else
 	       t5_fl_pack = fl_pack;
 
@@ -313,14 +339,18 @@ t4_tweak_chip_settings(struct adapter *s
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
-	int sw_flbuf_sizes[] = {
+	static int sge_flbuf_sizes[] = {
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
+		MJUMPAGESIZE - CL_METADATA_SIZE,
+		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES,
-		MJUMPAGESIZE - MSIZE
+		MCLBYTES - MSIZE - CL_METADATA_SIZE,
+		MJUM9BYTES - CL_METADATA_SIZE,
+		MJUM16BYTES - CL_METADATA_SIZE,
 	};
 
 	KASSERT(sc->flags & MASTER_PF,
@@ -358,9 +388,11 @@ t4_tweak_chip_settings(struct adapter *s
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
-	for (i = 0; i < min(nitems(sw_flbuf_sizes), 16); i++) {
+	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
+	    ("%s: hw buffer size table too big", __func__));
+	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
 		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
-		    sw_flbuf_sizes[i]);
+		    sge_flbuf_sizes[i]);
 	}
 
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
@@ -415,6 +447,18 @@ t4_tweak_chip_settings(struct adapter *s
 }
 
 /*
+ * SGE wants the buffer to be at least 64B and then a multiple of the pad
+ * boundary or 16, whichever is greater.
+ */
+static inline int
+hwsz_ok(int hwsz)
+{
+	int mask = max(fl_pad, 16) - 1;
+
+	return (hwsz >= 64 && (hwsz & mask) == 0);
+}
+
+/*
  * XXX: driver really should be able to deal with unexpected settings.
  */
 int
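
A quick worked check of hwsz_ok() above, assuming fl_pad is 32 (so the mask
is max(32, 16) - 1 = 31):

	hwsz_ok(2048) -> true   (>= 64 and 2048 is a multiple of 32)
	hwsz_ok(9152) -> true   (MJUM9BYTES - CL_METADATA_SIZE; 9152 % 32 == 0)
	hwsz_ok(4090) -> false  (4090 % 32 != 0)
	hwsz_ok(32)   -> false  (smaller than 64 bytes)
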
@@ -424,7 +468,7 @@ t4_read_chip_settings(struct adapter *sc
 	int i, j, n, rc = 0;
 	uint32_t m, v, r;
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
-	uint32_t sge_flbuf_sizes[16], sw_flbuf_sizes[] = {
+	static int sw_buf_sizes[] = {	/* Sorted by size */
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
@@ -432,6 +476,8 @@ t4_read_chip_settings(struct adapter *sc
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
+	struct sw_zone_info *swz, *safe_swz;
+	struct hw_buf_info *hwb;
 
 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
@@ -462,6 +508,7 @@ t4_read_chip_settings(struct adapter *sc
 			rc = EINVAL;
 		}
 	}
+	s->pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack;
 
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
@@ -477,45 +524,93 @@ t4_read_chip_settings(struct adapter *sc
 		rc = EINVAL;
 	}
 
-	/*
-	 * Make a list of SGE FL buffer sizes programmed in the chip and tally
-	 * it with the FL buffer sizes that we'd like to use.
-	 */
-	n = 0;
-	for (i = 0; i < nitems(sge_flbuf_sizes); i++) {
+	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
+	hwb = &s->hw_buf_info[0];
+	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
 		r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i));
-		sge_flbuf_sizes[i] = r;
-		if (r == MJUMPAGESIZE - MSIZE &&
-		    (sc->flags & BUF_PACKING_OK) == 0) {
-			sc->flags |= BUF_PACKING_OK;
-			FL_BUF_HWTAG(sc, n) = i;
-			FL_BUF_SIZE(sc, n) = MJUMPAGESIZE - MSIZE;
-			FL_BUF_TYPE(sc, n) = m_gettype(MJUMPAGESIZE);
-			FL_BUF_ZONE(sc, n) = m_getzone(MJUMPAGESIZE);
-			n++;
-		}
+		hwb->size = r;
+		hwb->zidx = hwsz_ok(r) ? -1 : -2;
+		hwb->next = -1;
 	}
-	for (i = 0; i < nitems(sw_flbuf_sizes); i++) {
-		for (j = 0; j < nitems(sge_flbuf_sizes); j++) {
-			if (sw_flbuf_sizes[i] != sge_flbuf_sizes[j])
+
+	/*
+	 * Create a sorted list in decreasing order of hw buffer sizes (and so
+	 * increasing order of spare area) for each software zone.
+	 */
+	n = 0;	/* no usable buffer size to begin with */
+	swz = &s->sw_zone_info[0];
+	safe_swz = NULL;
+	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
+		int8_t head = -1, tail = -1;
+
+		swz->size = sw_buf_sizes[i];
+		swz->zone = m_getzone(swz->size);
+		swz->type = m_gettype(swz->size);
+
+		if (swz->size == safest_rx_cluster)
+			safe_swz = swz;
+
+		hwb = &s->hw_buf_info[0];
+		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
+			if (hwb->zidx != -1 || hwb->size > swz->size)
 				continue;
-			FL_BUF_HWTAG(sc, n) = j;
-			FL_BUF_SIZE(sc, n) = sw_flbuf_sizes[i];
-			FL_BUF_TYPE(sc, n) = m_gettype(sw_flbuf_sizes[i]);
-			FL_BUF_ZONE(sc, n) = m_getzone(sw_flbuf_sizes[i]);
+			hwb->zidx = i;
+			if (head == -1)
+				head = tail = j;
+			else if (hwb->size < s->hw_buf_info[tail].size) {
+				s->hw_buf_info[tail].next = j;
+				tail = j;
+			} else {
+				int8_t *cur;
+				struct hw_buf_info *t;
+
+				for (cur = &head; *cur != -1; cur = &t->next) {
+					t = &s->hw_buf_info[*cur];
+					if (hwb->size == t->size) {
+						hwb->zidx = -2;
+						break;
+					}
+					if (hwb->size > t->size) {
+						hwb->next = *cur;
+						*cur = j;
+						break;
+					}
+				}
+			}
+		}
+		swz->head_hwidx = head;
+		swz->tail_hwidx = tail;
+
+		if (tail != -1) {
 			n++;
-			break;
+			if (swz->size - s->hw_buf_info[tail].size >=
+			    CL_METADATA_SIZE)
+				sc->flags |= BUF_PACKING_OK;
 		}
 	}
 	if (n == 0) {
 		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
 		rc = EINVAL;
-	} else if (n == 1 && (sc->flags & BUF_PACKING_OK)) {
-		device_printf(sc->dev,
-		    "no usable SGE FL buffer size when not packing buffers.\n");
-		rc = EINVAL;
 	}
-	FL_BUF_SIZES(sc) = n;
+
+	s->safe_hwidx1 = -1;
+	s->safe_hwidx2 = -1;
+	if (safe_swz != NULL) {
+		s->safe_hwidx1 = safe_swz->head_hwidx;
+		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
+			int spare;
+
+			hwb = &s->hw_buf_info[i];
+			spare = safe_swz->size - hwb->size;
+			if (spare < CL_METADATA_SIZE)
+				continue;
+			if (s->safe_hwidx2 == -1 ||
+			    spare == CL_METADATA_SIZE + MSIZE)
+				s->safe_hwidx2 = i;
+			if (spare >= CL_METADATA_SIZE + MSIZE)
+				break;
+		}
+	}
 
 	r = t4_read_reg(sc, A_SGE_INGRESS_RX_THRESHOLD);
 	s->counter_val[0] = G_THRESHOLD_0(r);
@@ -627,6 +722,10 @@ t4_sge_sysctls(struct adapter *sc, struc
     struct sysctl_oid_list *children)
 {
 
+	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
+	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
+	    "freelist buffer sizes");
+
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
 	    NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)");
 
@@ -644,8 +743,7 @@ t4_sge_sysctls(struct adapter *sc, struc
 	    "pack multiple frames in one fl buffer");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
-	    NULL, is_t5(sc) ? t5_fl_pack : t4_fl_pack,
-	    "payload pack boundary (bytes)");
+	    NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)");
 }
 
 int
@@ -765,7 +863,7 @@ port_intr_iq(struct port_info *pi, int i
 #ifdef TCP_OFFLOAD
 	if (sc->flags & INTR_DIRECT) {
 		idx %= pi->nrxq + pi->nofldrxq;
-		
+
 		if (idx >= pi->nrxq) {
 			idx -= pi->nrxq;
 			iq = &s->ofld_rxq[pi->first_ofld_rxq + idx].iq;
@@ -796,29 +894,28 @@ port_intr_iq(struct port_info *pi, int i
 	return (iq);
 }
 
+/* Maximum payload that can be delivered with a single iq descriptor */
 static inline int
-mtu_to_bufsize(int mtu)
+mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
 {
-	int bufsize;
-
-	/* large enough for a frame even when VLAN extraction is disabled */
-	bufsize = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu;
-	bufsize = roundup2(bufsize + fl_pktshift, fl_pad);
-
-	return (bufsize);
-}
+	int payload;
 
 #ifdef TCP_OFFLOAD
-static inline int
-mtu_to_bufsize_toe(struct adapter *sc, int mtu)
-{
-
-	if (sc->tt.rx_coalesce)
-		return (G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)));
+	if (toe) {
+		payload = sc->tt.rx_coalesce ?
+		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
+	} else {
+#endif
+		/* large enough even when hw VLAN extraction is disabled */
+		payload = fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN +
+		    mtu;
+#ifdef TCP_OFFLOAD
+	}
+#endif
+	payload = roundup2(payload, fl_pad);
 
-	return (mtu);
+	return (payload);
 }
-#endif
 
 int
 t4_setup_port_queues(struct port_info *pi)
@@ -837,7 +934,7 @@ t4_setup_port_queues(struct port_info *p
 	struct ifnet *ifp = pi->ifp;
 	struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
-	int bufsize, pack;
+	int maxp, pack, mtu = ifp->if_mtu;
 
 	oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD,
 	    NULL, "rx queues");
@@ -858,7 +955,7 @@ t4_setup_port_queues(struct port_info *p
 	 * a) initialize iq and fl
 	 * b) allocate queue iff it will take direct interrupts.
 	 */
-	bufsize = mtu_to_bufsize(ifp->if_mtu);
+	maxp = mtu_to_max_payload(sc, mtu, 0);
 	pack = enable_buffer_packing(sc);
 	for_each_rxq(pi, i, rxq) {
 
@@ -867,7 +964,7 @@ t4_setup_port_queues(struct port_info *p
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(pi->dev), i);
-		init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, bufsize, pack, name);
+		init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, pack, name);
 
 		if (sc->flags & INTR_DIRECT
 #ifdef TCP_OFFLOAD
@@ -883,8 +980,7 @@ t4_setup_port_queues(struct port_info *p
 	}
 
 #ifdef TCP_OFFLOAD
-	bufsize = mtu_to_bufsize_toe(sc, ifp->if_mtu);
-	pack = 0;	/* XXX: think about this some more */
+	maxp = mtu_to_max_payload(sc, mtu, 1);
 	for_each_ofld_rxq(pi, i, ofld_rxq) {
 
 		init_iq(&ofld_rxq->iq, sc, pi->tmr_idx, pi->pktc_idx,
@@ -892,8 +988,7 @@ t4_setup_port_queues(struct port_info *p
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(pi->dev), i);
-		init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, bufsize, pack,
-		    name);
+		init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, pack, name);
 
 		if (sc->flags & INTR_DIRECT ||
 		    (sc->intr_count > 1 && pi->nofldrxq > pi->nrxq)) {
@@ -1170,10 +1265,7 @@ service_iq(struct sge_iq *iq, int budget
 				    ("%s: data for an iq (%p) with no freelist",
 				    __func__, iq));
 
-				m0 = fl->flags & FL_BUF_PACKING ?
-				    get_fl_payload1(sc, fl, lq, &fl_bufs_used) :
-				    get_fl_payload2(sc, fl, lq, &fl_bufs_used);
-
+				m0 = get_fl_payload(sc, fl, lq, &fl_bufs_used);
 				if (__predict_false(m0 == NULL))
 					goto process_iql;
 #ifdef T4_PKT_TIMESTAMP
@@ -1246,6 +1338,14 @@ service_iq(struct sge_iq *iq, int budget
 				break;
 			}
 
+			if (fl_bufs_used >= 16) {
+				FL_LOCK(fl);
+				fl->needed += fl_bufs_used;
+				refill_fl(sc, fl, 32);
+				FL_UNLOCK(fl);
+				fl_bufs_used = 0;
+			}
+
 			iq_next(iq);
 			if (++ndescs == limit) {
 				t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS),
@@ -1262,14 +1362,6 @@ service_iq(struct sge_iq *iq, int budget
 				}
 #endif
 
-				if (fl_bufs_used > 0) {
-					FL_LOCK(fl);
-					fl->needed += fl_bufs_used;
-					refill_fl(sc, fl, fl->cap / 8);
-					FL_UNLOCK(fl);
-					fl_bufs_used = 0;
-				}
-
 				if (budget)
 					return (EINPROGRESS);
 			}
@@ -1312,7 +1404,7 @@ process_iql:
 
 		FL_LOCK(fl);
 		fl->needed += fl_bufs_used;
-		starved = refill_fl(sc, fl, fl->cap / 4);
+		starved = refill_fl(sc, fl, 64);
 		FL_UNLOCK(fl);
 		if (__predict_false(starved != 0))
 			add_fl_to_sfl(sc, fl);
@@ -1321,74 +1413,28 @@ process_iql:
 	return (0);
 }
 
-static int
-fill_mbuf_stash(struct sge_fl *fl)
-{
-	int i;
-
-	for (i = 0; i < nitems(fl->mstash); i++) {
-		if (fl->mstash[i] == NULL) {
-			struct mbuf *m;
-			if ((m = m_get(M_NOWAIT, MT_NOINIT)) == NULL)
-				return (ENOBUFS);
-			fl->mstash[i] = m;
-		}
-	}
-	return (0);
-}
-
-static struct mbuf *
-get_mbuf_from_stash(struct sge_fl *fl)
+static inline int
+cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
 {
-	int i;
+	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
 
-	for (i = 0; i < nitems(fl->mstash); i++) {
-		if (fl->mstash[i] != NULL) {
-			struct mbuf *m;
-
-			m = fl->mstash[i];
-			fl->mstash[i] = NULL;
-			return (m);
-		} else
-			fl->mstash[i] = m_get(M_NOWAIT, MT_NOINIT);
-	}
+	if (rc)
+		MPASS(cll->region3 >= CL_METADATA_SIZE);
 
-	return (m_get(M_NOWAIT, MT_NOINIT));
+	return (rc);
 }
 
-static void
-return_mbuf_to_stash(struct sge_fl *fl, struct mbuf *m)
+static inline struct cluster_metadata *
+cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
+    caddr_t cl)
 {
-	int i;
 
-	if (m == NULL)
-		return;
+	if (cl_has_metadata(fl, cll)) {
+		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 
-	for (i = 0; i < nitems(fl->mstash); i++) {
-		if (fl->mstash[i] == NULL) {
-			fl->mstash[i] = m;
-			return;
-		}
+		return ((struct cluster_metadata *)(cl + swz->size) - 1);
 	}
-	m_init(m, NULL, 0, M_NOWAIT, MT_DATA, 0);
-	m_free(m);
-}
-
-/* buf can be any address within the buffer */
-static inline u_int *
-find_buf_refcnt(caddr_t buf)
-{
-	uintptr_t ptr = (uintptr_t)buf;
-
-	return ((u_int *)((ptr & ~(MJUMPAGESIZE - 1)) + MSIZE - sizeof(u_int)));
-}
-
-static inline struct mbuf *
-find_buf_mbuf(caddr_t buf)
-{
-	uintptr_t ptr = (uintptr_t)buf;
-
-	return ((struct mbuf *)(ptr & ~(MJUMPAGESIZE - 1)));
+	return (NULL);
 }
 
 static int
@@ -1396,179 +1442,117 @@ rxb_free(struct mbuf *m, void *arg1, voi
 {
 	uma_zone_t zone = arg1;
 	caddr_t cl = arg2;
-#ifdef notyet
-	u_int refcount;
 
-	refcount = *find_buf_refcnt(cl);
-	KASSERT(refcount == 0, ("%s: cl %p refcount is %u", __func__,
-	    cl - MSIZE, refcount));
-#endif
-	cl -= MSIZE;
 	uma_zfree(zone, cl);
 
 	return (EXT_FREE_OK);
 }
 
+/*
+ * The mbuf returned by this function could be allocated from zone_mbuf or
+ * constructed in spare room in the cluster.
+ *
+ * The mbuf carries the payload in one of these ways
+ * a) frame inside the mbuf (mbuf from zone_mbuf)
+ * b) m_cljset (for clusters without metadata) zone_mbuf
+ * c) m_extaddref (cluster with metadata) inline mbuf
+ * d) m_extaddref (cluster with metadata) zone_mbuf
+ */
 static struct mbuf *
-get_fl_payload1(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
-    int *fl_bufs_used)
+get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags)
 {
-	struct mbuf *m0, *m;
+	struct mbuf *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
-	unsigned int nbuf, len;
-	int pack_boundary = is_t4(sc) ? t4_fl_pack : t5_fl_pack;
+	struct cluster_layout *cll = &sd->cll;
+	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
+	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
+	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
+	int len, padded_len;
+	caddr_t payload;
+
+	len = min(total, hwb->size - fl->rx_offset);
+	padded_len = roundup2(len, fl_pad);
+	payload = sd->cl + cll->region1 + fl->rx_offset;
 
-	/*
-	 * No assertion for the fl lock because we don't need it.  This routine
-	 * is called only from the rx interrupt handler and it only updates
-	 * fl->cidx.  (Contrast that with fl->pidx/fl->needed which could be
-	 * updated in the rx interrupt handler or the starvation helper routine.
-	 * That's why code that manipulates fl->pidx/fl->needed needs the fl
-	 * lock but this routine does not).
-	 */
+	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 
-	KASSERT(fl->flags & FL_BUF_PACKING,
-	    ("%s: buffer packing disabled for fl %p", __func__, fl));
-
-	len = G_RSPD_LEN(len_newbuf);
+		/*
+		 * Copy payload into a freshly allocated mbuf.
+		 */
 
-	if ((len_newbuf & F_RSPD_NEWBUF) == 0) {
-		KASSERT(fl->rx_offset > 0,
-		    ("%s: packed frame but driver at offset=0", __func__));
-
-		/* A packed frame is guaranteed to fit entirely in this buf. */
-		KASSERT(FL_BUF_SIZE(sc, sd->tag_idx) - fl->rx_offset >= len,
-		    ("%s: packing error.  bufsz=%u, offset=%u, len=%u",
-		    __func__, FL_BUF_SIZE(sc, sd->tag_idx), fl->rx_offset,
-		    len));
-
-		m0 = get_mbuf_from_stash(fl);
-		if (m0 == NULL ||
-		    m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) {
-			return_mbuf_to_stash(fl, m0);
+		m = flags & M_PKTHDR ?
+		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
+		if (m == NULL)
 			return (NULL);
-		}
-
-		bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
-		    BUS_DMASYNC_POSTREAD);
-		if (sc->sc_do_rxcopy && (len < RX_COPY_THRESHOLD)) {
+		fl->mbuf_allocated++;
 #ifdef T4_PKT_TIMESTAMP
-			/* Leave room for a timestamp */
-			m0->m_data += 8;
+		/* Leave room for a timestamp */
+		m->m_data += 8;
 #endif
-			bcopy(sd->cl + fl->rx_offset, mtod(m0, caddr_t), len);
-			m0->m_pkthdr.len = len;
-			m0->m_len = len;
-		} else {
-			m0->m_pkthdr.len = len;
-			m0->m_len = len;
-			m_extaddref(m0, sd->cl + fl->rx_offset,
-			    roundup2(m0->m_len, fl_pad),
-			    find_buf_refcnt(sd->cl), rxb_free,
-			    FL_BUF_ZONE(sc, sd->tag_idx), sd->cl);
-		}
-		fl->rx_offset += len;
-		fl->rx_offset = roundup2(fl->rx_offset, fl_pad);
-		fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
-		if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
-			fl->rx_offset = 0;
-			(*fl_bufs_used) += 1;
-			if (__predict_false(++fl->cidx == fl->cap))
-				fl->cidx = 0;
-		}
+		/* copy data to mbuf */
+		bcopy(payload, mtod(m, caddr_t), len);
 
-		return (m0);
-	}
+	} else if (sd->nmbuf * MSIZE < cll->region1) {
 
-	KASSERT(len_newbuf & F_RSPD_NEWBUF,
-	    ("%s: only new buffer handled here", __func__));
+		/*
+		 * There's spare room in the cluster for an mbuf.  Create one
+		 * and associate it with the payload that's in the cluster too.
+		 */
 
-	nbuf = 0;
+		MPASS(clm != NULL);
+		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
+		/* No bzero required */
+		if (m_init(m, NULL, 0, M_NOWAIT, MT_DATA, flags | M_NOFREE))
+			return (NULL);
+		fl->mbuf_inlined++;
+		m_extaddref(m, payload, padded_len, &clm->refcount, rxb_free,
+		    swz->zone, sd->cl);
+		sd->nmbuf++;
 
-	/*
-	 * Move to the start of the next buffer if we are still in the middle of
-	 * some buffer.  This is the case where there was some room left in the
-	 * previous buffer but not enough to fit this frame in its entirety.
-	 */
-	if (fl->rx_offset > 0) {
-		KASSERT(roundup2(len, fl_pad) > FL_BUF_SIZE(sc, sd->tag_idx) -
-		    fl->rx_offset, ("%s: frame (%u bytes) should have fit at "
-		    "cidx %u offset %u bufsize %u", __func__, len, fl->cidx,
-		    fl->rx_offset, FL_BUF_SIZE(sc, sd->tag_idx)));
-		nbuf++;
-		fl->rx_offset = 0;
-		sd++;
-		if (__predict_false(++fl->cidx == fl->cap)) {
-			sd = fl->sdesc;
-			fl->cidx = 0;
-		}
-	}
+	} else {
 
-	m0 = find_buf_mbuf(sd->cl);
-	if (m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR | M_NOFREE))
-		goto done;
-	bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map, BUS_DMASYNC_POSTREAD);
-	m0->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
-	m_extaddref(m0, sd->cl, roundup2(m0->m_len, fl_pad),
-	    find_buf_refcnt(sd->cl), rxb_free, FL_BUF_ZONE(sc, sd->tag_idx),
-	    sd->cl);
-	m0->m_pkthdr.len = len;
-
-	fl->rx_offset = roundup2(m0->m_len, fl_pad);
-	fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
-	if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
-		fl->rx_offset = 0;
-		nbuf++;
-		sd++;
-		if (__predict_false(++fl->cidx == fl->cap)) {
-			sd = fl->sdesc;
-			fl->cidx = 0;
+		/*
+		 * Grab an mbuf from zone_mbuf and associate it with the
+		 * payload in the cluster.
+		 */
+
+		m = flags & M_PKTHDR ?
+		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
+		if (m == NULL)
+			return (NULL);
+		fl->mbuf_allocated++;
+		if (clm != NULL)
+			m_extaddref(m, payload, padded_len, &clm->refcount,
+			    rxb_free, swz->zone, sd->cl);
+		else {
+			m_cljset(m, sd->cl, swz->type);
+			sd->cl = NULL;	/* consumed, not a recycle candidate */
 		}
 	}
+	if (flags & M_PKTHDR)
+		m->m_pkthdr.len = total;
+	m->m_len = len;
 
-	m = m0;
-	len -= m->m_len;
-
-	while (len > 0) {
-		m->m_next = find_buf_mbuf(sd->cl);
-		m = m->m_next;
-
-		bus_dmamap_sync(fl->tag[sd->tag_idx], sd->map,
-		    BUS_DMASYNC_POSTREAD);
+	if (fl->flags & FL_BUF_PACKING) {
+		fl->rx_offset += roundup2(padded_len, sc->sge.pack_boundary);
+		MPASS(fl->rx_offset <= hwb->size);
+		if (fl->rx_offset < hwb->size)
+			return (m);	/* without advancing the cidx */
+	}
 
-		/* m_init for !M_PKTHDR can't fail so don't bother */
-		m_init(m, NULL, 0, M_NOWAIT, MT_DATA, M_NOFREE);
-		m->m_len = min(len, FL_BUF_SIZE(sc, sd->tag_idx));
-		m_extaddref(m, sd->cl, roundup2(m->m_len, fl_pad),
-		    find_buf_refcnt(sd->cl), rxb_free,
-		    FL_BUF_ZONE(sc, sd->tag_idx), sd->cl);
-
-		fl->rx_offset = roundup2(m->m_len, fl_pad);
-		fl->rx_offset = roundup2(fl->rx_offset, pack_boundary);
-		if (fl->rx_offset >= FL_BUF_SIZE(sc, sd->tag_idx)) {
-			fl->rx_offset = 0;
-			nbuf++;
-			sd++;
-			if (__predict_false(++fl->cidx == fl->cap)) {
-				sd = fl->sdesc;
-				fl->cidx = 0;
-			}
-		}
+	if (__predict_false(++fl->cidx == fl->cap))
+		fl->cidx = 0;
+	fl->rx_offset = 0;
 
-		len -= m->m_len;
-	}
-done:
-	(*fl_bufs_used) += nbuf;
-	return (m0);
+	return (m);
 }
 
 static struct mbuf *
-get_fl_payload2(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
+get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf,
     int *fl_bufs_used)
 {
-	struct mbuf *m0, *m;
-	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
-	unsigned int nbuf, len;
+	struct mbuf *m0, *m, **pnext;
+	u_int nbuf, len;
 
 	/*
 	 * No assertion for the fl lock because we don't need it.  This routine
@@ -1579,87 +1563,54 @@ get_fl_payload2(struct adapter *sc, stru
 	 * lock but this routine does not).
 	 */
 
-	KASSERT((fl->flags & FL_BUF_PACKING) == 0,
-	    ("%s: buffer packing enabled for fl %p", __func__, fl));
-	if (__predict_false((len_newbuf & F_RSPD_NEWBUF) == 0))
-		panic("%s: cannot handle packed frames", __func__);
+	nbuf = 0;
 	len = G_RSPD_LEN(len_newbuf);
-
-	/*
-	 * We never want to run out of mbufs in between a frame when a frame
-	 * spans multiple fl buffers.  If the fl's mbuf stash isn't full and
-	 * can't be filled up to the brim then fail early.
-	 */
-	if (len > FL_BUF_SIZE(sc, sd->tag_idx) && fill_mbuf_stash(fl) != 0)
-		return (NULL);
-
-	m0 = get_mbuf_from_stash(fl);
-	if (m0 == NULL ||
-	    m_init(m0, NULL, 0, M_NOWAIT, MT_DATA, M_PKTHDR) != 0) {
-		return_mbuf_to_stash(fl, m0);
-		return (NULL);
+	if (__predict_false(fl->m0 != NULL)) {
+		MPASS(len == fl->m0->m_pkthdr.len);
+		MPASS(fl->remaining < len);
+
+		m0 = fl->m0;
+		pnext = fl->pnext;
+		len = fl->remaining;
+		fl->m0 = NULL;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
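
The committed body of the new get_fl_payload() continues past the
truncation point.  The resume state added to struct sge_fl (m0, pnext,
remaining) is meant to let frame assembly pick up where it left off when an
mbuf allocation fails partway through a multi-buffer frame.  Below is a
conceptual sketch of that loop, under the assumption that
get_scatter_segment() behaves as shown earlier; it is a hypothetical
rendering, not the committed code, and it omits the nbuf/fl_bufs_used
accounting:

	static struct mbuf *
	assemble_frame_sketch(struct adapter *sc, struct sge_fl *fl, u_int len)
	{
		struct mbuf *m0, *m, **pnext;
		u_int remaining;

		if (fl->m0 != NULL) {
			/* Resume the chain stashed by an earlier, failed attempt. */
			m0 = fl->m0;
			pnext = fl->pnext;
			remaining = fl->remaining;
			fl->m0 = NULL;
		} else {
			m0 = get_scatter_segment(sc, fl, len, M_PKTHDR);
			if (m0 == NULL)
				return (NULL);
			pnext = &m0->m_next;
			remaining = len - m0->m_len;
		}

		while (remaining > 0) {
			m = get_scatter_segment(sc, fl, remaining, 0);
			if (__predict_false(m == NULL)) {
				/* Park the partial chain; retry on the next call. */
				fl->m0 = m0;
				fl->pnext = pnext;
				fl->remaining = remaining;
				return (NULL);
			}
			*pnext = m;
			pnext = &m->m_next;
			remaining -= m->m_len;
		}
		*pnext = NULL;
		return (m0);
	}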

