svn commit: r294327 - in head/sys: dev/cxgb dev/cxgbe dev/e1000 dev/hyperv/netvsc dev/ixgbe dev/mxge netinet sys

Hans Petter Selasky hselasky at FreeBSD.org
Tue Jan 19 15:33:30 UTC 2016


Author: hselasky
Date: Tue Jan 19 15:33:28 2016
New Revision: 294327
URL: https://svnweb.freebsd.org/changeset/base/294327

Log:
  Add optimizing LRO wrapper:
  
  - Add optimizing LRO wrapper which pre-sorts all incoming packets
    according to the hash type and flowid. This prevents exhaustion of
    the LRO entries due to too many connections at the same time.
    Testing using a larger number of higher bandwidth TCP connections
    showed that the incoming ACK packet aggregation rate increased from
    ~1.3:1 to almost 3:1. Another test showed that for a number of TCP
    connections greater than 16 per hardware receive ring, where 8 TCP
    connections was the LRO active entry limit, there was a significant
    improvement in throughput due to being able to fully aggregate more
    than 8 TCP streams. For very few very high bandwidth TCP streams, the
    optimizing LRO wrapper will add CPU usage instead of reducing CPU
    usage. This is expected. Network drivers which want to use the
    optimizing LRO wrapper need to call "tcp_lro_queue_mbuf()" instead
    of "tcp_lro_rx()" and "tcp_lro_flush_all()" instead of
    "tcp_lro_flush()". Further, the LRO control structure must be
    initialized using "tcp_lro_init_args()", passing a non-zero number
    into the "lro_mbufs" argument.
  
  - Make LRO statistics 64-bit. Previously, 32-bit integers were used for
    statistics, which are prone to wrap-around. Fix this while at it
    and update all SYSCTLs which expose LRO statistics.
  
  - Ensure all data is freed when destroying an LRO control structure,
    especially leftover LRO entries.
  
  - Reduce the number of memory allocations needed when setting up an LRO
    control structure by precomputing the total amount of memory needed.
  
  - Add a dedicated memory allocation counter for LRO.
  
  - Bump the FreeBSD version to force recompilation of all KLDs due to
    the change in the LRO control structure size.
  
  Sponsored by:	Mellanox Technologies
  Reviewed by:	gallatin, sbruno, rrs, gnn, transport
  Tested by:	Netflix
  Differential Revision:	https://reviews.freebsd.org/D4914

Modified:
  head/sys/dev/cxgb/cxgb_sge.c
  head/sys/dev/cxgbe/t4_sge.c
  head/sys/dev/e1000/if_igb.c
  head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
  head/sys/dev/ixgbe/if_ix.c
  head/sys/dev/ixgbe/if_ixv.c
  head/sys/dev/mxge/if_mxge.c
  head/sys/netinet/tcp_lro.c
  head/sys/netinet/tcp_lro.h
  head/sys/sys/param.h

Modified: head/sys/dev/cxgb/cxgb_sge.c
==============================================================================
--- head/sys/dev/cxgb/cxgb_sge.c	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/dev/cxgb/cxgb_sge.c	Tue Jan 19 15:33:28 2016	(r294327)
@@ -3579,11 +3579,11 @@ t3_add_configured_sysctls(adapter_t *sc)
 			    CTLTYPE_STRING | CTLFLAG_RD, &qs->txq[TXQ_CTRL],
 			    0, t3_dump_txq_ctrl, "A", "dump of the transmit queue");
 
-			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_queued",
+			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_queued",
 			    CTLFLAG_RD, &qs->lro.ctrl.lro_queued, 0, NULL);
-			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_flushed",
+			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_flushed",
 			    CTLFLAG_RD, &qs->lro.ctrl.lro_flushed, 0, NULL);
-			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
+			SYSCTL_ADD_U64(ctx, lropoidlist, OID_AUTO, "lro_bad_csum",
 			    CTLFLAG_RD, &qs->lro.ctrl.lro_bad_csum, 0, NULL);
 			SYSCTL_ADD_INT(ctx, lropoidlist, OID_AUTO, "lro_cnt",
 			    CTLFLAG_RD, &qs->lro.ctrl.lro_cnt, 0, NULL);

Modified: head/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sge.c	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/dev/cxgbe/t4_sge.c	Tue Jan 19 15:33:28 2016	(r294327)
@@ -2939,9 +2939,9 @@ alloc_rxq(struct vi_info *vi, struct sge
 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 #if defined(INET) || defined(INET6)
-	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
+	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
 	    &rxq->lro.lro_queued, 0, NULL);
-	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
+	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 	    &rxq->lro.lro_flushed, 0, NULL);
 #endif
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,

Modified: head/sys/dev/e1000/if_igb.c
==============================================================================
--- head/sys/dev/e1000/if_igb.c	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/dev/e1000/if_igb.c	Tue Jan 19 15:33:28 2016	(r294327)
@@ -5914,10 +5914,10 @@ igb_add_hw_stats(struct adapter *adapter
 		SYSCTL_ADD_QUAD(ctx, queue_list, OID_AUTO, "rx_bytes",
 				CTLFLAG_RD, &rxr->rx_bytes,
 				"Queue Bytes Received");
-		SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "lro_queued",
+		SYSCTL_ADD_U64(ctx, queue_list, OID_AUTO, "lro_queued",
 				CTLFLAG_RD, &lro->lro_queued, 0,
 				"LRO Queued");
-		SYSCTL_ADD_UINT(ctx, queue_list, OID_AUTO, "lro_flushed",
+		SYSCTL_ADD_U64(ctx, queue_list, OID_AUTO, "lro_flushed",
 				CTLFLAG_RD, &lro->lro_flushed, 0,
 				"LRO Flushed");
 	}

Modified: head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c
==============================================================================
--- head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c	Tue Jan 19 15:33:28 2016	(r294327)
@@ -405,9 +405,9 @@ netvsc_attach(device_t dev)
 	ctx = device_get_sysctl_ctx(dev);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 
-	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued",
+	SYSCTL_ADD_U64(ctx, child, OID_AUTO, "lro_queued",
 	    CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued");
-	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed",
+	SYSCTL_ADD_U64(ctx, child, OID_AUTO, "lro_flushed",
 	    CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed");
 	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried",
 	    CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries");

Modified: head/sys/dev/ixgbe/if_ix.c
==============================================================================
--- head/sys/dev/ixgbe/if_ix.c	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/dev/ixgbe/if_ix.c	Tue Jan 19 15:33:28 2016	(r294327)
@@ -4476,10 +4476,10 @@ ixgbe_add_hw_stats(struct adapter *adapt
 		SYSCTL_ADD_UQUAD(ctx, queue_list, OID_AUTO, "rx_copies",
 				CTLFLAG_RD, &rxr->rx_copies,
 				"Copied RX Frames");
-		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_queued",
+		SYSCTL_ADD_U64(ctx, queue_list, OID_AUTO, "lro_queued",
 				CTLFLAG_RD, &lro->lro_queued, 0,
 				"LRO Queued");
-		SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "lro_flushed",
+		SYSCTL_ADD_U64(ctx, queue_list, OID_AUTO, "lro_flushed",
 				CTLFLAG_RD, &lro->lro_flushed, 0,
 				"LRO Flushed");
 	}

Modified: head/sys/dev/ixgbe/if_ixv.c
==============================================================================
--- head/sys/dev/ixgbe/if_ixv.c	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/dev/ixgbe/if_ixv.c	Tue Jan 19 15:33:28 2016	(r294327)
@@ -2167,10 +2167,10 @@ ixv_print_debug_info(struct adapter *ada
                     rxr->me, (long long)rxr->rx_packets);
                 device_printf(dev,"RX(%d) Bytes Received: %lu\n",
                     rxr->me, (long)rxr->rx_bytes);
-                device_printf(dev,"RX(%d) LRO Queued= %d\n",
-                    rxr->me, lro->lro_queued);
-                device_printf(dev,"RX(%d) LRO Flushed= %d\n",
-                    rxr->me, lro->lro_flushed);
+                device_printf(dev,"RX(%d) LRO Queued= %lld\n",
+                    rxr->me, (long long)lro->lro_queued);
+                device_printf(dev,"RX(%d) LRO Flushed= %lld\n",
+                    rxr->me, (long long)lro->lro_flushed);
                 device_printf(dev,"TX(%d) Packets Sent: %lu\n",
                     txr->me, (long)txr->total_packets);
                 device_printf(dev,"TX(%d) NO Desc Avail: %lu\n",

Modified: head/sys/dev/mxge/if_mxge.c
==============================================================================
--- head/sys/dev/mxge/if_mxge.c	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/dev/mxge/if_mxge.c	Tue Jan 19 15:33:28 2016	(r294327)
@@ -1637,15 +1637,15 @@ mxge_add_sysctls(mxge_softc_t *sc)
 			       "rx_big_cnt",
 			       CTLFLAG_RD, &ss->rx_big.cnt,
 			       0, "rx_small_cnt");
-		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
 			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
 			       0, "number of lro merge queues flushed");
 
-		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
 			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
 			       0, "number of bad csums preventing LRO");
 
-		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
 			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
 			       0, "number of frames appended to lro merge"
 			       "queues");

Modified: head/sys/netinet/tcp_lro.c
==============================================================================
--- head/sys/netinet/tcp_lro.c	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/netinet/tcp_lro.c	Tue Jan 19 15:33:28 2016	(r294327)
@@ -2,6 +2,7 @@
  * Copyright (c) 2007, Myricom Inc.
  * Copyright (c) 2008, Intel Corporation.
  * Copyright (c) 2012 The FreeBSD Foundation
+ * Copyright (c) 2016 Mellanox Technologies.
  * All rights reserved.
  *
  * Portions of this software were developed by Bjoern Zeeb
@@ -58,9 +59,7 @@ __FBSDID("$FreeBSD$");
 
 #include <machine/in_cksum.h>
 
-#ifndef LRO_ENTRIES
-#define	LRO_ENTRIES	8	/* # of LRO entries per RX queue. */
-#endif
+static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
 
 #define	TCP_LRO_UPDATE_CSUM	1
 #ifndef	TCP_LRO_UPDATE_CSUM
@@ -70,42 +69,73 @@ __FBSDID("$FreeBSD$");
 int
 tcp_lro_init(struct lro_ctrl *lc)
 {
+	return (tcp_lro_init_args(lc, NULL, TCP_LRO_ENTRIES, 0));
+}
+
+int
+tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
+    unsigned lro_entries, unsigned lro_mbufs)
+{
 	struct lro_entry *le;
-	int error, i;
+	size_t size;
+	unsigned i;
 
 	lc->lro_bad_csum = 0;
 	lc->lro_queued = 0;
 	lc->lro_flushed = 0;
 	lc->lro_cnt = 0;
+	lc->lro_mbuf_count = 0;
+	lc->lro_mbuf_max = lro_mbufs;
+	lc->lro_cnt = lro_entries;
+	lc->ifp = ifp;
 	SLIST_INIT(&lc->lro_free);
 	SLIST_INIT(&lc->lro_active);
 
-	error = 0;
-	for (i = 0; i < LRO_ENTRIES; i++) {
-		le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF,
-		    M_NOWAIT | M_ZERO);
-                if (le == NULL) {
-			if (i == 0)
-				error = ENOMEM;
-                        break;
-                }
-		lc->lro_cnt = i + 1;
-		SLIST_INSERT_HEAD(&lc->lro_free, le, next);
-        }
+	/* compute size to allocate */
+	size = (lro_mbufs * sizeof(struct mbuf *)) +
+	    (lro_entries * sizeof(*le));
+	lc->lro_mbuf_data = (struct mbuf **)
+	    malloc(size, M_LRO, M_NOWAIT | M_ZERO);
+
+	/* check for out of memory */
+	if (lc->lro_mbuf_data == NULL) {
+		memset(lc, 0, sizeof(*lc));
+		return (ENOMEM);
+	}
+	/* compute offset for LRO entries */
+	le = (struct lro_entry *)
+	    (lc->lro_mbuf_data + lro_mbufs);
+
+	/* setup linked list */
+	for (i = 0; i != lro_entries; i++)
+		SLIST_INSERT_HEAD(&lc->lro_free, le + i, next);
 
-	return (error);
+	return (0);
 }
 
 void
 tcp_lro_free(struct lro_ctrl *lc)
 {
 	struct lro_entry *le;
+	unsigned x;
+
+	/* reset LRO free list */
+	SLIST_INIT(&lc->lro_free);
 
-	while (!SLIST_EMPTY(&lc->lro_free)) {
-		le = SLIST_FIRST(&lc->lro_free);
-		SLIST_REMOVE_HEAD(&lc->lro_free, next);
-		free(le, M_DEVBUF);
+	/* free active mbufs, if any */
+	while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) {
+		SLIST_REMOVE_HEAD(&lc->lro_active, next);
+		m_freem(le->m_head);
 	}
+
+	/* free mbuf array, if any */
+	for (x = 0; x != lc->lro_mbuf_count; x++)
+		m_freem(lc->lro_mbuf_data[x]);
+	lc->lro_mbuf_count = 0;
+	
+	/* free allocated memory, if any */
+	free(lc->lro_mbuf_data, M_LRO);
+	lc->lro_mbuf_data = NULL;
 }
 
 #ifdef TCP_LRO_UPDATE_CSUM
@@ -305,6 +335,83 @@ tcp_lro_flush(struct lro_ctrl *lc, struc
 	SLIST_INSERT_HEAD(&lc->lro_free, le, next);
 }
 
+static int
+tcp_lro_mbuf_compare_header(const void *ppa, const void *ppb)
+{
+	const struct mbuf *ma = *((const struct mbuf * const *)ppa);
+	const struct mbuf *mb = *((const struct mbuf * const *)ppb);
+	int ret;
+
+	ret = M_HASHTYPE_GET(ma) - M_HASHTYPE_GET(mb);
+	if (ret != 0)
+		goto done;
+
+	ret = ma->m_pkthdr.flowid - mb->m_pkthdr.flowid;
+	if (ret != 0)
+		goto done;
+
+	ret = TCP_LRO_SEQUENCE(ma) - TCP_LRO_SEQUENCE(mb);
+done:
+	return (ret);
+}
+
+void
+tcp_lro_flush_all(struct lro_ctrl *lc)
+{
+	struct lro_entry *le;
+	uint32_t hashtype;
+	uint32_t flowid;
+	unsigned x;
+
+	/* check if no mbufs to flush */
+	if (__predict_false(lc->lro_mbuf_count == 0))
+		goto done;
+
+	/* sort all mbufs according to stream */
+	qsort(lc->lro_mbuf_data, lc->lro_mbuf_count, sizeof(struct mbuf *),
+	    &tcp_lro_mbuf_compare_header);
+
+	/* input data into LRO engine, stream by stream */
+	flowid = 0;
+	hashtype = M_HASHTYPE_NONE;
+	for (x = 0; x != lc->lro_mbuf_count; x++) {
+		struct mbuf *mb;
+
+		mb = lc->lro_mbuf_data[x];
+
+		/* check for new stream */
+		if (mb->m_pkthdr.flowid != flowid ||
+		    M_HASHTYPE_GET(mb) != hashtype) {
+			flowid = mb->m_pkthdr.flowid;
+			hashtype = M_HASHTYPE_GET(mb);
+
+			/* flush active streams */
+			while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) {
+				SLIST_REMOVE_HEAD(&lc->lro_active, next);
+				tcp_lro_flush(lc, le);
+			}
+		}
+#ifdef TCP_LRO_RESET_SEQUENCE
+		/* reset sequence number */
+		TCP_LRO_SEQUENCE(mb) = 0;
+#endif
+		/* add packet to LRO engine */
+		if (tcp_lro_rx(lc, mb, 0) != 0) {
+			/* input packet to network layer */
+			(*lc->ifp->if_input)(lc->ifp, mb);
+			lc->lro_queued++;
+			lc->lro_flushed++;
+		}
+	}
+done:
+	/* flush active streams */
+	while ((le = SLIST_FIRST(&lc->lro_active)) != NULL) {
+		SLIST_REMOVE_HEAD(&lc->lro_active, next);
+		tcp_lro_flush(lc, le);
+	}
+	lc->lro_mbuf_count = 0;
+}
+
 #ifdef INET6
 static int
 tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
@@ -633,4 +740,37 @@ tcp_lro_rx(struct lro_ctrl *lc, struct m
 	return (0);
 }
 
+void
+tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
+{
+	/* sanity checks */
+	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
+	    lc->lro_mbuf_max == 0)) {
+		/* packet drop */
+		m_freem(mb);
+		return;
+	}
+
+	/* check if packet is not LRO capable */
+	if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
+	    (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
+		lc->lro_flushed++;
+		lc->lro_queued++;
+
+		/* input packet to network layer */
+		(*lc->ifp->if_input) (lc->ifp, mb);
+		return;
+	}
+
+	/* check if array is full */
+	if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max))
+		tcp_lro_flush_all(lc);
+
+	/* store sequence number */
+	TCP_LRO_SEQUENCE(mb) = lc->lro_mbuf_count;
+
+	/* enter mbuf */
+	lc->lro_mbuf_data[lc->lro_mbuf_count++] = mb;
+}
+
 /* end */

Modified: head/sys/netinet/tcp_lro.h
==============================================================================
--- head/sys/netinet/tcp_lro.h	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/netinet/tcp_lro.h	Tue Jan 19 15:33:28 2016	(r294327)
@@ -1,6 +1,7 @@
 /*-
  * Copyright (c) 2006, Myricom Inc.
  * Copyright (c) 2008, Intel Corporation.
+ * Copyright (c) 2016 Mellanox Technologies.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,6 +33,14 @@
 
 #include <sys/time.h>
 
+#ifndef TCP_LRO_ENTRIES
+/* Define default number of LRO entries per RX queue */
+#define	TCP_LRO_ENTRIES	8
+#endif
+
+#define	TCP_LRO_SEQUENCE(mb) \
+    (mb)->m_pkthdr.PH_loc.thirtytwo[0]
+
 struct lro_entry
 {
 	SLIST_ENTRY(lro_entry)	next;
@@ -75,20 +84,26 @@ SLIST_HEAD(lro_head, lro_entry);
 /* NB: This is part of driver structs. */
 struct lro_ctrl {
 	struct ifnet	*ifp;
-	int		lro_queued;
-	int		lro_flushed;
-	int		lro_bad_csum;
-	int		lro_cnt;
+	struct mbuf	**lro_mbuf_data;
+	uint64_t	lro_queued;
+	uint64_t	lro_flushed;
+	uint64_t	lro_bad_csum;
+	unsigned	lro_cnt;
+	unsigned	lro_mbuf_count;
+	unsigned	lro_mbuf_max;
 
 	struct lro_head	lro_active;
 	struct lro_head	lro_free;
 };
 
 int tcp_lro_init(struct lro_ctrl *);
+int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned);
 void tcp_lro_free(struct lro_ctrl *);
 void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *);
 void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *);
+void tcp_lro_flush_all(struct lro_ctrl *);
 int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t);
+void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *);
 
 #define	TCP_LRO_CANNOT		-1
 #define	TCP_LRO_NOT_SUPPORTED	1

Modified: head/sys/sys/param.h
==============================================================================
--- head/sys/sys/param.h	Tue Jan 19 15:02:37 2016	(r294326)
+++ head/sys/sys/param.h	Tue Jan 19 15:33:28 2016	(r294327)
@@ -58,7 +58,7 @@
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1100094	/* Master, propagated to newvers */
+#define __FreeBSD_version 1100095	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,


More information about the svn-src-head mailing list