svn commit: r192295 - in user/kmacy/releng_7_2_fcs/sys: amd64/conf conf dev/e1000 i386/conf net netinet

Kip Macy kmacy at FreeBSD.org
Mon May 18 06:46:35 UTC 2009


Author: kmacy
Date: Mon May 18 06:46:34 2009
New Revision: 192295
URL: http://svn.freebsd.org/changeset/base/192295

Log:
  Import changes from HEAD
  
  191038
  191154
   add utility routine for updating a struct llentry *
  
  191158
  191159
  191160
  191161
  191162
  191221
  191255
  191257
  191258
  191259
  191324
  191440
  191441
  191442
  191603
  191611
  191612

Added:
  user/kmacy/releng_7_2_fcs/sys/net/flowtable.c
  user/kmacy/releng_7_2_fcs/sys/net/flowtable.h
Modified:
  user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS
  user/kmacy/releng_7_2_fcs/sys/conf/NOTES
  user/kmacy/releng_7_2_fcs/sys/conf/files
  user/kmacy/releng_7_2_fcs/sys/conf/options
  user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c
  user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h
  user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS
  user/kmacy/releng_7_2_fcs/sys/net/if.c
  user/kmacy/releng_7_2_fcs/sys/net/if_bridge.c
  user/kmacy/releng_7_2_fcs/sys/net/if_llatbl.c
  user/kmacy/releng_7_2_fcs/sys/net/if_llatbl.h
  user/kmacy/releng_7_2_fcs/sys/net/if_var.h
  user/kmacy/releng_7_2_fcs/sys/netinet/in_pcb.h
  user/kmacy/releng_7_2_fcs/sys/netinet/ip_input.c
  user/kmacy/releng_7_2_fcs/sys/netinet/ip_output.c
  user/kmacy/releng_7_2_fcs/sys/netinet/vinet.h

Modified: user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS	Mon May 18 06:32:38 2009	(r192294)
+++ user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS	Mon May 18 06:46:34 2009	(r192295)
@@ -16,9 +16,9 @@ device		io		# I/O device
 device		uart_ns8250
 
 # Default partitioning schemes
-options 	GEOM_BSD
-options 	GEOM_MBR
-
-# KSE support went from being default to a kernel option
-options 	KSE
 options		VIMAGE_GLOBALS
+options 	GEOM_PART_BSD
+options 	GEOM_PART_MBR
+
+options		FLOWTABLE
+

Modified: user/kmacy/releng_7_2_fcs/sys/conf/NOTES
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/conf/NOTES	Mon May 18 06:32:38 2009	(r192294)
+++ user/kmacy/releng_7_2_fcs/sys/conf/NOTES	Mon May 18 06:46:34 2009	(r192295)
@@ -549,6 +549,9 @@ options 	LIBMCHAIN
 # libalias library, performing NAT
 options		LIBALIAS
 
+# flowtable cache
+options		FLOWTABLE
+
 #
 # SCTP is a NEW transport protocol defined by
 # RFC2960 updated by RFC3309 and RFC3758.. and

Modified: user/kmacy/releng_7_2_fcs/sys/conf/files
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/conf/files	Mon May 18 06:32:38 2009	(r192294)
+++ user/kmacy/releng_7_2_fcs/sys/conf/files	Mon May 18 06:46:34 2009	(r192295)
@@ -1806,6 +1806,7 @@ net/bpf_filter.c		optional bpf | netgrap
 net/bpf_zerocopy.c		optional bpf
 net/bridgestp.c			optional bridge | if_bridge
 net/bsd_comp.c			optional ppp_bsdcomp
+net/flowtable.c			optional flowtable
 net/ieee8023ad_lacp.c		optional lagg
 net/if.c			standard
 net/if_arcsubr.c		optional arcnet

Modified: user/kmacy/releng_7_2_fcs/sys/conf/options
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/conf/options	Mon May 18 06:32:38 2009	(r192294)
+++ user/kmacy/releng_7_2_fcs/sys/conf/options	Mon May 18 06:46:34 2009	(r192295)
@@ -405,6 +405,7 @@ VLAN_ARRAY		opt_vlan.h
 XBONEHACK
 KRPC
 NFSLOCKD
+FLOWTABLE		opt_route.h
 
 #
 # SCTP

Modified: user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c	Mon May 18 06:32:38 2009	(r192294)
+++ user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c	Mon May 18 06:46:34 2009	(r192295)
@@ -893,6 +893,7 @@ em_detach(device_t dev)
 	if_free(ifp);
 	drbr_free(adapter->br, M_DEVBUF);
 
+	drbr_free(adapter->br, M_DEVBUF);
 	em_free_transmit_structures(adapter);
 	em_free_receive_structures(adapter);
 
@@ -987,7 +988,7 @@ em_resume(device_t dev)
  *  the packet is requeued.
  **********************************************************************/
 
-#ifdef IFNET_MULTIQUEUE
+#ifdef IFNET_BUF_RING
 static int
 em_transmit_locked(struct ifnet *ifp, struct mbuf *m)
 {
@@ -1000,68 +1001,63 @@ em_transmit_locked(struct ifnet *ifp, st
 	    || (!adapter->link_active)) {
 		error = drbr_enqueue(ifp, adapter->br, m);
 		return (error);
-	}
-	
-	if (buf_ring_empty(adapter->br) &&
+	} else if (ADAPTER_RING_EMPTY(adapter) &&
 	    (adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD)) {
 		if (em_xmit(adapter, &m)) {
-			if (m && (error = drbr_enqueue(ifp, adapter->br, m)) != 0) {
+			if (m && (error = drbr_enqueue(ifp, adapter->br, m)) != 0) 
 				return (error);
-			}
-		} else{
-			/* Send a copy of the frame to the BPF listener */
+		} else {
+			/*
+			 * We've bypassed the buf ring so we need to update
+			 * ifp directly
+			 */
+			drbr_stats_update(ifp, m->m_pkthdr.len, m->m_flags);
+			/*
+			** Send a copy of the frame to the BPF
+			** listener and set the watchdog on.
+			*/
 			ETHER_BPF_MTAP(ifp, m);
 		}
 	} else if ((error = drbr_enqueue(ifp, adapter->br, m)) != 0)
 		return (error);
 	
-	if (!buf_ring_empty(adapter->br))
+	if (!ADAPTER_RING_EMPTY(adapter))
 		em_start_locked(ifp);
 
 	return (0);
 }
 	
-static void
-em_start_locked(struct ifnet *ifp)
+static int
+em_transmit(struct ifnet *ifp, struct mbuf *m)
 {
-	struct adapter	*adapter = ifp->if_softc;
-	struct mbuf	*m_head;
-
-	EM_TX_LOCK_ASSERT(adapter);
-
-	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
-	    IFF_DRV_RUNNING)
-		return;
-	if (!adapter->link_active)
-		return;
-
-	while ((adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD)
-	    && (!buf_ring_empty(adapter->br))) {
+	
+	struct adapter *adapter = ifp->if_softc;
+	int error = 0;
 
-		m_head = buf_ring_dequeue_sc(adapter->br);
-		if (m_head == NULL)
-			break;
-		/*
-		 *  Encapsulation can modify our pointer, and or make it
-		 *  NULL on failure.  In that event, we can't requeue.
-		 */
-		if (em_xmit(adapter, &m_head)) {
-			if (m_head == NULL)
-				break;
-			break;
-		}
+	if(EM_TX_TRYLOCK(adapter)) {
+		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+			error = em_transmit_locked(ifp, m);
+		EM_TX_UNLOCK(adapter);
+	} else 
+		error = drbr_enqueue(ifp, adapter->br, m);
 
-		/* Send a copy of the frame to the BPF listener */
-		ETHER_BPF_MTAP(ifp, m_head);
+	return (error);
+}
 
-		/* Set timeout in case hardware has problems transmitting. */
-		adapter->watchdog_timer = EM_TX_TIMEOUT;
-	}
-	if ((adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD))
-		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+static void
+em_qflush(struct ifnet *ifp)
+{
+	struct mbuf *m;
+	struct adapter *adapter = (struct adapter *)ifp->if_softc;
 
+	EM_TX_LOCK(adapter);
+	while ((m = buf_ring_dequeue_sc(adapter->br)) != NULL)
+		m_freem(m);
+	if_qflush(ifp);
+	EM_TX_UNLOCK(adapter);
 }
-#else
+#endif
+
 static void
 em_start_locked(struct ifnet *ifp)
 {
@@ -1076,9 +1072,10 @@ em_start_locked(struct ifnet *ifp)
 	if (!adapter->link_active)
 		return;
 
-	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+	while ((adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD)
+	    && (!ADAPTER_RING_EMPTY(adapter))) {
 
-		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
+		m_head = em_dequeue(ifp, adapter->br);
 		if (m_head == NULL)
 			break;
 		/*
@@ -1088,8 +1085,10 @@ em_start_locked(struct ifnet *ifp)
 		if (em_xmit(adapter, &m_head)) {
 			if (m_head == NULL)
 				break;
+#ifndef IFNET_BUF_RING
 			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+#endif
 			break;
 		}
 
@@ -1099,8 +1098,10 @@ em_start_locked(struct ifnet *ifp)
 		/* Set timeout in case hardware has problems transmitting. */
 		adapter->watchdog_timer = EM_TX_TIMEOUT;
 	}
+	if ((adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD))
+		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+
 }
-#endif
 
 static void
 em_start(struct ifnet *ifp)
@@ -1113,23 +1114,6 @@ em_start(struct ifnet *ifp)
 	EM_TX_UNLOCK(adapter);
 }
 
-static int
-em_transmit(struct ifnet *ifp, struct mbuf *m)
-{
-	
-	struct adapter *adapter = ifp->if_softc;
-	int error = 0;
-
-	if(EM_TX_TRYLOCK(adapter)) {
-		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
-			error = em_transmit_locked(ifp, m);
-		EM_TX_UNLOCK(adapter);
-	} else 
-		error = drbr_enqueue(ifp, adapter->br, m);
-
-	return (error);
-}
-
 /*********************************************************************
  *  Ioctl entry point
  *
@@ -1693,11 +1677,7 @@ em_poll(struct ifnet *ifp, enum poll_cmd
 	EM_TX_LOCK(adapter);
 	em_txeof(adapter);
 
-#ifdef IFNET_MULTIQUEUE
-	if (!buf_ring_empty(adapter->br))
-#else    
-	if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-#endif		
+	if (!ADAPTER_RING_EMPTY(adapter))
 		em_start_locked(ifp);
 	EM_TX_UNLOCK(adapter);
 }
@@ -1767,13 +1747,7 @@ em_intr(void *arg)
 
 	
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
-#ifdef IFNET_MULTIQUEUE
-	    !buf_ring_empty(adapter->br)
-#else
-	    !IFQ_DRV_IS_EMPTY(&ifp->if_snd)
-#endif
-		)
-
+	    !ADAPTER_RING_EMPTY(adapter))
 		em_start(ifp);
 }
 
@@ -1812,11 +1786,7 @@ em_handle_rxtx(void *context, int pendin
 		EM_TX_LOCK(adapter);
 		em_txeof(adapter);
 
-#ifdef IFNET_MULTIQUEUE
-		if (!buf_ring_empty(adapter->br))
-#else			    
-		if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-#endif
+		if (!ADAPTER_RING_EMPTY(adapter))
 			em_start_locked(ifp);
 		EM_TX_UNLOCK(adapter);
 	}
@@ -1824,43 +1794,6 @@ em_handle_rxtx(void *context, int pendin
 	em_enable_intr(adapter);
 }
 
-static void
-em_handle_rx(void *context, int pending)
-{
-	struct adapter	*adapter = context;
-	struct ifnet	*ifp = adapter->ifp;
-
-	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) &&
-	    (em_rxeof(adapter, adapter->rx_process_limit) != 0))
-		taskqueue_enqueue(adapter->tq, &adapter->rx_task);
-
-}
-
-static void
-em_handle_tx(void *context, int pending)
-{
-	struct adapter	*adapter = context;
-	struct ifnet	*ifp = adapter->ifp;
-
-	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-#ifdef IFNET_MULTIQUEUE
-		if (!EM_TX_TRYLOCK(adapter))
-			return;
-#else
-		EM_TX_LOCK(adapter);
-#endif
-		
-		em_txeof(adapter);
-#ifdef IFNET_MULTIQUEUE
-		if (!buf_ring_empty(adapter->br))
-#else			
-		if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-#endif
-			em_start_locked(ifp);
-		EM_TX_UNLOCK(adapter);
-	}
-}
-
 /*********************************************************************
  *
  *  Fast Legacy/MSI Combined Interrupt Service routine  
@@ -1989,6 +1922,35 @@ em_msix_link(void *arg)
 	    EM_MSIX_LINK | E1000_IMS_LSC);
 	return;
 }
+
+static void
+em_handle_rx(void *context, int pending)
+{
+	struct adapter	*adapter = context;
+	struct ifnet	*ifp = adapter->ifp;
+
+	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) &&
+	    (em_rxeof(adapter, adapter->rx_process_limit) != 0))
+		taskqueue_enqueue(adapter->tq, &adapter->rx_task);
+
+}
+
+static void
+em_handle_tx(void *context, int pending)
+{
+	struct adapter	*adapter = context;
+	struct ifnet	*ifp = adapter->ifp;
+
+	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+		if (!EM_TX_TRYLOCK(adapter))
+			return;
+
+		em_txeof(adapter);
+		if (!ADAPTER_RING_EMPTY(adapter))
+			em_start_locked(ifp);
+		EM_TX_UNLOCK(adapter);
+	}
+}
 #endif /* EM_FAST_IRQ */
 
 /*********************************************************************
@@ -2646,6 +2608,8 @@ em_local_timer(void *arg)
 	EM_CORE_LOCK_ASSERT(adapter);
 	taskqueue_enqueue(adapter->tq,
 	    &adapter->rxtx_task);
+	taskqueue_enqueue(adapter->tq,
+	    &adapter->rxtx_task);
 	em_update_link_status(adapter);
 	em_update_stats_counters(adapter);
 
@@ -2990,6 +2954,11 @@ em_allocate_msix(struct adapter *adapter
 	 */
 	TASK_INIT(&adapter->rx_task, 0, em_handle_rx, adapter);
 	TASK_INIT(&adapter->tx_task, 0, em_handle_tx, adapter);
+	/*
+	 * Handle compatibility for msi case for deferral due to
+	 * trylock failure
+	 */
+	TASK_INIT(&adapter->rxtx_task, 0, em_handle_tx, adapter);
 	TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter);
 	adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT,
 	    taskqueue_thread_enqueue, &adapter->tq);
@@ -3244,6 +3213,11 @@ em_setup_interface(device_t dev, struct 
 	adapter->br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK, &adapter->tx_mtx);
 #endif	
 	
+#ifdef IFNET_BUF_RING
+	ifp->if_transmit = em_transmit;
+	ifp->if_qflush = em_qflush;
+	adapter->br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK, &adapter->tx_mtx);
+#endif	
 	if (adapter->hw.mac.type >= e1000_82543) {
 		int version_cap;
 #if __FreeBSD_version < 700000

Modified: user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h	Mon May 18 06:32:38 2009	(r192294)
+++ user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h	Mon May 18 06:46:34 2009	(r192295)
@@ -36,7 +36,7 @@
 #ifndef _EM_H_DEFINED_
 #define _EM_H_DEFINED_
 
-#define IFNET_MULTIQUEUE
+#define	IFNET_BUF_RING
 /* Tunables */
 
 /*
@@ -302,8 +302,10 @@ struct em_dma_alloc {
 /* Our adapter structure */
 struct adapter {
 	struct ifnet	*ifp;
-#ifdef IFNET_MULTIQUEUE
+#ifdef IFNET_BUF_RING
 	struct buf_ring	*br;
+#else
+        void		*br;
 #endif
 	struct e1000_hw	hw;
 
@@ -494,4 +496,27 @@ typedef struct _DESCRIPTOR_PAIR
 #define	EM_CORE_LOCK_ASSERT(_sc)	mtx_assert(&(_sc)->core_mtx, MA_OWNED)
 #define	EM_TX_LOCK_ASSERT(_sc)		mtx_assert(&(_sc)->tx_mtx, MA_OWNED)
 
+#ifdef IFNET_BUF_RING
+#define ADAPTER_RING_EMPTY(adapter) drbr_empty((adapter)->ifp, (adapter)->br)
+#define	em_dequeue     	drbr_dequeue
+
+#else
+#define ADAPTER_RING_EMPTY(adapter) IFQ_DRV_IS_EMPTY(&((adapter)->ifp->if_snd))
+#define	drbr_free(br, type)
+static __inline struct mbuf *
+em_dequeue(struct ifnet *ifp, struct buf_ring *br)
+{
+    struct mbuf *m;
+    
+    IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
+    return (m);
+}
+#ifdef BUF_RING_UNDEFINED
+
+struct buf_ring {
+};
+
+#endif
+#endif
+
 #endif /* _EM_H_DEFINED_ */

Modified: user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS	Mon May 18 06:32:38 2009	(r192294)
+++ user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS	Mon May 18 06:46:34 2009	(r192295)
@@ -23,6 +23,9 @@ device		uart_ns8250
 options 	GEOM_BSD
 options 	GEOM_MBR
 
-# KSE support went from being default to a kernel option
-options 	KSE
 options		VIMAGE_GLOBALS
+# enable support for native hardware
+options 	NATIVE
+device		atpic
+
+options		FLOWTABLE

Added: user/kmacy/releng_7_2_fcs/sys/net/flowtable.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/kmacy/releng_7_2_fcs/sys/net/flowtable.c	Mon May 18 06:46:34 2009	(r192295)
@@ -0,0 +1,1076 @@
+/**************************************************************************
+
+Copyright (c) 2008-2009, BitGravity Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the BitGravity Corporation nor the names of its
+    contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include "opt_route.h"
+#include "opt_mpath.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>  
+#include <sys/types.h>
+#include <sys/bitstring.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>  
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <sys/sysctl.h>
+#include <sys/vimage.h>
+
+#include <net/if.h>
+#include <net/if_llatbl.h>
+#include <net/if_var.h>
+#include <net/route.h> 
+#include <net/vnet.h>
+#include <net/flowtable.h>
+
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/sctp.h>
+
+/*
+ * Taken from http://burtleburtle.net/bob/c/lookup3.c
+ */
+
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or 
+  all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+    4  6  8 16 19  4
+    9 15  3 18 27 15
+   14  9  3  7 17  3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta.  I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose 
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche.  There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a.  The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism.  Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism.  I did what I could.  Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+-------------------------------------------------------------------------------
+*/
+#define mix(a,b,c) \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different.  This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or 
+  all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+  4  8 15 26 3 22 24
+ 10  8 15 26 3 22 24
+ 11  8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+#define final(a,b,c) \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}
+
+/*
+--------------------------------------------------------------------
+ This works on all machines.  To be useful, it requires
+ -- that the key be an array of uint32_t's, and
+ -- that the length be the number of uint32_t's in the key
+
+ The function hashword() is identical to hashlittle() on little-endian
+ machines, and identical to hashbig() on big-endian machines,
+ except that the length has to be measured in uint32_ts rather than in
+ bytes.  hashlittle() is more complicated than hashword() only because
+ hashlittle() has to dance around fitting the key bytes into registers.
+--------------------------------------------------------------------
+*/
+static uint32_t hashword(
+const uint32_t *k,                   /* the key, an array of uint32_t values */
+size_t          length,               /* the length of the key, in uint32_ts */
+uint32_t        initval)         /* the previous hash, or an arbitrary value */
+{
+  uint32_t a,b,c;
+
+  /* Set up the internal state */
+  a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
+
+  /*------------------------------------------------- handle most of the key */
+  while (length > 3)
+  {
+    a += k[0];
+    b += k[1];
+    c += k[2];
+    mix(a,b,c);
+    length -= 3;
+    k += 3;
+  }
+
+  /*------------------------------------------- handle the last 3 uint32_t's */
+  switch(length)                     /* all the case statements fall through */
+  { 
+  case 3 : c+=k[2];
+  case 2 : b+=k[1];
+  case 1 : a+=k[0];
+    final(a,b,c);
+  case 0:     /* case 0: nothing left to add */
+    break;
+  }
+  /*------------------------------------------------------ report the result */
+  return c;
+}
+
+
+struct ipv4_tuple {
+	uint16_t 	ip_sport;	/* source port */
+	uint16_t 	ip_dport;	/* destination port */
+	in_addr_t 	ip_saddr;	/* source address */
+	in_addr_t 	ip_daddr;	/* destination address */
+};
+
+union ipv4_flow {
+	struct ipv4_tuple ipf_ipt;
+	uint32_t 	ipf_key[3];
+};
+
+struct ipv6_tuple {
+	uint16_t 	ip_sport;	/* source port */
+	uint16_t 	ip_dport;	/* destination port */
+	struct in6_addr	ip_saddr;	/* source address */
+	struct in6_addr	ip_daddr;	/* destination address */
+};
+
+union ipv6_flow {
+	struct ipv6_tuple ipf_ipt;
+	uint32_t 	ipf_key[9];
+};
+
+struct flentry {
+	volatile uint32_t	f_fhash;	/* hash flowing forward */
+	uint16_t		f_flags;	/* flow flags */
+	uint8_t			f_pad;		/* alignment */
+	uint8_t			f_proto;	/* protocol */
+	uint32_t		f_uptime;	/* uptime at last access */
+	struct flentry		*f_next;	/* pointer to collision entry */
+	volatile struct rtentry *f_rt;		/* rtentry for flow */
+	volatile struct llentry *f_lle;		/* llentry for flow */
+};
+
+struct flentry_v4 {
+	struct flentry	fl_entry;
+	union ipv4_flow	fl_flow;
+};
+
+struct flentry_v6 {
+	struct flentry	fl_entry;
+	union ipv6_flow	fl_flow;
+};
+
+#define	fl_fhash	fl_entry.fl_fhash
+#define	fl_flags	fl_entry.fl_flags
+#define	fl_proto	fl_entry.fl_proto
+#define	fl_uptime	fl_entry.fl_uptime
+#define	fl_rt		fl_entry.fl_rt
+#define	fl_lle		fl_entry.fl_lle
+
+#define	SECS_PER_HOUR		3600
+#define	SECS_PER_DAY		(24*SECS_PER_HOUR)
+
+#define	SYN_IDLE		300
+#define	UDP_IDLE		300
+#define	FIN_WAIT_IDLE		600
+#define	TCP_IDLE		SECS_PER_DAY
+
+
+typedef	void fl_lock_t(struct flowtable *, uint32_t);
+typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
+
+union flentryp {
+	struct flentry		**global;
+	struct flentry		**pcpu[MAXCPU];
+};
+
+struct flowtable {
+	int 		ft_size;
+	int 		ft_lock_count;
+	uint32_t	ft_flags;
+	uint32_t	ft_collisions;
+	uint32_t	ft_allocated;
+	uint32_t	ft_misses;
+	uint64_t	ft_hits;
+
+	uint32_t	ft_udp_idle;
+	uint32_t	ft_fin_wait_idle;
+	uint32_t	ft_syn_idle;
+	uint32_t	ft_tcp_idle;
+
+	fl_lock_t	*ft_lock;
+	fl_lock_t 	*ft_unlock;
+	fl_rtalloc_t	*ft_rtalloc;
+	struct mtx	*ft_locks;
+
+	
+	union flentryp	ft_table;
+	bitstr_t 	*ft_masks[MAXCPU];
+	bitstr_t	*ft_tmpmask;
+	struct flowtable *ft_next;
+};
+
+static struct proc *flowcleanerproc;
+static struct flowtable *flow_list_head;
+static uint32_t hashjitter;
+static uma_zone_t ipv4_zone;
+static uma_zone_t ipv6_zone;
+
+/*
+ * TODO:
+ * - Make flowtable stats per-cpu, aggregated at sysctl call time,
+ *   to avoid extra cache evictions caused by incrementing a shared
+ *   counter
+ * - add IPv6 support to flow lookup
+ * - add sysctls to resize && flush flow tables 
+ * - Add per flowtable sysctls for statistics and configuring timeouts
+ * - add saturation counter to rtentry to support per-packet load-balancing
+ *   add flag to indicate round-robin flow, add list lookup from head
+ *   for flows
+ * - add sysctl / device node / syscall to support exporting and importing
+ *   of flows with flag to indicate that a flow was imported so should
+ *   not be considered for auto-cleaning
+ * - support explicit connection state (currently only ad-hoc for DSR)
+ */
+SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
+int	flowtable_enable = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
+    &flowtable_enable, 0, "enable flowtable caching.");
+static int flowtable_hits = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
+    &flowtable_hits, 0, "# flowtable hits.");
+static int flowtable_lookups = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
+    &flowtable_lookups, 0, "# flowtable lookups.");
+static int flowtable_misses = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
+    &flowtable_misses, 0, "#flowtable misses.");
+static int flowtable_frees = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
+    &flowtable_frees, 0, "#flows freed.");
+static int flowtable_free_checks = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
+    &flowtable_free_checks, 0, "#flows free checks.");
+static int flowtable_max_depth = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
+    &flowtable_max_depth, 0, "max collision list length.");
+static int flowtable_collisions = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
+    &flowtable_collisions, 0, "#flowtable collisions.");
+
+/*
+ * XXX This does not end up updating timeouts at runtime
+ * and only reflects the value for the last table added :-/
+ */
+static int flowtable_syn_expire = SYN_IDLE;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
+    &flowtable_syn_expire, 0, "seconds after which to remove syn allocated flow.");
+static int flowtable_udp_expire = UDP_IDLE;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
+    &flowtable_udp_expire, 0, "seconds after which to remove flow allocated to UDP.");
+static int flowtable_fin_wait_expire = FIN_WAIT_IDLE;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
+    &flowtable_fin_wait_expire, 0, "seconds after which to remove a flow in FIN_WAIT.");
+static int flowtable_tcp_expire = TCP_IDLE;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
+    &flowtable_tcp_expire, 0, "seconds after which to remove flow allocated to a TCP connection.");
+
+
+/*
+ * Maximum number of flows that can be allocated of a given type.
+ *
+ * The table is allocated at boot time (for the pure caching case
+ * there is no reason why this could not be changed at runtime)
+ * and thus (currently) needs to be set with a tunable.
+ */
+static int nmbflows = 4096;
+
+static int
+sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
+{
+	int error, newnmbflows;
+
+	newnmbflows = nmbflows;
+	error = sysctl_handle_int(oidp, &newnmbflows, 0, req); 
+	if (error == 0 && req->newptr) {
+		if (newnmbflows > nmbflows) {
+			nmbflows = newnmbflows;
+			uma_zone_set_max(ipv4_zone, nmbflows);
+			uma_zone_set_max(ipv6_zone, nmbflows);
+		} else
+			error = EINVAL;
+	}
+	return (error);
+}
+SYSCTL_PROC(_net_inet_flowtable, OID_AUTO, nmbflows, CTLTYPE_INT|CTLFLAG_RW,
+    &nmbflows, 0, sysctl_nmbflows, "IU", "Maximum number of flows allowed");
+
+#ifndef RADIX_MPATH
+static void
+in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fib)
+{
+
+	in_rtalloc_ign(ro, 0, fib);
+}
+#endif
+
+static void
+flowtable_global_lock(struct flowtable *table, uint32_t hash)
+{	
+	int lock_index = (hash)&(table->ft_lock_count - 1);
+
+	mtx_lock(&table->ft_locks[lock_index]);
+}
+
+static void
+flowtable_global_unlock(struct flowtable *table, uint32_t hash)
+{	
+	int lock_index = (hash)&(table->ft_lock_count - 1);
+
+	mtx_unlock(&table->ft_locks[lock_index]);
+}
+
+static void
+flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
+{
+
+	critical_enter();
+}
+
+static void
+flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
+{
+
+	mb();
+	critical_exit();
+}
+
+#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
+#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
+#define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
+#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
+
+#define FL_STALE (1<<8)
+#define FL_IPV6  (1<<9)
+
+static uint32_t
+ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
+    uint32_t *key, uint16_t *flags, uint8_t *protop)
+{
+	uint16_t sport = 0, dport = 0;
+	struct ip *ip = NULL;
+	uint8_t proto = 0;
+	int iphlen;
+	uint32_t hash;
+	struct sockaddr_in *sin;
+	struct tcphdr *th;
+	struct udphdr *uh;
+	struct sctphdr *sh;
+
+	if (flowtable_enable == 0)
+		return (0);
+
+	key[1] = key[0] = 0;
+	sin = (struct sockaddr_in *)&ro->ro_dst;
+	if (m != NULL) {
+		ip = mtod(m, struct ip *);
+		sin->sin_family = AF_INET;
+		sin->sin_len = sizeof(*sin);
+		sin->sin_addr = ip->ip_dst;
+	} else
+		*flags &= ~FL_HASH_PORTS;
+
+	key[2] = sin->sin_addr.s_addr;
+
+	if ((*flags & FL_HASH_PORTS) == 0)
+		goto skipports;
+
+	proto = ip->ip_p;
+	iphlen = ip->ip_hl << 2; /* XXX options? */
+	key[1] = ip->ip_src.s_addr;
+	
+	switch (proto) {
+	case IPPROTO_TCP:
+		th = (struct tcphdr *)((caddr_t)ip + iphlen);
+		sport = ntohs(th->th_sport);
+		dport = ntohs(th->th_dport);
+		*flags |= th->th_flags;
+		if (*flags & TH_RST)
+			*flags |= FL_STALE;
+	break;
+	case IPPROTO_UDP:
+		uh = (struct udphdr *)((caddr_t)ip + iphlen);
+		sport = uh->uh_sport;
+		dport = uh->uh_dport;
+	break;
+	case IPPROTO_SCTP:
+		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+		sport = sh->src_port;
+		dport = sh->dest_port;
+	break;
+	default:
+		if (*flags & FL_HASH_PORTS)
+			goto noop;
+		/* no port - hence not a protocol we care about */
+		break;;
+	
+	}
+	*protop = proto;
+
+	/*
+	 * If this is a transmit route cache then 
+	 * hash all flows to a given destination to
+	 * the same bucket
+	 */
+	if ((*flags & FL_HASH_PORTS) == 0)
+		proto = sport = dport = 0;
+
+	((uint16_t *)key)[0] = sport;
+	((uint16_t *)key)[1] = dport; 
+
+skipports:
+	hash = hashword(key, 3, hashjitter + proto);
+	if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
+		m->m_flags |= M_FLOWID;
+		m->m_pkthdr.flowid = hash;
+	}
+
+	return (hash);
+noop:
+	*protop = proto;
+	return (0);
+}
+
+static bitstr_t *
+flowtable_mask(struct flowtable *ft)
+{

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-user mailing list