svn commit: r192295 - in user/kmacy/releng_7_2_fcs/sys: amd64/conf
conf dev/e1000 i386/conf net netinet
Kip Macy
kmacy at FreeBSD.org
Mon May 18 06:46:35 UTC 2009
Author: kmacy
Date: Mon May 18 06:46:34 2009
New Revision: 192295
URL: http://svn.freebsd.org/changeset/base/192295
Log:
Import changes from HEAD
191038
191154
add utility routine for updating a struct llentry *
191158
191159
191160
191161
191162
191221
191255
191257
191258
191259
191324
191440
191441
191442
191603
191611
191612
Added:
user/kmacy/releng_7_2_fcs/sys/net/flowtable.c
user/kmacy/releng_7_2_fcs/sys/net/flowtable.h
Modified:
user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS
user/kmacy/releng_7_2_fcs/sys/conf/NOTES
user/kmacy/releng_7_2_fcs/sys/conf/files
user/kmacy/releng_7_2_fcs/sys/conf/options
user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c
user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h
user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS
user/kmacy/releng_7_2_fcs/sys/net/if.c
user/kmacy/releng_7_2_fcs/sys/net/if_bridge.c
user/kmacy/releng_7_2_fcs/sys/net/if_llatbl.c
user/kmacy/releng_7_2_fcs/sys/net/if_llatbl.h
user/kmacy/releng_7_2_fcs/sys/net/if_var.h
user/kmacy/releng_7_2_fcs/sys/netinet/in_pcb.h
user/kmacy/releng_7_2_fcs/sys/netinet/ip_input.c
user/kmacy/releng_7_2_fcs/sys/netinet/ip_output.c
user/kmacy/releng_7_2_fcs/sys/netinet/vinet.h
Modified: user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS Mon May 18 06:32:38 2009 (r192294)
+++ user/kmacy/releng_7_2_fcs/sys/amd64/conf/DEFAULTS Mon May 18 06:46:34 2009 (r192295)
@@ -16,9 +16,9 @@ device io # I/O device
device uart_ns8250
# Default partitioning schemes
-options GEOM_BSD
-options GEOM_MBR
-
-# KSE support went from being default to a kernel option
-options KSE
options VIMAGE_GLOBALS
+options GEOM_PART_BSD
+options GEOM_PART_MBR
+
+options FLOWTABLE
+
Modified: user/kmacy/releng_7_2_fcs/sys/conf/NOTES
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/conf/NOTES Mon May 18 06:32:38 2009 (r192294)
+++ user/kmacy/releng_7_2_fcs/sys/conf/NOTES Mon May 18 06:46:34 2009 (r192295)
@@ -549,6 +549,9 @@ options LIBMCHAIN
# libalias library, performing NAT
options LIBALIAS
+# flowtable cache
+options FLOWTABLE
+
#
# SCTP is a NEW transport protocol defined by
# RFC2960 updated by RFC3309 and RFC3758.. and
Modified: user/kmacy/releng_7_2_fcs/sys/conf/files
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/conf/files Mon May 18 06:32:38 2009 (r192294)
+++ user/kmacy/releng_7_2_fcs/sys/conf/files Mon May 18 06:46:34 2009 (r192295)
@@ -1806,6 +1806,7 @@ net/bpf_filter.c optional bpf | netgrap
net/bpf_zerocopy.c optional bpf
net/bridgestp.c optional bridge | if_bridge
net/bsd_comp.c optional ppp_bsdcomp
+net/flowtable.c optional flowtable
net/ieee8023ad_lacp.c optional lagg
net/if.c standard
net/if_arcsubr.c optional arcnet
Modified: user/kmacy/releng_7_2_fcs/sys/conf/options
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/conf/options Mon May 18 06:32:38 2009 (r192294)
+++ user/kmacy/releng_7_2_fcs/sys/conf/options Mon May 18 06:46:34 2009 (r192295)
@@ -405,6 +405,7 @@ VLAN_ARRAY opt_vlan.h
XBONEHACK
KRPC
NFSLOCKD
+FLOWTABLE opt_route.h
#
# SCTP
Modified: user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c Mon May 18 06:32:38 2009 (r192294)
+++ user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.c Mon May 18 06:46:34 2009 (r192295)
@@ -893,6 +893,7 @@ em_detach(device_t dev)
if_free(ifp);
drbr_free(adapter->br, M_DEVBUF);
+ drbr_free(adapter->br, M_DEVBUF);
em_free_transmit_structures(adapter);
em_free_receive_structures(adapter);
@@ -987,7 +988,7 @@ em_resume(device_t dev)
* the packet is requeued.
**********************************************************************/
-#ifdef IFNET_MULTIQUEUE
+#ifdef IFNET_BUF_RING
static int
em_transmit_locked(struct ifnet *ifp, struct mbuf *m)
{
@@ -1000,68 +1001,63 @@ em_transmit_locked(struct ifnet *ifp, st
|| (!adapter->link_active)) {
error = drbr_enqueue(ifp, adapter->br, m);
return (error);
- }
-
- if (buf_ring_empty(adapter->br) &&
+ } else if (ADAPTER_RING_EMPTY(adapter) &&
(adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD)) {
if (em_xmit(adapter, &m)) {
- if (m && (error = drbr_enqueue(ifp, adapter->br, m)) != 0) {
+ if (m && (error = drbr_enqueue(ifp, adapter->br, m)) != 0)
return (error);
- }
- } else{
- /* Send a copy of the frame to the BPF listener */
+ } else {
+ /*
+ * We've bypassed the buf ring so we need to update
+ * ifp directly
+ */
+ drbr_stats_update(ifp, m->m_pkthdr.len, m->m_flags);
+ /*
+ ** Send a copy of the frame to the BPF
+ ** listener and set the watchdog on.
+ */
ETHER_BPF_MTAP(ifp, m);
}
} else if ((error = drbr_enqueue(ifp, adapter->br, m)) != 0)
return (error);
- if (!buf_ring_empty(adapter->br))
+ if (!ADAPTER_RING_EMPTY(adapter))
em_start_locked(ifp);
return (0);
}
-static void
-em_start_locked(struct ifnet *ifp)
+static int
+em_transmit(struct ifnet *ifp, struct mbuf *m)
{
- struct adapter *adapter = ifp->if_softc;
- struct mbuf *m_head;
-
- EM_TX_LOCK_ASSERT(adapter);
-
- if ((ifp->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
- IFF_DRV_RUNNING)
- return;
- if (!adapter->link_active)
- return;
-
- while ((adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD)
- && (!buf_ring_empty(adapter->br))) {
+
+ struct adapter *adapter = ifp->if_softc;
+ int error = 0;
- m_head = buf_ring_dequeue_sc(adapter->br);
- if (m_head == NULL)
- break;
- /*
- * Encapsulation can modify our pointer, and or make it
- * NULL on failure. In that event, we can't requeue.
- */
- if (em_xmit(adapter, &m_head)) {
- if (m_head == NULL)
- break;
- break;
- }
+ if(EM_TX_TRYLOCK(adapter)) {
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ error = em_transmit_locked(ifp, m);
+ EM_TX_UNLOCK(adapter);
+ } else
+ error = drbr_enqueue(ifp, adapter->br, m);
- /* Send a copy of the frame to the BPF listener */
- ETHER_BPF_MTAP(ifp, m_head);
+ return (error);
+}
- /* Set timeout in case hardware has problems transmitting. */
- adapter->watchdog_timer = EM_TX_TIMEOUT;
- }
- if ((adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD))
- ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+static void
+em_qflush(struct ifnet *ifp)
+{
+ struct mbuf *m;
+ struct adapter *adapter = (struct adapter *)ifp->if_softc;
+ EM_TX_LOCK(adapter);
+ while ((m = buf_ring_dequeue_sc(adapter->br)) != NULL)
+ m_freem(m);
+ if_qflush(ifp);
+ EM_TX_UNLOCK(adapter);
}
-#else
+#endif
+
static void
em_start_locked(struct ifnet *ifp)
{
@@ -1076,9 +1072,10 @@ em_start_locked(struct ifnet *ifp)
if (!adapter->link_active)
return;
- while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+ while ((adapter->num_tx_desc_avail > EM_TX_OP_THRESHOLD)
+ && (!ADAPTER_RING_EMPTY(adapter))) {
- IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
+ m_head = em_dequeue(ifp, adapter->br);
if (m_head == NULL)
break;
/*
@@ -1088,8 +1085,10 @@ em_start_locked(struct ifnet *ifp)
if (em_xmit(adapter, &m_head)) {
if (m_head == NULL)
break;
+#ifndef IFNET_BUF_RING
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
+#endif
break;
}
@@ -1099,8 +1098,10 @@ em_start_locked(struct ifnet *ifp)
/* Set timeout in case hardware has problems transmitting. */
adapter->watchdog_timer = EM_TX_TIMEOUT;
}
+ if ((adapter->num_tx_desc_avail <= EM_TX_OP_THRESHOLD))
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+
}
-#endif
static void
em_start(struct ifnet *ifp)
@@ -1113,23 +1114,6 @@ em_start(struct ifnet *ifp)
EM_TX_UNLOCK(adapter);
}
-static int
-em_transmit(struct ifnet *ifp, struct mbuf *m)
-{
-
- struct adapter *adapter = ifp->if_softc;
- int error = 0;
-
- if(EM_TX_TRYLOCK(adapter)) {
- if (ifp->if_drv_flags & IFF_DRV_RUNNING)
- error = em_transmit_locked(ifp, m);
- EM_TX_UNLOCK(adapter);
- } else
- error = drbr_enqueue(ifp, adapter->br, m);
-
- return (error);
-}
-
/*********************************************************************
* Ioctl entry point
*
@@ -1693,11 +1677,7 @@ em_poll(struct ifnet *ifp, enum poll_cmd
EM_TX_LOCK(adapter);
em_txeof(adapter);
-#ifdef IFNET_MULTIQUEUE
- if (!buf_ring_empty(adapter->br))
-#else
- if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-#endif
+ if (!ADAPTER_RING_EMPTY(adapter))
em_start_locked(ifp);
EM_TX_UNLOCK(adapter);
}
@@ -1767,13 +1747,7 @@ em_intr(void *arg)
if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
-#ifdef IFNET_MULTIQUEUE
- !buf_ring_empty(adapter->br)
-#else
- !IFQ_DRV_IS_EMPTY(&ifp->if_snd)
-#endif
- )
-
+ !ADAPTER_RING_EMPTY(adapter))
em_start(ifp);
}
@@ -1812,11 +1786,7 @@ em_handle_rxtx(void *context, int pendin
EM_TX_LOCK(adapter);
em_txeof(adapter);
-#ifdef IFNET_MULTIQUEUE
- if (!buf_ring_empty(adapter->br))
-#else
- if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-#endif
+ if (!ADAPTER_RING_EMPTY(adapter))
em_start_locked(ifp);
EM_TX_UNLOCK(adapter);
}
@@ -1824,43 +1794,6 @@ em_handle_rxtx(void *context, int pendin
em_enable_intr(adapter);
}
-static void
-em_handle_rx(void *context, int pending)
-{
- struct adapter *adapter = context;
- struct ifnet *ifp = adapter->ifp;
-
- if ((ifp->if_drv_flags & IFF_DRV_RUNNING) &&
- (em_rxeof(adapter, adapter->rx_process_limit) != 0))
- taskqueue_enqueue(adapter->tq, &adapter->rx_task);
-
-}
-
-static void
-em_handle_tx(void *context, int pending)
-{
- struct adapter *adapter = context;
- struct ifnet *ifp = adapter->ifp;
-
- if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-#ifdef IFNET_MULTIQUEUE
- if (!EM_TX_TRYLOCK(adapter))
- return;
-#else
- EM_TX_LOCK(adapter);
-#endif
-
- em_txeof(adapter);
-#ifdef IFNET_MULTIQUEUE
- if (!buf_ring_empty(adapter->br))
-#else
- if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
-#endif
- em_start_locked(ifp);
- EM_TX_UNLOCK(adapter);
- }
-}
-
/*********************************************************************
*
* Fast Legacy/MSI Combined Interrupt Service routine
@@ -1989,6 +1922,35 @@ em_msix_link(void *arg)
EM_MSIX_LINK | E1000_IMS_LSC);
return;
}
+/*
+ * Deferred receive task handler (registered for adapter->rx_task with
+ * TASK_INIT in em_allocate_msix()).
+ *
+ * context is the struct adapter; pending is unused.  When the interface
+ * is marked IFF_DRV_RUNNING, em_rxeof() is called with the adapter's
+ * rx_process_limit, and a non-zero return re-enqueues rx_task on
+ * adapter->tq (presumably meaning more packets remain -- TODO confirm
+ * against em_rxeof()).
+ */
+static void
+em_handle_rx(void *context, int pending)
+{
+ struct adapter *adapter = context;
+ struct ifnet *ifp = adapter->ifp;
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) &&
+ (em_rxeof(adapter, adapter->rx_process_limit) != 0))
+ taskqueue_enqueue(adapter->tq, &adapter->rx_task);
+
+}
+/*
+ * Deferred transmit task handler (registered for adapter->tx_task and,
+ * in em_allocate_msix(), also for adapter->rxtx_task).
+ *
+ * context is the struct adapter; pending is unused.  Nothing is done
+ * unless the interface is IFF_DRV_RUNNING.  The TX lock is only tried:
+ * if EM_TX_TRYLOCK() fails the handler returns without cleaning up or
+ * rescheduling itself.  Otherwise em_txeof() is run, em_start_locked()
+ * is called when ADAPTER_RING_EMPTY() reports queued packets, and the
+ * lock is released.
+ */
+static void
+em_handle_tx(void *context, int pending)
+{
+ struct adapter *adapter = context;
+ struct ifnet *ifp = adapter->ifp;
+
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ if (!EM_TX_TRYLOCK(adapter))
+ return;
+
+ em_txeof(adapter);
+ if (!ADAPTER_RING_EMPTY(adapter))
+ em_start_locked(ifp);
+ EM_TX_UNLOCK(adapter);
+ }
+}
#endif /* EM_FAST_IRQ */
/*********************************************************************
@@ -2646,6 +2608,8 @@ em_local_timer(void *arg)
EM_CORE_LOCK_ASSERT(adapter);
taskqueue_enqueue(adapter->tq,
&adapter->rxtx_task);
+ taskqueue_enqueue(adapter->tq,
+ &adapter->rxtx_task);
em_update_link_status(adapter);
em_update_stats_counters(adapter);
@@ -2990,6 +2954,11 @@ em_allocate_msix(struct adapter *adapter
*/
TASK_INIT(&adapter->rx_task, 0, em_handle_rx, adapter);
TASK_INIT(&adapter->tx_task, 0, em_handle_tx, adapter);
+ /*
+ * Handle compatibility for msi case for deferral due to
+ * trylock failure
+ */
+ TASK_INIT(&adapter->rxtx_task, 0, em_handle_tx, adapter);
TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter);
adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT,
taskqueue_thread_enqueue, &adapter->tq);
@@ -3244,6 +3213,11 @@ em_setup_interface(device_t dev, struct
adapter->br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK, &adapter->tx_mtx);
#endif
+#ifdef IFNET_BUF_RING
+ ifp->if_transmit = em_transmit;
+ ifp->if_qflush = em_qflush;
+ adapter->br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK, &adapter->tx_mtx);
+#endif
if (adapter->hw.mac.type >= e1000_82543) {
int version_cap;
#if __FreeBSD_version < 700000
Modified: user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h Mon May 18 06:32:38 2009 (r192294)
+++ user/kmacy/releng_7_2_fcs/sys/dev/e1000/if_em.h Mon May 18 06:46:34 2009 (r192295)
@@ -36,7 +36,7 @@
#ifndef _EM_H_DEFINED_
#define _EM_H_DEFINED_
-#define IFNET_MULTIQUEUE
+#define IFNET_BUF_RING
/* Tunables */
/*
@@ -302,8 +302,10 @@ struct em_dma_alloc {
/* Our adapter structure */
struct adapter {
struct ifnet *ifp;
-#ifdef IFNET_MULTIQUEUE
+#ifdef IFNET_BUF_RING
struct buf_ring *br;
+#else
+ void *br;
#endif
struct e1000_hw hw;
@@ -494,4 +496,27 @@ typedef struct _DESCRIPTOR_PAIR
#define EM_CORE_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->core_mtx, MA_OWNED)
#define EM_TX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->tx_mtx, MA_OWNED)
+#ifdef IFNET_BUF_RING
+#define ADAPTER_RING_EMPTY(adapter) drbr_empty((adapter)->ifp, (adapter)->br)
+#define em_dequeue drbr_dequeue
+/*
+ * Transmit-queue abstraction.
+ *
+ * With IFNET_BUF_RING (above) the queue is the adapter's buf_ring:
+ * ADAPTER_RING_EMPTY() maps to drbr_empty() and em_dequeue is an alias
+ * for drbr_dequeue.
+ *
+ * Without it (below) the queue is ifp->if_snd: ADAPTER_RING_EMPTY() maps
+ * to IFQ_DRV_IS_EMPTY(), drbr_free() expands to nothing, and em_dequeue()
+ * ignores its br argument and returns the mbuf taken from if_snd by
+ * IFQ_DRV_DEQUEUE().
+ */
+#else
+#define ADAPTER_RING_EMPTY(adapter) IFQ_DRV_IS_EMPTY(&((adapter)->ifp->if_snd))
+#define drbr_free(br, type)
+static __inline struct mbuf *
+em_dequeue(struct ifnet *ifp, struct buf_ring *br)
+{
+ struct mbuf *m;
+
+ IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
+ return (m);
+}
+#ifdef BUF_RING_UNDEFINED
+/*
+ * Empty placeholder so that "struct buf_ring" is a defined type.
+ * NOTE(review): nothing in this header defines BUF_RING_UNDEFINED --
+ * confirm where it is expected to come from.
+ */
+struct buf_ring {
+};
+
+#endif
+#endif
+
#endif /* _EM_H_DEFINED_ */
Modified: user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS
==============================================================================
--- user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS Mon May 18 06:32:38 2009 (r192294)
+++ user/kmacy/releng_7_2_fcs/sys/i386/conf/DEFAULTS Mon May 18 06:46:34 2009 (r192295)
@@ -23,6 +23,9 @@ device uart_ns8250
options GEOM_BSD
options GEOM_MBR
-# KSE support went from being default to a kernel option
-options KSE
options VIMAGE_GLOBALS
+# enable support for native hardware
+options NATIVE
+device atpic
+
+options FLOWTABLE
Added: user/kmacy/releng_7_2_fcs/sys/net/flowtable.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/kmacy/releng_7_2_fcs/sys/net/flowtable.c Mon May 18 06:46:34 2009 (r192295)
@@ -0,0 +1,1076 @@
+/**************************************************************************
+
+Copyright (c) 2008-2009, BitGravity Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2. Neither the name of the BitGravity Corporation nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+***************************************************************************/
+
+#include "opt_route.h"
+#include "opt_mpath.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/bitstring.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <sys/sysctl.h>
+#include <sys/vimage.h>
+
+#include <net/if.h>
+#include <net/if_llatbl.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/vnet.h>
+#include <net/flowtable.h>
+
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/sctp.h>
+
+/*
+ * Taken from http://burtleburtle.net/bob/c/lookup3.c
+ */
+/*
+ * rot(x,k): rotate the 32-bit value x left by k bits.
+ * k must be in the range 1..31: with k == 0 the right-hand term becomes
+ * a shift by 32, which is undefined for a 32-bit operand.  Both
+ * arguments are named twice in the expansion, so they must not have
+ * side effects.
+ */
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+ of top bits of (a,b,c), or in any combination of bottom bits of
+ (a,b,c).
+* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ is commonly produced by subtraction) look like a single 1-bit
+ difference.
+* the base values were pseudorandom, all zero but one bit set, or
+ all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+ 4 6 8 16 19 4
+ 9 15 3 18 27 15
+ 14 9 3 7 17 3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta. I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche. There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a. The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism. Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism. I did what I could. Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+
+Usage note: mix() expands to a braced block that updates a, b and c in
+place and names each argument several times, so the arguments must be
+plain 32-bit unsigned lvalues without side effects.  The rotation
+constants used below are the first row listed above (4 6 8 16 19 4).
+-------------------------------------------------------------------------------
+*/
+#define mix(a,b,c) \
+{ \
+ a -= c; a ^= rot(c, 4); c += b; \
+ b -= a; b ^= rot(a, 6); a += c; \
+ c -= b; c ^= rot(b, 8); b += a; \
+ a -= c; a ^= rot(c,16); c += b; \
+ b -= a; b ^= rot(a,19); a += c; \
+ c -= b; c ^= rot(b, 4); b += a; \
+}
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different. This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+ of top bits of (a,b,c), or in any combination of bottom bits of
+ (a,b,c).
+* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ is commonly produced by subtraction) look like a single 1-bit
+ difference.
+* the base values were pseudorandom, all zero but one bit set, or
+ all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+ 4 8 15 26 3 22 24
+ 10 8 15 26 3 22 24
+ 11 8 15 26 3 22 24
+
+Usage note: final() expands to a braced block that updates a, b and c
+in place and names each argument several times, so the arguments must
+be plain 32-bit unsigned lvalues without side effects.  The rotation
+constants used below are the first passing row (14 11 25 16 4 14 24).
+-------------------------------------------------------------------------------
+*/
+#define final(a,b,c) \
+{ \
+ c ^= b; c -= rot(b,14); \
+ a ^= c; a -= rot(c,11); \
+ b ^= a; b -= rot(a,25); \
+ c ^= b; c -= rot(b,16); \
+ a ^= c; a -= rot(c,4); \
+ b ^= a; b -= rot(a,14); \
+ c ^= b; c -= rot(b,24); \
+}
+
+/*
+--------------------------------------------------------------------
+ This works on all machines. To be useful, it requires
+ -- that the key be an array of uint32_t's, and
+ -- that the length be the number of uint32_t's in the key
+
+ The function hashword() is identical to hashlittle() on little-endian
+ machines, and identical to hashbig() on big-endian machines,
+ except that the length has to be measured in uint32_ts rather than in
+ bytes. hashlittle() is more complicated than hashword() only because
+ hashlittle() has to dance around fitting the key bytes into registers.
+
+ Parameters:
+ k       -- the key; exactly 'length' uint32_t words are read from it
+ length  -- number of uint32_t words in the key (not bytes)
+ initval -- seed added into the initial state of a, b and c
+
+ Returns the final value of c.  Words are consumed three at a time with
+ mix() while more than three remain; the last one to three words are
+ added and scrambled by final().  When length is 0 neither macro runs
+ and the initial state (0xdeadbeef + initval) is returned unchanged.
+--------------------------------------------------------------------
+*/
+static uint32_t hashword(
+const uint32_t *k, /* the key, an array of uint32_t values */
+size_t length, /* the length of the key, in uint32_ts */
+uint32_t initval) /* the previous hash, or an arbitrary value */
+{
+ uint32_t a,b,c;
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
+
+ /*------------------------------------------------- handle most of the key */
+ while (length > 3)
+ {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ mix(a,b,c);
+ length -= 3;
+ k += 3;
+ }
+
+ /*------------------------------------------- handle the last 3 uint32_t's */
+ switch(length) /* all the case statements fall through */
+ {
+ case 3 : c+=k[2];
+ case 2 : b+=k[1];
+ case 1 : a+=k[0];
+ final(a,b,c);
+ case 0: /* case 0: nothing left to add, final() is skipped */
+ break;
+ }
+ /*------------------------------------------------------ report the result */
+ return c;
+}
+
+/*
+ * IPv4 flow identifier: the two ports followed by the source and
+ * destination addresses.  With a 32-bit in_addr_t the fields add up to
+ * 12 bytes (2 + 2 + 4 + 4).
+ */
+struct ipv4_tuple {
+ uint16_t ip_sport; /* source port */
+ uint16_t ip_dport; /* destination port */
+ in_addr_t ip_saddr; /* source address */
+ in_addr_t ip_daddr; /* destination address */
+};
+/*
+ * The same identifier viewed either field by field (ipf_ipt) or as the
+ * three 32-bit words (ipf_key) that a word-based hash such as
+ * hashword() consumes.
+ */
+union ipv4_flow {
+ struct ipv4_tuple ipf_ipt;
+ uint32_t ipf_key[3];
+};
+/*
+ * IPv6 flow identifier: the two ports followed by the source and
+ * destination addresses.  With 16-byte in6_addr values the fields add
+ * up to 36 bytes (2 + 2 + 16 + 16).
+ */
+struct ipv6_tuple {
+ uint16_t ip_sport; /* source port */
+ uint16_t ip_dport; /* destination port */
+ struct in6_addr ip_saddr; /* source address */
+ struct in6_addr ip_daddr; /* destination address */
+};
+/*
+ * The same identifier viewed either field by field (ipf_ipt) or as nine
+ * 32-bit words (ipf_key).
+ */
+union ipv6_flow {
+ struct ipv6_tuple ipf_ipt;
+ uint32_t ipf_key[9];
+};
+/*
+ * Address-family independent part of a flow entry.  It is embedded as
+ * the first member (fl_entry) of struct flentry_v4 and struct
+ * flentry_v6, which append the family-specific key.
+ */
+struct flentry {
+ volatile uint32_t f_fhash; /* hash flowing forward */
+ uint16_t f_flags; /* flow flags */
+ uint8_t f_pad; /* alignment */
+ uint8_t f_proto; /* protocol */
+ uint32_t f_uptime; /* uptime at last access */
+ struct flentry *f_next; /* pointer to collision entry */
+ volatile struct rtentry *f_rt; /* rtentry for flow */
+ volatile struct llentry *f_lle; /* llentry for flow */
+};
+/*
+ * IPv4 flow entry: the common struct flentry header followed by the
+ * IPv4 flow key.
+ */
+struct flentry_v4 {
+ struct flentry fl_entry;
+ union ipv4_flow fl_flow;
+};
+/*
+ * IPv6 flow entry: the common struct flentry header followed by the
+ * IPv6 flow key.
+ */
+struct flentry_v6 {
+ struct flentry fl_entry;
+ union ipv6_flow fl_flow;
+};
+
+/*
+ * Shorthand for reaching the common struct flentry fields through the
+ * fl_entry member embedded in struct flentry_v4 / struct flentry_v6.
+ * The members of struct flentry are named f_*, so the expansions must
+ * use that prefix; expanding to fl_entry.fl_* names a member that does
+ * not exist and fails to compile at the first use.
+ */
+#define fl_fhash fl_entry.f_fhash
+#define fl_flags fl_entry.f_flags
+#define fl_proto fl_entry.f_proto
+#define fl_uptime fl_entry.f_uptime
+#define fl_rt fl_entry.f_rt
+#define fl_lle fl_entry.f_lle
+
+#define SECS_PER_HOUR 3600
+#define SECS_PER_DAY (24*SECS_PER_HOUR)
+/*
+ * Default idle timeouts, in seconds.  They are the initial values of
+ * the flowtable_syn_expire, flowtable_udp_expire,
+ * flowtable_fin_wait_expire and flowtable_tcp_expire sysctl variables.
+ */
+#define SYN_IDLE 300
+#define UDP_IDLE 300
+#define FIN_WAIT_IDLE 600
+#define TCP_IDLE SECS_PER_DAY
+
+/*
+ * Function types for the per-table hooks stored in struct flowtable:
+ * fl_lock_t is the shape of ft_lock/ft_unlock (table, hash) and
+ * fl_rtalloc_t the shape of ft_rtalloc (route, hash, fib).
+ */
+typedef void fl_lock_t(struct flowtable *, uint32_t);
+typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);
+/*
+ * Bucket storage for a table: either a single array of entry pointers
+ * (global) or one such array per CPU slot (pcpu, MAXCPU slots).
+ * NOTE(review): which member is valid is presumably selected by the
+ * table's ft_flags -- confirm at the allocation site.
+ */
+union flentryp {
+ struct flentry **global;
+ struct flentry **pcpu[MAXCPU];
+};
+/*
+ * One flow table instance.
+ *
+ * Fields whose use is visible in this file:
+ *  ft_size           - divisor used by FL_ENTRY_INDEX() to map a hash
+ *                      to a bucket
+ *  ft_lock_count     - flowtable_global_lock()/unlock() mask the hash
+ *                      with (ft_lock_count - 1) to index ft_locks, so
+ *                      this is presumably a power of two -- TODO confirm
+ *  ft_locks          - mutex array indexed as described above
+ *  ft_lock/ft_unlock - locking hooks invoked through FL_ENTRY_LOCK()
+ *                      and FL_ENTRY_UNLOCK()
+ *  ft_rtalloc        - route allocation hook (fl_rtalloc_t)
+ *
+ * NOTE(review): the ft_*_idle fields are presumably per-table idle
+ * timeouts in seconds and ft_next presumably links the tables reachable
+ * from flow_list_head -- confirm against the code that fills them in.
+ */
+struct flowtable {
+ int ft_size;
+ int ft_lock_count;
+ uint32_t ft_flags;
+ uint32_t ft_collisions;
+ uint32_t ft_allocated;
+ uint32_t ft_misses;
+ uint64_t ft_hits;
+
+ uint32_t ft_udp_idle;
+ uint32_t ft_fin_wait_idle;
+ uint32_t ft_syn_idle;
+ uint32_t ft_tcp_idle;
+
+ fl_lock_t *ft_lock;
+ fl_lock_t *ft_unlock;
+ fl_rtalloc_t *ft_rtalloc;
+ struct mtx *ft_locks;
+
+
+ union flentryp ft_table;
+ bitstr_t *ft_masks[MAXCPU];
+ bitstr_t *ft_tmpmask;
+ struct flowtable *ft_next;
+};
+/*
+ * File-scope state.
+ *  hashjitter          - added to the seed that
+ *                        ipv4_flow_lookup_hash_internal() passes to
+ *                        hashword()
+ *  ipv4_zone/ipv6_zone - UMA zones whose limit sysctl_nmbflows() raises
+ *                        with uma_zone_set_max(); presumably they back
+ *                        the flentry_v4/flentry_v6 allocations -- TODO
+ *                        confirm
+ *  flowcleanerproc, flow_list_head - presumably the cleaner process and
+ *                        the head of the list of tables linked through
+ *                        ft_next -- TODO confirm
+ */
+static struct proc *flowcleanerproc;
+static struct flowtable *flow_list_head;
+static uint32_t hashjitter;
+static uma_zone_t ipv4_zone;
+static uma_zone_t ipv6_zone;
+
+/*
+ * TODO:
+ * - Make flowtable stats per-cpu, aggregated at sysctl call time,
+ * to avoid extra cache evictions caused by incrementing a shared
+ * counter
+ * - add IPv6 support to flow lookup
+ * - add sysctls to resize && flush flow tables
+ * - Add per flowtable sysctls for statistics and configuring timeouts
+ * - add saturation counter to rtentry to support per-packet load-balancing
+ *   add flag to indicate round-robin flow, add list lookup from head
+ *   for flows
+ * - add sysctl / device node / syscall to support exporting and importing
+ * of flows with flag to indicate that a flow was imported so should
+ * not be considered for auto-cleaning
+ * - support explicit connection state (currently only ad-hoc for DSR)
+ */
+SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable"); /* parent node of every flowtable sysctl declared below */
+int flowtable_enable = 0; /* when 0, ipv4_flow_lookup_hash_internal() returns 0 without hashing */
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
+ &flowtable_enable, 0, "enable flowtable caching.");
+static int flowtable_hits = 0; /* this and the following counters are exported read-only (CTLFLAG_RD) */
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
+ &flowtable_hits, 0, "# flowtable hits.");
+static int flowtable_lookups = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
+ &flowtable_lookups, 0, "# flowtable lookups.");
+static int flowtable_misses = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
+ &flowtable_misses, 0, "#flowtable misses.");
+static int flowtable_frees = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
+ &flowtable_frees, 0, "#flows freed.");
+static int flowtable_free_checks = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
+ &flowtable_free_checks, 0, "#flows free checks.");
+static int flowtable_max_depth = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
+ &flowtable_max_depth, 0, "max collision list length.");
+static int flowtable_collisions = 0;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
+ &flowtable_collisions, 0, "#flowtable collisions.");
+
+/*
+ * Idle-expiry settings, in seconds.  Each variable starts at the
+ * matching *_IDLE default and is exported read-write (CTLFLAG_RW)
+ * under the flowtable sysctl node.
+ *
+ * XXX This does not end up updating timeouts at runtime
+ * and only reflects the value for the last table added :-/
+ */
+static int flowtable_syn_expire = SYN_IDLE;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
+ &flowtable_syn_expire, 0, "seconds after which to remove syn allocated flow.");
+static int flowtable_udp_expire = UDP_IDLE;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
+ &flowtable_udp_expire, 0, "seconds after which to remove flow allocated to UDP.");
+static int flowtable_fin_wait_expire = FIN_WAIT_IDLE;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
+ &flowtable_fin_wait_expire, 0, "seconds after which to remove a flow in FIN_WAIT.");
+static int flowtable_tcp_expire = TCP_IDLE;
+SYSCTL_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
+ &flowtable_tcp_expire, 0, "seconds after which to remove flow allocated to a TCP connection.");
+
+
+/*
+ * Maximum number of flows that can be allocated of a given type.
+ *
+ * The table is allocated at boot time (for the pure caching case
+ * there is no reason why this could not be changed at runtime)
+ * and thus (currently) needs to be set with a tunable.
+ */
+static int nmbflows = 4096;
+/*
+ * Sysctl handler for net.inet.flowtable.nmbflows.
+ *
+ * The current limit is copied into a local and passed to
+ * sysctl_handle_int().  On a successful write (req->newptr set) the
+ * limit may only grow: a larger value is stored in nmbflows and applied
+ * to both ipv4_zone and ipv6_zone with uma_zone_set_max(); an equal or
+ * smaller value is rejected with EINVAL.  Reads leave everything
+ * untouched.  Returns 0 or an errno value.
+ */
+static int
+sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbflows;
+
+ newnmbflows = nmbflows;
+ error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbflows > nmbflows) {
+ nmbflows = newnmbflows;
+ uma_zone_set_max(ipv4_zone, nmbflows);
+ uma_zone_set_max(ipv6_zone, nmbflows);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_net_inet_flowtable, OID_AUTO, nmbflows, CTLTYPE_INT|CTLFLAG_RW,
+ &nmbflows, 0, sysctl_nmbflows, "IU", "Maximum number of flows allowed");
+/*
+ * Adapter giving in_rtalloc_ign() the fl_rtalloc_t signature; it is
+ * only compiled when RADIX_MPATH is not defined.  The hash argument is
+ * ignored and 0 is passed as in_rtalloc_ign()'s second argument.
+ */
+#ifndef RADIX_MPATH
+static void
+in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fib)
+{
+
+ in_rtalloc_ign(ro, 0, fib);
+}
+#endif
+/*
+ * Lock hook (fl_lock_t) for a table protected by the ft_locks mutex
+ * array: takes the mutex at index hash & (ft_lock_count - 1).
+ * NOTE(review): the mask only reaches every lock when ft_lock_count is
+ * a power of two -- presumably guaranteed where the table is created;
+ * confirm.
+ */
+static void
+flowtable_global_lock(struct flowtable *table, uint32_t hash)
+{
+ int lock_index = (hash)&(table->ft_lock_count - 1);
+
+ mtx_lock(&table->ft_locks[lock_index]);
+}
+/*
+ * Unlock hook (fl_lock_t) matching flowtable_global_lock(): releases
+ * the mutex at index hash & (ft_lock_count - 1), so it must be given
+ * the same hash that was used to lock.
+ */
+static void
+flowtable_global_unlock(struct flowtable *table, uint32_t hash)
+{
+ int lock_index = (hash)&(table->ft_lock_count - 1);
+
+ mtx_unlock(&table->ft_locks[lock_index]);
+}
+/*
+ * Lock hook (fl_lock_t) that takes no mutex: it only enters a critical
+ * section with critical_enter().  Both arguments are unused.
+ */
+static void
+flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
+{
+
+ critical_enter();
+}
+/*
+ * Unlock hook (fl_lock_t) matching flowtable_pcpu_lock(): issues mb()
+ * and then leaves the critical section with critical_exit().  Both
+ * arguments are unused.
+ */
+static void
+flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
+{
+
+ mb();
+ critical_exit();
+}
+/*
+ * Bucket access helpers.
+ *  FL_ENTRY_INDEX  - bucket number for a hash (hash modulo ft_size)
+ *  FL_ENTRY        - dereferences the pointer returned by
+ *                    flowtable_entry(); the expansion is not
+ *                    parenthesised, so wrap it when combining it with
+ *                    other operators
+ *  FL_ENTRY_LOCK / FL_ENTRY_UNLOCK - call the table's ft_lock and
+ *                    ft_unlock hooks with the given hash
+ */
+#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
+#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
+#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
+#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
+/*
+ * Flow flag bits 8 and 9.  ipv4_flow_lookup_hash_internal() ORs the TCP
+ * header flags (th_flags) into the same 16-bit flag word and sets
+ * FL_STALE when TH_RST is present.
+ */
+#define FL_STALE (1<<8)
+#define FL_IPV6 (1<<9)
+/*
+ * Build the three-word IPv4 flow key and return its hash.
+ *
+ *  m      - packet to classify, or NULL.  When non-NULL its data is
+ *           read as an IP header and ro->ro_dst is filled in as an
+ *           AF_INET sockaddr holding the packet's destination.  When
+ *           NULL, FL_HASH_PORTS is cleared in *flags and the address
+ *           already stored in ro->ro_dst is used.
+ *  ro     - route whose ro_dst supplies (or receives) the destination.
+ *  key    - output, three 32-bit words: the two 16-bit halves of key[0]
+ *           hold source and destination port, key[1] the source
+ *           address and key[2] the destination address (the field
+ *           order of struct ipv4_tuple).
+ *  flags  - in/out.  FL_HASH_PORTS selects hashing on the full tuple;
+ *           for TCP the header's th_flags are ORed in and TH_RST also
+ *           sets FL_STALE.
+ *  protop - output protocol number; written only on the FL_HASH_PORTS
+ *           path.
+ *
+ * Returns hashword(key, 3, hashjitter + proto).  Returns 0 without
+ * touching any output when flowtable_enable is 0, and 0 (with *protop
+ * set) when port hashing was requested for a protocol other than TCP,
+ * UDP or SCTP.  If the mbuf does not already carry M_FLOWID, the flag
+ * is set and the hash is stored in m_pkthdr.flowid.
+ *
+ * NOTE(review): TCP ports are converted with ntohs() while UDP and SCTP
+ * ports are used as stored in the header.
+ * NOTE(review): nothing here checks that the IP and transport headers
+ * are contiguous in the first mbuf -- presumably the caller guarantees
+ * it; confirm.
+ */
+static uint32_t
+ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
+ uint32_t *key, uint16_t *flags, uint8_t *protop)
+{
+ uint16_t sport = 0, dport = 0;
+ struct ip *ip = NULL;
+ uint8_t proto = 0;
+ int iphlen;
+ uint32_t hash;
+ struct sockaddr_in *sin;
+ struct tcphdr *th;
+ struct udphdr *uh;
+ struct sctphdr *sh;
+
+ if (flowtable_enable == 0)
+ return (0);
+
+ key[1] = key[0] = 0;
+ sin = (struct sockaddr_in *)&ro->ro_dst;
+ if (m != NULL) {
+ ip = mtod(m, struct ip *);
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = ip->ip_dst;
+ } else
+ *flags &= ~FL_HASH_PORTS;
+
+ key[2] = sin->sin_addr.s_addr;
+
+ if ((*flags & FL_HASH_PORTS) == 0)
+ goto skipports;
+ /*
+ * FL_HASH_PORTS can only still be set when m != NULL (it was
+ * cleared above otherwise), so ip points at the packet's header.
+ */
+ proto = ip->ip_p;
+ iphlen = ip->ip_hl << 2; /* XXX options? */
+ key[1] = ip->ip_src.s_addr;
+
+ switch (proto) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)((caddr_t)ip + iphlen);
+ sport = ntohs(th->th_sport);
+ dport = ntohs(th->th_dport);
+ *flags |= th->th_flags;
+ if (*flags & TH_RST)
+ *flags |= FL_STALE;
+ break;
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)((caddr_t)ip + iphlen);
+ sport = uh->uh_sport;
+ dport = uh->uh_dport;
+ break;
+ case IPPROTO_SCTP:
+ sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+ sport = sh->src_port;
+ dport = sh->dest_port;
+ break;
+ default:
+ if (*flags & FL_HASH_PORTS)
+ goto noop;
+ /* no port - hence not a protocol we care about */
+ break;;
+
+ }
+ *protop = proto;
+
+ /*
+ * If this is a transmit route cache then
+ * hash all flows to a given destination to
+ * the same bucket
+ *
+ * NOTE(review): a clear FL_HASH_PORTS already branched to
+ * skipports above and the flag is not cleared in between, so this
+ * test cannot succeed here and the assignment looks like dead code.
+ */
+ if ((*flags & FL_HASH_PORTS) == 0)
+ proto = sport = dport = 0;
+
+ ((uint16_t *)key)[0] = sport;
+ ((uint16_t *)key)[1] = dport;
+
+skipports:
+ hash = hashword(key, 3, hashjitter + proto);
+ if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
+ m->m_flags |= M_FLOWID;
+ m->m_pkthdr.flowid = hash;
+ }
+
+ return (hash);
+noop:
+ *protop = proto;
+ return (0);
+}
+
+static bitstr_t *
+flowtable_mask(struct flowtable *ft)
+{
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-user
mailing list