svn commit: r206024 - in stable/8/sys: net netinet

Kip Macy kmacy at FreeBSD.org
Thu Apr 1 00:36:41 UTC 2010


Author: kmacy
Date: Thu Apr  1 00:36:40 2010
New Revision: 206024
URL: http://svn.freebsd.org/changeset/base/206024

Log:
  MFC 205066, 205069, 205093, 205097, 205488:
  
  r205066:
  
  Log:
   - restructure flowtable to support ipv6
   - add a name argument to flowtable_alloc for printing with ddb commands
   - extend ddb commands to print destination address or 4-tuples
   - don't parse ports in ulp header if FL_HASH_ALL is not passed
   - add kern_flowtable_insert to enable more generic use of flowtable
     (e.g. system calls for adding entries)
   - don't hash loopback addresses
   - clean up whitespace
   - keep statistics per-cpu for per-cpu flowtables to avoid cache line contention
   - add sysctls to accumulate stats and report aggregate
  
  r205069:
  Log:
   fix stats reporting sysctl
  
  r205093:
  Log:
   re-update copyright to 2010
   pointed out by danfe@
  
  r205097:
  
  Log:
   flowtable_get_hashkey is only used by a DDB function - move under #ifdef DDB
  
   pointed out by jkim@
  
  r205488:
  
  Log:
   - size the ipv4 flowtable and the maximum number of flows at boot time
   - increase flow cleaning frequency and decrease flow caching time
     when near the flow limit
   - stop allocating new flows when within 3% of maxflows; don't start
     allocating again until 12.5% below maxflows

Modified:
  stable/8/sys/net/flowtable.c
  stable/8/sys/net/flowtable.h
  stable/8/sys/net/if_llatbl.c
  stable/8/sys/net/if_llatbl.h
  stable/8/sys/netinet/ip_input.c
  stable/8/sys/netinet/ip_output.c
Directory Properties:
  stable/8/sys/net/   (props changed)

Modified: stable/8/sys/net/flowtable.c
==============================================================================
--- stable/8/sys/net/flowtable.c	Wed Mar 31 23:24:42 2010	(r206023)
+++ stable/8/sys/net/flowtable.c	Thu Apr  1 00:36:40 2010	(r206024)
@@ -1,6 +1,6 @@
 /**************************************************************************
 
-Copyright (c) 2008-2009, BitGravity Inc.
+Copyright (c) 2008-2010, BitGravity Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "opt_route.h"
 #include "opt_mpath.h"
 #include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
@@ -45,6 +47,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
+#include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
@@ -63,6 +66,9 @@ __FBSDID("$FreeBSD$");
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/sctp.h>
@@ -140,31 +146,42 @@ union flentryp {
 	struct flentry		**pcpu[MAXCPU];
 };
 
+struct flowtable_stats {
+	uint64_t	ft_collisions;
+	uint64_t	ft_allocated;
+	uint64_t	ft_misses;
+	uint64_t	ft_max_depth;
+	uint64_t	ft_free_checks;
+	uint64_t	ft_frees;
+	uint64_t	ft_hits;
+	uint64_t	ft_lookups;
+} __aligned(CACHE_LINE_SIZE);
+
 struct flowtable {
+	struct	flowtable_stats ft_stats[MAXCPU];
 	int 		ft_size;
 	int 		ft_lock_count;
 	uint32_t	ft_flags;
-	uint32_t	ft_collisions;
-	uint32_t	ft_allocated;
-	uint32_t	ft_misses;
-	uint64_t	ft_hits;
-
-	uint32_t	ft_udp_idle;
-	uint32_t	ft_fin_wait_idle;
-	uint32_t	ft_syn_idle;
-	uint32_t	ft_tcp_idle;
-
+	char		*ft_name;
 	fl_lock_t	*ft_lock;
 	fl_lock_t 	*ft_unlock;
 	fl_rtalloc_t	*ft_rtalloc;
+	/*
+	 * XXX need to pad out 
+	 */ 
 	struct mtx	*ft_locks;
-
-	
 	union flentryp	ft_table;
 	bitstr_t 	*ft_masks[MAXCPU];
 	bitstr_t	*ft_tmpmask;
 	struct flowtable *ft_next;
-};
+
+	uint32_t	ft_count __aligned(CACHE_LINE_SIZE);
+	uint32_t	ft_udp_idle __aligned(CACHE_LINE_SIZE);
+	uint32_t	ft_fin_wait_idle;
+	uint32_t	ft_syn_idle;
+	uint32_t	ft_tcp_idle;
+	boolean_t	ft_full;
+} __aligned(CACHE_LINE_SIZE);
 
 static struct proc *flowcleanerproc;
 static VNET_DEFINE(struct flowtable *, flow_list_head);
@@ -177,16 +194,30 @@ static VNET_DEFINE(uma_zone_t, flow_ipv6
 #define	V_flow_ipv4_zone	VNET(flow_ipv4_zone)
 #define	V_flow_ipv6_zone	VNET(flow_ipv6_zone)
 
+
 static struct cv 	flowclean_cv;
 static struct mtx	flowclean_lock;
 static uint32_t		flowclean_cycles;
+static uint32_t		flowclean_freq;
+
+#ifdef FLOWTABLE_DEBUG
+#define FLDPRINTF(ft, flags, fmt, ...) 		\
+do {		  				\
+	if ((ft)->ft_flags & (flags))		\
+		printf((fmt), __VA_ARGS__);	\
+} while (0);					\
+
+#else
+#define FLDPRINTF(ft, flags, fmt, ...)
+
+#endif
+
 
 /*
  * TODO:
  * - Make flowtable stats per-cpu, aggregated at sysctl call time,
  *   to avoid extra cache evictions caused by incrementing a shared
  *   counter
- * - add IPv6 support to flow lookup
  * - add sysctls to resize && flush flow tables 
  * - Add per flowtable sysctls for statistics and configuring timeouts
  * - add saturation counter to rtentry to support per-packet load-balancing
@@ -200,29 +231,15 @@ static uint32_t		flowclean_cycles;
  */
 VNET_DEFINE(int, flowtable_enable) = 1;
 static VNET_DEFINE(int, flowtable_debug);
-static VNET_DEFINE(int, flowtable_hits);
-static VNET_DEFINE(int, flowtable_lookups);
-static VNET_DEFINE(int, flowtable_misses);
-static VNET_DEFINE(int, flowtable_frees);
-static VNET_DEFINE(int, flowtable_free_checks);
-static VNET_DEFINE(int, flowtable_max_depth);
-static VNET_DEFINE(int, flowtable_collisions);
 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
-static VNET_DEFINE(int, flowtable_nmbflows) = 4096;
+static VNET_DEFINE(int, flowtable_nmbflows);
 static VNET_DEFINE(int, flowtable_ready) = 0;
 
 #define	V_flowtable_enable		VNET(flowtable_enable)
 #define	V_flowtable_debug		VNET(flowtable_debug)
-#define	V_flowtable_hits		VNET(flowtable_hits)
-#define	V_flowtable_lookups		VNET(flowtable_lookups)
-#define	V_flowtable_misses		VNET(flowtable_misses)
-#define	V_flowtable_frees		VNET(flowtable_frees)
-#define	V_flowtable_free_checks		VNET(flowtable_free_checks)
-#define	V_flowtable_max_depth		VNET(flowtable_max_depth)
-#define	V_flowtable_collisions		VNET(flowtable_collisions)
 #define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
 #define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
 #define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
@@ -235,20 +252,6 @@ SYSCTL_VNET_INT(_net_inet_flowtable, OID
     &VNET_NAME(flowtable_debug), 0, "print debug info.");
 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
-    &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
-    &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
-    &VNET_NAME(flowtable_misses), 0, "#flowtable misses.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
-    &VNET_NAME(flowtable_frees), 0, "#flows freed.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
-    &VNET_NAME(flowtable_free_checks), 0, "#flows free checks.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
-    &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
-    &VNET_NAME(flowtable_collisions), 0, "#flowtable collisions.");
 
 /*
  * XXX This does not end up updating timeouts at runtime
@@ -298,6 +301,77 @@ SYSCTL_VNET_PROC(_net_inet_flowtable, OI
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
     "Maximum number of flows allowed");
 
+
+
+#define FS_PRINT(sb, field)	sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
+
+static void
+fs_print(struct sbuf *sb, struct flowtable_stats *fs)
+{
+
+	FS_PRINT(sb, collisions);
+	FS_PRINT(sb, allocated);
+	FS_PRINT(sb, misses);
+	FS_PRINT(sb, max_depth);
+	FS_PRINT(sb, free_checks);
+	FS_PRINT(sb, frees);
+	FS_PRINT(sb, hits);
+	FS_PRINT(sb, lookups);
+}
+
+static void
+flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
+{
+	int i;
+	struct flowtable_stats fs, *pfs;
+
+	if (ft->ft_flags & FL_PCPU) {
+		bzero(&fs, sizeof(fs));
+		pfs = &fs;
+		for (i = 0; i <= mp_maxid; i++) {
+			if (CPU_ABSENT(i))
+				continue;
+			pfs->ft_collisions  += ft->ft_stats[i].ft_collisions;
+			pfs->ft_allocated   += ft->ft_stats[i].ft_allocated;
+			pfs->ft_misses      += ft->ft_stats[i].ft_misses;
+			pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
+			pfs->ft_frees       += ft->ft_stats[i].ft_frees;
+			pfs->ft_hits        += ft->ft_stats[i].ft_hits;
+			pfs->ft_lookups     += ft->ft_stats[i].ft_lookups;
+			if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
+				pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
+		}
+	} else {
+		pfs = &ft->ft_stats[0];
+	}
+	fs_print(sb, pfs);
+}
+
+static int
+sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
+{
+	struct flowtable *ft;
+	struct sbuf *sb;
+	int error;
+
+	sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
+
+	ft = V_flow_list_head;
+	while (ft != NULL) {
+		sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
+		flowtable_show_stats(sb, ft);
+		ft = ft->ft_next;
+	}
+	sbuf_finish(sb);
+	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+	sbuf_delete(sb);
+
+	return (error);
+}
+SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
+    NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
+
+
 #ifndef RADIX_MPATH
 static void
 in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
@@ -342,52 +416,122 @@ flowtable_pcpu_unlock(struct flowtable *
 #define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
 
-#define FL_STALE (1<<8)
-#define FL_IPV6  (1<<9)
+#define FL_STALE 	(1<<8)
+#define FL_IPV6  	(1<<9)
+#define FL_OVERWRITE	(1<<10)
 
-static uint32_t
-ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
-    uint32_t *key, uint16_t *flags, uint8_t *protop)
+void
+flow_invalidate(struct flentry *fle)
 {
-	uint16_t sport = 0, dport = 0;
-	struct ip *ip = NULL;
-	uint8_t proto = 0;
+
+	fle->f_flags |= FL_STALE;
+}
+
+static __inline int
+proto_to_flags(uint8_t proto)
+{
+	int flag;
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		flag = FL_TCP;
+		break;
+	case IPPROTO_SCTP:
+		flag = FL_SCTP;
+		break;		
+	case IPPROTO_UDP:
+		flag = FL_UDP;
+		break;
+	default:
+		flag = 0;
+		break;
+	}
+
+	return (flag);
+}
+
+static __inline int
+flags_to_proto(int flags)
+{
+	int proto, protoflags;
+
+	protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
+	switch (protoflags) {
+	case FL_TCP:
+		proto = IPPROTO_TCP;
+		break;
+	case FL_SCTP:
+		proto = IPPROTO_SCTP;
+		break;
+	case FL_UDP:
+		proto = IPPROTO_UDP;
+		break;
+	default:
+		proto = 0;
+		break;
+	}
+	return (proto);
+}
+
+#ifdef INET
+#ifdef FLOWTABLE_DEBUG
+static void
+ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
+    struct sockaddr_in *dsin)
+{
+	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
+
+	if (flags & FL_HASH_ALL) {
+		inet_ntoa_r(ssin->sin_addr, saddr);
+		inet_ntoa_r(dsin->sin_addr, daddr);
+		printf("proto=%d %s:%d->%s:%d\n",
+		    proto, saddr, ntohs(ssin->sin_port), daddr,
+		    ntohs(dsin->sin_port));
+	} else {
+		inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
+		printf("proto=%d %s\n", proto, daddr);
+	}
+
+}
+#endif
+
+static int
+ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
+    struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
+{
+	struct ip *ip;
+	uint8_t proto;
 	int iphlen;
-	uint32_t hash;
-	struct sockaddr_in *sin;
 	struct tcphdr *th;
 	struct udphdr *uh;
 	struct sctphdr *sh;
+	uint16_t sport, dport;
 
-	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
-		return (0);
+	proto = sport = dport = 0;
+	ip = mtod(m, struct ip *);
+	dsin->sin_family = AF_INET;
+	dsin->sin_len = sizeof(*dsin);
+	dsin->sin_addr = ip->ip_dst;
+	ssin->sin_family = AF_INET;
+	ssin->sin_len = sizeof(*ssin);
+	ssin->sin_addr = ip->ip_src;	
 
-	key[1] = key[0] = 0;
-	sin = (struct sockaddr_in *)&ro->ro_dst;
-	if (m != NULL) {
-		ip = mtod(m, struct ip *);
-		sin->sin_family = AF_INET;
-		sin->sin_len = sizeof(*sin);
-		sin->sin_addr = ip->ip_dst;
-	} else
-		*flags &= ~FL_HASH_PORTS;
-
-	key[2] = sin->sin_addr.s_addr;
-
-	if ((*flags & FL_HASH_PORTS) == 0)
+	proto = ip->ip_p;
+	if ((*flags & FL_HASH_ALL) == 0) {
+		FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
+		    *flags);
 		goto skipports;
+	}
 
-	proto = ip->ip_p;
 	iphlen = ip->ip_hl << 2; /* XXX options? */
-	key[1] = ip->ip_src.s_addr;
-	
+
 	switch (proto) {
 	case IPPROTO_TCP:
 		th = (struct tcphdr *)((caddr_t)ip + iphlen);
-		sport = ntohs(th->th_sport);
-		dport = ntohs(th->th_dport);
-		*flags |= th->th_flags;
-		if (*flags & TH_RST)
+		sport = th->th_sport;
+		dport = th->th_dport;
+		if ((*flags & FL_HASH_ALL) &&
+		    (th->th_flags & (TH_RST|TH_FIN)))
 			*flags |= FL_STALE;
 	break;
 	case IPPROTO_UDP:
@@ -401,38 +545,288 @@ ipv4_flow_lookup_hash_internal(struct mb
 		dport = sh->dest_port;
 	break;
 	default:
-		if (*flags & FL_HASH_PORTS)
-			goto noop;
+		FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
+		return (ENOTSUP);
 		/* no port - hence not a protocol we care about */
 		break;
 	
 	}
-	*protop = proto;
 
-	/*
-	 * If this is a transmit route cache then 
-	 * hash all flows to a given destination to
-	 * the same bucket
-	 */
-	if ((*flags & FL_HASH_PORTS) == 0)
-		proto = sport = dport = 0;
+skipports:
+	*flags |= proto_to_flags(proto);
+	ssin->sin_port = sport;
+	dsin->sin_port = dport;
+	return (0);
+}
 
-	((uint16_t *)key)[0] = sport;
-	((uint16_t *)key)[1] = dport; 
+static uint32_t
+ipv4_flow_lookup_hash_internal(
+	struct sockaddr_in *ssin, struct sockaddr_in *dsin, 
+	    uint32_t *key, uint16_t flags)
+{
+	uint16_t sport, dport;
+	uint8_t proto;
+	int offset = 0;
 
-skipports:
-	hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
-	if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
-		m->m_flags |= M_FLOWID;
-		m->m_pkthdr.flowid = hash;
+	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
+		return (0);
+	proto = flags_to_proto(flags);
+	sport = dport = key[2] = key[1] = key[0] = 0;
+	if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
+		key[1] = ssin->sin_addr.s_addr;
+		sport = ssin->sin_port;
+	}
+	if (dsin != NULL) {
+		key[2] = dsin->sin_addr.s_addr;
+		dport = dsin->sin_port;
+	}
+	if (flags & FL_HASH_ALL) {
+		((uint16_t *)key)[0] = sport;
+		((uint16_t *)key)[1] = dport; 
+	} else
+		offset = V_flow_hashjitter + proto;
+
+	return (jenkins_hashword(key, 3, offset));
+}
+
+static struct flentry *
+flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
+{
+	struct sockaddr_storage ssa, dsa;
+	uint16_t flags;
+	struct sockaddr_in *dsin, *ssin;
+
+	dsin = (struct sockaddr_in *)&dsa;
+	ssin = (struct sockaddr_in *)&ssa;
+	flags = ft->ft_flags;
+	if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
+		return (NULL);
+
+	return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
+}
+
+void
+flow_to_route(struct flentry *fle, struct route *ro)
+{
+	uint32_t *hashkey = NULL;
+	struct sockaddr_in *sin;
+
+	sin = (struct sockaddr_in *)&ro->ro_dst;
+	sin->sin_family = AF_INET;
+	sin->sin_len = sizeof(*sin);
+	hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
+	sin->sin_addr.s_addr = hashkey[2];
+	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
+	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
+}
+#endif /* INET */
+
+#ifdef INET6
+/*
+ * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
+ * then it sets p to point at the offset "len" in the mbuf. WARNING: the
+ * pointer might become stale after other pullups (but we never use it
+ * this way).
+ */
+#define PULLUP_TO(_len, p, T)						\
+do {									\
+	int x = (_len) + sizeof(T);					\
+	if ((m)->m_len < x) {						\
+		goto receive_failed;					\
+	}								\
+	p = (mtod(m, char *) + (_len));					\
+} while (0)
+
+#define	TCP(p)		((struct tcphdr *)(p))
+#define	SCTP(p)		((struct sctphdr *)(p))
+#define	UDP(p)		((struct udphdr *)(p))
+
+static int
+ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
+    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
+{
+	struct ip6_hdr *ip6;
+	uint8_t proto;
+	int hlen;
+	uint16_t src_port, dst_port;
+	u_short offset;
+	void *ulp;
+
+	offset = hlen = src_port = dst_port = 0;
+	ulp = NULL;
+	ip6 = mtod(m, struct ip6_hdr *);
+	hlen = sizeof(struct ip6_hdr);
+	proto = ip6->ip6_nxt;
+
+	if ((*flags & FL_HASH_ALL) == 0)
+		goto skipports;
+
+	while (ulp == NULL) {
+		switch (proto) {
+		case IPPROTO_ICMPV6:
+		case IPPROTO_OSPFIGP:
+		case IPPROTO_PIM:
+		case IPPROTO_CARP:
+		case IPPROTO_ESP:
+		case IPPROTO_NONE:
+			ulp = ip6;
+			break;
+		case IPPROTO_TCP:
+			PULLUP_TO(hlen, ulp, struct tcphdr);
+			dst_port = TCP(ulp)->th_dport;
+			src_port = TCP(ulp)->th_sport;
+			if ((*flags & FL_HASH_ALL) &&
+			    (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
+				*flags |= FL_STALE;
+			break;
+		case IPPROTO_SCTP:
+			PULLUP_TO(hlen, ulp, struct sctphdr);
+			src_port = SCTP(ulp)->src_port;
+			dst_port = SCTP(ulp)->dest_port;
+			break;
+		case IPPROTO_UDP:
+			PULLUP_TO(hlen, ulp, struct udphdr);
+			dst_port = UDP(ulp)->uh_dport;
+			src_port = UDP(ulp)->uh_sport;
+			break;
+		case IPPROTO_HOPOPTS:	/* RFC 2460 */
+			PULLUP_TO(hlen, ulp, struct ip6_hbh);
+			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+			ulp = NULL;
+			break;
+		case IPPROTO_ROUTING:	/* RFC 2460 */
+			PULLUP_TO(hlen, ulp, struct ip6_rthdr);	
+			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+			ulp = NULL;
+			break;
+		case IPPROTO_FRAGMENT:	/* RFC 2460 */
+			PULLUP_TO(hlen, ulp, struct ip6_frag);
+			hlen += sizeof (struct ip6_frag);
+			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+			    IP6F_OFF_MASK;
+			ulp = NULL;
+			break;
+		case IPPROTO_DSTOPTS:	/* RFC 2460 */
+			PULLUP_TO(hlen, ulp, struct ip6_hbh);
+			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+			ulp = NULL;
+			break;
+		case IPPROTO_AH:	/* RFC 2402 */
+			PULLUP_TO(hlen, ulp, struct ip6_ext);
+			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+			ulp = NULL;
+			break;
+		default:
+			PULLUP_TO(hlen, ulp, struct ip6_ext);
+			break;
+		}
+	}
+
+	if (src_port == 0) {
+	receive_failed:
+		return (ENOTSUP);
 	}
 
-	return (hash);
-noop:
-	*protop = proto;
+skipports:
+	dsin6->sin6_family = AF_INET6;
+	dsin6->sin6_len = sizeof(*dsin6);
+	dsin6->sin6_port = dst_port;
+	memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
+
+	ssin6->sin6_family = AF_INET6;
+	ssin6->sin6_len = sizeof(*ssin6);
+	ssin6->sin6_port = src_port;
+	memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
+	*flags |= proto_to_flags(proto);
+
 	return (0);
 }
 
+#define zero_key(key) 		\
+do {				\
+	key[0] = 0;		\
+	key[1] = 0;		\
+	key[2] = 0;		\
+	key[3] = 0;		\
+	key[4] = 0;		\
+	key[5] = 0;		\
+	key[6] = 0;		\
+	key[7] = 0;		\
+	key[8] = 0;		\
+} while (0)
+	
+static uint32_t
+ipv6_flow_lookup_hash_internal(
+	struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, 
+	    uint32_t *key, uint16_t flags)
+{
+	uint16_t sport, dport;
+	uint8_t proto;
+	int offset = 0;
+
+	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
+		return (0);
+
+	proto = flags_to_proto(flags);
+	zero_key(key);
+	sport = dport = 0;
+	if (dsin6 != NULL) {
+		memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
+		dport = dsin6->sin6_port;
+	}
+	if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
+		memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
+		sport = ssin6->sin6_port;
+	}
+	if (flags & FL_HASH_ALL) {
+		((uint16_t *)key)[0] = sport;
+		((uint16_t *)key)[1] = dport; 
+	} else
+		offset = V_flow_hashjitter + proto;
+
+	return (jenkins_hashword(key, 9, offset));
+}
+
+static struct flentry *
+flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
+{
+	struct sockaddr_storage ssa, dsa;
+	struct sockaddr_in6 *dsin6, *ssin6;	
+	uint16_t flags;
+
+	dsin6 = (struct sockaddr_in6 *)&dsa;
+	ssin6 = (struct sockaddr_in6 *)&ssa;
+	flags = ft->ft_flags;
+	
+	if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
+		return (NULL);
+
+	return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
+}
+
+void
+flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
+{
+	uint32_t *hashkey = NULL;
+	struct sockaddr_in6 *sin6;
+
+	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_len = sizeof(*sin6);
+	hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
+	memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
+	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
+	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
+
+}
+#endif /* INET6 */
+
 static bitstr_t *
 flowtable_mask(struct flowtable *ft)
 {
@@ -511,22 +905,78 @@ flowtable_set_hashkey(struct flentry *fl
 		hashkey[i] = key[i];
 }
 
+static struct flentry *
+flow_alloc(struct flowtable *ft)
+{
+	struct flentry *newfle;
+	uma_zone_t zone;
+
+	newfle = NULL;
+	zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
+
+	newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
+	if (newfle != NULL)
+		atomic_add_int(&ft->ft_count, 1);
+	return (newfle);
+}
+
+static void
+flow_free(struct flentry *fle, struct flowtable *ft)
+{
+	uma_zone_t zone;
+
+	zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
+	atomic_add_int(&ft->ft_count, -1);
+	uma_zfree(zone, fle);
+}
+
+static int
+flow_full(struct flowtable *ft)
+{
+	boolean_t full;
+	uint32_t count;
+	
+	full = ft->ft_full;
+	count = ft->ft_count;
+
+	if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
+		ft->ft_full = FALSE;
+	else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
+		ft->ft_full = TRUE;
+	
+	if (full && !ft->ft_full) {
+		flowclean_freq = 4*hz;
+		if ((ft->ft_flags & FL_HASH_ALL) == 0)
+			ft->ft_udp_idle = ft->ft_fin_wait_idle =
+			    ft->ft_syn_idle = ft->ft_tcp_idle = 5;
+		cv_broadcast(&flowclean_cv);
+	} else if (!full && ft->ft_full) {
+		flowclean_freq = 20*hz;
+		if ((ft->ft_flags & FL_HASH_ALL) == 0)
+			ft->ft_udp_idle = ft->ft_fin_wait_idle =
+			    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
+	}
+
+	return (ft->ft_full);
+}
+
 static int
 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
-    uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags)
+    uint32_t fibnum, struct route *ro, uint16_t flags)
 {
 	struct flentry *fle, *fletail, *newfle, **flep;
+	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
 	int depth;
-	uma_zone_t flezone;
 	bitstr_t *mask;
+	uint8_t proto;
 
-	flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
-	newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
+	newfle = flow_alloc(ft);
 	if (newfle == NULL)
 		return (ENOMEM);
 
 	newfle->f_flags |= (flags & FL_IPV6);
-	
+	proto = flags_to_proto(flags);
+
 	FL_ENTRY_LOCK(ft, hash);
 	mask = flowtable_mask(ft);
 	flep = flowtable_entry(ft, hash);
@@ -539,7 +989,7 @@ flowtable_insert(struct flowtable *ft, u
 	} 
 	
 	depth = 0;
-	V_flowtable_collisions++;
+	fs->ft_collisions++;
 	/*
 	 * find end of list and make sure that we were not
 	 * preempted by another thread handling this flow
@@ -551,8 +1001,10 @@ flowtable_insert(struct flowtable *ft, u
 			 * or we lost a race to insert
 			 */
 			FL_ENTRY_UNLOCK(ft, hash);
-			uma_zfree((newfle->f_flags & FL_IPV6) ?
-			    V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
+			flow_free(newfle, ft);
+			
+			if (flags & FL_OVERWRITE) 
+				goto skip;
 			return (EEXIST);
 		}
 		/*
@@ -565,8 +1017,8 @@ flowtable_insert(struct flowtable *ft, u
 		fle = fle->f_next;
 	} 
 
-	if (depth > V_flowtable_max_depth)
-		V_flowtable_max_depth = depth;
+	if (depth > fs->ft_max_depth)
+		fs->ft_max_depth = depth;
 	fletail->f_next = newfle;
 	fle = newfle;
 skip:
@@ -582,6 +1034,35 @@ skip:
 	return (0);
 }
 
+int
+kern_flowtable_insert(struct flowtable *ft,
+    struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
+    struct route *ro, uint32_t fibnum, int flags)
+{
+	uint32_t key[9], hash;
+
+	flags = (ft->ft_flags | flags | FL_OVERWRITE);
+	hash = 0;
+
+#ifdef INET
+	if (ssa->ss_family == AF_INET) 
+		hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
+		    (struct sockaddr_in *)dsa, key, flags);
+#endif
+#ifdef INET6
+	if (ssa->ss_family == AF_INET6) 
+		hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
+		    (struct sockaddr_in6 *)dsa, key, flags);
+#endif	
+	if (ro->ro_rt == NULL || ro->ro_lle == NULL)
+		return (EINVAL);
+
+	FLDPRINTF(ft, FL_DEBUG,
+	    "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
+	    key[0], key[1], key[2], hash, fibnum, flags);
+	return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
+}
+
 static int
 flowtable_key_equal(struct flentry *fle, uint32_t *key)
 {
@@ -595,7 +1076,7 @@ flowtable_key_equal(struct flentry *fle,
 		nwords = 3;
 		hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
 	}
-	
+
 	for (i = 0; i < nwords; i++) 
 		if (hashkey[i] != key[i])
 			return (0);
@@ -603,44 +1084,86 @@ flowtable_key_equal(struct flentry *fle,
 	return (1);
 }
 
-int
-flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, uint32_t fibnum)
+struct flentry *
+flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
+{
+	struct flentry *fle = NULL;
+
+#ifdef INET
+	if (af == AF_INET)
+		fle = flowtable_lookup_mbuf4(ft, m);
+#endif
+#ifdef INET6
+	if (af == AF_INET6)
+		fle = flowtable_lookup_mbuf6(ft, m);
+#endif	
+	if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
+		m->m_flags |= M_FLOWID;
+		m->m_pkthdr.flowid = fle->f_fhash;
+	}
+	return (fle);
+}
+	
+struct flentry *
+flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
+    struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
 {
 	uint32_t key[9], hash;
 	struct flentry *fle;
-	uint16_t flags;
+	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
 	uint8_t proto = 0;
 	int error = 0;
 	struct rtentry *rt;
 	struct llentry *lle;
+	struct route sro, *ro;
+	struct route_in6 sro6;
 
-	flags = ft->ft_flags;
-	ro->ro_rt = NULL;
-	ro->ro_lle = NULL;
+	sro.ro_rt = sro6.ro_rt = NULL;
+	sro.ro_lle = sro6.ro_lle = NULL;
+	ro = NULL;
+	hash = 0;
+	flags |= ft->ft_flags;
+	proto = flags_to_proto(flags);
+#ifdef INET
+	if (ssa->ss_family == AF_INET) {
+		struct sockaddr_in *ssin, *dsin;
+
+		ro = &sro;
+		memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
+		dsin = (struct sockaddr_in *)dsa;
+		ssin = (struct sockaddr_in *)ssa;
+		if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
+		    (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+		    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
+			return (NULL);
 
-	/*
-	 * The internal hash lookup is the only IPv4 specific bit
-	 * remaining
-	 *
-	 * XXX BZ: to add IPv6 support just add a check for the
-	 * address type in m and ro and an equivalent ipv6 lookup
-	 * function - the rest of the code should automatically
-	 * handle an ipv6 flow (note that m can be NULL in which
-	 * case ro will be set)
-	 */
-	hash = ipv4_flow_lookup_hash_internal(m, ro, key,
-	    &flags, &proto);
+		hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
+	}
+#endif
+#ifdef INET6
+	if (ssa->ss_family == AF_INET6) {
+		struct sockaddr_in6 *ssin6, *dsin6;
+
+		ro = (struct route *)&sro6;
+		memcpy(&sro6.ro_dst, dsa,
+		    sizeof(struct sockaddr_in6));
+		dsin6 = (struct sockaddr_in6 *)dsa;
+		ssin6 = (struct sockaddr_in6 *)ssa;
 
+		flags |= FL_IPV6;
+		hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
+	}
+#endif
 	/*
 	 * Ports are zero and this isn't a transmit cache
 	 * - thus not a protocol for which we need to keep 
 	 * state
-	 * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
+	 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
 	 */
-	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
-		return (ENOENT);
+	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
+		return (NULL);
 
-	V_flowtable_lookups++;
+	fs->ft_lookups++;
 	FL_ENTRY_LOCK(ft, hash);
 	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
 		FL_ENTRY_UNLOCK(ft, hash);
@@ -656,21 +1179,21 @@ keycheck:	
 	    && (fibnum == fle->f_fibnum)
 	    && (rt->rt_flags & RTF_UP)
 	    && (rt->rt_ifp != NULL)) {
-		V_flowtable_hits++;
+		fs->ft_hits++;
 		fle->f_uptime = time_uptime;
 		fle->f_flags |= flags;
-		ro->ro_rt = rt;
-		ro->ro_lle = lle;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list