svn commit: r243707 - head/sys/netpfil/ipfw

Alexander V. Chernikov melifaro at FreeBSD.org
Fri Nov 30 16:33:22 UTC 2012


Author: melifaro
Date: Fri Nov 30 16:33:22 2012
New Revision: 243707
URL: http://svnweb.freebsd.org/changeset/base/243707

Log:
  Make ipfw dynamic states operations SMP-ready.
  
  * Global IPFW_DYN_LOCK() is changed to per-bucket mutex.
  * State expiration is done in ipfw_tick every second.
  * No expiration is done on forwarding path.
  * hash table resize is done automatically and does not flush all states.
  * Dynamic UMA zone is now allocated per each VNET
  * State limiting is now done via UMA(9) api.
  
  Discussed with:	ipfw
  MFC after:	3 weeks
  Sponsored by:	Yandex LLC

Modified:
  head/sys/netpfil/ipfw/ip_fw2.c
  head/sys/netpfil/ipfw/ip_fw_dynamic.c
  head/sys/netpfil/ipfw/ip_fw_private.h
  head/sys/netpfil/ipfw/ip_fw_sockopt.c

Modified: head/sys/netpfil/ipfw/ip_fw2.c
==============================================================================
--- head/sys/netpfil/ipfw/ip_fw2.c	Fri Nov 30 16:18:26 2012	(r243706)
+++ head/sys/netpfil/ipfw/ip_fw2.c	Fri Nov 30 16:33:22 2012	(r243707)
@@ -2046,7 +2046,7 @@ do {								\
 						f->rulenum, f->id);
 					cmd = ACTION_PTR(f);
 					l = f->cmd_len - f->act_ofs;
-					ipfw_dyn_unlock();
+					ipfw_dyn_unlock(q);
 					cmdlen = 0;
 					match = 1;
 					break;
@@ -2525,7 +2525,6 @@ ipfw_init(void)
 {
 	int error = 0;
 
-	ipfw_dyn_attach();
 	/*
  	 * Only print out this stuff the first time around,
 	 * when called from the sysinit code.
@@ -2579,7 +2578,6 @@ ipfw_destroy(void)
 {
 
 	ipfw_log_bpf(0); /* uninit */
-	ipfw_dyn_detach();
 	printf("IP firewall unloaded\n");
 }
 
@@ -2637,7 +2635,7 @@ vnet_ipfw_init(const void *unused)
 	chain->id = rule->id = 1;
 
 	IPFW_LOCK_INIT(chain);
-	ipfw_dyn_init();
+	ipfw_dyn_init(chain);
 
 	/* First set up some values that are compile time options */
 	V_ipfw_vnet_ready = 1;		/* Open for business */

Modified: head/sys/netpfil/ipfw/ip_fw_dynamic.c
==============================================================================
--- head/sys/netpfil/ipfw/ip_fw_dynamic.c	Fri Nov 30 16:18:26 2012	(r243706)
+++ head/sys/netpfil/ipfw/ip_fw_dynamic.c	Fri Nov 30 16:33:22 2012	(r243707)
@@ -95,7 +95,7 @@ __FBSDID("$FreeBSD$");
  * The lifetime of dynamic rules is regulated by dyn_*_lifetime,
  * measured in seconds and depending on the flags.
  *
- * The total number of dynamic rules is stored in dyn_count.
+ * The total number of dynamic rules is equal to UMA zone items count.
  * The max number of dynamic rules is dyn_max. When we reach
  * the maximum number of rules we do not create anymore. This is
  * done to avoid consuming too much memory, but also too much
@@ -111,37 +111,33 @@ __FBSDID("$FreeBSD$");
  * passes through the firewall. XXX check the latter!!!
  */
 
+struct ipfw_dyn_bucket {
+	struct mtx	mtx;		/* Bucket protecting lock */
+	ipfw_dyn_rule	*head;		/* Pointer to first rule */
+};
+
 /*
  * Static variables followed by global ones
  */
-static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v);
-static VNET_DEFINE(u_int32_t, dyn_buckets);
+static VNET_DEFINE(struct ipfw_dyn_bucket *, ipfw_dyn_v);
+static VNET_DEFINE(u_int32_t, dyn_buckets_max);
 static VNET_DEFINE(u_int32_t, curr_dyn_buckets);
 static VNET_DEFINE(struct callout, ipfw_timeout);
 #define	V_ipfw_dyn_v			VNET(ipfw_dyn_v)
-#define	V_dyn_buckets			VNET(dyn_buckets)
+#define	V_dyn_buckets_max		VNET(dyn_buckets_max)
 #define	V_curr_dyn_buckets		VNET(curr_dyn_buckets)
 #define V_ipfw_timeout                  VNET(ipfw_timeout)
 
-static uma_zone_t ipfw_dyn_rule_zone;
-#ifndef __FreeBSD__
-DEFINE_SPINLOCK(ipfw_dyn_mtx);
-#else
-static struct mtx ipfw_dyn_mtx;		/* mutex guarding dynamic rules */
-#endif
-
-#define	IPFW_DYN_LOCK_INIT() \
-	mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF)
-#define	IPFW_DYN_LOCK_DESTROY()	mtx_destroy(&ipfw_dyn_mtx)
-#define	IPFW_DYN_LOCK()		mtx_lock(&ipfw_dyn_mtx)
-#define	IPFW_DYN_UNLOCK()	mtx_unlock(&ipfw_dyn_mtx)
-#define	IPFW_DYN_LOCK_ASSERT()	mtx_assert(&ipfw_dyn_mtx, MA_OWNED)
+static VNET_DEFINE(uma_zone_t, ipfw_dyn_rule_zone);
+#define	V_ipfw_dyn_rule_zone		VNET(ipfw_dyn_rule_zone)
 
-void
-ipfw_dyn_unlock(void)
-{
-	IPFW_DYN_UNLOCK();
-}
+#define	IPFW_BUCK_LOCK_INIT(b)	\
+	mtx_init(&(b)->mtx, "IPFW dynamic bucket", NULL, MTX_DEF)
+#define	IPFW_BUCK_LOCK_DESTROY(b)	\
+	mtx_destroy(&(b)->mtx)
+#define	IPFW_BUCK_LOCK(i)	mtx_lock(&V_ipfw_dyn_v[(i)].mtx)
+#define	IPFW_BUCK_UNLOCK(i)	mtx_unlock(&V_ipfw_dyn_v[(i)].mtx)
+#define	IPFW_BUCK_ASSERT(i)	mtx_assert(&V_ipfw_dyn_v[(i)].mtx, MA_OWNED)
 
 /*
  * Timeouts for various events in handing dynamic rules.
@@ -171,33 +167,42 @@ static VNET_DEFINE(u_int32_t, dyn_short_
 static VNET_DEFINE(u_int32_t, dyn_keepalive_interval);
 static VNET_DEFINE(u_int32_t, dyn_keepalive_period);
 static VNET_DEFINE(u_int32_t, dyn_keepalive);
+static VNET_DEFINE(time_t, dyn_keepalive_last);
 
 #define	V_dyn_keepalive_interval	VNET(dyn_keepalive_interval)
 #define	V_dyn_keepalive_period		VNET(dyn_keepalive_period)
 #define	V_dyn_keepalive			VNET(dyn_keepalive)
+#define	V_dyn_keepalive_last		VNET(dyn_keepalive_last)
 
-static VNET_DEFINE(u_int32_t, dyn_count);	/* # of dynamic rules */
 static VNET_DEFINE(u_int32_t, dyn_max);		/* max # of dynamic rules */
 
-#define	V_dyn_count			VNET(dyn_count)
+#define	DYN_COUNT			uma_zone_get_cur(V_ipfw_dyn_rule_zone)
 #define	V_dyn_max			VNET(dyn_max)
 
+static int last_log;	/* Log ratelimiting */
+
+static void ipfw_dyn_tick(void *vnetx);
+static void check_dyn_rules(struct ip_fw_chain *, struct ip_fw *,
+    int, int, int);
 #ifdef SYSCTL_NODE
 
+static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS);
+static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS);
+
 SYSBEGIN(f2)
 
 SYSCTL_DECL(_net_inet_ip_fw);
 SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets,
-    CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0,
-    "Number of dyn. buckets");
+    CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0,
+    "Max number of dyn. buckets");
 SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets,
     CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0,
     "Current Number of dyn. buckets");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_count,
-    CTLFLAG_RD, &VNET_NAME(dyn_count), 0,
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count,
+    CTLTYPE_UINT|CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU",
     "Number of dyn. rules");
-SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_max,
-    CTLFLAG_RW, &VNET_NAME(dyn_max), 0,
+SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max,
+    CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU",
     "Max number of dyn. rules");
 SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime,
     CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0,
@@ -244,7 +249,7 @@ hash_packet6(struct ipfw_flow_id *id)
  * and we want to find both in the same bucket.
  */
 static __inline int
-hash_packet(struct ipfw_flow_id *id)
+hash_packet(struct ipfw_flow_id *id, int buckets)
 {
 	u_int32_t i;
 
@@ -254,7 +259,7 @@ hash_packet(struct ipfw_flow_id *id)
 	else
 #endif /* INET6 */
 	i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port);
-	i &= (V_curr_dyn_buckets - 1);
+	i &= (buckets - 1);
 	return i;
 }
 
@@ -286,124 +291,19 @@ print_dyn_rule_flags(struct ipfw_flow_id
 	}
 	log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n",
 	    prefix, dyn_type, src, id->src_port, dst,
-	    id->dst_port, V_dyn_count, postfix);
+	    id->dst_port, DYN_COUNT, postfix);
 }
 
 #define	print_dyn_rule(id, dtype, prefix, postfix)	\
 	print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix)
 
-/**
- * unlink a dynamic rule from a chain. prev is a pointer to
- * the previous one, q is a pointer to the rule to delete,
- * head is a pointer to the head of the queue.
- * Modifies q and potentially also head.
- */
-#define UNLINK_DYN_RULE(prev, head, q) {				\
-	ipfw_dyn_rule *old_q = q;					\
-									\
-	/* remove a refcount to the parent */				\
-	if (q->dyn_type == O_LIMIT)					\
-		q->parent->count--;					\
-	V_dyn_count--;							\
-	DEB(print_dyn_rule(&q->id, q->dyn_type, "unlink entry", "left");) \
-	if (prev != NULL)						\
-		prev->next = q = q->next;				\
-	else								\
-		head = q = q->next;					\
-	uma_zfree(ipfw_dyn_rule_zone, old_q); }
-
 #define TIME_LEQ(a,b)       ((int)((a)-(b)) <= 0)
 
-/**
- * Remove dynamic rules pointing to "rule", or all of them if rule == NULL.
- *
- * If keep_me == NULL, rules are deleted even if not expired,
- * otherwise only expired rules are removed.
- *
- * The value of the second parameter is also used to point to identify
- * a rule we absolutely do not want to remove (e.g. because we are
- * holding a reference to it -- this is the case with O_LIMIT_PARENT
- * rules). The pointer is only used for comparison, so any non-null
- * value will do.
- */
-static void
-remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me)
-{
-	static u_int32_t last_remove = 0;
-
-#define FORCE (keep_me == NULL)
-
-	ipfw_dyn_rule *prev, *q;
-	int i, pass = 0, max_pass = 0;
-
-	IPFW_DYN_LOCK_ASSERT();
-
-	if (V_ipfw_dyn_v == NULL || V_dyn_count == 0)
-		return;
-	/* do not expire more than once per second, it is useless */
-	if (!FORCE && last_remove == time_uptime)
-		return;
-	last_remove = time_uptime;
-
-	/*
-	 * because O_LIMIT refer to parent rules, during the first pass only
-	 * remove child and mark any pending LIMIT_PARENT, and remove
-	 * them in a second pass.
-	 */
-next_pass:
-	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
-		for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) {
-			/*
-			 * Logic can become complex here, so we split tests.
-			 */
-			if (q == keep_me)
-				goto next;
-			if (rule != NULL && rule != q->rule)
-				goto next; /* not the one we are looking for */
-			if (q->dyn_type == O_LIMIT_PARENT) {
-				/*
-				 * handle parent in the second pass,
-				 * record we need one.
-				 */
-				max_pass = 1;
-				if (pass == 0)
-					goto next;
-				if (FORCE && q->count != 0 ) {
-					/* XXX should not happen! */
-					printf("ipfw: OUCH! cannot remove rule,"
-					     " count %d\n", q->count);
-				}
-			} else {
-				if (!FORCE &&
-				    !TIME_LEQ( q->expire, time_uptime ))
-					goto next;
-			}
-             if (q->dyn_type != O_LIMIT_PARENT || !q->count) {
-                     UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
-                     continue;
-             }
-next:
-			prev=q;
-			q=q->next;
-		}
-	}
-	if (pass++ < max_pass)
-		goto next_pass;
-}
-
-void
-ipfw_remove_dyn_children(struct ip_fw *rule)
-{
-	IPFW_DYN_LOCK();
-	remove_dyn_rule(rule, NULL /* force removal */);
-	IPFW_DYN_UNLOCK();
-}
-
 /*
  * Lookup a dynamic rule, locked version.
  */
 static ipfw_dyn_rule *
-lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction,
+lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int i, int *match_direction,
     struct tcphdr *tcp)
 {
 	/*
@@ -414,23 +314,17 @@ lookup_dyn_rule_locked(struct ipfw_flow_
 #define MATCH_FORWARD	1
 #define MATCH_NONE	2
 #define MATCH_UNKNOWN	3
-	int i, dir = MATCH_NONE;
+	int dir = MATCH_NONE;
 	ipfw_dyn_rule *prev, *q = NULL;
 
-	IPFW_DYN_LOCK_ASSERT();
+	IPFW_BUCK_ASSERT(i);
 
-	if (V_ipfw_dyn_v == NULL)
-		goto done;				/* not found */
-	i = hash_packet(pkt);
-	for (prev = NULL, q = V_ipfw_dyn_v[i]; q != NULL;) {
+	for (prev = NULL, q = V_ipfw_dyn_v[i].head; q; prev = q, q = q->next) {
 		if (q->dyn_type == O_LIMIT_PARENT && q->count)
-			goto next;
-		if (TIME_LEQ(q->expire, time_uptime)) {	/* expire entry */
-			UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q);
 			continue;
-		}
+
 		if (pkt->proto != q->id.proto || q->dyn_type == O_LIMIT_PARENT)
-			goto next;
+			continue;
 
 		if (IS_IP6_FLOW_ID(pkt)) {
 			if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) &&
@@ -463,17 +357,14 @@ lookup_dyn_rule_locked(struct ipfw_flow_
 				break;
 			}
 		}
-next:
-		prev = q;
-		q = q->next;
 	}
 	if (q == NULL)
 		goto done;	/* q = NULL, not found */
 
 	if (prev != NULL) {	/* found and not in front */
 		prev->next = q->next;
-		q->next = V_ipfw_dyn_v[i];
-		V_ipfw_dyn_v[i] = q;
+		q->next = V_ipfw_dyn_v[i].head;
+		V_ipfw_dyn_v[i].head = q;
 	}
 	if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */
 		uint32_t ack;
@@ -556,42 +447,108 @@ ipfw_lookup_dyn_rule(struct ipfw_flow_id
     struct tcphdr *tcp)
 {
 	ipfw_dyn_rule *q;
+	int i;
 
-	IPFW_DYN_LOCK();
-	q = lookup_dyn_rule_locked(pkt, match_direction, tcp);
+	i = hash_packet(pkt, V_curr_dyn_buckets);
+
+	IPFW_BUCK_LOCK(i);
+	q = lookup_dyn_rule_locked(pkt, i, match_direction, tcp);
 	if (q == NULL)
-		IPFW_DYN_UNLOCK();
+		IPFW_BUCK_UNLOCK(i);
 	/* NB: return table locked when q is not NULL */
 	return q;
 }
 
-static void
-realloc_dynamic_table(void)
+/*
+ * Unlock bucket mtx
+ * @p - pointer to dynamic rule
+ */
+void
+ipfw_dyn_unlock(ipfw_dyn_rule *q)
+{
+
+	IPFW_BUCK_UNLOCK(q->bucket);
+}
+
+static int
+resize_dynamic_table(struct ip_fw_chain *chain, int nbuckets)
 {
-	IPFW_DYN_LOCK_ASSERT();
+	int i, k, nbuckets_old;
+	ipfw_dyn_rule *q;
+	struct ipfw_dyn_bucket *dyn_v, *dyn_v_old;
+
+	/* Check if given number is power of 2 and less than 64k */
+	if ((nbuckets > 65536) || (!powerof2(nbuckets)))
+		return 1;
+
+	CTR3(KTR_NET, "%s: resize dynamic hash: %d -> %d", __func__,
+	    V_curr_dyn_buckets, nbuckets);
+
+	/* Allocate and initialize new hash */
+	dyn_v = malloc(nbuckets * sizeof(ipfw_dyn_rule), M_IPFW,
+	    M_WAITOK | M_ZERO);
+
+	for (i = 0 ; i < nbuckets; i++)
+		IPFW_BUCK_LOCK_INIT(&dyn_v[i]);
 
 	/*
-	 * Try reallocation, make sure we have a power of 2 and do
-	 * not allow more than 64k entries. In case of overflow,
-	 * default to 1024.
+	 * Call upper half lock, as get_map() do to ease
+	 * read-only access to dynamic rules hash from sysctl
 	 */
+	IPFW_UH_WLOCK(chain);
 
-	if (V_dyn_buckets > 65536)
-		V_dyn_buckets = 1024;
-	if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */
-		V_dyn_buckets = V_curr_dyn_buckets; /* reset */
-		return;
+	/*
+	 * Acquire chain write lock to permit hash access
+	 * for main traffic path without additional locks
+	 */
+	IPFW_WLOCK(chain);
+
+	/* Save old values */
+	nbuckets_old = V_curr_dyn_buckets;
+	dyn_v_old = V_ipfw_dyn_v;
+
+	/* Skip relinking if array is not set up */
+	if (V_ipfw_dyn_v == NULL)
+		V_curr_dyn_buckets = 0;
+
+	/* Re-link all dynamic states */
+	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
+		while (V_ipfw_dyn_v[i].head != NULL) {
+			/* Remove from current chain */
+			q = V_ipfw_dyn_v[i].head;
+			V_ipfw_dyn_v[i].head = q->next;
+
+			/* Get new hash value */
+			k = hash_packet(&q->id, nbuckets);
+			q->bucket = k;
+			/* Add to the new head */
+			q->next = dyn_v[k].head;
+			dyn_v[k].head = q;
+             }
 	}
-	V_curr_dyn_buckets = V_dyn_buckets;
-	if (V_ipfw_dyn_v != NULL)
-		free(V_ipfw_dyn_v, M_IPFW);
-	for (;;) {
-		V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *),
-		       M_IPFW, M_NOWAIT | M_ZERO);
-		if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2)
-			break;
-		V_curr_dyn_buckets /= 2;
+
+	/* Update current pointers/buckets values */
+	V_curr_dyn_buckets = nbuckets;
+	V_ipfw_dyn_v = dyn_v;
+
+	IPFW_WUNLOCK(chain);
+
+	IPFW_UH_WUNLOCK(chain);
+
+	/* Start periodic callout on initial creation */
+	if (dyn_v_old == NULL) {
+        	callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, curvnet, 0);
+		return (0);
 	}
+
+	/* Destroy all mutexes */
+	for (i = 0 ; i < nbuckets_old ; i++)
+		IPFW_BUCK_LOCK_DESTROY(&dyn_v_old[i]);
+
+	/* Free old hash */
+	free(dyn_v_old, M_IPFW);
+
+	return 0;
 }
 
 /**
@@ -605,33 +562,30 @@ realloc_dynamic_table(void)
  * - "parent" rules for the above (O_LIMIT_PARENT).
  */
 static ipfw_dyn_rule *
-add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule)
+add_dyn_rule(struct ipfw_flow_id *id, int i, u_int8_t dyn_type, struct ip_fw *rule)
 {
 	ipfw_dyn_rule *r;
-	int i;
 
-	IPFW_DYN_LOCK_ASSERT();
+	IPFW_BUCK_ASSERT(i);
 
-	if (V_ipfw_dyn_v == NULL ||
-	    (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) {
-		realloc_dynamic_table();
-		if (V_ipfw_dyn_v == NULL)
-			return NULL; /* failed ! */
-	}
-	i = hash_packet(id);
-
-	r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
+	r = uma_zalloc(V_ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO);
 	if (r == NULL) {
-		printf ("ipfw: sorry cannot allocate state\n");
+		if (last_log != time_uptime) {
+			last_log = time_uptime;
+			log(LOG_DEBUG, "ipfw: %s: Cannot allocate rule\n",
+			    __func__);
+		}
 		return NULL;
 	}
 
-	/* increase refcount on parent, and set pointer */
+	/*
+	 * refcount on parent is already incremented, so
+	 * it is safe to use parent unlocked.
+	 */
 	if (dyn_type == O_LIMIT) {
 		ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule;
 		if ( parent->dyn_type != O_LIMIT_PARENT)
 			panic("invalid parent");
-		parent->count++;
 		r->parent = parent;
 		rule = parent->rule;
 	}
@@ -644,9 +598,8 @@ add_dyn_rule(struct ipfw_flow_id *id, u_
 	r->count = 0;
 
 	r->bucket = i;
-	r->next = V_ipfw_dyn_v[i];
-	V_ipfw_dyn_v[i] = r;
-	V_dyn_count++;
+	r->next = V_ipfw_dyn_v[i].head;
+	V_ipfw_dyn_v[i].head = r;
 	DEB(print_dyn_rule(id, dyn_type, "add dyn entry", "total");)
 	return r;
 }
@@ -656,40 +609,40 @@ add_dyn_rule(struct ipfw_flow_id *id, u_
  * If the lookup fails, then install one.
  */
 static ipfw_dyn_rule *
-lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule)
+lookup_dyn_parent(struct ipfw_flow_id *pkt, int *pindex, struct ip_fw *rule)
 {
 	ipfw_dyn_rule *q;
-	int i;
+	int i, is_v6;
 
-	IPFW_DYN_LOCK_ASSERT();
+	is_v6 = IS_IP6_FLOW_ID(pkt);
+	i = hash_packet( pkt, V_curr_dyn_buckets );
+	*pindex = i;
+	IPFW_BUCK_LOCK(i);
+	for (q = V_ipfw_dyn_v[i].head ; q != NULL ; q=q->next)
+		if (q->dyn_type == O_LIMIT_PARENT &&
+		    rule== q->rule &&
+		    pkt->proto == q->id.proto &&
+		    pkt->src_port == q->id.src_port &&
+		    pkt->dst_port == q->id.dst_port &&
+		    (
+			(is_v6 &&
+			 IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
+				&(q->id.src_ip6)) &&
+			 IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
+				&(q->id.dst_ip6))) ||
+			(!is_v6 &&
+			 pkt->src_ip == q->id.src_ip &&
+			 pkt->dst_ip == q->id.dst_ip)
+		    )
+		) {
+			q->expire = time_uptime + V_dyn_short_lifetime;
+			DEB(print_dyn_rule(pkt, q->dyn_type,
+			    "lookup_dyn_parent found", "");)
+			return q;
+		}
 
-	if (V_ipfw_dyn_v) {
-		int is_v6 = IS_IP6_FLOW_ID(pkt);
-		i = hash_packet( pkt );
-		for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next)
-			if (q->dyn_type == O_LIMIT_PARENT &&
-			    rule== q->rule &&
-			    pkt->proto == q->id.proto &&
-			    pkt->src_port == q->id.src_port &&
-			    pkt->dst_port == q->id.dst_port &&
-			    (
-				(is_v6 &&
-				 IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6),
-					&(q->id.src_ip6)) &&
-				 IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6),
-					&(q->id.dst_ip6))) ||
-				(!is_v6 &&
-				 pkt->src_ip == q->id.src_ip &&
-				 pkt->dst_ip == q->id.dst_ip)
-			    )
-			) {
-				q->expire = time_uptime + V_dyn_short_lifetime;
-				DEB(print_dyn_rule(pkt, q->dyn_type,
-				    "lookup_dyn_parent found", "");)
-				return q;
-			}
-	}
-	return add_dyn_rule(pkt, O_LIMIT_PARENT, rule);
+	/* Add virtual limiting rule */
+	return add_dyn_rule(pkt, i, O_LIMIT_PARENT, rule);
 }
 
 /**
@@ -702,14 +655,16 @@ int
 ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd,
     struct ip_fw_args *args, uint32_t tablearg)
 {
-	static int last_log;
 	ipfw_dyn_rule *q;
+	int i;
 
 	DEB(print_dyn_rule(&args->f_id, cmd->o.opcode, "install_state", "");)
+	
+	i = hash_packet(&args->f_id, V_curr_dyn_buckets);
 
-	IPFW_DYN_LOCK();
+	IPFW_BUCK_LOCK(i);
 
-	q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+	q = lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL);
 
 	if (q != NULL) {	/* should never occur */
 		DEB(
@@ -718,26 +673,20 @@ ipfw_install_state(struct ip_fw *rule, i
 			printf("ipfw: %s: entry already present, done\n",
 			    __func__);
 		})
-		IPFW_DYN_UNLOCK();
+		IPFW_BUCK_UNLOCK(i);
 		return (0);
 	}
 
-	if (V_dyn_count >= V_dyn_max)
-		/* Run out of slots, try to remove any expired rule. */
-		remove_dyn_rule(NULL, (ipfw_dyn_rule *)1);
-
-	if (V_dyn_count >= V_dyn_max) {
-		if (last_log != time_uptime) {
-			last_log = time_uptime;
-			printf("ipfw: %s: Too many dynamic rules\n", __func__);
-		}
-		IPFW_DYN_UNLOCK();
-		return (1);	/* cannot install, notify caller */
-	}
+	/*
+	 * State limiting is done via uma(9) zone limiting.
+	 * Save pointer to newly-installed rule and reject
+	 * packet if add_dyn_rule() returned NULL.
+	 * Note q is currently set to NULL.
+	 */
 
 	switch (cmd->o.opcode) {
 	case O_KEEP_STATE:	/* bidir rule */
-		add_dyn_rule(&args->f_id, O_KEEP_STATE, rule);
+		q = add_dyn_rule(&args->f_id, i, O_KEEP_STATE, rule);
 		break;
 
 	case O_LIMIT: {		/* limit number of sessions */
@@ -745,6 +694,7 @@ ipfw_install_state(struct ip_fw *rule, i
 		ipfw_dyn_rule *parent;
 		uint32_t conn_limit;
 		uint16_t limit_mask = cmd->limit_mask;
+		int pindex;
 
 		conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ?
 		    tablearg : cmd->conn_limit;
@@ -778,46 +728,65 @@ ipfw_install_state(struct ip_fw *rule, i
 			id.src_port = args->f_id.src_port;
 		if (limit_mask & DYN_DST_PORT)
 			id.dst_port = args->f_id.dst_port;
-		if ((parent = lookup_dyn_parent(&id, rule)) == NULL) {
+
+		/*
+		 * We have to release lock for previous bucket to
+		 * avoid possible deadlock
+		 */
+		IPFW_BUCK_UNLOCK(i);
+
+		if ((parent = lookup_dyn_parent(&id, &pindex, rule)) == NULL) {
 			printf("ipfw: %s: add parent failed\n", __func__);
-			IPFW_DYN_UNLOCK();
+			IPFW_BUCK_UNLOCK(pindex);
 			return (1);
 		}
 
 		if (parent->count >= conn_limit) {
-			/* See if we can remove some expired rule. */
-			remove_dyn_rule(rule, parent);
-			if (parent->count >= conn_limit) {
-				if (V_fw_verbose && last_log != time_uptime) {
-					last_log = time_uptime;
-					char sbuf[24];
-					last_log = time_uptime;
-					snprintf(sbuf, sizeof(sbuf),
-					    "%d drop session",
-					    parent->rule->rulenum);
-					print_dyn_rule_flags(&args->f_id,
-					    cmd->o.opcode,
-					    LOG_SECURITY | LOG_DEBUG,
-					    sbuf, "too many entries");
-				}
-				IPFW_DYN_UNLOCK();
-				return (1);
+			if (V_fw_verbose && last_log != time_uptime) {
+				last_log = time_uptime;
+				char sbuf[24];
+				last_log = time_uptime;
+				snprintf(sbuf, sizeof(sbuf),
+				    "%d drop session",
+				    parent->rule->rulenum);
+				print_dyn_rule_flags(&args->f_id,
+				    cmd->o.opcode,
+				    LOG_SECURITY | LOG_DEBUG,
+				    sbuf, "too many entries");
 			}
+			IPFW_BUCK_UNLOCK(pindex);
+			return (1);
+		}
+		/* Increment counter on parent */
+		parent->count++;
+		IPFW_BUCK_UNLOCK(pindex);
+
+		IPFW_BUCK_LOCK(i);
+		q = add_dyn_rule(&args->f_id, i, O_LIMIT, (struct ip_fw *)parent);
+		if (q == NULL) {
+			/* Decrement index and notify caller */
+			IPFW_BUCK_UNLOCK(i);
+			IPFW_BUCK_LOCK(pindex);
+			parent->count--;
+			IPFW_BUCK_UNLOCK(pindex);
+			return (1);
 		}
-		add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent);
 		break;
 	}
 	default:
 		printf("ipfw: %s: unknown dynamic rule type %u\n",
 		    __func__, cmd->o.opcode);
-		IPFW_DYN_UNLOCK();
-		return (1);
+	}
+
+	if (q == NULL) {
+		IPFW_BUCK_UNLOCK(i);
+		return (1);	/* Notify caller about failure */
 	}
 
 	/* XXX just set lifetime */
-	lookup_dyn_rule_locked(&args->f_id, NULL, NULL);
+	lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL);
 
-	IPFW_DYN_UNLOCK();
+	IPFW_BUCK_UNLOCK(i);
 	return (0);
 }
 
@@ -996,23 +965,92 @@ ipfw_dyn_send_ka(struct mbuf **mtailp, i
 }
 
 /*
- * This procedure is only used to handle keepalives. It is invoked
- * every dyn_keepalive_period
+ * This procedure is used to perform various maintance
+ * on dynamic hash list. Currently it is called every second.
  */
 static void
-ipfw_tick(void * vnetx) 
+ipfw_dyn_tick(void * vnetx) 
 {
-	struct mbuf *m0, *m, *mnext, **mtailp;
-	struct ip *h;
-	int i;
-	ipfw_dyn_rule *q;
+	struct ip_fw_chain *chain;
+	int check_ka = 0;
 #ifdef VIMAGE
 	struct vnet *vp = vnetx;
 #endif
 
 	CURVNET_SET(vp);
-	if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
-		goto done;
+
+	chain = &V_layer3_chain;
+
+	/* Run keepalive checks every keepalive_interval iff ka is enabled */
+	if ((V_dyn_keepalive_last + V_dyn_keepalive_interval >= time_uptime) &&
+	    (V_dyn_keepalive != 0)) {
+		V_dyn_keepalive_last = time_uptime;
+		check_ka = 1;
+	}
+
+	check_dyn_rules(chain, NULL, RESVD_SET, check_ka, 1);
+
+	callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, vnetx, 0);
+
+	CURVNET_RESTORE();
+}
+
+
+/*
+ * Walk thru all dynamic states doing generic maintance:
+ * 1) free expired states
+ * 2) free all states based on deleted rule / set
+ * 3) send keepalives for states if needed
+ *
+ * @chain - pointer to current ipfw rules chain
+ * @rule - delete all states originated by given rule if != NULL
+ * @set - delete all states originated by any rule in set @set if != RESVD_SET
+ * @check_ka - perform checking/sending keepalives
+ * @timer - indicate call from timer routine.
+ *
+ * Timer routine must call this function unlocked to permit
+ * sending keepalives/resizing table.
+ *
+ * Others has to call function with IPFW_UH_WLOCK held.
+ * Additionally, function assume that dynamic rule/set is
+ * ALREADY deleted so no new states can be generated by
+ * 'deleted' rules.
+ *
+ * Write lock is needed to ensure that unused parent rules
+ * are not freed by other instance (see stage 2, 3)
+ */
+static void
+check_dyn_rules(struct ip_fw_chain *chain, struct ip_fw *rule,
+    int set, int check_ka, int timer)
+{
+	struct mbuf *m0, *m, *mnext, **mtailp;
+	struct ip *h;
+	int i, dyn_count, new_buckets = 0, max_buckets;
+	int expired = 0, expired_limits = 0, parents = 0, total = 0;
+	ipfw_dyn_rule *q, *q_prev, *q_next;
+	ipfw_dyn_rule *exp_head, **exptailp;
+	ipfw_dyn_rule *exp_lhead, **expltailp;
+
+	KASSERT(V_ipfw_dyn_v != NULL, ("%s: dynamic table not allocated",
+	    __func__));
+
+	/* Avoid possible LOR */
+	KASSERT(!check_ka || timer, ("%s: keepalive check with lock held",
+	    __func__));
+
+	/*
+	 * Do not perform any checks if we currently have no dynamic states
+	 */
+	if (DYN_COUNT == 0)
+		return;
+
+	/* Expired states */
+	exp_head = NULL;
+	exptailp = &exp_head;
+
+	/* Expired limit states */
+	exp_lhead = NULL;
+	expltailp = &exp_lhead;
 
 	/*
 	 * We make a chain of packets to go out here -- not deferring
@@ -1022,26 +1060,211 @@ ipfw_tick(void * vnetx) 
 	 */
 	m0 = NULL;
 	mtailp = &m0;
-	IPFW_DYN_LOCK();
+
+	/* Protect from hash resizing */
+	if (timer != 0)
+		IPFW_UH_WLOCK(chain);
+	else
+		IPFW_UH_WLOCK_ASSERT(chain);
+
+#define	NEXT_RULE()	{ q_prev = q; q = q->next ; continue; }
+
+	/* Stage 1: perform requested deletion */
 	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
-		for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
-			if (q->dyn_type == O_LIMIT_PARENT)
-				continue;
-			if (TIME_LEQ(q->expire, time_uptime))
-				continue;	/* too late, rule expired */
+		IPFW_BUCK_LOCK(i);
+		for (q = V_ipfw_dyn_v[i].head, q_prev = q; q ; ) {
+			/* account every rule */
+			total++;
 
-			if (q->id.proto != IPPROTO_TCP)
-				continue;
-			if ( (q->state & BOTH_SYN) != BOTH_SYN)
+			/* Skip parent rules at all */
+			if (q->dyn_type == O_LIMIT_PARENT) {
+				parents++;
+				NEXT_RULE();
+			}
+
+			/*
+			 * Remove rules which are:
+			 * 1) expired
+			 * 2) created by given rule
+			 * 3) created by any rule in given set
+			 */
+			if ((TIME_LEQ(q->expire, time_uptime)) ||
+			    ((rule != NULL) && (q->rule == rule)) ||
+			    ((set != RESVD_SET) && (q->rule->set == set))) {
+				/* Unlink q from current list */
+				q_next = q->next;
+				if (q == V_ipfw_dyn_v[i].head)
+					V_ipfw_dyn_v[i].head = q_next;
+				else
+					q_prev->next = q_next;
+
+				q->next = NULL;
+
+				/* queue q to expire list */
+				if (q->dyn_type != O_LIMIT) {
+					*exptailp = q;
+					exptailp = &(*exptailp)->next;
+					DEB(print_dyn_rule(&q->id, q->dyn_type,
+					    "unlink entry", "left");
+					)
+				} else {
+					/* Separate list for limit rules */
+					*expltailp = q;
+					expltailp = &(*expltailp)->next;
+					expired_limits++;
+					DEB(print_dyn_rule(&q->id, q->dyn_type,
+					    "unlink limit entry", "left");
+					)
+				}
+
+				q = q_next;
+				expired++;
 				continue;
-			if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
-			    q->expire))
-				continue;	/* too early */
+			}
+
+			/*
+			 * Check if we need to send keepalive:
+			 * we need to ensure if is time to do KA,
+			 * this is established TCP session, and
+			 * expire time is within keepalive interval
+			 */
+			if ((check_ka != 0) && (q->id.proto == IPPROTO_TCP) &&
+			    ((q->state & BOTH_SYN) == BOTH_SYN) &&
+			    (TIME_LEQ(q->expire, time_uptime +
+			      V_dyn_keepalive_interval)))
+				mtailp = ipfw_dyn_send_ka(mtailp, q);
+
+			NEXT_RULE();
+		}
+		IPFW_BUCK_UNLOCK(i);
+	}
+
+	/* Stage 2: decrement counters from O_LIMIT parents */
+	if (expired_limits != 0) {
+		/*
+		 * XXX: Note that deleting set with more than one
+		 * heavily-used LIMIT rules can result in overwhelming
+		 * locking due to lack of per-hash value sorting
+		 *
+		 * We should probably think about:
+		 * 1) pre-allocating hash of size, say,
+		 * MAX(16, V_curr_dyn_buckets / 1024)
+		 * 2) checking if expired_limits is large enough
+		 * 3) If yes, init hash (or its part), re-link
+		 * current list and start decrementing procedure in
+		 * each bucket separately
+		 */
+
+		/*
+		 * Small optimization: do not unlock bucket until
+		 * we see the next item resides in different bucket
+		 */
+		if (exp_lhead != NULL) {
+			i = exp_lhead->parent->bucket;
+			IPFW_BUCK_LOCK(i);
+		}
+		for (q = exp_lhead; q != NULL; q = q->next) {
+			if (i != q->parent->bucket) {
+				IPFW_BUCK_UNLOCK(i);
+				i = q->parent->bucket;
+				IPFW_BUCK_LOCK(i);
+			}
 
-			mtailp = ipfw_dyn_send_ka(mtailp, q);
+			/* Decrease parent refcount */
+			q->parent->count--;
+		}
+		if (exp_lhead != NULL)
+			IPFW_BUCK_UNLOCK(i);
+	}
+
+	/*
+	 * We protectet ourselves from unused parent deletion
+	 * (from the timer function) by holding UH write lock.

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list