parallelizing ipfw table

Gleb Smirnoff glebius at FreeBSD.org
Sun Nov 27 02:53:55 GMT 2005


On Sun, Nov 27, 2005 at 03:59:43AM +0300, Gleb Smirnoff wrote:
T>   Colleagues,
T> 
T>   in ipfw(4) we've got a reader/writer locking semantics. The ipfw
T> lookups performed on packet forwarding obtain reader lock on ipfw
T> chain, while altering the chain requires writer access on chain.
T> 
T> So, in multiprocessor multiinterface box we achieve a parallizm
T> almost without any contention.
T> 
T> However, ipfw tables lock the RADIX trie on every lookup. At the
T> first glance the radix.c:rn_lookup() function is reenterable. This
T> means that we can do two parallel RADIX lookups. So, I suggest
T> to eliminate the RADIX trie locking in ipfw, and utilize the
T> locking that is already present in ipfw. This will:
T> 
T>   - reduce number of mutex operations for each packer
T>   - remove contention from parallel ipfw_chk() lookups
T> 
T> A patch displaying the idea is attached. Not tested yet, read
T> below. The patch moves the tables array into the ip_fw_chain
T> structure. This is not necessary now, but in future we can
T> have multiple independent chains in ipfw, that's why I try
T> to avoid usage of &layer3_chain in the functions that are
T> deeper in the call graph. I try to supply chain pointer
T> from the caller.
T> 
T> The only problem is the caching in table lookup. This "hack"
T> makes the lookup function modify the table structure. We need
T> to remove caching to make the lookup_table() function fully
T> lockless and reenterable at the same time. The attached patch
T> doesn't removes caching, since it only displays the original
T> idea.

I'm sorry to send 'cvs log' instead of 'cvs diff'. :(
Here is the patch.

-- 
Totus tuus, Glebius.
GLEBIUS-RIPN GLEB-RIPE
-------------- next part --------------
Index: ip_fw2.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/ip_fw2.c,v
retrieving revision 1.115
diff -u -r1.115 ip_fw2.c
--- ip_fw2.c	10 Nov 2005 22:10:39 -0000	1.115
+++ ip_fw2.c	27 Nov 2005 00:11:46 -0000
@@ -126,9 +126,19 @@
 	int		fw_prid;
 };
 
+#define	IPFW_TABLES_MAX		128
+struct ip_fw_table {
+	struct radix_node_head	*rnh;
+	int			modified;
+	in_addr_t		last_addr;
+	int			last_match;
+	u_int32_t		last_value;
+};
+
 struct ip_fw_chain {
 	struct ip_fw	*rules;		/* list of rules */
 	struct ip_fw	*reap;		/* list of rules to reap */
+	struct ip_fw_table tables[IPFW_TABLES_MAX];
 	struct mtx	mtx;		/* lock guarding rule list */
 	int		busy_count;	/* busy count for rw locks */
 	int		want_write;
@@ -192,15 +202,6 @@
 	u_int32_t		value;
 };
 
-#define	IPFW_TABLES_MAX		128
-static struct ip_fw_table {
-	struct radix_node_head	*rnh;
-	int			modified;
-	in_addr_t		last_addr;
-	int			last_match;
-	u_int32_t		last_value;
-} ipfw_tables[IPFW_TABLES_MAX];
-
 static int fw_debug = 1;
 static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
 
@@ -1703,25 +1704,26 @@
 }
 
 static void
-init_tables(void)
+init_tables(struct ip_fw_chain *ch)
 {
 	int i;
 
 	for (i = 0; i < IPFW_TABLES_MAX; i++) {
-		rn_inithead((void **)&ipfw_tables[i].rnh, 32);
-		ipfw_tables[i].modified = 1;
+		rn_inithead((void **)&ch->tables[i].rnh, 32);
+		ch->tables[i].modified = 1;
 	}
 }
 
 static int
-add_table_entry(u_int16_t tbl, in_addr_t addr, u_int8_t mlen, u_int32_t value)
+add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+	uint8_t mlen, uint32_t value)
 {
 	struct radix_node_head *rnh;
 	struct table_entry *ent;
 
 	if (tbl >= IPFW_TABLES_MAX)
 		return (EINVAL);
-	rnh = ipfw_tables[tbl].rnh;
+	rnh = ch->tables[tbl].rnh;
 	ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
 	if (ent == NULL)
 		return (ENOMEM);
@@ -1729,20 +1731,21 @@
 	ent->addr.sin_len = ent->mask.sin_len = 8;
 	ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
 	ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
-	RADIX_NODE_HEAD_LOCK(rnh);
+	IPFW_WLOCK(&layer3_chain);
 	if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) ==
 	    NULL) {
-		RADIX_NODE_HEAD_UNLOCK(rnh);
+		IPFW_WUNLOCK(&layer3_chain);
 		free(ent, M_IPFW_TBL);
 		return (EEXIST);
 	}
-	ipfw_tables[tbl].modified = 1;
-	RADIX_NODE_HEAD_UNLOCK(rnh);
+	ch->tables[tbl].modified = 1;
+	IPFW_WUNLOCK(&layer3_chain);
 	return (0);
 }
 
 static int
-del_table_entry(u_int16_t tbl, in_addr_t addr, u_int8_t mlen)
+del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+	uint8_t mlen)
 {
 	struct radix_node_head *rnh;
 	struct table_entry *ent;
@@ -1750,18 +1753,18 @@
 
 	if (tbl >= IPFW_TABLES_MAX)
 		return (EINVAL);
-	rnh = ipfw_tables[tbl].rnh;
+	rnh = ch->tables[tbl].rnh;
 	sa.sin_len = mask.sin_len = 8;
 	mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
 	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
-	RADIX_NODE_HEAD_LOCK(rnh);
+	IPFW_WLOCK(ch);
 	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
 	if (ent == NULL) {
-		RADIX_NODE_HEAD_UNLOCK(rnh);
+		IPFW_WUNLOCK(ch);
 		return (ESRCH);
 	}
-	ipfw_tables[tbl].modified = 1;
-	RADIX_NODE_HEAD_UNLOCK(rnh);
+	ch->tables[tbl].modified = 1;
+	IPFW_WUNLOCK(ch);
 	free(ent, M_IPFW_TBL);
 	return (0);
 }
@@ -1780,31 +1783,34 @@
 }
 
 static int
-flush_table(u_int16_t tbl)
+flush_table(struct ip_fw_chain *ch, uint16_t tbl)
 {
 	struct radix_node_head *rnh;
 
+	IPFW_WLOCK_ASSERT(ch);
+
 	if (tbl >= IPFW_TABLES_MAX)
 		return (EINVAL);
-	rnh = ipfw_tables[tbl].rnh;
-	RADIX_NODE_HEAD_LOCK(rnh);
+	rnh = ch->tables[tbl].rnh;
 	rnh->rnh_walktree(rnh, flush_table_entry, rnh);
-	ipfw_tables[tbl].modified = 1;
-	RADIX_NODE_HEAD_UNLOCK(rnh);
+	ch->tables[tbl].modified = 1;
 	return (0);
 }
 
 static void
-flush_tables(void)
+flush_tables(struct ip_fw_chain *ch)
 {
-	u_int16_t tbl;
+	uint16_t tbl;
+
+	IPFW_WLOCK_ASSERT(ch);
 
 	for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
-		flush_table(tbl);
+		flush_table(ch, tbl);
 }
 
 static int
-lookup_table(u_int16_t tbl, in_addr_t addr, u_int32_t *val)
+lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+	uint32_t *val)
 {
 	struct radix_node_head *rnh;
 	struct ip_fw_table *table;
@@ -1814,14 +1820,12 @@
 
 	if (tbl >= IPFW_TABLES_MAX)
 		return (0);
-	table = &ipfw_tables[tbl];
+	table = &ch->tables[tbl];
 	rnh = table->rnh;
-	RADIX_NODE_HEAD_LOCK(rnh);
 	if (addr == table->last_addr && !table->modified) {
 		last_match = table->last_match;
 		if (last_match)
 			*val = table->last_value;
-		RADIX_NODE_HEAD_UNLOCK(rnh);
 		return (last_match);
 	}
 	table->modified = 0;
@@ -1832,11 +1836,9 @@
 	if (ent != NULL) {
 		table->last_value = *val = ent->value;
 		table->last_match = 1;
-		RADIX_NODE_HEAD_UNLOCK(rnh);
 		return (1);
 	}
 	table->last_match = 0;
-	RADIX_NODE_HEAD_UNLOCK(rnh);
 	return (0);
 }
 
@@ -1850,17 +1852,15 @@
 }
 
 static int
-count_table(u_int32_t tbl, u_int32_t *cnt)
+count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
 {
 	struct radix_node_head *rnh;
 
 	if (tbl >= IPFW_TABLES_MAX)
 		return (EINVAL);
-	rnh = ipfw_tables[tbl].rnh;
+	rnh = ch->tables[tbl].rnh;
 	*cnt = 0;
-	RADIX_NODE_HEAD_LOCK(rnh);
 	rnh->rnh_walktree(rnh, count_table_entry, cnt);
-	RADIX_NODE_HEAD_UNLOCK(rnh);
 	return (0);
 }
 
@@ -1886,17 +1886,17 @@
 }
 
 static int
-dump_table(ipfw_table *tbl)
+dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
 {
 	struct radix_node_head *rnh;
 
+	IPFW_WLOCK_ASSERT(ch);
+
 	if (tbl->tbl >= IPFW_TABLES_MAX)
 		return (EINVAL);
-	rnh = ipfw_tables[tbl->tbl].rnh;
+	rnh = ch->tables[tbl->tbl].rnh;
 	tbl->cnt = 0;
-	RADIX_NODE_HEAD_LOCK(rnh);
 	rnh->rnh_walktree(rnh, dump_table_entry, tbl);
-	RADIX_NODE_HEAD_UNLOCK(rnh);
 	return (0);
 }
 
@@ -2567,7 +2567,8 @@
 					    dst_ip.s_addr : src_ip.s_addr;
 				    uint32_t v;
 
-				    match = lookup_table(cmd->arg1, a, &v);
+				    match = lookup_table(chain, cmd->arg1, a,
+					&v);
 				    if (!match)
 					break;
 				    if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
@@ -4012,8 +4013,8 @@
 			    sizeof(ent), sizeof(ent));
 			if (error)
 				break;
-			error = add_table_entry(ent.tbl, ent.addr,
-			    ent.masklen, ent.value);
+			error = add_table_entry(&layer3_chain, ent.tbl,
+			    ent.addr, ent.masklen, ent.value);
 		}
 		break;
 
@@ -4025,7 +4026,8 @@
 			    sizeof(ent), sizeof(ent));
 			if (error)
 				break;
-			error = del_table_entry(ent.tbl, ent.addr, ent.masklen);
+			error = del_table_entry(&layer3_chain, ent.tbl,
+			    ent.addr, ent.masklen);
 		}
 		break;
 
@@ -4037,7 +4039,9 @@
 			    sizeof(tbl), sizeof(tbl));
 			if (error)
 				break;
-			error = flush_table(tbl);
+			IPFW_WLOCK(&layer3_chain);
+			error = flush_table(&layer3_chain, tbl);
+			IPFW_WUNLOCK(&layer3_chain);
 		}
 		break;
 
@@ -4048,8 +4052,10 @@
 			if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
 			    sizeof(tbl))))
 				break;
-			if ((error = count_table(tbl, &cnt)))
+			IPFW_RLOCK(&layer3_chain);
+			if ((error = count_table(&layer3_chain, tbl, &cnt)))
 				break;
+			IPFW_RUNLOCK(&layer3_chain);
 			error = sooptcopyout(sopt, &cnt, sizeof(cnt));
 		}
 		break;
@@ -4075,11 +4081,14 @@
 			}
 			tbl->size = (size - sizeof(*tbl)) /
 			    sizeof(ipfw_table_entry);
-			error = dump_table(tbl);
+			IPFW_WLOCK(&layer3_chain);
+			error = dump_table(&layer3_chain, tbl);
 			if (error) {
+				IPFW_WUNLOCK(&layer3_chain);
 				free(tbl, M_TEMP);
 				break;
 			}
+			IPFW_WUNLOCK(&layer3_chain);
 			error = sooptcopyout(sopt, tbl, size);
 			free(tbl, M_TEMP);
 		}
@@ -4242,7 +4251,7 @@
 		printf("limited to %d packets/entry by default\n",
 		    verbose_limit);
 
-	init_tables();
+	init_tables(&layer3_chain);
 	ip_fw_ctl_ptr = ipfw_ctl;
 	ip_fw_chk_ptr = ipfw_chk;
 	callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL);
@@ -4259,13 +4268,13 @@
 	ip_fw_ctl_ptr = NULL;
 	callout_drain(&ipfw_timeout);
 	IPFW_WLOCK(&layer3_chain);
+	flush_tables(&layer3_chain);
 	layer3_chain.reap = NULL;
 	free_chain(&layer3_chain, 1 /* kill default rule */);
 	reap = layer3_chain.reap, layer3_chain.reap = NULL;
 	IPFW_WUNLOCK(&layer3_chain);
 	if (reap != NULL)
 		reap_rules(reap);
-	flush_tables();
 	IPFW_DYN_LOCK_DESTROY();
 	uma_zdestroy(ipfw_dyn_rule_zone);
 	IPFW_LOCK_DESTROY(&layer3_chain);


More information about the freebsd-net mailing list