git: 0eeef61aec4b - stable/13 - [fib_algo][dxr] Improve incremental updating strategy

From: Marko Zec <zec_at_FreeBSD.org>
Date: Wed, 13 Oct 2021 20:10:05 UTC
The branch stable/13 has been updated by zec:

URL: https://cgit.FreeBSD.org/src/commit/?id=0eeef61aec4b996e88547f31a8e7fef677180e98

commit 0eeef61aec4b996e88547f31a8e7fef677180e98
Author:     Marko Zec <zec@FreeBSD.org>
AuthorDate: 2021-10-09 11:22:27 +0000
Commit:     Marko Zec <zec@FreeBSD.org>
CommitDate: 2021-10-13 20:06:10 +0000

    [fib_algo][dxr] Improve incremental updating strategy
    
    Tracking the number of unused holes in the trie and the range table
    was a bad metric based on which full trie and / or range rebuilds
    were triggered, which would happen in vain by far too frequently,
    particularly with live BGP feeds.
    
    Instead, track the total unused space inside the trie and range table
    structures, and trigger rebuilds if the percentage of unused space
    exceeds a sysctl-tunable threshold.
    
    MFC after:      3 days
    PR:             257965
---
 sys/netinet/in_fib_dxr.c | 103 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 84 insertions(+), 19 deletions(-)

diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c
index 6a9f414c3ab0..d832a66ee2cc 100644
--- a/sys/netinet/in_fib_dxr.c
+++ b/sys/netinet/in_fib_dxr.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
+#include <sys/ctype.h>
 #include <sys/epoch.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
@@ -195,6 +196,7 @@ struct dxr_aux {
 	uint32_t		updates_high;
 	uint32_t		all_chunks_cnt;
 	uint32_t		unused_chunks_cnt;
+	uint32_t		unused_chunks_size;
 	uint32_t		xtbl_size;
 	uint32_t		all_trie_cnt;
 	uint32_t		unused_trie_cnt;
@@ -232,21 +234,48 @@ static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
 uma_zone_t chunk_zone;
 uma_zone_t trie_zone;
 
+VNET_DEFINE_STATIC(int, frag_limit) = 100;
+#define	V_frag_limit	VNET(frag_limit)
+
 SYSCTL_DECL(_net_route_algo);
 SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "DXR tunables");
 
-VNET_DEFINE_STATIC(int, max_trie_holes) = 8;
-#define	V_max_trie_holes	VNET(max_trie_holes)
-SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes,
-    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0,
-    "Trie fragmentation threshold before triggering a full rebuild");
+static int
+sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
+{
+	char buf[8];
+	int error, new, i;
+
+	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
+	    V_frag_limit % 100);
+	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	if (!isdigit(*buf) && *buf != '.')
+		return (EINVAL);
+	for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
+		new = new * 10 + buf[i] - '0';
+	new *= 100;
+	if (buf[i++] == '.') {
+		if (!isdigit(buf[i]))
+			return (EINVAL);
+		new += (buf[i++] - '0') * 10;
+		if (isdigit(buf[i]))
+			new += buf[i++] - '0';
+	}
+	if (new > 1000)
+		return (EINVAL);
+	V_frag_limit = new;
+	snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
+	    V_frag_limit % 100);
+	return (0);
+}
 
-VNET_DEFINE_STATIC(int, max_range_holes) = 16;
-#define	V_max_range_holes	VNET(max_range_holes)
-SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes,
-    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0,
-    "Range table fragmentation threshold before triggering a full rebuild");
+SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
+    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
+    0, 0, sysctl_dxr_frag_limit, "A",
+    "Fragmentation threshold to full rebuild");
 
 /* Binary search for a matching address range */
 #define	DXR_LOOKUP_STAGE					\
@@ -424,6 +453,7 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk)
 		fdesc->base = cdp->cd_base;
 		da->rtbl_top -= size;
 		da->unused_chunks_cnt--;
+		da->unused_chunks_size -= cdp->cd_max_size;
 		if (cdp->cd_max_size > size) {
 			/* Split the range in two, need a new descriptor */
 			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
@@ -442,6 +472,7 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk)
 
 			da->all_chunks_cnt++;
 			da->unused_chunks_cnt++;
+			da->unused_chunks_size += empty_cdp->cd_max_size;
 			cdp->cd_max_size = size;
 		}
 		LIST_REMOVE(cdp, cd_hash_le);
@@ -471,9 +502,9 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk)
 			return (1);
 		}
 		da->rtbl_size += RTBL_SIZE_INCR;
-		if (da->rtbl_top >= BASE_MAX / 4)
-			FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%",
-			    da->rtbl_top * 100 / BASE_MAX);
+		i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
+		FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
+		    da->rtbl_top * 100 / BASE_MAX);
 		da->range_tbl = realloc(da->range_tbl,
 		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
 		    M_DXRAUX, M_NOWAIT);
@@ -508,6 +539,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk)
 
 	LIST_REMOVE(cdp, cd_hash_le);
 	da->unused_chunks_cnt++;
+	da->unused_chunks_size += cdp->cd_max_size;
 	cdp->cd_cur_size = 0;
 
 	/* Attempt to merge with the preceding chunk, if empty */
@@ -546,6 +578,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk)
 		da->all_chunks_cnt--;
 		da->unused_chunks_cnt--;
 		da->rtbl_top -= cdp->cd_max_size;
+		da->unused_chunks_size -= cdp->cd_max_size;
 		LIST_REMOVE(cdp, cd_all_le);
 		uma_zfree(chunk_zone, cdp);
 		return;
@@ -846,10 +879,12 @@ dxr_build(struct dxr *dxr)
 	struct timeval t0, t1, t2, t3;
 	uint32_t r_size, dxr_tot_size;
 	uint32_t i, m, range_rebuild = 0;
+	uint32_t range_frag;
 #ifdef DXR2
 	struct trie_desc *tp;
 	uint32_t d_tbl_size, dxr_x, d_size, x_size;
 	uint32_t ti, trie_rebuild = 0, prev_size = 0;
+	uint32_t trie_frag;
 #endif
 
 	KASSERT(dxr->d == NULL, ("dxr: d not free"));
@@ -897,9 +932,10 @@ dxr_build(struct dxr *dxr)
 	dxr->nh_tbl = fib_get_nhop_array(da->fd);
 	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
 
-	if (da->updates_low > da->updates_high ||
-	    da->unused_chunks_cnt > V_max_range_holes)
+	if (da->updates_low > da->updates_high)
 		range_rebuild = 1;
+
+range_build:
 	if (range_rebuild) {
 		/* Bulk cleanup */
 		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
@@ -910,6 +946,7 @@ dxr_build(struct dxr *dxr)
 		for (i = 0; i < UNUSED_BUCKETS; i++)
 			LIST_INIT(&da->unused_chunks[i]);
 		da->all_chunks_cnt = da->unused_chunks_cnt = 0;
+		da->unused_chunks_size = 0;
 		da->rtbl_top = 0;
 		da->updates_low = 0;
 		da->updates_high = DIRECT_TBL_SIZE - 1;
@@ -929,18 +966,32 @@ dxr_build(struct dxr *dxr)
 		else if (m & 1 && update_chunk(da, i) != 0)
 			return;
 	}
+
+	range_frag = 0;
+	if (da->rtbl_top)
+		range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
+	if (range_frag > V_frag_limit) {
+		range_rebuild = 1;
+		goto range_build;
+	}
+
 	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
 	microuptime(&t1);
 
 #ifdef DXR2
-	if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes ||
+	if (range_rebuild ||
 	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
 		trie_rebuild = 1;
+
+trie_build:
 	if (trie_rebuild) {
 		da->trie_rebuilt_prefixes = da->prefixes;
 		da->d_bits = DXR_D;
 		da->updates_low = 0;
 		da->updates_high = DIRECT_TBL_SIZE - 1;
+		if (!range_rebuild)
+			memset(da->updates_mask, 0xff,
+			    sizeof(da->updates_mask));
 	}
 
 dxr2_try_squeeze:
@@ -976,6 +1027,14 @@ dxr2_try_squeeze:
 		da->d_tbl[i] = ti;
 	}
 
+	trie_frag = 0;
+	if (da->all_trie_cnt)
+		trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
+	if (trie_frag > V_frag_limit) {
+		trie_rebuild = 1;
+		goto trie_build;
+	}
+
 	d_size = sizeof(*da->d_tbl) * d_tbl_size;
 	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
 	    * da->all_trie_cnt;
@@ -1035,6 +1094,15 @@ dxr2_try_squeeze:
 	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
 	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
 	    i / 100, i % 100);
+#ifdef DXR2
+	FIB_PRINTF(LOG_INFO, da->fd,
+	    "%d.%02d%% trie, %d.%02d%% range fragmentation",
+	    trie_frag / 100, trie_frag % 100,
+	    range_frag / 100, range_frag % 100);
+#else
+	FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation",
+	    range_frag / 100, range_frag % 100);
+#endif
 	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
 	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
 	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
@@ -1046,9 +1114,6 @@ dxr2_try_squeeze:
 	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
 	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
 	    i / 1000, i % 1000);
-	FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes",
-	    da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt,
-	    da->unused_chunks_cnt);
 }
 
 /*