From nobody Wed Oct 13 20:10:05 2021 X-Original-To: dev-commits-src-branches@mlmmj.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mlmmj.nyi.freebsd.org (Postfix) with ESMTP id 4AA531810974; Wed, 13 Oct 2021 20:10:06 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "R3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 4HV3YB1jHbz3vkv; Wed, 13 Oct 2021 20:10:06 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org (gitrepo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:5]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 0DB542DEDE; Wed, 13 Oct 2021 20:10:06 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org ([127.0.1.44]) by gitrepo.freebsd.org (8.16.1/8.16.1) with ESMTP id 19DKA54Z036445; Wed, 13 Oct 2021 20:10:05 GMT (envelope-from git@gitrepo.freebsd.org) Received: (from git@localhost) by gitrepo.freebsd.org (8.16.1/8.16.1/Submit) id 19DKA5Dk036442; Wed, 13 Oct 2021 20:10:05 GMT (envelope-from git) Date: Wed, 13 Oct 2021 20:10:05 GMT Message-Id: <202110132010.19DKA5Dk036442@gitrepo.freebsd.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org From: Marko Zec Subject: git: 0eeef61aec4b - stable/13 - [fib_algo][dxr] Improve incremental updating strategy List-Id: Commits to the stable branches of the FreeBSD src repository List-Archive: https://lists.freebsd.org/archives/dev-commits-src-branches List-Help: List-Post: List-Subscribe: List-Unsubscribe: Sender: owner-dev-commits-src-branches@freebsd.org X-BeenThere: dev-commits-src-branches@freebsd.org MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-Git-Committer: zec X-Git-Repository: src X-Git-Refname: refs/heads/stable/13 X-Git-Reftype: branch X-Git-Commit: 0eeef61aec4b996e88547f31a8e7fef677180e98 Auto-Submitted: auto-generated X-ThisMailContainsUnwantedMimeParts: N The branch stable/13 has been updated by zec: URL: https://cgit.FreeBSD.org/src/commit/?id=0eeef61aec4b996e88547f31a8e7fef677180e98 commit 0eeef61aec4b996e88547f31a8e7fef677180e98 Author: Marko Zec AuthorDate: 2021-10-09 11:22:27 +0000 Commit: Marko Zec CommitDate: 2021-10-13 20:06:10 +0000 [fib_algo][dxr] Improve incremental updating strategy Tracking the number of unused holes in the trie and the range table was a bad metric based on which full trie and / or range rebuilds were triggered, which would happen in vain by far too frequently, particularly with live BGP feeds. Instead, track the total unused space inside the trie and range table structures, and trigger rebuilds if the percentage of unused space exceeds a sysctl-tunable threshold. MFC after: 3 days PR: 257965 --- sys/netinet/in_fib_dxr.c | 103 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 19 deletions(-) diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c index 6a9f414c3ab0..d832a66ee2cc 100644 --- a/sys/netinet/in_fib_dxr.c +++ b/sys/netinet/in_fib_dxr.c @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -195,6 +196,7 @@ struct dxr_aux { uint32_t updates_high; uint32_t all_chunks_cnt; uint32_t unused_chunks_cnt; + uint32_t unused_chunks_size; uint32_t xtbl_size; uint32_t all_trie_cnt; uint32_t unused_trie_cnt; @@ -232,21 +234,48 @@ static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary"); uma_zone_t chunk_zone; uma_zone_t trie_zone; +VNET_DEFINE_STATIC(int, frag_limit) = 100; +#define V_frag_limit VNET(frag_limit) + SYSCTL_DECL(_net_route_algo); SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "DXR tunables"); -VNET_DEFINE_STATIC(int, max_trie_holes) = 8; -#define V_max_trie_holes VNET(max_trie_holes) -SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes, - CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0, - "Trie fragmentation threshold before triggering a full rebuild"); +static int +sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS) +{ + char buf[8]; + int error, new, i; + + snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, + V_frag_limit % 100); + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + return (error); + if (!isdigit(*buf) && *buf != '.') + return (EINVAL); + for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++) + new = new * 10 + buf[i] - '0'; + new *= 100; + if (buf[i++] == '.') { + if (!isdigit(buf[i])) + return (EINVAL); + new += (buf[i++] - '0') * 10; + if (isdigit(buf[i])) + new += buf[i++] - '0'; + } + if (new > 1000) + return (EINVAL); + V_frag_limit = new; + snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100, + V_frag_limit % 100); + return (0); +} -VNET_DEFINE_STATIC(int, max_range_holes) = 16; -#define V_max_range_holes VNET(max_range_holes) -SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes, - CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0, - "Range table fragmentation threshold before triggering a full rebuild"); +SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET, + 0, 0, sysctl_dxr_frag_limit, "A", + "Fragmentation threshold to full rebuild"); /* Binary search for a matching address range */ #define DXR_LOOKUP_STAGE \ @@ -424,6 +453,7 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk) fdesc->base = cdp->cd_base; da->rtbl_top -= size; da->unused_chunks_cnt--; + da->unused_chunks_size -= cdp->cd_max_size; if (cdp->cd_max_size > size) { /* Split the range in two, need a new descriptor */ empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT); @@ -442,6 +472,7 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk) da->all_chunks_cnt++; da->unused_chunks_cnt++; + da->unused_chunks_size += empty_cdp->cd_max_size; cdp->cd_max_size = size; } LIST_REMOVE(cdp, cd_hash_le); @@ -471,9 +502,9 @@ chunk_ref(struct dxr_aux *da, uint32_t chunk) return (1); } da->rtbl_size += RTBL_SIZE_INCR; - if (da->rtbl_top >= BASE_MAX / 4) - FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%", - da->rtbl_top * 100 / BASE_MAX); + i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX; + FIB_PRINTF(i, da->fd, "range table at %d%% structural limit", + da->rtbl_top * 100 / BASE_MAX); da->range_tbl = realloc(da->range_tbl, sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT); @@ -508,6 +539,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) LIST_REMOVE(cdp, cd_hash_le); da->unused_chunks_cnt++; + da->unused_chunks_size += cdp->cd_max_size; cdp->cd_cur_size = 0; /* Attempt to merge with the preceding chunk, if empty */ @@ -546,6 +578,7 @@ chunk_unref(struct dxr_aux *da, uint32_t chunk) da->all_chunks_cnt--; da->unused_chunks_cnt--; da->rtbl_top -= cdp->cd_max_size; + da->unused_chunks_size -= cdp->cd_max_size; LIST_REMOVE(cdp, cd_all_le); uma_zfree(chunk_zone, cdp); return; @@ -846,10 +879,12 @@ dxr_build(struct dxr *dxr) struct timeval t0, t1, t2, t3; uint32_t r_size, dxr_tot_size; uint32_t i, m, range_rebuild = 0; + uint32_t range_frag; #ifdef DXR2 struct trie_desc *tp; uint32_t d_tbl_size, dxr_x, d_size, x_size; uint32_t ti, trie_rebuild = 0, prev_size = 0; + uint32_t trie_frag; #endif KASSERT(dxr->d == NULL, ("dxr: d not free")); @@ -897,9 +932,10 @@ dxr_build(struct dxr *dxr) dxr->nh_tbl = fib_get_nhop_array(da->fd); fib_get_rtable_info(fib_get_rh(da->fd), &rinfo); - if (da->updates_low > da->updates_high || - da->unused_chunks_cnt > V_max_range_holes) + if (da->updates_low > da->updates_high) range_rebuild = 1; + +range_build: if (range_rebuild) { /* Bulk cleanup */ bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl)); @@ -910,6 +946,7 @@ dxr_build(struct dxr *dxr) for (i = 0; i < UNUSED_BUCKETS; i++) LIST_INIT(&da->unused_chunks[i]); da->all_chunks_cnt = da->unused_chunks_cnt = 0; + da->unused_chunks_size = 0; da->rtbl_top = 0; da->updates_low = 0; da->updates_high = DIRECT_TBL_SIZE - 1; @@ -929,18 +966,32 @@ dxr_build(struct dxr *dxr) else if (m & 1 && update_chunk(da, i) != 0) return; } + + range_frag = 0; + if (da->rtbl_top) + range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top; + if (range_frag > V_frag_limit) { + range_rebuild = 1; + goto range_build; + } + r_size = sizeof(*da->range_tbl) * da->rtbl_top; microuptime(&t1); #ifdef DXR2 - if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes || + if (range_rebuild || abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1) trie_rebuild = 1; + +trie_build: if (trie_rebuild) { da->trie_rebuilt_prefixes = da->prefixes; da->d_bits = DXR_D; da->updates_low = 0; da->updates_high = DIRECT_TBL_SIZE - 1; + if (!range_rebuild) + memset(da->updates_mask, 0xff, + sizeof(da->updates_mask)); } dxr2_try_squeeze: @@ -976,6 +1027,14 @@ dxr2_try_squeeze: da->d_tbl[i] = ti; } + trie_frag = 0; + if (da->all_trie_cnt) + trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt; + if (trie_frag > V_frag_limit) { + trie_rebuild = 1; + goto trie_build; + } + d_size = sizeof(*da->d_tbl) * d_tbl_size; x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size * da->all_trie_cnt; @@ -1035,6 +1094,15 @@ dxr2_try_squeeze: FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix", dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100, i / 100, i % 100); +#ifdef DXR2 + FIB_PRINTF(LOG_INFO, da->fd, + "%d.%02d%% trie, %d.%02d%% range fragmentation", + trie_frag / 100, trie_frag % 100, + range_frag / 100, range_frag % 100); +#else + FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation", + range_frag / 100, range_frag % 100); +#endif i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms", range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000); @@ -1046,9 +1114,6 @@ dxr2_try_squeeze: i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec; FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms", i / 1000, i % 1000); - FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes", - da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt, - da->unused_chunks_cnt); } /*