git: 4715d948c593 - stable/13 - Introduce DXR as an IPv4 longest prefix matching / FIB module

Marko Zec zec at FreeBSD.org
Thu Jun 17 10:35:47 UTC 2021


The branch stable/13 has been updated by zec:

URL: https://cgit.FreeBSD.org/src/commit/?id=4715d948c593b5bb3d2e2129ed299becec566373

commit 4715d948c593b5bb3d2e2129ed299becec566373
Author:     Marko Zec <zec at FreeBSD.org>
AuthorDate: 2021-05-05 11:45:52 +0000
Commit:     Marko Zec <zec at FreeBSD.org>
CommitDate: 2021-06-17 10:07:05 +0000

    Introduce DXR as an IPv4 longest prefix matching / FIB module
    
    DXR maintains compressed lookup structures with a trivial search
    procedure.  A two-stage trie is indexed by the more significant bits of
    the search key (IPv4 address), while the remaining bits are used for
    finding the next hop in a sorted array.  The tradeoff between memory
    footprint and search speed depends on the split between the trie and
    the remaining binary search.  The default of 20 bits of the key being
    used for trie indexing yields good performance (see below) with
    footprints of around 2.5 Bytes per prefix with current BGP snapshots.
    
    Rebuilding lookup structures takes some time, which is compensated for by
    batching several RIB change requests into a single FIB update, i.e. FIB
    synchronization with the RIB may be delayed for a fraction of a second.
    RIB to FIB synchronization, next-hop table housekeeping, and lockless
    lookup capability is provided by the FIB_ALGO infrastructure.
    
    DXR works well on modern CPUs with several MBytes of caches, especially
    in VMs, where is outperforms other currently available IPv4 FIB
    algorithms by a large margin.
    
    Reviewed by:    melifaro
    MFC after:      1 week
    Differential Revision: https://reviews.freebsd.org/D29821
    
    (cherry picked from commit 2aca58e16f507bfcad127a0865a9d5c75c5eedc3)
---
 sys/modules/Makefile         |    2 +
 sys/modules/fib_dxr/Makefile |   11 +
 sys/netinet/in_fib_dxr.c     | 1253 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1266 insertions(+)

diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index 7574c612f49c..ec5dd9a047c2 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -120,6 +120,7 @@ SUBDIR=	\
 	fdc \
 	fdescfs \
 	${_ffec} \
+	${_fib_dxr} \
 	filemon \
 	firewire \
 	firmware \
@@ -476,6 +477,7 @@ _ipfilter=	ipfilter
 
 .if ${MK_INET_SUPPORT} != "no" && ${KERN_OPTS:MFIB_ALGO}
 _dpdk_lpm4=	dpdk_lpm4
+_fib_dxr=	fib_dxr
 .endif
 
 .if ${MK_INET6_SUPPORT} != "no" && ${KERN_OPTS:MFIB_ALGO}
diff --git a/sys/modules/fib_dxr/Makefile b/sys/modules/fib_dxr/Makefile
new file mode 100644
index 000000000000..c1a704beb535
--- /dev/null
+++ b/sys/modules/fib_dxr/Makefile
@@ -0,0 +1,11 @@
+# $FreeBSD$
+
+SYSDIR?=${SRCTOP}/sys
+.include "${SYSDIR}/conf/kern.opts.mk"
+
+.PATH: ${SYSDIR}/netinet
+
+KMOD=	fib_dxr
+SRCS=	in_fib_dxr.c opt_inet.h
+
+.include <bsd.kmod.mk>
diff --git a/sys/netinet/in_fib_dxr.c b/sys/netinet/in_fib_dxr.c
new file mode 100644
index 000000000000..ec32819a5a6d
--- /dev/null
+++ b/sys/netinet/in_fib_dxr.c
@@ -0,0 +1,1253 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012-2021 Marko Zec
+ * Copyright (c) 2005, 2018 University of Zagreb
+ * Copyright (c) 2005 International Computer Science Institute
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
+ * structures and a trivial search procedure.  More significant bits of
+ * the search key are used to directly index a two-stage trie, while the
+ * remaining bits are used for finding the next hop in a sorted array.
+ * More details in:
+ *
+ * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
+ * second in software, ACM SIGCOMM Computer Communication Review, September
+ * 2012
+ *
+ * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
+ * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+
+#include <vm/uma.h>
+
+#include <netinet/in.h>
+#include <netinet/in_fib.h>
+
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/fib_algo.h>
+
+#define	DXR_TRIE_BITS		20
+
+CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);
+
+/* DXR2: two-stage primary trie, instead of a single direct lookup table */
+#define	DXR2
+
+#if DXR_TRIE_BITS > 16
+#define	DXR_D			16
+#else
+#define	DXR_D			(DXR_TRIE_BITS - 1)
+#endif
+#define	DXR_X			(DXR_TRIE_BITS - DXR_D)
+
+#define	D_TBL_SIZE		(1 << DXR_D)
+#define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
+#define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
+#define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)
+
+#define	DESC_BASE_BITS		22
+#define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
+#define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
+#define	RTBL_SIZE_INCR		(BASE_MAX / 64)
+
+#if DXR_TRIE_BITS < 24
+#define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
+#else
+#define	FRAGS_MASK_SHORT	0
+#endif
+#define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
+				 ~FRAGS_MASK_SHORT)
+#define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
+#define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)
+
+#define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
+#define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
+#define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)
+
+#define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)
+
+#define	CHUNK_HASH_BITS		16
+#define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
+#define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)
+
+#define	TRIE_HASH_BITS		16
+#define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
+#define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)
+
+#define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)
+
+/* Lookup structure elements */
+
+struct direct_entry {
+	uint32_t		fragments: DESC_FRAGMENTS_BITS,
+				base: DESC_BASE_BITS;
+};
+
+struct range_entry_long {
+	uint32_t		start: DXR_RANGE_SHIFT,
+				nexthop: DXR_TRIE_BITS;
+};
+
+#if DXR_TRIE_BITS < 24
+struct range_entry_short {
+	uint16_t		start: DXR_RANGE_SHIFT - 8,
+				nexthop: DXR_TRIE_BITS - 8;
+};
+#endif
+
+/* Auxiliary structures */
+
+struct heap_entry {
+	uint32_t		start;
+	uint32_t		end;
+	uint32_t		preflen;
+	uint32_t		nexthop;
+};
+
+struct chunk_desc {
+	LIST_ENTRY(chunk_desc)	cd_all_le;
+	LIST_ENTRY(chunk_desc)	cd_hash_le;
+	uint32_t		cd_hash;
+	uint32_t		cd_refcnt;
+	uint32_t		cd_base;
+	uint32_t		cd_cur_size;
+	uint32_t		cd_max_size;
+};
+
+struct trie_desc {
+	LIST_ENTRY(trie_desc)	td_all_le;
+	LIST_ENTRY(trie_desc)	td_hash_le;
+	uint32_t		td_hash;
+	uint32_t		td_index;
+	uint32_t		td_refcnt;
+};
+
+struct dxr_aux {
+	/* Glue to external state */
+	struct fib_data		*fd;
+	uint32_t		fibnum;
+	int			refcnt;
+
+	/* Auxiliary build-time tables */
+	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
+	uint16_t		d_tbl[D_TBL_SIZE];
+	struct direct_entry	*x_tbl;
+	union {
+		struct range_entry_long	re;
+		uint32_t	fragments;
+	}			*range_tbl;
+
+	/* Auxiliary internal state */
+	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
+	struct trie_desc	*trietbl[D_TBL_SIZE];
+	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
+	LIST_HEAD(, chunk_desc)	all_chunks;
+	LIST_HEAD(, chunk_desc) unused_chunks; /* abuses hash link entry */
+	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
+	LIST_HEAD(, trie_desc)	all_trie;
+	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
+	struct sockaddr_in	dst;
+	struct sockaddr_in	mask;
+	struct heap_entry	heap[33];
+	uint32_t		prefixes;
+	uint32_t		updates_low;
+	uint32_t		updates_high;
+	uint32_t		all_chunks_cnt;
+	uint32_t		unused_chunks_cnt;
+	uint32_t		xtbl_size;
+	uint32_t		all_trie_cnt;
+	uint32_t		unused_trie_cnt;
+	uint32_t		trie_rebuilt_prefixes;
+	uint32_t		heap_index;
+	uint32_t		d_bits;
+	uint32_t		rtbl_size;
+	uint32_t		rtbl_top;
+	uint32_t		rtbl_work_frags;
+	uint32_t		work_chunk;
+};
+
+/* Main lookup structure container */
+
+struct dxr {
+	/* Lookup tables */
+	uint16_t		d_shift;
+	uint16_t		x_shift;
+	uint32_t		x_mask;
+	void			*d;
+	void			*x;
+	void			*r;
+	struct nhop_object	**nh_tbl;
+
+	/* Glue to external state */
+	struct dxr_aux		*aux;
+	struct fib_data		*fd;
+	struct epoch_context	epoch_ctx;
+	uint32_t		fibnum;
+};
+
+static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
+static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");
+
+uma_zone_t chunk_zone;
+uma_zone_t trie_zone;
+
+SYSCTL_DECL(_net_route_algo);
+SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "DXR tunables");
+
+VNET_DEFINE_STATIC(int, max_trie_holes) = 8;
+#define	V_max_trie_holes	VNET(max_trie_holes)
+SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes,
+    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0,
+    "Trie fragmentation threshold before triggering a full rebuild");
+
+VNET_DEFINE_STATIC(int, max_range_holes) = 16;
+#define	V_max_range_holes	VNET(max_range_holes)
+SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes,
+    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0,
+    "Range table fragmentation threshold before triggering a full rebuild");
+
+/* Binary search for a matching address range */
+#define	DXR_LOOKUP_STAGE					\
+	if (masked_dst < range[middle].start) {			\
+		upperbound = middle;				\
+		middle = (middle + lowerbound) / 2;		\
+	} else if (masked_dst < range[middle + 1].start)	\
+		return (range[middle].nexthop);			\
+	else {							\
+		lowerbound = middle + 1;			\
+		middle = (upperbound + middle + 1) / 2;		\
+	}							\
+	if (upperbound == lowerbound)				\
+		return (range[lowerbound].nexthop);
+
+static int
+dxr_lookup(struct dxr *dxr, uint32_t dst)
+{
+#ifdef DXR2
+	uint16_t *dt = dxr->d;
+	struct direct_entry *xt = dxr->x;
+	int xi;
+#else
+	struct direct_entry *dt = dxr->d;
+#endif
+	struct direct_entry de;
+	struct range_entry_long	*rt;
+	uint32_t base;
+	uint32_t upperbound;
+	uint32_t middle;
+	uint32_t lowerbound;
+	uint32_t masked_dst;
+
+#ifdef DXR2
+	xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) +
+	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask);
+	de = xt[xi];
+#else
+	de = dt[dst >> DXR_RANGE_SHIFT];
+#endif
+
+	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
+		return (de.base);
+
+	rt = dxr->r;
+	base = de.base;
+	lowerbound = 0;
+	masked_dst = dst & DXR_RANGE_MASK;
+
+#if DXR_TRIE_BITS < 24
+	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
+		upperbound = de.fragments & FRAGS_MASK_SHORT;
+		struct range_entry_short *range =
+		    (struct range_entry_short *) &rt[base];
+
+		masked_dst >>= 8;
+		middle = upperbound;
+		upperbound = upperbound * 2 + 1;
+
+		for (;;) {
+			DXR_LOOKUP_STAGE
+			DXR_LOOKUP_STAGE
+		}
+	}
+#endif
+
+	upperbound = de.fragments;
+	middle = upperbound / 2;
+	struct range_entry_long *range = &rt[base];
+	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
+		upperbound = *((uint32_t *) range);
+		range++;
+		middle = upperbound / 2;
+	}
+
+	for (;;) {
+		DXR_LOOKUP_STAGE
+		DXR_LOOKUP_STAGE
+	}
+}
+
+static void
+initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
+{
+	struct heap_entry *fhp = &da->heap[0];
+	struct rtentry *rt;
+	struct route_nhop_data rnd;
+ 
+	da->heap_index = 0;
+	da->dst.sin_addr.s_addr = htonl(dst_u32);
+	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
+	    &rnd);
+	if (rt != NULL) {
+		struct in_addr addr;
+		uint32_t scopeid;
+
+		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
+		fhp->start = ntohl(addr.s_addr);
+		fhp->end = fhp->start;
+		if (fhp->preflen < 32)
+			fhp->end |= (0xffffffffU >> fhp->preflen);
+		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
+	} else {
+		fhp->preflen = fhp->nexthop = fhp->start = 0;
+		fhp->end = 0xffffffffU;
+	}
+}
+
+static uint32_t
+chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
+{
+
+	if (IS_SHORT_FORMAT(fdesc->fragments))
+		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
+	else if (IS_XL_FORMAT(fdesc->fragments))
+		return (da->range_tbl[fdesc->base].fragments + 2);
+	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
+		return (fdesc->fragments + 1);
+}
+
+static uint32_t
+chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
+{
+	uint32_t size = chunk_size(da, fdesc);
+	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
+	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
+	uint32_t hash = fdesc->fragments;
+
+	for (; p < l; p++)
+		hash = (hash << 7) + (hash >> 13) + *p;
+
+	return (hash + (hash >> 16));
+}
+
+static int
+chunk_ref(struct dxr_aux *da, uint32_t chunk)
+{
+	struct direct_entry *fdesc = &da->direct_tbl[chunk];
+	struct chunk_desc *cdp, *empty_cdp;
+	uint32_t base = fdesc->base;
+	uint32_t size = chunk_size(da, fdesc);
+	uint32_t hash = chunk_hash(da, fdesc);
+
+	/* Find an existing descriptor */
+	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
+	    cd_hash_le) {
+		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
+		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
+		    sizeof(struct range_entry_long) * size))
+			continue;
+		da->rtbl_top = fdesc->base;
+		fdesc->base = cdp->cd_base;
+		cdp->cd_refcnt++;
+		return (0);
+	}
+
+	/* No matching chunks found. Recycle an empty or allocate a new one */
+	cdp = NULL;
+	LIST_FOREACH(empty_cdp, &da->unused_chunks, cd_hash_le)
+		if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
+		    empty_cdp->cd_max_size < cdp->cd_max_size)) {
+			cdp = empty_cdp;
+			if (empty_cdp->cd_max_size == size)
+				break;
+		}
+
+	if (cdp != NULL) {
+		/* Copy from heap into the recycled chunk */
+		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
+		    size * sizeof(struct range_entry_long));
+		fdesc->base = cdp->cd_base;
+		da->rtbl_top -= size;
+		da->unused_chunks_cnt--;
+		if (cdp->cd_max_size > size + 1) {
+			/* Split the range in two, need a new descriptor */
+			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
+			if (empty_cdp == NULL)
+				return (1);
+			empty_cdp->cd_max_size = cdp->cd_max_size - size;
+			empty_cdp->cd_base = cdp->cd_base + size;
+			LIST_INSERT_AFTER(cdp, empty_cdp, cd_all_le);
+			LIST_INSERT_AFTER(cdp, empty_cdp, cd_hash_le);
+			da->all_chunks_cnt++;
+			da->unused_chunks_cnt++;
+			cdp->cd_max_size = size;
+		}
+		LIST_REMOVE(cdp, cd_hash_le);
+	} else {
+		/* Alloc a new descriptor */
+		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
+		if (cdp == NULL)
+			return (1);
+		cdp->cd_max_size = size;
+		cdp->cd_base = fdesc->base;
+		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
+		da->all_chunks_cnt++;
+	}
+
+	cdp->cd_hash = hash;
+	cdp->cd_refcnt = 1;
+	cdp->cd_cur_size = size;
+	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
+	    cd_hash_le);
+	if (da->rtbl_top >= da->rtbl_size) {
+		if (da->rtbl_top >= BASE_MAX) {
+			FIB_PRINTF(LOG_ERR, da->fd,
+			    "structural limit exceeded at %d "
+			    "range table elements", da->rtbl_top);
+			return (1);
+		}
+		da->rtbl_size += RTBL_SIZE_INCR;
+		if (da->rtbl_top >= BASE_MAX / 4)
+			FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%",
+			    da->rtbl_top * 100 / BASE_MAX);
+		da->range_tbl = realloc(da->range_tbl,
+		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
+		    M_DXRAUX, M_NOWAIT);
+		if (da->range_tbl == NULL)
+			return (1);
+	}
+
+	return (0);
+}
+
+static void
+chunk_unref(struct dxr_aux *da, uint32_t chunk)
+{
+	struct direct_entry *fdesc = &da->direct_tbl[chunk];
+	struct chunk_desc *cdp;
+	uint32_t base = fdesc->base;
+	uint32_t size = chunk_size(da, fdesc);
+	uint32_t hash = chunk_hash(da, fdesc);
+
+	/* Find an existing descriptor */
+	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
+	    cd_hash_le)
+		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
+		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
+		    sizeof(struct range_entry_long) * size) == 0)
+			break;
+
+	KASSERT(cdp != NULL, ("dxr: dangling chunk"));
+	if (--cdp->cd_refcnt > 0)
+		return;
+
+	LIST_REMOVE(cdp, cd_hash_le);
+	da->unused_chunks_cnt++;
+	if (cdp->cd_base + cdp->cd_max_size != da->rtbl_top) {
+		LIST_INSERT_HEAD(&da->unused_chunks, cdp, cd_hash_le);
+		return;
+	}
+
+	do {
+		da->all_chunks_cnt--;
+		da->unused_chunks_cnt--;
+		da->rtbl_top -= cdp->cd_max_size;
+		LIST_REMOVE(cdp, cd_all_le);
+		uma_zfree(chunk_zone, cdp);
+		LIST_FOREACH(cdp, &da->unused_chunks, cd_hash_le)
+			if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
+				LIST_REMOVE(cdp, cd_hash_le);
+				break;
+			}
+	} while (cdp != NULL);
+}
+
+#ifdef DXR2
+static uint32_t
+trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
+{
+	uint32_t i, *val;
+	uint32_t hash = 0;
+
+	for (i = 0; i < (1 << dxr_x); i++) {
+		hash = (hash << 3) ^ (hash >> 3);
+		val = (uint32_t *)
+		    (void *) &da->direct_tbl[(index << dxr_x) + i];
+		hash += (*val << 5);
+		hash += (*val >> 5);
+	}
+
+	return (hash + (hash >> 16));
+}
+
+static int
+trie_ref(struct dxr_aux *da, uint32_t index)
+{
+	struct trie_desc *tp;
+	uint32_t dxr_d = da->d_bits;
+	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
+	uint32_t hash = trie_hash(da, dxr_x, index);
+
+	/* Find an existing descriptor */
+	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
+		if (tp->td_hash == hash &&
+		    memcmp(&da->direct_tbl[index << dxr_x],
+		    &da->x_tbl[tp->td_index << dxr_x],
+		    sizeof(*da->x_tbl) << dxr_x) == 0) {
+			tp->td_refcnt++;
+			da->trietbl[index] = tp;
+			return(tp->td_index);
+		}
+
+	tp = LIST_FIRST(&da->unused_trie);
+	if (tp != NULL) {
+		LIST_REMOVE(tp, td_hash_le);
+		da->unused_trie_cnt--;
+	} else {
+		tp = uma_zalloc(trie_zone, M_NOWAIT);
+		if (tp == NULL)
+			return (-1);
+		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
+		tp->td_index = da->all_trie_cnt++;
+	}
+
+	tp->td_hash = hash;
+	tp->td_refcnt = 1;
+	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
+	   td_hash_le);
+	memcpy(&da->x_tbl[tp->td_index << dxr_x],
+	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
+	da->trietbl[index] = tp;
+	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
+		da->xtbl_size += XTBL_SIZE_INCR;
+		da->x_tbl = realloc(da->x_tbl,
+		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
+		if (da->x_tbl == NULL)
+			return (-1);
+	}
+	return(tp->td_index);
+}
+
+static void
+trie_unref(struct dxr_aux *da, uint32_t index)
+{
+	struct trie_desc *tp = da->trietbl[index];
+
+	if (tp == NULL)
+		return;
+	da->trietbl[index] = NULL;
+	if (--tp->td_refcnt > 0)
+		return;
+
+	LIST_REMOVE(tp, td_hash_le);
+	da->unused_trie_cnt++;
+	if (tp->td_index != da->all_trie_cnt - 1) {
+		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
+		return;
+	}
+
+	do {
+		da->all_trie_cnt--;
+		da->unused_trie_cnt--;
+		LIST_REMOVE(tp, td_all_le);
+		uma_zfree(trie_zone, tp);
+		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
+			if (tp->td_index == da->all_trie_cnt - 1) {
+				LIST_REMOVE(tp, td_hash_le);
+				break;
+			}
+	} while (tp != NULL);
+}
+#endif
+
+static void
+heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
+    uint32_t nh)
+{
+	struct heap_entry *fhp;
+	int i;
+
+	for (i = da->heap_index; i >= 0; i--) {
+		if (preflen > da->heap[i].preflen)
+			break;
+		else if (preflen < da->heap[i].preflen)
+			da->heap[i + 1] = da->heap[i];
+		else
+			return;
+	}
+
+	fhp = &da->heap[i + 1];
+	fhp->preflen = preflen;
+	fhp->start = start;
+	fhp->end = end;
+	fhp->nexthop = nh;
+	da->heap_index++;
+}
+
+static int
+dxr_walk(struct rtentry *rt, void *arg)
+{
+	struct dxr_aux *da = arg;
+	uint32_t chunk = da->work_chunk;
+	uint32_t first = chunk << DXR_RANGE_SHIFT;
+	uint32_t last = first | DXR_RANGE_MASK;
+	struct range_entry_long *fp =
+	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
+	struct heap_entry *fhp = &da->heap[da->heap_index];
+	uint32_t preflen, nh, start, end, scopeid;
+	struct in_addr addr;
+
+	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
+	start = ntohl(addr.s_addr);
+	if (start > last)
+		return (-1);	/* Beyond chunk boundaries, we are done */
+	if (start < first)
+		return (0);	/* Skip this route */
+
+	end = start;
+	if (preflen < 32)
+		end |= (0xffffffffU >> preflen);
+	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));
+
+	if (start == fhp->start)
+		heap_inject(da, start, end, preflen, nh);
+	else {
+		/* start > fhp->start */
+		while (start > fhp->end) {
+			uint32_t oend = fhp->end;
+
+			if (da->heap_index > 0) {
+				fhp--;
+				da->heap_index--;
+			} else
+				initheap(da, fhp->end + 1, chunk);
+			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
+				fp++;
+				da->rtbl_work_frags++;
+				fp->start = (oend + 1) & DXR_RANGE_MASK;
+				fp->nexthop = fhp->nexthop;
+			}
+		}
+		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
+		    nh != fp->nexthop) {
+			fp++;
+			da->rtbl_work_frags++;
+			fp->start = start & DXR_RANGE_MASK;
+		} else if (da->rtbl_work_frags) {
+			if ((--fp)->nexthop == nh)
+				da->rtbl_work_frags--;
+			else
+				fp++;
+		}
+		fp->nexthop = nh;
+		heap_inject(da, start, end, preflen, nh);
+	}
+
+	return (0);
+}
+
+static int
+update_chunk(struct dxr_aux *da, uint32_t chunk)
+{
+	struct range_entry_long *fp;
+#if DXR_TRIE_BITS < 24
+	struct range_entry_short *fps;
+	uint32_t start, nh, i;
+#endif
+	struct heap_entry *fhp;
+	uint32_t first = chunk << DXR_RANGE_SHIFT;
+	uint32_t last = first | DXR_RANGE_MASK;
+
+	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
+		chunk_unref(da, chunk);
+
+	initheap(da, first, chunk);
+
+	fp = &da->range_tbl[da->rtbl_top].re;
+	da->rtbl_work_frags = 0;
+	fp->start = first & DXR_RANGE_MASK;
+	fp->nexthop = da->heap[0].nexthop;
+
+	da->dst.sin_addr.s_addr = htonl(first);
+	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);
+
+	da->work_chunk = chunk;
+	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
+	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
+	    dxr_walk, da);
+
+	/* Flush any remaining objects on the heap */
+	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
+	fhp = &da->heap[da->heap_index];
+	while (fhp->preflen > DXR_TRIE_BITS) {
+		uint32_t oend = fhp->end;
+
+		if (da->heap_index > 0) {
+			fhp--;
+			da->heap_index--;
+		} else
+			initheap(da, fhp->end + 1, chunk);
+		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
+			/* Have we crossed the upper chunk boundary? */
+			if (oend >= last)
+				break;
+			fp++;
+			da->rtbl_work_frags++;
+			fp->start = (oend + 1) & DXR_RANGE_MASK;
+			fp->nexthop = fhp->nexthop;
+		}
+	}
+
+	/* Direct hit if the chunk contains only a single fragment */
+	if (da->rtbl_work_frags == 0) {
+		da->direct_tbl[chunk].base = fp->nexthop;
+		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
+		return (0);
+	}
+
+	da->direct_tbl[chunk].base = da->rtbl_top;
+	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;
+
+#if DXR_TRIE_BITS < 24
+	/* Check whether the chunk can be more compactly encoded */
+	fp = &da->range_tbl[da->rtbl_top].re;
+	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
+		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
+			break;
+	if (i == da->rtbl_work_frags + 1) {
+		fp = &da->range_tbl[da->rtbl_top].re;
+		fps = (void *) fp;
+		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
+			start = fp->start;
+			nh = fp->nexthop;
+			fps->start = start >> 8;
+			fps->nexthop = nh;
+		}
+		fps->start = start >> 8;
+		fps->nexthop = nh;
+		da->rtbl_work_frags >>= 1;
+		da->direct_tbl[chunk].fragments =
+		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
+	} else
+#endif
+	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
+		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
+		memmove(&da->range_tbl[da->rtbl_top + 1],
+		   &da->range_tbl[da->rtbl_top],
+		   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
+		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
+		da->rtbl_work_frags++;
+	}
+	da->rtbl_top += (da->rtbl_work_frags + 1);
+	return (chunk_ref(da, chunk));
+}
+
+static void
+dxr_build(struct dxr *dxr)
+{
+	struct dxr_aux *da = dxr->aux;
+	struct chunk_desc *cdp;
+	struct rib_rtable_info rinfo;
+	struct timeval t0, t1, t2, t3;
+	uint32_t r_size, dxr_tot_size;
+	uint32_t i, m, range_rebuild = 0;
+#ifdef DXR2
+	struct trie_desc *tp;
+	uint32_t d_tbl_size, dxr_x, d_size, x_size;
+	uint32_t ti, trie_rebuild = 0, prev_size = 0;
+#endif
+
+	KASSERT(dxr->d == NULL, ("dxr: d not free"));
+
+	if (da == NULL) {
+		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
+		if (da == NULL)
+			return;
+		dxr->aux = da;
+		da->fibnum = dxr->fibnum;
+		da->refcnt = 1;
+		LIST_INIT(&da->all_chunks);
+		LIST_INIT(&da->all_trie);
+		da->rtbl_size = RTBL_SIZE_INCR;
+		da->range_tbl = NULL;
+		da->xtbl_size = XTBL_SIZE_INCR;
+		da->x_tbl = NULL;
+		bzero(&da->dst, sizeof(da->dst));
+		bzero(&da->mask, sizeof(da->mask));
+		da->dst.sin_len = sizeof(da->dst);
+		da->mask.sin_len = sizeof(da->mask);
+		da->dst.sin_family = AF_INET;
+		da->mask.sin_family = AF_INET;
+	}
+	if (da->range_tbl == NULL) {
+		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
+		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
+		if (da->range_tbl == NULL)
+			return;
+		range_rebuild = 1;
+	}
+#ifdef DXR2
+	if (da->x_tbl == NULL) {
+		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
+		    M_DXRAUX, M_NOWAIT);
+		if (da->x_tbl == NULL)
+			return;
+		trie_rebuild = 1;
+	}
+#endif
+	da->fd = dxr->fd;
+
+	microuptime(&t0);
+
+	dxr->nh_tbl = fib_get_nhop_array(da->fd);
+	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
+
+	if (da->updates_low > da->updates_high ||
+	    da->unused_chunks_cnt > V_max_range_holes)
+		range_rebuild = 1;
+	if (range_rebuild) {
+		/* Bulk cleanup */
+		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
+		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
+			LIST_REMOVE(cdp, cd_all_le);
+			uma_zfree(chunk_zone, cdp);
+		}
+		LIST_INIT(&da->unused_chunks);
+		da->all_chunks_cnt = da->unused_chunks_cnt = 0;
+		da->rtbl_top = 0;
+		da->updates_low = 0;
+		da->updates_high = DIRECT_TBL_SIZE - 1;
+		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
+		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
+			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
+			da->direct_tbl[i].base = 0;
+		}
+	}
+	da->prefixes = rinfo.num_prefixes;
+
+	/* DXR: construct direct & range table */
+	for (i = da->updates_low; i <= da->updates_high; i++) {
+		m = da->updates_mask[i >> 5] >> (i & 0x1f);
+		if (m == 0)
+			i |= 0x1f;
+		else if (m & 1 && update_chunk(da, i) != 0)
+			return;
+	}
+	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
+	microuptime(&t1);
+
+#ifdef DXR2
+	if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes ||
+	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
+		trie_rebuild = 1;
+	if (trie_rebuild) {
+		da->trie_rebuilt_prefixes = da->prefixes;
+		da->d_bits = DXR_D;
+		da->updates_low = 0;
+		da->updates_high = DIRECT_TBL_SIZE - 1;
+	}
+
+dxr2_try_squeeze:
+	if (trie_rebuild) {
+		/* Bulk cleanup */
+		bzero(da->trietbl, sizeof(da->trietbl));
*** 351 LINES SKIPPED ***


More information about the dev-commits-src-branches mailing list