git: f5baf8bb12f3 - Add modular fib lookup framework.

Alexander V. Chernikov melifaro at FreeBSD.org
Fri Dec 25 11:36:18 UTC 2020


The branch main has been updated by melifaro:

URL: https://cgit.FreeBSD.org/src/commit/?id=f5baf8bb12f39d0e8d64508c47eb6c4386ef716d

commit f5baf8bb12f39d0e8d64508c47eb6c4386ef716d
Author:     Alexander V. Chernikov <melifaro at FreeBSD.org>
AuthorDate: 2020-12-25 10:39:52 +0000
Commit:     Alexander V. Chernikov <melifaro at FreeBSD.org>
CommitDate: 2020-12-25 11:33:17 +0000

    Add modular fib lookup framework.
    
    This change introduces a framework that allows dynamically
     attaching or detaching longest prefix match (lpm) lookup algorithms
     to speed up datapath route table lookups.
    
    Framework takes care of handling initial synchronisation,
     route subscription, nhop/nhop groups reference and indexing,
     dataplane attachments and fib instance algorithm setup/teardown.
    Framework features automatic algorithm selection, allowing for
     picking the best matching algorithm on-the-fly based on the
     amount of routes in the routing table.
    
    Currently framework code is guarded under FIB_ALGO config option.
    An idea is to enable it by default in the next couple of weeks.
    
    The following algorithms are provided by default:
    IPv4:
    * bsearch4 (lockless binary search in a special IP array), tailored for
      small-fib (<16 routes)
    * radix4_lockless (lockless immutable radix, re-created on every rtable change),
      tailored for small-fib (<1000 routes)
    * radix4 (base system radix backend)
    * dpdk_lpm4 (DPDK DIR24-8-based lookups), lockless datastructure, optimized
      for large-fib (D27412)
    IPv6:
    * radix6_lockless (lockless immutable radix, re-created on every rtable change),
      tailored for small-fib (<1000 routes)
    * radix6 (base system radix backend)
    * dpdk_lpm6 (DPDK DIR24-8-based lookups), lockless datastructure, optimized
      for large-fib (D27412)
    
    Performance changes:
    Micro benchmarks (I7-7660U, single-core lookups, 2048k dst, code in D27604):
    IPv4:
    8 routes:
      radix4: ~20mpps
      radix4_lockless: ~24.8mpps
      bsearch4: ~69mpps
      dpdk_lpm4: ~67 mpps
    700k routes:
      radix4_lockless: 3.3mpps
      dpdk_lpm4: 46mpps
    
    IPv6:
    8 routes:
      radix6_lockless: ~20mpps
      dpdk_lpm6: ~70mpps
    100k routes:
      radix6_lockless: 13.9mpps
      dpdk_lpm6: 57mpps
    
    Forwarding benchmarks:
    + 10-15% IPv4 forwarding performance (small-fib, bsearch4)
    + 25% IPv4 forwarding performance (full-view, dpdk_lpm4)
    + 20% IPv6 forwarding performance (full-view, dpdk_lpm6)
    
    Control:
    Framework adds the following runtime sysctls:
    
    List algos
    * net.route.algo.inet.algo_list: bsearch4, radix4_lockless, radix4
    * net.route.algo.inet6.algo_list: radix6_lockless, radix6, dpdk_lpm6
    Debug level (7=LOG_DEBUG, per-route)
    net.route.algo.debug_level: 5
    Algo selection (currently only for fib 0):
    net.route.algo.inet.algo: bsearch4
    net.route.algo.inet6.algo: radix6_lockless
    
    Support for manually changing algos in non-default fib will be added
    soon. Some sysctl names will be changed in the near future.
    
    Differential Revision: https://reviews.freebsd.org/D27401
---
 sys/conf/files               |    3 +
 sys/conf/options             |    1 +
 sys/net/route.c              |    8 +
 sys/net/route/fib_algo.c     | 1608 ++++++++++++++++++++++++++++++++++++++++++
 sys/net/route/fib_algo.h     |  109 +++
 sys/net/route/route_tables.c |   29 +-
 sys/net/route/route_var.h    |   10 +
 sys/netinet/in_fib.c         |   37 +
 sys/netinet/in_fib_algo.c    |  765 ++++++++++++++++++++
 sys/netinet6/in6_fib.c       |   35 +
 sys/netinet6/in6_fib.h       |    2 +
 sys/netinet6/in6_fib_algo.c  |  361 ++++++++++
 12 files changed, 2962 insertions(+), 6 deletions(-)

diff --git a/sys/conf/files b/sys/conf/files
index aa376b0bb9a3..2a71e21ace71 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4178,6 +4178,7 @@ net/route/nhgrp_ctl.c		optional route_mpath
 net/route/nhop.c		standard
 net/route/nhop_ctl.c		standard
 net/route/nhop_utils.c		standard
+net/route/fib_algo.c		optional fib_algo
 net/route/route_ctl.c		standard
 net/route/route_ddb.c		optional ddb
 net/route/route_helpers.c	standard
@@ -4329,6 +4330,7 @@ netinet/in_debug.c		optional inet ddb
 netinet/in_kdtrace.c		optional inet | inet6
 netinet/ip_carp.c		optional inet carp | inet6 carp
 netinet/in_fib.c		optional inet
+netinet/in_fib_algo.c		optional inet fib_algo
 netinet/in_gif.c		optional gif inet | netgraph_gif inet
 netinet/ip_gre.c		optional gre inet
 netinet/ip_id.c			optional inet
@@ -4405,6 +4407,7 @@ netinet6/icmp6.c		optional inet6
 netinet6/in6.c			optional inet6
 netinet6/in6_cksum.c		optional inet6
 netinet6/in6_fib.c		optional inet6
+netinet6/in6_fib_algo.c		optional inet6 fib_algo
 netinet6/in6_gif.c		optional gif inet6 | netgraph_gif inet6
 netinet6/in6_ifattach.c		optional inet6
 netinet6/in6_jail.c		optional inet6
diff --git a/sys/conf/options b/sys/conf/options
index 24f984930dc2..68c8a2e0d3ca 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -454,6 +454,7 @@ PCBGROUP		opt_pcbgroup.h
 PF_DEFAULT_TO_DROP	opt_pf.h
 ROUTE_MPATH		opt_route.h
 ROUTETABLES		opt_route.h
+FIB_ALGO		opt_route.h
 RSS			opt_rss.h
 SLIP_IFF_OPTS		opt_slip.h
 TCPDEBUG
diff --git a/sys/net/route.c b/sys/net/route.c
index 6c051e389d82..2f4ae40e83aa 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -151,6 +151,14 @@ void
 rt_table_destroy(struct rib_head *rh)
 {
 
+	RIB_WLOCK(rh);
+	rh->rib_dying = true;
+	RIB_WUNLOCK(rh);
+
+#ifdef FIB_ALGO
+	fib_destroy_rib(rh);
+#endif
+
 	tmproutes_destroy(rh);
 
 	rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head);
diff --git a/sys/net/route/fib_algo.c b/sys/net/route/fib_algo.c
new file mode 100644
index 000000000000..afb009d4c8cf
--- /dev/null
+++ b/sys/net/route/fib_algo.c
@@ -0,0 +1,1608 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_route.h"
+
+#include <sys/param.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/sbuf.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/queue.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/route/fib_algo.h>
+
+#include <machine/stdarg.h>
+
+/*
+ * Fib lookup framework.
+ *
+ * This framework enables accelerated longest-prefix-match lookups for the
+ *  routing tables by adding the ability to dynamically attach/detach lookup
+ *  algorithms implementation to/from the datapath.
+ *
+ * flm - fib lookup modules - implementation of particular lookup algorithm
+ * fd - fib data - instance of an flm bound to specific routing table
+ *
+ * This file provides main framework functionality.
+ *
+ * The following are the features provided by the framework
+ *
+ * 1) nexthops abstraction -> provides transparent referencing, indexing
+ *   and efficient idx->ptr mappings for nexthop and nexthop groups.
+ * 2) Routing table synchronisation
+ * 3) dataplane attachment points
+ * 4) automatic algorithm selection based on the provided preference.
+ *
+ *
+ * DATAPATH
+ * For each supported address family, there is an allocated array of fib_dp
+ *  structures, indexed by fib number. Each array entry contains callback function
+ *  and its argument. This function will be called with a family-specific lookup key,
+ *  scope and provided argument. This array gets re-created every time when new algo
+ *  instance gets created. Please take a look at the replace_rtables_family() function
+ *  for more details.
+ *
+ */
+
+SYSCTL_DECL(_net_route);
+SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "Fib algorithm lookups");
+
+#ifdef INET6
+bool algo_fixed_inet6 = false;
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "IPv6 longest prefix match lookups");
+#endif
+#ifdef INET
+bool algo_fixed_inet = false;
+SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "IPv4 longest prefix match lookups");
+#endif
+
+struct nhop_ref_table {
+	uint32_t		count;
+	int32_t			refcnt[0];
+};
+
+/*
+ * Data structure for the fib lookup instance tied to the particular rib.
+ */
+struct fib_data {
+	uint32_t		number_nhops;	/* current # of nhops */
+	uint8_t			hit_nhops;	/* true if out of nhop limit */
+	uint8_t			init_done;	/* true if init is completed */
+	uint32_t		fd_dead:1;	/* Scheduled for deletion */
+	uint32_t		fd_linked:1;	/* true if linked */
+	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
+	uint32_t		fd_force_eval:1;/* true if forced algo re-evaluation scheduled */
+	uint8_t			fd_family;	/* family */
+	uint32_t		fd_fibnum;	/* fibnum */
+	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
+	uint32_t		fd_algo_mask;	/* bitmask for algo data */
+	struct callout		fd_callout;	/* rebuild callout */
+	void			*fd_algo_data;	/* algorithm data */
+	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
+	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
+	struct rib_head		*fd_rh;		/* RIB table we're attached to */
+	struct rib_subscription	*fd_rs;		/* storing table subscription */
+	struct fib_dp		fd_dp;		/* fib datapath data */
+	struct vnet		*fd_vnet;	/* vnet fib belongs to */
+	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
+	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
+	uint32_t		fd_num_changes;	/* number of changes since last callout */
+	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
+};
+
+static void rebuild_fd_callout(void *_data);
+static void destroy_fd_instance_epoch(epoch_context_t ctx);
+static enum flm_op_result attach_datapath(struct fib_data *fd);
+static bool is_idx_free(struct fib_data *fd, uint32_t index);
+static void set_algo_fixed(struct rib_head *rh);
+
+static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
+static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
+
+static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
+    struct fib_lookup_module *orig_flm);
+static void fib_unref_algo(struct fib_lookup_module *flm);
+static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
+
+struct mtx fib_mtx;
+#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
+#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
+#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
+
+
+/* Algorithm has to be this percent better than the current to switch */
+#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
+/* Schedule algo re-evaluation X seconds after a change */
+#define	ALGO_EVAL_DELAY_MS	30000
+/* Force algo re-evaluation after X changes */
+#define	ALGO_EVAL_NUM_ROUTES	100
+/* Try to setup algorithm X times */
+#define	FIB_MAX_TRIES		32
+/* Max amount of supported nexthops */
+#define	FIB_MAX_NHOPS		262144
+#define	FIB_CALLOUT_DELAY_MS	50
+
+/* Debug */
+static int flm_debug_level = LOG_NOTICE;
+SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
+    &flm_debug_level, 0, "debuglevel");
+#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
+
+#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
+#define	ALGO_PRINTF(_fmt, ...)	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__)
+#define	_ALGO_PRINTF(_fib, _fam, _aname, _func, _fmt, ...) \
+    printf("[fib_algo] %s.%u (%s) %s: " _fmt "\n",\
+    print_family(_fam), _fib, _aname, _func, ## __VA_ARGS__)
+#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
+    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
+#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
+    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
+}
+#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
+#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
+    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
+    __func__, _fmt, ## __VA_ARGS__);					\
+}
+#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
+#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
+#else
+#define	FD_PRINTF_LOG_DEBUG()
+#endif
+#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
+#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
+#else
+#define	FD_PRINTF_LOG_INFO()
+#endif
+#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
+#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
+#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
+
+
+/* List of all registered lookup algorithms */
+static TAILQ_HEAD(, fib_lookup_module)	all_algo_list;
+
+/* List of all fib lookup instances in the vnet */
+VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
+#define	V_fib_data_list	VNET(fib_data_list)
+
+/* Datastructure for storing non-transient fib lookup module failures */
+struct fib_error {
+	int				fe_family;
+	uint32_t			fe_fibnum;	/* failed rtable */
+	struct fib_lookup_module	*fe_flm;	/* failed module */
+	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
+};
+VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
+#define	V_fib_error_list VNET(fib_error_list)
+
+/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
+struct fib_dp_header {
+	struct epoch_context	fdh_epoch_ctx;
+	uint32_t		fdh_num_tables;
+	struct fib_dp		fdh_idx[0];
+};
+
+/*
+ * Tries to add new non-transient algorithm error to the list of
+ *  errors.
+ * Returns true on success.
+ */
+static bool
+flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
+{
+	struct fib_error *fe;
+
+	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
+	if (fe == NULL)
+		return (false);
+	fe->fe_flm = flm;
+	fe->fe_family = flm->flm_family;
+	fe->fe_fibnum = fibnum;
+
+	FIB_MOD_LOCK();
+	/* Avoid duplicates by checking if error already exists first */
+	if (flm_error_check(flm, fibnum)) {
+		FIB_MOD_UNLOCK();
+		free(fe, M_TEMP);
+		return (true);
+	}
+	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
+	FIB_MOD_UNLOCK();
+
+	return (true);
+}
+
+/*
+ * True if non-transient error has been registered for @flm in @fibnum.
+ */
+static bool
+flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
+{
+	const struct fib_error *fe;
+
+	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
+		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
+			return (true);
+	}
+
+	return (false);
+}
+
+/*
+ * Clear all errors of algo specified by @flm.
+ */
+static void
+fib_error_clear_flm(struct fib_lookup_module *flm)
+{
+	struct fib_error *fe, *fe_tmp;
+
+	FIB_MOD_LOCK_ASSERT();
+
+	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
+		if (fe->fe_flm == flm) {
+			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
+			free(fe, M_TEMP);
+		}
+	}
+}
+
+/*
+ * Clears all errors in current VNET.
+ */
+static void
+fib_error_clear()
+{
+	struct fib_error *fe, *fe_tmp;
+
+	FIB_MOD_LOCK_ASSERT();
+
+	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
+		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
+		free(fe, M_TEMP);
+	}
+}
+
+static const char *
+print_family(int family)
+{
+
+	if (family == AF_INET)
+		return ("inet");
+	else if (family == AF_INET6)
+		return ("inet6");
+	else
+		return ("unknown");
+}
+
+/*
+ * Debug function used by lookup algorithms.
+ * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
+ */
+void
+fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
+{
+	char buf[128];
+	va_list ap;
+
+	if (level > flm_debug_level)
+		return;
+
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+
+	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
+	    func, "%s", buf);
+}
+
+/*
+ * Outputs list of algorithms supported by the provided address family.
+ */
+static int
+print_algos_sysctl(struct sysctl_req *req, int family)
+{
+	struct fib_lookup_module *flm;
+	struct sbuf sbuf;
+	int error, count = 0;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error == 0) {
+		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
+		TAILQ_FOREACH(flm, &all_algo_list, entries) {
+			if (flm->flm_family == family) {
+				if (count++ > 0)
+					sbuf_cat(&sbuf, ", ");
+				sbuf_cat(&sbuf, flm->flm_name);
+			}
+		}
+		error = sbuf_finish(&sbuf);
+		sbuf_delete(&sbuf);
+	}
+	return (error);
+}
+
+#ifdef INET6
+static int
+print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
+{
+
+	return (print_algos_sysctl(req, AF_INET6));
+}
+SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
+#endif
+
+#ifdef INET
+static int
+print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
+{
+
+	return (print_algos_sysctl(req, AF_INET));
+}
+SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
+#endif
+
+/*
+ * Calculate delay between repeated failures.
+ * Returns current delay in milliseconds.
+ */
+static uint32_t
+callout_calc_delay_ms(struct fib_data *fd)
+{
+	uint32_t shift;
+
+	if (fd->fd_failed_rebuilds > 10)
+		shift = 10;
+	else
+		shift = fd->fd_failed_rebuilds;
+
+	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
+}
+
+static void
+schedule_callout(struct fib_data *fd, int delay_ms)
+{
+
+	callout_reset_sbt(&fd->fd_callout, 0, SBT_1MS * delay_ms,
+	    rebuild_fd_callout, fd, 0);
+}
+
+static void
+schedule_fd_rebuild(struct fib_data *fd)
+{
+
+	FIB_MOD_LOCK();
+	if (!fd->fd_need_rebuild) {
+		fd->fd_need_rebuild = true;
+
+		/*
+		 * Potentially re-schedules pending callout
+		 *  initiated by schedule_algo_eval.
+		 */
+		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuilt");
+		schedule_callout(fd, callout_calc_delay_ms(fd));
+	}
+	FIB_MOD_UNLOCK();
+}
+
+static void
+schedule_algo_eval(struct fib_data *fd)
+{
+
+	if (fd->fd_num_changes++ == 0) {
+		/* Start callout to consider switch */
+		FIB_MOD_LOCK();
+		if (!callout_pending(&fd->fd_callout))
+			schedule_callout(fd, ALGO_EVAL_DELAY_MS);
+		FIB_MOD_UNLOCK();
+	} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
+		/* Reset callout to exec immediately */
+		FIB_MOD_LOCK();
+		if (!fd->fd_need_rebuild) {
+			fd->fd_force_eval = true;
+			schedule_callout(fd, 1);
+		}
+		FIB_MOD_UNLOCK();
+	}
+}
+
+/*
+ * Rib subscription handler. Checks if the algorithm is ready to
+ *  receive updates, handles nexthop refcounting and passes change
+ *  data to the algorithm callback.
+ */
+static void
+handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+    void *_data)
+{
+	struct fib_data *fd = (struct fib_data *)_data;
+	enum flm_op_result result;
+
+	RIB_WLOCK_ASSERT(rnh);
+
+	/*
+	 * There is a small gap between subscribing for route changes
+	 *  and initiating rtable dump. Avoid receiving route changes
+	 *  prior to finishing rtable dump by checking `init_done`.
+	 */
+	if (!fd->init_done)
+		return;
+	/*
+	 * If algo requested rebuild, stop sending updates by default.
+	 * This simplifies nexthop refcount handling logic.
+	 */
+	if (fd->fd_need_rebuild)
+		return;
+
+	/* Consider scheduling algorithm re-evaluation */
+	schedule_algo_eval(fd);
+
+	/*
+	 * Maintain guarantee that every nexthop returned by the dataplane
+	 *  lookup has > 0 refcount, so can be safely referenced within current
+	 *  epoch.
+	 */
+	if (rc->rc_nh_new != NULL) {
+		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
+			/* ran out of indexes */
+			schedule_fd_rebuild(fd);
+			return;
+		}
+	}
+
+	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
+
+	switch (result) {
+	case FLM_SUCCESS:
+		/* Unref old nexthop on success */
+		if (rc->rc_nh_old != NULL)
+			fib_unref_nhop(fd, rc->rc_nh_old);
+		break;
+	case FLM_REBUILD:
+
+		/*
+		 * Algo is not able to apply the update.
+		 * Schedule algo rebuild.
+		 */
+		schedule_fd_rebuild(fd);
+		break;
+	case FLM_ERROR:
+
+		/*
+		 * Algo reported a non-recoverable error.
+		 * Record the error and schedule rebuild, which will
+		 *  trigger best algo selection.
+		 */
+		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
+		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
+			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
+		schedule_fd_rebuild(fd);
+	}
+}
+
+static void
+estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
+{
+
+	if (old_fd == NULL) {
+		// TODO: read from rtable
+		fd->number_nhops = 16;
+		return;
+	}
+
+	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
+		fd->number_nhops = 2 * old_fd->number_nhops;
+	else
+		fd->number_nhops = old_fd->number_nhops;
+}
+
+struct walk_cbdata {
+	struct fib_data		*fd;
+	flm_dump_t		*func;
+	enum flm_op_result	result;
+};
+
+/*
+ * Handler called after all rtenties have been dumped.
+ * Performs post-dump framework checks and calls
+ * algo:flm_dump_end_cb().
+ *
+ * Updates walk_cbdata result.
+ */
+static void
+sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
+{
+	struct walk_cbdata *w = (struct walk_cbdata *)_data;
+	struct fib_data *fd = w->fd;
+
+	RIB_WLOCK_ASSERT(w->fd->fd_rh);
+
+	if (rnh->rib_dying) {
+		w->result = FLM_ERROR;
+		return;
+	}
+
+	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
+		return;
+
+	/* Post-dump hook, dump successful */
+
+	if (fd->hit_nhops) {
+		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
+		    fd->nh_ref_table->count);
+		w->result = FLM_REBUILD;
+		return;
+	}
+
+	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
+
+	if (w->result == FLM_SUCCESS) {
+		/* Mark init as done to allow routing updates */
+		fd->init_done = 1;
+	}
+}
+
+/*
+ * Callback for each entry in rib.
+ * Calls algo:flm_dump_rib_item_cb func as a part of initial
+ *  route table synchronisation.
+ */
+static int
+sync_algo_cb(struct rtentry *rt, void *_data)
+{
+	struct walk_cbdata *w = (struct walk_cbdata *)_data;
+
+	RIB_WLOCK_ASSERT(w->fd->fd_rh);
+
+	if (w->result == FLM_SUCCESS && w->func) {
+
+		/*
+		 * Reference nexthops to maintain guarantee that
+		 *  each nexthop returned by datapath has > 0 references
+		 *  and can be safely referenced within current epoch.
+		 */
+		struct nhop_object *nh = rt_get_raw_nhop(rt);
+		if (fib_ref_nhop(w->fd, nh) != 0)
+			w->result = w->func(rt, w->fd->fd_algo_data);
+		else
+			w->result = FLM_REBUILD;
+	}
+
+	return (0);
+}
+
+/*
+ * Dump all routing table state to the algo instance.
+ */
+static enum flm_op_result
+sync_algo(struct fib_data *fd)
+{
+	struct walk_cbdata w = {
+		.fd = fd,
+		.func = fd->fd_flm->flm_dump_rib_item_cb,
+		.result = FLM_SUCCESS,
+	};
+
+	rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
+
+	FD_PRINTF(LOG_INFO, fd, "initial dump completed.");
+
+	return (w.result);
+}
+
+/*
+ * Schedules epoch-backed @fd instance deletion.
+ * * Unlinks @fd from the list of active algo instances.
+ * * Removes rib subscription.
+ * * Stops callout.
+ * * Schedules actual deletion.
+ *
+ * Assume @fd is already unlinked from the datapath.
+ */
+static int
+schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
+{
+	bool is_dead;
+
+	NET_EPOCH_ASSERT();
+
+	FIB_MOD_LOCK();
+	is_dead = fd->fd_dead;
+	if (!is_dead)
+		fd->fd_dead = true;
+	if (fd->fd_linked) {
+		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
+		fd->fd_linked = false;
+	}
+	FIB_MOD_UNLOCK();
+	if (is_dead)
+		return (0);
+
+	FD_PRINTF(LOG_INFO, fd, "DETACH");
+
+	if (fd->fd_rs != NULL)
+		rib_unsibscribe(fd->fd_rs);
+
+	/*
+	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
+	 * will be executed, hence no _new_ callout schedules will happen.
+	 *
+	 * There can be 2 possible scenarios here:
+	 * 1) we're running inside a callout when we're deleting ourselves
+	 *  due to migration to a newer fd
+	 * 2) we're running from rt_table_destroy() and callout is scheduled
+	 *  for execution OR is executing
+	 *
+	 * For (2) we need to wait for the callout termination, as the routing table
+	 * will be destroyed after this function returns.
+	 * For (1) we cannot call drain, but can ensure that this is the last invocation.
+	 */
+
+	if (in_callout)
+		callout_stop(&fd->fd_callout);
+	else
+		callout_drain(&fd->fd_callout);
+
+	FD_PRINTF(LOG_INFO, fd, "destroying old instance");
+	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
+	    &fd->fd_epoch_ctx);
+
+	return (0);
+}
+
+/*
+ * Wipe all fd instances from the list matching rib specified by @rh.
+ * If @keep_first is set, remove all but the first record.
+ */
+static void
+fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
+{
+	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
+	struct fib_data *fd, *fd_tmp;
+	struct epoch_tracker et;
+
+	FIB_MOD_LOCK();
+	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
+		if (fd->fd_rh == rh) {
+			if (keep_first) {
+				keep_first = false;
+				continue;
+			}
+			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
+			fd->fd_linked = false;
+			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
+		}
+	}
+	FIB_MOD_UNLOCK();
+
+	/* Pass 2: remove each entry */
+	NET_EPOCH_ENTER(et);
+	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
+		schedule_destroy_fd_instance(fd, in_callout);
+	}
+	NET_EPOCH_EXIT(et);
+}
+
+void
+fib_destroy_rib(struct rib_head *rh)
+{
+
+	/*
+	 * rnh has `rib_dying` flag set, so setup of new fd's will fail at
+	 *  sync_algo() stage, preventing new entries to be added to the list
+	 *  of active algos. Remove all existing entries for the particular rib.
+	 */
+	fib_cleanup_algo(rh, false, false);
+}
+
+/*
+ * Finalises fd destruction by freeing all fd resources.
+ */
+static void
+destroy_fd_instance(struct fib_data *fd)
+{
+
+	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
+
+	/* Call destroy callback first */
+	if (fd->fd_algo_data != NULL)
+		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
+
+	/* Nhop table */
+	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
+		for (int i = 0; i < fd->number_nhops; i++) {
+			if (!is_idx_free(fd, i)) {
+				FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p",
+				    i, fd->nh_idx[i]);
+				nhop_free_any(fd->nh_idx[i]);
+			}
+		}
+		free(fd->nh_idx, M_RTABLE);
+	}
+	if (fd->nh_ref_table != NULL)
+		free(fd->nh_ref_table, M_RTABLE);
+
+	fib_unref_algo(fd->fd_flm);
+
+	free(fd, M_RTABLE);
+}
+
+/*
+ * Epoch callback indicating fd is safe to destroy
+ */
+static void
+destroy_fd_instance_epoch(epoch_context_t ctx)
+{
+	struct fib_data *fd;
+
+	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
+
+	destroy_fd_instance(fd);
+}
+
+/*
+ * Tries to setup fd instance.
+ * - Allocates fd/nhop table
+ * - Runs algo:flm_init_cb algo init
+ * - Subscribes fd to the rib
+ * - Runs rtable dump
+ * - Adds instance to the list of active instances.
+ *
+ * Returns: operation result. Fills in @pfd with resulting fd on success.
+ *
+ */
+static enum flm_op_result
+try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
+    struct fib_data *old_fd, struct fib_data **pfd)
+{
+	struct fib_data *fd;
+	size_t size;
+	enum flm_op_result result;
+
*** 2314 LINES SKIPPED ***


More information about the dev-commits-src-all mailing list