git: 6bc5d49aba14 - stable/13 - MFC 151ec796a230: Fix the design problem with delayed algorithm sync.

Alexander V. Chernikov melifaro at FreeBSD.org
Thu Feb 4 22:35:01 UTC 2021


The branch stable/13 has been updated by melifaro:

URL: https://cgit.FreeBSD.org/src/commit/?id=6bc5d49aba14c6ce102d4392cd53bcdf8b61c2fc

commit 6bc5d49aba14c6ce102d4392cd53bcdf8b61c2fc
Author:     Alexander V. Chernikov <melifaro at FreeBSD.org>
AuthorDate: 2021-01-30 22:45:46 +0000
Commit:     Alexander V. Chernikov <melifaro at FreeBSD.org>
CommitDate: 2021-02-04 22:34:15 +0000

    MFC 151ec796a230: Fix the design problem with delayed algorithm sync.
    
    Currently, if the immutable algorithm like bsearch or radix_lockless
     receives rtable update notification, it schedules algorithm rebuild.
    This rebuild is executed by the callout after ~50 milliseconds.
    
    It is possible that a script adding an interface address and then a route
     with the gateway bound to that address will fail. It can happen due
     to the fact that the fib is not updated by the time the route addition
     request arrives.
    
    Fix this by allowing synchronous algorithm rebuilds based on certain
     conditions. By default, these conditions assume:
    1) less than net.route.algo.fib_sync_limit=100 routes
    2) routes without gateway.
    
    * Move algo instance build entirely under rib WLOCK.
     Rib lock is only used for control plane (except radix algo, but there
      are no rebuilds).
    * Add rib_walk_ext_locked() function to allow RIB iteration with
     rib lock already held.
    * Fix rare potential callout use-after-free for fds by binding fd
     callout to the relevant rib rmlock. In that case, callout_stop()
     under rib WLOCK guarantees no callout will be executed afterwards.
---
 sys/net/route/fib_algo.c      | 110 ++++++++++++++++++++++++++++--------------
 sys/net/route/route_ctl.h     |   2 +
 sys/net/route/route_helpers.c |  17 +++++--
 3 files changed, 87 insertions(+), 42 deletions(-)

diff --git a/sys/net/route/fib_algo.c b/sys/net/route/fib_algo.c
index f7a8b3f82431..21b6ba924069 100644
--- a/sys/net/route/fib_algo.c
+++ b/sys/net/route/fib_algo.c
@@ -105,6 +105,11 @@ SYSCTL_DECL(_net_route);
 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Fib algorithm lookups");
 
+VNET_DEFINE(int, fib_sync_limit) = 100;
+#define	V_fib_sync_limit	VNET(fib_sync_limit)
+SYSCTL_INT(_net_route_algo, OID_AUTO, fib_sync_limit, CTLFLAG_RW | CTLFLAG_VNET,
+    &VNET_NAME(fib_sync_limit), 0, "Guarantee synchronous fib till route limit");
+
 #ifdef INET6
 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
 #define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
@@ -465,7 +470,8 @@ static void
 schedule_fd_rebuild(struct fib_data *fd, const char *reason)
 {
 
-	FIB_MOD_LOCK();
+	RIB_WLOCK_ASSERT(fd->fd_rh);
+
 	if (!fd->fd_need_rebuild) {
 		fd->fd_need_rebuild = true;
 
@@ -477,30 +483,52 @@ schedule_fd_rebuild(struct fib_data *fd, const char *reason)
 		    reason, fd->fd_failed_rebuilds);
 		schedule_callout(fd, callout_calc_delay_ms(fd));
 	}
-	FIB_MOD_UNLOCK();
 }
 
 static void
 schedule_algo_eval(struct fib_data *fd)
 {
 
+	RIB_WLOCK_ASSERT(fd->fd_rh);
+
 	if (fd->fd_num_changes++ == 0) {
 		/* Start callout to consider switch */
-		FIB_MOD_LOCK();
 		if (!callout_pending(&fd->fd_callout))
 			schedule_callout(fd, ALGO_EVAL_DELAY_MS);
-		FIB_MOD_UNLOCK();
 	} else if (fd->fd_num_changes > ALGO_EVAL_NUM_ROUTES && !fd->fd_force_eval) {
 		/* Reset callout to exec immediately */
-		FIB_MOD_LOCK();
 		if (!fd->fd_need_rebuild) {
 			fd->fd_force_eval = true;
 			schedule_callout(fd, 1);
 		}
-		FIB_MOD_UNLOCK();
 	}
 }
 
+static bool
+need_immediate_rebuild(struct fib_data *fd, struct rib_cmd_info *rc)
+{
+	struct nhop_object *nh;
+
+	if ((V_fib_sync_limit == 0) || (fd->fd_rh->rnh_prefixes <= V_fib_sync_limit))
+		return (true);
+
+	/* Sync addition/removal of interface routes */
+	switch (rc->rc_cmd) {
+	case RTM_ADD:
+		nh = rc->rc_nh_new;
+		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
+			return (true);
+		break;
+	case RTM_DELETE:
+		nh = rc->rc_nh_old;
+		if (!NH_IS_NHGRP(nh) && (!(nh->nh_flags & NHF_GATEWAY)))
+			return (true);
+		break;
+	}
+
+	return (false);
+}
+
 /*
  * Rib subscription handler. Checks if the algorithm is ready to
  *  receive updates, handles nexthop refcounting and passes change
@@ -559,7 +587,15 @@ handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
 		 * Algo is not able to apply the update.
 		 * Schedule algo rebuild.
 		 */
-		schedule_fd_rebuild(fd, "algo requested rebuild");
+		if (!need_immediate_rebuild(fd, rc)) {
+			schedule_fd_rebuild(fd, "algo requested rebuild");
+			break;
+		}
+
+		fd->fd_need_rebuild = true;
+		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
+		if (!rebuild_fd(fd))
+			schedule_fd_rebuild(fd, "sync rebuild failed");
 		break;
 	case FLM_ERROR:
 
@@ -678,7 +714,7 @@ sync_algo(struct fib_data *fd)
 		.result = FLM_SUCCESS,
 	};
 
-	rib_walk_ext_internal(fd->fd_rh, true, sync_algo_cb, sync_algo_end_cb, &w);
+	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
 
 	FD_PRINTF(LOG_INFO, fd,
 	    "initial dump completed (rtable version: %d), result: %s",
@@ -702,6 +738,7 @@ schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
 	bool is_dead;
 
 	NET_EPOCH_ASSERT();
+	RIB_WLOCK_ASSERT(fd->fd_rh);
 
 	FIB_MOD_LOCK();
 	is_dead = fd->fd_dead;
@@ -718,27 +755,13 @@ schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
 	FD_PRINTF(LOG_INFO, fd, "DETACH");
 
 	if (fd->fd_rs != NULL)
-		rib_unsibscribe(fd->fd_rs);
+		rib_unsibscribe_locked(fd->fd_rs);
 
 	/*
 	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
 	 * will be executed, hence no _new_ callout schedules will happen.
-	 *
-	 * There can be 2 possible scenarious here:
-	 * 1) we're running inside a callout when we're deleting ourselves
-	 *  due to migration to a newer fd
-	 * 2) we're running from rt_table_destroy() and callout is scheduled
-	 *  for execution OR is executing
-	 *
-	 * For (2) we need to wait for the callout termination, as the routing table
-	 * will be destroyed after this function returns.
-	 * For (1) we cannot call drain, but can ensure that this is the last invocation.
 	 */
-
-	if (in_callout)
-		callout_stop(&fd->fd_callout);
-	else
-		callout_drain(&fd->fd_callout);
+	callout_stop(&fd->fd_callout);
 
 	epoch_call(net_epoch_preempt, destroy_fd_instance_epoch,
 	    &fd->fd_epoch_ctx);
@@ -774,7 +797,11 @@ fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
 	/* Pass 2: remove each entry */
 	NET_EPOCH_ENTER(et);
 	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
+		if (!in_callout)
+			RIB_WLOCK(fd->fd_rh);
 		schedule_destroy_fd_instance(fd, in_callout);
+		if (!in_callout)
+			RIB_WUNLOCK(fd->fd_rh);
 	}
 	NET_EPOCH_EXIT(et);
 }
@@ -870,7 +897,7 @@ try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
 	fd->fd_gen = ++fib_gen;
 	fd->fd_family = rh->rib_family;
 	fd->fd_fibnum = rh->rib_fibnum;
-	callout_init(&fd->fd_callout, 1);
+	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
 	fd->fd_vnet = curvnet;
 	fd->fd_flm = flm;
 
@@ -908,8 +935,8 @@ try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
 
 	/* Try to subscribe */
 	if (flm->flm_change_rib_item_cb != NULL) {
-		fd->fd_rs = rib_subscribe_internal(fd->fd_rh,
-		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE, 0);
+		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
+		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
 		if (fd->fd_rs == NULL) {
 			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
 			return (FLM_REBUILD);
@@ -946,13 +973,14 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
 {
 	struct fib_data *prev_fd, *new_fd;
-	struct epoch_tracker et;
 	enum flm_op_result result;
 
+	NET_EPOCH_ASSERT();
+	RIB_WLOCK_ASSERT(rh);
+
 	prev_fd = orig_fd;
 	new_fd = NULL;
 	for (int i = 0; i < FIB_MAX_TRIES; i++) {
-		NET_EPOCH_ENTER(et);
 		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
 
 		if ((result == FLM_SUCCESS) && attach)
@@ -962,7 +990,6 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
 			schedule_destroy_fd_instance(prev_fd, false);
 			prev_fd = NULL;
 		}
-		NET_EPOCH_EXIT(et);
 
 		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
 		    print_op_result(result));
@@ -990,14 +1017,12 @@ setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
 		if (result == FLM_ERROR)
 			flm_error_add(flm, rh->rib_fibnum);
 
-		NET_EPOCH_ENTER(et);
 		if ((prev_fd != NULL) && (prev_fd != orig_fd))
 			schedule_destroy_fd_instance(prev_fd, false);
 		if (new_fd != NULL) {
 			schedule_destroy_fd_instance(new_fd, false);
 			new_fd = NULL;
 		}
-		NET_EPOCH_EXIT(et);
 	}
 
 	*pfd = new_fd;
@@ -1013,12 +1038,15 @@ static void
 rebuild_fd_callout(void *_data)
 {
 	struct fib_data *fd = (struct fib_data *)_data;
+	struct epoch_tracker et;
 
 	FD_PRINTF(LOG_INFO, fd, "running callout rebuild");
 
+	NET_EPOCH_ENTER(et);
 	CURVNET_SET(fd->fd_vnet);
 	rebuild_fd(fd);
 	CURVNET_RESTORE();
+	NET_EPOCH_EXIT(et);
 }
 
 /*
@@ -1030,17 +1058,16 @@ rebuild_fd(struct fib_data *fd)
 {
 	struct fib_data *fd_new, *fd_tmp;
 	struct fib_lookup_module *flm_new = NULL;
-	struct epoch_tracker et;
 	enum flm_op_result result;
 	bool need_rebuild = false;
 
+	NET_EPOCH_ASSERT();
+	RIB_WLOCK_ASSERT(fd->fd_rh);
 
-	FIB_MOD_LOCK();
 	need_rebuild = fd->fd_need_rebuild;
 	fd->fd_need_rebuild = false;
 	fd->fd_force_eval = false;
 	fd->fd_num_changes = 0;
-	FIB_MOD_UNLOCK();
 
 	/* First, check if we're still OK to use this algo */
 	if (!is_algo_fixed(fd->fd_rh))
@@ -1069,9 +1096,7 @@ rebuild_fd(struct fib_data *fd)
 	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
 
 	/* Remove old instance */
-	NET_EPOCH_ENTER(et);
 	schedule_destroy_fd_instance(fd, true);
-	NET_EPOCH_EXIT(et);
 
 	return (true);
 }
@@ -1116,6 +1141,7 @@ set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl
 	char old_algo_name[32], algo_name[32];
 	struct rib_head *rh = NULL;
 	enum flm_op_result result;
+	struct epoch_tracker et;
 	int error;
 
 	/* Fetch current algo/rib for af/family */
@@ -1149,7 +1175,11 @@ set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl
 	}
 
 	fd = NULL;
+	NET_EPOCH_ENTER(et);
+	RIB_WLOCK(rh);
 	result = setup_fd_instance(flm, rh, NULL, &fd, true);
+	RIB_WUNLOCK(rh);
+	NET_EPOCH_EXIT(et);
 	fib_unref_algo(flm);
 	if (result != FLM_SUCCESS)
 		return (EINVAL);
@@ -1558,6 +1588,7 @@ fib_select_algo_initial(struct rib_head *rh)
 	struct fib_lookup_module *flm;
 	struct fib_data *fd = NULL;
 	enum flm_op_result result;
+	struct epoch_tracker et;
 	int error = 0;
 
 	flm = fib_check_best_algo(rh, NULL);
@@ -1567,7 +1598,12 @@ fib_select_algo_initial(struct rib_head *rh)
 	}
 	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
 
+	NET_EPOCH_ENTER(et);
+	RIB_WLOCK(rh);
 	result = setup_fd_instance(flm, rh, NULL, &fd, false);
+	RIB_WUNLOCK(rh);
+	NET_EPOCH_EXIT(et);
+
 	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
 	if (result == FLM_SUCCESS) {
 
diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h
index bd256e9f0834..46dba759f8df 100644
--- a/sys/net/route/route_ctl.h
+++ b/sys/net/route/route_ctl.h
@@ -75,6 +75,8 @@ void rib_walk_ext(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f,
     rib_walk_hook_f_t *hook_f, void *arg);
 void rib_walk_ext_internal(struct rib_head *rnh, bool wlock,
     rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg);
+void rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
+    rib_walk_hook_f_t *hook_f, void *arg);
 
 void rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f,
     void *arg, bool report);
diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c
index 88733fff419b..782e160ca51b 100644
--- a/sys/net/route/route_helpers.c
+++ b/sys/net/route/route_helpers.c
@@ -67,6 +67,17 @@ __FBSDID("$FreeBSD$");
  * RIB helper functions.
  */
 
+void
+rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
+    rib_walk_hook_f_t *hook_f, void *arg)
+{
+	if (hook_f != NULL)
+		hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
+	rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
+	if (hook_f != NULL)
+		hook_f(rnh, RIB_WALK_HOOK_POST, arg);
+}
+
 /*
  * Calls @wa_f with @arg for each entry in the table specified by
  * @af and @fibnum.
@@ -86,11 +97,7 @@ rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
 		RIB_WLOCK(rnh);
 	else
 		RIB_RLOCK(rnh);
-	if (hook_f != NULL)
-		hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
-	rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
-	if (hook_f != NULL)
-		hook_f(rnh, RIB_WALK_HOOK_POST, arg);
+	rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
 	if (wlock)
 		RIB_WUNLOCK(rnh);
 	else


More information about the dev-commits-src-branches mailing list