svn commit: r366390 - in head: sys/conf sys/net sys/net/route sys/netinet sys/netinet6 sys/sys usr.bin/netstat

Alexander V. Chernikov melifaro at FreeBSD.org
Sat Oct 3 10:47:20 UTC 2020


Author: melifaro
Date: Sat Oct  3 10:47:17 2020
New Revision: 366390
URL: https://svnweb.freebsd.org/changeset/base/366390

Log:
  Introduce scalable route multipath.
  
  This change is based on the nexthop objects landed in D24232.
  
  The change introduces the concept of nexthop groups.
  Each group contains the collection of nexthops with their
   relative weights and a dataplane-optimized structure to enable
   efficient nexthop selection.
  
  Similar to the nexthops, nexthop groups are immutable. Dataplane part
   gets compiled during group creation and is basically an array of
   nexthop pointers, compiled w.r.t their weights.
  
  With this change, `rt_nhop` field of `struct rtentry` contains either
   nexthop or nexthop group. They are distinguished by the presence of
   NHF_MULTIPATH flag.
  All dataplane lookup functions return a pointer to the nexthop object,
  leaving nexthop group details inside the routing subsystem.
  
  User-visible changes:
  
  The change is intended to be backward-compatible: all non-mpath operations
   should work as before with ROUTE_MPATH and net.route.multipath=1.
  
  All routes now come with a weight; the default weight is 1, the maximum is 2^24-1.
  
  Current maximum multipath group width is statically set to 64.
   This will become sysctl-tunable in the followup changes.
  
  Using functionality:
  * Recompile kernel with ROUTE_MPATH
  * set net.route.multipath to 1
  
  route add -6 2001:db8::/32 2001:db8::2 -weight 10
  route add -6 2001:db8::/32 2001:db8::3 -weight 20
  
  netstat -6On
  
  Nexthop groups data
  
  Internet6:
  GrpIdx  NhIdx     Weight   Slots                                 Gateway     Netif  Refcnt
  1         ------- ------- ------- --------------------------------------- ---------       1
                13      10       1                             2001:db8::2     vlan2
                14      20       2                             2001:db8::3     vlan2
  
  Next steps:
  * Land outbound hashing for locally-originated routes ( D26523 ).
  * Fix net/bird multipath (net/frr seems to work fine)
  * Add ROUTE_MPATH to GENERIC
  * Set net.route.multipath=1 by default
  
  Tested by:	olivier
  Reviewed by:	glebius
  Relnotes:	yes
  Differential Revision:	https://reviews.freebsd.org/D26449

Added:
  head/sys/net/route/mpath_ctl.c   (contents, props changed)
  head/sys/net/route/nhgrp.c   (contents, props changed)
  head/sys/net/route/nhgrp_ctl.c   (contents, props changed)
  head/sys/net/route/nhgrp_var.h   (contents, props changed)
  head/usr.bin/netstat/nhgrp.c   (contents, props changed)
Modified:
  head/sys/conf/NOTES
  head/sys/conf/files
  head/sys/conf/options
  head/sys/net/radix.c
  head/sys/net/route.c
  head/sys/net/route.h
  head/sys/net/route/nhop.c
  head/sys/net/route/nhop.h
  head/sys/net/route/nhop_ctl.c
  head/sys/net/route/nhop_var.h
  head/sys/net/route/route_ctl.c
  head/sys/net/route/route_ctl.h
  head/sys/net/route/route_helpers.c
  head/sys/net/route/route_var.h
  head/sys/net/rtsock.c
  head/sys/netinet/in.c
  head/sys/netinet/in_fib.c
  head/sys/netinet/in_rmx.c
  head/sys/netinet/ip_output.c
  head/sys/netinet6/in6_fib.c
  head/sys/netinet6/in6_rmx.c
  head/sys/netinet6/nd6.c
  head/sys/sys/socket.h
  head/usr.bin/netstat/Makefile
  head/usr.bin/netstat/common.h
  head/usr.bin/netstat/main.c
  head/usr.bin/netstat/netstat.h
  head/usr.bin/netstat/nhops.c

Modified: head/sys/conf/NOTES
==============================================================================
--- head/sys/conf/NOTES	Sat Oct  3 09:36:33 2020	(r366389)
+++ head/sys/conf/NOTES	Sat Oct  3 10:47:17 2020	(r366390)
@@ -1002,7 +1002,7 @@ device		lagg
 #
 # TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack.
 #
-# RADIX_MPATH provides support for equal-cost multi-path routing.
+# ROUTE_MPATH provides support for multipath routing.
 #
 options 	MROUTING		# Multicast routing
 options 	IPFIREWALL		#firewall
@@ -1023,7 +1023,7 @@ options 	TCPDEBUG
 options 	TCPPCAP
 options 	TCP_BLACKBOX
 options 	TCP_HHOOK
-options 	RADIX_MPATH
+options 	ROUTE_MPATH
 
 # The MBUF_STRESS_TEST option enables options which create
 # various random failures / extreme cases related to mbuf

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Sat Oct  3 09:36:33 2020	(r366389)
+++ head/sys/conf/files	Sat Oct  3 10:47:17 2020	(r366390)
@@ -4143,10 +4143,12 @@ net/debugnet.c			optional inet debugnet
 net/debugnet_inet.c		optional inet debugnet
 net/pfil.c			optional ether | inet
 net/radix.c			standard
-net/radix_mpath.c		standard
 net/raw_cb.c			standard
 net/raw_usrreq.c		standard
 net/route.c			standard
+net/route/mpath_ctl.c		optional route_mpath
+net/route/nhgrp.c		optional route_mpath
+net/route/nhgrp_ctl.c		optional route_mpath
 net/route/nhop.c		standard
 net/route/nhop_ctl.c		standard
 net/route/nhop_utils.c		standard

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options	Sat Oct  3 09:36:33 2020	(r366389)
+++ head/sys/conf/options	Sat Oct  3 10:47:17 2020	(r366390)
@@ -454,6 +454,7 @@ NFSLOCKD
 PCBGROUP		opt_pcbgroup.h
 PF_DEFAULT_TO_DROP	opt_pf.h
 RADIX_MPATH		opt_mpath.h
+ROUTE_MPATH		opt_route.h
 ROUTETABLES		opt_route.h
 RSS			opt_rss.h
 SLIP_IFF_OPTS		opt_slip.h

Modified: head/sys/net/radix.c
==============================================================================
--- head/sys/net/radix.c	Sat Oct  3 09:36:33 2020	(r366389)
+++ head/sys/net/radix.c	Sat Oct  3 10:47:17 2020	(r366390)
@@ -44,10 +44,6 @@
 #include <sys/malloc.h>
 #include <sys/syslog.h>
 #include <net/radix.h>
-#include "opt_mpath.h"
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
 #else /* !_KERNEL */
 #include <stdio.h>
 #include <strings.h>

Modified: head/sys/net/route.c
==============================================================================
--- head/sys/net/route.c	Sat Oct  3 09:36:33 2020	(r366389)
+++ head/sys/net/route.c	Sat Oct  3 10:47:17 2020	(r366390)
@@ -39,7 +39,6 @@
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_mrouting.h"
-#include "opt_mpath.h"
 #include "opt_route.h"
 
 #include <sys/param.h>

Modified: head/sys/net/route.h
==============================================================================
--- head/sys/net/route.h	Sat Oct  3 09:36:33 2020	(r366389)
+++ head/sys/net/route.h	Sat Oct  3 10:47:17 2020	(r366390)
@@ -178,6 +178,7 @@ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce 
  */
 
 /* Consumer-visible nexthop info flags */
+#define	NHF_MULTIPATH		0x0008	/* Nexhop is a nexthop group */
 #define	NHF_REJECT		0x0010	/* RTF_REJECT */
 #define	NHF_BLACKHOLE		0x0020	/* RTF_BLACKHOLE */
 #define	NHF_REDIRECT		0x0040	/* RTF_DYNAMIC|RTF_MODIFIED */
@@ -208,6 +209,10 @@ struct rtstat {
 	uint64_t rts_wildcard;		/* lookups satisfied by a wildcard */
 	uint64_t rts_nh_idx_alloc_failure;	/* nexthop index alloc failure*/
 	uint64_t rts_nh_alloc_failure;	/* nexthop allocation failure*/
+	uint64_t rts_add_failure;	/* # of route addition failures */
+	uint64_t rts_add_retry;		/* # of route addition retries */
+	uint64_t rts_del_failure;	/* # of route deletion failure */
+	uint64_t rts_del_retry;		/* # of route deletion retries */
 };
 
 /*

Added: head/sys/net/route/mpath_ctl.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/net/route/mpath_ctl.c	Sat Oct  3 10:47:17 2020	(r366390)
@@ -0,0 +1,165 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+
+/*
+ * This file contains the supporting functions for adding/deleting/updating
+ *  multipath routes to the routing table.
+ */
+
+SYSCTL_DECL(_net_route);
+
+/*
+ * Tries to add @rnd_add nhop to the existing set of nhops (@nh_orig) for the
+ * prefix specified by @rt.
+ *
+ * Returns 0 and consumes rt / rnd_add nhop references. @rc gets populated
+ *   with the operation result.
+ * Otherwise errno is returned.
+ *
+ * caller responsibility is to unlock/free rt and
+ *  rt->rt_nhop.
+ */
+int
+add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
+    struct rtentry *rt, struct route_nhop_data *rnd_add,
+    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
+{
+	RIB_RLOCK_TRACKER;
+	struct route_nhop_data rnd_new;
+	int error = 0;
+
+	/*
+	 * It is possible that multiple rtsock speakers will try to update
+	 * the same route simultaneously. Reduce the chance of failing the
+	 * request by retrying the cycle multiple times.
+	 */
+	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
+		error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add,
+		    &rnd_new);
+		if (error != 0) {
+			if (error != EAGAIN)
+				break;
+
+			/*
+			 * Group creation failed, most probably because
+			 * @rnd_orig data got scheduled for deletion.
+			 * Refresh @rnd_orig data and retry.
+			 */
+			RIB_RLOCK(rnh);
+			lookup_prefix(rnh, info, rnd_orig);
+			RIB_RUNLOCK(rnh);
+			continue;
+		}
+
+		error = change_route_conditional(rnh, rt, info, rnd_orig,
+		    &rnd_new, rc);
+		if (error != EAGAIN)
+			break;
+		RTSTAT_INC(rts_add_retry);
+	}
+
+	return (error);
+}
+
+struct rt_match_info {
+	struct rt_addrinfo *info;
+	struct rtentry *rt;
+};
+
+static bool
+gw_filter_func(const struct nhop_object *nh, void *_data)
+{
+	struct rt_match_info *ri = (struct rt_match_info *)_data;
+
+	return (check_info_match_nhop(ri->info, ri->rt, nh) == 0);
+}
+
+/*
+ * Tries to delete matching paths from @nhg.
+ * Returns 0 on success and updates operation result in @rc.
+ */
+int
+del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
+    struct rtentry *rt, struct nhgrp_object *nhg,
+    struct rib_cmd_info *rc)
+{
+	struct route_nhop_data rnd;
+	struct rt_match_info ri = { .info = info, .rt = rt };
+	int error;
+
+	RIB_WLOCK_ASSERT(rh);
+
+	/*
+	 * Require gateway to delete multipath routes, to forbid
+	 *  deleting all paths at once.
+	 * If the filter function is provided, skip gateway check to
+	 *  allow rib_walk_del() delete routes for any criteria based
+	 *  on provided callback.
+	 */
+	if ((info->rti_info[RTAX_GATEWAY] == NULL) && (info->rti_filter == NULL))
+		return (ESRCH);
+
+	error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func, (void *)&ri,
+	    &rnd);
+	if (error == 0)
+		error = change_route_nhop(rh, rt, info, &rnd, rc);
+	return (error);
+}
+

Added: head/sys/net/route/nhgrp.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/net/route/nhgrp.c	Sat Oct  3 10:47:17 2020	(r366390)
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_dl.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains data structures management logic for the nexthop
+ * groups ("nhgrp") route subsystem.
+ *
+ * Nexthop groups are used to store multiple routes available for the specific
+ *  prefix. Nexthop groups are immutable and can be shared across multiple
+ *  prefixes.
+ *
+ * Each group consists of a control plane part and a dataplane part.
+ * Control plane is basically a collection of nexthop objects with
+ *  weights and refcount.
+ *
+ * Datapath consists of an array of nexthop pointers, compiled from control
+ *  plane data to support O(1) nexthop selection.
+ *
+ * For example, consider the following group:
+ *  [(nh1, weight=100), (nh2, weight=200)]
+ * It will compile to the following array:
+ *  [nh1, nh2, nh2]
+ *
+ */
+
+static void consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets,
+    uint32_t new_idx_items);
+
+static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b);
+static unsigned int hash_nhgrp(const struct nhgrp_priv *obj);
+
+static unsigned
+djb_hash(const unsigned char *h, const int len)
+{
+	unsigned int result = 0;
+	int i;
+
+	for (i = 0; i < len; i++)
+		result = 33 * result ^ h[i];
+
+	return (result);
+}
+
+static int
+cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b)
+{
+
+	/*
+	 * In case of consistent hashing, there can be multiple nexthop groups
+	 * with the same "control plane" list of nexthops with weights and a
+	 * different set of "data plane" nexthops.
+	 * For now, ignore the data plane and focus on the control plane list.
+	 */
+	if (a->nhg_nh_count != b->nhg_nh_count)
+		return (0);
+	return !memcmp(a->nhg_nh_weights, b->nhg_nh_weights,
+	    sizeof(struct weightened_nhop) * a->nhg_nh_count);
+}
+
+/*
+ * Hash callback: calculate hash of an object
+ */
+static unsigned int
+hash_nhgrp(const struct nhgrp_priv *obj)
+{
+	const unsigned char *key;
+
+	key = (const unsigned char *)obj->nhg_nh_weights;
+
+	return (djb_hash(key, sizeof(struct weightened_nhop) * obj->nhg_nh_count));
+}
+
+/*
+ * Returns object referenced and unlocked
+ */
+struct nhgrp_priv *
+find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key)
+{
+	struct nhgrp_priv *priv_ret;
+
+	NHOPS_RLOCK(ctl);
+	CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret);
+	if (priv_ret != NULL) {
+		if (refcount_acquire_if_not_zero(&priv_ret->nhg_refcount) == 0) {
+			/* refcount is 0 -> group is being deleted */
+			priv_ret = NULL;
+		}
+	}
+	NHOPS_RUNLOCK(ctl);
+
+	return (priv_ret);
+}
+
+int
+link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv)
+{
+	uint16_t idx;
+	uint32_t new_num_buckets, new_num_items;
+
+	NHOPS_WLOCK(ctl);
+	/* Check if we need to resize hash and index */
+	new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head);
+	new_num_items = bitmask_get_resize_items(&ctl->gr_idx_head);
+
+	if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) {
+		NHOPS_WUNLOCK(ctl);
+		DPRINTF("Unable to allocate mpath index");
+		consider_resize(ctl, new_num_buckets, new_num_items);
+		return (0);
+	}
+
+	grp_priv->nhg_idx = idx;
+	grp_priv->nh_control = ctl;
+	CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv);
+
+	NHOPS_WUNLOCK(ctl);
+
+	consider_resize(ctl, new_num_buckets, new_num_items);
+
+	return (1);
+}
+
+/*
+ * Removes the group matching @key from the group hash and frees its index.
+ * Returns the unlinked group, or NULL if no matching group was found.
+ */
+struct nhgrp_priv *
+unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key)
+{
+	struct nhgrp_priv *nhg_priv_ret;
+	int idx;
+
+	NHOPS_WLOCK(ctl);
+	CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret);
+	if (nhg_priv_ret == NULL) {
+		DPRINTF("Unable to find nhop group!");
+		NHOPS_WUNLOCK(ctl);
+		return (NULL);
+	}
+
+	idx = nhg_priv_ret->nhg_idx;
+	(void)bitmask_free_idx(&ctl->gr_idx_head, idx);
+	nhg_priv_ret->nhg_idx = 0;
+	nhg_priv_ret->nh_control = NULL;
+	NHOPS_WUNLOCK(ctl);
+	return (nhg_priv_ret);
+}
+
+/*
+ * Resizes the group hash and/or the group index bitmask when a non-zero
+ * new size is passed in; a failed allocation skips the respective resize.
+ */
+__noinline static void
+consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t new_idx_items)
+{
+	void *nh_ptr, *nh_idx_ptr;
+	void *old_idx_ptr;
+	size_t alloc_size;
+
+	nh_ptr = NULL;
+	if (new_nh_buckets != 0) {
+		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
+		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	nh_idx_ptr = NULL;
+	if (new_idx_items != 0) {
+		alloc_size = bitmask_get_size(new_idx_items);
+		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
+	}
+
+	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
+		/* Either resize is not required or allocations have failed. */
+		return;
+	}
+
+	DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]",
+	    nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items);
+	old_idx_ptr = NULL;
+	NHOPS_WLOCK(ctl);
+	if (nh_ptr != NULL) {
+		CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets);
+	}
+	if (nh_idx_ptr != NULL) {
+		/* Copy and swap must target the same (group) index bitmask. */
+		/* NOTE(review): nh_idx_ptr appears leaked if no swap occurs. */
+		if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items))
+			bitmask_swap(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items, &old_idx_ptr);
+	}
+	NHOPS_WUNLOCK(ctl);
+
+	if (nh_ptr != NULL)
+		free(nh_ptr, M_NHOP);
+	if (old_idx_ptr != NULL)
+		free(old_idx_ptr, M_NHOP);
+}
+
+/*
+ * Function allocating the necessary group data structures.
+ */
+bool
+nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags)
+{
+	size_t alloc_size;
+	uint32_t num_buckets, num_items;
+	void *cht_ptr, *mask_ptr;
+
+	malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO;
+
+	num_buckets = 8;
+	alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
+	cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags);
+
+	if (cht_ptr == NULL) {
+		DPRINTF("mpath init failed");
+		return (false);
+	}
+
+	/*
+	 * Allocate nexthop index bitmask.
+	 */
+	num_items = 128;
+	mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags);
+	if (mask_ptr == NULL) {
+		DPRINTF("mpath bitmask init failed");
+		free(cht_ptr, M_NHOP);
+		return (false);
+	}
+
+	NHOPS_WLOCK(ctl);
+
+	if (ctl->gr_head.hash_size == 0) {
+		/* Init hash and bitmask */
+		CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets);
+		bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items);
+		NHOPS_WUNLOCK(ctl);
+	} else {
+		/* Other thread has already initialized hash/bitmask */
+		NHOPS_WUNLOCK(ctl);
+		free(cht_ptr, M_NHOP);
+		free(mask_ptr, M_NHOP);
+	}
+
+	DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum,
+	    ctl->rh->rib_family);
+
+	return (true);
+}
+
+int
+nhgrp_ctl_init(struct nh_control *ctl)
+{
+
+	/*
+	 * By default, do not allocate datastructures as multipath
+	 * routes will not be necessarily used.
+	 */
+	CHT_SLIST_INIT(&ctl->gr_head, NULL, 0);
+	bitmask_init(&ctl->gr_idx_head, NULL, 0);
+	return (0);
+}
+
+void
+nhgrp_ctl_free(struct nh_control *ctl)
+{
+
+	if (ctl->gr_head.ptr != NULL)
+		free(ctl->gr_head.ptr, M_NHOP);
+	if (ctl->gr_idx_head.idx != NULL)
+		free(ctl->gr_idx_head.idx, M_NHOP);
+}
+
+void
+nhgrp_ctl_unlink_all(struct nh_control *ctl)
+{
+	struct nhgrp_priv *nhg_priv;
+
+	NHOPS_WLOCK_ASSERT(ctl);
+
+	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
+		DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx);
+		refcount_release(&nhg_priv->nhg_linked);
+	} CHT_SLIST_FOREACH_END;
+}
+

Added: head/sys/net/route/nhgrp_ctl.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/net/route/nhgrp_ctl.c	Sat Oct  3 10:47:17 2020	(r366390)
@@ -0,0 +1,788 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#define RTDEBUG
+#include "opt_inet.h"
+#include "opt_route.h"
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/epoch.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/route/route_ctl.h>
+#include <net/route/route_var.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_fib.h>
+
+#include <net/route/nhop_utils.h>
+#include <net/route/nhop.h>
+#include <net/route/nhop_var.h>
+#include <net/route/nhgrp_var.h>
+
+/*
+ * This file contains the supporting functions for creating multipath groups
+ *  and compiling their dataplane parts.
+ */
+
+/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to work */
+_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
+    "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
+/* Offset and size of flags field has to be the same for nhop/nhop groups */
+CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object, nhg_flags);
+/* Cap multipath to 64, as the larger values would break rib_cmd_info bmasks */
+CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
+
+static int wn_cmp(const void *a, const void *b);
+static void sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops);
+
+static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
+    struct weightened_nhop *wn, int num_nhops, int *perror);
+static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
+static void destroy_nhgrp_epoch(epoch_context_t ctx);
+static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
+
+static int
+wn_cmp(const void *a, const void *b)
+{
+	const struct weightened_nhop *wa = a;
+	const struct weightened_nhop *wb = b;
+
+	if (wa->weight > wb->weight)
+		return (1);
+	else if (wa->weight < wb->weight)
+		return (-1);
+
+	/* Compare nexthops by pointer */
+	if (wa->nh > wb->nh)
+		return (1);
+	else if (wa->nh < wb->nh)
+		return (-1);
+	else
+		return (0);
+}
+
+/*
+ * Perform in-place sorting for array of nexthops in @wn.
+ *
+ * To avoid nh groups duplication, nexthops/weights in the
+ *   @wn need to be ordered deterministically.
+ * As this sorting is needed only for the control plane functionality,
+ *  there are no specific external requirements.
+ *
+ * Sort by weight first, to ease calculation of the slot sizes.
+ */
+static void
+sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
+{
+
+	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp);
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights in the common use case where weights are "easily"
+ * comparable.
+ * Assumes @wn is sorted by weight ascending and each weight is > 0.
+ * Returns number of slots or 0 if precise calculation failed. An empty
+ * list or a zero leading weight is reported as such a failure.
+ *
+ * Some examples:
+ * note: (i, X) pair means (nhop=i, weight=X):
+ * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
+ * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
+ * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3]
+ */
+static uint32_t
+calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t num_items)
+{
+	uint32_t i, last, xmin;
+	uint64_t total = 0;
+
+	/* Do not read wn[0] of an empty list. */
+	if (num_items == 0)
+		return (0);
+
+	last = 0;
+	xmin = wn[0].weight;
+	for (i = 0; i < num_items; i++) {
+		total += wn[i].weight;
+		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
+			xmin = wn[i].weight - last;
+		last = wn[i].weight;
+	}
+	/*
+	 * xmin is the minimum unit of desired capacity.
+	 * It stays 0 only when wn[0].weight is 0; report failure instead
+	 * of dividing by zero below.
+	 */
+	if (xmin == 0)
+		return (0);
+	if ((total % xmin) != 0)
+		return (0);
+	for (i = 0; i < num_items; i++) {
+		if ((wn[i].weight % xmin) != 0)
+			return (0);
+	}
+
+	return ((uint32_t)(total / xmin));
+}
+
+/*
+ * Calculate minimum number of slots required to fit the existing
+ * set of weights while maintaining weight coefficients.
+ *
+ * Assume @wn is sorted by weight ascending and each weight is > 0.
+ *
+ * Tries to find simple precise solution first and falls back to
+ *  RIB_MAX_MPATH_WIDTH in case of any failure.
+ */
+static uint32_t
+calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
+{
+	uint32_t v;
+
+	v = calc_min_mpath_slots_fast(wn, num_items);
+	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
+		v = RIB_MAX_MPATH_WIDTH;
+
+	return (v);
+}
+
+/*
+ * Nexthop group data consists of
+ * 1) dataplane part, with nhgrp_object as a header followed by an
+ *   arbitrary number of nexthop pointers.
+ * 2) control plane part, with nhgrp_priv as a header, followed by
+ *   an arbitrary number of 'struct weightened_nhop' objects.
+ *
+ * Given nexthop groups are (mostly) immutable, allocate all data
+ * in one go.
+ *
+ */
+__noinline static size_t
+get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
+{
+	size_t sz;
+
+	sz = sizeof(struct nhgrp_object);
+	sz += nhg_size * sizeof(struct nhop_object *);
+	sz += sizeof(struct nhgrp_priv);
+	sz += num_nhops * sizeof(struct weightened_nhop);
+	return (sz);
+}
+
+/*
+ * Compile actual list of nexthops to be used by datapath from
+ *  the nexthop group @dst.
+ *
+ * For example, compiling control plane list of 2 nexthops
+ *  [(200, A), (100, B)] would result in the datapath array
+ *  [A, A, B]
+ */
+static void
+compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop *x,
+    uint32_t num_slots)
+{
+	struct nhgrp_object *dst;
+	int i, slot_idx, remaining_slots;
+	uint64_t remaining_sum, nh_weight, nh_slots;
+
+	slot_idx  = 0;
+	dst = dst_priv->nhg;
+	/* Calculate sum of all weights */
+	remaining_sum = 0;
+	for (i = 0; i < dst_priv->nhg_nh_count; i++)
+		remaining_sum += x[i].weight;
+	remaining_slots = num_slots;
+	DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
+	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
+		/* Calculate number of slots for the current nexthop */
+		if (remaining_sum > 0) {
+			nh_weight = (uint64_t)x[i].weight;
+			nh_slots = (nh_weight * remaining_slots / remaining_sum);
+		} else
+			nh_slots = 0;
+
+		remaining_sum -= x[i].weight;
+		remaining_slots -= nh_slots;
+
+		DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
+		    (uint32_t)remaining_sum, remaining_slots,
+		    (int)nh_slots, slot_idx);
+
+		KASSERT((slot_idx + nh_slots <= num_slots),
+		    ("index overflow during nhg compilation"));
+		while (nh_slots-- > 0)
+			dst->nhops[slot_idx++] = x[i].nh;
+	}
+}
+
+/*
+ * Allocates new nexthop group for the list of weightened nexthops.
+ * Assume sorted list.
+ * Does NOT reference any nexthops in the group.
+ * Returns group with refcount=1 or NULL.
+ */
+static struct nhgrp_priv *
+alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
+{
+	uint32_t nhgrp_size;
+	int flags = M_NOWAIT;
+	struct nhgrp_object *nhg;
+	struct nhgrp_priv *nhg_priv;
+
+	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
+	if (nhgrp_size == 0) {
+		/* Zero weights, abort */
+		return (NULL);
+	}
+
+	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
+	nhg = malloc(sz, M_NHOP, flags | M_ZERO);
+	if (nhg == NULL) {
+		return (NULL);
+	}
+
+	/* Has to be the first to make NHGRP_PRIV() work */
+	nhg->nhg_size = nhgrp_size;
+	DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size);
+	nhg->nhg_flags = MPF_MULTIPATH;
+
+	nhg_priv = NHGRP_PRIV(nhg);
+	nhg_priv->nhg_nh_count = num_nhops;
+	refcount_init(&nhg_priv->nhg_refcount, 1);
+
+	/* Please see nhgrp_free() comments on the initial value */
+	refcount_init(&nhg_priv->nhg_linked, 2);
+
+	nhg_priv->nhg = nhg;
+	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
+	  num_nhops * sizeof(struct weightened_nhop));
+
+	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
+
+	return (nhg_priv);
+}
+
+/*
+ * Drops one reference to @nhg. Once refcount_release() reports that
+ * nothing more is to be kept, the group is unlinked from its nh_control
+ * (unless it has already been marked unlinked) and its destruction is
+ * scheduled via epoch_call(). If the unlink attempt fails, the group is
+ * intentionally not reclaimed.
+ */
+void
+nhgrp_free(struct nhgrp_object *nhg)
+{
+	struct nhgrp_priv *nhg_priv;
+	struct nh_control *ctl;
+	struct epoch_tracker et;
+
+	nhg_priv = NHGRP_PRIV(nhg);
+
+	if (!refcount_release(&nhg_priv->nhg_refcount))
+		return;
+
+	/*
+	 * Group objects don't have an explicit lock attached to them.
+	 * As groups are reclaimed based on reference count, it is possible
+	 * that some groups will persist after the vnet destruction callback
+	 * is called. Given that, handle the scenario with nhgrp_free() being
+	 * called either after or simultaneously with nhgrp_ctl_unlink_all()
+	 * by using another reference counter: nhg_linked.
+	 *
+	 * There are only 2 places where nhg_linked can be decreased:
+	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
+	 * nhg_linked can never be increased.
+	 *
+	 * Hence, use initial value of 2 to make use of
+	 *  refcount_release_if_not_last().
+	 *
+	 * There can be two scenarios when calling this function:
+	 *
+	 * 1) nhg_linked value is 2. This means that either
+	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
+	 *  but we are guaranteed that nh_control won't be freed in
+	 *  this epoch. Hence, the group can be safely unlinked.
+	 *
+	 * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all()
+	 *  has been called and nhgrp unlink can be skipped.
+	 */
+
+	NET_EPOCH_ENTER(et);
+	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
+		ctl = nhg_priv->nh_control;
+		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
+			/* Do not try to reclaim */
+			DPRINTF("Failed to unlink nexhop group %p", nhg_priv);
+			NET_EPOCH_EXIT(et);
+			return;
+		}
+	}
+	NET_EPOCH_EXIT(et);
+
+	epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
+	    &nhg_priv->nhg_epoch_ctx);
+}
+
+/*
+ * Destroys all local resources belonging to @nhg_priv.
+ */
+__noinline static void
+destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
+{
+
+	free(nhg_priv->nhg, M_NHOP);
+}
+
+__noinline static void
+destroy_nhgrp(struct nhgrp_priv *nhg_priv)
+{
+
+	KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
+
+	DPRINTF("DEL MPATH %p", nhg_priv);
+
+	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
+
+	free_nhgrp_nhops(nhg_priv);
+
+	destroy_nhgrp_int(nhg_priv);
+}
+
+/*
+ * Epoch callback indicating group is safe to destroy
+ */
+static void
+destroy_nhgrp_epoch(epoch_context_t ctx)
+{
+	struct nhgrp_priv *nhg_priv;
+
+	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
+
+	destroy_nhgrp(nhg_priv);
+}
+
+static bool
+ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
+{
+
+	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list