svn commit: r366390 - in head: sys/conf sys/net sys/net/route sys/netinet sys/netinet6 sys/sys usr.bin/netstat

Mateusz Guzik mjguzik at gmail.com
Sat Oct 3 12:30:04 UTC 2020


This gives me:

/tank/users/mjg/src/freebsd/sys/net/route/route_ctl.c:94:13: warning:
unused function 'rib_can_multipath' [-Wunused-function]
static bool rib_can_multipath(struct rib_head *rh);
            ^
/tank/users/mjg/src/freebsd/sys/net/rtsock.c:851:1: warning: unused
function 'save_del_notification' [-Wunused-function]
save_del_notification(struct rib_cmd_info *rc, void *_cbdata)
^
/tank/users/mjg/src/freebsd/sys/net/rtsock.c:860:1: warning: unused
function 'save_add_notification' [-Wunused-function]
save_add_notification(struct rib_cmd_info *rc, void *_cbdata)
^


On 10/3/20, Alexander V. Chernikov <melifaro at freebsd.org> wrote:
> Author: melifaro
> Date: Sat Oct  3 10:47:17 2020
> New Revision: 366390
> URL: https://svnweb.freebsd.org/changeset/base/366390
>
> Log:
>   Introduce scalable route multipath.
>
>   This change is based on the nexthop objects landed in D24232.
>
>   The change introduces the concept of nexthop groups.
>   Each group contains the collection of nexthops with their
>    relative weights and a dataplane-optimized structure to enable
>    efficient nexthop selection.
>
>   Similar to the nexthops, nexthop groups are immutable. Dataplane part
>    gets compiled during group creation and is basically an array of
>    nexthop pointers, compiled w.r.t their weights.
>
>   With this change, `rt_nhop` field of `struct rtentry` contains either
>    nexthop or nexthop group. They are distinguished by the presence of
>    the NHF_MULTIPATH flag.
>   All dataplane lookup functions return a pointer to the nexthop object,
>   leaving nexthop group details inside the routing subsystem.
>
>   User-visible changes:
>
>   The change is intended to be backward-compatible: all non-mpath
> operations
>    should work as before with ROUTE_MPATH and net.route.multipath=1.
>
>   All routes now come with a weight: default weight is 1, maximum is 2^24-1.
>
>   Current maximum multipath group width is statically set to 64.
>    This will become sysctl-tunable in the followup changes.
>
>   Using functionality:
>   * Recompile kernel with ROUTE_MPATH
>   * set net.route.multipath to 1
>
>   route add -6 2001:db8::/32 2001:db8::2 -weight 10
>   route add -6 2001:db8::/32 2001:db8::3 -weight 20
>
>   netstat -6On
>
>   Nexthop groups data
>
>   Internet6:
>   GrpIdx  NhIdx     Weight   Slots                                 Gateway
>   Netif  Refcnt
>   1         ------- ------- ------- ---------------------------------------
> ---------       1
>                 13      10       1                             2001:db8::2
>   vlan2
>                 14      20       2                             2001:db8::3
>   vlan2
>
>   Next steps:
>   * Land outbound hashing for locally-originated routes ( D26523 ).
>   * Fix net/bird multipath (net/frr seems to work fine)
>   * Add ROUTE_MPATH to GENERIC
>   * Set net.route.multipath=1 by default
>
>   Tested by:	olivier
>   Reviewed by:	glebius
>   Relnotes:	yes
>   Differential Revision:	https://reviews.freebsd.org/D26449
>
> Added:
>   head/sys/net/route/mpath_ctl.c   (contents, props changed)
>   head/sys/net/route/nhgrp.c   (contents, props changed)
>   head/sys/net/route/nhgrp_ctl.c   (contents, props changed)
>   head/sys/net/route/nhgrp_var.h   (contents, props changed)
>   head/usr.bin/netstat/nhgrp.c   (contents, props changed)
> Modified:
>   head/sys/conf/NOTES
>   head/sys/conf/files
>   head/sys/conf/options
>   head/sys/net/radix.c
>   head/sys/net/route.c
>   head/sys/net/route.h
>   head/sys/net/route/nhop.c
>   head/sys/net/route/nhop.h
>   head/sys/net/route/nhop_ctl.c
>   head/sys/net/route/nhop_var.h
>   head/sys/net/route/route_ctl.c
>   head/sys/net/route/route_ctl.h
>   head/sys/net/route/route_helpers.c
>   head/sys/net/route/route_var.h
>   head/sys/net/rtsock.c
>   head/sys/netinet/in.c
>   head/sys/netinet/in_fib.c
>   head/sys/netinet/in_rmx.c
>   head/sys/netinet/ip_output.c
>   head/sys/netinet6/in6_fib.c
>   head/sys/netinet6/in6_rmx.c
>   head/sys/netinet6/nd6.c
>   head/sys/sys/socket.h
>   head/usr.bin/netstat/Makefile
>   head/usr.bin/netstat/common.h
>   head/usr.bin/netstat/main.c
>   head/usr.bin/netstat/netstat.h
>   head/usr.bin/netstat/nhops.c
>
> Modified: head/sys/conf/NOTES
> ==============================================================================
> --- head/sys/conf/NOTES	Sat Oct  3 09:36:33 2020	(r366389)
> +++ head/sys/conf/NOTES	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -1002,7 +1002,7 @@ device		lagg
>  #
>  # TCP_HHOOK enables the hhook(9) framework hooks for the TCP stack.
>  #
> -# RADIX_MPATH provides support for equal-cost multi-path routing.
> +# ROUTE_MPATH provides support for multipath routing.
>  #
>  options 	MROUTING		# Multicast routing
>  options 	IPFIREWALL		#firewall
> @@ -1023,7 +1023,7 @@ options 	TCPDEBUG
>  options 	TCPPCAP
>  options 	TCP_BLACKBOX
>  options 	TCP_HHOOK
> -options 	RADIX_MPATH
> +options 	ROUTE_MPATH
>
>  # The MBUF_STRESS_TEST option enables options which create
>  # various random failures / extreme cases related to mbuf
>
> Modified: head/sys/conf/files
> ==============================================================================
> --- head/sys/conf/files	Sat Oct  3 09:36:33 2020	(r366389)
> +++ head/sys/conf/files	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -4143,10 +4143,12 @@ net/debugnet.c			optional inet debugnet
>  net/debugnet_inet.c		optional inet debugnet
>  net/pfil.c			optional ether | inet
>  net/radix.c			standard
> -net/radix_mpath.c		standard
>  net/raw_cb.c			standard
>  net/raw_usrreq.c		standard
>  net/route.c			standard
> +net/route/mpath_ctl.c		optional route_mpath
> +net/route/nhgrp.c		optional route_mpath
> +net/route/nhgrp_ctl.c		optional route_mpath
>  net/route/nhop.c		standard
>  net/route/nhop_ctl.c		standard
>  net/route/nhop_utils.c		standard
>
> Modified: head/sys/conf/options
> ==============================================================================
> --- head/sys/conf/options	Sat Oct  3 09:36:33 2020	(r366389)
> +++ head/sys/conf/options	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -454,6 +454,7 @@ NFSLOCKD
>  PCBGROUP		opt_pcbgroup.h
>  PF_DEFAULT_TO_DROP	opt_pf.h
>  RADIX_MPATH		opt_mpath.h
> +ROUTE_MPATH		opt_route.h
>  ROUTETABLES		opt_route.h
>  RSS			opt_rss.h
>  SLIP_IFF_OPTS		opt_slip.h
>
> Modified: head/sys/net/radix.c
> ==============================================================================
> --- head/sys/net/radix.c	Sat Oct  3 09:36:33 2020	(r366389)
> +++ head/sys/net/radix.c	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -44,10 +44,6 @@
>  #include <sys/malloc.h>
>  #include <sys/syslog.h>
>  #include <net/radix.h>
> -#include "opt_mpath.h"
> -#ifdef RADIX_MPATH
> -#include <net/radix_mpath.h>
> -#endif
>  #else /* !_KERNEL */
>  #include <stdio.h>
>  #include <strings.h>
>
> Modified: head/sys/net/route.c
> ==============================================================================
> --- head/sys/net/route.c	Sat Oct  3 09:36:33 2020	(r366389)
> +++ head/sys/net/route.c	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -39,7 +39,6 @@
>  #include "opt_inet.h"
>  #include "opt_inet6.h"
>  #include "opt_mrouting.h"
> -#include "opt_mpath.h"
>  #include "opt_route.h"
>
>  #include <sys/param.h>
>
> Modified: head/sys/net/route.h
> ==============================================================================
> --- head/sys/net/route.h	Sat Oct  3 09:36:33 2020	(r366389)
> +++ head/sys/net/route.h	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -178,6 +178,7 @@ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce
>   */
>
>  /* Consumer-visible nexthop info flags */
> +#define	NHF_MULTIPATH		0x0008	/* Nexhop is a nexthop group */
>  #define	NHF_REJECT		0x0010	/* RTF_REJECT */
>  #define	NHF_BLACKHOLE		0x0020	/* RTF_BLACKHOLE */
>  #define	NHF_REDIRECT		0x0040	/* RTF_DYNAMIC|RTF_MODIFIED */
> @@ -208,6 +209,10 @@ struct rtstat {
>  	uint64_t rts_wildcard;		/* lookups satisfied by a wildcard */
>  	uint64_t rts_nh_idx_alloc_failure;	/* nexthop index alloc failure*/
>  	uint64_t rts_nh_alloc_failure;	/* nexthop allocation failure*/
> +	uint64_t rts_add_failure;	/* # of route addition failures */
> +	uint64_t rts_add_retry;		/* # of route addition retries */
> +	uint64_t rts_del_failure;	/* # of route deletion failure */
> +	uint64_t rts_del_retry;		/* # of route deletion retries */
>  };
>
>  /*
>
> Added: head/sys/net/route/mpath_ctl.c
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sys/net/route/mpath_ctl.c	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -0,0 +1,165 @@
> +/*-
> + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
> + *
> + * Copyright (c) 2020 Alexander V. Chernikov
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
> PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
> STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
> WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * $FreeBSD$
> + */
> +
> +#include "opt_inet.h"
> +#include "opt_route.h"
> +
> +#include <sys/cdefs.h>
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/lock.h>
> +#include <sys/rmlock.h>
> +#include <sys/rwlock.h>
> +#include <sys/malloc.h>
> +#include <sys/mbuf.h>
> +#include <sys/socket.h>
> +#include <sys/sysctl.h>
> +#include <sys/kernel.h>
> +
> +#include <net/if.h>
> +#include <net/if_var.h>
> +#include <net/if_dl.h>
> +#include <net/route.h>
> +#include <net/route/route_ctl.h>
> +#include <net/route/route_var.h>
> +#include <net/vnet.h>
> +
> +#include <netinet/in.h>
> +#include <netinet/in_var.h>
> +#include <netinet/in_fib.h>
> +
> +#include <net/route/nhop_utils.h>
> +#include <net/route/nhop.h>
> +#include <net/route/nhop_var.h>
> +
> +/*
> + * This file contains the supporting functions for
> adding/deleting/updating
> + *  multipath routes to the routing table.
> + */
> +
> +SYSCTL_DECL(_net_route);
> +
> +/*
> + * Tries to add @rnd_add nhop to the existing set of nhops (@rnd_orig) for
> + * the prefix specified by @rt.
> + *
> + * Returns 0 and consumes rt / rnd_add nhop references. @rc gets populated
> + *   with the operation result.
> + * Otherwise errno is returned.
> + *
> + * It is the caller's responsibility to unlock/free rt and
> + *  rt->rt_nhop.
> + */
> +int
> +add_route_mpath(struct rib_head *rnh, struct rt_addrinfo *info,
> +    struct rtentry *rt, struct route_nhop_data *rnd_add,
> +    struct route_nhop_data *rnd_orig, struct rib_cmd_info *rc)
> +{
> +	RIB_RLOCK_TRACKER;
> +	struct route_nhop_data rnd_new;
> +	int error = 0;
> +
> +	/*
> +	 * It is possible that multiple rtsock speakers will try to update
> +	 * the same route simultaneously. Reduce the chance of failing the
> +	 * request by retrying the cycle multiple times.
> +	 */
> +	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
> +		error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add,
> +		    &rnd_new);
> +		if (error != 0) {
> +			if (error != EAGAIN)
> +				break;
> +
> +			/*
> +			 * Group creation failed, most probably because
> +			 * @rnd_orig data got scheduled for deletion.
> +			 * Refresh @rnd_orig data and retry.
> +			 */
> +			RIB_RLOCK(rnh);
> +			lookup_prefix(rnh, info, rnd_orig);
> +			RIB_RUNLOCK(rnh);
> +			continue;
> +		}
> +
> +		error = change_route_conditional(rnh, rt, info, rnd_orig,
> +		    &rnd_new, rc);
> +		if (error != EAGAIN)
> +			break;
> +		RTSTAT_INC(rts_add_retry);
> +	}
> +
> +	return (error);
> +}
> +
> +struct rt_match_info {
> +	struct rt_addrinfo *info;
> +	struct rtentry *rt;
> +};
> +
> +static bool
> +gw_filter_func(const struct nhop_object *nh, void *_data)
> +{
> +	struct rt_match_info *ri = (struct rt_match_info *)_data;
> +
> +	return (check_info_match_nhop(ri->info, ri->rt, nh) == 0);
> +}
> +
> +/*
> + * Tries to delete matching paths from @nhg.
> + * Returns 0 on success and updates operation result in @rc.
> + */
> +int
> +del_route_mpath(struct rib_head *rh, struct rt_addrinfo *info,
> +    struct rtentry *rt, struct nhgrp_object *nhg,
> +    struct rib_cmd_info *rc)
> +{
> +	struct route_nhop_data rnd;
> +	struct rt_match_info ri = { .info = info, .rt = rt };
> +	int error;
> +
> +	RIB_WLOCK_ASSERT(rh);
> +
> +	/*
> +	 * Require gateway to delete multipath routes, to forbid
> +	 *  deleting all paths at once.
> +	 * If the filter function is provided, skip the gateway check to
> +	 *  allow rib_walk_del() to delete routes for any criteria based
> +	 *  on the provided callback.
> +	 */
> +	if ((info->rti_info[RTAX_GATEWAY] == NULL) && (info->rti_filter == NULL))
> +		return (ESRCH);
> +
> +	error = nhgrp_get_filtered_group(rh, nhg, gw_filter_func, (void *)&ri,
> +	    &rnd);
> +	if (error == 0)
> +		error = change_route_nhop(rh, rt, info, &rnd, rc);
> +	return (error);
> +}
> +
>
> Added: head/sys/net/route/nhgrp.c
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sys/net/route/nhgrp.c	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -0,0 +1,344 @@
> +/*-
> + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
> + *
> + * Copyright (c) 2020 Alexander V. Chernikov
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
> PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
> STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
> WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * $FreeBSD$
> + */
> +
> +#include "opt_inet.h"
> +#include "opt_route.h"
> +
> +#include <sys/cdefs.h>
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/lock.h>
> +#include <sys/rmlock.h>
> +#include <sys/rwlock.h>
> +#include <sys/malloc.h>
> +#include <sys/mbuf.h>
> +#include <sys/refcount.h>
> +#include <sys/socket.h>
> +#include <sys/sysctl.h>
> +#include <sys/kernel.h>
> +
> +#include <net/if.h>
> +#include <net/if_var.h>
> +#include <net/if_dl.h>
> +#include <net/route.h>
> +#include <net/route/route_ctl.h>
> +#include <net/route/route_var.h>
> +#include <net/vnet.h>
> +
> +#include <netinet/in.h>
> +#include <netinet/in_var.h>
> +#include <netinet/in_fib.h>
> +
> +#include <net/route/nhop_utils.h>
> +#include <net/route/nhop.h>
> +#include <net/route/nhop_var.h>
> +#include <net/route/nhgrp_var.h>
> +
> +/*
> + * This file contains data structures management logic for the nexthop
> + * groups ("nhgrp") route subsystem.
> + *
> + * Nexthop groups are used to store multiple routes available for the
> specific
> + *  prefix. Nexthop groups are immutable and can be shared across multiple
> + *  prefixes.
> + *
> + * Each group consists of a control plane part and a dataplane part.
> + * Control plane is basically a collection of nexthop objects with
> + *  weights and refcount.
> + *
> + * Datapath consists of an array of nexthop pointers, compiled from control
> + *  plane data to support O(1) nexthop selection.
> + *
> + * For example, consider the following group:
> + *  [(nh1, weight=100), (nh2, weight=200)]
> + * It will compile to the following array:
> + *  [nh1, nh2, nh2]
> + *
> + */
> +
> +static void consider_resize(struct nh_control *ctl, uint32_t
> new_nh_buckets,
> +    uint32_t new_idx_items);
> +
> +static int cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv
> *b);
> +static unsigned int hash_nhgrp(const struct nhgrp_priv *obj);
> +
> +static unsigned
> +djb_hash(const unsigned char *h, const int len)
> +{
> +	unsigned int result = 0;
> +	int i;
> +
> +	for (i = 0; i < len; i++)
> +		result = 33 * result ^ h[i];
> +
> +	return (result);
> +}
> +
> +static int
> +cmp_nhgrp(const struct nhgrp_priv *a, const struct nhgrp_priv *b)
> +{
> +
> +	/*
> +	 * In case of consistent hashing, there can be multiple nexthop groups
> +	 * with the same "control plane" list of nexthops with weights and a
> +	 * different set of "data plane" nexthops.
> +	 * For now, ignore the data plane and focus on the control plane list.
> +	 */
> +	if (a->nhg_nh_count != b->nhg_nh_count)
> +		return (0);
> +	return !memcmp(a->nhg_nh_weights, b->nhg_nh_weights,
> +	    sizeof(struct weightened_nhop) * a->nhg_nh_count);
> +}
> +
> +/*
> + * Hash callback: calculate hash of an object
> + */
> +static unsigned int
> +hash_nhgrp(const struct nhgrp_priv *obj)
> +{
> +	const unsigned char *key;
> +
> +	key = (const unsigned char *)obj->nhg_nh_weights;
> +
> +	return (djb_hash(key, sizeof(struct weightened_nhop) *
> obj->nhg_nh_count));
> +}
> +
> +/*
> + * Returns object referenced and unlocked
> + */
> +struct nhgrp_priv *
> +find_nhgrp(struct nh_control *ctl, const struct nhgrp_priv *key)
> +{
> +	struct nhgrp_priv *priv_ret;
> +
> +	NHOPS_RLOCK(ctl);
> +	CHT_SLIST_FIND_BYOBJ(&ctl->gr_head, mpath, key, priv_ret);
> +	if (priv_ret != NULL) {
> +		if (refcount_acquire_if_not_zero(&priv_ret->nhg_refcount) == 0) {
> +			/* refcount is 0 -> group is being deleted */
> +			priv_ret = NULL;
> +		}
> +	}
> +	NHOPS_RUNLOCK(ctl);
> +
> +	return (priv_ret);
> +}
> +
> +int
> +link_nhgrp(struct nh_control *ctl, struct nhgrp_priv *grp_priv)
> +{
> +	uint16_t idx;
> +	uint32_t new_num_buckets, new_num_items;
> +
> +	NHOPS_WLOCK(ctl);
> +	/* Check if we need to resize hash and index */
> +	new_num_buckets = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->gr_head);
> +	new_num_items = bitmask_get_resize_items(&ctl->gr_idx_head);
> +
> +	if (bitmask_alloc_idx(&ctl->gr_idx_head, &idx) != 0) {
> +		NHOPS_WUNLOCK(ctl);
> +		DPRINTF("Unable to allocate mpath index");
> +		consider_resize(ctl, new_num_buckets, new_num_items);
> +		return (0);
> +	}
> +
> +	grp_priv->nhg_idx = idx;
> +	grp_priv->nh_control = ctl;
> +	CHT_SLIST_INSERT_HEAD(&ctl->gr_head, mpath, grp_priv);
> +
> +	NHOPS_WUNLOCK(ctl);
> +
> +	consider_resize(ctl, new_num_buckets, new_num_items);
> +
> +	return (1);
> +}
> +
> +struct nhgrp_priv *
> +unlink_nhgrp(struct nh_control *ctl, struct nhgrp_priv *key)
> +{
> +	struct nhgrp_priv *nhg_priv_ret;
> +	int ret, idx;
> +
> +	NHOPS_WLOCK(ctl);
> +
> +	CHT_SLIST_REMOVE_BYOBJ(&ctl->gr_head, mpath, key, nhg_priv_ret);
> +
> +	if (nhg_priv_ret == NULL) {
> +		DPRINTF("Unable to find nhop group!");
> +		NHOPS_WUNLOCK(ctl);
> +		return (NULL);
> +	}
> +
> +	idx = nhg_priv_ret->nhg_idx;
> +	ret = bitmask_free_idx(&ctl->gr_idx_head, idx);
> +	nhg_priv_ret->nhg_idx = 0;
> +	nhg_priv_ret->nh_control = NULL;
> +
> +	NHOPS_WUNLOCK(ctl);
> +
> +	return (nhg_priv_ret);
> +}
> +
> +/*
> + * Checks if hash needs resizing and performs this resize if necessary
> + *
> + */
> +__noinline static void
> +consider_resize(struct nh_control *ctl, uint32_t new_nh_buckets, uint32_t
> new_idx_items)
> +{
> +	void *nh_ptr, *nh_idx_ptr;
> +	void *old_idx_ptr;
> +	size_t alloc_size;
> +
> +	nh_ptr = NULL ;
> +	if (new_nh_buckets != 0) {
> +		alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_nh_buckets);
> +		nh_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
> +	}
> +
> +	nh_idx_ptr = NULL;
> +	if (new_idx_items != 0) {
> +		alloc_size = bitmask_get_size(new_idx_items);
> +		nh_idx_ptr = malloc(alloc_size, M_NHOP, M_NOWAIT | M_ZERO);
> +	}
> +
> +	if (nh_ptr == NULL && nh_idx_ptr == NULL) {
> +		/* Either resize is not required or allocations have failed. */
> +		return;
> +	}
> +
> +	DPRINTF("mp: going to resize: nh:[ptr:%p sz:%u] idx:[ptr:%p sz:%u]",
> +	    nh_ptr, new_nh_buckets, nh_idx_ptr, new_idx_items);
> +
> +	old_idx_ptr = NULL;
> +
> +	NHOPS_WLOCK(ctl);
> +	if (nh_ptr != NULL) {
> +		CHT_SLIST_RESIZE(&ctl->gr_head, mpath, nh_ptr, new_nh_buckets);
> +	}
> +	if (nh_idx_ptr != NULL) {
> +		if (bitmask_copy(&ctl->gr_idx_head, nh_idx_ptr, new_idx_items))
> +			bitmask_swap(&ctl->nh_idx_head, nh_idx_ptr, new_idx_items,
> &old_idx_ptr);
> +	}
> +	NHOPS_WUNLOCK(ctl);
> +
> +	if (nh_ptr != NULL)
> +		free(nh_ptr, M_NHOP);
> +	if (old_idx_ptr != NULL)
> +		free(old_idx_ptr, M_NHOP);
> +}
> +
> +/*
> + * Function allocating the necessary group data structures.
> + */
> +bool
> +nhgrp_ctl_alloc_default(struct nh_control *ctl, int malloc_flags)
> +{
> +	size_t alloc_size;
> +	uint32_t num_buckets, num_items;
> +	void *cht_ptr, *mask_ptr;
> +
> +	malloc_flags = (malloc_flags & (M_NOWAIT | M_WAITOK)) | M_ZERO;
> +
> +	num_buckets = 8;
> +	alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets);
> +	cht_ptr = malloc(alloc_size, M_NHOP, malloc_flags);
> +
> +	if (cht_ptr == NULL) {
> +		DPRINTF("mpath init failed");
> +		return (false);
> +	}
> +
> +	/*
> +	 * Allocate nexthop index bitmask.
> +	 */
> +	num_items = 128;
> +	mask_ptr = malloc(bitmask_get_size(num_items), M_NHOP, malloc_flags);
> +	if (mask_ptr == NULL) {
> +		DPRINTF("mpath bitmask init failed");
> +		free(cht_ptr, M_NHOP);
> +		return (false);
> +	}
> +
> +	NHOPS_WLOCK(ctl);
> +
> +	if (ctl->gr_head.hash_size == 0) {
> +		/* Init hash and bitmask */
> +		CHT_SLIST_INIT(&ctl->gr_head, cht_ptr, num_buckets);
> +		bitmask_init(&ctl->gr_idx_head, mask_ptr, num_items);
> +		NHOPS_WUNLOCK(ctl);
> +	} else {
> +		/* Other thread has already initialized hash/bitmask */
> +		NHOPS_WUNLOCK(ctl);
> +		free(cht_ptr, M_NHOP);
> +		free(mask_ptr, M_NHOP);
> +	}
> +
> +	DPRINTF("mpath init done for fib/af %d/%d", ctl->rh->rib_fibnum,
> +	    ctl->rh->rib_family);
> +
> +	return (true);
> +}
> +
> +int
> +nhgrp_ctl_init(struct nh_control *ctl)
> +{
> +
> +	/*
> +	 * By default, do not allocate data structures, as multipath
> +	 * routes will not necessarily be used.
> +	 */
> +	CHT_SLIST_INIT(&ctl->gr_head, NULL, 0);
> +	bitmask_init(&ctl->gr_idx_head, NULL, 0);
> +	return (0);
> +}
> +
> +void
> +nhgrp_ctl_free(struct nh_control *ctl)
> +{
> +
> +	if (ctl->gr_head.ptr != NULL)
> +		free(ctl->gr_head.ptr, M_NHOP);
> +	if (ctl->gr_idx_head.idx != NULL)
> +		free(ctl->gr_idx_head.idx, M_NHOP);
> +}
> +
> +void
> +nhgrp_ctl_unlink_all(struct nh_control *ctl)
> +{
> +	struct nhgrp_priv *nhg_priv;
> +
> +	NHOPS_WLOCK_ASSERT(ctl);
> +
> +	CHT_SLIST_FOREACH(&ctl->gr_head, mpath, nhg_priv) {
> +		DPRINTF("Marking nhgrp %u unlinked", nhg_priv->nhg_idx);
> +		refcount_release(&nhg_priv->nhg_linked);
> +	} CHT_SLIST_FOREACH_END;
> +}
> +
>
> Added: head/sys/net/route/nhgrp_ctl.c
> ==============================================================================
> --- /dev/null	00:00:00 1970	(empty, because file is newly added)
> +++ head/sys/net/route/nhgrp_ctl.c	Sat Oct  3 10:47:17 2020	(r366390)
> @@ -0,0 +1,788 @@
> +/*-
> + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
> + *
> + * Copyright (c) 2020 Alexander V. Chernikov
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
> PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
> STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
> WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + *
> + * $FreeBSD$
> + */
> +#define RTDEBUG
> +#include "opt_inet.h"
> +#include "opt_route.h"
> +
> +#include <sys/cdefs.h>
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/lock.h>
> +#include <sys/rmlock.h>
> +#include <sys/malloc.h>
> +#include <sys/mbuf.h>
> +#include <sys/refcount.h>
> +#include <sys/socket.h>
> +#include <sys/sysctl.h>
> +#include <sys/kernel.h>
> +#include <sys/epoch.h>
> +
> +#include <net/if.h>
> +#include <net/if_var.h>
> +#include <net/route.h>
> +#include <net/route/route_ctl.h>
> +#include <net/route/route_var.h>
> +#include <net/vnet.h>
> +
> +#include <netinet/in.h>
> +#include <netinet/in_var.h>
> +#include <netinet/in_fib.h>
> +
> +#include <net/route/nhop_utils.h>
> +#include <net/route/nhop.h>
> +#include <net/route/nhop_var.h>
> +#include <net/route/nhgrp_var.h>
> +
> +/*
> + * This file contains the supporting functions for creating multipath
> groups
> + *  and compiling their dataplane parts.
> + */
> +
> +/* MPF_MULTIPATH must be the same as NHF_MULTIPATH for nhop selection to
> work */
> +_Static_assert(MPF_MULTIPATH == NHF_MULTIPATH,
> +    "MPF_MULTIPATH must be the same as NHF_MULTIPATH");
> +/* Offset and size of flags field has to be the same for nhop/nhop groups
> */
> +CHK_STRUCT_FIELD_GENERIC(struct nhop_object, nh_flags, struct nhgrp_object,
> nhg_flags);
> +/* Cap multipath to 64, as the larger values would break rib_cmd_info
> bmasks */
> +CTASSERT(RIB_MAX_MPATH_WIDTH <= 64);
> +
> +static int wn_cmp(const void *a, const void *b);
> +static void sort_weightened_nhops(struct weightened_nhop *wn, int
> num_nhops);
> +
> +static struct nhgrp_priv *get_nhgrp(struct nh_control *ctl,
> +    struct weightened_nhop *wn, int num_nhops, int *perror);
> +static void destroy_nhgrp(struct nhgrp_priv *nhg_priv);
> +static void destroy_nhgrp_epoch(epoch_context_t ctx);
> +static void free_nhgrp_nhops(struct nhgrp_priv *nhg_priv);
> +
> +static int
> +wn_cmp(const void *a, const void *b)
> +{
> +	const struct weightened_nhop *wa = a;
> +	const struct weightened_nhop *wb = b;
> +
> +	if (wa->weight > wb->weight)
> +		return (1);
> +	else if (wa->weight < wb->weight)
> +		return (-1);
> +
> +	/* Compare nexthops by pointer */
> +	if (wa->nh > wb->nh)
> +		return (1);
> +	else if (wa->nh < wb->nh)
> +		return (-1);
> +	else
> +		return (0);
> +}
> +
> +/*
> + * Perform in-place sorting for array of nexthops in @wn.
> + *
> + * To avoid nh groups duplication, nexthops/weights in the
> + *   @wn need to be ordered deterministically.
> + * As this sorting is needed only for the control plane functionality,
> + *  there are no specific external requirements.
> + *
> + * Sort by weight first, to ease calculation of the slot sizes.
> + */
> +static void
> +sort_weightened_nhops(struct weightened_nhop *wn, int num_nhops)
> +{
> +
> +	qsort(wn, num_nhops, sizeof(struct weightened_nhop), wn_cmp);
> +}
> +
> +/*
> + * Calculate minimum number of slots required to fit the existing
> + * set of weights in the common use case where weights are "easily"
> + * comparable.
> + * Assumes @wn is sorted by weight ascending and each weight is > 0.
> + * Returns number of slots or 0 if precise calculation failed.
> + *
> + * Some examples:
> + * note: (i, X) pair means (nhop=i, weight=X):
> + * (1, 1) (2, 2) -> 3 slots [1, 2, 2]
> + * (1, 100), (2, 200) -> 3 slots [1, 2, 2]
> + * (1, 100), (2, 200), (3, 400) -> 7 slots [1, 2, 2, 3, 3, 3, 3]
> + */
> +static uint32_t
> +calc_min_mpath_slots_fast(const struct weightened_nhop *wn, size_t
> num_items)
> +{
> +	uint32_t i, last, xmin;
> +	uint64_t total = 0;
> +
> +	last = 0;
> +	xmin = wn[0].weight;
> +	for (i = 0; i < num_items; i++) {
> +		total += wn[i].weight;
> +		if ((wn[i].weight - last < xmin) && (wn[i].weight != last))
> +			xmin = wn[i].weight - last;
> +		last = wn[i].weight;
> +	}
> +	/* xmin is the minimum unit of desired capacity */
> +	if ((total % xmin) != 0)
> +		return (0);
> +	for (i = 0; i < num_items; i++) {
> +		if ((wn[i].weight % xmin) != 0)
> +			return (0);
> +	}
> +
> +	return ((uint32_t)(total / xmin));
> +}
> +
> +/*
> + * Calculate minimum number of slots required to fit the existing
> + * set of weights while maintaining weight coefficients.
> + *
> + * Assume @wn is sorted by weight ascending and each weight is > 0.
> + *
> + * Tries to find simple precise solution first and falls back to
> + *  RIB_MAX_MPATH_WIDTH in case of any failure.
> + */
> +static uint32_t
> +calc_min_mpath_slots(const struct weightened_nhop *wn, size_t num_items)
> +{
> +	uint32_t v;
> +
> +	v = calc_min_mpath_slots_fast(wn, num_items);
> +	if ((v == 0) || (v > RIB_MAX_MPATH_WIDTH))
> +		v = RIB_MAX_MPATH_WIDTH;
> +
> +	return (v);
> +}
> +
> +/*
> + * Nexthop group data consists of
> + * 1) dataplane part, with nhgrp_object as a header followed by an
> + *   arbitrary number of nexthop pointers.
> + * 2) control plane part, with nhgrp_priv as a header, followed by
> + *   an arbitrary number of 'struct weightened_nhop' objects.
> + *
> + * Given nexthop groups are (mostly) immutable, allocate all data
> + * in one go.
> + *
> + */
> +__noinline static size_t
> +get_nhgrp_alloc_size(uint32_t nhg_size, uint32_t num_nhops)
> +{
> +	size_t sz;
> +
> +	sz = sizeof(struct nhgrp_object);
> +	sz += nhg_size * sizeof(struct nhop_object *);
> +	sz += sizeof(struct nhgrp_priv);
> +	sz += num_nhops * sizeof(struct weightened_nhop);
> +	return (sz);
> +}
> +
> +/*
> + * Compile actual list of nexthops to be used by datapath from
> + *  the nexthop group @dst.
> + *
> + * For example, compiling control plane list of 2 nexthops
> + *  [(200, A), (100, B)] would result in the datapath array
> + *  [A, A, B]
> + */
> +static void
> +compile_nhgrp(struct nhgrp_priv *dst_priv, const struct weightened_nhop
> *x,
> +    uint32_t num_slots)
> +{
> +	struct nhgrp_object *dst;
> +	int i, slot_idx, remaining_slots;
> +	uint64_t remaining_sum, nh_weight, nh_slots;
> +
> +	slot_idx  = 0;
> +	dst = dst_priv->nhg;
> +	/* Calculate sum of all weights */
> +	remaining_sum = 0;
> +	for (i = 0; i < dst_priv->nhg_nh_count; i++)
> +		remaining_sum += x[i].weight;
> +	remaining_slots = num_slots;
> +	DPRINTF("O: %u/%u", (uint32_t)remaining_sum, remaining_slots);
> +	for (i = 0; i < dst_priv->nhg_nh_count; i++) {
> +		/* Calculate number of slots for the current nexthop */
> +		if (remaining_sum > 0) {
> +			nh_weight = (uint64_t)x[i].weight;
> +			nh_slots = (nh_weight * remaining_slots / remaining_sum);
> +		} else
> +			nh_slots = 0;
> +
> +		remaining_sum -= x[i].weight;
> +		remaining_slots -= nh_slots;
> +
> +		DPRINTF(" OO[%d]: %u/%u curr=%d slot_idx=%d", i,
> +		    (uint32_t)remaining_sum, remaining_slots,
> +		    (int)nh_slots, slot_idx);
> +
> +		KASSERT((slot_idx + nh_slots <= num_slots),
> +		    ("index overflow during nhg compilation"));
> +		while (nh_slots-- > 0)
> +			dst->nhops[slot_idx++] = x[i].nh;
> +	}
> +}
> +
> +/*
> + * Allocates new nexthop group for the list of weightened nexthops.
> + * Assume sorted list.
> + * Does NOT reference any nexthops in the group.
> + * Returns group with refcount=1 or NULL.
> + */
> +static struct nhgrp_priv *
> +alloc_nhgrp(struct weightened_nhop *wn, int num_nhops)
> +{
> +	uint32_t nhgrp_size;
> +	int flags = M_NOWAIT;
> +	struct nhgrp_object *nhg;
> +	struct nhgrp_priv *nhg_priv;
> +
> +	nhgrp_size = calc_min_mpath_slots(wn, num_nhops);
> +	if (nhgrp_size == 0) {
> +		/* Zero weights, abort */
> +		return (NULL);
> +	}
> +
> +	size_t sz = get_nhgrp_alloc_size(nhgrp_size, num_nhops);
> +	nhg = malloc(sz, M_NHOP, flags | M_ZERO);
> +	if (nhg == NULL) {
> +		return (NULL);
> +	}
> +
> +	/* Has to be the first to make NHGRP_PRIV() work */
> +	nhg->nhg_size = nhgrp_size;
> +	DPRINTF("new mpath group: num_nhops: %u", (uint32_t)nhgrp_size);
> +	nhg->nhg_flags = MPF_MULTIPATH;
> +
> +	nhg_priv = NHGRP_PRIV(nhg);
> +	nhg_priv->nhg_nh_count = num_nhops;
> +	refcount_init(&nhg_priv->nhg_refcount, 1);
> +
> +	/* Please see nhgrp_free() comments on the initial value */
> +	refcount_init(&nhg_priv->nhg_linked, 2);
> +
> +	nhg_priv->nhg = nhg;
> +	memcpy(&nhg_priv->nhg_nh_weights[0], wn,
> +	  num_nhops * sizeof(struct weightened_nhop));
> +
> +	compile_nhgrp(nhg_priv, wn, nhg->nhg_size);
> +
> +	return (nhg_priv);
> +}
> +
> +void
> +nhgrp_free(struct nhgrp_object *nhg)
> +{
> +	struct nhgrp_priv *nhg_priv;
> +	struct nh_control *ctl;
> +	struct epoch_tracker et;
> +
> +	nhg_priv = NHGRP_PRIV(nhg);
> +
> +	if (!refcount_release(&nhg_priv->nhg_refcount))
> +		return;
> +
> +	/*
> +	 * Group objects don't have an explicit lock attached to them.
> +	 * As groups are reclaimed based on reference count, it is possible
> +	 * that some groups will persist after the vnet destruction callback
> +	 * is called. Given that, handle the scenario with nhgrp_free_group()
> +	 * being called either after or simultaneously with
> +	 * nhgrp_ctl_unlink_all() by using another reference counter: nhg_linked.
> +	 *
> +	 * There are only 2 places, where nhg_linked can be decreased:
> +	 *  rib destroy (nhgrp_ctl_unlink_all) and this function.
> +	 * nhg_linked can never be increased.
> +	 *
> +	 * Hence, use initial value of 2 to make use of
> +	 *  refcount_release_if_not_last().
> +	 *
> +	 * There can be two scenarios when calling this function:
> +	 *
> +	 * 1) nhg_linked value is 2. This means that either
> +	 *  nhgrp_ctl_unlink_all() has not been called OR it is running,
> +	 *  but we are guaranteed that nh_control won't be freed in
> +	 *  this epoch. Hence, nexthop can be safely unlinked.
> +	 *
> +	 * 2) nhg_linked value is 1. In that case, nhgrp_ctl_unlink_all()
> +	 *  has been called and nhgrp unlink can be skipped.
> +	 */
> +
> +	NET_EPOCH_ENTER(et);
> +	if (refcount_release_if_not_last(&nhg_priv->nhg_linked)) {
> +		ctl = nhg_priv->nh_control;
> +		if (unlink_nhgrp(ctl, nhg_priv) == NULL) {
> +			/* Do not try to reclaim */
> +			DPRINTF("Failed to unlink nexhop group %p", nhg_priv);
> +			NET_EPOCH_EXIT(et);
> +			return;
> +		}
> +	}
> +	NET_EPOCH_EXIT(et);
> +
> +	epoch_call(net_epoch_preempt, destroy_nhgrp_epoch,
> +	    &nhg_priv->nhg_epoch_ctx);
> +}
> +
> +/*
> + * Destroys all local resources belonging to @nhg_priv.
> + */
> +__noinline static void
> +destroy_nhgrp_int(struct nhgrp_priv *nhg_priv)
> +{
> +
> +	free(nhg_priv->nhg, M_NHOP);
> +}
> +
> +__noinline static void
> +destroy_nhgrp(struct nhgrp_priv *nhg_priv)
> +{
> +
> +	KASSERT((nhg_priv->nhg_refcount == 0), ("nhg_refcount != 0"));
> +
> +	DPRINTF("DEL MPATH %p", nhg_priv);
> +
> +	KASSERT((nhg_priv->nhg_idx == 0), ("gr_idx != 0"));
> +
> +	free_nhgrp_nhops(nhg_priv);
> +
> +	destroy_nhgrp_int(nhg_priv);
> +}
> +
> +/*
> + * Epoch callback indicating group is safe to destroy
> + */
> +static void
> +destroy_nhgrp_epoch(epoch_context_t ctx)
> +{
> +	struct nhgrp_priv *nhg_priv;
> +
> +	nhg_priv = __containerof(ctx, struct nhgrp_priv, nhg_epoch_ctx);
> +
> +	destroy_nhgrp(nhg_priv);
> +}
> +
> +static bool
> +ref_nhgrp_nhops(struct nhgrp_priv *nhg_priv)
> +{
> +
> +	for (int i = 0; i < nhg_priv->nhg_nh_count; i++) {
>
> *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
>


-- 
Mateusz Guzik <mjguzik gmail.com>


More information about the svn-src-all mailing list