Re: git: 91f44749c6fe - main - ifnet: make if_index global

From: Marko Zec <zec_at_fer.hr>
Date: Sat, 29 Jan 2022 15:20:34 UTC
On Thu, 27 Jan 2022 06:00:01 GMT
Gleb Smirnoff <glebius@FreeBSD.org> wrote:

> The branch main has been updated by glebius:
> 
> URL:
> https://cgit.FreeBSD.org/src/commit/?id=91f44749c6feb50f39af8805dd803e860f0418f1
> 
> commit 91f44749c6feb50f39af8805dd803e860f0418f1
> Author:     Gleb Smirnoff <glebius@FreeBSD.org>
> AuthorDate: 2022-01-27 05:58:44 +0000
> Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
> CommitDate: 2022-01-27 05:58:44 +0000
> 
>     ifnet: make if_index global
>     
>     Now that ifindex is static to if.c we can unvirtualize it.  For
> lifetime of an ifnet its index never changes.  To avoid leaking
> foreign interfaces the net.link.generic.system.ifcount sysctl and the
> ifnet_byindex() KPI filter their returned value on curvnet.  Since
> if_vmove() no longer changes the if_index, inline ifindex_alloc() and
> ifindex_free() into if_alloc() and if_free() respectively.
>     
>     API wise the only change is that now minimum interface index can
> be greater than 1.

IMO this is a huge change in an API which has been stable like forever.
Was this tested with a broader spectrum of userland consumers, such as
FRR & alike?

> The holes in interface indexes were always allowed.

True, the holes were formally allowed but were never huge, and now they
will be guaranteed with #vnets > 1.  Again, that's not something 3rd
party tools would expect.

What could be the repercussions of ifindex globalization to 16-bit size
of ifru_index in struct ifreq?

What can prevent a vnet to inflate the global ifindex space, causing
the linear scan in sysctl_ifcount() to become a global problem?

Why do we now need to check wheter curvnet != NULL in ifnet_byindex(),
i.e. are there callers to ifnet_byindex() where curvnet is not set?  If
so, that smells like a bug in the caller?

Or, is it the other way around, that ifnet_byindex() is now intended to
become a silent workaround for callers which lack curvnet context?  If
that is so, it is indeed becoming a major VNET KPI feature (though I'd
call it a bug), and as such, it should have been clearly stated in both
the commit message, and as a comment in the ifnet_byindex() code.

Finally, the commit message does not reveal any clues what this change
actually tries to accomplish.  The claim "Now that ifindex is static to
if.c we can unvirtualize it." tell us nothing about the true intent.

Sorry for chiming in late, but my sentiment against this change is
obvious...


>     Reviewed by:            kp
>     Differential revision:  https://reviews.freebsd.org/D33672
> ---
>  sys/net/if.c | 216
> ++++++++++++++++++++++++----------------------------------- 1 file
> changed, 86 insertions(+), 130 deletions(-)
> 
> diff --git a/sys/net/if.c b/sys/net/if.c
> index 4622f978acd9..f148ae8c9c6d 100644
> --- a/sys/net/if.c
> +++ b/sys/net/if.c
> @@ -311,19 +311,30 @@ VNET_DEFINE(struct ifnethead, ifnet);	/*
> depend on static init XXX */ VNET_DEFINE(struct ifgrouphead,
> ifg_head); 
>  /* Table of ifnet by index. */
> -VNET_DEFINE_STATIC(int, if_index);
> -#define	V_if_index		VNET(if_index)
> -VNET_DEFINE_STATIC(int, if_indexlim) = 8;
> -#define	V_if_indexlim		VNET(if_indexlim)
> -VNET_DEFINE_STATIC(struct ifnet **, ifindex_table);
> -#define	V_ifindex_table		VNET(ifindex_table)
> +static int if_index;
> +static int if_indexlim = 8;
> +static struct ifnet **ifindex_table;
>  
>  SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system,
>      CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
>      "Variables global to all interfaces");
> -SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount,
> -    CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(if_index), 0,
> -    "Number of configured interfaces");
> +static int
> +sysctl_ifcount(SYSCTL_HANDLER_ARGS)
> +{
> +	int rv = 0;
> +
> +	IFNET_RLOCK();
> +	for (int i = 1; i <= if_index; i++)
> +		if (ifindex_table[i] != NULL &&
> +		    ifindex_table[i]->if_vnet == curvnet)
> +			rv = i;
> +	IFNET_RUNLOCK();
> +
> +	return (sysctl_handle_int(oidp, &rv, 0, req));
> +}
> +SYSCTL_PROC(_net_link_generic_system, IFMIB_IFCOUNT, ifcount,
> +    CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RD, NULL, 0,
> sysctl_ifcount, "I",
> +    "Maximum known interface index");
>  
>  /*
>   * The global network interface list (V_ifnet) and related state
> (such as @@ -352,13 +363,19 @@ MALLOC_DEFINE(M_IFMADDR,
> "ether_multi", "link-level multicast address"); struct ifnet *
>  ifnet_byindex(u_int idx)
>  {
> +	struct ifnet *ifp;
>  
>  	NET_EPOCH_ASSERT();
>  
> -	if (__predict_false(idx > V_if_index))
> +	if (__predict_false(idx > if_index))
>  		return (NULL);
>  
> -	return (ck_pr_load_ptr(&V_ifindex_table[idx]));
> +	ifp = ck_pr_load_ptr(&ifindex_table[idx]);
> +
> +	if (curvnet != NULL && ifp != NULL && ifp->if_vnet !=
> curvnet)
> +		ifp = NULL;
> +
> +	return (ifp);
>  }
>  
>  struct ifnet *
> @@ -374,58 +391,6 @@ ifnet_byindex_ref(u_int idx)
>  	return (ifp);
>  }
>  
> -/*
> - * Allocate an ifindex array entry.
> - */
> -static void
> -ifindex_alloc(struct ifnet *ifp)
> -{
> -	u_short idx;
> -
> -	IFNET_WLOCK();
> -	/*
> -	 * Try to find an empty slot below V_if_index.  If we fail,
> take the
> -	 * next slot.
> -	 */
> -	for (idx = 1; idx <= V_if_index; idx++) {
> -		if (V_ifindex_table[idx] == NULL)
> -			break;
> -	}
> -
> -	/* Catch if_index overflow. */
> -	if (idx >= V_if_indexlim) {
> -		struct ifnet **new, **old;
> -		int newlim;
> -
> -		newlim = V_if_indexlim * 2;
> -		new = malloc(newlim * sizeof(*new), M_IFNET,
> M_WAITOK | M_ZERO);
> -		memcpy(new, V_ifindex_table, V_if_indexlim *
> sizeof(*new));
> -		old = V_ifindex_table;
> -		ck_pr_store_ptr(&V_ifindex_table, new);
> -		V_if_indexlim = newlim;
> -		epoch_wait_preempt(net_epoch_preempt);
> -		free(old, M_IFNET);
> -	}
> -	if (idx > V_if_index)
> -		V_if_index = idx;
> -
> -	ifp->if_index = idx;
> -	ck_pr_store_ptr(&V_ifindex_table[idx], ifp);
> -	IFNET_WUNLOCK();
> -}
> -
> -static void
> -ifindex_free(u_short idx)
> -{
> -
> -	IFNET_WLOCK_ASSERT();
> -
> -	ck_pr_store_ptr(&V_ifindex_table[idx], NULL);
> -	while (V_if_index > 0 &&
> -	    V_ifindex_table[V_if_index] == NULL)
> -		V_if_index--;
> -}
> -
>  struct ifaddr *
>  ifaddr_byindex(u_short idx)
>  {
> @@ -448,33 +413,24 @@ ifaddr_byindex(u_short idx)
>   */
>  
>  static void
> -vnet_if_init(const void *unused __unused)
> +if_init(void *arg __unused)
>  {
>  
> -	CK_STAILQ_INIT(&V_ifnet);
> -	CK_STAILQ_INIT(&V_ifg_head);
> -	V_ifindex_table = malloc(V_if_indexlim *
> sizeof(*V_ifindex_table),
> +	ifindex_table = malloc(if_indexlim * sizeof(*ifindex_table),
>  	    M_IFNET, M_WAITOK | M_ZERO);
> -	vnet_if_clone_init();
>  }
> -VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND,
> vnet_if_init,
> -    NULL);
> +SYSINIT(if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init, NULL);
>  
> -#ifdef VIMAGE
>  static void
> -vnet_if_uninit(const void *unused __unused)
> +vnet_if_init(const void *unused __unused)
>  {
>  
> -	VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq
> &V_ifnet=%p "
> -	    "not empty", __func__, __LINE__, &V_ifnet));
> -	VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq
> &V_ifg_head=%p "
> -	    "not empty", __func__, __LINE__, &V_ifg_head));
> -
> -	free((caddr_t)V_ifindex_table, M_IFNET);
> +	CK_STAILQ_INIT(&V_ifnet);
> +	CK_STAILQ_INIT(&V_ifg_head);
> +	vnet_if_clone_init();
>  }
> -VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST,
> -    vnet_if_uninit, NULL);
> -#endif
> +VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND,
> vnet_if_init,
> +    NULL);
>  
>  static void
>  if_link_ifnet(struct ifnet *ifp)
> @@ -568,6 +524,7 @@ static struct ifnet *
>  if_alloc_domain(u_char type, int numa_domain)
>  {
>  	struct ifnet *ifp;
> +	u_short idx;
>  
>  	KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large"));
>  	if (numa_domain == IF_NODOM)
> @@ -607,7 +564,37 @@ if_alloc_domain(u_char type, int numa_domain)
>  	ifp->if_get_counter = if_get_counter_default;
>  	ifp->if_pcp = IFNET_PCP_NONE;
>  
> -	ifindex_alloc(ifp);
> +	/* Allocate an ifindex array entry. */
> +	IFNET_WLOCK();
> +	/*
> +	 * Try to find an empty slot below if_index.  If we fail,
> take the
> +	 * next slot.
> +	 */
> +	for (idx = 1; idx <= if_index; idx++) {
> +		if (ifindex_table[idx] == NULL)
> +			break;
> +	}
> +
> +	/* Catch if_index overflow. */
> +	if (idx >= if_indexlim) {
> +		struct ifnet **new, **old;
> +		int newlim;
> +
> +		newlim = if_indexlim * 2;
> +		new = malloc(newlim * sizeof(*new), M_IFNET,
> M_WAITOK | M_ZERO);
> +		memcpy(new, ifindex_table, if_indexlim *
> sizeof(*new));
> +		old = ifindex_table;
> +		ck_pr_store_ptr(&ifindex_table, new);
> +		if_indexlim = newlim;
> +		epoch_wait_preempt(net_epoch_preempt);
> +		free(old, M_IFNET);
> +	}
> +	if (idx > if_index)
> +		if_index = idx;
> +
> +	ifp->if_index = idx;
> +	ck_pr_store_ptr(&ifindex_table[idx], ifp);
> +	IFNET_WUNLOCK();
>  
>  	return (ifp);
>  }
> @@ -677,23 +664,18 @@ if_free(struct ifnet *ifp)
>  	 * epoch and then dereferencing ifp while we peform
> if_free(),
>  	 * and after if_free() finished, too.
>  	 *
> -	 * The reason is the VIMAGE.  For some reason it was designed
> -	 * to require all sockets drained before destroying, but not
> all
> -	 * ifnets.  A vnet destruction calls if_vmove() on ifnet,
> which
> -	 * causes ID change.  But ID change and a possible
> misidentification
> -	 * of an ifnet later is a lesser problem, as it doesn't
> crash kernel.
> -	 * A worse problem is that removed interface may outlive the
> vnet it
> -	 * belongs too!  The if_free_deferred() would see
> ifp->if_vnet freed.
> +	 * This early index freeing was important back when ifindex
> was
> +	 * virtualized and interface would outlive the vnet.
>  	 */
> -	CURVNET_SET_QUIET(ifp->if_vnet);
>  	IFNET_WLOCK();
> -	MPASS(V_ifindex_table[ifp->if_index] == ifp);
> -	ifindex_free(ifp->if_index);
> +	MPASS(ifindex_table[ifp->if_index] == ifp);
> +	ck_pr_store_ptr(&ifindex_table[ifp->if_index], NULL);
> +	while (if_index > 0 && ifindex_table[if_index] == NULL)
> +		if_index--;
>  	IFNET_WUNLOCK();
>  
>  	if (refcount_release(&ifp->if_refcount))
>  		NET_EPOCH_CALL(if_free_deferred, &ifp->if_epoch_ctx);
> -	CURVNET_RESTORE();
>  }
>  
>  /*
> @@ -837,7 +819,7 @@ if_attach_internal(struct ifnet *ifp, bool vmove)
>  	struct sockaddr_dl *sdl;
>  	struct ifaddr *ifa;
>  
> -	MPASS(V_ifindex_table[ifp->if_index] == ifp);
> +	MPASS(ifindex_table[ifp->if_index] == ifp);
>  
>  #ifdef VIMAGE
>  	ifp->if_vnet = curvnet;
> @@ -1287,17 +1269,6 @@ if_vmove(struct ifnet *ifp, struct vnet
> *new_vnet) if (rc != 0)
>  		return (rc);
>  
> -	/*
> -	 * Unlink the ifnet from ifindex_table[] in current vnet,
> and shrink
> -	 * the if_index for that vnet if possible.
> -	 *
> -	 * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be
> unvirtualized,
> -	 * or we'd lock on one vnet and unlock on another.
> -	 */
> -	IFNET_WLOCK();
> -	ifindex_free(ifp->if_index);
> -	IFNET_WUNLOCK();
> -
>  	/*
>  	 * Perform interface-specific reassignment tasks, if
> provided by
>  	 * the driver.
> @@ -1309,7 +1280,6 @@ if_vmove(struct ifnet *ifp, struct vnet
> *new_vnet)
>  	 * Switch to the context of the target vnet.
>  	 */
>  	CURVNET_SET_QUIET(new_vnet);
> -	ifindex_alloc(ifp);
>  	if_attach_internal(ifp, true);
>  
>  #ifdef DEV_BPF
> @@ -1945,7 +1915,6 @@ ifa_ifwithnet(const struct sockaddr *addr, int
> ignore_ptp, int fibnum) struct ifaddr *ifa_maybe = NULL;
>  	u_int af = addr->sa_family;
>  	const char *addr_data = addr->sa_data, *cplim;
> -	const struct sockaddr_dl *sdl;
>  
>  	NET_EPOCH_ASSERT();
>  	/*
> @@ -1953,14 +1922,9 @@ ifa_ifwithnet(const struct sockaddr *addr, int
> ignore_ptp, int fibnum)
>  	 * so do that if we can.
>  	 */
>  	if (af == AF_LINK) {
> -		sdl = (const struct sockaddr_dl *)addr;
> -		if (sdl->sdl_index && sdl->sdl_index <= V_if_index) {
> -			ifp = ifnet_byindex(sdl->sdl_index);
> -			if (ifp == NULL)
> -				return (NULL);
> -
> -			return (ifp->if_addr);
> -		}
> +		ifp = ifnet_byindex(
> +		    ((const struct sockaddr_dl *)addr)->sdl_index);
> +		return (ifp ? ifp->if_addr : NULL);
>  	}
>  
>  	/*
> @@ -4596,24 +4560,16 @@ DB_SHOW_COMMAND(ifnet, db_show_ifnet)
>  
>  DB_SHOW_ALL_COMMAND(ifnets, db_show_all_ifnets)
>  {
> -	VNET_ITERATOR_DECL(vnet_iter);
>  	struct ifnet *ifp;
>  	u_short idx;
>  
> -	VNET_FOREACH(vnet_iter) {
> -		CURVNET_SET_QUIET(vnet_iter);
> -#ifdef VIMAGE
> -		db_printf("vnet=%p\n", curvnet);
> -#endif
> -		for (idx = 1; idx <= V_if_index; idx++) {
> -			ifp = V_ifindex_table[idx];
> -			if (ifp == NULL)
> -				continue;
> -			db_printf( "%20s ifp=%p\n", ifp->if_xname,
> ifp);
> -			if (db_pager_quit)
> -				break;
> -		}
> -		CURVNET_RESTORE();
> +	for (idx = 1; idx <= if_index; idx++) {
> +		ifp = ifindex_table[idx];
> +		if (ifp == NULL)
> +			continue;
> +		db_printf( "%20s ifp=%p\n", ifp->if_xname, ifp);
> +		if (db_pager_quit)
> +			break;
>  	}
>  }
>  #endif	/* DDB */