git: 36e15b717eec - main - routing: Fix crashes with dpdk_lpm[46] algo.

Alexander V. Chernikov melifaro at FreeBSD.org
Tue Aug 17 21:12:03 UTC 2021


The branch main has been updated by melifaro:

URL: https://cgit.FreeBSD.org/src/commit/?id=36e15b717eec80047fe7442898b5752101f2fbca

commit 36e15b717eec80047fe7442898b5752101f2fbca
Author:     Alexander V. Chernikov <melifaro at FreeBSD.org>
AuthorDate: 2021-08-15 22:25:21 +0000
Commit:     Alexander V. Chernikov <melifaro at FreeBSD.org>
CommitDate: 2021-08-17 20:46:22 +0000

    routing: Fix crashes with dpdk_lpm[46] algo.
    
    When a prefix gets deleted from the RIB, dpdk_lpm algo needs to know
     the nexthop of the "parent" prefix to update its internal state.
    The glue code, which utilises RIB as a backing route store, uses
     fib[46]_lookup_rt() for the prefix destination after its deletion
     to fetch the desired nexthop.
    This approach does not work when deleting less-specific prefixes
     with most-specific ones are still present. For example, if
     10.0.0.0/24, 10.0.0.0/23 and 10.0.0.0/22 exist in RIB, deleting
     10.0.0.0/23 would result in 10.0.0.0/24 being returned as a search
     result instead of 10.0.0.0/22. This, in turn, results in the failed
     datastructure update: part of the deleted /23 prefix will still
     contain the reference to an old nexthop. This leads to the
     use-after-free behaviour, ending with the eventual crashes.
    
    Fix the logic flaw by properly fetching the prefix "parent" via
     newly-created rt_get_inet[6]_parent() helpers.
    
    Differential Revision: https://reviews.freebsd.org/D31546
    PR:     256882,256833
    MFC after:      1 week
---
 sys/contrib/dpdk_rte_lpm/dpdk_lpm.c  |  32 ++++----
 sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c |  42 +++++-----
 sys/net/radix.c                      |  14 ++++
 sys/net/radix.h                      |   1 +
 sys/net/route/route_ctl.h            |   3 +
 sys/net/route/route_helpers.c        | 150 +++++++++++++++++++++++++++++++++++
 6 files changed, 208 insertions(+), 34 deletions(-)

diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c b/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
index af145997c4d6..51cd134132f6 100644
--- a/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
+++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
@@ -134,26 +134,27 @@ handle_default_change(struct dpdk_lpm_data *dd, struct rib_cmd_info *rc)
 }
 
 static void
-get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, uint8_t *plen, uint32_t *nhop_idx)
+get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, int plen,
+    uint8_t *pplen, uint32_t *nhop_idx)
 {
-	struct route_nhop_data rnd;
 	struct rtentry *rt;
 
-	rt = fib4_lookup_rt(dd->fibnum, addr, 0, NHR_UNLOCKED, &rnd);
+	rt = rt_get_inet_parent(dd->fibnum, addr, plen);
 	if (rt != NULL) {
 		struct in_addr addr4;
 		uint32_t scopeid;
-		int inet_plen;
-		rt_get_inet_prefix_plen(rt, &addr4, &inet_plen, &scopeid);
-		if (inet_plen > 0) {
-			*plen = inet_plen;
-			*nhop_idx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop);
+		int parent_plen;
+
+		rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
+		if (parent_plen > 0) {
+			*pplen = parent_plen;
+			*nhop_idx = fib_get_nhop_idx(dd->fd, rt_get_raw_nhop(rt));
 			return;
 		}
 	}
 
 	*nhop_idx = 0;
-	*plen = 0;
+	*pplen = 0;
 }
 
 static enum flm_op_result
@@ -181,20 +182,23 @@ handle_gu_change(struct dpdk_lpm_data *dd, const struct rib_cmd_info *rc,
 		}
 
 		ret = rte_lpm_add(dd->lpm, ip, plen, nhidx);
-		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d",
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u -> %u ret: %d",
 		    (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE",
-		    abuf, plen, nhidx, ret);
+		    abuf, plen,
+		    rc->rc_nh_old != NULL ? fib_get_nhop_idx(dd->fd, rc->rc_nh_old) : 0,
+		    nhidx, ret);
 	} else {
 		/*
 		 * Need to lookup parent. Assume deletion happened already
 		 */
 		uint8_t parent_plen;
 		uint32_t parent_nhop_idx;
-		get_parent_rule(dd, addr, &parent_plen, &parent_nhop_idx);
+		get_parent_rule(dd, addr, plen, &parent_plen, &parent_nhop_idx);
 
 		ret = rte_lpm_delete(dd->lpm, ip, plen, parent_plen, parent_nhop_idx);
-		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK: %s %s/%d nhop %u = %d",
-		    "DEL", abuf, plen, nhidx, ret);
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK: %s %s/%d -> /%d nhop %u -> %u ret: %d",
+		    "DEL", abuf, plen, parent_plen, fib_get_nhop_idx(dd->fd, rc->rc_nh_old),
+		    parent_nhop_idx, ret);
 	}
 
 	if (ret != 0) {
diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c
index 17d35c16346d..ec1a3228d42b 100644
--- a/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c
+++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c
@@ -165,30 +165,26 @@ handle_ll_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc,
 }
 
 static struct rte_lpm6_rule *
-pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6,
-    char *buffer)
+pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6, int plen,
+    int *pplen, uint32_t *pnhop_idx, char *buffer)
 {
 	struct rte_lpm6_rule *lsp_rule = NULL;
-	struct route_nhop_data rnd;
 	struct rtentry *rt;
-	int plen;
 
-	rt = fib6_lookup_rt(dd->fibnum, addr6, 0, NHR_UNLOCKED, &rnd);
+	*pnhop_idx = 0;
+	*pplen = 0;
+
+	rt = rt_get_inet6_parent(dd->fibnum, addr6, plen);
 	/* plen = 0 means default route and it's out of scope */
 	if (rt != NULL) {
-		uint32_t scopeid;
+		uint32_t nhop_idx, scopeid;
 		struct in6_addr new_addr6;
 		rt_get_inet6_prefix_plen(rt, &new_addr6, &plen, &scopeid);
 		if (plen > 0) {
-			uint32_t nhidx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop);
-			if (nhidx == 0) {
-				/*
-				 * shouldn't happen as we already have parent route.
-				 * It will trigger rebuild automatically.
-				 */
-				return (NULL);
-			}
-			lsp_rule = fill_rule6(buffer, (uint8_t *)&new_addr6, plen, nhidx);
+			nhop_idx = fib_get_nhop_idx(dd->fd, rt_get_raw_nhop(rt));
+			lsp_rule = fill_rule6(buffer, (uint8_t *)&new_addr6, plen, nhop_idx);
+			*pnhop_idx = nhop_idx;
+			*pplen = plen;
 		}
 	}
 
@@ -217,20 +213,26 @@ handle_gu_change(struct dpdk_lpm6_data *dd, const struct rib_cmd_info *rc,
 
 		ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)addr6,
 				   plen, nhidx, (rc->rc_cmd == RTM_ADD) ? 1 : 0);
-		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d",
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u -> %u ret: %d",
 		    (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE",
-		    abuf, plen, nhidx, ret);
+		    abuf, plen,
+		    rc->rc_nh_old != NULL ? fib_get_nhop_idx(dd->fd, rc->rc_nh_old) : 0,
+		    nhidx, ret);
 	} else {
 		/*
 		 * Need to lookup parent. Assume deletion happened already
 		 */
 		char buffer[RTE_LPM6_RULE_SIZE];
 		struct rte_lpm6_rule *lsp_rule = NULL;
-		lsp_rule = pack_parent_rule(dd, addr6, buffer);
+		int parent_plen;
+		uint32_t parent_nhop_idx;
+		lsp_rule = pack_parent_rule(dd, addr6, plen, &parent_plen,
+		    &parent_nhop_idx, buffer);
 
 		ret = rte_lpm6_delete(dd->lpm6, (const uint8_t *)addr6, plen, lsp_rule);
-		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop ? = %d",
-		    "DEL", abuf, plen, ret);
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d -> /%d nhop %u -> %u ret: %d",
+		    "DEL", abuf, plen, parent_plen, fib_get_nhop_idx(dd->fd, rc->rc_nh_old),
+		    parent_nhop_idx, ret);
 	}
 
 	if (ret != 0) {
diff --git a/sys/net/radix.c b/sys/net/radix.c
index 931bf6db871b..2169361c7229 100644
--- a/sys/net/radix.c
+++ b/sys/net/radix.c
@@ -371,6 +371,20 @@ on1:
 	return (0);
 }
 
+/*
+ * Returns the next (wider) prefix for the key defined by @rn
+ *  if exists.
+ */
+struct radix_node *
+rn_nextprefix(struct radix_node *rn)
+{
+	for (rn = rn->rn_dupedkey; rn != NULL; rn = rn->rn_dupedkey) {
+		if (!(rn->rn_flags & RNF_ROOT))
+			return (rn);
+	}
+	return (NULL);
+}
+
 #ifdef RN_DEBUG
 int	rn_nodenum;
 struct	radix_node *rn_clist;
diff --git a/sys/net/radix.h b/sys/net/radix.h
index a0e5e5c5aa3f..97555ee9e16d 100644
--- a/sys/net/radix.h
+++ b/sys/net/radix.h
@@ -119,6 +119,7 @@ typedef int rn_walktree_t(struct radix_head *head, walktree_f_t *f,
 typedef int rn_walktree_from_t(struct radix_head *head,
     void *a, void *m, walktree_f_t *f, void *w);
 typedef void rn_close_t(struct radix_node *rn, struct radix_head *head);
+struct radix_node *rn_nextprefix(struct radix_node *rn);
 
 struct radix_mask_head;
 
diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h
index 229c762d4a73..a670979df8c9 100644
--- a/sys/net/route/route_ctl.h
+++ b/sys/net/route/route_ctl.h
@@ -117,6 +117,7 @@ void rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr,
     int *plen, uint32_t *pscopeid);
 void rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr,
     struct in_addr *pmask, uint32_t *pscopeid);
+struct rtentry *rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen);
 #endif
 #ifdef INET6
 struct in6_addr;
@@ -124,6 +125,8 @@ void rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr,
     int *plen, uint32_t *pscopeid);
 void rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr,
     struct in6_addr *pmask, uint32_t *pscopeid);
+struct rtentry *rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr,
+    int plen);
 #endif
 
 /* Nexthops */
diff --git a/sys/net/route/route_helpers.c b/sys/net/route/route_helpers.c
index 5d29197cc4fb..569e13a308eb 100644
--- a/sys/net/route/route_helpers.c
+++ b/sys/net/route/route_helpers.c
@@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$");
 #endif
 #ifdef INET6
 #include <netinet6/in6_fib.h>
+#include <netinet6/in6_var.h>
 #endif
 #include <net/vnet.h>
 
@@ -415,3 +416,152 @@ rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
 	}
 }
 #endif
+
+#ifdef INET
+/*
+ * Checks if the found key in the trie contains (<=) a prefix covering
+ *  @paddr/@plen.
+ * Returns the most specific rtentry matching the condition or NULL.
+ */
+static struct rtentry *
+get_inet_parent_prefix(uint32_t fibnum, struct in_addr addr, int plen)
+{
+	struct route_nhop_data rnd;
+	struct rtentry *rt;
+	struct in_addr addr4;
+	uint32_t scopeid;
+	int parent_plen;
+	struct radix_node *rn;
+
+	rt = fib4_lookup_rt(fibnum, addr, 0, NHR_UNLOCKED, &rnd);
+	rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
+	if (parent_plen <= plen)
+		return (rt);
+
+	/*
+	 * There can be multiple prefixes associated with the found key:
+	 * 10.0.0.0 -> 10.0.0.0/24, 10.0.0.0/23, 10.0.0.0/22, etc.
+	 * All such prefixes are linked via rn_dupedkey, from most specific
+	 *  to least specific. Iterate over them to check if any of these
+	 *  prefixes are wider than desired plen.
+	 */
+	rn = (struct radix_node *)rt;
+	while ((rn = rn_nextprefix(rn)) != NULL) {
+		rt = RNTORT(rn);
+		rt_get_inet_prefix_plen(rt, &addr4, &parent_plen, &scopeid);
+		if (parent_plen <= plen)
+			return (rt);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Returns the most specific prefix containing (>) @paddr/plen.
+ */
+struct rtentry *
+rt_get_inet_parent(uint32_t fibnum, struct in_addr addr, int plen)
+{
+	struct in_addr lookup_addr = { .s_addr = INADDR_BROADCAST };
+	struct in_addr addr4 = addr;
+	struct in_addr mask4;
+	struct rtentry *rt;
+
+	while (plen-- > 0) {
+		/* Calculate wider mask & new key to lookup */
+		mask4.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
+		addr4.s_addr = htonl(ntohl(addr4.s_addr) & ntohl(mask4.s_addr));
+		if (addr4.s_addr == lookup_addr.s_addr) {
+			/* Skip lookup if the key is the same */
+			continue;
+		}
+		lookup_addr = addr4;
+
+		rt = get_inet_parent_prefix(fibnum, lookup_addr, plen);
+		if (rt != NULL)
+			return (rt);
+	}
+
+	return (NULL);
+}
+#endif
+
+#ifdef INET6
+/*
+ * Checks if the found key in the trie contains (<=) a prefix covering
+ *  @paddr/@plen.
+ * Returns the most specific rtentry matching the condition or NULL.
+ */
+static struct rtentry *
+get_inet6_parent_prefix(uint32_t fibnum, const struct in6_addr *paddr, int plen)
+{
+	struct route_nhop_data rnd;
+	struct rtentry *rt;
+	struct in6_addr addr6;
+	uint32_t scopeid;
+	int parent_plen;
+	struct radix_node *rn;
+
+	rt = fib6_lookup_rt(fibnum, paddr, 0, NHR_UNLOCKED, &rnd);
+	rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid);
+	if (parent_plen <= plen)
+		return (rt);
+
+	/*
+	 * There can be multiple prefixes associated with the found key:
+	 * 2001:db8:1::/64 -> 2001:db8:1::/56, 2001:db8:1::/48, etc.
+	 * All such prefixes are linked via rn_dupedkey, from most specific
+	 *  to least specific. Iterate over them to check if any of these
+	 *  prefixes are wider than desired plen.
+	 */
+	rn = (struct radix_node *)rt;
+	while ((rn = rn_nextprefix(rn)) != NULL) {
+		rt = RNTORT(rn);
+		rt_get_inet6_prefix_plen(rt, &addr6, &parent_plen, &scopeid);
+		if (parent_plen <= plen)
+			return (rt);
+	}
+
+	return (NULL);
+}
+
+static void
+ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
+{
+	uint32_t *cp;
+
+	for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32)
+		*cp++ = 0xFFFFFFFF;
+	if (mask > 0)
+		*cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0);
+}
+
+/*
+ * Returns the most specific prefix containing (>) @paddr/plen.
+ */
+struct rtentry *
+rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr, int plen)
+{
+	struct in6_addr lookup_addr = in6mask128;
+	struct in6_addr addr6 = *paddr;
+	struct in6_addr mask6;
+	struct rtentry *rt;
+
+	while (plen-- > 0) {
+		/* Calculate wider mask & new key to lookup */
+		ipv6_writemask(&mask6, plen);
+		IN6_MASK_ADDR(&addr6, &mask6);
+		if (IN6_ARE_ADDR_EQUAL(&addr6, &lookup_addr)) {
+			/* Skip lookup if the key is the same */
+			continue;
+		}
+		lookup_addr = addr6;
+
+		rt = get_inet6_parent_prefix(fibnum, &lookup_addr, plen);
+		if (rt != NULL)
+			return (rt);
+	}
+
+	return (NULL);
+}
+#endif


More information about the dev-commits-src-all mailing list