git: 537d13437314 - main - Bring DPDK route lookups to FreeBSD.

Alexander V. Chernikov melifaro at FreeBSD.org
Sat Jan 9 12:51:37 UTC 2021


The branch main has been updated by melifaro:

URL: https://cgit.FreeBSD.org/src/commit/?id=537d134373141c2d25bfb24af6d661d0e6102927

commit 537d134373141c2d25bfb24af6d661d0e6102927
Author:     Alexander V. Chernikov <melifaro at FreeBSD.org>
AuthorDate: 2021-01-09 12:08:00 +0000
Commit:     Alexander V. Chernikov <melifaro at FreeBSD.org>
CommitDate: 2021-01-09 12:41:04 +0000

    Bring DPDK route lookups to FreeBSD.
    
    This change introduces loadable fib lookup modules based on
     DPDK rte_lpm lib targeted for high-speed lookups in large-scale tables.
    It is based on the lookup framework described in D27401.
    
    The IPv4 module is called dpdk_lpm4. It wraps the rte_lpm [1] library,
    which implements a variation of the DIR-24-8 [2] lookup algorithm.
    The module provides lockless route lookups and in-place incremental updates,
     allowing for good RIB performance.
    
    The IPv6 module is called dpdk_lpm6. It wraps the rte_lpm6 [3] library.
    The implementation can be seen as a multi-bit trie where the stride, or number
     of bits inspected on each level, varies from level to level.
    A lookup can take from 1 to 14 memory accesses, with 5 being the average value
     for the prefix lengths that are most commonly used in IPv6.
    The module provides lockless route lookups for global unicast addresses
     and in-place incremental updates, allowing for good RIB performance.
    
    Implementation details:
    * wrapper code lives in `sys/contrib/dpdk_rte_lpm/dpdk_lpm[6].c`.
    * the rte_lpm[6] implementation contains both RIB and FIB code.
     . The RIB ("rule_") code, which is backed by an array of hash tables, has been
     commented out, as the base radix already provides all the necessary primitives.
    * link-local lookups are currently implemented as base radix lookup.
     This part should be converted to something like read-only radix trie.
    
    Usage detail:
    Compile the kernel with option FIB_ALGO and load the dpdk_lpm4/dpdk_lpm6
     modules at any time. They will be picked up automatically when
     the number of routes rises to several thousand.
    
    [1]: https://doc.dpdk.org/guides/prog_guide/lpm_lib.html
    [2]: http://yuba.stanford.edu/~nickm/papers/Infocom98_lookup.pdf
    [3]: https://doc.dpdk.org/guides/prog_guide/lpm6_lib.html
    
    Differential Revision: https://reviews.freebsd.org/D27412
---
 sys/contrib/dpdk_rte_lpm/dpdk_lpm.c              |  423 +++++++
 sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c             |  487 ++++++++
 sys/contrib/dpdk_rte_lpm/dpdk_lpm6.h             |   57 +
 sys/contrib/dpdk_rte_lpm/rte_branch_prediction.h |   41 +
 sys/contrib/dpdk_rte_lpm/rte_common.h            |  838 +++++++++++++
 sys/contrib/dpdk_rte_lpm/rte_debug.h             |   83 ++
 sys/contrib/dpdk_rte_lpm/rte_jhash.h             |  379 ++++++
 sys/contrib/dpdk_rte_lpm/rte_log.h               |  383 ++++++
 sys/contrib/dpdk_rte_lpm/rte_lpm.c               | 1107 +++++++++++++++++
 sys/contrib/dpdk_rte_lpm/rte_lpm.h               |  403 ++++++
 sys/contrib/dpdk_rte_lpm/rte_lpm6.c              | 1415 ++++++++++++++++++++++
 sys/contrib/dpdk_rte_lpm/rte_lpm6.h              |  209 ++++
 sys/contrib/dpdk_rte_lpm/rte_shim.h              |   31 +
 sys/contrib/dpdk_rte_lpm/rte_tailq.h             |  140 +++
 sys/modules/Makefile                             |   10 +
 sys/modules/dpdk_lpm4/Makefile                   |   12 +
 sys/modules/dpdk_lpm6/Makefile                   |   12 +
 17 files changed, 6030 insertions(+)

diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c b/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
new file mode 100644
index 000000000000..af145997c4d6
--- /dev/null
+++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm.c
@@ -0,0 +1,423 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/in_fib.h>
+#include <netinet/ip.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/fib_algo.h>
+
+#include "rte_shim.h"
+#include "rte_lpm.h"
+
+/*
+ * Sizing limits for number_tbl8s.  The upper bound is parenthesized so the
+ * macro expands as a single operand inside any larger expression.
+ */
+#define	LPM_MIN_TBL8	8		/* 2 pages of memory */
+#define	LPM_MAX_TBL8	(65536 * 16)	/* 256M */
+
+MALLOC_DECLARE(M_RTABLE);
+
+/*
+ * Per-instance state of the IPv4 DPDK LPM lookup algorithm.
+ * Allocated in build_table() and released in destroy_table().
+ */
+struct dpdk_lpm_data {
+	/* Lookup table created by rte_lpm_create() */
+	struct rte_lpm *lpm;
+	/* Number of prefixes add_route_cb() inserted successfully */
+	uint64_t routes_added;
+	/* Number of inserts that failed with an error other than -ENOSPC */
+	uint64_t routes_failed;
+	/* tbl8 count passed to rte_lpm_create() via rte_lpm_config */
+	uint32_t number_tbl8s;
+	/* Routing table (fib) number this instance serves */
+	uint32_t fibnum;
+	/* Set when rte_lpm_add() returned -ENOSPC during the RIB dump */
+	uint8_t hit_tables;
+	/* NOTE(review): not referenced by the code in this file - confirm use */
+	uint8_t	hit_records;
+	/* Framework handle used for nexthop index allocation and logging */
+	struct fib_data *fd;
+};
+
+/*
+ * Main datapath routing
+ *
+ * Looks up the IPv4 destination from 'key' in the rte_lpm table passed as
+ * 'algo_data' and returns the nexthop stored at the resulting index of the
+ * nh_idx array.  When the table has no match, the slot selected by
+ * default_idx is returned instead; default_idx is 0 when no default route
+ * is installed (presumably slot 0 holds NULL - verify against
+ * fib_get_nhop_array()).
+ *
+ * 'scopeid' is not used for IPv4.
+ */
+static struct nhop_object *
+lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+	struct rte_lpm *lpm;
+	const struct rte_lpm_external *rte_ext;
+	uint32_t nhidx = 0;
+
+	lpm = (struct rte_lpm *)algo_data;
+	rte_ext = (const struct rte_lpm_external *)lpm;
+
+	if (rte_lpm_lookup(lpm, ntohl(key.addr4.s_addr), &nhidx) != 0) {
+		/* Not found. Fall back to the default route slot */
+		nhidx = rte_ext->default_idx;
+	}
+
+	return (rte_ext->nh_idx[nhidx]);
+}
+
+/*
+ * Returns the preference of this algorithm for a routing table described
+ * by 'rinfo'.  The value grows with the number of prefixes: 1 below 10
+ * prefixes, num/10 below 1000, 100 + num/3334 below 500000, 250 otherwise.
+ */
+static uint8_t
+rte_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+	if (rinfo->num_prefixes >= 500000)
+		return (250);
+	if (rinfo->num_prefixes >= 1000)
+		return (100 + rinfo->num_prefixes / 3334);
+	if (rinfo->num_prefixes >= 10)
+		return (rinfo->num_prefixes / 10);
+	return (1);
+}
+
+/*
+ * Applies a default route change to the table header.
+ * A deletion resets default_idx to 0; any other command stores the index
+ * obtained for the new nexthop.  Returns FLM_REBUILD when no index could be
+ * obtained, FLM_SUCCESS otherwise.
+ */
+static enum flm_op_result
+handle_default_change(struct dpdk_lpm_data *dd, struct rib_cmd_info *rc)
+{
+	struct rte_lpm_external *ext = (struct rte_lpm_external *)dd->lpm;
+	uint32_t idx;
+
+	if (rc->rc_cmd == RTM_DELETE) {
+		/* No default route */
+		ext->default_idx = 0;
+		return (FLM_SUCCESS);
+	}
+
+	/* Reference new */
+	idx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new);
+	if (idx == 0)
+		return (FLM_REBUILD);
+	ext->default_idx = idx;
+
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Finds the route currently covering 'addr' in the RIB and reports its
+ * prefix length and nexthop index through 'plen' and 'nhop_idx'.
+ * Both outputs are set to 0 when no route is found or when the match has
+ * a prefix length of 0 (or less).
+ */
+static void
+get_parent_rule(struct dpdk_lpm_data *dd, struct in_addr addr, uint8_t *plen, uint32_t *nhop_idx)
+{
+	struct route_nhop_data rnd;
+	struct rtentry *rt;
+	struct in_addr pfx;
+	uint32_t scopeid;
+	int pfx_len;
+
+	rt = fib4_lookup_rt(dd->fibnum, addr, 0, NHR_UNLOCKED, &rnd);
+	if (rt == NULL)
+		goto no_parent;
+
+	rt_get_inet_prefix_plen(rt, &pfx, &pfx_len, &scopeid);
+	if (pfx_len <= 0)
+		goto no_parent;
+
+	*plen = pfx_len;
+	*nhop_idx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop);
+	return;
+
+no_parent:
+	*nhop_idx = 0;
+	*plen = 0;
+}
+
+/*
+ * Applies a change of a non-default IPv4 prefix to the rte_lpm table.
+ *
+ * For commands other than RTM_DELETE the prefix is (re)inserted with the
+ * index of the new nexthop.  For RTM_DELETE the covering route is looked up
+ * first and handed to rte_lpm_delete() together with the deleted prefix.
+ *
+ * Returns FLM_SUCCESS when the rte_lpm call returned 0, FLM_REBUILD when no
+ * nexthop index could be obtained or the call returned -ENOSPC, and
+ * FLM_ERROR for any other failure.
+ */
+static enum flm_op_result
+handle_gu_change(struct dpdk_lpm_data *dd, const struct rib_cmd_info *rc,
+    const struct in_addr addr, int plen)
+{
+	uint32_t nhidx = 0;
+	int ret;
+	char abuf[INET_ADDRSTRLEN];
+	uint32_t ip;
+
+	/* rte_lpm calls take the address in host byte order */
+	ip = ntohl(addr.s_addr);
+	/* Textual form is only used by the debug messages below */
+	inet_ntop(AF_INET, &addr, abuf, sizeof(abuf));
+
+	/* So we get sin, plen and nhidx */
+	if (rc->rc_cmd != RTM_DELETE) {
+		/*
+		 * Addition or change. Save nhop in the internal table
+		 * and get index.
+		 */
+		nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new);
+		if (nhidx == 0) {
+			FIB_PRINTF(LOG_INFO, dd->fd, "nhop limit reached, need rebuild");
+			return (FLM_REBUILD);
+		}
+
+		ret = rte_lpm_add(dd->lpm, ip, plen, nhidx);
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d",
+		    (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE",
+		    abuf, plen, nhidx, ret);
+	} else {
+		/*
+		 * Need to lookup parent. Assume deletion happened already
+		 */
+		uint8_t parent_plen;
+		uint32_t parent_nhop_idx;
+		get_parent_rule(dd, addr, &parent_plen, &parent_nhop_idx);
+
+		ret = rte_lpm_delete(dd->lpm, ip, plen, parent_plen, parent_nhop_idx);
+		/* nhidx is still its initial 0 here, so the message prints "nhop 0" */
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK: %s %s/%d nhop %u = %d",
+		    "DEL", abuf, plen, nhidx, ret);
+	}
+
+	if (ret != 0) {
+		FIB_PRINTF(LOG_INFO, dd->fd, "error: %d", ret);
+		/* Out of table space: ask the framework for a rebuild */
+		if (ret == -ENOSPC)
+			return (FLM_REBUILD);
+		return (FLM_ERROR);
+	}
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Routing table change callback (flm_change_rib_item_cb).
+ * Dispatches on the prefix length of the changed route: length 0 goes to
+ * handle_default_change(), everything else to handle_gu_change().
+ * The handler result is returned as is; a non-zero result is also logged.
+ */
+static enum flm_op_result
+handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+    void *_data)
+{
+	struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data;
+	enum flm_op_result res;
+	struct in_addr pfx;
+	uint32_t scopeid;
+	int pfx_len;
+
+	rt_get_inet_prefix_plen(rc->rc_rt, &pfx, &pfx_len, &scopeid);
+
+	if (pfx_len == 0)
+		res = handle_default_change(dd, rc);
+	else
+		res = handle_gu_change(dd, rc, pfx, pfx_len);
+
+	if (res != 0)
+		FIB_PRINTF(LOG_INFO, dd->fd, "error handling route");
+	return (res);
+}
+
+/*
+ * Instance destructor (flm_destroy_cb): frees the rte_lpm table, if one
+ * was created, and then the instance structure itself.
+ */
+static void
+destroy_table(void *_data)
+{
+	struct dpdk_lpm_data *inst;
+	struct rte_lpm *tbl;
+
+	inst = (struct dpdk_lpm_data *)_data;
+	tbl = inst->lpm;
+	if (tbl != NULL)
+		rte_lpm_free(tbl);
+	free(inst, M_RTABLE);
+}
+
+/*
+ * RIB dump callback (flm_dump_rib_item_cb): inserts one route into the
+ * table being built.
+ *
+ * A route with prefix length 0 is recorded as the default route through
+ * handle_default_change().  Any other route is added with rte_lpm_add().
+ *
+ * Returns FLM_SUCCESS on success, FLM_REBUILD when no nexthop index could
+ * be obtained or the table ran out of space (-ENOSPC, also recorded in
+ * hit_tables), and FLM_ERROR on any other rte_lpm_add() failure.
+ */
+static enum flm_op_result
+add_route_cb(struct rtentry *rt, void *_data)
+{
+	struct dpdk_lpm_data *dd = (struct dpdk_lpm_data *)_data;
+	struct nhop_object *nh;
+	int plen, ret;
+	struct in_addr addr4;
+	uint32_t scopeid;
+
+	nh = rt_get_raw_nhop(rt);
+	rt_get_inet_prefix_plen(rt, &addr4, &plen, &scopeid);
+
+	/* Textual prefix is only used by the debug messages below */
+	char abuf[INET_ADDRSTRLEN];
+	inet_ntop(AF_INET, &addr4, abuf, sizeof(abuf));
+
+	FIB_PRINTF(LOG_DEBUG, dd->fd, "Operating on %s/%d", abuf, plen);
+
+	if (plen == 0) {
+		/* Build a minimal command so the default route handler can be reused */
+		struct rib_cmd_info rc = {
+			.rc_cmd = RTM_ADD,
+			.rc_nh_new = nh,
+		};
+
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "Adding default route");
+		return (handle_default_change(dd, &rc));
+	}
+
+	uint32_t nhidx = fib_get_nhop_idx(dd->fd, nh);
+	if (nhidx == 0) {
+		FIB_PRINTF(LOG_INFO, dd->fd, "unable to get nhop index");
+		return (FLM_REBUILD);
+	}
+	ret = rte_lpm_add(dd->lpm, ntohl(addr4.s_addr), plen, nhidx);
+	FIB_PRINTF(LOG_DEBUG, dd->fd, "ADD %p %s/%d nh %u = %d",
+	    dd->lpm, abuf, plen, nhidx, ret);
+
+	if (ret != 0) {
+		FIB_PRINTF(LOG_INFO, dd->fd, "rte_lpm_add() returned %d", ret);
+		if (ret == -ENOSPC) {
+			/* Remembered so estimate_scale() doubles the tbl8 count */
+			dd->hit_tables = 1;
+			return (FLM_REBUILD);
+		}
+		dd->routes_failed++;
+		return (FLM_ERROR);
+	} else
+		dd->routes_added++;
+
+	return (FLM_SUCCESS);
+}
+
+/*
+ * End-of-dump callback (flm_dump_end_cb).
+ *
+ * Logs the dump statistics and returns FLM_REBUILD if the table ran out of
+ * space or any route failed to insert.  Otherwise publishes the lookup
+ * function and its argument through 'dp' and returns FLM_SUCCESS.
+ */
+static enum flm_op_result
+check_dump_success(void *_data, struct fib_dp *dp)
+{
+	struct dpdk_lpm_data *dd;
+
+	dd = (struct dpdk_lpm_data *)_data;
+
+	/*
+	 * The counters are uint64_t: print them as uintmax_t with %ju.
+	 * %zu expects size_t, which is narrower on 32-bit platforms.
+	 */
+	FIB_PRINTF(LOG_INFO, dd->fd, "scan completed. added: %ju failed: %ju",
+	    (uintmax_t)dd->routes_added, (uintmax_t)dd->routes_failed);
+	if (dd->hit_tables || dd->routes_failed > 0)
+		return (FLM_REBUILD);
+
+	FIB_PRINTF(LOG_INFO, dd->fd,
+	    "DPDK lookup engine synced with IPv4 RIB id %u, %ju routes",
+	    dd->fibnum, (uintmax_t)dd->routes_added);
+
+	dp->f = lookup_ptr;
+	dp->arg = dd->lpm;
+
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Chooses the tbl8 count for a new instance 'dd' from the previous one
+ * 'dd_src': the count is doubled if the previous instance ran out of
+ * tables, and carried over unchanged otherwise.
+ */
+static void
+estimate_scale(const struct dpdk_lpm_data *dd_src, struct dpdk_lpm_data *dd)
+{
+	uint32_t tbl8s = dd_src->number_tbl8s;
+
+	/* XXX: update at 75% capacity */
+	if (dd_src->hit_tables)
+		tbl8s *= 2;
+	dd->number_tbl8s = tbl8s;
+
+	/* TODO: look into the appropriate RIB to adjust */
+}
+
+/*
+ * Allocates and initializes a new algorithm instance.
+ *
+ * 'dd_prev' supplies the fib number and the sizing hints consumed by
+ * estimate_scale(); 'fd' is the framework handle stored in the instance.
+ * Returns the new instance, or NULL if either the instance structure or
+ * the rte_lpm table could not be allocated.
+ */
+static struct dpdk_lpm_data *
+build_table(struct dpdk_lpm_data *dd_prev, struct fib_data *fd)
+{
+	struct dpdk_lpm_data *dd;
+	struct rte_lpm *lpm;
+
+	/* Non-sleeping, zeroed allocation */
+	dd = malloc(sizeof(struct dpdk_lpm_data), M_RTABLE, M_NOWAIT | M_ZERO);
+	if (dd == NULL) {
+		FIB_PRINTF(LOG_INFO, fd, "Unable to allocate base datastructure");
+		return (NULL);
+	}
+	dd->fibnum = dd_prev->fibnum;
+	dd->fd = fd;
+
+	estimate_scale(dd_prev, dd);
+
+	struct rte_lpm_config cfg = {.number_tbl8s = dd->number_tbl8s};
+	lpm = rte_lpm_create("test", 0, &cfg);
+	if (lpm == NULL) {
+		FIB_PRINTF(LOG_INFO, fd, "unable to create lpm");
+		free(dd, M_RTABLE);
+		return (NULL);
+	}
+	dd->lpm = lpm;
+	/* The table is also accessed through its rte_lpm_external view */
+	struct rte_lpm_external *ext = (struct rte_lpm_external *)lpm;
+	/* Nexthop array that lookup_ptr() indexes on the datapath */
+	ext->nh_idx = fib_get_nhop_array(dd->fd);
+
+	FIB_PRINTF(LOG_INFO, fd, "allocated %u tbl8s", dd->number_tbl8s);
+
+	return (dd);
+}
+
+/*
+ * Instance constructor (flm_init_cb).
+ *
+ * Builds a new instance for 'fibnum', sized either from the previous
+ * instance in '_old_data' or, when there is none, from a temporary
+ * template using LPM_MIN_TBL8.  On success the instance is stored in
+ * '*data' and FLM_SUCCESS is returned; FLM_REBUILD is returned when the
+ * instance could not be created.
+ */
+static enum flm_op_result
+init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data)
+{
+	struct dpdk_lpm_data seed, *prev, *fresh;
+
+	if (_old_data != NULL) {
+		FIB_PRINTF(LOG_DEBUG, fd, "Starting with old data");
+		prev = (struct dpdk_lpm_data *)_old_data;
+	} else {
+		bzero(&seed, sizeof(struct dpdk_lpm_data));
+		seed.fibnum = fibnum;
+		/* TODO: get rib statistics */
+		seed.number_tbl8s = LPM_MIN_TBL8;
+		prev = &seed;
+	}
+
+	/* Guaranteed to be in epoch */
+	fresh = build_table(prev, fd);
+	if (fresh == NULL) {
+		FIB_PRINTF(LOG_NOTICE, fd, "table creation failed");
+		return (FLM_REBUILD);
+	}
+
+	*data = fresh;
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Callback table describing the IPv4 algorithm; registered and
+ * unregistered with the fib lookup framework in lpm4_modevent().
+ */
+static struct fib_lookup_module dpdk_lpm4 = {
+	.flm_name = "dpdk_lpm4",
+	.flm_family = AF_INET,
+	.flm_init_cb = init_table,
+	.flm_destroy_cb = destroy_table,
+	.flm_dump_rib_item_cb = add_route_cb,
+	.flm_dump_end_cb = check_dump_success,
+	.flm_change_rib_item_cb = handle_rtable_change_cb,
+	.flm_get_pref = rte_get_pref,
+};
+
+/*
+ * Module event handler: registers the algorithm on load and unregisters
+ * it on unload, returning the unregister result.  Any other event yields
+ * EOPNOTSUPP.
+ */
+static int
+lpm4_modevent(module_t mod, int type, void *unused)
+{
+
+	if (type == MOD_LOAD) {
+		fib_module_register(&dpdk_lpm4);
+		return (0);
+	}
+	if (type == MOD_UNLOAD)
+		return (fib_module_unregister(&dpdk_lpm4));
+
+	return (EOPNOTSUPP);
+}
+
+/* Module descriptor naming the module and its event handler. */
+static moduledata_t lpm4mod = {
+        "dpdk_lpm4",
+        lpm4_modevent,
+        0
+};
+
+/* Declare the kernel module and its version. */
+DECLARE_MODULE(lpm4mod, lpm4mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(lpm4mod, 1);
diff --git a/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c
new file mode 100644
index 000000000000..250e3e1bde4a
--- /dev/null
+++ b/sys/contrib/dpdk_rte_lpm/dpdk_lpm6.c
@@ -0,0 +1,487 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <net/vnet.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/in6_fib.h>
+
+#include <net/route.h>
+#include <net/route/nhop.h>
+#include <net/route/route_ctl.h>
+#include <net/route/fib_algo.h>
+#define	RTDEBUG
+
+#include "rte_lpm6.h"
+
+/*
+ * Sizing limits for number_tbl8s.  The upper bound is parenthesized so the
+ * macro expands as a single operand inside any larger expression.
+ */
+#define	LPM6_MIN_TBL8	8		/* 2 pages of memory */
+#define	LPM6_MAX_TBL8	(65536 * 16)	/* 256M */
+
+/*
+ * Pair of untyped pointers: a lookup routine and its argument.
+ * NOTE(review): no user of this structure is visible in this file -
+ * confirm whether it is still needed.
+ */
+struct fib_algo_calldata {
+	void *lookup;
+	void *arg;
+};
+
+/*
+ * Per-instance state of the IPv6 DPDK LPM lookup algorithm.
+ * Allocated in build_table() and released in destroy_dd().
+ */
+struct dpdk_lpm6_data {
+	/* Lookup table created by rte_lpm6_create() */
+	struct rte_lpm6 *lpm6;
+	/* Number of prefixes add_route_cb() inserted successfully */
+	uint64_t routes_added;
+	/* Number of inserts that failed with an error other than -ENOSPC */
+	uint64_t routes_failed;
+	/* tbl8 count passed to rte_lpm6_create() via rte_lpm6_config */
+	uint32_t number_tbl8s;
+	/* Routing table (fib) number this instance serves */
+	uint32_t fibnum;
+	/* Set when rte_lpm6_add() returned -ENOSPC during the RIB dump */
+	uint8_t hit_tables;
+	/* Framework handle used for nexthop index allocation and logging */
+	struct fib_data *fd;
+};
+
+/*
+ * Link-local lookup: delegates to the radix lookup for the fib number
+ * stored in the table's external header.
+ */
+static struct nhop_object *
+lookup_ptr_ll(const struct rte_lpm6 *lpm6, const struct in6_addr *dst6,
+    uint32_t scopeid)
+{
+	const struct rte_lpm6_external *ext =
+	    (const struct rte_lpm6_external *)lpm6;
+
+	return (fib6_radix_lookup_nh(ext->fibnum, dst6, scopeid));
+}
+
+/*
+ * Main datapath routing
+ *
+ * Link-local destinations are handed to lookup_ptr_ll().  Everything else
+ * is looked up in the rte_lpm6 table passed as 'algo_data'; on a miss the
+ * default route nexthop is returned if one is installed, NULL otherwise.
+ */
+static struct nhop_object *
+lookup_ptr(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
+{
+	const struct rte_lpm6 *lpm6 = (const struct rte_lpm6 *)algo_data;
+	const struct rte_lpm6_external *ext =
+	    (const struct rte_lpm6_external *)lpm6;
+	const struct in6_addr *dst = key.addr6;
+	uint32_t idx = 0;
+
+	/* LL */
+	if (IN6_IS_SCOPE_LINKLOCAL(dst))
+		return (lookup_ptr_ll(lpm6, dst, scopeid));
+
+	if (rte_lpm6_lookup(lpm6, (const uint8_t *)dst, &idx) == 0) {
+		/* Success! */
+		return (ext->nh_idx[idx]);
+	}
+
+	/* Not found. Check default route */
+	if (ext->default_idx > 0)
+		return (ext->nh_idx[ext->default_idx]);
+	return (NULL);
+}
+
+/*
+ * Returns the preference of this algorithm for a routing table described
+ * by 'rinfo'.  The value grows with the number of prefixes: 1 below 10
+ * prefixes, num/10 below 1000, 100 + num/3334 below 500000, 250 otherwise.
+ */
+static uint8_t
+rte6_get_pref(const struct rib_rtable_info *rinfo)
+{
+
+	if (rinfo->num_prefixes >= 500000)
+		return (250);
+	if (rinfo->num_prefixes >= 1000)
+		return (100 + rinfo->num_prefixes / 3334);
+	if (rinfo->num_prefixes >= 10)
+		return (rinfo->num_prefixes / 10);
+	return (1);
+}
+
+/*
+ * Applies a default route change to the table header.
+ * A deletion resets default_idx to 0; any other command stores the index
+ * obtained for the new nexthop.  Returns FLM_REBUILD when no index could be
+ * obtained, FLM_SUCCESS otherwise.
+ */
+static enum flm_op_result
+handle_default_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc)
+{
+	struct rte_lpm6_external *ext = (struct rte_lpm6_external *)dd->lpm6;
+	uint32_t idx;
+
+	if (rc->rc_cmd == RTM_DELETE) {
+		/* No default route */
+		ext->default_idx = 0;
+		return (FLM_SUCCESS);
+	}
+
+	/* Reference new */
+	idx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new);
+	if (idx == 0)
+		return (FLM_REBUILD);
+	ext->default_idx = idx;
+
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Link-local route change handler.  Intentionally a no-op that reports
+ * FLM_SUCCESS: link-local lookups are served by lookup_ptr_ll(), so there
+ * is no state to update in this module.
+ */
+static enum flm_op_result
+handle_ll_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc,
+    const struct in6_addr addr6, int plen, uint32_t scopeid)
+{
+
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Looks up the route currently covering 'addr6' in the RIB and packs it
+ * into 'buffer' as an rte_lpm6 rule.
+ * Returns the rule produced by fill_rule6(), or NULL when there is no
+ * covering route, when it is the default route (prefix length 0), or when
+ * no nexthop index could be obtained for it.
+ */
+static struct rte_lpm6_rule *
+pack_parent_rule(struct dpdk_lpm6_data *dd, const struct in6_addr *addr6,
+    char *buffer)
+{
+	struct route_nhop_data rnd;
+	struct in6_addr pfx;
+	struct rtentry *rt;
+	uint32_t scopeid, idx;
+	int pfx_len;
+
+	rt = fib6_lookup_rt(dd->fibnum, addr6, 0, NHR_UNLOCKED, &rnd);
+	if (rt == NULL)
+		return (NULL);
+
+	rt_get_inet6_prefix_plen(rt, &pfx, &pfx_len, &scopeid);
+	/* plen = 0 means default route and it's out of scope */
+	if (pfx_len <= 0)
+		return (NULL);
+
+	idx = fib_get_nhop_idx(dd->fd, rnd.rnd_nhop);
+	if (idx == 0) {
+		/*
+		 * shouldn't happen as we already have parent route.
+		 * It will trigger rebuild automatically.
+		 */
+		return (NULL);
+	}
+
+	return (fill_rule6(buffer, (uint8_t *)&pfx, pfx_len, idx));
+}
+
+/*
+ * Applies a change of a non-default, non-link-local IPv6 prefix to the
+ * rte_lpm6 table.
+ *
+ * For commands other than RTM_DELETE the prefix is (re)inserted with the
+ * index of the new nexthop.  For RTM_DELETE the covering route is packed
+ * into a rule and handed to rte_lpm6_delete() together with the prefix.
+ *
+ * Returns FLM_SUCCESS when the rte_lpm6 call returned 0, FLM_REBUILD when
+ * no nexthop index could be obtained or the call returned -ENOSPC, and
+ * FLM_ERROR for any other failure.
+ */
+static enum flm_op_result
+handle_gu_change(struct dpdk_lpm6_data *dd, const struct rib_cmd_info *rc,
+    const struct in6_addr *addr6, int plen)
+{
+	int ret;
+	/* Textual prefix is only used by the debug messages below */
+	char abuf[INET6_ADDRSTRLEN];
+	inet_ntop(AF_INET6, addr6, abuf, sizeof(abuf));
+
+	/* So we get sin6, plen and nhidx */
+	if (rc->rc_cmd != RTM_DELETE) {
+		/*
+		 * Addition or change. Save nhop in the internal table
+		 * and get index.
+		 */
+		uint32_t nhidx = fib_get_nhop_idx(dd->fd, rc->rc_nh_new);
+		if (nhidx == 0) {
+			FIB_PRINTF(LOG_INFO, dd->fd, "nhop limit reached, need rebuild");
+			return (FLM_REBUILD);
+		}
+
+		/* Last argument is 1 for RTM_ADD and 0 for any other command */
+		ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)addr6,
+				   plen, nhidx, (rc->rc_cmd == RTM_ADD) ? 1 : 0);
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop %u = %d",
+		    (rc->rc_cmd == RTM_ADD) ? "ADD" : "UPDATE",
+		    abuf, plen, nhidx, ret);
+	} else {
+		/*
+		 * Need to lookup parent. Assume deletion happened already
+		 */
+		char buffer[RTE_LPM6_RULE_SIZE];
+		struct rte_lpm6_rule *lsp_rule = NULL;
+		/* May be NULL when there is no usable covering route */
+		lsp_rule = pack_parent_rule(dd, addr6, buffer);
+
+		ret = rte_lpm6_delete(dd->lpm6, (const uint8_t *)addr6, plen, lsp_rule);
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "DPDK GU: %s %s/%d nhop ? = %d",
+		    "DEL", abuf, plen, ret);
+	}
+
+	if (ret != 0) {
+		FIB_PRINTF(LOG_INFO, dd->fd, "error: %d", ret);
+		/* Out of table space: ask the framework for a rebuild */
+		if (ret == -ENOSPC)
+			return (FLM_REBUILD);
+		return (FLM_ERROR);
+	}
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Classifies a changed route and dispatches it: link-local prefixes go to
+ * handle_ll_change(), prefix length 0 to handle_default_change(), and
+ * everything else to handle_gu_change().
+ * The handler result is returned as is; a non-zero result is also logged.
+ */
+static enum flm_op_result
+handle_any_change(struct dpdk_lpm6_data *dd, struct rib_cmd_info *rc)
+{
+	enum flm_op_result res;
+	struct in6_addr pfx;
+	uint32_t scopeid;
+	int pfx_len;
+
+	rt_get_inet6_prefix_plen(rc->rc_rt, &pfx, &pfx_len, &scopeid);
+
+	if (IN6_IS_SCOPE_LINKLOCAL(&pfx)) {
+		res = handle_ll_change(dd, rc, pfx, pfx_len, scopeid);
+	} else if (pfx_len != 0) {
+		res = handle_gu_change(dd, rc, &pfx, pfx_len);
+	} else {
+		res = handle_default_change(dd, rc);
+	}
+
+	if (res != 0)
+		FIB_PRINTF(LOG_INFO, dd->fd, "error handling route");
+	return (res);
+}
+
+/*
+ * Routing table change callback (flm_change_rib_item_cb): forwards the
+ * change to handle_any_change() for the instance passed in '_data'.
+ */
+static enum flm_op_result
+handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
+    void *_data)
+{
+
+	return (handle_any_change((struct dpdk_lpm6_data *)_data, rc));
+}
+
+/*
+ * Logs the teardown, then frees the rte_lpm6 table, if one was created,
+ * and the instance structure itself.
+ */
+static void
+destroy_dd(struct dpdk_lpm6_data *dd)
+{
+	struct rte_lpm6 *tbl = dd->lpm6;
+
+	FIB_PRINTF(LOG_INFO, dd->fd, "destroy dd %p", dd);
+	if (tbl != NULL)
+		rte_lpm6_free(tbl);
+	free(dd, M_TEMP);
+}
+
+/*
+ * Instance destructor (flm_destroy_cb): typed wrapper around destroy_dd().
+ */
+static void
+destroy_table(void *_data)
+{
+	struct dpdk_lpm6_data *inst = (struct dpdk_lpm6_data *)_data;
+
+	destroy_dd(inst);
+}
+
+/*
+ * RIB dump callback (flm_dump_rib_item_cb): inserts one route into the
+ * table being built.
+ *
+ * Link-local routes are not inserted; only a nexthop index is requested
+ * for them.  A route with prefix length 0 is recorded as the default
+ * route through handle_default_change().  Any other route is added with
+ * rte_lpm6_add().
+ *
+ * Returns FLM_SUCCESS on success, FLM_REBUILD when no nexthop index could
+ * be obtained or the table ran out of space (-ENOSPC, also recorded in
+ * hit_tables), and FLM_ERROR on any other rte_lpm6_add() failure.
+ */
+static enum flm_op_result
+add_route_cb(struct rtentry *rt, void *_data)
+{
+	struct dpdk_lpm6_data *dd = (struct dpdk_lpm6_data *)_data;
+	struct in6_addr addr6;
+	struct nhop_object *nh;
+	uint32_t scopeid;
+	int plen;
+	int ret;
+
+	rt_get_inet6_prefix_plen(rt, &addr6, &plen, &scopeid);
+	nh = rt_get_raw_nhop(rt);
+
+	if (IN6_IS_SCOPE_LINKLOCAL(&addr6)) {
+
+		/*
+		 * We don't operate on LL directly, however
+		 * reference them to maintain guarantee on
+		 * ability to refcount nhops in epoch.
+		 */
+		/* NOTE(review): the returned index is ignored, even if it is 0 */
+		fib_get_nhop_idx(dd->fd, nh);
+		return (FLM_SUCCESS);
+	}
+
+	/* Textual prefix is only used by the debug messages below */
+	char abuf[INET6_ADDRSTRLEN];
+	inet_ntop(AF_INET6, &addr6, abuf, sizeof(abuf));
+	FIB_PRINTF(LOG_DEBUG, dd->fd, "Operating on %s/%d", abuf, plen);
+
+	if (plen == 0) {
+		/* Build a minimal command so the default route handler can be reused */
+		struct rib_cmd_info rc = {
+			.rc_cmd = RTM_ADD,
+			.rc_nh_new = nh,
+		};
+
+		FIB_PRINTF(LOG_DEBUG, dd->fd, "Adding default route");
+		return (handle_default_change(dd, &rc));
+	}
+
+	uint32_t nhidx = fib_get_nhop_idx(dd->fd, nh);
+	if (nhidx == 0) {
+		FIB_PRINTF(LOG_INFO, dd->fd, "unable to get nhop index");
+		return (FLM_REBUILD);
+	}
+	ret = rte_lpm6_add(dd->lpm6, (const uint8_t *)&addr6, plen, nhidx, 1);
+	FIB_PRINTF(LOG_DEBUG, dd->fd, "ADD %p %s/%d nh %u = %d",
+	    dd->lpm6, abuf, plen, nhidx, ret);
+
+	if (ret != 0) {
+		FIB_PRINTF(LOG_INFO, dd->fd, "rte_lpm6_add() returned %d", ret);
+		if (ret == -ENOSPC) {
+			/* Remembered so estimate_scale() doubles the tbl8 count */
+			dd->hit_tables = 1;
+			return (FLM_REBUILD);
+		}
+		dd->routes_failed++;
+		return (FLM_ERROR);
+	} else
+		dd->routes_added++;
+
+	return (FLM_SUCCESS);
+}
+
+/*
+ * End-of-dump callback (flm_dump_end_cb).
+ *
+ * Logs the dump statistics and returns FLM_REBUILD if the table ran out of
+ * space or any route failed to insert.  Otherwise publishes the lookup
+ * function and its argument through 'dp' and returns FLM_SUCCESS.
+ */
+static enum flm_op_result
+check_dump_success(void *_data, struct fib_dp *dp)
+{
+	struct dpdk_lpm6_data *dd;
+
+	dd = (struct dpdk_lpm6_data *)_data;
+
+	/*
+	 * The counters are uint64_t: print them as uintmax_t with %ju.
+	 * %zu expects size_t, which is narrower on 32-bit platforms.
+	 */
+	FIB_PRINTF(LOG_INFO, dd->fd, "scan completed. added: %ju failed: %ju",
+	    (uintmax_t)dd->routes_added, (uintmax_t)dd->routes_failed);
+	if (dd->hit_tables || dd->routes_failed > 0)
+		return (FLM_REBUILD);
+
+	FIB_PRINTF(LOG_INFO, dd->fd,
+	    "DPDK lookup engine synced with IPv6 RIB id %u, %ju routes",
+	    dd->fibnum, (uintmax_t)dd->routes_added);
+
+	dp->f = lookup_ptr;
+	dp->arg = dd->lpm6;
+
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Chooses the tbl8 count for a new instance 'dd' from the previous one
+ * 'dd_src': the count is doubled if the previous instance ran out of
+ * tables, and carried over unchanged otherwise.
+ */
+static void
+estimate_scale(const struct dpdk_lpm6_data *dd_src, struct dpdk_lpm6_data *dd)
+{
+	uint32_t tbl8s = dd_src->number_tbl8s;
+
+	/* XXX: update at 75% capacity */
+	if (dd_src->hit_tables)
+		tbl8s *= 2;
+	dd->number_tbl8s = tbl8s;
+
+	/* TODO: look into the appropriate RIB to adjust */
+}
+
+/*
+ * Allocates and initializes a new algorithm instance.
+ *
+ * 'dd_prev' supplies the fib number and the sizing hints consumed by
+ * estimate_scale(); 'fd' is the framework handle stored in the instance.
+ * Returns the new instance, or NULL if either the instance structure or
+ * the rte_lpm6 table could not be allocated.
+ */
+static struct dpdk_lpm6_data *
+build_table(struct dpdk_lpm6_data *dd_prev, struct fib_data *fd)
+{
+	struct dpdk_lpm6_data *dd;
+	struct rte_lpm6 *lpm6;
+
+	/* Non-sleeping, zeroed allocation */
+	dd = malloc(sizeof(struct dpdk_lpm6_data), M_TEMP, M_NOWAIT | M_ZERO);
+	if (dd == NULL) {
+		FIB_PRINTF(LOG_INFO, fd, "Unable to allocate base datastructure");
+		return (NULL);
+	}
+	dd->fibnum = dd_prev->fibnum;
+	dd->fd = fd;
+
+	estimate_scale(dd_prev, dd);
+
+	struct rte_lpm6_config cfg = {.number_tbl8s = dd->number_tbl8s};
+	lpm6 = rte_lpm6_create("test", 0, &cfg);
+	if (lpm6 == NULL) {
+		FIB_PRINTF(LOG_INFO, fd, "unable to create lpm6");
+		free(dd, M_TEMP);
+		return (NULL);
+	}
+	dd->lpm6 = lpm6;
+	/* The table is also accessed through its rte_lpm6_external view */
+	struct rte_lpm6_external *ext = (struct rte_lpm6_external *)lpm6;
+	/* Nexthop array that lookup_ptr() indexes on the datapath */
+	ext->nh_idx = fib_get_nhop_array(dd->fd);
+	/*
+	 * lookup_ptr_ll() only receives the table pointer and reads the fib
+	 * number from the external header, so it has to be stored here for
+	 * link-local lookups to use this instance's routing table.
+	 */
+	ext->fibnum = dd->fibnum;
+
+	FIB_PRINTF(LOG_INFO, fd, "allocated %u tbl8s", dd->number_tbl8s);
+
+	return (dd);
+}
+
+/*
+ * Instance constructor (flm_init_cb).
+ *
+ * Builds a new instance for 'fibnum', sized either from the previous
+ * instance in '_old_data' or, when there is none, from a temporary
+ * template using LPM6_MIN_TBL8.  On success the instance is stored in
+ * '*data' and FLM_SUCCESS is returned; FLM_REBUILD is returned when the
+ * instance could not be created.
+ */
+static enum flm_op_result
+init_table(uint32_t fibnum, struct fib_data *fd, void *_old_data, void **data)
+{
+	struct dpdk_lpm6_data seed, *prev, *fresh;
+
+	if (_old_data != NULL) {
+		FIB_PRINTF(LOG_INFO, fd, "Starting with old data");
+		prev = (struct dpdk_lpm6_data *)_old_data;
+	} else {
+		bzero(&seed, sizeof(struct dpdk_lpm6_data));
+		seed.fibnum = fibnum;
+		/* TODO: get rib statistics */
+		seed.number_tbl8s = LPM6_MIN_TBL8;
+		prev = &seed;
+	}
+
+	/* Guaranteed to be in epoch */
+	fresh = build_table(prev, fd);
+	if (fresh == NULL) {
+		FIB_PRINTF(LOG_INFO, fd, "table creation failed");
+		return (FLM_REBUILD);
+	}
+
+	*data = fresh;
+	return (FLM_SUCCESS);
+}
+
+/*
+ * Callback table describing the IPv6 algorithm; registered and
+ * unregistered with the fib lookup framework in lpm6_modevent().
+ */
+static struct fib_lookup_module dpdk_lpm6 = {
+	.flm_name = "dpdk_lpm6",
+	.flm_family = AF_INET6,
+	.flm_init_cb = init_table,
+	.flm_destroy_cb = destroy_table,
+	.flm_dump_rib_item_cb = add_route_cb,
+	.flm_dump_end_cb = check_dump_success,
+	.flm_change_rib_item_cb = handle_rtable_change_cb,
+	.flm_get_pref = rte6_get_pref,
+};
+
+/*
+ * Module event handler: registers the algorithm on load and unregisters
+ * it on unload, returning the unregister result.  Any other event yields
+ * EOPNOTSUPP.
+ */
+static int
+lpm6_modevent(module_t mod, int type, void *unused)
+{
+
+	if (type == MOD_LOAD) {
+		fib_module_register(&dpdk_lpm6);
+		return (0);
+	}
+	if (type == MOD_UNLOAD)
+		return (fib_module_unregister(&dpdk_lpm6));
+
+	return (EOPNOTSUPP);
+}
+
+/* Module descriptor naming the module and its event handler. */
+static moduledata_t lpm6mod = {
+        "dpdk_lpm6",
+        lpm6_modevent,
+        0
+};
+
*** 5224 LINES SKIPPED ***


More information about the dev-commits-src-all mailing list