git: 2cef62886dc7 - main - pf: convert state retrieval to netlink

From: Kristof Provost <kp_at_FreeBSD.org>
Date: Tue, 10 Oct 2023 09:50:34 UTC
The branch main has been updated by kp:

URL: https://cgit.FreeBSD.org/src/commit/?id=2cef62886dc7c33ca01f70ca712845da1e55b470

commit 2cef62886dc7c33ca01f70ca712845da1e55b470
Author:     Alexander V. Chernikov <melifaro@FreeBSD.org>
AuthorDate: 2023-09-15 10:06:59 +0000
Commit:     Kristof Provost <kp@FreeBSD.org>
CommitDate: 2023-10-10 09:48:21 +0000

    pf: convert state retrieval to netlink
    
    Use netlink to export pf's state table.
    
    The primary motivation is to improve how we deal with very large state
    tables. With the previous implementation we had to build the entire
    list (both in the kernel and in userspace) before we could start
    processing. With netlink we start to get data in userspace while the
    kernel is still generating more. This reduces peak memory consumption
    (which can get to the GB range once we hit millions of states).
    
    Netlink also makes future extension easier, in that we can easily add
    fields to the state export without breaking userspace. In that regard
    it's similar to an nvlist-based approach, except that it also deals
    with transport to userspace and that it performs significantly better
    than nvlists. Testing has failed to measure a performance difference
    between the previous struct-copy based ioctl and the netlink approach.
    
    Differential Revision:  https://reviews.freebsd.org/D38888
---
 include/Makefile          |   3 +-
 lib/libpfctl/libpfctl.c   | 214 +++++++++++++++++----------------
 sys/conf/files            |   1 +
 sys/modules/pf/Makefile   |   2 +-
 sys/netpfil/pf/pf_ioctl.c |   5 +
 sys/netpfil/pf/pf_nl.c    | 292 ++++++++++++++++++++++++++++++++++++++++++++++
 sys/netpfil/pf/pf_nl.h    | 105 +++++++++++++++++
 7 files changed, 522 insertions(+), 100 deletions(-)

diff --git a/include/Makefile b/include/Makefile
index 736a47854534..5a3cb66eb3e3 100644
--- a/include/Makefile
+++ b/include/Makefile
@@ -215,7 +215,8 @@ IPFILTERDIR=	${INCLUDEDIR}/netinet
 .PATH: ${SRCTOP}/sys/netpfil/pf
 PF=		pf.h \
 		pf_altq.h \
-		pf_mtag.h
+		pf_mtag.h \
+		pf_nl.h
 PFPACKAGE=	pf
 PFDIR=	${INCLUDEDIR}/netpfil/pf
 
diff --git a/lib/libpfctl/libpfctl.c b/lib/libpfctl/libpfctl.c
index 1eccf3dfbcdf..8699d8132240 100644
--- a/lib/libpfctl/libpfctl.c
+++ b/lib/libpfctl/libpfctl.c
@@ -40,6 +40,13 @@
 #include <net/pfvar.h>
 #include <netinet/in.h>
 
+#include <netpfil/pf/pf_nl.h>
+#include <netlink/netlink.h>
+#include <netlink/netlink_generic.h>
+#include <netlink/netlink_snl.h>
+#include <netlink/netlink_snl_generic.h>
+#include <netlink/netlink_snl_route.h>
+
 #include <assert.h>
 #include <err.h>
 #include <errno.h>
@@ -1115,125 +1122,136 @@ pfctl_nv_add_state_cmp(nvlist_t *nvl, const char *name,
 	nvlist_destroy(nv);
 }
 
-static void
-pf_state_key_export_to_state_key(struct pfctl_state_key *ps,
-    const struct pf_state_key_export *s)
+static inline bool
+snl_attr_get_pfaddr(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
 {
-	bcopy(s->addr, ps->addr, sizeof(ps->addr[0]) * 2);
-	ps->port[0] = s->port[0];
-	ps->port[1] = s->port[1];
+	memcpy(target, NLA_DATA(nla), NLA_DATA_LEN(nla));
+	return (true);
 }
 
-static void
-pf_state_peer_export_to_state_peer(struct pfctl_state_peer *ps,
-    const struct pf_state_peer_export *s)
+static inline bool
+snl_attr_store_ifname(struct snl_state *ss __unused, struct nlattr *nla,
+    const void *arg __unused, void *target)
 {
-	/* Ignore scrub. */
-	ps->seqlo = s->seqlo;
-	ps->seqhi = s->seqhi;
-	ps->seqdiff = s->seqdiff;
-	/* Ignore max_win & mss */
-	ps->state = s->state;
-	ps->wscale = s->wscale;
-}
+	size_t maxlen = NLA_DATA_LEN(nla);
 
-static void
-pf_state_export_to_state(struct pfctl_state *ps, const struct pf_state_export *s)
-{
-	assert(s->version >= PF_STATE_VERSION);
-
-	ps->id = s->id;
-	strlcpy(ps->ifname, s->ifname, sizeof(ps->ifname));
-	strlcpy(ps->orig_ifname, s->orig_ifname, sizeof(ps->orig_ifname));
-	strlcpy(ps->rt_ifname, s->rt_ifname, sizeof(ps->rt_ifname));
-	pf_state_key_export_to_state_key(&ps->key[0], &s->key[0]);
-	pf_state_key_export_to_state_key(&ps->key[1], &s->key[1]);
-	pf_state_peer_export_to_state_peer(&ps->src, &s->src);
-	pf_state_peer_export_to_state_peer(&ps->dst, &s->dst);
-	bcopy(&s->rt_addr, &ps->rt_addr, sizeof(ps->rt_addr));
-	ps->rule = ntohl(s->rule);
-	ps->anchor = ntohl(s->anchor);
-	ps->nat_rule = ntohl(s->nat_rule);
-	ps->creation = ntohl(s->creation);
-	ps->expire = ntohl(s->expire);
-	ps->packets[0] = s->packets[0];
-	ps->packets[1] = s->packets[1];
-	ps->bytes[0] = s->bytes[0];
-	ps->bytes[1] = s->bytes[1];
-	ps->creatorid = ntohl(s->creatorid);
-	ps->key[0].proto = s->proto;
-	ps->key[1].proto = s->proto;
-	ps->key[0].af = s->af;
-	ps->key[1].af = s->af;
-	ps->direction = s->direction;
-	ps->state_flags = ntohs(s->state_flags);
-	ps->sync_flags = ntohs(s->sync_flags);
-	ps->qid = ntohs(s->qid);
-	ps->pqid = ntohs(s->pqid);
-	ps->dnpipe = ntohs(s->dnpipe);
-	ps->dnrpipe = ntohs(s->dnrpipe);
-	ps->rtableid = ntohl(s->rtableid);
-	ps->min_ttl = s->min_ttl;
-	ps->set_tos = s->set_tos;
-	ps->max_mss = ntohs(s->max_mss);
-	ps->rt = s->rt;
-	ps->set_prio[0] = s->set_prio[0];
-	ps->set_prio[1] = s->set_prio[1];
+	/* Need a terminated string; copy no more than an ifname buffer. */
+	if (strnlen((char *)NLA_DATA(nla), maxlen) >= maxlen)
+		return (false);
+	strlcpy(target, (char *)NLA_DATA(nla), IFNAMSIZ);
+	return (true);
 }
 
-int
-pfctl_get_states(int dev, struct pfctl_states *states)
+#define	_OUT(_field)	offsetof(struct pfctl_state_peer, _field)
+static const struct snl_attr_parser nla_p_speer[] = {
+	{ .type = PF_STP_SEQLO, .off = _OUT(seqlo), .cb = snl_attr_get_uint32 },
+	{ .type = PF_STP_SEQHI, .off = _OUT(seqhi), .cb = snl_attr_get_uint32 },
+	{ .type = PF_STP_SEQDIFF, .off = _OUT(seqdiff), .cb = snl_attr_get_uint32 },
+	{ .type = PF_STP_STATE, .off = _OUT(state), .cb = snl_attr_get_uint8 },
+	{ .type = PF_STP_WSCALE, .off = _OUT(wscale), .cb = snl_attr_get_uint8 },
+};
+SNL_DECLARE_ATTR_PARSER(speer_parser, nla_p_speer);
+#undef _OUT
+/* Nested state-key attributes are stored into struct pfctl_state_key. */
+#define	_OUT(_field)	offsetof(struct pfctl_state_key, _field)
+static const struct snl_attr_parser nla_p_skey[] = {
+	{ .type = PF_STK_ADDR0, .off = _OUT(addr[0]), .cb = snl_attr_get_pfaddr },
+	{ .type = PF_STK_ADDR1, .off = _OUT(addr[1]), .cb = snl_attr_get_pfaddr },
+	{ .type = PF_STK_PORT0, .off = _OUT(port[0]), .cb = snl_attr_get_uint16 },
+	{ .type = PF_STK_PORT1, .off = _OUT(port[1]), .cb = snl_attr_get_uint16 },
+};
+SNL_DECLARE_ATTR_PARSER(skey_parser, nla_p_skey);
+#undef _OUT
+
+#define	_IN(_field)	offsetof(struct genlmsghdr, _field)
+#define	_OUT(_field)	offsetof(struct pfctl_state, _field)
+static struct snl_attr_parser ap_state[] = {
+	{ .type = PF_ST_ID, .off = _OUT(id), .cb = snl_attr_get_uint64 },
+	{ .type = PF_ST_CREATORID, .off = _OUT(creatorid), .cb = snl_attr_get_uint32 },
+	{ .type = PF_ST_IFNAME, .off = _OUT(ifname), .cb = snl_attr_store_ifname },
+	{ .type = PF_ST_ORIG_IFNAME, .off = _OUT(orig_ifname), .cb = snl_attr_store_ifname },
+	{ .type = PF_ST_KEY_WIRE, .off = _OUT(key[0]), .arg = &skey_parser, .cb = snl_attr_get_nested },
+	{ .type = PF_ST_KEY_STACK, .off = _OUT(key[1]), .arg = &skey_parser, .cb = snl_attr_get_nested },
+	{ .type = PF_ST_PEER_SRC, .off = _OUT(src), .arg = &speer_parser, .cb = snl_attr_get_nested },
+	{ .type = PF_ST_PEER_DST, .off = _OUT(dst), .arg = &speer_parser, .cb = snl_attr_get_nested },
+	{ .type = PF_ST_RT_ADDR, .off = _OUT(rt_addr), .cb = snl_attr_get_pfaddr },
+	{ .type = PF_ST_RULE, .off = _OUT(rule), .cb = snl_attr_get_uint32 },
+	{ .type = PF_ST_ANCHOR, .off = _OUT(anchor), .cb = snl_attr_get_uint32 },
+	{ .type = PF_ST_NAT_RULE, .off = _OUT(nat_rule), .cb = snl_attr_get_uint32 },
+	{ .type = PF_ST_CREATION, .off = _OUT(creation), .cb = snl_attr_get_uint32 },
+	{ .type = PF_ST_EXPIRE, .off = _OUT(expire), .cb = snl_attr_get_uint32 },
+	{ .type = PF_ST_PACKETS0, .off = _OUT(packets[0]), .cb = snl_attr_get_uint64 },
+	{ .type = PF_ST_PACKETS1, .off = _OUT(packets[1]), .cb = snl_attr_get_uint64 },
+	{ .type = PF_ST_BYTES0, .off = _OUT(bytes[0]), .cb = snl_attr_get_uint64 },
+	{ .type = PF_ST_BYTES1, .off = _OUT(bytes[1]), .cb = snl_attr_get_uint64 },
+	{ .type = PF_ST_AF, .off = _OUT(key[0].af), .cb = snl_attr_get_uint8 },
+	{ .type = PF_ST_PROTO, .off = _OUT(key[0].proto), .cb = snl_attr_get_uint8 },
+	{ .type = PF_ST_DIRECTION, .off = _OUT(direction), .cb = snl_attr_get_uint8 },
+	{ .type = PF_ST_LOG, .off = _OUT(log), .cb = snl_attr_get_uint8 },
+	{ .type = PF_ST_STATE_FLAGS, .off = _OUT(state_flags), .cb = snl_attr_get_uint16 },
+	{ .type = PF_ST_SYNC_FLAGS, .off = _OUT(sync_flags), .cb = snl_attr_get_uint8 },
+};
+static struct snl_field_parser fp_state[] = {
+};
+#undef _IN
+#undef _OUT
+SNL_DECLARE_PARSER(state_parser, struct genlmsghdr, fp_state, ap_state);
+
+static const struct snl_hdr_parser *all_parsers[] = {
+	&state_parser, &skey_parser, &speer_parser
+};
+
+static int
+pfctl_get_states_nl(struct snl_state *ss, struct pfctl_states *states)
 {
-	struct pfioc_states_v2 ps;
-	struct pf_state_export *p;
-	char *inbuf = NULL, *newinbuf = NULL;
-	unsigned int len = 0;
-	int i, error;
+	SNL_VERIFY_PARSERS(all_parsers);
+	int family_id = snl_get_genl_family(ss, PFNL_FAMILY_NAME);
+
+	struct nlmsghdr *hdr;
+	struct snl_writer nw;
 
-	bzero(&ps, sizeof(ps));
-	ps.ps_req_version = PF_STATE_VERSION;
+	snl_init_writer(ss, &nw);
+	hdr = snl_create_genl_msg_request(&nw, family_id, PFNL_CMD_GETSTATES);
+	hdr->nlmsg_flags |= NLM_F_DUMP;
+	snl_finalize_msg(&nw);
+	uint32_t seq_id = hdr->nlmsg_seq;
+
+	snl_send_message(ss, hdr);
 
 	bzero(states, sizeof(*states));
 	TAILQ_INIT(&states->states);
 
-	for (;;) {
-		ps.ps_len = len;
-		if (len) {
-			newinbuf = realloc(inbuf, len);
-			if (newinbuf == NULL)
-				return (ENOMEM);
-			ps.ps_buf = inbuf = newinbuf;
-		}
-		if ((error = ioctl(dev, DIOCGETSTATESV2, &ps)) < 0) {
-			free(inbuf);
-			return (error);
-		}
-		if (ps.ps_len + sizeof(struct pfioc_states_v2) < len)
-			break;
-		if (len == 0 && ps.ps_len == 0)
-			goto out;
-		if (len == 0 && ps.ps_len != 0)
-			len = ps.ps_len;
-		if (ps.ps_len == 0)
-			goto out;      /* no states */
-		len *= 2;
-	}
-	p = ps.ps_states;
-
-	for (i = 0; i < ps.ps_len; i += sizeof(*p), p++) {
+	struct snl_errmsg_data e = {};
+	while ((hdr = snl_read_reply_multi(ss, seq_id, &e)) != NULL) {
-		struct pfctl_state *s = malloc(sizeof(*s));
+		struct pfctl_state *s = calloc(1, sizeof(*s));
 		if (s == NULL) {
 			pfctl_free_states(states);
-			error = ENOMEM;
-			goto out;
+			return (ENOMEM);
 		}
+		if (!snl_parse_nlmsg(ss, hdr, &state_parser, s)) {
+			free(s);	/* not queued, so release it here */
+			continue;
+		}
+		s->key[1].af = s->key[0].af;
+		s->key[1].proto = s->key[0].proto;
 
-		pf_state_export_to_state(s, p);
 		TAILQ_INSERT_TAIL(&states->states, s, entry);
 	}
 
-out:
-	free(inbuf);
+	return (0);
+}
+
+int
+pfctl_get_states(int dev __unused, struct pfctl_states *states)
+{
+	struct snl_state ss = {};
+	int error;
+
+	snl_init(&ss, NETLINK_GENERIC);
+	error = pfctl_get_states_nl(&ss, states);
+	snl_free(&ss);
+
 	return (error);
 }
 
diff --git a/sys/conf/files b/sys/conf/files
index 5d5e8f30347c..dc837eb02c06 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -4498,6 +4498,7 @@ netpfil/pf/pf_if.c		optional pf inet
 netpfil/pf/pf_ioctl.c		optional pf inet
 netpfil/pf/pf_lb.c		optional pf inet
 netpfil/pf/pf_norm.c		optional pf inet
+netpfil/pf/pf_nl.c		optional pf inet
 netpfil/pf/pf_nv.c		optional pf inet
 netpfil/pf/pf_osfp.c		optional pf inet
 netpfil/pf/pf_ruleset.c		optional pf inet
diff --git a/sys/modules/pf/Makefile b/sys/modules/pf/Makefile
index 918cc78bad5e..6158943139cf 100644
--- a/sys/modules/pf/Makefile
+++ b/sys/modules/pf/Makefile
@@ -3,7 +3,7 @@
 
 KMOD=	pf
 SRCS=	pf.c pf_if.c pf_lb.c pf_osfp.c pf_ioctl.c pf_norm.c pf_table.c \
-	pf_ruleset.c pf_nv.c pf_syncookies.c in4_cksum.c \
+	pf_ruleset.c pf_nl.c pf_nv.c pf_syncookies.c in4_cksum.c \
 	bus_if.h device_if.h \
 	opt_pf.h opt_inet.h opt_inet6.h opt_bpf.h opt_sctp.h opt_global.h \
 	opt_kern_tls.h
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index db8f481a1567..42c2aa9bfb01 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -83,6 +83,7 @@
 #include <netinet/ip_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/ip_icmp.h>
+#include <netpfil/pf/pf_nl.h>
 #include <netpfil/pf/pf_nv.h>
 
 #ifdef INET6
@@ -6648,6 +6649,8 @@ pf_unload(void)
 	}
 	sx_xunlock(&pf_end_lock);
 
+	pf_nl_unregister();
+
 	if (pf_dev != NULL)
 		destroy_dev(pf_dev);
 
@@ -6683,6 +6686,7 @@ pf_modevent(module_t mod, int type, void *data)
 	switch(type) {
 	case MOD_LOAD:
 		error = pf_load();
+		pf_nl_register();
 		break;
 	case MOD_UNLOAD:
 		/* Handled in SYSUNINIT(pf_unload) to ensure it's done after
@@ -6703,4 +6707,5 @@ static moduledata_t pf_mod = {
 };
 
 DECLARE_MODULE(pf, pf_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_SECOND);
+MODULE_DEPEND(pf, netlink, 1, 1, 1);
 MODULE_VERSION(pf, PF_MODVER);
diff --git a/sys/netpfil/pf/pf_nl.c b/sys/netpfil/pf/pf_nl.c
new file mode 100644
index 000000000000..cbea76e7386f
--- /dev/null
+++ b/sys/netpfil/pf/pf_nl.c
@@ -0,0 +1,292 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ * Copyright (c) 2023 Rubicon Communications, LLC (Netgate)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include "opt_netlink.h"
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+
+#include <net/pfvar.h>
+
+#include <netlink/netlink.h>
+#include <netlink/netlink_ctl.h>
+#include <netlink/netlink_generic.h>
+#include <netlink/netlink_message_writer.h>
+
+#include <netpfil/pf/pf_nl.h>
+
+#define	DEBUG_MOD_NAME	nl_pf
+#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
+#include <netlink/netlink_debug.h>
+_DECLARE_DEBUG(LOG_DEBUG);
+
+struct nl_parsed_state {
+	uint8_t		version;
+	uint64_t	id;		/* PF_ST_ID is exported as u64 */
+	uint32_t	creatorid;
+};
+
+#define	_IN(_field)	offsetof(struct genlmsghdr, _field)
+#define	_OUT(_field)	offsetof(struct nl_parsed_state, _field)
+static const struct nlattr_parser nla_p_state[] = {
+	{ .type = PF_ST_ID, .off = _OUT(id), .cb = nlattr_get_uint64 },
+	{ .type = PF_ST_CREATORID, .off = _OUT(creatorid), .cb = nlattr_get_uint32 },
+};
+static const struct nlfield_parser nlf_p_generic[] = {
+	{ .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 },
+};
+#undef _IN
+#undef _OUT
+NL_DECLARE_PARSER(state_parser, struct genlmsghdr, nlf_p_generic, nla_p_state);
+
+static void
+dump_addr(struct nl_writer *nw, int attr, const struct pf_addr *addr, int af)
+{
+	switch (af) {
+	case AF_INET:
+		nlattr_add(nw, attr, 4, &addr->v4);
+		break;
+	case AF_INET6:
+		nlattr_add(nw, attr, 16, &addr->v6);
+		break;
+	};
+}
+
+static bool
+dump_state_peer(struct nl_writer *nw, int attr, const struct pf_state_peer *peer)
+{
+	int off = nlattr_add_nested(nw, attr);
+	if (off == 0)
+		return (false);
+
+	nlattr_add_u32(nw, PF_STP_SEQLO, peer->seqlo);
+	nlattr_add_u32(nw, PF_STP_SEQHI, peer->seqhi);
+	nlattr_add_u32(nw, PF_STP_SEQDIFF, peer->seqdiff);
+	nlattr_add_u16(nw, PF_STP_MAX_WIN, peer->max_win);
+	nlattr_add_u16(nw, PF_STP_MSS, peer->mss);
+	nlattr_add_u8(nw, PF_STP_STATE, peer->state);
+	nlattr_add_u8(nw, PF_STP_WSCALE, peer->wscale);
+
+	if (peer->scrub != NULL) {
+		struct pf_state_scrub *sc = peer->scrub;
+		uint16_t pfss_flags = sc->pfss_flags & PFSS_TIMESTAMP;
+
+		nlattr_add_u16(nw, PF_STP_PFSS_FLAGS, pfss_flags);
+		nlattr_add_u32(nw, PF_STP_PFSS_TS_MOD, sc->pfss_ts_mod);
+		nlattr_add_u8(nw, PF_STP_PFSS_TTL, sc->pfss_ttl);
+		nlattr_add_u8(nw, PF_STP_SCRUB_FLAG, PFSYNC_SCRUB_FLAG_VALID);
+	}
+	nlattr_set_len(nw, off);
+
+	return (true);
+}
+
+static bool
+dump_state_key(struct nl_writer *nw, int attr, const struct pf_state_key *key)
+{
+	int off = nlattr_add_nested(nw, attr);
+	if (off == 0)
+		return (false);
+
+	dump_addr(nw, PF_STK_ADDR0, &key->addr[0], key->af);
+	dump_addr(nw, PF_STK_ADDR1, &key->addr[1], key->af);
+	nlattr_add_u16(nw, PF_STK_PORT0, key->port[0]);
+	nlattr_add_u16(nw, PF_STK_PORT1, key->port[1]);
+
+	nlattr_set_len(nw, off);
+
+	return (true);
+}
+
+static int
+dump_state(struct nlpcb *nlp, const struct nlmsghdr *hdr, struct pf_kstate *s,
+    struct nl_pstate *npt)
+{
+	struct nl_writer *nw = npt->nw;
+	int error = 0;
+	int af;
+	struct pf_state_key *key;
+
+	if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr)))
+		goto enomem;
+
+	struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr);
+	ghdr_new->cmd = PFNL_CMD_GETSTATES;
+	ghdr_new->version = 0;
+	ghdr_new->reserved = 0;
+
+	nlattr_add_u64(nw, PF_ST_VERSION, PF_STATE_VERSION);
+
+	key = s->key[PF_SK_WIRE];
+	if (!dump_state_key(nw, PF_ST_KEY_WIRE, key))
+		goto enomem;
+	key = s->key[PF_SK_STACK];
+	if (!dump_state_key(nw, PF_ST_KEY_STACK, key))
+		goto enomem;
+
+	af = s->key[PF_SK_WIRE]->af;
+	nlattr_add_u8(nw, PF_ST_PROTO, s->key[PF_SK_WIRE]->proto);
+	nlattr_add_u8(nw, PF_ST_AF, af);
+
+	nlattr_add_string(nw, PF_ST_IFNAME, s->kif->pfik_name);
+	nlattr_add_string(nw, PF_ST_ORIG_IFNAME, s->orig_kif->pfik_name);
+	dump_addr(nw, PF_ST_RT_ADDR, &s->rt_addr, af);
+	nlattr_add_u32(nw, PF_ST_CREATION, time_uptime - s->creation);
+	uint32_t expire = pf_state_expires(s);
+	/* Export the time left; an already-expired state reports 0. */
+	expire = (expire > time_uptime) ? expire - time_uptime : 0;
+	nlattr_add_u32(nw, PF_ST_EXPIRE, expire);
+	nlattr_add_u8(nw, PF_ST_DIRECTION, s->direction);
+	nlattr_add_u8(nw, PF_ST_LOG, s->act.log);
+	nlattr_add_u8(nw, PF_ST_TIMEOUT, s->timeout);
+	nlattr_add_u16(nw, PF_ST_STATE_FLAGS, s->state_flags);
+	uint8_t sync_flags = 0;
+	if (s->src_node)
+		sync_flags |= PFSYNC_FLAG_SRCNODE;
+	if (s->nat_src_node)
+		sync_flags |= PFSYNC_FLAG_NATSRCNODE;
+	nlattr_add_u8(nw, PF_ST_SYNC_FLAGS, sync_flags);
+	nlattr_add_u64(nw, PF_ST_ID, s->id);
+	nlattr_add_u32(nw, PF_ST_CREATORID, htonl(s->creatorid));
+
+	nlattr_add_u32(nw, PF_ST_RULE, s->rule.ptr ? s->rule.ptr->nr : -1);
+	nlattr_add_u32(nw, PF_ST_ANCHOR, s->anchor.ptr ? s->anchor.ptr->nr : -1);
+	nlattr_add_u32(nw, PF_ST_NAT_RULE, s->nat_rule.ptr ? s->nat_rule.ptr->nr : -1);
+
+	nlattr_add_u64(nw, PF_ST_PACKETS0, s->packets[0]);
+	nlattr_add_u64(nw, PF_ST_PACKETS1, s->packets[1]);
+	nlattr_add_u64(nw, PF_ST_BYTES0, s->bytes[0]);
+	nlattr_add_u64(nw, PF_ST_BYTES1, s->bytes[1]);
+
+	if (!dump_state_peer(nw, PF_ST_PEER_SRC, &s->src))
+		goto enomem;
+	if (!dump_state_peer(nw, PF_ST_PEER_DST, &s->dst))
+		goto enomem;
+
+	if (nlmsg_end(nw))
+		return (0);
+
+enomem:
+	error = ENOMEM;
+	nlmsg_abort(nw);
+	return (error);
+}
+
+static int
+handle_dumpstates(struct nlpcb *nlp, struct nl_parsed_state *attrs,
+    struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	int error = 0;
+
+	hdr->nlmsg_flags |= NLM_F_MULTI;
+	/* Stop at the first failed row so a later success cannot mask it. */
+	for (int i = 0; i <= pf_hashmask && error == 0; i++) {
+		struct pf_idhash *ih = &V_pf_idhash[i];
+		struct pf_kstate *s;
+
+		if (LIST_EMPTY(&ih->states))
+			continue;
+
+		PF_HASHROW_LOCK(ih);
+		LIST_FOREACH(s, &ih->states, entry) {
+			if (s->timeout != PFTM_UNLINKED) {
+				error = dump_state(nlp, hdr, s, npt);
+				if (error != 0)
+					break;
+			}
+		}
+		PF_HASHROW_UNLOCK(ih);
+	}
+
+	if (!nlmsg_end_dump(npt->nw, error, hdr)) {
+		NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (error);
+}
+
+static int
+handle_getstate(struct nlpcb *nlp, struct nl_parsed_state *attrs,
+    struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	struct pf_kstate *s = pf_find_state_byid(attrs->id, attrs->creatorid);
+	if (s == NULL)
+		return (ENOENT);
+	return (dump_state(nlp, hdr, s, npt));
+}
+
+static int
+pf_handle_getstates(struct nlmsghdr *hdr, struct nl_pstate *npt)
+{
+	int error;
+
+	struct nl_parsed_state attrs = {};
+	error = nl_parse_nlmsg(hdr, &state_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	if (attrs.id != 0)
+		error = handle_getstate(npt->nlp, &attrs, hdr, npt);
+	else
+		error = handle_dumpstates(npt->nlp, &attrs, hdr, npt);
+
+	return (error);
+}
+
+static const struct nlhdr_parser *all_parsers[] = { &state_parser };
+
+static int family_id;
+
+static const struct genl_cmd pf_cmds[] = {
+	{
+		.cmd_num = PFNL_CMD_GETSTATES,
+		.cmd_name = "GETSTATES",
+		.cmd_cb = pf_handle_getstates,
+		.cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL,
+	},
+};
+
+void
+pf_nl_register(void)
+{
+	NL_VERIFY_PARSERS(all_parsers);
+	family_id = genl_register_family(PFNL_FAMILY_NAME, 0, 2, PFNL_CMD_MAX);
+	genl_register_cmds(PFNL_FAMILY_NAME, pf_cmds, NL_ARRAY_LEN(pf_cmds));
+}
+
+void
+pf_nl_unregister(void)
+{
+	genl_unregister_family(PFNL_FAMILY_NAME);
+}
diff --git a/sys/netpfil/pf/pf_nl.h b/sys/netpfil/pf/pf_nl.h
new file mode 100644
index 000000000000..5ef757eead21
--- /dev/null
+++ b/sys/netpfil/pf/pf_nl.h
@@ -0,0 +1,105 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Alexander V. Chernikov <melifaro@FreeBSD.org>
+ * Copyright (c) 2023 Rubicon Communications, LLC (Netgate)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef _NETPFIL_PF_PF_NL_H_
+#define _NETPFIL_PF_PF_NL_H_
+
+/* Genetlink family */
+#define PFNL_FAMILY_NAME	"pfctl"
+
+/* available commands */
+enum {
+	PFNL_CMD_UNSPEC = 0,
+	PFNL_CMD_GETSTATES = 1,
+	__PFNL_CMD_MAX,
+};
+#define PFNL_CMD_MAX (__PFNL_CMD_MAX -1)
+
+enum pfstate_key_type_t {
+	PF_STK_UNSPEC,
+	PF_STK_ADDR0		= 1, /* ip */
+	PF_STK_ADDR1		= 2, /* ip */
+	PF_STK_PORT0		= 3, /* u16 */
+	PF_STK_PORT1		= 4, /* u16 */
+};
+
+enum pfstate_peer_type_t {
+	PF_STP_UNSPEC,
+	PF_STP_PFSS_FLAGS	= 1, /* u16 */
+	PF_STP_PFSS_TTL		= 2, /* u8 */
+	PF_STP_SCRUB_FLAG	= 3, /* u8 */
+	PF_STP_PFSS_TS_MOD	= 4, /* u32 */
+	PF_STP_SEQLO		= 5, /* u32 */
+	PF_STP_SEQHI		= 6, /* u32 */
+	PF_STP_SEQDIFF		= 7, /* u32 */
+	PF_STP_MAX_WIN		= 8, /* u16 */
+	PF_STP_MSS		= 9, /* u16 */
+	PF_STP_STATE		= 10, /* u8 */
+	PF_STP_WSCALE		= 11, /* u8 */
+};
+
+enum pfstate_type_t {
+	PF_ST_UNSPEC,
+	PF_ST_ID		= 1, /* u64 as exported, state id */
+	PF_ST_CREATORID		= 2, /* u32, */
+	PF_ST_IFNAME		= 3, /* string */
+	PF_ST_ORIG_IFNAME	= 4, /* string */
+	PF_ST_KEY_WIRE		= 5, /* nested, pfstate_key_type_t */
+	PF_ST_KEY_STACK		= 6, /* nested, pfstate_key_type_t */
+	PF_ST_PEER_SRC		= 7, /* nested, pfstate_peer_type_t */
+	PF_ST_PEER_DST		= 8, /* nested, pfstate_peer_type_t */
+	PF_ST_RT_ADDR		= 9, /* ip */
+	PF_ST_RULE		= 10, /* u32 */
+	PF_ST_ANCHOR		= 11, /* u32 */
+	PF_ST_NAT_RULE		= 12, /* u32 */
+	PF_ST_CREATION		= 13, /* u32 */
+	PF_ST_EXPIRE		= 14, /* u32 */
+	PF_ST_PACKETS0		= 15, /* u64 */
+	PF_ST_PACKETS1		= 16, /* u64 */
+	PF_ST_BYTES0		= 17, /* u64 */
+	PF_ST_BYTES1		= 18, /* u64 */
+	PF_ST_AF		= 19, /* u8 */
+	PF_ST_PROTO		= 21, /* u8 (value 20 is unassigned) */
+	PF_ST_DIRECTION		= 22, /* u8 */
+	PF_ST_LOG		= 23, /* u8 */
+	PF_ST_TIMEOUT		= 24, /* u8 */
+	PF_ST_STATE_FLAGS	= 25, /* u16 */
+	PF_ST_SYNC_FLAGS	= 26, /* u8 */
+	PF_ST_UPDATES		= 27, /* u8 */
+	PF_ST_VERSION		= 28, /* u64 */
+};
+
+#ifdef _KERNEL
+
+void	pf_nl_register(void);
+void	pf_nl_unregister(void);
+
+#endif
+
+#endif