git: 65c318630123 - main - pf: Add prefer-ipv6-nexthop option for route-to pools

From: Kajetan Staszkiewicz <ks_at_FreeBSD.org>
Date: Fri, 29 Aug 2025 09:23:16 UTC
The branch main has been updated by ks:

URL: https://cgit.FreeBSD.org/src/commit/?id=65c318630123fcf2b6f491bf4d02a5cad3031d20

commit 65c318630123fcf2b6f491bf4d02a5cad3031d20
Author:     Kajetan Staszkiewicz <ks@FreeBSD.org>
AuthorDate: 2025-08-01 19:01:37 +0000
Commit:     Kajetan Staszkiewicz <ks@FreeBSD.org>
CommitDate: 2025-08-29 07:58:40 +0000

    pf: Add prefer-ipv6-nexthop option for route-to pools
    
    Now that pf is aware of address family of each pool address and source
    tracking uses distinct address family for source and redirection
    addresses it is possible to add a new pool option prefer-ipv6-nexthop
    which enables routing of IPv4 packets over IPv6 next hops for rules
    with the route-to option.
    
    Add a pool option flag PF_POOL_IPV6NH, apply it to pools with a keyword
    prefer-ipv6-nexthop.
    
    Modify pf_map_addr() to handle pools with addresses of different
    families. Use *naf as a hint about what address family the forwarded
    packet is, then pick from the pool addresses of family that can be used
    as a next hop for the forwarded packet, controlled by the PF_POOL_IPV6NH
    flag. For NAT pools this flag is never set and thus pf_map_addr()
    will return an IP address of the same family as the forwarded packet.
    For route-to pools when the flag is enabled IPv6 addresses can be
    returned for IPv4 packets.
    
    In pf_route() check rt_af, it is not guaranteed to be AF_INET anymore
    because pf_map_addr() could have changed it (as *naf).
    
    Add tests for behaviour of pf_map_addr() both with PF_POOL_IPV6NH and
    without, for single IP addresses, prefixes and subnets.
    
    Reviewed by:    kp
    Sponsored by:   InnoGames GmbH
    Differential Revision:  https://reviews.freebsd.org/D50781
---
 sbin/pfctl/parse.y                   |  50 ++-
 sbin/pfctl/pfctl_parser.c            |   6 +-
 sbin/pfctl/tests/files/pf1073.in     |   1 +
 sbin/pfctl/tests/files/pf1073.ok     |   1 +
 sbin/pfctl/tests/files/pf1074.fail   |   1 +
 sbin/pfctl/tests/files/pf1074.in     |   1 +
 sbin/pfctl/tests/pfctl_test_list.inc |   2 +
 share/man/man5/pf.conf.5             |  18 +-
 sys/net/pfvar.h                      |   3 +-
 sys/netpfil/pf/if_pfsync.c           |   8 +-
 sys/netpfil/pf/pf.c                  |  63 ++-
 sys/netpfil/pf/pf.h                  |   1 +
 sys/netpfil/pf/pf_ioctl.c            |   1 +
 sys/netpfil/pf/pf_lb.c               | 151 ++++++--
 tests/sys/netpfil/pf/route_to.sh     | 716 ++++++++++++++++++++++++++++++++++-
 tests/sys/netpfil/pf/src_track.sh    |  36 +-
 tests/sys/netpfil/pf/utils.subr      |   4 +-
 17 files changed, 953 insertions(+), 110 deletions(-)

diff --git a/sbin/pfctl/parse.y b/sbin/pfctl/parse.y
index 00c36b218055..59c27d1f5d7c 100644
--- a/sbin/pfctl/parse.y
+++ b/sbin/pfctl/parse.y
@@ -238,6 +238,7 @@ static struct pool_opts {
 #define POM_TYPE		0x01
 #define POM_STICKYADDRESS	0x02
 #define POM_ENDPI		0x04
+#define POM_IPV6NH		0x08
 	u_int8_t		 opts;
 	int			 type;
 	int			 staticport;
@@ -543,7 +544,7 @@ int	parseport(char *, struct range *r, int);
 %token	MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY PFLOW ALLOW_RELATED
 %token	TAGGED TAG IFBOUND FLOATING STATEPOLICY STATEDEFAULTS ROUTE SETTOS
 %token	DIVERTTO DIVERTREPLY BRIDGE_TO RECEIVEDON NE LE GE AFTO NATTO RDRTO
-%token	BINATTO MAXPKTRATE MAXPKTSIZE
+%token	BINATTO MAXPKTRATE MAXPKTSIZE IPV6NH
 %token	<v.string>		STRING
 %token	<v.number>		NUMBER
 %token	<v.i>			PORTBINARY
@@ -2648,13 +2649,16 @@ pfrule		: action dir logquick interface route af proto fromto
 					YYERROR;
 				}
 				r.rt = $5.rt;
-				decide_address_family($5.redirspec->host, &r.af);
-				if (!(r.rule_flag & PFRULE_AFTO))
-					remove_invalid_hosts(&($5.redirspec->host), &r.af);
-				if ($5.redirspec->host == NULL) {
-					yyerror("no routing address with "
-					    "matching address family found.");
-					YYERROR;
+
+				if (!($5.redirspec->pool_opts.opts & PF_POOL_IPV6NH)) {
+					decide_address_family($5.redirspec->host, &r.af);
+					if (!(r.rule_flag & PFRULE_AFTO))
+						remove_invalid_hosts(&($5.redirspec->host), &r.af);
+					if ($5.redirspec->host == NULL) {
+						yyerror("no routing address with "
+						    "matching address family found.");
+						YYERROR;
+					}
 				}
 			}
 #ifdef __FreeBSD__
@@ -2978,7 +2982,8 @@ filter_opt	: USER uids {
 
 			filter_opts.nat = $4;
 			filter_opts.nat->af = $2;
-			if ($4->af && $4->af != $2) {
+			remove_invalid_hosts(&($4->host), &(filter_opts.nat->af));
+			if ($4->host == NULL) {
 				yyerror("af-to addresses must be in the "
 				   "target address family");
 				YYERROR;
@@ -2998,8 +3003,9 @@ filter_opt	: USER uids {
 			filter_opts.nat->af = $2;
 			filter_opts.rdr = $6;
 			filter_opts.rdr->af = $2;
-			if (($4->af && $4->host->af != $2) ||
-			    ($6->af && $6->host->af != $2)) {
+			remove_invalid_hosts(&($4->host), &(filter_opts.nat->af));
+			remove_invalid_hosts(&($6->host), &(filter_opts.rdr->af));
+			if ($4->host == NULL || $6->host == NULL) {
 				yyerror("af-to addresses must be in the "
 				   "target address family");
 				YYERROR;
@@ -4674,6 +4680,14 @@ pool_opt	: BITMASK	{
 			pool_opts.marker |= POM_ENDPI;
 			pool_opts.opts |= PF_POOL_ENDPI;
 		}
+		| IPV6NH {
+			if (pool_opts.marker & POM_IPV6NH) {
+				yyerror("prefer-ipv6-nexthop cannot be redefined");
+				YYERROR;
+			}
+			pool_opts.marker |= POM_IPV6NH;
+			pool_opts.opts |= PF_POOL_IPV6NH;
+		}
 		| MAPEPORTSET number '/' number '/' number {
 			if (pool_opts.mape.offset) {
 				yyerror("map-e-portset cannot be redefined");
@@ -4813,6 +4827,12 @@ natrule		: nataction interface af proto fromto tag tagged rtable
 					    "address'");
 					YYERROR;
 				}
+				if ($9->pool_opts.opts & PF_POOL_IPV6NH) {
+					yyerror("The prefer-ipv6-nexthop option "
+					    "can't be used for nat/rdr/binat pools"
+					);
+					YYERROR;
+				}
 				if (!r.af && ! $9->host->ifindex)
 					r.af = $9->host->af;
 
@@ -5074,13 +5094,6 @@ route_host	: STRING			{
 
 route_host_list	: route_host optnl			{ $$ = $1; }
 		| route_host_list comma route_host optnl {
-			if ($1->af == 0)
-				$1->af = $3->af;
-			if ($1->af != $3->af) {
-				yyerror("all pool addresses must be in the "
-				    "same address family");
-				YYERROR;
-			}
 			$1->tail->next = $3;
 			$1->tail = $3->tail;
 			$$ = $1;
@@ -6678,6 +6691,7 @@ lookup(char *s)
 		{ "pass",		PASS},
 		{ "pflow",		PFLOW},
 		{ "port",		PORT},
+		{ "prefer-ipv6-nexthop", IPV6NH},
 		{ "prio",		PRIO},
 		{ "priority",		PRIORITY},
 		{ "priq",		PRIQ},
diff --git a/sbin/pfctl/pfctl_parser.c b/sbin/pfctl/pfctl_parser.c
index 18b78a150c28..3c4f9f6b4334 100644
--- a/sbin/pfctl/pfctl_parser.c
+++ b/sbin/pfctl/pfctl_parser.c
@@ -508,6 +508,8 @@ print_pool(struct pfctl_pool *pool, u_int16_t p1, u_int16_t p2, int id)
 	if (pool->mape.offset > 0)
 		printf(" map-e-portset %u/%u/%u",
 		    pool->mape.offset, pool->mape.psidlen, pool->mape.psid);
+	if (pool->opts & PF_POOL_IPV6NH)
+		printf(" prefer-ipv6-nexthop");
 }
 
 void
@@ -1438,7 +1440,7 @@ ifa_add_groups_to_map(char *ifa_name)
 			ENTRY	 		 item;
 			ENTRY			*ret_item;
 			int			*answer;
-	
+
 			item.key = ifg->ifgrq_group;
 			if (hsearch_r(item, FIND, &ret_item, &isgroup_map) == 0) {
 				struct ifgroupreq	 ifgr2;
@@ -1580,7 +1582,7 @@ is_a_group(char *name)
 {
 	ENTRY	 		 item;
 	ENTRY			*ret_item;
-	
+
 	item.key = name;
 	if (hsearch_r(item, FIND, &ret_item, &isgroup_map) == 0)
 		return (0);
diff --git a/sbin/pfctl/tests/files/pf1073.in b/sbin/pfctl/tests/files/pf1073.in
new file mode 100644
index 000000000000..477995893ac3
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1073.in
@@ -0,0 +1 @@
+pass in on vtnet0 route-to ( vtnet1 2001:db8::1 ) prefer-ipv6-nexthop inet
diff --git a/sbin/pfctl/tests/files/pf1073.ok b/sbin/pfctl/tests/files/pf1073.ok
new file mode 100644
index 000000000000..f34867508c75
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1073.ok
@@ -0,0 +1 @@
+pass in on vtnet0 route-to (vtnet1 2001:db8::1) prefer-ipv6-nexthop inet all flags S/SA keep state
diff --git a/sbin/pfctl/tests/files/pf1074.fail b/sbin/pfctl/tests/files/pf1074.fail
new file mode 100644
index 000000000000..afe8ee3c458f
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1074.fail
@@ -0,0 +1 @@
+no routing address with matching address family found.
diff --git a/sbin/pfctl/tests/files/pf1074.in b/sbin/pfctl/tests/files/pf1074.in
new file mode 100644
index 000000000000..5d285bc5d6e8
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1074.in
@@ -0,0 +1 @@
+pass in on vtnet0 route-to ( vtnet1 2001:db8::1 ) inet
diff --git a/sbin/pfctl/tests/pfctl_test_list.inc b/sbin/pfctl/tests/pfctl_test_list.inc
index 3a68cc06ec74..8bfccddf50e5 100644
--- a/sbin/pfctl/tests/pfctl_test_list.inc
+++ b/sbin/pfctl/tests/pfctl_test_list.inc
@@ -181,3 +181,5 @@ PFCTL_TEST(1069, "max-pkt-size")
 PFCTL_TEST_FAIL(1070, "include line number")
 PFCTL_TEST(1071, "mask length on (lo0)")
 PFCTL_TEST_FAIL(1072, "Invalid port range")
+PFCTL_TEST(1073, "Filter AF different than route-to AF, with prefer-ipv6-nexthop")
+PFCTL_TEST_FAIL(1074, "Filter AF different than route-to AF, without prefer-ipv6-nexthop")
diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5
index a9ae823257a4..bdd8a843d72a 100644
--- a/share/man/man5/pf.conf.5
+++ b/share/man/man5/pf.conf.5
@@ -2470,7 +2470,13 @@ NAT address and port.
 This feature implements "full-cone" NAT behavior.
 .El
 .Pp
-Additionally, the
+Additionally, options
+.Ar sticky-address
+and
+.Ar prefer-ipv6-nexthop
+can be specified to influence how IP addresses are selected from pools.
+.Pp
+The
 .Ar sticky-address
 option can be specified to help ensure that multiple connections from the
 same source are mapped to the same redirection address.
@@ -2486,6 +2492,14 @@ beyond the lifetime of the states, increase the global options with
 See
 .Sx STATEFUL TRACKING OPTIONS
 for more ways to control the source tracking.
+.Pp
+The
+.Ar prefer-ipv6-nexthop
+option allows for IPv6 addresses to be used as the nexthop
+for IPv4 packets routed with the
+.Ar route-to
+rule option. If a table is used with IPv4 and IPv6 addresses, first the IPv6 addresses
+will be used in round-robin fashion, then IPv4 addresses.
 .Sh STATE MODULATION
 Much of the security derived from TCP is attributable to how well the
 initial sequence numbers (ISNs) are chosen.
@@ -3580,7 +3594,7 @@ limit-item     = ( "states" | "frags" | "src-nodes" ) number
 
 pooltype       = ( "bitmask" | "random" |
                  "source-hash" [ ( hex-key | string-key ) ] |
-                 "round-robin" ) [ sticky-address ]
+                 "round-robin" ) [ sticky-address | prefer-ipv6-nexthop ]
 
 subqueue       = string | "{" queue-list "}"
 queue-list     = string [ [ "," ] string ]
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index d6c13470f2eb..cf6d2508cf65 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -645,6 +645,7 @@ struct pf_kpool {
 	int			 tblidx;
 	u_int16_t		 proxy_port[2];
 	u_int8_t		 opts;
+	sa_family_t		 ipv6_nexthop_af;
 };
 
 struct pf_rule_actions {
@@ -2680,7 +2681,7 @@ u_short			 pf_map_addr(sa_family_t, struct pf_krule *,
 			    struct pf_addr *, struct pf_kpool *);
 u_short			 pf_map_addr_sn(u_int8_t, struct pf_krule *,
 			    struct pf_addr *, struct pf_addr *,
-			    sa_family_t *, struct pfi_kkif **nkif,
+			    sa_family_t *, struct pfi_kkif **,
 			    struct pf_addr *, struct pf_kpool *,
 			    pf_sn_types_t);
 int			 pf_get_transaddr_af(struct pf_krule *,
diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
index 585c196391c0..cfc300d99396 100644
--- a/sys/netpfil/pf/if_pfsync.c
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -605,7 +605,8 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
 			rt_kif = rpool_first->kif;
 			/*
 			 * Guess the AF of the route address, FreeBSD 13 does
-			 * not support af-to so it should be safe.
+			 * not support af-to nor prefer-ipv6-nexthop
+			 * so it should be safe.
 			 */
 			rt_af = r->af;
 		} else if (!PF_AZERO(&sp->pfs_1301.rt_addr, sp->pfs_1301.af)) {
@@ -634,8 +635,9 @@ pfsync_state_import(union pfsync_state_union *sp, int flags, int msg_version)
 			}
 			rt = sp->pfs_1400.rt;
 			/*
-			 * Guess the AF of the route address, FreeBSD 13 does
-			 * not support af-to so it should be safe.
+			 * Guess the AF of the route address, FreeBSD 14 does
+			 * not support af-to nor prefer-ipv6-nexthop
+			 * so it should be safe.
 			 */
 			rt_af = sp->pfs_1400.af;
 		}
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index 8cd4fff95b15..4325835c7671 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -5960,7 +5960,9 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm,
 	if (r->rt) {
 		/*
 		 * Set act.rt here instead of in pf_rule_to_actions() because
-		 * it is applied only from the last pass rule.
+		 * it is applied only from the last pass rule. For rules
+		 * with the prefer-ipv6-nexthop option act.rt_af is a hint
+		 * about AF of the forwarded packet and might be changed.
 		 */
 		pd->act.rt = r->rt;
 		if (r->rt == PF_REPLYTO)
@@ -8974,9 +8976,10 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
     struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp)
 {
 	struct mbuf		*m0, *m1, *md;
-	struct route		 ro;
-	const struct sockaddr	*gw = &ro.ro_dst;
-	struct sockaddr_in	*dst;
+	struct route_in6	 ro;
+	union sockaddr_union	 rt_gw;
+	const union sockaddr_union	*gw = (const union sockaddr_union *)&ro.ro_dst;
+	union sockaddr_union	*dst;
 	struct ip		*ip;
 	struct ifnet		*ifp = NULL;
 	int			 error = 0;
@@ -9071,10 +9074,35 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
 	ip = mtod(m0, struct ip *);
 
 	bzero(&ro, sizeof(ro));
-	dst = (struct sockaddr_in *)&ro.ro_dst;
-	dst->sin_family = AF_INET;
-	dst->sin_len = sizeof(struct sockaddr_in);
-	dst->sin_addr.s_addr = pd->act.rt_addr.v4.s_addr;
+	dst = (union sockaddr_union *)&ro.ro_dst;
+	dst->sin.sin_family = AF_INET;
+	dst->sin.sin_len = sizeof(struct sockaddr_in);
+	dst->sin.sin_addr = ip->ip_dst;
+	if (ifp) { /* Only needed in forward direction and route-to */
+		bzero(&rt_gw, sizeof(rt_gw));
+		ro.ro_flags |= RT_HAS_GW;
+		gw = &rt_gw;
+		switch (pd->act.rt_af) {
+#ifdef INET
+		case AF_INET:
+			rt_gw.sin.sin_family = AF_INET;
+			rt_gw.sin.sin_len = sizeof(struct sockaddr_in);
+			rt_gw.sin.sin_addr.s_addr = pd->act.rt_addr.v4.s_addr;
+			break;
+#endif /* INET */
+#ifdef INET6
+		case AF_INET6:
+			rt_gw.sin6.sin6_family = AF_INET6;
+			rt_gw.sin6.sin6_len = sizeof(struct sockaddr_in6);
+			pf_addrcpy((struct pf_addr *)&rt_gw.sin6.sin6_addr,
+			    &pd->act.rt_addr, AF_INET6);
+			break;
+#endif /* INET6 */
+		default:
+			/* Normal af-to without route-to */
+			break;
+		}
+	}
 
 	if (pd->dir == PF_IN) {
 		if (ip->ip_ttl <= IPTTLDEC) {
@@ -9098,10 +9126,10 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
 
 				/* Use the gateway if needed. */
 				if (nh->nh_flags & NHF_GATEWAY) {
-					gw = &nh->gw_sa;
+					gw = (const union sockaddr_union *)&nh->gw_sa;
 					ro.ro_flags |= RT_HAS_GW;
 				} else {
-					dst->sin_addr = ip->ip_dst;
+					dst->sin.sin_addr = ip->ip_dst;
 				}
 
 				/*
@@ -9126,6 +9154,9 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
 		PF_STATE_UNLOCK(s);
 	}
 
+	/* It must have been either set from rt_af or from fib4_lookup */
+	KASSERT(gw->sin.sin_family != 0, ("%s: gw address family undetermined", __func__));
+
 	if (ifp == NULL) {
 		m0 = pd->m;
 		pd->m = NULL;
@@ -9210,9 +9241,11 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
 		m_clrprotoflags(m0);	/* Avoid confusing lower layers. */
 
 		md = m0;
-		error = pf_dummynet_route(pd, s, r, ifp, gw, &md);
+		error = pf_dummynet_route(pd, s, r, ifp,
+		    (const struct sockaddr *)gw, &md);
 		if (md != NULL) {
-			error = (*ifp->if_output)(ifp, md, gw, &ro);
+			error = (*ifp->if_output)(ifp, md,
+			    (const struct sockaddr *)gw, (struct route *)&ro);
 			SDT_PROBE2(pf, ip, route_to, output, ifp, error);
 		}
 		goto done;
@@ -9253,9 +9286,11 @@ pf_route(struct pf_krule *r, struct ifnet *oifp,
 			md = m0;
 			pd->pf_mtag = pf_find_mtag(md);
 			error = pf_dummynet_route(pd, s, r, ifp,
-			    gw, &md);
+			    (const struct sockaddr *)gw, &md);
 			if (md != NULL) {
-				error = (*ifp->if_output)(ifp, md, gw, &ro);
+				error = (*ifp->if_output)(ifp, md,
+				    (const struct sockaddr *)gw,
+				    (struct route *)&ro);
 				SDT_PROBE2(pf, ip, route_to, output, ifp, error);
 			}
 		} else
diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h
index 51b3fd6390e1..8edd5a5110a1 100644
--- a/sys/netpfil/pf/pf.h
+++ b/sys/netpfil/pf/pf.h
@@ -131,6 +131,7 @@ enum	{ PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL,
 #define PF_POOL_TYPEMASK	0x0f
 #define PF_POOL_STICKYADDR	0x20
 #define PF_POOL_ENDPI		0x40
+#define PF_POOL_IPV6NH		0x80
 #define	PF_WSCALE_FLAG		0x80
 #define	PF_WSCALE_MASK		0x0f
 
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index e5da05a958f6..d395730d6a54 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -2276,6 +2276,7 @@ pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
 	rule->nat.cur = TAILQ_FIRST(&rule->nat.list);
 	rule->rdr.cur = TAILQ_FIRST(&rule->rdr.list);
 	rule->route.cur = TAILQ_FIRST(&rule->route.list);
+	rule->route.ipv6_nexthop_af = AF_INET6;
 	TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr,
 	    rule, entries);
 	ruleset->rules[rs_num].inactive.rcount++;
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index bc9e1dc72902..b8b5157c9b15 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -545,11 +545,18 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct pf_addr *saddr,
 	uint64_t		 hashidx;
 	int			 cnt;
 	sa_family_t		 wanted_af;
+	u_int8_t		 pool_type;
+	bool			 prefer_ipv6_nexthop = rpool->opts & PF_POOL_IPV6NH;
 
 	KASSERT(saf != 0, ("%s: saf == 0", __func__));
 	KASSERT(naf != NULL, ("%s: naf = NULL", __func__));
 	KASSERT((*naf) != 0, ("%s: *naf = 0", __func__));
 
+	/*
+	 * Given (*naf) is a hint about AF of the forwarded packet.
+	 * It might be changed if prefer_ipv6_nexthop is enabled and
+	 * the combination of nexthop AF and packet AF allows for it.
+	 */
 	wanted_af = (*naf);
 
 	mtx_lock(&rpool->mtx);
@@ -594,19 +601,38 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct pf_addr *saddr,
 	} else {
 		raddr = &rpool->cur->addr.v.a.addr;
 		rmask = &rpool->cur->addr.v.a.mask;
-		/*
-		 * For single addresses check their address family. Unless they
-		 * have none, which happens when addresses are added with
-		 * the old ioctl mechanism. In such case trust that the address
-		 * has the proper AF.
-		 */
-		if (rpool->cur->af && rpool->cur->af != wanted_af) {
-			reason = PFRES_MAPFAILED;
-			goto done_pool_mtx;
+	}
+
+	/*
+	 * For pools with a single host with the prefer-ipv6-nexthop option
+	 * we can return pool address of any AF, unless the forwarded packet
+	 * is IPv6, then we can return only if pool address is IPv6.
+	 * For non-prefer-ipv6-nexthop we can return pool address only
+	 * of wanted AF, unless the pool address's AF is unknown, which
+	 * happens in case old ioctls have been used to set up the pool.
+	 *
+	 * Round-robin pools have their own logic for retrying next addresses.
+	 */
+	pool_type = rpool->opts & PF_POOL_TYPEMASK;
+	if (pool_type == PF_POOL_NONE || pool_type == PF_POOL_BITMASK ||
+	    ((pool_type == PF_POOL_RANDOM || pool_type == PF_POOL_SRCHASH) &&
+	    rpool->cur->addr.type != PF_ADDR_TABLE &&
+	    rpool->cur->addr.type != PF_ADDR_DYNIFTL)) {
+		if (prefer_ipv6_nexthop) {
+			if (rpool->cur->af == AF_INET && (*naf) == AF_INET6) {
+				reason = PFRES_MAPFAILED;
+				goto done_pool_mtx;
+			}
+			wanted_af = rpool->cur->af;
+		} else {
+			if (rpool->cur->af != 0 && rpool->cur->af != (*naf)) {
+				reason = PFRES_MAPFAILED;
+				goto done_pool_mtx;
+			}
 		}
 	}
 
-	switch (rpool->opts & PF_POOL_TYPEMASK) {
+	switch (pool_type) {
 	case PF_POOL_NONE:
 		pf_addrcpy(naddr, raddr, wanted_af);
 		break;
@@ -631,10 +657,22 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct pf_addr *saddr,
 			else
 				rpool->tblidx = (int)arc4random_uniform(cnt);
 			memset(&rpool->counter, 0, sizeof(rpool->counter));
+			if (prefer_ipv6_nexthop)
+				wanted_af = AF_INET6;
+		retry_other_af_random:
 			if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter,
 			    wanted_af, pf_islinklocal, false)) {
-				reason = PFRES_MAPFAILED;
-				goto done_pool_mtx; /* unsupported */
+				/* Retry with IPv4 nexthop for IPv4 traffic */
+				if (prefer_ipv6_nexthop &&
+				    wanted_af == AF_INET6 &&
+				    (*naf) == AF_INET) {
+					wanted_af = AF_INET;
+					goto retry_other_af_random;
+				} else {
+					 /* no hosts in wanted AF */
+					reason = PFRES_MAPFAILED;
+					goto done_pool_mtx;
+				}
 			}
 			pf_addrcpy(naddr, &rpool->counter, wanted_af);
 		} else if (init_addr != NULL && PF_AZERO(init_addr,
@@ -702,10 +740,22 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct pf_addr *saddr,
 			else
 				rpool->tblidx = (int)(hashidx % cnt);
 			memset(&rpool->counter, 0, sizeof(rpool->counter));
+			if (prefer_ipv6_nexthop)
+				wanted_af = AF_INET6;
+		retry_other_af_srchash:
 			if (pfr_pool_get(kt, &rpool->tblidx, &rpool->counter,
 			    wanted_af, pf_islinklocal, false)) {
-				reason = PFRES_MAPFAILED;
-				goto done_pool_mtx; /* unsupported */
+				/* Retry with IPv4 nexthop for IPv4 traffic */
+				if (prefer_ipv6_nexthop &&
+				    wanted_af == AF_INET6 &&
+				    (*naf) == AF_INET) {
+					wanted_af = AF_INET;
+					goto retry_other_af_srchash;
+				} else {
+					 /* no hosts in wanted AF */
+					reason = PFRES_MAPFAILED;
+					goto done_pool_mtx;
+				}
 			}
 			pf_addrcpy(naddr, &rpool->counter, wanted_af);
 		} else {
@@ -718,6 +768,9 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct pf_addr *saddr,
 	    {
 		struct pf_kpooladdr *acur = rpool->cur;
 
+	retry_other_af_rr:
+		if (prefer_ipv6_nexthop)
+			wanted_af = rpool->ipv6_nexthop_af;
 		if (rpool->cur->addr.type == PF_ADDR_TABLE) {
 			if (!pfr_pool_get(rpool->cur->addr.p.tbl,
 			    &rpool->tblidx, &rpool->counter, wanted_af,
@@ -728,46 +781,55 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct pf_addr *saddr,
 			    &rpool->tblidx, &rpool->counter, wanted_af,
 			    pf_islinklocal, true))
 				goto get_addr;
-		} else if (pf_match_addr(0, raddr, rmask, &rpool->counter,
-		    wanted_af))
+		} else if (rpool->cur->af == wanted_af &&
+		    pf_match_addr(0, raddr, rmask, &rpool->counter, wanted_af))
 			goto get_addr;
-
+		if (prefer_ipv6_nexthop &&
+		    (*naf) == AF_INET && wanted_af == AF_INET6) {
+			/* Reset table index when changing wanted AF. */
+			rpool->tblidx = -1;
+			rpool->ipv6_nexthop_af = AF_INET;
+			goto retry_other_af_rr;
+		}
 	try_next:
+		/* Reset prefer-ipv6-nexthop search to IPv6 when iterating pools. */
+		rpool->ipv6_nexthop_af = AF_INET6;
 		if (TAILQ_NEXT(rpool->cur, entries) == NULL)
 			rpool->cur = TAILQ_FIRST(&rpool->list);
 		else
 			rpool->cur = TAILQ_NEXT(rpool->cur, entries);
+	try_next_ipv6_nexthop_rr:
+		/* Reset table index when iterating pools or changing wanted AF. */
 		rpool->tblidx = -1;
+		if (prefer_ipv6_nexthop)
+			wanted_af = rpool->ipv6_nexthop_af;
 		if (rpool->cur->addr.type == PF_ADDR_TABLE) {
-			if (pfr_pool_get(rpool->cur->addr.p.tbl,
+			if (!pfr_pool_get(rpool->cur->addr.p.tbl,
 			    &rpool->tblidx, &rpool->counter, wanted_af, NULL,
-			    true)) {
-				/* table contains no address of type 'wanted_af' */
-				if (rpool->cur != acur)
-					goto try_next;
-				reason = PFRES_MAPFAILED;
-				goto done_pool_mtx;
-			}
+			    true))
+				goto get_addr;
 		} else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) {
-			if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
-			    &rpool->tblidx, &rpool->counter, wanted_af,
-			    pf_islinklocal, true)) {
-				/* interface has no address of type 'wanted_af' */
-				if (rpool->cur != acur)
-					goto try_next;
-				reason = PFRES_MAPFAILED;
-				goto done_pool_mtx;
-			}
+			if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt,
+			    &rpool->tblidx, &rpool->counter, wanted_af, pf_islinklocal,
+			    true))
+				goto get_addr;
 		} else {
-			raddr = &rpool->cur->addr.v.a.addr;
-			rmask = &rpool->cur->addr.v.a.mask;
-			if (rpool->cur->af && rpool->cur->af != wanted_af) {
-				reason = PFRES_MAPFAILED;
-				goto done_pool_mtx;
+			if (rpool->cur->af == wanted_af) {
+				raddr = &rpool->cur->addr.v.a.addr;
+				rmask = &rpool->cur->addr.v.a.mask;
+				pf_addrcpy(&rpool->counter, raddr, wanted_af);
+				goto get_addr;
 			}
-			pf_addrcpy(&rpool->counter, raddr, wanted_af);
 		}
-
+		if (prefer_ipv6_nexthop &&
+		    (*naf) == AF_INET && wanted_af == AF_INET6) {
+			rpool->ipv6_nexthop_af = AF_INET;
+			goto try_next_ipv6_nexthop_rr;
+		}
+		if (rpool->cur != acur)
+			goto try_next;
+		reason = PFRES_MAPFAILED;
+		goto done_pool_mtx;
 	get_addr:
 		pf_addrcpy(naddr, &rpool->counter, wanted_af);
 		if (init_addr != NULL && PF_AZERO(init_addr, wanted_af))
@@ -777,9 +839,16 @@ pf_map_addr(sa_family_t saf, struct pf_krule *r, struct pf_addr *saddr,
 	    }
 	}
 
+	if (wanted_af == 0) {
+		reason = PFRES_MAPFAILED;
+		goto done_pool_mtx;
+	}
+
 	if (nkif)
 		*nkif = rpool->cur->kif;
 
+	(*naf) = wanted_af;
+
 done_pool_mtx:
 	mtx_unlock(&rpool->mtx);
 
diff --git a/tests/sys/netpfil/pf/route_to.sh b/tests/sys/netpfil/pf/route_to.sh
index 765403dcb79c..872de0dcbb91 100644
--- a/tests/sys/netpfil/pf/route_to.sh
+++ b/tests/sys/netpfil/pf/route_to.sh
@@ -28,6 +28,75 @@
 
 common_dir=$(atf_get_srcdir)/../common
 
+# We need to somehow test if the random algorithm of pf_map_addr() is working.
+# The table or prefix contains multiple IP next-hop addresses, for each one try
+# to establish up to 10 connections. Fail the test if with this many attempts
+# the "good" target has not been chosen. However this choice is random,
+# the check might still occasionally fail.
+check_random() {
+	if [ "$1" = "IPv4" ]; then
+		ping_from="${net_clients_4}.1"
+		ping_to="${host_server_4}"
+	else
+		ping_from="${net_clients_6}::1"
+		ping_to="${host_server_6}"
+	fi
+	good_targets="$2"
+	bad_targets="$3"
+
+	port=42000
+	states=$(mktemp) || exit 1
+	for good_target in $good_targets; do
+		found="no"
+		for attempt in $(seq 1 10); do
+			port=$(( port + 1 ))
+			jexec router pfctl -Fs
+			atf_check -s exit:0 ${common_dir}/pft_ping.py \
+				--sendif ${epair_tester}a --replyif ${epair_tester}a \
+				--fromaddr ${ping_from} --to ${ping_to} \
+				--ping-type=tcp3way --send-sport=${port}
+			jexec router pfctl -qvvss | normalize_pfctl_s > $states
+			cat $states
+			if [ -n "${bad_targets}" ]; then
+				for bad_target in $bad_targets; do
+					if grep -qE "route-to: ${bad_target}@" $states; then
+						atf_fail "Bad target ${bad_target} selected!"
+					fi
+				done
+			fi;
+			if grep -qE "route-to: ${good_target}@" $states; then
+				found=yes
+				break
+			fi
+		done
+		if [ "${found}" = "no" ]; then
+			atf_fail "Target ${good_target} not selected after ${attempt} attempts!"
+		fi
+	done
+}
+
+pf_map_addr_common()
+{
+	setup_router_server_nat64
+
+	# Clients will connect from another network behind the router.
+	# This allows for using multiple source addresses.
+	jexec router route add -6 ${net_clients_6}::/${net_clients_6_mask} ${net_tester_6_host_tester}
+	jexec router route add    ${net_clients_4}.0/${net_clients_4_mask} ${net_tester_4_host_tester}
+
+	# The servers are reachable over additional IP addresses for
+	# testing of tables and subnets. The addresses are non-contiguous
+	# for pf_map_addr() counter tests.
+	for i in 0 1 4 5; do
+		a1=$((24 + i))
+		jexec server1 ifconfig ${epair_server1}b inet  ${net_server1_4}.${a1}/32 alias
+		jexec server1 ifconfig ${epair_server1}b inet6 ${net_server1_6}::42:${i}/128 alias
+		a2=$((40 + i))
+		jexec server2 ifconfig ${epair_server2}b inet  ${net_server2_4}.${a2}/32 alias
+		jexec server2 ifconfig ${epair_server2}b inet6 ${net_server2_6}::42:${i}/128 alias
+	done
+}
+
 atf_test_case "v4" "cleanup"
 v4_head()
 {
@@ -893,36 +962,17 @@ empty_pool_cleanup()
 	pft_cleanup
 }
 
-
 atf_test_case "table_loop" "cleanup"
 
 table_loop_head()
 {
 	atf_set descr 'Check that iterating over tables poperly loops'
 	atf_set require.user root
-	atf_set require.progs python3 scapy
 }
 
 table_loop_body()
 {
-	setup_router_server_nat64
-
-	# Clients will connect from another network behind the router.
-	# This allows for using multiple source addresses.
-	jexec router route add -6 ${net_clients_6}::/${net_clients_6_mask} ${net_tester_6_host_tester}
-	jexec router route add    ${net_clients_4}.0/${net_clients_4_mask} ${net_tester_4_host_tester}
-
-	# The servers are reachable over additional IP addresses for
-	# testing of tables and subnets. The addresses are noncontinougnus
-	# for pf_map_addr() counter tests.
-	for i in 0 1 4 5; do
-		a1=$((24 + i))
-		jexec server1 ifconfig ${epair_server1}b inet  ${net_server1_4}.${a1}/32 alias
-		jexec server1 ifconfig ${epair_server1}b inet6 ${net_server1_6}::42:${i}/128 alias
-		a2=$((40 + i))
-		jexec server2 ifconfig ${epair_server2}b inet  ${net_server2_4}.${a2}/32 alias
-		jexec server2 ifconfig ${epair_server2}b inet6 ${net_server2_6}::42:${i}/128 alias
-	done
+	pf_map_addr_common
 
 	jexec router pfctl -e
 	pft_set_rules router \
@@ -976,6 +1026,612 @@ table_loop_cleanup()
 }
 
 
+atf_test_case "roundrobin" "cleanup"
+
+roundrobin_head()
+{
+	atf_set descr 'multiple gateways of mixed AF, including prefixes and tables, for IPv6 packets'
+	atf_set require.user root
+}
+
+roundrobin_body()
+{
+	pf_map_addr_common
+
+	# The rule is defined as "inet6 proto tcp" so directly given IPv4 hosts
+	# will be removed from the pool by pfctl. Tables will still be loaded
+	# and pf_map_addr() will only use IPv6 addresses from them. It will
+	# iterate over members of the pool and inside of tables and prefixes.
+
+	jexec router pfctl -e
+	pft_set_rules router \
+		"set debug loud" \
+		"set reassemble yes" \
+		"set state-policy if-bound" \
+		"table <rt_targets> { ${net_server2_4}.40/31 ${net_server2_4}.44 ${net_server2_6}::42:0/127 ${net_server2_6}::42:4 }" \
+		"pass in on ${epair_tester}b \
+			route-to { \
+				(${epair_server1}a ${net_server1_4_host_server}) \
+				(${epair_server2}a <rt_targets_empty>) \
+				(${epair_server1}a ${net_server1_6}::42:0/127) \
+				(${epair_server2}a <rt_targets_empty>) \
+				(${epair_server2}a <rt_targets>) \
+			} \
+			inet6 proto tcp \
+			keep state"
+
+	for port in $(seq 1 6); do
+		port=$((4200 + port))
+		atf_check -s exit:0 ${common_dir}/pft_ping.py \
+			--sendif ${epair_tester}a --replyif ${epair_tester}a \
+			--fromaddr ${net_clients_6}::1 --to ${host_server_6} \
+			--ping-type=tcp3way --send-sport=${port}
+	done
+
+	states=$(mktemp) || exit 1
+	jexec router pfctl -qvvss | normalize_pfctl_s > $states
+
+	for state_regexp in \
+		"${epair_tester}b tcp ${host_server_6}\[9\] <- ${net_clients_6}::1\[4201\] .* route-to: ${net_server1_6}::42:0@${epair_server1}a" \
+		"${epair_tester}b tcp ${host_server_6}\[9\] <- ${net_clients_6}::1\[4202\] .* route-to: ${net_server1_6}::42:1@${epair_server1}a" \
+		"${epair_tester}b tcp ${host_server_6}\[9\] <- ${net_clients_6}::1\[4203\] .* route-to: ${net_server2_6}::42:0@${epair_server2}a" \
+		"${epair_tester}b tcp ${host_server_6}\[9\] <- ${net_clients_6}::1\[4204\] .* route-to: ${net_server2_6}::42:1@${epair_server2}a" \
+		"${epair_tester}b tcp ${host_server_6}\[9\] <- ${net_clients_6}::1\[4205\] .* route-to: ${net_server2_6}::42:4@${epair_server2}a" \
+		"${epair_tester}b tcp ${host_server_6}\[9\] <- ${net_clients_6}::1\[4206\] .* route-to: ${net_server1_6}::42:0@${epair_server1}a" \
+	; do
+		grep -qE "${state_regexp}" $states || atf_fail "State not found for '${state_regexp}'"
+	done
+}
+
+roundrobin_cleanup()
+{
+	pft_cleanup
+}
+
+atf_test_case "random_table" "cleanup"
+
+random_table_head()
+{
+	atf_set descr 'Pool with random flag and a table for IPv6'
+	atf_set require.user root
+}
+
+random_table_body()
+{
+	pf_map_addr_common
+
+	# The "random" flag will pick random hosts from the table but will
+	# not dive into prefixes, always choosing the 0th address.
+	# Proper address family will be chosen.
+
+	jexec router pfctl -e
+	pft_set_rules router \
+		"set debug loud" \
+		"set reassemble yes" \
+		"set state-policy if-bound" \
+		"table <rt_targets> { ${net_server2_4}.40/31 ${net_server2_4}.44 ${net_server2_6}::42:0/127 ${net_server2_6}::42:4 }" \
+		"pass in on ${epair_tester}b \
+			route-to { (${epair_server2}a <rt_targets>) } random \
+			inet6 proto tcp \
+			keep state"
+
+	good_targets="${net_server2_6}::42:0 ${net_server2_6}::42:4"
+	bad_targets="${net_server2_6}::42:1"
+	check_random IPv6 "${good_targets}" "${bad_targets}"
+}
+
+random_table_cleanup()
+{
+	pft_cleanup
+}
+
+atf_test_case "random_prefix" "cleanup"
+
+random_prefix_head()
+{
+	atf_set descr 'Pool with random flag and a table for IPv4'
+	atf_set require.user root
+}
+
+random_prefix_body()
+{
+	pf_map_addr_common
+
+	# The "random" flag will pick random hosts from given prefix.
+	# The choice being random makes testing it non-trivial. We do 10
+	# attempts to have each target chosen. Hopefully this is enough to have
+	# this test pass often enough.
+
+	jexec router pfctl -e
+	pft_set_rules router \
+		"set debug loud" \
+		"set reassemble yes" \
+		"set state-policy if-bound" \
+		"pass in on ${epair_tester}b \
+			route-to { (${epair_server2}a ${net_server2_6}::42:0/127) } random \
+			inet6 proto tcp \
+			keep state"
+
+	good_targets="${net_server2_6}::42:0 ${net_server2_6}::42:1"
+	check_random IPv6 "${good_targets}"
+}
+
+random_prefix_cleanup()
+{
+	pft_cleanup
+}
+
+atf_test_case "prefer_ipv6_nexthop_single_ipv4" "cleanup"
+
+prefer_ipv6_nexthop_single_ipv4_head()
+{
+	atf_set descr 'prefer-ipv6-nexthop option for a single IPv4 gateway'
+	atf_set require.user root
+}
+
+prefer_ipv6_nexthop_single_ipv4_body()
+{
+	pf_map_addr_common
+
+	# Basic forwarding test for prefer-ipv6-nexthop pool option.
+	# A single IPv4 gateway will work only for IPv4 traffic.
+
+	jexec router pfctl -e
+	pft_set_rules router \
+		"set reassemble yes" \
*** 565 LINES SKIPPED ***