git: 08d0ebe560a8 - stable/14 - pf: Let rdr rules modify the src port if doing so would avoid a conflict

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Wed, 20 Nov 2024 21:41:22 UTC
The branch stable/14 has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=08d0ebe560a89db189c36dcba270de33fb2c0965

commit 08d0ebe560a89db189c36dcba270de33fb2c0965
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2024-08-19 14:08:44 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2024-11-20 21:41:09 +0000

    pf: Let rdr rules modify the src port if doing so would avoid a conflict
    
    If NAT rules cause inbound connections to different external IPs to be
    mapped to the same internal IP, and some application uses the same
    source port for multiple such connections, rdr translation may result in
    conflicts that cause some of the connections to be dropped.
    
    Address this by letting rdr rules detect state conflicts and modulate
    the source port to avoid them.
    
    Reviewed by:    kp, allanjude
    MFC after:      3 months
    Sponsored by:   Klara, Inc.
    Sponsored by:   Modirum
    Differential Revision:  https://reviews.freebsd.org/D44488
    
    (cherry picked from commit 9897a66923a3e79c22fcbd4bc80afae9eb9f277c)
---
 share/man/man5/pf.conf.5            |   9 +++-
 sys/netpfil/pf/pf_lb.c              |  70 ++++++++++++++++++++++---
 tests/sys/netpfil/pf/Makefile       |   1 +
 tests/sys/netpfil/pf/rdr-srcport.py |  20 ++++++++
 tests/sys/netpfil/pf/rdr.sh         | 100 ++++++++++++++++++++++++++++++++++++
 5 files changed, 191 insertions(+), 9 deletions(-)

diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5
index 7e2db95fd961..d9e9127f8a84 100644
--- a/share/man/man5/pf.conf.5
+++ b/share/man/man5/pf.conf.5
@@ -27,7 +27,7 @@
 .\" ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
-.Dd November 17, 2023
+.Dd June 24, 2024
 .Dt PF.CONF 5
 .Os
 .Sh NAME
@@ -1403,9 +1403,14 @@ or
 .Xr udp 4
 connections; implicitly in the case of
 .Ar nat
-rules and explicitly in the case of
+rules and both implicitly and explicitly in the case of
 .Ar rdr
 rules.
+A
+.Ar rdr
+rule may cause the source port to be modified if doing so avoids a conflict
+with an existing connection.
+A random source port in the range 50001-65535 is chosen in this case.
 Port numbers are never translated with a
 .Ar binat
 rule.
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index 06c82569ad6c..7524bf138e6d 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -600,7 +600,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 {
 	struct pf_krule	*r = NULL;
 	struct pf_addr	*naddr;
-	uint16_t	*nport;
+	uint16_t	*nportp;
 	uint16_t	 low, high;
 
 	PF_RULES_RASSERT();
@@ -643,9 +643,8 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 		return (NULL);
 	}
 
-	/* XXX We only modify one side for now. */
 	naddr = &(*nkp)->addr[1];
-	nport = &(*nkp)->port[1];
+	nportp = &(*nkp)->port[1];
 
 	switch (r->action) {
 	case PF_NAT:
@@ -658,7 +657,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 		}
 		if (r->rpool.mape.offset > 0) {
 			if (pf_get_mape_sport(pd->af, pd->proto, r, saddr,
-			    sport, daddr, dport, naddr, nport, sn)) {
+			    sport, daddr, dport, naddr, nportp, sn)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: MAP-E port allocation (%u/%u/%u)"
 				    " failed\n",
@@ -668,7 +667,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 				goto notrans;
 			}
 		} else if (pf_get_sport(pd->af, pd->proto, r, saddr, sport,
-		    daddr, dport, naddr, nport, low, high, sn)) {
+		    daddr, dport, naddr, nportp, low, high, sn)) {
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("pf: NAT proxy port allocation (%u-%u) failed\n",
 			    r->rpool.proxy_port[0], r->rpool.proxy_port[1]));
@@ -742,6 +741,9 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 		}
 		break;
 	case PF_RDR: {
+		struct pf_state_key_cmp key;
+		uint16_t cut, low, high, nport;
+
 		if (pf_map_addr(pd->af, r, saddr, naddr, NULL, NULL, sn))
 			goto notrans;
 		if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
@@ -762,9 +764,63 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 			/* Wrap around if necessary. */
 			if (tmp_nport > 65535)
 				tmp_nport -= 65535;
-			*nport = htons((uint16_t)tmp_nport);
+			nport = htons((uint16_t)tmp_nport);
 		} else if (r->rpool.proxy_port[0])
-			*nport = htons(r->rpool.proxy_port[0]);
+			nport = htons(r->rpool.proxy_port[0]);
+		else
+			nport = dport;
+
+		/*
+		 * Update the destination port.
+		 */
+		*nportp = nport;
+
+		/*
+		 * Do we have a source port conflict in the stack state?  Try to
+		 * modulate the source port if so.  Note that this is racy since
+		 * the state lookup may not find any matches here but will once
+		 * pf_create_state() actually instantiates the state.
+		 */
+		bzero(&key, sizeof(key));
+		key.af = pd->af;
+		key.proto = pd->proto;
+		key.port[0] = sport;
+		PF_ACPY(&key.addr[0], saddr, key.af);
+		key.port[1] = nport;
+		PF_ACPY(&key.addr[1], naddr, key.af);
+
+		if (!pf_find_state_all_exists(&key, PF_OUT))
+			break;
+
+		low = 50001;	/* XXX-MJ PF_NAT_PROXY_PORT_LOW/HIGH */
+		high = 65535;
+		cut = arc4random() % (1 + high - low) + low;
+		for (uint32_t tmp = cut;
+		    tmp <= high && tmp <= UINT16_MAX; tmp++) {
+			key.port[0] = htons(tmp);
+			if (!pf_find_state_all_exists(&key, PF_OUT)) {
+				/* Update the source port. */
+				(*nkp)->port[0] = htons(tmp);
+				goto out;
+			}
+		}
+		for (uint32_t tmp = cut - 1; tmp >= low; tmp--) {
+			key.port[0] = htons(tmp);
+			if (!pf_find_state_all_exists(&key, PF_OUT)) {
+				/* Update the source port. */
+				(*nkp)->port[0] = htons(tmp);
+				goto out;
+			}
+		}
+
+		DPFPRINTF(PF_DEBUG_MISC,
+		    ("pf: RDR source port allocation failed\n"));
+		if (0) {
+out:
+			DPFPRINTF(PF_DEBUG_MISC,
+			    ("pf: RDR source port allocation %u->%u\n",
+			    ntohs(sport), ntohs((*nkp)->port[0])));
+		}
 		break;
 	}
 	default:
diff --git a/tests/sys/netpfil/pf/Makefile b/tests/sys/netpfil/pf/Makefile
index cc61a48f80ea..dc77fd67b2c6 100644
--- a/tests/sys/netpfil/pf/Makefile
+++ b/tests/sys/netpfil/pf/Makefile
@@ -64,6 +64,7 @@ ${PACKAGE}FILES+=	CVE-2019-5597.py \
 			frag-overreplace.py \
 			pfsync_defer.py \
 			pft_ether.py \
+			rdr-srcport.py \
 			utils.subr
 
 ${PACKAGE}FILESMODE_CVE-2019-5597.py=	0555
diff --git a/tests/sys/netpfil/pf/rdr-srcport.py b/tests/sys/netpfil/pf/rdr-srcport.py
new file mode 100644
index 000000000000..633580582711
--- /dev/null
+++ b/tests/sys/netpfil/pf/rdr-srcport.py
@@ -0,0 +1,20 @@
+#
+# A helper script which accepts TCP connections and writes the remote port
+# number to the stream.
+#
+
+import socket
+
+def main():
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    s.bind(('0.0.0.0', 8888))
+    s.listen(5)
+
+    while True:
+        cs, addr = s.accept()
+        cs.sendall(str(addr[1]).encode())
+        cs.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/sys/netpfil/pf/rdr.sh b/tests/sys/netpfil/pf/rdr.sh
index 5e60b97c653b..07aca78cc650 100644
--- a/tests/sys/netpfil/pf/rdr.sh
+++ b/tests/sys/netpfil/pf/rdr.sh
@@ -121,7 +121,107 @@ tcp_v6_cleanup()
 	pft_cleanup
 }
 
+
+atf_test_case "srcport" "cleanup"
+srcport_head()
+{
+	atf_set descr 'TCP rdr srcport modulation'
+	atf_set require.user root
+	atf_set require.progs python3
+	atf_set timeout 9999
+}
+
+#
+# Test that rdr works for multiple TCP with same srcip and srcport.
+#
+# Four jails, a, b, c, d, are used:
+# - jail d runs a server on port 8888,
+# - jail a makes connections to the server, routed through jails b and c,
+# - jail b uses NAT to rewrite source addresses and ports to the same 2-tuple,
+#   avoiding the need to use SO_REUSEADDR in jail a,
+# - jail c uses a redirect rule to map the destination address to the same
+#   address and port, resulting in a NAT state conflict.
+#
+# In this case, the rdr rule should also rewrite the source port (again) to
+# resolve the state conflict.
+#
+srcport_body()
+{
+	pft_init
+
+	j="rdr:srcport"
+	epair1=$(vnet_mkepair)
+	epair2=$(vnet_mkepair)
+	epair3=$(vnet_mkepair)
+
+	echo $epair_one
+	echo $epair_two
+
+	vnet_mkjail ${j}a ${epair1}a
+	vnet_mkjail ${j}b ${epair1}b ${epair2}a
+	vnet_mkjail ${j}c ${epair2}b ${epair3}a
+	vnet_mkjail ${j}d ${epair3}b
+
+	# configure addresses for a
+	jexec ${j}a ifconfig lo0 up
+	jexec ${j}a ifconfig ${epair1}a inet 198.51.100.50/24 up
+	jexec ${j}a ifconfig ${epair1}a inet alias 198.51.100.51/24
+	jexec ${j}a ifconfig ${epair1}a inet alias 198.51.100.52/24
+
+	# configure addresses for b
+	jexec ${j}b ifconfig lo0 up
+	jexec ${j}b ifconfig ${epair1}b inet 198.51.100.1/24 up
+	jexec ${j}b ifconfig ${epair2}a inet 198.51.101.2/24 up
+
+	# configure addresses for c
+	jexec ${j}c ifconfig lo0 up
+	jexec ${j}c ifconfig ${epair2}b inet 198.51.101.3/24 up
+	jexec ${j}c ifconfig ${epair2}b inet alias 198.51.101.4/24
+	jexec ${j}c ifconfig ${epair2}b inet alias 198.51.101.5/24
+	jexec ${j}c ifconfig ${epair3}a inet 203.0.113.1/24 up
+
+	# configure addresses for d
+	jexec ${j}d ifconfig lo0 up
+	jexec ${j}d ifconfig ${epair3}b inet 203.0.113.50/24 up
+
+	jexec ${j}b sysctl net.inet.ip.forwarding=1
+	jexec ${j}c sysctl net.inet.ip.forwarding=1
+	jexec ${j}b pfctl -e
+	jexec ${j}c pfctl -e
+
+	pft_set_rules ${j}b \
+		"set debug misc" \
+		"nat on ${epair2}a inet from 198.51.100.0/24 to any -> ${epair2}a static-port"
+
+	pft_set_rules ${j}c \
+		"set debug misc" \
+		"rdr on ${epair2}b proto tcp from any to ${epair2}b port 7777 -> 203.0.113.50 port 8888"
+
+	jexec ${j}a route add default 198.51.100.1
+	jexec ${j}c route add 198.51.100.0/24 198.51.101.2
+	jexec ${j}d route add 198.51.101.0/24 203.0.113.1
+
+	jexec ${j}d python3 $(atf_get_srcdir)/rdr-srcport.py &
+        sleep 1
+
+	echo a | jexec ${j}a nc -w 3 -s 198.51.100.50 -p 1234 198.51.101.3 7777 > port1
+
+	jexec ${j}a nc -s 198.51.100.51 -p 1234 198.51.101.4 7777 > port2 &
+	jexec ${j}a nc -s 198.51.100.52 -p 1234 198.51.101.5 7777 > port3 &
+	sleep 1
+
+	atf_check -o inline:"1234" cat port1
+	atf_check -o match:"[0-9]+" -o not-inline:"1234" cat port2
+	atf_check -o match:"[0-9]+" -o not-inline:"1234" cat port3
+}
+
+srcport_cleanup()
+{
+	pft_cleanup
+}
+
 atf_init_test_cases()
 {
 	atf_add_test_case "tcp_v6"
+	atf_add_test_case "srcport"
 }