git: 9897a66923a3 - main - pf: Let rdr rules modify the src port if doing so would avoid a conflict

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Mon, 19 Aug 2024 14:37:38 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=9897a66923a3e79c22fcbd4bc80afae9eb9f277c

commit 9897a66923a3e79c22fcbd4bc80afae9eb9f277c
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2024-08-19 14:08:44 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2024-08-19 14:37:27 +0000

    pf: Let rdr rules modify the src port if doing so would avoid a conflict
    
    If NAT rules cause inbound connections to different external IPs to be
    mapped to the same internal IP, and some application uses the same
    source port for multiple such connections, rdr translation may result in
    conflicts that cause some of the connections to be dropped.
    
    Address this by letting rdr rules detect state conflicts and modulate
    the source port to avoid them.
    
    Reviewed by:    kp, allanjude
    MFC after:      3 months
    Sponsored by:   Klara, Inc.
    Sponsored by:   Modirum
    Differential Revision:  https://reviews.freebsd.org/D44488
---
 share/man/man5/pf.conf.5            |   9 +++-
 sys/netpfil/pf/pf_lb.c              |  70 ++++++++++++++++++++++---
 tests/sys/netpfil/pf/Makefile       |   1 +
 tests/sys/netpfil/pf/rdr-srcport.py |  20 ++++++++
 tests/sys/netpfil/pf/rdr.sh         | 100 ++++++++++++++++++++++++++++++++++++
 5 files changed, 191 insertions(+), 9 deletions(-)

diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5
index da55f00293bb..f04b0799741e 100644
--- a/share/man/man5/pf.conf.5
+++ b/share/man/man5/pf.conf.5
@@ -27,7 +27,7 @@
 .\" ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
-.Dd June 6, 2024
+.Dd June 24, 2024
 .Dt PF.CONF 5
 .Os
 .Sh NAME
@@ -1400,9 +1400,14 @@ or
 .Xr udp 4
 connections; implicitly in the case of
 .Ar nat
-rules and explicitly in the case of
+rules and both implicitly and explicitly in the case of
 .Ar rdr
 rules.
+A
+.Ar rdr
+rule may cause the source port to be modified if doing so avoids a conflict
+with an existing connection.
+A random source port in the range 50001-65535 is chosen in this case.
 Port numbers are never translated with a
 .Ar binat
 rule.
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index 4fcad7e578a8..4b703d3d02da 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -600,7 +600,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 {
 	struct pf_krule	*r = NULL;
 	struct pf_addr	*naddr;
-	uint16_t	*nport;
+	uint16_t	*nportp;
 	uint16_t	 low, high;
 
 	PF_RULES_RASSERT();
@@ -643,9 +643,8 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 		return (NULL);
 	}
 
-	/* XXX We only modify one side for now. */
 	naddr = &(*nkp)->addr[1];
-	nport = &(*nkp)->port[1];
+	nportp = &(*nkp)->port[1];
 
 	switch (r->action) {
 	case PF_NAT:
@@ -658,7 +657,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 		}
 		if (r->rpool.mape.offset > 0) {
 			if (pf_get_mape_sport(pd->af, pd->proto, r, saddr,
-			    sport, daddr, dport, naddr, nport, sn)) {
+			    sport, daddr, dport, naddr, nportp, sn)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: MAP-E port allocation (%u/%u/%u)"
 				    " failed\n",
@@ -668,7 +667,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 				goto notrans;
 			}
 		} else if (pf_get_sport(pd->af, pd->proto, r, saddr, sport,
-		    daddr, dport, naddr, nport, low, high, sn)) {
+		    daddr, dport, naddr, nportp, low, high, sn)) {
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("pf: NAT proxy port allocation (%u-%u) failed\n",
 			    r->rpool.proxy_port[0], r->rpool.proxy_port[1]));
@@ -742,6 +741,9 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 		}
 		break;
 	case PF_RDR: {
+		struct pf_state_key_cmp key;
+		uint16_t cut, low, high, nport;
+
 		if (pf_map_addr(pd->af, r, saddr, naddr, NULL, NULL, sn))
 			goto notrans;
 		if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK)
@@ -762,9 +764,63 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
 			/* Wrap around if necessary. */
 			if (tmp_nport > 65535)
 				tmp_nport -= 65535;
-			*nport = htons((uint16_t)tmp_nport);
+			nport = htons((uint16_t)tmp_nport);
 		} else if (r->rpool.proxy_port[0])
-			*nport = htons(r->rpool.proxy_port[0]);
+			nport = htons(r->rpool.proxy_port[0]);
+		else
+			nport = dport;
+
+		/*
+		 * Update the destination port.
+		 */
+		*nportp = nport;
+
+		/*
+		 * Do we have a source port conflict in the stack state?  Try to
+		 * modulate the source port if so.  Note that this is racy since
+		 * the state lookup may not find any matches here but will once
+		 * pf_create_state() actually instantiates the state.
+		 */
+		bzero(&key, sizeof(key));
+		key.af = pd->af;
+		key.proto = pd->proto;
+		key.port[0] = sport;
+		PF_ACPY(&key.addr[0], saddr, key.af);
+		key.port[1] = nport;
+		PF_ACPY(&key.addr[1], naddr, key.af);
+
+		if (!pf_find_state_all_exists(&key, PF_OUT))
+			break;
+
+		low = 50001;	/* XXX-MJ PF_NAT_PROXY_PORT_LOW/HIGH */
+		high = 65535;
+		cut = arc4random() % (1 + high - low) + low;
+		for (uint32_t tmp = cut;
+		    tmp <= high && tmp <= UINT16_MAX; tmp++) {
+			key.port[0] = htons(tmp);
+			if (!pf_find_state_all_exists(&key, PF_OUT)) {
+				/* Update the source port. */
+				(*nkp)->port[0] = htons(tmp);
+				goto out;
+			}
+		}
+		for (uint32_t tmp = cut - 1; tmp >= low; tmp--) {
+			key.port[0] = htons(tmp);
+			if (!pf_find_state_all_exists(&key, PF_OUT)) {
+				/* Update the source port. */
+				(*nkp)->port[0] = htons(tmp);
+				goto out;
+			}
+		}
+
+		DPFPRINTF(PF_DEBUG_MISC,
+		    ("pf: RDR source port allocation failed\n"));
+		if (0) {
+out:
+			DPFPRINTF(PF_DEBUG_MISC,
+			    ("pf: RDR source port allocation %u->%u\n",
+			    ntohs(sport), ntohs((*nkp)->port[0])));
+		}
 		break;
 	}
 	default:
diff --git a/tests/sys/netpfil/pf/Makefile b/tests/sys/netpfil/pf/Makefile
index 4a16642a967b..2b3cb9fbd858 100644
--- a/tests/sys/netpfil/pf/Makefile
+++ b/tests/sys/netpfil/pf/Makefile
@@ -73,6 +73,7 @@ ${PACKAGE}FILES+=	CVE-2019-5597.py \
 			pfsync_defer.py \
 			pft_ether.py \
 			pft_read_ipfix.py \
+			rdr-srcport.py \
 			utils.subr
 
 ${PACKAGE}FILESMODE_CVE-2019-5597.py=	0555
diff --git a/tests/sys/netpfil/pf/rdr-srcport.py b/tests/sys/netpfil/pf/rdr-srcport.py
new file mode 100644
index 000000000000..633580582711
--- /dev/null
+++ b/tests/sys/netpfil/pf/rdr-srcport.py
@@ -0,0 +1,20 @@
+#
+# A helper script which accepts TCP connections and writes the remote port
+# number to the stream.
+#
+
+import socket
+
+def main():
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    s.bind(('0.0.0.0', 8888))
+    s.listen(5)
+
+    while True:
+        cs, addr = s.accept()
+        cs.sendall(str(addr[1]).encode())
+        cs.close()
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/sys/netpfil/pf/rdr.sh b/tests/sys/netpfil/pf/rdr.sh
index b7ec80b4d85e..135bfd42c1f4 100644
--- a/tests/sys/netpfil/pf/rdr.sh
+++ b/tests/sys/netpfil/pf/rdr.sh
@@ -121,7 +121,107 @@ tcp_v6_cleanup()
 	pft_cleanup
 }
 
+
+atf_test_case "srcport" "cleanup"
+srcport_head()
+{
+	atf_set descr 'TCP rdr srcport modulation'
+	atf_set require.user root
+	atf_set require.progs python3
+	atf_set timeout 9999
+}
+
+#
+# Test that rdr works for multiple TCP with same srcip and srcport.
+#
+# Four jails, a, b, c, d, are used:
+# - jail d runs a server on port 8888,
+# - jail a makes connections to the server, routed through jails b and c,
+# - jail b uses NAT to rewrite source addresses and ports to the same 2-tuple,
+#   avoiding the need to use SO_REUSEADDR in jail a,
+# - jail c uses a redirect rule to map the destination address to the same
+#   address and port, resulting in a NAT state conflict.
+#
+# In this case, the rdr rule should also rewrite the source port (again) to
+# resolve the state conflict.
+#
+srcport_body()
+{
+	pft_init
+
+	j="rdr:srcport"
+	epair1=$(vnet_mkepair)
+	epair2=$(vnet_mkepair)
+	epair3=$(vnet_mkepair)
+
+	echo $epair_one
+	echo $epair_two
+
+	vnet_mkjail ${j}a ${epair1}a
+	vnet_mkjail ${j}b ${epair1}b ${epair2}a
+	vnet_mkjail ${j}c ${epair2}b ${epair3}a
+	vnet_mkjail ${j}d ${epair3}b
+
+	# configure addresses for a
+	jexec ${j}a ifconfig lo0 up
+	jexec ${j}a ifconfig ${epair1}a inet 198.51.100.50/24 up
+	jexec ${j}a ifconfig ${epair1}a inet alias 198.51.100.51/24
+	jexec ${j}a ifconfig ${epair1}a inet alias 198.51.100.52/24
+
+	# configure addresses for b
+	jexec ${j}b ifconfig lo0 up
+	jexec ${j}b ifconfig ${epair1}b inet 198.51.100.1/24 up
+	jexec ${j}b ifconfig ${epair2}a inet 198.51.101.2/24 up
+
+	# configure addresses for c
+	jexec ${j}c ifconfig lo0 up
+	jexec ${j}c ifconfig ${epair2}b inet 198.51.101.3/24 up
+	jexec ${j}c ifconfig ${epair2}b inet alias 198.51.101.4/24
+	jexec ${j}c ifconfig ${epair2}b inet alias 198.51.101.5/24
+	jexec ${j}c ifconfig ${epair3}a inet 203.0.113.1/24 up
+
+	# configure addresses for d
+	jexec ${j}d ifconfig lo0 up
+	jexec ${j}d ifconfig ${epair3}b inet 203.0.113.50/24 up
+
+	jexec ${j}b sysctl net.inet.ip.forwarding=1
+	jexec ${j}c sysctl net.inet.ip.forwarding=1
+	jexec ${j}b pfctl -e
+	jexec ${j}c pfctl -e
+
+	pft_set_rules ${j}b \
+		"set debug misc" \
+		"nat on ${epair2}a inet from 198.51.100.0/24 to any -> ${epair2}a static-port"
+
+	pft_set_rules ${j}c \
+		"set debug misc" \
+		"rdr on ${epair2}b proto tcp from any to ${epair2}b port 7777 -> 203.0.113.50 port 8888"
+
+	jexec ${j}a route add default 198.51.100.1
+	jexec ${j}c route add 198.51.100.0/24 198.51.101.2
+	jexec ${j}d route add 198.51.101.0/24 203.0.113.1
+
+	jexec ${j}d python3 $(atf_get_srcdir)/rdr-srcport.py &
+        sleep 1
+
+	echo a | jexec ${j}a nc -w 3 -s 198.51.100.50 -p 1234 198.51.101.3 7777 > port1
+
+	jexec ${j}a nc -s 198.51.100.51 -p 1234 198.51.101.4 7777 > port2 &
+	jexec ${j}a nc -s 198.51.100.52 -p 1234 198.51.101.5 7777 > port3 &
+	sleep 1
+
+	atf_check -o inline:"1234" cat port1
+	atf_check -o match:"[0-9]+" -o not-inline:"1234" cat port2
+	atf_check -o match:"[0-9]+" -o not-inline:"1234" cat port3
+}
+
+srcport_cleanup()
+{
+	pft_cleanup
+}
+
 atf_init_test_cases()
 {
 	atf_add_test_case "tcp_v6"
+	atf_add_test_case "srcport"
 }