git: aa6163ff658b - stable/14 - tcp: Add a sysctl to modify listening socket FIB inheritance
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Fri, 21 Feb 2025 01:57:17 UTC
The branch stable/14 has been updated by markj:
URL: https://cgit.FreeBSD.org/src/commit/?id=aa6163ff658b65f4a58d7603a7f7cfd1c39ee086
commit aa6163ff658b65f4a58d7603a7f7cfd1c39ee086
Author: Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2025-02-06 14:14:49 +0000
Commit: Mark Johnston <markj@FreeBSD.org>
CommitDate: 2025-02-21 01:04:50 +0000
tcp: Add a sysctl to modify listening socket FIB inheritance
Introduce the net.inet.tcp.bind_all_fibs tunable, set to 1 by default
for compatibility with current behaviour. When set to 0, all TCP
listening sockets are private to their FIB. Inbound connection requests
will only succeed if a matching inpcb is bound to the same FIB as the
request.
No functional change intended, as the new behaviour is not enabled by
default.
Reviewed by: glebius
MFC after: 2 weeks
Sponsored by: Klara, Inc.
Sponsored by: Stormshield
Differential Revision: https://reviews.freebsd.org/D48663
(cherry picked from commit 5dc99e9bb985dce58e8fc85c09ef4e49bf051971)
---
share/man/man4/tcp.4 | 32 +++++++++++++++++++++++++++++++-
sys/netinet/tcp_input.c | 8 +++++++-
sys/netinet/tcp_usrreq.c | 12 ++++++++----
sys/netinet/tcp_var.h | 2 ++
4 files changed, 48 insertions(+), 6 deletions(-)
diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index da88a30bf86a..bf86a8d35feb 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -33,7 +33,7 @@
.\"
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\"
-.Dd July 28, 2024
+.Dd January 10, 2025
.Dt TCP 4
.Os
.Sh NAME
@@ -202,6 +202,35 @@ The alternate TCP stack must already be loaded in the kernel.
To list the available TCP stacks, see
.Va functions_available
in the
+.Ss FIB support
+TCP sockets are FIB-aware.
+They inherit the FIB of the process which created the socket, or that of the
+listening socket for sockets created by
+.Xr accept 2 .
+In particular, the FIB is not inherited from that of the interface where the
+initiating SYN packet was received.
+When an incoming connection request arrives at a listening socket, the initial
+handshake also occurs in the FIB of the listening socket, not that of the
+received packet.
+.Pp
+By default, a TCP listening socket can accept connections originating from any
+FIB.
+If the
+.Va net.inet.tcp.bind_all_fibs
+tunable is set to 0, a listening socket will only accept connections
+originating
+from the listening socket's FIB.
+Connection requests from other FIBs will be treated as though there is no
+listening socket for the destination address and port.
+In this mode, multiple listening sockets owned by the same user can listen on
+the same address and port so long as they belong to different FIBs, similar to
+the behavior of the
+.Dv SO_REUSEPORT
+socket option.
+If the tunable is set to 0, all sockets added to a load-balancing group created
+with the
+.Dv SO_REUSEPORT_LB
+socket option must belong to the same FIB.
.Sx MIB (sysctl) Variables
section further down.
To list the default TCP stack, see
@@ -1048,6 +1077,7 @@ when trying to use a TCP function block that is not available;
.El
.Sh SEE ALSO
.Xr getsockopt 2 ,
+.Xr setfib 2 ,
.Xr socket 2 ,
.Xr stats 3 ,
.Xr sysctl 3 ,
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 83f85a50ed40..fe67710fadd6 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -137,6 +137,11 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_log_in_vain), 0,
"Log all incoming TCP segments to closed ports");
+VNET_DEFINE(int, tcp_bind_all_fibs) = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN,
+ &VNET_NAME(tcp_bind_all_fibs), 0,
+ "Bound sockets receive traffic from all FIBs");
+
VNET_DEFINE(int, blackhole) = 0;
#define V_blackhole VNET(blackhole)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
@@ -832,7 +837,8 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
*/
lookupflag = INPLOOKUP_WILDCARD |
((thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
- INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB);
+ INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB) |
+ (V_tcp_bind_all_fibs ? 0 : INPLOOKUP_FIB);
findpcb:
tp = NULL;
#ifdef INET6
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 34a4bc15ff0d..67645827cb58 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -264,7 +264,8 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
goto out;
}
INP_HASH_WLOCK(&V_tcbinfo);
- error = in_pcbbind(inp, sinp, 0, td->td_ucred);
+ error = in_pcbbind(inp, sinp, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB,
+ td->td_ucred);
INP_HASH_WUNLOCK(&V_tcbinfo);
out:
tcp_bblog_pru(tp, PRU_BIND, error);
@@ -338,7 +339,8 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
}
}
#endif
- error = in6_pcbbind(inp, sin6, 0, td->td_ucred);
+ error = in6_pcbbind(inp, sin6, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB,
+ td->td_ucred);
INP_HASH_WUNLOCK(&V_tcbinfo);
out:
if (error != 0)
@@ -378,7 +380,8 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
}
if (inp->inp_lport == 0) {
INP_HASH_WLOCK(&V_tcbinfo);
- error = in_pcbbind(inp, NULL, 0, td->td_ucred);
+ error = in_pcbbind(inp, NULL,
+ V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
INP_HASH_WUNLOCK(&V_tcbinfo);
}
if (error == 0) {
@@ -435,7 +438,8 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
inp->inp_vflag &= ~INP_IPV4;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
inp->inp_vflag |= INP_IPV4;
- error = in6_pcbbind(inp, NULL, 0, td->td_ucred);
+ error = in6_pcbbind(inp, NULL,
+ V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
}
INP_HASH_WUNLOCK(&V_tcbinfo);
if (error == 0) {
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index b75210acad33..d5f7f0d4dc19 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1283,6 +1283,7 @@ VNET_DECLARE(uint32_t, tcp_ack_war_time_window);
VNET_DECLARE(int, tcp_autorcvbuf_max);
VNET_DECLARE(int, tcp_autosndbuf_inc);
VNET_DECLARE(int, tcp_autosndbuf_max);
+VNET_DECLARE(int, tcp_bind_all_fibs);
VNET_DECLARE(int, tcp_delack_enabled);
VNET_DECLARE(int, tcp_do_autorcvbuf);
VNET_DECLARE(int, tcp_do_autosndbuf);
@@ -1335,6 +1336,7 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)
#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc)
#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max)
+#define V_tcp_bind_all_fibs VNET(tcp_bind_all_fibs)
#define V_tcp_delack_enabled VNET(tcp_delack_enabled)
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf)