git: 8624f4347e81 - main - divert: declare PF_DIVERT domain and stop abusing PF_INET

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Tue, 30 Aug 2022 22:49:24 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=8624f4347e8133911b0554e816f6bedb56dc5fb3

commit 8624f4347e8133911b0554e816f6bedb56dc5fb3
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2022-08-30 22:09:21 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2022-08-30 22:09:21 +0000

    divert: declare PF_DIVERT domain and stop abusing PF_INET
    
    divert(4) is not an IPv4 protocol.  It is a socket used to
    divert packets from ipfw(4) to userland and to re-inject them.
    Today it can divert and re-inject IPv4 and IPv6 packets, but
    potentially it is not limited to these two protocols.
    IPPROTO_DIVERT is not one of the known IP protocols; it doesn't
    even fit into a u_char.  I guess divert(4) was implemented the
    way it was basically because that was the easiest way to do it
    back when protocols for sockets were intertwined with IP
    protocols and domains were statically compiled in.
    
    Moving divert(4) out of inetsw accomplishes two important things:
    
    1) IPDIVERT gets much closer to not depending on INET.
       This will be finalized in following changes.
    2) A divert socket no longer aliases with a raw IPv4 socket.
       The domain/proto selection code won't need a hack for SOCK_RAW,
       and the multiple entries in inetsw implementing different
       flavors of raw socket can merge into one, without requiring
       raw IPv4 to be the last member of dom_protosw.
    
    Differential revision:  https://reviews.freebsd.org/D36379
---
 lib/libc/sys/socket.2         |  4 +++-
 share/examples/netgraph/ngctl |  6 +++---
 share/man/man4/divert.4       | 38 +++++++++++++++++++++++---------------
 sys/kern/uipc_socket.c        | 11 +++++++++++
 sys/netgraph/ng_ksocket.c     |  2 +-
 sys/netinet/in_mcast.c        | 20 ++++++--------------
 sys/netinet/ip_divert.c       | 23 ++++++++++++++---------
 sys/netinet6/in6_mcast.c      | 20 ++++++--------------
 sys/sys/socket.h              |  4 +++-
 usr.bin/netstat/inet.c        |  7 +++----
 usr.bin/netstat/main.c        |  2 +-
 11 files changed, 74 insertions(+), 63 deletions(-)

diff --git a/lib/libc/sys/socket.2 b/lib/libc/sys/socket.2
index 8ced1f0ba930..1eceabbf6fd4 100644
--- a/lib/libc/sys/socket.2
+++ b/lib/libc/sys/socket.2
@@ -28,7 +28,7 @@
 .\"     From: @(#)socket.2	8.1 (Berkeley) 6/4/93
 .\" $FreeBSD$
 .\"
-.Dd August 26, 2022
+.Dd August 30, 2022
 .Dt SOCKET 2
 .Os
 .Sh NAME
@@ -60,6 +60,7 @@ PF_LOCAL	Host-internal protocols (alias for PF_UNIX),
 PF_UNIX		Host-internal protocols,
 PF_INET		Internet version 4 protocols,
 PF_INET6	Internet version 6 protocols,
+PF_DIVERT	Firewall packet diversion/re-injection,
 PF_ROUTE	Internal routing protocol,
 PF_KEY		Internal key-management function,
 PF_NETGRAPH	Netgraph sockets,
@@ -283,6 +284,7 @@ The socket type is not supported by the protocol.
 .Xr accept 2 ,
 .Xr bind 2 ,
 .Xr connect 2 ,
+.Xr divert 4 ,
 .Xr getpeername 2 ,
 .Xr getsockname 2 ,
 .Xr getsockopt 2 ,
diff --git a/share/examples/netgraph/ngctl b/share/examples/netgraph/ngctl
index e7b7cd86b04f..8dc6b23815b7 100644
--- a/share/examples/netgraph/ngctl
+++ b/share/examples/netgraph/ngctl
@@ -31,10 +31,10 @@
       quit       Exit program
     +
 
-# Now let's create a ng_ksocket(4) node, in the family PF_INET,
-# of type SOCK_RAW, and protocol IPPROTO_DIVERT:
+# Now let's create a ng_ksocket(4) node, in the family PF_DIVERT,
+# of type SOCK_RAW:
 
-    + mkpeer ksocket foo inet/raw/divert
+    + mkpeer ksocket foo divert/raw/0
 
 # Note that ``foo'' is the hook name on the socket node, which can be
 # anything.  The ``inet/raw/divert'' is the hook name on the ksocket
diff --git a/share/man/man4/divert.4 b/share/man/man4/divert.4
index d8296995ca97..cfe1a31486c9 100644
--- a/share/man/man4/divert.4
+++ b/share/man/man4/divert.4
@@ -1,6 +1,6 @@
 .\" $FreeBSD$
 .\"
-.Dd December 17, 2004
+.Dd August 30, 2022
 .Dt DIVERT 4
 .Os
 .Sh NAME
@@ -11,7 +11,7 @@
 .In sys/socket.h
 .In netinet/in.h
 .Ft int
-.Fn socket PF_INET SOCK_RAW IPPROTO_DIVERT
+.Fn socket PF_DIVERT SOCK_RAW 0
 .Pp
 To enable support for divert sockets, place the following lines in the
 kernel configuration file:
@@ -30,24 +30,30 @@ ipfw_load="YES"
 ipdivert_load="YES"
 .Ed
 .Sh DESCRIPTION
-Divert sockets are similar to raw IP sockets, except that they
-can be bound to a specific
+Divert sockets allow to intercept and re-inject packets flowing through
+the
+.Xr ipfw 4
+firewall.
+A divert socket can be bound to a specific
 .Nm
 port via the
 .Xr bind 2
 system call.
-The IP address in the bind is ignored; only the port
-number is significant.
+The sockaddr argument shall be sockaddr_in with sin_port set to the
+desired value.
+Note that the
+.Nm
+port has nothing to do with TCP/UDP ports.
+It is just a cookie number, that allows to differentiate between different
+divert points in the
+.Xr ipfw 4
+ruleset.
 A divert socket bound to a divert port will receive all packets diverted
-to that port by some (here unspecified) kernel mechanism(s).
-Packets may also be written to a divert port, in which case they
-re-enter kernel IP packet processing.
+to that port by
+.Xr ipfw 4 .
+Packets may also be written to a divert port, in which case they re-enter
+firewall processing at the next rule.
 .Pp
-Divert sockets are normally used in conjunction with
-.Fx Ns 's
-packet filtering implementation and the
-.Xr ipfw 8
-program.
 By reading from and writing to a divert socket, matching packets
 can be passed through an arbitrary ``filter'' as they travel through
 the host machine, special routing tricks can be done, etc.
@@ -154,7 +160,9 @@ Packets written as incoming and having incorrect checksums will be dropped.
 Otherwise, all header fields are unchanged (and therefore in network order).
 .Pp
 Binding to port numbers less than 1024 requires super-user access, as does
-creating a socket of type SOCK_RAW.
+creating a
+.Nm
+socket.
 .Sh ERRORS
 Writing to a divert socket can return these errors, along with
 the usual errors possible when writing raw packets:
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index bf22c0245f24..1bc172eacd89 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -509,6 +509,17 @@ socreate(int dom, struct socket **aso, int type, int proto,
 	struct socket *so;
 	int error;
 
+	/*
+	 * XXX: divert(4) historically abused PF_INET.  Keep this compatibility
+	 * shim until all applications have been updated.
+	 */
+	if (__predict_false(dom == PF_INET && type == SOCK_RAW &&
+	    proto == IPPROTO_DIVERT)) {
+		dom = PF_DIVERT;
+		printf("%s uses obsolete way to create divert(4) socket\n",
+		    td->td_proc->p_comm);
+	}
+
 	if (proto)
 		prp = pffindproto(dom, proto, type);
 	else
diff --git a/sys/netgraph/ng_ksocket.c b/sys/netgraph/ng_ksocket.c
index d4f41fe02205..ff5e7b4812bf 100644
--- a/sys/netgraph/ng_ksocket.c
+++ b/sys/netgraph/ng_ksocket.c
@@ -121,6 +121,7 @@ static const struct ng_ksocket_alias ng_ksocket_families[] = {
 	{ "inet",	PF_INET		},
 	{ "inet6",	PF_INET6	},
 	{ "atm",	PF_ATM		},
+	{ "divert",	PF_DIVERT	},
 	{ NULL,		-1		},
 };
 
@@ -147,7 +148,6 @@ static const struct ng_ksocket_alias ng_ksocket_protos[] = {
 	{ "ah",		IPPROTO_AH,		PF_INET		},
 	{ "swipe",	IPPROTO_SWIPE,		PF_INET		},
 	{ "encap",	IPPROTO_ENCAP,		PF_INET		},
-	{ "divert",	IPPROTO_DIVERT,		PF_INET		},
 	{ "pim",	IPPROTO_PIM,		PF_INET		},
 	{ NULL,		-1					},
 };
diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c
index 3f25471f0858..87de83da7a6a 100644
--- a/sys/netinet/in_mcast.c
+++ b/sys/netinet/in_mcast.c
@@ -1751,13 +1751,9 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 
 	INP_WLOCK(inp);
 	imo = inp->inp_moptions;
-	/*
-	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
-	 * or is a divert socket, reject it.
-	 */
-	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
-	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
-	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
+	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM reject it. */
+	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
@@ -2717,13 +2713,9 @@ inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 
 	error = 0;
 
-	/*
-	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
-	 * or is a divert socket, reject it.
-	 */
-	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
-	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
-	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
+	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
+	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index d14ec5190ad0..b09d7e1dda7a 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -36,7 +36,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_inet6.h"
 #include "opt_sctp.h"
 #ifndef INET
-#error "IPDIVERT requires INET"
+#error "IPDIVERT requires INET"		/* XXX! */
 #endif
 
 #include <sys/param.h>
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
@@ -716,7 +717,6 @@ SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist,
 
 static struct protosw div_protosw = {
 	.pr_type =		SOCK_RAW,
-	.pr_protocol =		IPPROTO_DIVERT,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_attach =		div_attach,
 	.pr_bind =		div_bind,
@@ -729,6 +729,13 @@ static struct protosw div_protosw = {
 	.pr_sosetlabel =	in_pcbsosetlabel
 };
 
+static struct domain divertdomain = {
+	.dom_family =	PF_DIVERT,
+	.dom_name =	"divert",
+	.dom_nprotosw =	1,
+	.dom_protosw =	{ &div_protosw },
+};
+
 static int
 div_modevent(module_t mod, int type, void *unused)
 {
@@ -736,12 +743,7 @@ div_modevent(module_t mod, int type, void *unused)
 
 	switch (type) {
 	case MOD_LOAD:
-		/*
-		 * Protocol will be initialized by pf_proto_register().
-		 */
-		err = protosw_register(&inetdomain, &div_protosw);
-		if (err != 0)
-			return (err);
+		domain_add(&divertdomain);
 		ip_divert_ptr = divert_packet;
 		break;
 	case MOD_QUIESCE:
@@ -763,6 +765,9 @@ div_modevent(module_t mod, int type, void *unused)
 		 * XXXRW: Note that there is a slight race here, as a new
 		 * socket open request could be spinning on the lock and then
 		 * we destroy the lock.
+		 *
+		 * XXXGL: One more reason this code is incorrect is that it
+		 * checks only the current vnet.
 		 */
 		INP_INFO_WLOCK(&V_divcbinfo);
 		if (V_divcbinfo.ipi_count != 0) {
@@ -771,7 +776,7 @@ div_modevent(module_t mod, int type, void *unused)
 			break;
 		}
 		ip_divert_ptr = NULL;
-		err = protosw_unregister(&div_protosw);
+		domain_remove(&divertdomain);
 		INP_INFO_WUNLOCK(&V_divcbinfo);
 #ifndef VIMAGE
 		div_destroy(NULL);
diff --git a/sys/netinet6/in6_mcast.c b/sys/netinet6/in6_mcast.c
index d0f8186e75c7..a02e18656dc2 100644
--- a/sys/netinet6/in6_mcast.c
+++ b/sys/netinet6/in6_mcast.c
@@ -1772,13 +1772,9 @@ ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 
 	INP_WLOCK(inp);
 	im6o = inp->in6p_moptions;
-	/*
-	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
-	 * or is a divert socket, reject it.
-	 */
-	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
-	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
-	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
+	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
+	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
@@ -2655,13 +2651,9 @@ ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 
 	error = 0;
 
-	/*
-	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
-	 * or is a divert socket, reject it.
-	 */
-	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
-	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
-	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
+	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
+	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 3ec0d3b1d06d..f81aba8f972d 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -268,7 +268,8 @@ struct accept_filter_arg {
 #define	AF_INET_SDP	40		/* OFED Socket Direct Protocol ipv4 */
 #define	AF_INET6_SDP	42		/* OFED Socket Direct Protocol ipv6 */
 #define	AF_HYPERV	43		/* HyperV sockets */
-#define	AF_MAX		43
+#define	AF_DIVERT	44		/* divert(4) */
+#define	AF_MAX		44
 /*
  * When allocating a new AF_ constant, please only allocate
  * even numbered constants for FreeBSD until 134 as odd numbered AF_
@@ -393,6 +394,7 @@ struct sockproto {
 #define	PF_NETLINK	AF_NETLINK
 #define	PF_INET_SDP	AF_INET_SDP
 #define	PF_INET6_SDP	AF_INET6_SDP
+#define	PF_DIVERT	AF_DIVERT
 
 #define	PF_MAX		AF_MAX
 
diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c
index b7dbcb3531b0..e848874d1695 100644
--- a/usr.bin/netstat/inet.c
+++ b/usr.bin/netstat/inet.c
@@ -109,15 +109,14 @@ pcblist_sysctl(int proto, const char *name, char **bufp)
 	case IPPROTO_UDP:
 		mibvar = "net.inet.udp.pcblist";
 		break;
-	case IPPROTO_DIVERT:
-		mibvar = "net.inet.divert.pcblist";
-		break;
 	default:
 		mibvar = "net.inet.raw.pcblist";
 		break;
 	}
 	if (strncmp(name, "sdp", 3) == 0)
 		mibvar = "net.inet.sdp.pcblist";
+	else if (strncmp(name, "divert", 6) == 0)
+		mibvar = "net.inet.divert.pcblist";
 	len = 0;
 	if (sysctlbyname(mibvar, 0, &len, 0, 0) < 0) {
 		if (errno != ENOENT)
@@ -272,7 +271,7 @@ protopr(u_long off, const char *name, int af1, int proto)
 		so = &inp->xi_socket;
 
 		/* Ignore sockets for protocols other than the desired one. */
-		if (so->xso_protocol != proto)
+		if (proto != 0 && so->xso_protocol != proto)
 			continue;
 
 		/* Ignore PCBs which were freed during copyout. */
diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c
index 1a011b9d5488..d1b069f38f0c 100644
--- a/usr.bin/netstat/main.c
+++ b/usr.bin/netstat/main.c
@@ -101,7 +101,7 @@ static struct protox {
 	 NULL,		NULL,		"sdp",	1,	IPPROTO_TCP },
 #endif
 	{ N_DIVCBINFO,	-1,		1,	protopr,
-	  NULL,		NULL,		"divert", 1,	IPPROTO_DIVERT },
+	  NULL,		NULL,		"divert", 1,	0 },
 	{ N_RIPCBINFO,	N_IPSTAT,	1,	protopr,
 	  ip_stats,	NULL,		"ip",	1,	IPPROTO_RAW },
 	{ N_RIPCBINFO,	N_ICMPSTAT,	1,	protopr,