git: f1fb05171662 - main - divert(4): maintain own cb database and stop using inpcb KPI

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Tue, 30 Aug 2022 22:49:27 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=f1fb051716625d29d9dea599c6d7d20d773fe6e3

commit f1fb051716625d29d9dea599c6d7d20d773fe6e3
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2022-08-30 22:09:21 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2022-08-30 22:09:21 +0000

    divert(4): maintain own cb database and stop using inpcb KPI
    
    Here go cons of using inpcb for divert:
    - divert(4) uses only 16 bits (local port) out of struct inpcb,
      which is 424 bytes today.
    - The inpcb KPI isn't able to provide hashing for divert(4),
      thus it uses global inpcb list for lookups.
    - divert(4) uses INET-specific part of the KPI, making INET
      a requirement for IPDIVERT.
    
    Maintain our own very simple hash lookup database instead.  It
    has mutex protection for write and epoch protection for lookups.
    Since now so->so_pcb no longer points to struct inpcb, don't
    initialize protosw methods to methods that belong to PF_INET.
    Also, drop support for setting options on a divert socket.  My
    review of software in base and ports confirms that this has no
    use and unlikely worked before.
    
    Differential revision:  https://reviews.freebsd.org/D36382
---
 share/man/man4/divert.4 |   5 +-
 sys/netinet/ip_divert.c | 310 ++++++++++++++++++++----------------------------
 2 files changed, 132 insertions(+), 183 deletions(-)

diff --git a/share/man/man4/divert.4 b/share/man/man4/divert.4
index cfe1a31486c9..18da54ee9fc9 100644
--- a/share/man/man4/divert.4
+++ b/share/man/man4/divert.4
@@ -159,10 +159,9 @@ with the correct value.
 Packets written as incoming and having incorrect checksums will be dropped.
 Otherwise, all header fields are unchanged (and therefore in network order).
 .Pp
-Binding to port numbers less than 1024 requires super-user access, as does
-creating a
+Creating a
 .Nm
-socket.
+socket requires super-user access.
 .Sh ERRORS
 Writing to a divert socket can return these errors, along with
 the usual errors possible when writing raw packets:
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index 0cecbbd0d15d..7e0a09bd8e3a 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -86,6 +86,14 @@ __FBSDID("$FreeBSD$");
 #define	DIVSNDQ		(65536 + 100)
 #define	DIVRCVQ		(65536 + 100)
 
+/*
+ * Usually a system has very few divert ports.  Previous implementation
+ * used a linked list.
+ */
+#define	DIVHASHSIZE	(1 << 3)	/* 8 entries, one cache line. */
+#define	DIVHASH(port)	(port % DIVHASHSIZE)
+#define	DCBHASH(dcb)	((dcb)->dcb_port % DIVHASHSIZE)
+
 /*
  * Divert sockets work in conjunction with ipfw or other packet filters,
  * see the divert(4) manpage for features.
@@ -124,9 +132,6 @@ SYSCTL_VNET_PCPUSTAT(_net_inet_divert, OID_AUTO, stats, struct divstat,
 #define	DIVSTAT_INC(name)	\
     VNET_PCPUSTAT_ADD(struct divstat, divstat, div_ ## name, 1)
 
-VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
-#define	V_divcbinfo			VNET(divcbinfo)
-
 static u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
 static u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
 
@@ -134,39 +139,31 @@ static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m,
     struct sockaddr_in *sin);
 static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
 
-/*
- * Initialize divert connection block queue.
- */
-INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash");
-
-static void
-div_init(void *arg __unused)
-{
-
-	/*
-	 * XXX We don't use the hash list for divert IP, but it's easier to
-	 * allocate one-entry hash lists than it is to check all over the
-	 * place for hashbase == NULL.
-	 */
-	in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1);
-}
-VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL);
-
-static void
-div_destroy(void *unused __unused)
-{
+struct divcb {
+	union {
+		SLIST_ENTRY(divcb)	dcb_next;
+		intptr_t		dcb_bound;
+#define	DCB_UNBOUND	((intptr_t)-1)
+	};
+	struct socket		*dcb_socket;
+	uint16_t		 dcb_port;
+	uint64_t		 dcb_gencnt;
+	struct epoch_context	 dcb_epochctx;
+};
 
-	in_pcbinfo_destroy(&V_divcbinfo);
-}
-VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_destroy, NULL);
+SLIST_HEAD(divhashhead, divcb);
 
-static bool
-div_port_match(const struct inpcb *inp, void *v)
-{
-	uint16_t nport = *(uint16_t *)v;
+VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]) = {};
+#define	V_divhash	VNET(divhash)
+VNET_DEFINE_STATIC(uint64_t, dcb_count) = 0;
+#define	V_dcb_count	VNET(dcb_count)
+VNET_DEFINE_STATIC(uint64_t, dcb_gencnt) = 0;
+#define	V_dcb_gencnt	VNET(dcb_gencnt)
 
-	return (inp->inp_lport == nport);
-}
+static struct mtx divert_mtx;
+MTX_SYSINIT(divert, &divert_mtx, "divert(4) socket pcb lists", MTX_DEF);
+#define	DIVERT_LOCK()	mtx_lock(&divert_mtx)
+#define	DIVERT_UNLOCK()	mtx_unlock(&divert_mtx)
 
 /*
  * Divert a packet by passing it up to the divert socket at port 'port'.
@@ -177,12 +174,9 @@ divert_packet(struct mbuf *m, bool incoming)
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	struct ip *ip;
 #endif
-	struct inpcb *inp;
-	struct socket *sa;
+	struct divcb *dcb;
 	u_int16_t nport;
 	struct sockaddr_in divsrc;
-	struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo,
-	    INPLOOKUP_RLOCKPCB, div_port_match, &nport);
 	struct m_tag *mtag;
 
 	NET_EPOCH_ASSERT();
@@ -275,27 +269,26 @@ divert_packet(struct mbuf *m, bool incoming)
 	}
 
 	/* Put packet on socket queue, if any */
-	sa = NULL;
-	/* nport is inp_next's context. */
-	nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
-	while ((inp = inp_next(&inpi)) != NULL) {
-		sa = inp->inp_socket;
+	nport = htons((uint16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
+	SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
+		if (dcb->dcb_port == nport)
+			break;
+
+	if (dcb != NULL) {
+		struct socket *sa = dcb->dcb_socket;
+
 		SOCKBUF_LOCK(&sa->so_rcv);
 		if (sbappendaddr_locked(&sa->so_rcv,
 		    (struct sockaddr *)&divsrc, m, NULL) == 0) {
 			soroverflow_locked(sa);
-			sa = NULL;	/* force mbuf reclaim below */
+			m_freem(m);
 		} else {
 			sorwakeup_locked(sa);
 			DIVSTAT_INC(diverted);
 		}
-		/* XXX why does only one socket match? */
-		INP_RUNLOCK(inp);
-		break;
-	}
-	if (sa == NULL) {
-		m_freem(m);
+	} else {
 		DIVSTAT_INC(noport);
+		m_freem(m);
 	}
 }
 
@@ -422,23 +415,12 @@ static int
 div_output_outbound(int family, struct socket *so, struct mbuf *m)
 {
 	struct ip *const ip = mtod(m, struct ip *);
-	struct mbuf *options;
-	struct inpcb *inp;
 	int error;
 
-	inp = sotoinpcb(so);
-	INP_RLOCK(inp);
 	switch (family) {
 	case AF_INET:
-		/*
-		 * Don't allow both user specified and setsockopt
-		 * options, and don't allow packet length sizes that
-		 * will crash.
-		 */
-		if ((((ip->ip_hl << 2) != sizeof(struct ip)) &&
-		    inp->inp_options != NULL) ||
-		    ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
-			INP_RUNLOCK(inp);
+		/* Don't allow packet length sizes that will crash. */
+		if (((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
 			m_freem(m);
 			return (EINVAL);
 		}
@@ -450,7 +432,6 @@ div_output_outbound(int family, struct socket *so, struct mbuf *m)
 
 		/* Don't allow packet length sizes that will crash */
 		if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) {
-			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (EINVAL);
 		}
@@ -460,44 +441,13 @@ div_output_outbound(int family, struct socket *so, struct mbuf *m)
 	}
 
 #ifdef MAC
-	mac_inpcb_create_mbuf(inp, m);
+	mac_socket_create_mbuf(so, m);
 #endif
-	/*
-	 * Get ready to inject the packet into ip_output().
-	 * Just in case socket options were specified on the
-	 * divert socket, we duplicate them.  This is done
-	 * to avoid having to hold the PCB locks over the call
-	 * to ip_output(), as doing this results in a number of
-	 * lock ordering complexities.
-	 *
-	 * Note that we set the multicast options argument for
-	 * ip_output() to NULL since it should be invariant that
-	 * they are not present.
-	 */
-	KASSERT(inp->inp_moptions == NULL,
-	    ("multicast options set on a divert socket"));
-	/*
-	 * XXXCSJP: It is unclear to me whether or not it makes
-	 * sense for divert sockets to have options.  However,
-	 * for now we will duplicate them with the INP locks
-	 * held so we can use them in ip_output() without
-	 * requring a reference to the pcb.
-	 */
-	options = NULL;
-	if (inp->inp_options != NULL) {
-		options = m_dup(inp->inp_options, M_NOWAIT);
-		if (options == NULL) {
-			INP_RUNLOCK(inp);
-			m_freem(m);
-			return (ENOBUFS);
-		}
-	}
-	INP_RUNLOCK(inp);
 
 	error = 0;
 	switch (family) {
 	case AF_INET:
-		error = ip_output(m, options, NULL,
+		error = ip_output(m, NULL, NULL,
 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0)
 		    | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL);
 		break;
@@ -509,8 +459,6 @@ div_output_outbound(int family, struct socket *so, struct mbuf *m)
 	}
 	if (error == 0)
 		DIVSTAT_INC(outbound);
-	if (options != NULL)
-		m_freem(options);
 
 	return (error);
 }
@@ -579,11 +527,9 @@ div_output_inbound(int family, struct socket *so, struct mbuf *m,
 static int
 div_attach(struct socket *so, int proto, struct thread *td)
 {
-	struct inpcb *inp;
+	struct divcb *dcb;
 	int error;
 
-	inp  = sotoinpcb(so);
-	KASSERT(inp == NULL, ("div_attach: inp != NULL"));
 	if (td != NULL) {
 		error = priv_check(td, PRIV_NETINET_DIVERT);
 		if (error)
@@ -592,85 +538,90 @@ div_attach(struct socket *so, int proto, struct thread *td)
 	error = soreserve(so, div_sendspace, div_recvspace);
 	if (error)
 		return error;
-	error = in_pcballoc(so, &V_divcbinfo);
-	if (error)
-		return error;
-	inp = (struct inpcb *)so->so_pcb;
-	inp->inp_ip_p = proto;
-	inp->inp_flags |= INP_HDRINCL;
-	INP_WUNLOCK(inp);
-	return 0;
+	dcb = malloc(sizeof(*dcb), M_PCB, M_WAITOK);
+	dcb->dcb_bound = DCB_UNBOUND;
+	dcb->dcb_socket = so;
+	DIVERT_LOCK();
+	V_dcb_count++;
+	dcb->dcb_gencnt = ++V_dcb_gencnt;
+	DIVERT_UNLOCK();
+	so->so_pcb = dcb;
+
+	return (0);
 }
 
 static void
-div_detach(struct socket *so)
+div_free(epoch_context_t ctx)
 {
-	struct inpcb *inp;
+	struct divcb *dcb = __containerof(ctx, struct divcb, dcb_epochctx);
+
+	free(dcb, M_PCB);
+}
 
-	inp = sotoinpcb(so);
-	KASSERT(inp != NULL, ("div_detach: inp == NULL"));
-	INP_WLOCK(inp);
-	in_pcbdetach(inp);
-	in_pcbfree(inp);
+static void
+div_detach(struct socket *so)
+{
+	struct divcb *dcb = so->so_pcb;
+
+	so->so_pcb = NULL;
+	DIVERT_LOCK();
+	if (dcb->dcb_bound != DCB_UNBOUND)
+		SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
+	V_dcb_count--;
+	V_dcb_gencnt++;
+	DIVERT_UNLOCK();
+	NET_EPOCH_CALL(div_free, &dcb->dcb_epochctx);
 }
 
 static int
 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
-	struct inpcb *inp;
-	int error;
+	struct divcb *dcb;
+	uint16_t port;
 
-	inp = sotoinpcb(so);
-	KASSERT(inp != NULL, ("div_bind: inp == NULL"));
-	/* in_pcbbind assumes that nam is a sockaddr_in
-	 * and in_pcbbind requires a valid address. Since divert
-	 * sockets don't we need to make sure the address is
-	 * filled in properly.
-	 * XXX -- divert should not be abusing in_pcbind
-	 * and should probably have its own family.
-	 */
 	if (nam->sa_family != AF_INET)
 		return EAFNOSUPPORT;
 	if (nam->sa_len != sizeof(struct sockaddr_in))
 		return EINVAL;
-	((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
-	INP_WLOCK(inp);
-	INP_HASH_WLOCK(&V_divcbinfo);
-	error = in_pcbbind(inp, nam, td->td_ucred);
-	INP_HASH_WUNLOCK(&V_divcbinfo);
-	INP_WUNLOCK(inp);
-	return error;
+	port = ((struct sockaddr_in *)nam)->sin_port;
+	DIVERT_LOCK();
+	SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next)
+		if (dcb->dcb_port == port) {
+			DIVERT_UNLOCK();
+			return (EADDRINUSE);
+		}
+	dcb = so->so_pcb;
+	if (dcb->dcb_bound != DCB_UNBOUND)
+		SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
+	dcb->dcb_port = port;
+	SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
+	DIVERT_UNLOCK();
+
+	return (0);
 }
 
 static int
 div_shutdown(struct socket *so)
 {
-	struct inpcb *inp;
 
-	inp = sotoinpcb(so);
-	KASSERT(inp != NULL, ("div_shutdown: inp == NULL"));
-	INP_WLOCK(inp);
 	socantsendmore(so);
-	INP_WUNLOCK(inp);
 	return 0;
 }
 
 static int
 div_pcblist(SYSCTL_HANDLER_ARGS)
 {
-	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo,
-	    INPLOOKUP_RLOCKPCB);
 	struct xinpgen xig;
-	struct inpcb *inp;
+	struct divcb *dcb;
 	int error;
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	if (req->oldptr == 0) {
-		int n;
+		u_int n;
 
-		n = V_divcbinfo.ipi_count;
+		n = V_dcb_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return 0;
@@ -681,39 +632,45 @@ div_pcblist(SYSCTL_HANDLER_ARGS)
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
-	xig.xig_count = V_divcbinfo.ipi_count;
-	xig.xig_gen = V_divcbinfo.ipi_gencnt;
+	xig.xig_count = V_dcb_count;
+	xig.xig_gen = V_dcb_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
-	while ((inp = inp_next(&inpi)) != NULL) {
-		if (inp->inp_gencnt <= xig.xig_gen) {
-			struct xinpcb xi;
-
-			in_pcbtoxinpcb(inp, &xi);
-			error = SYSCTL_OUT(req, &xi, sizeof xi);
-			if (error) {
-				INP_RUNLOCK(inp);
-				break;
+	DIVERT_LOCK();
+	for (int i = 0; i < DIVHASHSIZE; i++)
+		SLIST_FOREACH(dcb, &V_divhash[i], dcb_next) {
+			if (dcb->dcb_gencnt <= xig.xig_gen) {
+				struct xinpcb xi;
+
+				bzero(&xi, sizeof(xi));
+				xi.xi_len = sizeof(struct xinpcb);
+				sotoxsocket(dcb->dcb_socket, &xi.xi_socket);
+				xi.inp_gencnt = dcb->dcb_gencnt;
+				xi.inp_vflag = INP_IPV4; /* XXX: netstat(1) */
+				xi.inp_inc.inc_ie.ie_lport = dcb->dcb_port;
+				error = SYSCTL_OUT(req, &xi, sizeof xi);
+				if (error)
+					goto errout;
 			}
 		}
-	}
 
-	if (!error) {
-		/*
-		 * Give the user an updated idea of our state.
-		 * If the generation differs from what we told
-		 * her before, she knows that something happened
-		 * while we were processing this request, and it
-		 * might be necessary to retry.
-		 */
-		xig.xig_gen = V_divcbinfo.ipi_gencnt;
-		xig.xig_sogen = so_gencnt;
-		xig.xig_count = V_divcbinfo.ipi_count;
-		error = SYSCTL_OUT(req, &xig, sizeof xig);
-	}
+	/*
+	 * Give the user an updated idea of our state.
+	 * If the generation differs from what we told
+	 * her before, she knows that something happened
+	 * while we were processing this request, and it
+	 * might be necessary to retry.
+	 */
+	xig.xig_gen = V_dcb_gencnt;
+	xig.xig_sogen = so_gencnt;
+	xig.xig_count = V_dcb_count;
+	error = SYSCTL_OUT(req, &xig, sizeof xig);
+
+errout:
+	DIVERT_UNLOCK();
 
 	return (error);
 }
@@ -726,13 +683,9 @@ static struct protosw div_protosw = {
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_attach =		div_attach,
 	.pr_bind =		div_bind,
-	.pr_control =		in_control,
 	.pr_detach =		div_detach,
-	.pr_peeraddr =		in_getpeeraddr,
 	.pr_send =		div_send,
 	.pr_shutdown =		div_shutdown,
-	.pr_sockaddr =		in_getsockaddr,
-	.pr_sosetlabel =	in_pcbsosetlabel
 };
 
 static struct domain divertdomain = {
@@ -775,18 +728,15 @@ div_modevent(module_t mod, int type, void *unused)
 		 * XXXGL: One more reason this code is incorrect is that it
 		 * checks only the current vnet.
 		 */
-		INP_INFO_WLOCK(&V_divcbinfo);
-		if (V_divcbinfo.ipi_count != 0) {
+		DIVERT_LOCK();
+		if (V_dcb_count != 0) {
+			DIVERT_UNLOCK();
 			err = EBUSY;
-			INP_INFO_WUNLOCK(&V_divcbinfo);
 			break;
 		}
+		DIVERT_UNLOCK();
 		ip_divert_ptr = NULL;
 		domain_remove(&divertdomain);
-		INP_INFO_WUNLOCK(&V_divcbinfo);
-#ifndef VIMAGE
-		div_destroy(NULL);
-#endif
 		break;
 	default:
 		err = EOPNOTSUPP;