git: ece716c5d347 - main - raw ip: move hash table manipulation to inpcb layer

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Sun, 12 Apr 2026 18:35:43 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=ece716c5d34728a170f1dfe1b3389c267d6ddd1e

commit ece716c5d34728a170f1dfe1b3389c267d6ddd1e
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2026-04-12 18:35:13 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2026-04-12 18:35:13 +0000

    raw ip: move hash table manipulation to inpcb layer
    
    The SOCK_RAW socket is a multiple receiver socket by its definition.  An
    incoming packet may be copied to multiple sockets.  Thus, incoming packet
    handling is expensive.  Systems with many thousands of raw sockets usually
    have them connect(2)-ed to different destinations.  This allows for some
    improvement of the input handling, which was introduced by 9ed324c9a588
    back in 2008.  This optimization was made specifically for L2TP/PPTP VPN
    concentrators based on ports/net/mpd5.
    
    This change generalizes the idea of 9ed324c9a588, so that it can
    potentially be used with IPv6 raw sockets.  This also eliminates the last
    use of the pcbinfo hash lock outside of in_pcb.c.
    
    While here, make a speculative design decision: put sockets that only
    did connect(2) into the hash table.  Previously, we indexed only sockets
    that were protocol bound, did bind(2) and did connect(2).  My speculation
    is that only the remote IP provides real entropy for the hash, while the
    local address and proto are expected to be the same for the majority of
    the sockets.  My other speculation is that VPN concentrators other than
    mpd5 may not bind(2) their sockets, and thus get no use of the hash.
    
    Differential Revision:  https://reviews.freebsd.org/D56172
---
 sys/netinet/in_pcb.c     |  36 +++++++++++++++
 sys/netinet/in_pcb.h     |  10 ++++
 sys/netinet/in_pcb_var.h |   3 --
 sys/netinet/raw_ip.c     | 118 +++++++++--------------------------------------
 4 files changed, 69 insertions(+), 98 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 7b294e0a92d5..af62aea91bf8 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -2833,6 +2833,42 @@ in_pcbrehash(struct inpcb *inp)
 	}
 }
 
+void
+ripcb_connect(struct inpcb *inp)
+{	/* Move inp from the unconnected list into its hash bucket. */
+	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+	uint32_t hash;			/* index into ipi_hash_exact[] */
+
+	INP_WLOCK_ASSERT(inp);		/* caller holds the inpcb write lock */
+	MPASS(inp->inp_flags & INP_UNCONNECTED);	/* not hashed yet */
+
+	hash = RIPCB_HASH(inp) & pcbinfo->ipi_hashmask;
+
+	INP_HASH_WLOCK(pcbinfo);	/* covers both list updates below */
+	CK_LIST_REMOVE(inp, inp_unconn_list);
+	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_hash_exact[hash], inp,
+	    inp_hash_exact);
+	INP_HASH_WUNLOCK(pcbinfo);
+	inp->inp_flags &= ~INP_UNCONNECTED;	/* after hash lock dropped */
+}
+
+void
+ripcb_disconnect(struct inpcb *inp)
+{	/* Move inp from its hash bucket back to the unconnected list. */
+	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+
+	INP_WLOCK_ASSERT(inp);		/* caller holds the inpcb write lock */
+
+	if (inp->inp_flags & INP_UNCONNECTED)
+		return;			/* not hashed, nothing to undo */
+
+	INP_HASH_WLOCK(pcbinfo);	/* covers both list updates below */
+	CK_LIST_REMOVE(inp, inp_hash_exact);
+	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
+	INP_HASH_WUNLOCK(pcbinfo);
+	inp->inp_flags |= INP_UNCONNECTED;	/* after hash lock dropped */
+}
+
 /*
  * Check for alternatives when higher level complains
  * about service problems.  For now, invalidate cached
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index d34c88941c7f..8663ba205b0a 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -565,6 +565,9 @@ void 	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 #define	INP_HASH_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_hash_lock, \
 					MA_OWNED)
 
+VNET_DECLARE(uint32_t, in_pcbhashseed);
+#define	V_in_pcbhashseed	VNET(in_pcbhashseed)
+
 /*
  * Wildcard matching hash is not just a microoptimisation!  The hash for
  * wildcard IPv4 and wildcard IPv6 must be the same, otherwise AF_INET6
@@ -596,6 +599,10 @@ void 	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 
 #define INP_PCBPORTHASH(lport, mask)	(ntohs((lport)) & (mask))
 
+#define	RIPCB_HASH(inp)	(((inp)->inp_vflag & INP_IPV6) ?		\
+	IN6_ADDR_JHASH32(&(inp)->in6p_faddr) :				\
+	IN_ADDR_JHASH32(&(inp)->inp_faddr))	/* keyed on faddr only */
+
 /*
  * Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next().
  */
@@ -666,6 +673,9 @@ bool	in_pcbrele(struct inpcb *, inp_lookup_t);
 bool	in_pcbrele_rlocked(struct inpcb *);
 bool	in_pcbrele_wlocked(struct inpcb *);
 bool	in_pcbrele_rlock(struct inpcb *inp);
+void	ripcb_connect(struct inpcb *);
+void	ripcb_disconnect(struct inpcb *);
+
 #ifdef _SYS_SOCKETVAR_H_
 void	in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
 int	sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h
index 1f46e1bd8f3d..8b005712de27 100644
--- a/sys/netinet/in_pcb_var.h
+++ b/sys/netinet/in_pcb_var.h
@@ -43,9 +43,6 @@
 
 #define	INP_UNCONNECTED	0x04000000	/* Not inserted into hashes. */
 
-VNET_DECLARE(uint32_t, in_pcbhashseed);
-#define	V_in_pcbhashseed	VNET(in_pcbhashseed)
-
 void	inp_lock(struct inpcb *inp, const inp_lookup_t lock);
 void	inp_unlock(struct inpcb *inp, const inp_lookup_t lock);
 int	inp_trylock(struct inpcb *inp, const inp_lookup_t lock);
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 5114a69d7bf2..6f2b4dd9cb05 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -41,6 +41,7 @@
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
+#include <sys/hash.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
@@ -140,53 +141,12 @@ u_long	rip_recvspace = 9216;
 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
     &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
 
-/*
- * Hash functions
- */
-
-#define INP_PCBHASH_RAW_SIZE	256
-#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
-        (((proto) + (laddr) + (faddr)) % (mask) + 1)
-
-#ifdef INET
-static void
-rip_inshash(struct inpcb *inp)
-{
-	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
-	struct inpcbhead *pcbhash;
-	int hash;
-
-	INP_HASH_WLOCK_ASSERT(pcbinfo);
-	INP_WLOCK_ASSERT(inp);
-
-	if (inp->inp_ip_p != 0 &&
-	    inp->inp_laddr.s_addr != INADDR_ANY &&
-	    inp->inp_faddr.s_addr != INADDR_ANY) {
-		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
-		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
-	} else
-		hash = 0;
-	pcbhash = &pcbinfo->ipi_hash_exact[hash];
-	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
-}
-
-static void
-rip_delhash(struct inpcb *inp)
-{
-
-	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
-	INP_WLOCK_ASSERT(inp);
-
-	CK_LIST_REMOVE(inp, inp_hash_exact);
-}
-#endif /* INET */
-
 INPCBSTORAGE_DEFINE(ripcbstor, inpcb, "rawinp", "ripcb", "riphash");
 
 static void
 rip_init(void *arg __unused)
 {
-
+#define	INP_PCBHASH_RAW_SIZE	256
 	in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1);
 }
 VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL);
@@ -250,26 +210,7 @@ struct rip_inp_match_ctx {
 };
 
 static bool
-rip_inp_match1(const struct inpcb *inp, void *v)
-{
-	struct rip_inp_match_ctx *ctx = v;
-
-	if (inp->inp_ip_p != ctx->proto)
-		return (false);
-#ifdef INET6
-	/* XXX inp locking */
-	if ((inp->inp_vflag & INP_IPV4) == 0)
-		return (false);
-#endif
-	if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr)
-		return (false);
-	if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr)
-		return (false);
-	return (true);
-}
-
-static bool
-rip_inp_match2(const struct inpcb *inp, void *v)
+rip_inp_match(const struct inpcb *inp, void *v)
 {
 	struct rip_inp_match_ctx *ctx = v;
 
@@ -301,7 +242,7 @@ rip_input(struct mbuf **mp, int *offp, int proto)
 		.proto = proto,
 	};
 	struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
-	    INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx);
+	    INPLOOKUP_RLOCKPCB, rip_inp_match, &ctx);
 	struct ifnet *ifp;
 	struct mbuf *m = *mp;
 	struct inpcb *inp;
@@ -321,8 +262,7 @@ rip_input(struct mbuf **mp, int *offp, int proto)
 	fib = M_GETFIB(m);
 	ifp = m->m_pkthdr.rcvif;
 
-	inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr,
-	    ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
+	inpi.mode = IN_ADDR_JHASH32(&ctx.ip->ip_src) & V_ripcbinfo.ipi_hashmask;
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (jailed_without_vnet(inp->inp_cred) &&
@@ -342,8 +282,7 @@ rip_input(struct mbuf **mp, int *offp, int proto)
 		appended += rip_append(inp, ctx.ip, m, &ripsrc);
 	}
 
-	inpi.hash = 0;
-	inpi.match = rip_inp_match2;
+	inpi.mode = INP_UNCONN_LIST;
 	MPASS(inpi.inp == NULL);
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_RLOCK_ASSERT(inp);
@@ -837,9 +776,6 @@ rip_attach(struct socket *so, int proto, struct thread *td)
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_ip_p = proto;
 	inp->inp_ip_ttl = V_ip_defttl;
-	INP_HASH_WLOCK(&V_ripcbinfo);
-	rip_inshash(inp);
-	INP_HASH_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
@@ -859,9 +795,6 @@ rip_detach(struct socket *so)
 		ip_mrouter_done(so);
 
 	INP_WLOCK(inp);
-	INP_HASH_WLOCK(&V_ripcbinfo);
-	rip_delhash(inp);
-	INP_HASH_WUNLOCK(&V_ripcbinfo);
 
 	if (ip_rsvp_force_done)
 		ip_rsvp_force_done(so);
@@ -871,20 +804,17 @@ rip_detach(struct socket *so)
 }
 
 static void
-rip_dodisconnect(struct inpcb *inp)
+rip_dodisconnect(struct inpcb *inp, bool disconnect_socket)
 {
-	struct inpcbinfo *pcbinfo;
 
-	pcbinfo = inp->inp_pcbinfo;
 	INP_WLOCK(inp);
-	INP_HASH_WLOCK(pcbinfo);
-	rip_delhash(inp);
 	inp->inp_faddr.s_addr = INADDR_ANY;
-	rip_inshash(inp);
-	INP_HASH_WUNLOCK(pcbinfo);
-	SOCK_LOCK(inp->inp_socket);
-	inp->inp_socket->so_state &= ~SS_ISCONNECTED;
-	SOCK_UNLOCK(inp->inp_socket);
+	ripcb_disconnect(inp);
+	if (disconnect_socket) {
+		SOCK_LOCK(inp->inp_socket);
+		inp->inp_socket->so_state &= ~SS_ISCONNECTED;
+		SOCK_UNLOCK(inp->inp_socket);
+	}
 	INP_WUNLOCK(inp);
 }
 
@@ -896,7 +826,7 @@ rip_abort(struct socket *so)
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
 
-	rip_dodisconnect(inp);
+	rip_dodisconnect(inp, true);
 }
 
 static void
@@ -907,7 +837,7 @@ rip_close(struct socket *so)
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
 
-	rip_dodisconnect(inp);
+	rip_dodisconnect(inp, true);
 }
 
 static int
@@ -921,7 +851,7 @@ rip_disconnect(struct socket *so)
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
 
-	rip_dodisconnect(inp);
+	rip_dodisconnect(inp, true);
 	return (0);
 }
 
@@ -952,11 +882,7 @@ rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 		return (EADDRNOTAVAIL);
 
 	INP_WLOCK(inp);
-	INP_HASH_WLOCK(&V_ripcbinfo);
-	rip_delhash(inp);
 	inp->inp_laddr = addr->sin_addr;
-	rip_inshash(inp);
-	INP_HASH_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
@@ -978,11 +904,13 @@ rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
 
 	INP_WLOCK(inp);
-	INP_HASH_WLOCK(&V_ripcbinfo);
-	rip_delhash(inp);
-	inp->inp_faddr = addr->sin_addr;
-	rip_inshash(inp);
-	INP_HASH_WUNLOCK(&V_ripcbinfo);
+	if (inp->inp_faddr.s_addr != INADDR_ANY &&
+	    addr->sin_addr.s_addr == INADDR_ANY)
+		rip_dodisconnect(inp, false);
+	if (addr->sin_addr.s_addr != INADDR_ANY) {
+		inp->inp_faddr = addr->sin_addr;
+		ripcb_connect(inp);
+	}
 	soisconnected(so);
 	INP_WUNLOCK(inp);
 	return (0);