git: ece716c5d347 - main - raw ip: move hash table manipulation to inpcb layer
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Sun, 12 Apr 2026 18:35:43 UTC
The branch main has been updated by glebius:
URL: https://cgit.FreeBSD.org/src/commit/?id=ece716c5d34728a170f1dfe1b3389c267d6ddd1e
commit ece716c5d34728a170f1dfe1b3389c267d6ddd1e
Author: Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2026-04-12 18:35:13 +0000
Commit: Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2026-04-12 18:35:13 +0000
raw ip: move hash table manipulation to inpcb layer
A SOCK_RAW socket is by definition a multiple-receiver socket. An
incoming packet may be copied to multiple sockets. Thus, incoming packet
handling is expensive. Systems with many thousands of raw sockets usually
have them connect(2)-ed to different destinations. This allows for some
improvement of the input handling, which was introduced by 9ed324c9a588
back in 2008. This optimization was made specifically for L2TP/PPTP VPN
concentrators based on ports/net/mpd5.
This change generalizes the idea of 9ed324c9a588, so that it can
potentially be used with IPv6 raw sockets. This also eliminates the last
use of the pcbinfo hash lock outside of in_pcb.c.
While here, make a speculative design decision: put into the hash table
sockets that did only connect(2). Previously, we were indexing only
sockets that were protocol bound, did bind(2) and did connect(2). My
speculation is that only the remote IP provides some real entropy into the
hash, while the local address and proto are expected to be the same for
the majority of the sockets. My other speculation is that VPN
concentrators other than mpd5 may not bind(2) their sockets, thus not
getting any use of the hash.
Differential Revision: https://reviews.freebsd.org/D56172
---
sys/netinet/in_pcb.c | 36 +++++++++++++++
sys/netinet/in_pcb.h | 10 ++++
sys/netinet/in_pcb_var.h | 3 --
sys/netinet/raw_ip.c | 118 +++++++++--------------------------------------
4 files changed, 69 insertions(+), 98 deletions(-)
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 7b294e0a92d5..af62aea91bf8 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -2833,6 +2833,42 @@ in_pcbrehash(struct inpcb *inp)
}
}
+void
+ripcb_connect(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+ uint32_t hash;
+
+ INP_WLOCK_ASSERT(inp);
+ MPASS(inp->inp_flags & INP_UNCONNECTED);
+
+ hash = RIPCB_HASH(inp) & pcbinfo->ipi_hashmask;
+
+ INP_HASH_WLOCK(pcbinfo);
+ CK_LIST_REMOVE(inp, inp_unconn_list);
+ CK_LIST_INSERT_HEAD(&pcbinfo->ipi_hash_exact[hash], inp,
+ inp_hash_exact);
+ INP_HASH_WUNLOCK(pcbinfo);
+ inp->inp_flags &= ~INP_UNCONNECTED;
+}
+
+void
+ripcb_disconnect(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+
+ INP_WLOCK_ASSERT(inp);
+
+ if (inp->inp_flags & INP_UNCONNECTED)
+ return;
+
+ INP_HASH_WLOCK(pcbinfo);
+ CK_LIST_REMOVE(inp, inp_hash_exact);
+ CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
+ INP_HASH_WUNLOCK(pcbinfo);
+ inp->inp_flags |= INP_UNCONNECTED;
+}
+
/*
* Check for alternatives when higher level complains
* about service problems. For now, invalidate cached
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index d34c88941c7f..8663ba205b0a 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -565,6 +565,9 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \
MA_OWNED)
+VNET_DECLARE(uint32_t, in_pcbhashseed);
+#define V_in_pcbhashseed VNET(in_pcbhashseed)
+
/*
* Wildcard matching hash is not just a microoptimisation! The hash for
* wildcard IPv4 and wildcard IPv6 must be the same, otherwise AF_INET6
@@ -596,6 +599,10 @@ void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
#define INP_PCBPORTHASH(lport, mask) (ntohs((lport)) & (mask))
+#define RIPCB_HASH(inp) (((inp)->inp_vflag & INP_IPV6) ? \
+ IN6_ADDR_JHASH32(&(inp)->in6p_faddr) : \
+ IN_ADDR_JHASH32(&(inp)->inp_faddr))
+
/*
* Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next().
*/
@@ -666,6 +673,9 @@ bool in_pcbrele(struct inpcb *, inp_lookup_t);
bool in_pcbrele_rlocked(struct inpcb *);
bool in_pcbrele_wlocked(struct inpcb *);
bool in_pcbrele_rlock(struct inpcb *inp);
+void ripcb_connect(struct inpcb *);
+void ripcb_disconnect(struct inpcb *);
+
#ifdef _SYS_SOCKETVAR_H_
void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h
index 1f46e1bd8f3d..8b005712de27 100644
--- a/sys/netinet/in_pcb_var.h
+++ b/sys/netinet/in_pcb_var.h
@@ -43,9 +43,6 @@
#define INP_UNCONNECTED 0x04000000 /* Not inserted into hashes. */
-VNET_DECLARE(uint32_t, in_pcbhashseed);
-#define V_in_pcbhashseed VNET(in_pcbhashseed)
-
void inp_lock(struct inpcb *inp, const inp_lookup_t lock);
void inp_unlock(struct inpcb *inp, const inp_lookup_t lock);
int inp_trylock(struct inpcb *inp, const inp_lookup_t lock);
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 5114a69d7bf2..6f2b4dd9cb05 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -41,6 +41,7 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
+#include <sys/hash.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/protosw.h>
@@ -140,53 +141,12 @@ u_long rip_recvspace = 9216;
SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
&rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
-/*
- * Hash functions
- */
-
-#define INP_PCBHASH_RAW_SIZE 256
-#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
- (((proto) + (laddr) + (faddr)) % (mask) + 1)
-
-#ifdef INET
-static void
-rip_inshash(struct inpcb *inp)
-{
- struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
- struct inpcbhead *pcbhash;
- int hash;
-
- INP_HASH_WLOCK_ASSERT(pcbinfo);
- INP_WLOCK_ASSERT(inp);
-
- if (inp->inp_ip_p != 0 &&
- inp->inp_laddr.s_addr != INADDR_ANY &&
- inp->inp_faddr.s_addr != INADDR_ANY) {
- hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
- inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
- } else
- hash = 0;
- pcbhash = &pcbinfo->ipi_hash_exact[hash];
- CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
-}
-
-static void
-rip_delhash(struct inpcb *inp)
-{
-
- INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
- INP_WLOCK_ASSERT(inp);
-
- CK_LIST_REMOVE(inp, inp_hash_exact);
-}
-#endif /* INET */
-
INPCBSTORAGE_DEFINE(ripcbstor, inpcb, "rawinp", "ripcb", "riphash");
static void
rip_init(void *arg __unused)
{
-
+#define INP_PCBHASH_RAW_SIZE 256
in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1);
}
VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL);
@@ -250,26 +210,7 @@ struct rip_inp_match_ctx {
};
static bool
-rip_inp_match1(const struct inpcb *inp, void *v)
-{
- struct rip_inp_match_ctx *ctx = v;
-
- if (inp->inp_ip_p != ctx->proto)
- return (false);
-#ifdef INET6
- /* XXX inp locking */
- if ((inp->inp_vflag & INP_IPV4) == 0)
- return (false);
-#endif
- if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr)
- return (false);
- if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr)
- return (false);
- return (true);
-}
-
-static bool
-rip_inp_match2(const struct inpcb *inp, void *v)
+rip_inp_match(const struct inpcb *inp, void *v)
{
struct rip_inp_match_ctx *ctx = v;
@@ -301,7 +242,7 @@ rip_input(struct mbuf **mp, int *offp, int proto)
.proto = proto,
};
struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
- INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx);
+ INPLOOKUP_RLOCKPCB, rip_inp_match, &ctx);
struct ifnet *ifp;
struct mbuf *m = *mp;
struct inpcb *inp;
@@ -321,8 +262,7 @@ rip_input(struct mbuf **mp, int *offp, int proto)
fib = M_GETFIB(m);
ifp = m->m_pkthdr.rcvif;
- inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr,
- ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
+ inpi.mode = IN_ADDR_JHASH32(&ctx.ip->ip_src) & V_ripcbinfo.ipi_hashmask;
while ((inp = inp_next(&inpi)) != NULL) {
INP_RLOCK_ASSERT(inp);
if (jailed_without_vnet(inp->inp_cred) &&
@@ -342,8 +282,7 @@ rip_input(struct mbuf **mp, int *offp, int proto)
appended += rip_append(inp, ctx.ip, m, &ripsrc);
}
- inpi.hash = 0;
- inpi.match = rip_inp_match2;
+ inpi.mode = INP_UNCONN_LIST;
MPASS(inpi.inp == NULL);
while ((inp = inp_next(&inpi)) != NULL) {
INP_RLOCK_ASSERT(inp);
@@ -837,9 +776,6 @@ rip_attach(struct socket *so, int proto, struct thread *td)
inp = (struct inpcb *)so->so_pcb;
inp->inp_ip_p = proto;
inp->inp_ip_ttl = V_ip_defttl;
- INP_HASH_WLOCK(&V_ripcbinfo);
- rip_inshash(inp);
- INP_HASH_WUNLOCK(&V_ripcbinfo);
INP_WUNLOCK(inp);
return (0);
}
@@ -859,9 +795,6 @@ rip_detach(struct socket *so)
ip_mrouter_done(so);
INP_WLOCK(inp);
- INP_HASH_WLOCK(&V_ripcbinfo);
- rip_delhash(inp);
- INP_HASH_WUNLOCK(&V_ripcbinfo);
if (ip_rsvp_force_done)
ip_rsvp_force_done(so);
@@ -871,20 +804,17 @@ rip_detach(struct socket *so)
}
static void
-rip_dodisconnect(struct inpcb *inp)
+rip_dodisconnect(struct inpcb *inp, bool disconnect_socket)
{
- struct inpcbinfo *pcbinfo;
- pcbinfo = inp->inp_pcbinfo;
INP_WLOCK(inp);
- INP_HASH_WLOCK(pcbinfo);
- rip_delhash(inp);
inp->inp_faddr.s_addr = INADDR_ANY;
- rip_inshash(inp);
- INP_HASH_WUNLOCK(pcbinfo);
- SOCK_LOCK(inp->inp_socket);
- inp->inp_socket->so_state &= ~SS_ISCONNECTED;
- SOCK_UNLOCK(inp->inp_socket);
+ ripcb_disconnect(inp);
+ if (disconnect_socket) {
+ SOCK_LOCK(inp->inp_socket);
+ inp->inp_socket->so_state &= ~SS_ISCONNECTED;
+ SOCK_UNLOCK(inp->inp_socket);
+ }
INP_WUNLOCK(inp);
}
@@ -896,7 +826,7 @@ rip_abort(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
- rip_dodisconnect(inp);
+ rip_dodisconnect(inp, true);
}
static void
@@ -907,7 +837,7 @@ rip_close(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_close: inp == NULL"));
- rip_dodisconnect(inp);
+ rip_dodisconnect(inp, true);
}
static int
@@ -921,7 +851,7 @@ rip_disconnect(struct socket *so)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
- rip_dodisconnect(inp);
+ rip_dodisconnect(inp, true);
return (0);
}
@@ -952,11 +882,7 @@ rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
return (EADDRNOTAVAIL);
INP_WLOCK(inp);
- INP_HASH_WLOCK(&V_ripcbinfo);
- rip_delhash(inp);
inp->inp_laddr = addr->sin_addr;
- rip_inshash(inp);
- INP_HASH_WUNLOCK(&V_ripcbinfo);
INP_WUNLOCK(inp);
return (0);
}
@@ -978,11 +904,13 @@ rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
INP_WLOCK(inp);
- INP_HASH_WLOCK(&V_ripcbinfo);
- rip_delhash(inp);
- inp->inp_faddr = addr->sin_addr;
- rip_inshash(inp);
- INP_HASH_WUNLOCK(&V_ripcbinfo);
+ if (inp->inp_faddr.s_addr != INADDR_ANY &&
+ addr->sin_addr.s_addr == INADDR_ANY)
+ rip_dodisconnect(inp, false);
+ if (addr->sin_addr.s_addr != INADDR_ANY) {
+ inp->inp_faddr = addr->sin_addr;
+ ripcb_connect(inp);
+ }
soisconnected(so);
INP_WUNLOCK(inp);
return (0);