git: ac5b9628002c - main - inpcb: retire the inpcb global list
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Sun, 12 Apr 2026 18:35:32 UTC
The branch main has been updated by glebius:
URL: https://cgit.FreeBSD.org/src/commit/?id=ac5b9628002c7c97929984eb578918077d564be4
commit ac5b9628002c7c97929984eb578918077d564be4
Author: Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2026-04-12 18:31:09 +0000
Commit: Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2026-04-12 18:31:09 +0000
inpcb: retire the inpcb global list
The iteration over all pcbs is possible without the global list. The
newborn inpcbs are put on a global list of unconnected inpcbs, then after
connect(2) or bind(2) they move to respective hash slot list.
This adds a bit of complexity to inp_next(), but the storage scheme is
actually simplified.
One potential problem before this change was that a couple of pcbs fall
into the same hash slot and are linked A->B there, but they also sit next
to each other in the global list, linked as B->A. This can deadlock of
course. The problem was never observed in the wild, but I was able to
instrument it with lots of effort: just few pcbs in the system, hash size
reduced down to 2 and a lot of repetitive calls into two kinds of
iterators.
However the main motivation is not the above problem, but make a step
towards splitting the big hash lock into per-slot locks.
Differential Revision: https://reviews.freebsd.org/D55967
---
sys/netinet/in_pcb.c | 125 ++++++++++++++++++++++++++++++-----------------
sys/netinet/in_pcb.h | 49 ++++++++++++-------
sys/netinet/in_pcb_var.h | 2 +-
sys/netinet/tcp_usrreq.c | 29 ++++++-----
sys/netinet6/in6_pcb.c | 4 +-
5 files changed, 134 insertions(+), 75 deletions(-)
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index d60c75ab45b5..cb5b7fa274f6 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -240,8 +240,6 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
"Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
#endif
-static void in_pcbremhash(struct inpcb *);
-
/*
* in_pcb.c: manage the Protocol Control Blocks.
*
@@ -563,7 +561,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
#ifdef VIMAGE
pcbinfo->ipi_vnet = curvnet;
#endif
- CK_LIST_INIT(&pcbinfo->ipi_listhead);
+ CK_LIST_INIT(&pcbinfo->ipi_list_unconn);
pcbinfo->ipi_count = 0;
ha.size = hash_nelements;
@@ -698,7 +696,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
INP_HASH_WLOCK(pcbinfo);
pcbinfo->ipi_count++;
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
- CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
+ CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
INP_HASH_WUNLOCK(pcbinfo);
so->so_pcb = inp;
@@ -1429,7 +1427,9 @@ in_pcbdisconnect(struct inpcb *inp)
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
("%s: inp %p was already disconnected", __func__, inp));
- in_pcbremhash_locked(inp);
+ in_pcbremhash(inp);
+ CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+ inp_unconn_list);
if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
/* See the comment in in_pcbinshash(). */
@@ -1552,8 +1552,17 @@ inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
*
* - Iterator can have either write-lock or read-lock semantics, that can not
* be changed later.
- * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
- * a single hash slot. Note: only rip_input() does the latter.
+ * - Iterator has three modes of operation, defined by value of .hash member
+ * on the first call:
+ * - .hash = INP_ALL_LIST: the iterator will go through the unconnected
+ * list, then all wildcard hash slots and then all exact hash slots.
+ * - .hash = INP_UNCONN_LIST: the iterator will go through the list of
+ * unconnected pcbs only.
+ * - .hash initialized with an arbitrary positive value: iterator will go
+ * through this exact hash slot only.
+ * Note: only rip_input() and sysctl_setsockopt() use the latter.
+ * The interface may be extended for iteration over single wildcard hash
+ * slot, but there is no use case for that today.
* - Iterator may have optional bool matching function. The matching function
* will be executed for each inpcb in the SMR context, so it can not acquire
* locks and can safely access only immutable fields of inpcb.
@@ -1571,49 +1580,72 @@ inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
* - Removed entries won't stop traversal as long as they are not added to
* a different list. This is violated by in_pcbrehash().
*/
-#define II_LIST_FIRST(ipi, hash) \
- (((hash) == INP_ALL_LIST) ? \
- CK_LIST_FIRST(&(ipi)->ipi_listhead) : \
- CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
-#define II_LIST_NEXT(inp, hash) \
- (((hash) == INP_ALL_LIST) ? \
- CK_LIST_NEXT((inp), inp_list) : \
- CK_LIST_NEXT((inp), inp_hash_exact))
-#define II_LOCK_ASSERT(inp, lock) \
- rw_assert(&(inp)->inp_lock, \
- (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
+static inline struct inpcb *
+ii_list_first(const struct inpcb_iterator *ii)
+{
+ const struct inpcbinfo *ipi = ii->ipi;
+ const int hash = ii->hash;
+
+ if (hash < 0)
+ return (CK_LIST_FIRST(&ipi->ipi_list_unconn));
+ else if (hash <= ipi->ipi_hashmask)
+ return (CK_LIST_FIRST(&ipi->ipi_hash_wild[hash]));
+ else
+ return (CK_LIST_FIRST(
+ &ipi->ipi_hash_exact[hash - ipi->ipi_hashmask - 1]));
+}
+
+static inline struct inpcb *
+ii_list_next(const struct inpcb_iterator *ii, struct inpcb *inp)
+{
+ if (ii->hash < 0)
+ return (CK_LIST_NEXT(inp, inp_unconn_list));
+ else if (ii->hash <= ii->ipi->ipi_hashmask)
+ return (CK_LIST_NEXT(inp, inp_hash_wild));
+ else
+ return (CK_LIST_NEXT(inp, inp_hash_exact));
+}
+
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
const struct inpcbinfo *ipi = ii->ipi;
+ const int hashmax = (ipi->ipi_hashmask + 1) * 2;
inp_match_t *match = ii->match;
void *ctx = ii->ctx;
inp_lookup_t lock = ii->lock;
- int hash = ii->hash;
struct inpcb *inp;
if (ii->inp == NULL) { /* First call. */
+ if ((ii->hash = ii->mode) >= 0) {
+ /* Targeted iterators support only the exact hash. */
+ MPASS(ii->hash <= ipi->ipi_hashmask);
+ ii->hash += ipi->ipi_hashmask + 1;
+ }
smr_enter(ipi->ipi_smr);
- /* This is unrolled CK_LIST_FOREACH(). */
- for (inp = II_LIST_FIRST(ipi, hash);
+next_first:
+ /* This is unrolled CK_LIST_FOREACH() over different headers. */
+ for (inp = ii_list_first(ii);
inp != NULL;
- inp = II_LIST_NEXT(inp, hash)) {
+ inp = ii_list_next(ii, inp)) {
if (match != NULL && (match)(inp, ctx) == false)
continue;
if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
break;
else {
smr_enter(ipi->ipi_smr);
- MPASS(inp != II_LIST_FIRST(ipi, hash));
- inp = II_LIST_FIRST(ipi, hash);
+ MPASS(inp != ii_list_first(ii));
+ inp = ii_list_first(ii);
if (inp == NULL)
break;
}
}
- if (inp == NULL)
+ if (inp == NULL) {
+ if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax)
+ goto next_first;
smr_exit(ipi->ipi_smr);
- else
+ } else
ii->inp = inp;
return (inp);
@@ -1623,10 +1655,16 @@ inp_next(struct inpcb_iterator *ii)
smr_enter(ipi->ipi_smr);
restart:
inp = ii->inp;
- II_LOCK_ASSERT(inp, lock);
+ rw_assert(&inp->inp_lock,
+ lock == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED);
next:
- inp = II_LIST_NEXT(inp, hash);
+ inp = ii_list_next(ii, inp);
if (inp == NULL) {
+ if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) {
+ inp_unlock(ii->inp, lock);
+ ii->inp = NULL;
+ goto next_first;
+ }
smr_exit(ipi->ipi_smr);
goto found;
}
@@ -1799,10 +1837,11 @@ in_pcbfree(struct inpcb *inp)
*/
INP_HASH_WLOCK(pcbinfo);
if (inp->inp_flags & INP_INHASHLIST)
- in_pcbremhash_locked(inp);
+ in_pcbremhash(inp);
+ else
+ CK_LIST_REMOVE(inp, inp_unconn_list);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
pcbinfo->ipi_count--;
- CK_LIST_REMOVE(inp, inp_list);
INP_HASH_WUNLOCK(pcbinfo);
#ifdef RATELIMIT
@@ -1882,8 +1921,13 @@ in_pcbdrop(struct inpcb *inp)
INP_WLOCK_ASSERT(inp);
inp->inp_flags |= INP_DROPPED;
- if (inp->inp_flags & INP_INHASHLIST)
+ if (inp->inp_flags & INP_INHASHLIST) {
+ INP_HASH_WLOCK(inp->inp_pcbinfo);
in_pcbremhash(inp);
+ CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+ inp_unconn_list);
+ INP_HASH_WUNLOCK(inp->inp_pcbinfo);
+ }
}
#ifdef INET
@@ -2693,6 +2737,8 @@ in_pcbinshash(struct inpcb *inp)
inp->inp_smr = SMR_SEQ_INVALID;
}
+ CK_LIST_REMOVE(inp, inp_unconn_list);
+
if (connected)
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
else {
@@ -2710,7 +2756,7 @@ in_pcbinshash(struct inpcb *inp)
}
void
-in_pcbremhash_locked(struct inpcb *inp)
+in_pcbremhash(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
@@ -2737,14 +2783,6 @@ in_pcbremhash_locked(struct inpcb *inp)
inp->inp_flags &= ~INP_INHASHLIST;
}
-static void
-in_pcbremhash(struct inpcb *inp)
-{
- INP_HASH_WLOCK(inp->inp_pcbinfo);
- in_pcbremhash_locked(inp);
- INP_HASH_WUNLOCK(inp->inp_pcbinfo);
-}
-
/*
* Move PCB to the proper hash bucket when { faddr, fport } have been
* changed. NOTE: This does not handle the case of the lport changing (the
@@ -2787,15 +2825,12 @@ in_pcbrehash(struct inpcb *inp)
* When rehashing, the caller must ensure that either the new or the old
* foreign address was unspecified.
*/
- if (connected)
- CK_LIST_REMOVE(inp, inp_hash_wild);
- else
- CK_LIST_REMOVE(inp, inp_hash_exact);
-
if (connected) {
+ CK_LIST_REMOVE(inp, inp_hash_wild);
head = &pcbinfo->ipi_hash_exact[hash];
CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
} else {
+ CK_LIST_REMOVE(inp, inp_hash_exact);
head = &pcbinfo->ipi_hash_wild[hash];
CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
}
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index e76e3fa8e382..592d951c018f 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -321,10 +321,15 @@ CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
* Almost all fields of struct inpcb are static after creation or protected by
* a per-inpcb rwlock, inp_lock.
*
- * A inpcb database is indexed by addresses/ports hash as well as list of
- * all pcbs that belong to a certain proto. Database lookups or list traversals
- * are be performed inside SMR section. Once desired PCB is found its own
- * lock is to be obtained and SMR section exited.
+ * A inpcb database consists of two hash tables: one for connected pcbs and the
+ * other for wildcard-bound pcbs. The newborn pcbs reside on the unconnected
+ * list. Although a pcb can be on either of these three lists, we can't share
+ * the linkage pointers because unlocked readers might be iterating over them.
+ * The only thing that can be unionized is the load-balance table and exact
+ * hash, as a pcb can never participate in both tables through its entire life
+ * time. Database lookups or list traversals are to be performed inside SMR
+ * section. Once desired PCB is found its own lock is to be obtained and SMR
+ * section exited.
*
* Key:
* (c) - Constant after initialization
@@ -350,14 +355,13 @@ struct icmp6_filter;
struct inpcbpolicy;
struct m_snd_tag;
struct inpcb {
- /* Cache line #1 (amd64) */
union {
- CK_LIST_ENTRY(inpcb) inp_hash_exact; /* hash table linkage */
- LIST_ENTRY(inpcb) inp_lbgroup_list; /* lb group list */
+ CK_LIST_ENTRY(inpcb) inp_hash_exact;
+ LIST_ENTRY(inpcb) inp_lbgroup_list;
};
- CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */
+ CK_LIST_ENTRY(inpcb) inp_hash_wild;
+ CK_LIST_ENTRY(inpcb) inp_unconn_list;
struct rwlock inp_lock;
- /* Cache line #2 (amd64) */
#define inp_start_zero inp_refcount
#define inp_zero_size (sizeof(struct inpcb) - \
offsetof(struct inpcb, inp_start_zero))
@@ -412,7 +416,6 @@ struct inpcb {
struct route inp_route;
struct route_in6 inp_route6;
};
- CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */
};
#define inp_vnet inp_pcbinfo->ipi_vnet
@@ -431,7 +434,6 @@ struct inpcb {
* (h) Locked by ipi_hash_lock
*/
struct inpcbinfo {
- struct inpcbhead ipi_listhead; /* (r:e/w:h) */
u_int ipi_count; /* (h) */
/*
@@ -460,6 +462,7 @@ struct inpcbinfo {
* address, and "wild" holds the rest.
*/
struct mtx ipi_hash_lock;
+ struct inpcbhead ipi_list_unconn; /* (r:e/w:h) */
struct inpcbhead *ipi_hash_exact; /* (r:e/w:h) */
struct inpcbhead *ipi_hash_wild; /* (r:e/w:h) */
u_long ipi_hashmask; /* (c) */
@@ -670,14 +673,26 @@ int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
int (*ctloutput_set)(struct inpcb *, struct sockopt *));
#endif
+/*
+ * struct inpcb_iterator is located on the stack of a function that uses
+ * inp_next(). The caller shall initialize the const members before first
+ * invocation of inp_next(). After that, until the iterator finishes the
+ * caller is supposed to only read 'inp' until it reads NULL. Some members
+ * have constness commented out for convenience of callers, that may reuse
+ * the iterator after it finishes.
+ * (c) - caller
+ * (n) - inp_next()
+ */
typedef bool inp_match_t(const struct inpcb *, void *);
struct inpcb_iterator {
const struct inpcbinfo *ipi;
- struct inpcb *inp;
- inp_match_t *match;
- void *ctx;
- int hash;
+ struct inpcb *inp; /* c:r, n:rw */
+ /* const */ inp_match_t *match;
+ /* const */ void *ctx;
+ int hash; /* n:rw */
+ /* const */ int mode;
#define INP_ALL_LIST -1
+#define INP_UNCONN_LIST -2
const inp_lookup_t lock;
};
@@ -686,7 +701,7 @@ struct inpcb_iterator {
{ \
.ipi = (_ipi), \
.lock = (_lock), \
- .hash = INP_ALL_LIST, \
+ .mode = INP_ALL_LIST, \
.match = (_match), \
.ctx = (_ctx), \
}
@@ -694,7 +709,7 @@ struct inpcb_iterator {
{ \
.ipi = (_ipi), \
.lock = (_lock), \
- .hash = INP_ALL_LIST, \
+ .mode = INP_ALL_LIST, \
}
struct inpcb *inp_next(struct inpcb_iterator *);
diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h
index 7e8a1626ab40..7a5c489f26d7 100644
--- a/sys/netinet/in_pcb_var.h
+++ b/sys/netinet/in_pcb_var.h
@@ -57,7 +57,7 @@ struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short,
int, int, struct ucred *);
int in_pcbinshash(struct inpcb *);
void in_pcbrehash(struct inpcb *);
-void in_pcbremhash_locked(struct inpcb *);
+void in_pcbremhash(struct inpcb *);
/*
* Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index b0a75127b124..07c436a1f2e0 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -2841,9 +2841,12 @@ db_print_bblog_state(int state)
static void
db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog,
- bool show_inpcb)
+ bool show_inpcb, bool only_locked)
{
+ if (only_locked && tp->t_inpcb.inp_lock.rw_lock == RW_UNLOCKED)
+ return;
+
db_print_indent(indent);
db_printf("%s at %p\n", name, tp);
@@ -2987,14 +2990,13 @@ DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
show_bblog = strchr(modif, 'b') != NULL;
show_inpcb = strchr(modif, 'i') != NULL;
tp = (struct tcpcb *)addr;
- db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
+ db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb, false);
}
DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs)
{
VNET_ITERATOR_DECL(vnet_iter);
struct inpcb *inp;
- struct tcpcb *tp;
bool only_locked, show_bblog, show_inpcb;
only_locked = strchr(modif, 'l') != NULL;
@@ -3002,18 +3004,23 @@ DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs)
show_inpcb = strchr(modif, 'i') != NULL;
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
- if (only_locked &&
- inp->inp_lock.rw_lock == RW_UNLOCKED)
- continue;
- tp = intotcpcb(inp);
- db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
+ for (u_int i = 0; i <= V_tcbinfo.ipi_porthashmask; i++)
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_porthashbase[i],
+ inp_portlist) {
+ db_print_tcpcb(intotcpcb(inp), "tcpcb", 0,
+ show_bblog, show_inpcb, only_locked);
+ if (db_pager_quit)
+ goto break_hash;
+ }
+break_hash:
+ CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_list_unconn,
+ inp_unconn_list) {
+ db_print_tcpcb(intotcpcb(inp), "tcpcb", 0,
+ show_bblog, show_inpcb, only_locked);
if (db_pager_quit)
break;
}
CURVNET_RESTORE();
- if (db_pager_quit)
- break;
}
}
#endif
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 2d6e860a72ba..8a644f96f72e 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -508,7 +508,9 @@ in6_pcbdisconnect(struct inpcb *inp)
KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
("%s: inp %p was already disconnected", __func__, inp));
- in_pcbremhash_locked(inp);
+ in_pcbremhash(inp);
+ CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+ inp_unconn_list);
if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
/* See the comment in in_pcbinshash(). */