git: ac5b9628002c - main - inpcb: retire the inpcb global list

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Sun, 12 Apr 2026 18:35:32 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=ac5b9628002c7c97929984eb578918077d564be4

commit ac5b9628002c7c97929984eb578918077d564be4
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2026-04-12 18:31:09 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2026-04-12 18:31:09 +0000

    inpcb: retire the inpcb global list
    
    The iteration over all pcbs is possible without the global list. The
    newborn inpcbs are put on a global list of unconnected inpcbs, then after
    connect(2) or bind(2) they move to respective hash slot list.
    
    This adds a bit of complexity to inp_next(), but the storage scheme is
    actually simplified.
    
    One potential problem before this change was that a couple of pcbs fall
    into the same hash slot and are linked A->B there, but they also sit next
    to each other in the global list, linked as B->A.  This can deadlock of
    course.  The problem was never observed in the wild, but I was able to
    instrument it with lots of effort: just few pcbs in the system, hash size
    reduced down to 2 and a lot of repetitive calls into two kinds of
    iterators.
    
    However the main motivation is not the above problem, but make a step
    towards splitting the big hash lock into per-slot locks.
    
    Differential Revision:  https://reviews.freebsd.org/D55967
---
 sys/netinet/in_pcb.c     | 125 ++++++++++++++++++++++++++++++-----------------
 sys/netinet/in_pcb.h     |  49 ++++++++++++-------
 sys/netinet/in_pcb_var.h |   2 +-
 sys/netinet/tcp_usrreq.c |  29 ++++++-----
 sys/netinet6/in6_pcb.c   |   4 +-
 5 files changed, 134 insertions(+), 75 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index d60c75ab45b5..cb5b7fa274f6 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -240,8 +240,6 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
     "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
 #endif
 
-static void in_pcbremhash(struct inpcb *);
-
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
@@ -563,7 +561,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
-	CK_LIST_INIT(&pcbinfo->ipi_listhead);
+	CK_LIST_INIT(&pcbinfo->ipi_list_unconn);
 	pcbinfo->ipi_count = 0;
 
 	ha.size = hash_nelements;
@@ -698,7 +696,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 	INP_HASH_WLOCK(pcbinfo);
 	pcbinfo->ipi_count++;
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
-	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
+	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list);
 	INP_HASH_WUNLOCK(pcbinfo);
 	so->so_pcb = inp;
 
@@ -1429,7 +1427,9 @@ in_pcbdisconnect(struct inpcb *inp)
 	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
 	    ("%s: inp %p was already disconnected", __func__, inp));
 
-	in_pcbremhash_locked(inp);
+	in_pcbremhash(inp);
+	CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+	    inp_unconn_list);
 
 	if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
 		/* See the comment in in_pcbinshash(). */
@@ -1552,8 +1552,17 @@ inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
  *
  * - Iterator can have either write-lock or read-lock semantics, that can not
  *   be changed later.
- * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
- *   a single hash slot.  Note: only rip_input() does the latter.
+ * - Iterator has three modes of operation, defined by value of .hash member
+ *   on the first call:
+ *   - .hash = INP_ALL_LIST: the iterator will go through the unconnected
+ *     list, then all wildcard hash slots and then all exact hash slots.
+ *   - .hash = INP_UNCONN_LIST: the iterator will go through the list of
+ *     unconnected pcbs only.
+ *   - .hash initialized with an arbitrary positive value: iterator will go
+ *     through this exact hash slot only.
+ *   Note: only rip_input() and sysctl_setsockopt() use the latter.
+ *   The interface may be extended for iteration over single wildcard hash
+ *   slot, but there is no use case for that today.
  * - Iterator may have optional bool matching function.  The matching function
  *   will be executed for each inpcb in the SMR context, so it can not acquire
  *   locks and can safely access only immutable fields of inpcb.
@@ -1571,49 +1580,72 @@ inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
  * - Removed entries won't stop traversal as long as they are not added to
  *   a different list. This is violated by in_pcbrehash().
  */
-#define	II_LIST_FIRST(ipi, hash)					\
-		(((hash) == INP_ALL_LIST) ?				\
-		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
-		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
-#define	II_LIST_NEXT(inp, hash)						\
-		(((hash) == INP_ALL_LIST) ?				\
-		    CK_LIST_NEXT((inp), inp_list) :			\
-		    CK_LIST_NEXT((inp), inp_hash_exact))
-#define	II_LOCK_ASSERT(inp, lock)					\
-		rw_assert(&(inp)->inp_lock,				\
-		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
+static inline struct inpcb *
+ii_list_first(const struct inpcb_iterator *ii)
+{
+	const struct inpcbinfo *ipi = ii->ipi;
+	const int hash = ii->hash;
+
+	if (hash < 0)
+		return (CK_LIST_FIRST(&ipi->ipi_list_unconn));
+	else if (hash <= ipi->ipi_hashmask)
+		return (CK_LIST_FIRST(&ipi->ipi_hash_wild[hash]));
+	else
+		return (CK_LIST_FIRST(
+		    &ipi->ipi_hash_exact[hash - ipi->ipi_hashmask - 1]));
+}
+
+static inline struct inpcb *
+ii_list_next(const struct inpcb_iterator *ii, struct inpcb *inp)
+{
+	if (ii->hash < 0)
+		return (CK_LIST_NEXT(inp, inp_unconn_list));
+	else if (ii->hash <= ii->ipi->ipi_hashmask)
+		return (CK_LIST_NEXT(inp, inp_hash_wild));
+	else
+		return (CK_LIST_NEXT(inp, inp_hash_exact));
+}
+
 struct inpcb *
 inp_next(struct inpcb_iterator *ii)
 {
 	const struct inpcbinfo *ipi = ii->ipi;
+	const int hashmax = (ipi->ipi_hashmask + 1) * 2;
 	inp_match_t *match = ii->match;
 	void *ctx = ii->ctx;
 	inp_lookup_t lock = ii->lock;
-	int hash = ii->hash;
 	struct inpcb *inp;
 
 	if (ii->inp == NULL) {		/* First call. */
+		if ((ii->hash = ii->mode) >= 0) {
+			/* Targeted iterators support only the exact hash. */
+			MPASS(ii->hash <= ipi->ipi_hashmask);
+			ii->hash += ipi->ipi_hashmask + 1;
+		}
 		smr_enter(ipi->ipi_smr);
-		/* This is unrolled CK_LIST_FOREACH(). */
-		for (inp = II_LIST_FIRST(ipi, hash);
+next_first:
+		/* This is unrolled CK_LIST_FOREACH() over different headers. */
+		for (inp = ii_list_first(ii);
 		    inp != NULL;
-		    inp = II_LIST_NEXT(inp, hash)) {
+		    inp = ii_list_next(ii, inp)) {
 			if (match != NULL && (match)(inp, ctx) == false)
 				continue;
 			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
 				break;
 			else {
 				smr_enter(ipi->ipi_smr);
-				MPASS(inp != II_LIST_FIRST(ipi, hash));
-				inp = II_LIST_FIRST(ipi, hash);
+				MPASS(inp != ii_list_first(ii));
+				inp = ii_list_first(ii);
 				if (inp == NULL)
 					break;
 			}
 		}
 
-		if (inp == NULL)
+		if (inp == NULL) {
+			if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax)
+				goto next_first;
 			smr_exit(ipi->ipi_smr);
-		else
+		} else
 			ii->inp = inp;
 
 		return (inp);
@@ -1623,10 +1655,16 @@ inp_next(struct inpcb_iterator *ii)
 	smr_enter(ipi->ipi_smr);
 restart:
 	inp = ii->inp;
-	II_LOCK_ASSERT(inp, lock);
+	rw_assert(&inp->inp_lock,
+	    lock == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED);
 next:
-	inp = II_LIST_NEXT(inp, hash);
+	inp = ii_list_next(ii, inp);
 	if (inp == NULL) {
+		if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) {
+			inp_unlock(ii->inp, lock);
+			ii->inp = NULL;
+			goto next_first;
+		}
 		smr_exit(ipi->ipi_smr);
 		goto found;
 	}
@@ -1799,10 +1837,11 @@ in_pcbfree(struct inpcb *inp)
 	 */
 	INP_HASH_WLOCK(pcbinfo);
 	if (inp->inp_flags & INP_INHASHLIST)
-		in_pcbremhash_locked(inp);
+		in_pcbremhash(inp);
+	else
+		CK_LIST_REMOVE(inp, inp_unconn_list);
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	pcbinfo->ipi_count--;
-	CK_LIST_REMOVE(inp, inp_list);
 	INP_HASH_WUNLOCK(pcbinfo);
 
 #ifdef RATELIMIT
@@ -1882,8 +1921,13 @@ in_pcbdrop(struct inpcb *inp)
 	INP_WLOCK_ASSERT(inp);
 
 	inp->inp_flags |= INP_DROPPED;
-	if (inp->inp_flags & INP_INHASHLIST)
+	if (inp->inp_flags & INP_INHASHLIST) {
+		INP_HASH_WLOCK(inp->inp_pcbinfo);
 		in_pcbremhash(inp);
+		CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+		    inp_unconn_list);
+		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
+	}
 }
 
 #ifdef INET
@@ -2693,6 +2737,8 @@ in_pcbinshash(struct inpcb *inp)
 		inp->inp_smr = SMR_SEQ_INVALID;
 	}
 
+	CK_LIST_REMOVE(inp, inp_unconn_list);
+
 	if (connected)
 		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
 	else {
@@ -2710,7 +2756,7 @@ in_pcbinshash(struct inpcb *inp)
 }
 
 void
-in_pcbremhash_locked(struct inpcb *inp)
+in_pcbremhash(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
@@ -2737,14 +2783,6 @@ in_pcbremhash_locked(struct inpcb *inp)
 	inp->inp_flags &= ~INP_INHASHLIST;
 }
 
-static void
-in_pcbremhash(struct inpcb *inp)
-{
-	INP_HASH_WLOCK(inp->inp_pcbinfo);
-	in_pcbremhash_locked(inp);
-	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
-}
-
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have  been
  * changed. NOTE: This does not handle the case of the lport changing (the
@@ -2787,15 +2825,12 @@ in_pcbrehash(struct inpcb *inp)
 	 * When rehashing, the caller must ensure that either the new or the old
 	 * foreign address was unspecified.
 	 */
-	if (connected)
-		CK_LIST_REMOVE(inp, inp_hash_wild);
-	else
-		CK_LIST_REMOVE(inp, inp_hash_exact);
-
 	if (connected) {
+		CK_LIST_REMOVE(inp, inp_hash_wild);
 		head = &pcbinfo->ipi_hash_exact[hash];
 		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
 	} else {
+		CK_LIST_REMOVE(inp, inp_hash_exact);
 		head = &pcbinfo->ipi_hash_wild[hash];
 		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
 	}
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index e76e3fa8e382..592d951c018f 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -321,10 +321,15 @@ CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
  * Almost all fields of struct inpcb are static after creation or protected by
  * a per-inpcb rwlock, inp_lock.
  *
- * A inpcb database is indexed by addresses/ports hash as well as list of
- * all pcbs that belong to a certain proto. Database lookups or list traversals
- * are be performed inside SMR section. Once desired PCB is found its own
- * lock is to be obtained and SMR section exited.
+ * A inpcb database consists of two hash tables: one for connected pcbs and the
+ * other for wildcard-bound pcbs.  The newborn pcbs reside on the unconnected
+ * list.  Although a pcb can be on either of these three lists, we can't share
+ * the linkage pointers because unlocked readers might be iterating over them.
+ * The only thing that can be unionized is the load-balance table and exact
+ * hash, as a pcb can never participate in both tables through its entire life
+ * time.  Database lookups or list traversals are to be performed inside SMR
+ * section.  Once desired PCB is found its own lock is to be obtained and SMR
+ * section exited.
  *
  * Key:
  * (c) - Constant after initialization
@@ -350,14 +355,13 @@ struct icmp6_filter;
 struct inpcbpolicy;
 struct m_snd_tag;
 struct inpcb {
-	/* Cache line #1 (amd64) */
 	union {
-		CK_LIST_ENTRY(inpcb) inp_hash_exact;	/* hash table linkage */
-		LIST_ENTRY(inpcb) inp_lbgroup_list;	/* lb group list */
+		CK_LIST_ENTRY(inpcb)	inp_hash_exact;
+		LIST_ENTRY(inpcb)	inp_lbgroup_list;
 	};
-	CK_LIST_ENTRY(inpcb) inp_hash_wild;	/* hash table linkage */
+	CK_LIST_ENTRY(inpcb)	inp_hash_wild;
+	CK_LIST_ENTRY(inpcb)	inp_unconn_list;
 	struct rwlock	inp_lock;
-	/* Cache line #2 (amd64) */
 #define	inp_start_zero	inp_refcount
 #define	inp_zero_size	(sizeof(struct inpcb) - \
 			    offsetof(struct inpcb, inp_start_zero))
@@ -412,7 +416,6 @@ struct inpcb {
 		struct route inp_route;
 		struct route_in6 inp_route6;
 	};
-	CK_LIST_ENTRY(inpcb) inp_list;	/* (r:e/w:p) all PCBs for proto */
 };
 
 #define	inp_vnet	inp_pcbinfo->ipi_vnet
@@ -431,7 +434,6 @@ struct inpcb {
  * (h) Locked by ipi_hash_lock
  */
 struct inpcbinfo {
-	struct inpcbhead	 ipi_listhead;		/* (r:e/w:h) */
 	u_int			 ipi_count;		/* (h) */
 
 	/*
@@ -460,6 +462,7 @@ struct inpcbinfo {
 	 * address, and "wild" holds the rest.
 	 */
 	struct mtx		 ipi_hash_lock;
+	struct inpcbhead	 ipi_list_unconn;	/* (r:e/w:h) */
 	struct inpcbhead 	*ipi_hash_exact;	/* (r:e/w:h) */
 	struct inpcbhead 	*ipi_hash_wild;		/* (r:e/w:h) */
 	u_long			 ipi_hashmask;		/* (c) */
@@ -670,14 +673,26 @@ int	sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
 	    int (*ctloutput_set)(struct inpcb *, struct sockopt *));
 #endif
 
+/*
+ * struct inpcb_iterator is located on the stack of a function that uses
+ * inp_next().  The caller shall initialize the const members before first
+ * invocation of inp_next().  After that, until the iterator finishes the
+ * caller is supposed to only read 'inp' until it reads NULL.  Some members
+ * have constness commented out for convenience of callers, that may reuse
+ * the iterator after it finishes.
+ * (c) - caller
+ * (n) - inp_next()
+ */
 typedef bool inp_match_t(const struct inpcb *, void *);
 struct inpcb_iterator {
 	const struct inpcbinfo	*ipi;
-	struct inpcb		*inp;
-	inp_match_t		*match;
-	void			*ctx;
-	int			hash;
+	struct inpcb		*inp;	/* c:r, n:rw */
+	/* const */ inp_match_t	*match;
+	/* const */ void	*ctx;
+	int			hash;	/* n:rw */
+	/* const */ int		mode;
 #define	INP_ALL_LIST		-1
+#define	INP_UNCONN_LIST		-2
 	const inp_lookup_t	lock;
 };
 
@@ -686,7 +701,7 @@ struct inpcb_iterator {
 	{						\
 		.ipi = (_ipi),				\
 		.lock = (_lock),			\
-		.hash = INP_ALL_LIST,			\
+		.mode = INP_ALL_LIST,			\
 		.match = (_match),			\
 		.ctx = (_ctx),				\
 	}
@@ -694,7 +709,7 @@ struct inpcb_iterator {
 	{						\
 		.ipi = (_ipi),				\
 		.lock = (_lock),			\
-		.hash = INP_ALL_LIST,			\
+		.mode = INP_ALL_LIST,			\
 	}
 
 struct inpcb *inp_next(struct inpcb_iterator *);
diff --git a/sys/netinet/in_pcb_var.h b/sys/netinet/in_pcb_var.h
index 7e8a1626ab40..7a5c489f26d7 100644
--- a/sys/netinet/in_pcb_var.h
+++ b/sys/netinet/in_pcb_var.h
@@ -57,7 +57,7 @@ struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short,
 	    int, int, struct ucred *);
 int     in_pcbinshash(struct inpcb *);
 void    in_pcbrehash(struct inpcb *);
-void    in_pcbremhash_locked(struct inpcb *);
+void    in_pcbremhash(struct inpcb *);
 
 /*
  * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index b0a75127b124..07c436a1f2e0 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -2841,9 +2841,12 @@ db_print_bblog_state(int state)
 
 static void
 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog,
-    bool show_inpcb)
+    bool show_inpcb, bool only_locked)
 {
 
+	if (only_locked && tp->t_inpcb.inp_lock.rw_lock == RW_UNLOCKED)
+		return;
+
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, tp);
 
@@ -2987,14 +2990,13 @@ DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
 	show_bblog = strchr(modif, 'b') != NULL;
 	show_inpcb = strchr(modif, 'i') != NULL;
 	tp = (struct tcpcb *)addr;
-	db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
+	db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb, false);
 }
 
 DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct inpcb *inp;
-	struct tcpcb *tp;
 	bool only_locked, show_bblog, show_inpcb;
 
 	only_locked = strchr(modif, 'l') != NULL;
@@ -3002,18 +3004,23 @@ DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs)
 	show_inpcb = strchr(modif, 'i') != NULL;
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
-		CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
-			if (only_locked &&
-			    inp->inp_lock.rw_lock == RW_UNLOCKED)
-				continue;
-			tp = intotcpcb(inp);
-			db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
+		for (u_int i = 0; i <= V_tcbinfo.ipi_porthashmask; i++)
+			CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_porthashbase[i],
+			    inp_portlist) {
+				db_print_tcpcb(intotcpcb(inp), "tcpcb", 0,
+				    show_bblog, show_inpcb, only_locked);
+				if (db_pager_quit)
+					goto break_hash;
+			}
+break_hash:
+		CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_list_unconn,
+		    inp_unconn_list) {
+			db_print_tcpcb(intotcpcb(inp), "tcpcb", 0,
+			    show_bblog, show_inpcb, only_locked);
 			if (db_pager_quit)
 				break;
 		}
 		CURVNET_RESTORE();
-		if (db_pager_quit)
-			break;
 	}
 }
 #endif
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 2d6e860a72ba..8a644f96f72e 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -508,7 +508,9 @@ in6_pcbdisconnect(struct inpcb *inp)
 	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
 	    ("%s: inp %p was already disconnected", __func__, inp));
 
-	in_pcbremhash_locked(inp);
+	in_pcbremhash(inp);
+	CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp,
+	    inp_unconn_list);
 
 	if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
 		/* See the comment in in_pcbinshash(). */