git: de2d47842e88 - main - SMR protection for inpcbs

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Thu, 02 Dec 2021 18:49:06 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=de2d47842e880281da07f2589b9ec558b42c09c1

commit de2d47842e880281da07f2589b9ec558b42c09c1
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2021-12-02 18:48:48 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2021-12-02 18:48:48 +0000

    SMR protection for inpcbs
    
    With introduction of epoch(9) synchronization to network stack the
    inpcb database became protected by the network epoch together with
    static network data (interfaces, addresses, etc).  However, inpcb
    aren't static in nature, they are created and destroyed all the
    time, which creates some traffic on the epoch(9) garbage collector.
    
    Fairly new feature of uma(9) - Safe Memory Reclamation allows to
    safely free memory in page-sized batches, with virtually zero
    overhead compared to uma_zfree().  However, unlike epoch(9), it
    puts stricter requirement on the access to the protected memory,
    needing the critical(9) section to access it.  Details:
    
    - The database is already build on CK lists, thanks to epoch(9).
    - For write access nothing is changed.
    - For a lookup in the database SMR section is now required.
      Once the desired inpcb is found we need to transition from SMR
      section to r/w lock on the inpcb itself, with a check that inpcb
      isn't yet freed.  This requires some compexity, since SMR section
      itself is a critical(9) section.  The complexity is hidden from
      KPI users in inp_smr_lock().
    - For a inpcb list traversal (a pcblist sysctl, or broadcast
      notification) also a new KPI is provided, that hides internals of
      the database - inp_next(struct inp_iterator *).
    
    Reviewed by:            rrs
    Differential revision:  https://reviews.freebsd.org/D33022
---
 sys/kern/subr_witness.c    |   4 +-
 sys/kern/uipc_ktls.c       |  12 +-
 sys/netinet/in_pcb.c       | 672 ++++++++++++++++++++++++++-------------------
 sys/netinet/in_pcb.h       | 227 ++++++---------
 sys/netinet/in_pcb_var.h   |   7 +
 sys/netinet/ip_divert.c    |  75 +++--
 sys/netinet/ip_gre.c       |  18 +-
 sys/netinet/raw_ip.c       | 288 +++++++++----------
 sys/netinet/tcp_hpts.c     |  24 +-
 sys/netinet/tcp_input.c    |   2 -
 sys/netinet/tcp_lro.c      |   3 +-
 sys/netinet/tcp_subr.c     |  74 ++---
 sys/netinet/tcp_var.h      |   2 -
 sys/netinet/udp_usrreq.c   | 303 ++++++++++----------
 sys/netinet/udp_var.h      |  10 -
 sys/netinet6/icmp6.c       | 171 +++++-------
 sys/netinet6/in6_pcb.c     | 112 ++++----
 sys/netinet6/ip6_gre.c     |  19 +-
 sys/netinet6/raw_ip6.c     | 169 +++++-------
 sys/netinet6/udp6_usrreq.c | 279 +++++++++----------
 20 files changed, 1147 insertions(+), 1324 deletions(-)

diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
index e3fcbbb31311..db82450f0570 100644
--- a/sys/kern/subr_witness.c
+++ b/sys/kern/subr_witness.c
@@ -564,15 +564,15 @@ static struct witness_order_list_entry order_lists[] = {
 	/*
 	 * UDP/IP
 	 */
-	{ "udp", &lock_class_mtx_sleep },
 	{ "udpinp", &lock_class_rw },
+	{ "udp", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * TCP/IP
 	 */
-	{ "tcp", &lock_class_mtx_sleep },
 	{ "tcpinp", &lock_class_rw },
+	{ "tcp", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 07e5a4c8399f..aee9c2374e0b 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -854,10 +854,6 @@ ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
 
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
-	if (inp->inp_flags2 & INP_FREED) {
-		INP_WUNLOCK(inp);
-		return (ECONNRESET);
-	}
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
@@ -909,10 +905,6 @@ ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
 	int error;
 
 	INP_RLOCK(inp);
-	if (inp->inp_flags2 & INP_FREED) {
-		INP_RUNLOCK(inp);
-		return (ECONNRESET);
-	}
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
@@ -2716,8 +2708,7 @@ ktls_disable_ifnet_help(void *context, int pending __unused)
 	INP_WLOCK(inp);
 	so = inp->inp_socket;
 	MPASS(so != NULL);
-	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
-	    (inp->inp_flags2 & INP_FREED)) {
+	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		goto out;
 	}
 
@@ -2729,7 +2720,6 @@ ktls_disable_ifnet_help(void *context, int pending __unused)
 		counter_u64_add(ktls_ifnet_disable_ok, 1);
 		/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
 		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
-		    (inp->inp_flags2 & INP_FREED) == 0 &&
 		    (tp = intotcpcb(inp)) != NULL &&
 		    tp->t_fb->tfb_hwtls_change != NULL)
 			(*tp->t_fb->tfb_hwtls_change)(tp, 0);
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index ecca470805d2..ac8c0d3e368a 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -113,6 +113,7 @@ __FBSDID("$FreeBSD$");
 
 #define	INPCBLBGROUP_SIZMIN	8
 #define	INPCBLBGROUP_SIZMAX	256
+#define	INP_FREED	0x00000200	/* See in_pcb.h. */
 
 static struct callout	ipport_tick_callout;
 
@@ -145,7 +146,6 @@ VNET_DEFINE_STATIC(int, ipport_tcplastcount);
 
 #define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
 
-static void	in_pcbremlists(struct inpcb *inp);
 #ifdef INET
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
@@ -514,38 +514,43 @@ inpcb_fini(void *mem, int size)
 	INP_LOCK_DESTROY(inp);
 }
 
+/* Make sure it is safe to use hashinit(9) on CK_LIST. */
+CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
+
 /*
  * Initialize an inpcbinfo -- we should be able to reduce the number of
  * arguments in time.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
-    struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
-    char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
+    u_int hash_nelements, int porthash_nelements, char *inpcbzone_name,
+    uma_init inpcbzone_init)
 {
 
-	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
-
-	INP_INFO_LOCK_INIT(pcbinfo, name);
-	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
-	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
+	mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF);
+	mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF);
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
-	pcbinfo->ipi_listhead = listhead;
-	CK_LIST_INIT(pcbinfo->ipi_listhead);
+	CK_LIST_INIT(&pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
+	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_lbgrouphashmask);
 	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
-	    NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
+	    NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR,
+	    UMA_ZONE_SMR);
 	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
 	uma_zone_set_warning(pcbinfo->ipi_zone,
 	    "kern.ipc.maxsockets limit reached");
+	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
+	pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name,
+	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr);
 }
 
 /*
@@ -564,9 +569,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
 	    pcbinfo->ipi_lbgrouphashmask);
 	uma_zdestroy(pcbinfo->ipi_zone);
-	INP_LIST_LOCK_DESTROY(pcbinfo);
-	INP_HASH_LOCK_DESTROY(pcbinfo);
-	INP_INFO_LOCK_DESTROY(pcbinfo);
+	mtx_destroy(&pcbinfo->ipi_hash_lock);
+	mtx_destroy(&pcbinfo->ipi_lock);
 }
 
 /*
@@ -580,7 +584,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 	int error;
 
 	error = 0;
-	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
+	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	bzero(&inp->inp_start_zero, inp_zero_size);
@@ -612,33 +616,38 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 	}
-#endif
-	INP_WLOCK(inp);
-	INP_LIST_WLOCK(pcbinfo);
-	CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
-	pcbinfo->ipi_count++;
-	so->so_pcb = (caddr_t)inp;
-#ifdef INET6
 	if (V_ip6_auto_flowlabel)
 		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 #endif
-	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
-	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */
-
 	/*
 	 * Routes in inpcb's can cache L2 as well; they are guaranteed
 	 * to be cleaned up.
 	 */
 	inp->inp_route.ro_flags = RT_LLE_CACHE;
-	INP_LIST_WUNLOCK(pcbinfo);
+#ifdef TCPHPTS
+	/*
+	 * If using hpts lets drop a random number in so
+	 * not all new connections fall on the same CPU.
+	 */
+	inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp);
+#endif
+	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
+	INP_WLOCK(inp);
+	INP_INFO_WLOCK(pcbinfo);
+	pcbinfo->ipi_count++;
+	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
+	INP_INFO_WUNLOCK(pcbinfo);
+	so->so_pcb = inp;
+
+	return (0);
+
 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
 out:
-	if (error != 0) {
-		crfree(inp->inp_cred);
-		uma_zfree(pcbinfo->ipi_zone, inp);
-	}
-#endif
+	crfree(inp->inp_cred);
+	uma_zfree_smr(pcbinfo->ipi_zone, inp);
 	return (error);
+#endif
 }
 
 #ifdef INET
@@ -1505,192 +1514,274 @@ in_pcbdetach(struct inpcb *inp)
 }
 
 /*
- * in_pcbref() bumps the reference count on an inpcb in order to maintain
- * stability of an inpcb pointer despite the inpcb lock being released.  This
- * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
- * but where the inpcb lock may already held.
- *
- * in_pcbref() should be used only to provide brief memory stability, and
- * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
- * garbage collect the inpcb if it has been in_pcbfree()'d from another
- * context.  Until in_pcbrele() has returned that the inpcb is still valid,
- * lock and rele are the *only* safe operations that may be performed on the
- * inpcb.
+ * inpcb hash lookups are protected by SMR section.
  *
- * While the inpcb will not be freed, releasing the inpcb lock means that the
- * connection's state may change, so the caller should be careful to
- * revalidate any cached state on reacquiring the lock.  Drop the reference
- * using in_pcbrele().
+ * Once desired pcb has been found, switching from SMR section to a pcb
+ * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
+ * here because SMR is a critical section.
+ * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
  */
-void
-in_pcbref(struct inpcb *inp)
+static inline void
+inp_lock(struct inpcb *inp, const inp_lookup_t lock)
 {
 
-	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+	lock == INPLOOKUP_RLOCKPCB ?
+	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
+}
+
+static inline void
+inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
+{
 
-	refcount_acquire(&inp->inp_refcount);
+	lock == INPLOOKUP_RLOCKPCB ?
+	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
 }
 
-/*
- * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
- * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
- * return a flag indicating whether or not the inpcb remains valid.  If it is
- * valid, we return with the inpcb lock held.
- *
- * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
- * reference on an inpcb.  Historically more work was done here (actually, in
- * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
- * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
- * about memory stability (and continued use of the write lock).
- */
-int
-in_pcbrele_rlocked(struct inpcb *inp)
+static inline int
+inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
 {
-	struct inpcbinfo *pcbinfo;
 
-	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+	return (lock == INPLOOKUP_RLOCKPCB ?
+	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
+}
 
-	INP_RLOCK_ASSERT(inp);
+static inline bool
+in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
+{
 
-	if (refcount_release(&inp->inp_refcount) == 0) {
-		/*
-		 * If the inpcb has been freed, let the caller know, even if
-		 * this isn't the last reference.
-		 */
-		if (inp->inp_flags2 & INP_FREED) {
-			INP_RUNLOCK(inp);
-			return (1);
+	return (lock == INPLOOKUP_RLOCKPCB ?
+	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
+}
+
+bool
+inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
+{
+
+	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
+	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
+
+	if (__predict_true(inp_trylock(inp, lock))) {
+		if (__predict_false(inp->inp_flags & INP_FREED)) {
+			smr_exit(inp->inp_pcbinfo->ipi_smr);
+			inp_unlock(inp, lock);
+			return (false);
 		}
-		return (0);
+		smr_exit(inp->inp_pcbinfo->ipi_smr);
+		return (true);
 	}
 
-	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-#ifdef TCPHPTS
-	if (inp->inp_in_hpts || inp->inp_in_input) {
-		struct tcp_hpts_entry *hpts;
+	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
+		smr_exit(inp->inp_pcbinfo->ipi_smr);
+		inp_lock(inp, lock);
+		if (__predict_false(in_pcbrele(inp, lock)))
+			return (false);
 		/*
-		 * We should not be on the hpts at
-		 * this point in any form. we must
-		 * get the lock to be sure.
+		 * inp acquired through refcount & lock for sure didn't went
+		 * through uma_zfree().  However, it may have already went
+		 * through in_pcbfree() and has another reference, that
+		 * prevented its release by our in_pcbrele().
 		 */
-		hpts = tcp_hpts_lock(inp);
-		if (inp->inp_in_hpts)
-			panic("Hpts:%p inp:%p at free still on hpts",
-			      hpts, inp);
-		mtx_unlock(&hpts->p_mtx);
-		hpts = tcp_input_lock(inp);
-		if (inp->inp_in_input)
-			panic("Hpts:%p inp:%p at free still on input hpts",
-			      hpts, inp);
-		mtx_unlock(&hpts->p_mtx);
+		if (__predict_false(inp->inp_flags & INP_FREED)) {
+			inp_unlock(inp, lock);
+			return (false);
+		}
+		return (true);
+	} else {
+		smr_exit(inp->inp_pcbinfo->ipi_smr);
+		return (false);
 	}
-#endif
-	INP_RUNLOCK(inp);
-	pcbinfo = inp->inp_pcbinfo;
-	uma_zfree(pcbinfo->ipi_zone, inp);
-	return (1);
 }
 
-int
-in_pcbrele_wlocked(struct inpcb *inp)
+/*
+ * inp_next() - inpcb hash/list traversal iterator
+ *
+ * Requires initialized struct inpcb_iterator for context.
+ * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
+ *
+ * - Iterator can have either write-lock or read-lock semantics, that can not
+ *   be changed later.
+ * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
+ *   a single hash slot.  Note: only rip_input() does the latter.
+ * - Iterator may have optional bool matching function.  The matching function
+ *   will be executed for each inpcb in the SMR context, so it can not acquire
+ *   locks and can safely access only immutable fields of inpcb.
+ *
+ * A fresh initialized iterator has NULL inpcb in its context and that
+ * means that inp_next() call would return the very first inpcb on the list
+ * locked with desired semantic.  In all following calls the context pointer
+ * shall hold the current inpcb pointer.  The KPI user is not supposed to
+ * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
+ * and write NULL to its context.  After end of traversal an iterator can be
+ * reused.
+ *
+ * List traversals have the following features/constraints:
+ * - New entries won't be seen, as they are always added to the head of a list.
+ * - Removed entries won't stop traversal as long as they are not added to
+ *   a different list. This is violated by in_pcbrehash().
+ */
+#define	II_LIST_FIRST(ipi, hash)					\
+		(((hash) == INP_ALL_LIST) ?				\
+		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
+		    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
+#define	II_LIST_NEXT(inp, hash)						\
+		(((hash) == INP_ALL_LIST) ?				\
+		    CK_LIST_NEXT((inp), inp_list) :			\
+		    CK_LIST_NEXT((inp), inp_hash))
+#define	II_LOCK_ASSERT(inp, lock)					\
+		rw_assert(&(inp)->inp_lock,				\
+		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
+struct inpcb *
+inp_next(struct inpcb_iterator *ii)
 {
-	struct inpcbinfo *pcbinfo;
+	const struct inpcbinfo *ipi = ii->ipi;
+	inp_match_t *match = ii->match;
+	void *ctx = ii->ctx;
+	inp_lookup_t lock = ii->lock;
+	int hash = ii->hash;
+	struct inpcb *inp;
 
-	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+	if (ii->inp == NULL) {		/* First call. */
+		smr_enter(ipi->ipi_smr);
+		/* This is unrolled CK_LIST_FOREACH(). */
+		for (inp = II_LIST_FIRST(ipi, hash);
+		    inp != NULL;
+		    inp = II_LIST_NEXT(inp, hash)) {
+			if (match != NULL && (match)(inp, ctx) == false)
+				continue;
+			if (__predict_true(inp_smr_lock(inp, lock)))
+				break;
+			else {
+				smr_enter(ipi->ipi_smr);
+				MPASS(inp != II_LIST_FIRST(ipi, hash));
+				inp = II_LIST_FIRST(ipi, hash);
+			}
+		}
 
-	INP_WLOCK_ASSERT(inp);
+		if (inp == NULL)
+			smr_exit(ipi->ipi_smr);
+		else
+			ii->inp = inp;
 
-	if (refcount_release(&inp->inp_refcount) == 0) {
-		/*
-		 * If the inpcb has been freed, let the caller know, even if
-		 * this isn't the last reference.
-		 */
-		if (inp->inp_flags2 & INP_FREED) {
-			INP_WUNLOCK(inp);
-			return (1);
+		return (inp);
+	}
+
+	/* Not a first call. */
+	smr_enter(ipi->ipi_smr);
+restart:
+	inp = ii->inp;
+	II_LOCK_ASSERT(inp, lock);
+next:
+	inp = II_LIST_NEXT(inp, hash);
+	if (inp == NULL) {
+		smr_exit(ipi->ipi_smr);
+		goto found;
+	}
+
+	if (match != NULL && (match)(inp, ctx) == false)
+		goto next;
+
+	if (__predict_true(inp_trylock(inp, lock))) {
+		if (__predict_false(inp->inp_flags & INP_FREED)) {
+			/*
+			 * Entries are never inserted in middle of a list, thus
+			 * as long as we are in SMR, we can continue traversal.
+			 * Jump to 'restart' should yield in the same result,
+			 * but could produce unnecessary looping.  Could this
+			 * looping be unbound?
+			 */
+			inp_unlock(inp, lock);
+			goto next;
+		} else {
+			smr_exit(ipi->ipi_smr);
+			goto found;
 		}
-		return (0);
 	}
 
-	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-#ifdef TCPHPTS
-	if (inp->inp_in_hpts || inp->inp_in_input) {
-		struct tcp_hpts_entry *hpts;
+	/*
+	 * Can't obtain lock immediately, thus going hard.  Once we exit the
+	 * SMR section we can no longer jump to 'next', and our only stable
+	 * anchoring point is ii->inp, which we keep locked for this case, so
+	 * we jump to 'restart'.
+	 */
+	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
+		smr_exit(ipi->ipi_smr);
+		inp_lock(inp, lock);
+		if (__predict_false(in_pcbrele(inp, lock))) {
+			smr_enter(ipi->ipi_smr);
+			goto restart;
+		}
 		/*
-		 * We should not be on the hpts at
-		 * this point in any form. we must
-		 * get the lock to be sure.
+		 * See comment in inp_smr_lock().
 		 */
-		hpts = tcp_hpts_lock(inp);
-		if (inp->inp_in_hpts)
-			panic("Hpts:%p inp:%p at free still on hpts",
-			      hpts, inp);
-		mtx_unlock(&hpts->p_mtx);
-		hpts = tcp_input_lock(inp);
-		if (inp->inp_in_input)
-			panic("Hpts:%p inp:%p at free still on input hpts",
-			      hpts, inp);
-		mtx_unlock(&hpts->p_mtx);
-	}
-#endif
-	INP_WUNLOCK(inp);
-	pcbinfo = inp->inp_pcbinfo;
-	uma_zfree(pcbinfo->ipi_zone, inp);
-	return (1);
+		if (__predict_false(inp->inp_flags & INP_FREED)) {
+			inp_unlock(inp, lock);
+			smr_enter(ipi->ipi_smr);
+			goto restart;
+		}
+	} else
+		goto next;
+
+found:
+	inp_unlock(ii->inp, lock);
+	ii->inp = inp;
+
+	return (ii->inp);
 }
 
-static void
-inpcbport_free(epoch_context_t ctx)
+/*
+ * in_pcbref() bumps the reference count on an inpcb in order to maintain
+ * stability of an inpcb pointer despite the inpcb lock being released or
+ * SMR section exited.
+ *
+ * To free a reference later in_pcbrele_(r|w)locked() must be performed.
+ */
+void
+in_pcbref(struct inpcb *inp)
 {
-	struct inpcbport *phd;
+	u_int old __diagused;
 
-	phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
-	free(phd, M_PCB);
+	old = refcount_acquire(&inp->inp_refcount);
+	KASSERT(old > 0, ("%s: refcount 0", __func__));
 }
 
-static void
-in_pcbfree_deferred(epoch_context_t ctx)
+/*
+ * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
+ * freeing the pcb, if the reference was very last.
+ */
+bool
+in_pcbrele_rlocked(struct inpcb *inp)
 {
-	struct inpcb *inp;
-	int released __unused;
 
-	inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
+	INP_RLOCK_ASSERT(inp);
 
-	INP_WLOCK(inp);
-	CURVNET_SET(inp->inp_vnet);
-#ifdef INET
-	struct ip_moptions *imo = inp->inp_moptions;
-	inp->inp_moptions = NULL;
-#endif
-	/* XXXRW: Do as much as possible here. */
-#if defined(IPSEC) || defined(IPSEC_SUPPORT)
-	if (inp->inp_sp != NULL)
-		ipsec_delete_pcbpolicy(inp);
-#endif
-#ifdef INET6
-	struct ip6_moptions *im6o = NULL;
-	if (inp->inp_vflag & INP_IPV6PROTO) {
-		ip6_freepcbopts(inp->in6p_outputopts);
-		im6o = inp->in6p_moptions;
-		inp->in6p_moptions = NULL;
-	}
-#endif
-	if (inp->inp_options)
-		(void)m_free(inp->inp_options);
-	inp->inp_vflag = 0;
-	crfree(inp->inp_cred);
-#ifdef MAC
-	mac_inpcb_destroy(inp);
-#endif
-	released = in_pcbrele_wlocked(inp);
-	MPASS(released);
-#ifdef INET6
-	ip6_freemoptions(im6o);
-#endif
-#ifdef INET
-	inp_freemoptions(imo);
-#endif
-	CURVNET_RESTORE();
+	if (refcount_release(&inp->inp_refcount) == 0)
+		return (false);
+
+	MPASS(inp->inp_flags & INP_FREED);
+	MPASS(inp->inp_socket == NULL);
+	MPASS(inp->inp_in_hpts == 0);
+	MPASS(inp->inp_in_input == 0);
+	INP_RUNLOCK(inp);
+	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
+	return (true);
+}
+
+bool
+in_pcbrele_wlocked(struct inpcb *inp)
+{
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (refcount_release(&inp->inp_refcount) == 0)
+		return (false);
+
+	MPASS(inp->inp_flags & INP_FREED);
+	MPASS(inp->inp_socket == NULL);
+	MPASS(inp->inp_in_hpts == 0);
+	MPASS(inp->inp_in_input == 0);
+	INP_WUNLOCK(inp);
+	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
+	return (true);
 }
 
 /*
@@ -1698,32 +1789,81 @@ in_pcbfree_deferred(epoch_context_t ctx)
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
- * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
- * work, including removal from global lists, is done in this context, where
- * the pcbinfo lock is held.
+ * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
+ *  Almost all work, including removal from global lists, is done in this
+ * context, where the pcbinfo lock is held.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+#ifdef INET
+	struct ip_moptions *imo;
+#endif
+#ifdef INET6
+	struct ip6_moptions *im6o;
+#endif
 
+	INP_WLOCK_ASSERT(inp);
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-	KASSERT((inp->inp_flags2 & INP_FREED) == 0,
+	KASSERT((inp->inp_flags & INP_FREED) == 0,
 	    ("%s: called twice for pcb %p", __func__, inp));
-	if (inp->inp_flags2 & INP_FREED) {
-		INP_WUNLOCK(inp);
-		return;
+
+	inp->inp_flags |= INP_FREED;
+	INP_INFO_WLOCK(pcbinfo);
+	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+	pcbinfo->ipi_count--;
+	CK_LIST_REMOVE(inp, inp_list);
+	INP_INFO_WUNLOCK(pcbinfo);
+
+	if (inp->inp_flags & INP_INHASHLIST) {
+		struct inpcbport *phd = inp->inp_phd;
+
+		INP_HASH_WLOCK(pcbinfo);
+		/* XXX: Only do if SO_REUSEPORT_LB set? */
+		in_pcbremlbgrouphash(inp);
+
+		CK_LIST_REMOVE(inp, inp_hash);
+		CK_LIST_REMOVE(inp, inp_portlist);
+		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
+			CK_LIST_REMOVE(phd, phd_hash);
+			uma_zfree_smr(pcbinfo->ipi_portzone, phd);
+		}
+		INP_HASH_WUNLOCK(pcbinfo);
+		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 
-	INP_WLOCK_ASSERT(inp);
-	INP_LIST_WLOCK(pcbinfo);
-	in_pcbremlists(inp);
-	INP_LIST_WUNLOCK(pcbinfo);
+	crfree(inp->inp_cred);
 	RO_INVALIDATE_CACHE(&inp->inp_route);
-	/* mark as destruction in progress */
-	inp->inp_flags2 |= INP_FREED;
-	INP_WUNLOCK(inp);
-	NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx);
+#ifdef MAC
+	mac_inpcb_destroy(inp);
+#endif
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+	if (inp->inp_sp != NULL)
+		ipsec_delete_pcbpolicy(inp);
+#endif
+#ifdef INET
+	if (inp->inp_options)
+		(void)m_free(inp->inp_options);
+	imo = inp->inp_moptions;
+#endif
+#ifdef INET6
+	if (inp->inp_vflag & INP_IPV6PROTO) {
+		ip6_freepcbopts(inp->in6p_outputopts);
+		im6o = inp->in6p_moptions;
+	} else
+		im6o = NULL;
+#endif
+
+	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
+		INP_WUNLOCK(inp);
+	}
+#ifdef INET6
+	ip6_freemoptions(im6o);
+#endif
+#ifdef INET
+	inp_freemoptions(imo);
+#endif
 }
 
 /*
@@ -1764,7 +1904,7 @@ in_pcbdrop(struct inpcb *inp)
 		CK_LIST_REMOVE(inp, inp_portlist);
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
-			NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
+			uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
 		}
 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
@@ -1835,7 +1975,7 @@ in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
 	struct inpcb *inp, *inp_temp;
 
 	INP_INFO_WLOCK(pcbinfo);
-	CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
+	CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
@@ -1854,49 +1994,57 @@ in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
+static bool
+inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
+{
+
+	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
+		return (true);
+	else
+		return (false);
+}
+
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
+	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
+	    inp_v4_multi_match, NULL);
 	struct inpcb *inp;
 	struct in_multi *inm;
 	struct in_mfilter *imf;
 	struct ip_moptions *imo;
 
-	INP_INFO_WLOCK(pcbinfo);
-	CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
-		INP_WLOCK(inp);
+	IN_MULTI_LOCK_ASSERT();
+
+	while ((inp = inp_next(&inpi)) != NULL) {
+		INP_WLOCK_ASSERT(inp);
+
 		imo = inp->inp_moptions;
-		if ((inp->inp_vflag & INP_IPV4) &&
-		    imo != NULL) {
-			/*
-			 * Unselect the outgoing interface if it is being
-			 * detached.
-			 */
-			if (imo->imo_multicast_ifp == ifp)
-				imo->imo_multicast_ifp = NULL;
+		/*
+		 * Unselect the outgoing interface if it is being
+		 * detached.
+		 */
+		if (imo->imo_multicast_ifp == ifp)
+			imo->imo_multicast_ifp = NULL;
 
-			/*
-			 * Drop multicast group membership if we joined
-			 * through the interface being detached.
-			 *
-			 * XXX This can all be deferred to an epoch_call
-			 */
+		/*
+		 * Drop multicast group membership if we joined
+		 * through the interface being detached.
+		 *
+		 * XXX This can all be deferred to an epoch_call
+		 */
 restart:
-			IP_MFILTER_FOREACH(imf, &imo->imo_head) {
-				if ((inm = imf->imf_inm) == NULL)
-					continue;
-				if (inm->inm_ifp != ifp)
-					continue;
-				ip_mfilter_remove(&imo->imo_head, imf);
-				IN_MULTI_LOCK_ASSERT();
-				in_leavegroup_locked(inm, NULL);
-				ip_mfilter_free(imf);
-				goto restart;
-			}
+		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
+			if ((inm = imf->imf_inm) == NULL)
+				continue;
+			if (inm->inm_ifp != ifp)
+				continue;
+			ip_mfilter_remove(&imo->imo_head, imf);
+			in_leavegroup_locked(inm, NULL);
+			ip_mfilter_free(imf);
+			goto restart;
 		}
-		INP_WUNLOCK(inp);
 	}
-	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
@@ -1918,7 +2066,6 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
-
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
@@ -2081,8 +2228,9 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
- * that the caller has locked the hash list, and will not perform any further
- * locking or reference operations on either the hash list or the connection.
+ * that the caller has either locked the hash list, which usually happens
+ * for bind(2) operations, or is in SMR section, which happens when sorting
+ * out incoming packets.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
@@ -2223,20 +2371,15 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 {
 	struct inpcb *inp;
 
+	smr_enter(pcbinfo->ipi_smr);
 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
 	if (inp != NULL) {
-		if (lookupflags & INPLOOKUP_WLOCKPCB) {
-			INP_WLOCK(inp);
-		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
-			INP_RLOCK(inp);
-		} else
-			panic("%s: locking bug", __func__);
-		if (__predict_false(inp->inp_flags2 & INP_FREED)) {
-			INP_UNLOCK(inp);
+		if (__predict_false(inp_smr_lock(inp,
+		    (lookupflags & INPLOOKUP_LOCKMASK)) == false))
 			inp = NULL;
-		}
-	}
+	} else
+		smr_exit(pcbinfo->ipi_smr);
 
 	return (inp);
 }
@@ -2331,11 +2474,10 @@ in_pcbinshash(struct inpcb *inp)
 	 * If none exists, malloc one and tack it on.
 	 */
 	if (phd == NULL) {
-		phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
+		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
 		if (phd == NULL) {
 			return (ENOBUFS); /* XXX */
 		}
-		bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
 		phd->phd_port = inp->inp_lport;
 		CK_LIST_INIT(&phd->phd_pcblist);
 		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
@@ -2353,6 +2495,10 @@ in_pcbinshash(struct inpcb *inp)
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
+ *
+ * XXXGL: a race between this function and SMR-protected hash iterator
+ * will lead to iterator traversing a possibly wrong hash list. However,
+ * this race should have been here since change from rwlock to epoch.
  */
 void
 in_pcbrehash(struct inpcb *inp)
@@ -2381,39 +2527,6 @@ in_pcbrehash(struct inpcb *inp)
 	CK_LIST_INSERT_HEAD(head, inp, inp_hash);
 }
 
-/*
- * Remove PCB from various lists.
- */
-static void
-in_pcbremlists(struct inpcb *inp)
-{
-	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
-
-	INP_WLOCK_ASSERT(inp);
*** 3009 LINES SKIPPED ***