git: fec8a8c7cbe4 - main - inpcb: use global UMA zones for protocols

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Mon, 03 Jan 2022 18:22:59 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=fec8a8c7cbe4384c7e61d376f3aa5be5ac895915

commit fec8a8c7cbe4384c7e61d376f3aa5be5ac895915
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2022-01-03 18:15:22 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2022-01-03 18:17:46 +0000

    inpcb: use global UMA zones for protocols
    
    Provide structure inpcbstorage, that holds zones and lock names for
    a protocol.  Initialize it with global protocol init using macro
    INPCBSTORAGE_DEFINE().  Then, at VNET protocol init supply it as
    the main argument to the in_pcbinfo_init().  Each VNET pcbinfo uses
    its private hash, but they all use same zone to allocate and SMR
    section to synchronize.
    
    Note: there is kern.ipc.maxsockets sysctl, which controls UMA limit
    on the socket zone, which was always global.  Historically same
    maxsockets value is applied also to every PCB zone.  Important fact:
    you can't create a pcb without a socket!  A pcb may outlive its socket,
    however.  Given that there are multiple protocols, and only one socket
    zone, the per pcb zone limits seem to have little value.  Under very
    special conditions it may trigger a little bit earlier than socket zone
    limit, but in most setups the socket zone limit will be triggered
    earlier.  When VIMAGE was added to the kernel PCB zones became per-VNET.
    This magnified existing disbalance further: now we have multiple pcb
    zones in multiple vnets limited to maxsockets, but every pcb requires a
    socket allocated from the global zone also limited by maxsockets.
    IMHO, this per pcb zone limit doesn't bring any value, so this patch
    drops it.  If anybody explains value of this limit, it can be restored
    very easy - just 2 lines change to in_pcbstorage_init().
    
    Differential revision:  https://reviews.freebsd.org/D33542
---
 sys/netinet/in_pcb.c     | 59 ++++++++++++++++++++++++++++++++----------------
 sys/netinet/in_pcb.h     | 45 ++++++++++++++++++++++++++++++++----
 sys/netinet/ip_divert.c  | 23 ++-----------------
 sys/netinet/raw_ip.c     | 28 ++---------------------
 sys/netinet/tcp_subr.c   | 27 +++-------------------
 sys/netinet/udp_usrreq.c | 39 +++++---------------------------
 6 files changed, 93 insertions(+), 128 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 03250d5101ff..97471ae7800c 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -518,19 +518,16 @@ abort_with_hash_wlock:
 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
 
 /*
- * Initialize an inpcbinfo -- we should be able to reduce the number of
- * arguments in time.
+ * Initialize an inpcbinfo - a per-VNET instance of connections db.
  */
-static void inpcb_dtor(void *, int, void *);
-static void inpcb_fini(void *, int);
 void
-in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
-    u_int hash_nelements, int porthash_nelements, char *inpcbzone_name,
-    uma_init inpcbzone_init)
+in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
+    u_int hash_nelements, u_int porthash_nelements)
 {
 
-	mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF);
-	mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF);
+	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
+	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
+	    NULL, MTX_DEF);
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
@@ -543,16 +540,9 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
 	    &pcbinfo->ipi_porthashmask);
 	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_lbgrouphashmask);
-	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
-	    NULL, inpcb_dtor, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR,
-	    UMA_ZONE_SMR);
-	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
-	uma_zone_set_warning(pcbinfo->ipi_zone,
-	    "kern.ipc.maxsockets limit reached");
+	pcbinfo->ipi_zone = pcbstor->ips_zone;
+	pcbinfo->ipi_portzone = pcbstor->ips_portzone;
 	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
-	pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name,
-	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-	uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr);
 }
 
 /*
@@ -570,12 +560,41 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 	    pcbinfo->ipi_porthashmask);
 	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
 	    pcbinfo->ipi_lbgrouphashmask);
-	uma_zdestroy(pcbinfo->ipi_zone);
-	uma_zdestroy(pcbinfo->ipi_portzone);
 	mtx_destroy(&pcbinfo->ipi_hash_lock);
 	mtx_destroy(&pcbinfo->ipi_lock);
 }
 
+/*
+ * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
+ */
+static void inpcb_dtor(void *, int, void *);
+static void inpcb_fini(void *, int);
+void
+in_pcbstorage_init(void *arg)
+{
+	struct inpcbstorage *pcbstor = arg;
+
+	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
+	    sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit,
+	    inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
+	pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
+	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	uma_zone_set_smr(pcbstor->ips_portzone,
+	    uma_zone_get_smr(pcbstor->ips_zone));
+}
+
+/*
+ * Destroy a pcbstorage - used by unloadable protocols.
+ */
+void
+in_pcbstorage_destroy(void *arg)
+{
+	struct inpcbstorage *pcbstor = arg;
+
+	uma_zdestroy(pcbstor->ips_zone);
+	uma_zdestroy(pcbstor->ips_portzone);
+}
+
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 0e87a68e81fa..1f7efd19e868 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -375,8 +375,8 @@ void	in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
 
 #ifdef _KERNEL
 /*
- * Global data structure for each high-level protocol (UDP, TCP, ...) in both
- * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
+ * Per-VNET pcb database for each high-level protocol (UDP, TCP, ...) in both
+ * IPv4 and IPv6.
  *
  * The pcbs are protected with SMR section and thus all lists in inpcbinfo
  * are CK-lists.  Locking is required to insert a pcb into database. Two
@@ -445,6 +445,41 @@ struct inpcbinfo {
 	struct vnet		*ipi_vnet;		/* (c) */
 };
 
+/*
+ * Global allocation storage for each high-level protocol (UDP, TCP, ...).
+ * Each corresponding per-VNET inpcbinfo points into this one.
+ */
+struct inpcbstorage {
+	uma_zone_t	ips_zone;
+	uma_zone_t	ips_portzone;
+	uma_init	ips_pcbinit;
+	const char *	ips_zone_name;
+	const char *	ips_portzone_name;
+	const char *	ips_infolock_name;
+	const char *	ips_hashlock_name;
+};
+
+#define INPCBSTORAGE_DEFINE(prot, lname, zname, iname, hname)		\
+static int								\
+prot##_inpcb_init(void *mem, int size __unused, int flags __unused)	\
+{									\
+	struct inpcb *inp = mem;					\
+									\
+	rw_init_flags(&inp->inp_lock, lname, RW_RECURSE | RW_DUPOK);	\
+	return (0);							\
+}									\
+static struct inpcbstorage prot = {					\
+	.ips_pcbinit = prot##_inpcb_init,				\
+	.ips_zone_name = zname,						\
+	.ips_portzone_name = zname " ports",				\
+	.ips_infolock_name = iname,					\
+	.ips_hashlock_name = hname,					\
+};									\
+SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN,			\
+    SI_ORDER_SECOND, in_pcbstorage_init, &prot);			\
+SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN,		\
+    SI_ORDER_SECOND, in_pcbstorage_destroy, &prot)
+
 /*
  * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
  * (or unique address:port combination) can be re-used at most
@@ -688,9 +723,11 @@ VNET_DECLARE(int, ipport_tcpallocs);
 #define	V_ipport_stoprandom	VNET(ipport_stoprandom)
 #define	V_ipport_tcpallocs	VNET(ipport_tcpallocs)
 
+void	in_pcbinfo_init(struct inpcbinfo *, struct inpcbstorage *,
+	    u_int, u_int);
 void	in_pcbinfo_destroy(struct inpcbinfo *);
-void	in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *,
-	    uma_init);
+void	in_pcbstorage_init(void *);
+void	in_pcbstorage_destroy(void *);
 
 int	in_pcbbind_check_bindmulti(const struct inpcb *ni,
 	    const struct inpcb *oi);
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index 8f972f49b604..6c4d85b03e6a 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -117,8 +117,6 @@ VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
 static u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
 static u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
 
-static eventhandler_tag ip_divert_event_tag;
-
 static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m,
     struct sockaddr_in *sin);
 static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
@@ -126,21 +124,7 @@ static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
 /*
  * Initialize divert connection block queue.
  */
-static void
-div_zone_change(void *tag)
-{
-
-	uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets);
-}
-
-static int
-div_inpcb_init(void *mem, int size, int flags)
-{
-	struct inpcb *inp = mem;
-
-	INP_LOCK_INIT(inp, "inp", "divinp");
-	return (0);
-}
+INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash");
 
 static void
 div_init(void *arg __unused)
@@ -151,7 +135,7 @@ div_init(void *arg __unused)
 	 * allocate one-entry hash lists than it is to check all over the
 	 * place for hashbase == NULL.
 	 */
-	in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init);
+	in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1);
 }
 VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL);
 
@@ -794,8 +778,6 @@ div_modevent(module_t mod, int type, void *unused)
 		if (err != 0)
 			return (err);
 		ip_divert_ptr = divert_packet;
-		ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change,
-		    div_zone_change, NULL, EVENTHANDLER_PRI_ANY);
 		break;
 	case MOD_QUIESCE:
 		/*
@@ -829,7 +811,6 @@ div_modevent(module_t mod, int type, void *unused)
 #ifndef VIMAGE
 		div_destroy(NULL);
 #endif
-		EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag);
 		break;
 	default:
 		err = EOPNOTSUPP;
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index b7d338306b49..7c495745806e 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -182,37 +182,13 @@ rip_delhash(struct inpcb *inp)
 }
 #endif /* INET */
 
-/*
- * Raw interface to IP protocol.
- */
-
-/*
- * Initialize raw connection block q.
- */
-static void
-rip_zone_change(void *tag)
-{
-
-	uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets);
-}
-
-static int
-rip_inpcb_init(void *mem, int size, int flags)
-{
-	struct inpcb *inp = mem;
-
-	INP_LOCK_INIT(inp, "inp", "rawinp");
-	return (0);
-}
+INPCBSTORAGE_DEFINE(ripcbstor, "rawinp", "ripcb", "rip", "riphash");
 
 static void
 rip_init(void *arg __unused)
 {
 
-	in_pcbinfo_init(&V_ripcbinfo, "rip", INP_PCBHASH_RAW_SIZE, 1, "ripcb",
-	    rip_inpcb_init);
-	EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL,
-	    EVENTHANDLER_PRI_ANY);
+	in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1);
 }
 VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL);
 
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 3a02988e5b6d..47b6ff173afe 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1146,26 +1146,7 @@ static struct mtx isn_mtx;
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 
-/*
- * TCP initialization.
- */
-static void
-tcp_zone_change(void *tag)
-{
-
-	uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets);
-	uma_zone_set_max(V_tcpcb_zone, maxsockets);
-	tcp_tw_zone_change();
-}
-
-static int
-tcp_inpcb_init(void *mem, int size, int flags)
-{
-	struct inpcb *inp = mem;
-
-	INP_LOCK_INIT(inp, "inp", "tcpinp");
-	return (0);
-}
+INPCBSTORAGE_DEFINE(tcpcbstor, "tcpinp", "tcp_inpcb", "tcp", "tcphash");
 
 /*
  * Take a value and get the next power of 2 that doesn't overflow.
@@ -1439,8 +1420,8 @@ tcp_vnet_init(void *arg __unused)
 		printf("%s: WARNING: unable to initialise TCP stats\n",
 		    __func__);
 #endif
-	in_pcbinfo_init(&V_tcbinfo, "tcp", tcp_tcbhashsize, tcp_tcbhashsize,
-	    "tcp_inpcb", tcp_inpcb_init);
+	in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize,
+	    tcp_tcbhashsize);
 
 	/*
 	 * These have to be type stable for the benefit of the timers.
@@ -1526,8 +1507,6 @@ tcp_init(void *arg __unused)
 	ISN_LOCK_INIT();
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
-	EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL,
-		EVENTHANDLER_PRI_ANY);
 
 	tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index 582486ffbc92..ad5a2df7d4aa 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c
@@ -170,33 +170,9 @@ static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
 		    struct mbuf *, struct thread *, int);
 #endif
 
-static void
-udp_zone_change(void *tag)
-{
-
-	uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets);
-	uma_zone_set_max(V_udpcb_zone, maxsockets);
-}
-
-static int
-udp_inpcb_init(void *mem, int size, int flags)
-{
-	struct inpcb *inp;
-
-	inp = mem;
-	INP_LOCK_INIT(inp, "inp", "udpinp");
-	return (0);
-}
-
-static int
-udplite_inpcb_init(void *mem, int size, int flags)
-{
-	struct inpcb *inp;
-
-	inp = mem;
-	INP_LOCK_INIT(inp, "inp", "udpliteinp");
-	return (0);
-}
+INPCBSTORAGE_DEFINE(udpcbstor, "udpinp", "udp_inpcb", "udp", "udphash");
+INPCBSTORAGE_DEFINE(udplitecbstor, "udpliteinp", "udplite_inpcb", "udplite",
+    "udplitehash");
 
 static void
 udp_init(void *arg __unused)
@@ -209,18 +185,15 @@ udp_init(void *arg __unused)
 	 * Once we can calculate the flowid that way and re-establish
 	 * a 4-tuple, flip this to 4-tuple.
 	 */
-	in_pcbinfo_init(&V_udbinfo, "udp", UDBHASHSIZE, UDBHASHSIZE,
-	    "udp_inpcb", udp_inpcb_init);
+	in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE);
 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(V_udpcb_zone, maxsockets);
 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
-	EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL,
-	    EVENTHANDLER_PRI_ANY);
 
 	/* Additional pcbinfo for UDP-Lite */
-	in_pcbinfo_init(&V_ulitecbinfo, "udplite", UDBHASHSIZE,
-	    UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init);
+	in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE,
+	    UDBHASHSIZE);
 }
 VNET_SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL);