git: a72ee355646c - main - ktls: Defer creation of threads and zones until first use.

From: John Baldwin <jhb@FreeBSD.org>
Date: Thu, 14 Oct 2021 23:14:23 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=a72ee355646c4a379f55ac32d7d8dfe6c0d366f9

commit a72ee355646c4a379f55ac32d7d8dfe6c0d366f9
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2021-10-14 22:48:34 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2021-10-14 22:48:34 +0000

    ktls: Defer creation of threads and zones until first use.
    
    Run ktls_init() when the first KTLS session is created rather than
    unconditionally during boot.  This avoids creating unused threads and
    allocating unused resources on systems which do not use KTLS.
    
    Reviewed by:    gallatin, markj
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D32487
---
 sys/kern/uipc_ktls.c | 173 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 115 insertions(+), 58 deletions(-)
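
The heart of the change is the new ktls_start_kthreads() helper visible in
the diff below: a lock-free fast path loads ktls_init_state with acquire
semantics, and the slow path serializes on an sx lock, re-checks the state,
runs ktls_init() once, and publishes the outcome with a release store (1 on
success, -1 to latch a failure so that later callers fail fast with ENXIO).
The following is a minimal userspace sketch of the same double-checked
pattern, using C11 atomics and a pthread mutex in place of the kernel's
atomic_load_acq_int()/atomic_store_rel_int() and sx(9) primitives; all names
here (lazy_state, lazy_start, expensive_init) are hypothetical:

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>

static atomic_int lazy_state;	/* 0 = untried, 1 = ready, -1 = failed */
static pthread_mutex_t lazy_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for ktls_init(): create threads, allocate zones, etc. */
static int
expensive_init(void)
{
	return (0);
}

static int
lazy_start(void)
{
	int error;

	/* Fast path: no lock is taken once a result has been published. */
	switch (atomic_load_explicit(&lazy_state, memory_order_acquire)) {
	case 1:
		return (0);
	case -1:
		return (ENXIO);	/* a previous attempt failed; stay failed */
	}

	pthread_mutex_lock(&lazy_lock);
	/* Re-check under the lock: another thread may have beaten us here. */
	if (atomic_load_explicit(&lazy_state, memory_order_relaxed) != 0) {
		pthread_mutex_unlock(&lazy_lock);
		return (atomic_load_explicit(&lazy_state,
		    memory_order_acquire) > 0 ? 0 : ENXIO);
	}
	error = expensive_init();
	/* Release store publishes init's side effects to fast-path readers. */
	atomic_store_explicit(&lazy_state, error == 0 ? 1 : -1,
	    memory_order_release);
	pthread_mutex_unlock(&lazy_lock);
	return (error);
}

As in the kernel version, a failed first attempt is remembered, so
initialization is tried at most once rather than on every new session.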

diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 1e778dbf113a..bde9fca97f50 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -109,6 +109,9 @@ static struct proc *ktls_proc;
 static uma_zone_t ktls_session_zone;
 static uma_zone_t ktls_buffer_zone;
 static uint16_t ktls_cpuid_lookup[MAXCPU];
+static int ktls_init_state;
+static struct sx ktls_init_lock;
+SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init");
 
 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload");
@@ -379,12 +382,11 @@ ktls_free_mext_contig(struct mbuf *m)
 	uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0]));
 }
 
-static void
-ktls_init(void *dummy __unused)
+static int
+ktls_init(void)
 {
 	struct thread *td;
 	struct pcpu *pc;
-	cpuset_t mask;
 	int count, domain, error, i;
 
 	ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
@@ -410,36 +412,40 @@ ktls_init(void *dummy __unused)
 		STAILQ_INIT(&ktls_wq[i].m_head);
 		STAILQ_INIT(&ktls_wq[i].so_head);
 		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
-		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
-		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
-		if (error)
-			panic("Can't add KTLS thread %d error %d", i, error);
-
-		/*
-		 * Bind threads to cores.  If ktls_bind_threads is >
-		 * 1, then we bind to the NUMA domain.
-		 */
-		if (ktls_bind_threads) {
-			if (ktls_bind_threads > 1) {
-				pc = pcpu_find(i);
-				domain = pc->pc_domain;
-				CPU_COPY(&cpuset_domain[domain], &mask);
-				count = ktls_domains[domain].count;
-				ktls_domains[domain].cpu[count] = i;
-				ktls_domains[domain].count++;
-			} else {
-				CPU_SETOF(i, &mask);
-			}
-			error = cpuset_setthread(td->td_tid, &mask);
-			if (error)
-				panic(
-			    "Unable to bind KTLS thread for CPU %d error %d",
-				     i, error);
+		if (ktls_bind_threads > 1) {
+			pc = pcpu_find(i);
+			domain = pc->pc_domain;
+			count = ktls_domains[domain].count;
+			ktls_domains[domain].cpu[count] = i;
+			ktls_domains[domain].count++;
 		}
 		ktls_cpuid_lookup[ktls_number_threads] = i;
 		ktls_number_threads++;
 	}
 
+	/*
+	 * If we somehow have an empty domain, fall back to choosing
+	 * among all KTLS threads.
+	 */
+	if (ktls_bind_threads > 1) {
+		for (i = 0; i < vm_ndomains; i++) {
+			if (ktls_domains[i].count == 0) {
+				ktls_bind_threads = 1;
+				break;
+			}
+		}
+	}
+
+	/* Start kthreads for each workqueue. */
+	CPU_FOREACH(i) {
+		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
+		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
+		if (error) {
+			printf("Can't add KTLS thread %d error %d\n", i, error);
+			return (error);
+		}
+	}
+
 	/*
 	 * Start an allocation thread per-domain to perform blocking allocations
 	 * of 16k physically contiguous TLS crypto destination buffers.
@@ -454,35 +460,46 @@ ktls_init(void *dummy __unused)
 			    &ktls_domains[domain], &ktls_proc,
 			    &ktls_domains[domain].alloc_td.td,
 			    0, 0, "KTLS", "alloc_%d", domain);
-			if (error)
-				panic("Can't add KTLS alloc thread %d error %d",
+			if (error) {
+				printf("Can't add KTLS alloc thread %d error %d\n",
 				    domain, error);
-			CPU_COPY(&cpuset_domain[domain], &mask);
-			error = cpuset_setthread(ktls_domains[domain].alloc_td.td->td_tid,
-			    &mask);
-			if (error)
-				panic("Unable to bind KTLS alloc %d error %d",
-				    domain, error);
-		}
-	}
-
-	/*
-	 * If we somehow have an empty domain, fall back to choosing
-	 * among all KTLS threads.
-	 */
-	if (ktls_bind_threads > 1) {
-		for (i = 0; i < vm_ndomains; i++) {
-			if (ktls_domains[i].count == 0) {
-				ktls_bind_threads = 1;
-				break;
+				return (error);
 			}
 		}
 	}
 
 	if (bootverbose)
 		printf("KTLS: Initialized %d threads\n", ktls_number_threads);
+	return (0);
+}
+
+static int
+ktls_start_kthreads(void)
+{
+	int error, state;
+
+start:
+	state = atomic_load_acq_int(&ktls_init_state);
+	if (__predict_true(state > 0))
+		return (0);
+	if (state < 0)
+		return (ENXIO);
+
+	sx_xlock(&ktls_init_lock);
+	if (ktls_init_state != 0) {
+		sx_xunlock(&ktls_init_lock);
+		goto start;
+	}
+
+	error = ktls_init();
+	if (error == 0)
+		state = 1;
+	else
+		state = -1;
+	atomic_store_rel_int(&ktls_init_state, state);
+	sx_xunlock(&ktls_init_lock);
+	return (error);
 }
-SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL);
 
 #if defined(INET) || defined(INET6)
 static int
@@ -583,6 +600,10 @@ ktls_create_session(struct socket *so, struct tls_enable *en,
 		return (EINVAL);
 	}
 
+	error = ktls_start_kthreads();
+	if (error != 0)
+		return (error);
+
 	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
@@ -2457,6 +2478,18 @@ ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top)
 	CURVNET_RESTORE();
 }
 
+static int
+ktls_bind_domain(int domain)
+{
+	int error;
+
+	error = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]);
+	if (error != 0)
+		return (error);
+	curthread->td_domain.dr_policy = DOMAINSET_PREF(domain);
+	return (0);
+}
+
 static void
 ktls_alloc_thread(void *ctx)
 {
@@ -2465,14 +2498,16 @@ ktls_alloc_thread(void *ctx)
 	void **buf;
 	struct sysctl_oid *oid;
 	char name[80];
-	int i, nbufs;
+	int domain, error, i, nbufs;
 
-	curthread->td_domain.dr_policy =
-	    DOMAINSET_PREF(PCPU_GET(domain));
-	snprintf(name, sizeof(name), "domain%d", PCPU_GET(domain));
+	domain = ktls_domain - ktls_domains;
 	if (bootverbose)
-		printf("Starting KTLS alloc thread for domain %d\n",
-		    PCPU_GET(domain));
+		printf("Starting KTLS alloc thread for domain %d\n", domain);
+	error = ktls_bind_domain(domain);
+	if (error)
+		printf("Unable to bind KTLS alloc thread for domain %d: error %d\n",
+		    domain, error);
+	snprintf(name, sizeof(name), "domain%d", domain);
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO,
 	    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs",
@@ -2527,10 +2562,32 @@ ktls_work_thread(void *ctx)
 	struct socket *so, *son;
 	STAILQ_HEAD(, mbuf) local_m_head;
 	STAILQ_HEAD(, socket) local_so_head;
+	int cpu;
 
-	if (ktls_bind_threads > 1) {
-		curthread->td_domain.dr_policy =
-			DOMAINSET_PREF(PCPU_GET(domain));
+	cpu = wq - ktls_wq;
+	if (bootverbose)
+		printf("Starting KTLS worker thread for CPU %d\n", cpu);
+
+	/*
+	 * Bind to a core.  If ktls_bind_threads is > 1, then
+	 * we bind to the NUMA domain instead.
+	 */
+	if (ktls_bind_threads) {
+		int error;
+
+		if (ktls_bind_threads > 1) {
+			struct pcpu *pc = pcpu_find(cpu);
+
+			error = ktls_bind_domain(pc->pc_domain);
+		} else {
+			cpuset_t mask;
+
+			CPU_SETOF(cpu, &mask);
+			error = cpuset_setthread(curthread->td_tid, &mask);
+		}
+		if (error)
+			printf("Unable to bind KTLS worker thread for CPU %d: error %d\n",
+				cpu, error);
 	}
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 	fpu_kern_thread(0);
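
One consequence of deferring thread creation is visible in the last two
hunks: instead of ktls_init() binding each newly created thread from the
outside, every worker and alloc thread now binds itself once it starts,
either to its own CPU (CPU_SETOF() plus cpuset_setthread()) or, when
ktls_bind_threads > 1, to its NUMA domain through the new ktls_bind_domain()
helper, which also sets the thread's memory-allocation policy with
DOMAINSET_PREF().  Binding failures are now a printf() warning rather than a
panic().  Below is a minimal userspace analogue of a thread pinning itself,
assuming FreeBSD's cpuset_setaffinity(2); the helper name and error handling
are illustrative:

#include <sys/param.h>
#include <sys/cpuset.h>

#include <errno.h>

/*
 * The calling thread pins itself to a single CPU.  An id of -1 with
 * CPU_WHICH_TID means "the calling thread"; see cpuset_setaffinity(2).
 */
static int
bind_self_to_cpu(int cpu)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(mask), &mask) != 0)
		return (errno);
	return (0);
}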