git: f0278de38052 - stable/13 - Refactor/optimize cpu_search_*().

Alexander Motin mav at FreeBSD.org
Sat Aug 28 02:21:29 UTC 2021


The branch stable/13 has been updated by mav:

URL: https://cgit.FreeBSD.org/src/commit/?id=f0278de380520b897e6936171425af035caa9289

commit f0278de380520b897e6936171425af035caa9289
Author:     Alexander Motin <mav at FreeBSD.org>
AuthorDate: 2021-07-29 01:18:50 +0000
Commit:     Alexander Motin <mav at FreeBSD.org>
CommitDate: 2021-08-28 02:17:55 +0000

    Refactor/optimize cpu_search_*().
    
    Remove cpu_search_both(), unused for many years.  Without it there is
    little reason left for the trick of compiling the common cpu_search()
    into separate cpu_search_lowest() and cpu_search_highest(), so split
    them completely, making the code more readable.  While there, also
    split the iteration over child groups from the iteration over CPUs;
    merging them complicated the code for very little deduplication.
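
    As a stand-alone illustration of the removed "trick" (plain C written
    for this note; search(), search_lowest() and search_highest() are
    invented names, not the kernel functions): one always-inline worker
    takes a compile-time constant flag and thin wrappers instantiate it,
    letting the compiler strip the unused branch from each copy.  Once
    only two instantiations remain, two plain functions are simpler:

        /* Illustration only -- not sys/kern/sched_ule.c code. */
        static inline int
        search(const int *load, int n, const int lowest)
        {
                int i, best;

                best = load[0];
                for (i = 1; i < n; i++) {
                        if (lowest ? load[i] < best : load[i] > best)
                                best = load[i];
                }
                return (best);
        }

        static int
        search_lowest(const int *load, int n)
        {
                return (search(load, n, 1));
        }

        static int
        search_highest(const int *load, int n)
        {
                return (search(load, n, 0));
        }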
    
    Stop passing cpuset_t arguments by value and avoid some manipulations
    on them.  Since the MAXCPU bump from 64 to 256, what used to fit in a
    single register has become a 32-byte memory array, requiring memory
    allocation and accesses.  Splitting struct cpu_search into parameter
    and result parts reduces stack usage even further, since the parameter
    part can be passed down unchanged through the recursion.
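
    A minimal stand-alone illustration of the cost being avoided (ordinary
    userland C; NCPUS and cpuset_like_t are invented stand-ins for the
    kernel's MAXCPU and cpuset_t): with 256 CPUs the mask is a 32-byte
    array, so passing it by value copies 32 bytes at every recursion
    level, while a pointer is a single register-sized argument:

        #include <stdio.h>

        #define NCPUS   256             /* mirrors the MAXCPU bump */

        /* One bit per CPU, like cpuset_t. */
        typedef struct {
                unsigned long bits[NCPUS / (sizeof(unsigned long) * 8)];
        } cpuset_like_t;

        int
        main(void)
        {
                printf("by value:   %zu bytes per call\n",
                    sizeof(cpuset_like_t));
                printf("by pointer: %zu bytes per call\n",
                    sizeof(cpuset_like_t *));
                return (0);
        }

    The parameter/result split in the diff below serves the same goal: the
    read-only struct cpu_search is shared across the recursion, and only
    the small struct cpu_search_res is written at each level.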
    
    Remove CPU_FFS() from the hot paths by precalculating the first and
    last CPU of each CPU group during initialization.  Again, this was not
    a problem with 64 CPUs, but with 256 CPUs FFS needs much more code.
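
    A hedged userland sketch of that precalculation (grp_sketch and its
    gs_* fields are illustrative names, not the kernel's struct cpu_group
    with cg_first/cg_last): the lowest and highest set bits of each
    group's mask are computed once at setup time, so a hot loop can walk
    a fixed [first, last] range instead of calling a find-first-set
    routine on every pass:

        #include <strings.h>    /* ffs() */

        struct grp_sketch {
                unsigned int gs_mask;   /* stand-in for cpuset_t */
                int gs_first;           /* lowest set bit, -1 if empty */
                int gs_last;            /* highest set bit, -1 if empty */
        };

        static void
        grp_sketch_fill(struct grp_sketch *g)
        {
                int c;

                g->gs_first = ffs((int)g->gs_mask) - 1;
                g->gs_last = -1;
                for (c = 0; c < 32; c++)
                        if (g->gs_mask & (1u << c))
                                g->gs_last = c;
        }

    The committed change does the real work in smp_topo_fill() in
    sys/kern/subr_smp.c (see the diff below), recursing over child groups
    and storing the bounds in the new cg_first/cg_last fields.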
    
    With these changes, on an 80-thread system doing ~260K uncached ZFS
    reads per second, I observe a ~30% reduction in the time spent in
    cpu_search_*().
    
    MFC after:      1 month
    
    (cherry picked from commit aefe0a8c32d370f2fdd0d0771eb59f8845beda17)
---
 sys/kern/sched_ule.c | 290 +++++++++++++++++++++------------------------------
 sys/kern/subr_smp.c  |  12 +++
 sys/sys/smp.h        |   2 +
 3 files changed, 134 insertions(+), 170 deletions(-)

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 49bf3328f3d9..6e9ef73efe99 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -631,170 +631,120 @@ sched_random(void)
 }
 
 struct cpu_search {
-	cpuset_t cs_mask;
+	cpuset_t *cs_mask;
 	u_int	cs_prefer;
 	int	cs_pri;		/* Min priority for low. */
 	int	cs_limit;	/* Max load for low, min load for high. */
+};
+
+struct cpu_search_res {
 	int	cs_cpu;
 	int	cs_load;
 };
 
-#define	CPU_SEARCH_LOWEST	0x1
-#define	CPU_SEARCH_HIGHEST	0x2
-#define	CPU_SEARCH_BOTH		(CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
-
-static __always_inline int cpu_search(const struct cpu_group *cg,
-    struct cpu_search *low, struct cpu_search *high, const int match);
-int __noinline cpu_search_lowest(const struct cpu_group *cg,
-    struct cpu_search *low);
-int __noinline cpu_search_highest(const struct cpu_group *cg,
-    struct cpu_search *high);
-int __noinline cpu_search_both(const struct cpu_group *cg,
-    struct cpu_search *low, struct cpu_search *high);
-
-/*
- * Search the tree of cpu_groups for the lowest or highest loaded cpu
- * according to the match argument.  This routine actually compares the
- * load on all paths through the tree and finds the least loaded cpu on
- * the least loaded path, which may differ from the least loaded cpu in
- * the system.  This balances work among caches and buses.
- *
- * This inline is instantiated in three forms below using constants for the
- * match argument.  It is reduced to the minimum set for each case.  It is
- * also recursive to the depth of the tree.
- */
-static __always_inline int
-cpu_search(const struct cpu_group *cg, struct cpu_search *low,
-    struct cpu_search *high, const int match)
-{
-	struct cpu_search lgroup;
-	struct cpu_search hgroup;
-	cpuset_t cpumask;
-	struct cpu_group *child;
+/*
+ * Search the tree of cpu_groups for the lowest or highest loaded CPU.
+ * These routines actually compare the load on all paths through the tree
+ * and find the least loaded cpu on the least loaded path, which may differ
+ * from the least loaded cpu in the system.  This balances work among caches
+ * and buses.
+ */
+static int
+cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
+    struct cpu_search_res *r)
+{
+	struct cpu_search_res lr;
 	struct tdq *tdq;
-	int cpu, i, hload, lload, load, total, rnd;
+	int c, bload, l, load, total;
 
 	total = 0;
-	cpumask = cg->cg_mask;
-	if (match & CPU_SEARCH_LOWEST) {
-		lload = INT_MAX;
-		lgroup = *low;
-	}
-	if (match & CPU_SEARCH_HIGHEST) {
-		hload = INT_MIN;
-		hgroup = *high;
-	}
+	bload = INT_MAX;
+	r->cs_cpu = -1;
 
-	/* Iterate through the child CPU groups and then remaining CPUs. */
-	for (i = cg->cg_children, cpu = mp_maxid; ; ) {
-		if (i == 0) {
-#ifdef HAVE_INLINE_FFSL
-			cpu = CPU_FFS(&cpumask) - 1;
-#else
-			while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
-				cpu--;
-#endif
-			if (cpu < 0)
-				break;
-			child = NULL;
-		} else
-			child = &cg->cg_child[i - 1];
-
-		if (match & CPU_SEARCH_LOWEST)
-			lgroup.cs_cpu = -1;
-		if (match & CPU_SEARCH_HIGHEST)
-			hgroup.cs_cpu = -1;
-		if (child) {			/* Handle child CPU group. */
-			CPU_ANDNOT(&cpumask, &child->cg_mask);
-			switch (match) {
-			case CPU_SEARCH_LOWEST:
-				load = cpu_search_lowest(child, &lgroup);
-				break;
-			case CPU_SEARCH_HIGHEST:
-				load = cpu_search_highest(child, &hgroup);
-				break;
-			case CPU_SEARCH_BOTH:
-				load = cpu_search_both(child, &lgroup, &hgroup);
-				break;
-			}
-		} else {			/* Handle child CPU. */
-			CPU_CLR(cpu, &cpumask);
-			tdq = TDQ_CPU(cpu);
-			load = tdq->tdq_load * 256;
-			rnd = sched_random() % 32;
-			if (match & CPU_SEARCH_LOWEST) {
-				if (cpu == low->cs_prefer)
-					load -= 64;
-				/* If that CPU is allowed and get data. */
-				if (tdq->tdq_lowpri > lgroup.cs_pri &&
-				    tdq->tdq_load <= lgroup.cs_limit &&
-				    CPU_ISSET(cpu, &lgroup.cs_mask)) {
-					lgroup.cs_cpu = cpu;
-					lgroup.cs_load = load - rnd;
-				}
+	/* Loop through children CPU groups if there are any. */
+	if (cg->cg_children > 0) {
+		for (c = cg->cg_children - 1; c >= 0; c--) {
+			load = cpu_search_lowest(&cg->cg_child[c], s, &lr);
+			total += load;
+			if (lr.cs_cpu >= 0 && (load < bload ||
+			    (load == bload && lr.cs_load < r->cs_load))) {
+				bload = load;
+				r->cs_cpu = lr.cs_cpu;
+				r->cs_load = lr.cs_load;
 			}
-			if (match & CPU_SEARCH_HIGHEST)
-				if (tdq->tdq_load >= hgroup.cs_limit &&
-				    tdq->tdq_transferable &&
-				    CPU_ISSET(cpu, &hgroup.cs_mask)) {
-					hgroup.cs_cpu = cpu;
-					hgroup.cs_load = load - rnd;
-				}
 		}
-		total += load;
+		return (total);
+	}
 
-		/* We have info about child item. Compare it. */
-		if (match & CPU_SEARCH_LOWEST) {
-			if (lgroup.cs_cpu >= 0 &&
-			    (load < lload ||
-			     (load == lload && lgroup.cs_load < low->cs_load))) {
-				lload = load;
-				low->cs_cpu = lgroup.cs_cpu;
-				low->cs_load = lgroup.cs_load;
-			}
-		}
-		if (match & CPU_SEARCH_HIGHEST)
-			if (hgroup.cs_cpu >= 0 &&
-			    (load > hload ||
-			     (load == hload && hgroup.cs_load > high->cs_load))) {
-				hload = load;
-				high->cs_cpu = hgroup.cs_cpu;
-				high->cs_load = hgroup.cs_load;
-			}
-		if (child) {
-			i--;
-			if (i == 0 && CPU_EMPTY(&cpumask))
-				break;
+	/* Loop through children CPUs otherwise. */
+	for (c = cg->cg_last; c >= cg->cg_first; c--) {
+		if (!CPU_ISSET(c, &cg->cg_mask))
+			continue;
+		tdq = TDQ_CPU(c);
+		l = tdq->tdq_load;
+		load = l * 256;
+		if (c == s->cs_prefer)
+			load -= 128;
+		total += load;
+		if (l > s->cs_limit || tdq->tdq_lowpri <= s->cs_pri ||
+		    !CPU_ISSET(c, s->cs_mask))
+			continue;
+		load -= sched_random() % 128;
+		if (load < bload) {
+			bload = load;
+			r->cs_cpu = c;
 		}
-#ifndef HAVE_INLINE_FFSL
-		else
-			cpu--;
-#endif
 	}
+	r->cs_load = bload;
 	return (total);
 }
 
-/*
- * cpu_search instantiations must pass constants to maintain the inline
- * optimization.
- */
-int
-cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
+static int
+cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
+    struct cpu_search_res *r)
 {
-	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
-}
+	struct cpu_search_res lr;
+	struct tdq *tdq;
+	int c, bload, l, load, total;
 
-int
-cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
-{
-	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
-}
+	total = 0;
+	bload = INT_MIN;
+	r->cs_cpu = -1;
 
-int
-cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
-    struct cpu_search *high)
-{
-	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
+	/* Loop through children CPU groups if there are any. */
+	if (cg->cg_children > 0) {
+		for (c = cg->cg_children - 1; c >= 0; c--) {
+			load = cpu_search_highest(&cg->cg_child[c], s, &lr);
+			total += load;
+			if (lr.cs_cpu >= 0 && (load > bload ||
+			    (load == bload && lr.cs_load > r->cs_load))) {
+				bload = load;
+				r->cs_cpu = lr.cs_cpu;
+				r->cs_load = lr.cs_load;
+			}
+		}
+		return (total);
+	}
+
+	/* Loop through children CPUs otherwise. */
+	for (c = cg->cg_last; c >= cg->cg_first; c--) {
+		if (!CPU_ISSET(c, &cg->cg_mask))
+			continue;
+		tdq = TDQ_CPU(c);
+		l = tdq->tdq_load;
+		load = l * 256;
+		total += load;
+		if (l < s->cs_limit || !tdq->tdq_transferable ||
+		    !CPU_ISSET(c, s->cs_mask))
+			continue;
+		load -= sched_random() % 128;
+		if (load > bload) {
+			bload = load;
+			r->cs_cpu = c;
+		}
+	}
+	r->cs_load = bload;
+	return (total);
 }
 
 /*
@@ -803,33 +753,33 @@ cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
  * acceptable.
  */
 static inline int
-sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload,
     int prefer)
 {
-	struct cpu_search low;
+	struct cpu_search s;
+	struct cpu_search_res r;
 
-	low.cs_cpu = -1;
-	low.cs_prefer = prefer;
-	low.cs_mask = mask;
-	low.cs_pri = pri;
-	low.cs_limit = maxload;
-	cpu_search_lowest(cg, &low);
-	return low.cs_cpu;
+	s.cs_prefer = prefer;
+	s.cs_mask = mask;
+	s.cs_pri = pri;
+	s.cs_limit = maxload;
+	cpu_search_lowest(cg, &s, &r);
+	return (r.cs_cpu);
 }
 
 /*
  * Find the cpu with the highest load via the highest loaded path.
  */
 static inline int
-sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
+sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload)
 {
-	struct cpu_search high;
+	struct cpu_search s;
+	struct cpu_search_res r;
 
-	high.cs_cpu = -1;
-	high.cs_mask = mask;
-	high.cs_limit = minload;
-	cpu_search_highest(cg, &high);
-	return high.cs_cpu;
+	s.cs_mask = mask;
+	s.cs_limit = minload;
+	cpu_search_highest(cg, &s, &r);
+	return (r.cs_cpu);
 }
 
 static void
@@ -841,7 +791,7 @@ sched_balance_group(struct cpu_group *cg)
 
 	CPU_FILL(&hmask);
 	for (;;) {
-		high = sched_highest(cg, hmask, 2);
+		high = sched_highest(cg, &hmask, 2);
 		/* Stop if there is no more CPU with transferrable threads. */
 		if (high == -1)
 			break;
@@ -853,7 +803,7 @@ sched_balance_group(struct cpu_group *cg)
 		anylow = 1;
 		tdq = TDQ_CPU(high);
 nextlow:
-		low = sched_lowest(cg, lmask, -1, tdq->tdq_load - 1, high);
+		low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high);
 		/* Stop if we looked well and found no less loaded CPU. */
 		if (anylow && low == -1)
 			break;
@@ -995,7 +945,7 @@ tdq_idled(struct tdq *tdq)
     restart:
 	switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 	for (cg = tdq->tdq_cg; ; ) {
-		cpu = sched_highest(cg, mask, steal_thresh);
+		cpu = sched_highest(cg, &mask, steal_thresh);
 		/*
 		 * We were assigned a thread but not preempted.  Returning
 		 * 0 here will cause our caller to switch to it.
@@ -1244,7 +1194,7 @@ sched_pickcpu(struct thread *td, int flags)
 	struct cpu_group *cg, *ccg;
 	struct td_sched *ts;
 	struct tdq *tdq;
-	cpuset_t mask;
+	cpuset_t *mask;
 	int cpu, pri, self, intr;
 
 	self = PCPU_GET(cpuid);
@@ -1287,14 +1237,14 @@ sched_pickcpu(struct thread *td, int flags)
 	    SCHED_AFFINITY(ts, CG_SHARE_L2)) {
 		if (cg->cg_flags & CG_FLAG_THREAD) {
 			/* Check all SMT threads for being idle. */
-			for (cpu = CPU_FFS(&cg->cg_mask) - 1; ; cpu++) {
+			for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) {
 				if (CPU_ISSET(cpu, &cg->cg_mask) &&
 				    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
 					break;
-				if (cpu >= mp_maxid) {
-					SCHED_STAT_INC(pickcpu_idle_affinity);
-					return (ts->ts_cpu);
-				}
+			}
+			if (cpu > cg->cg_last) {
+				SCHED_STAT_INC(pickcpu_idle_affinity);
+				return (ts->ts_cpu);
 			}
 		} else {
 			SCHED_STAT_INC(pickcpu_idle_affinity);
@@ -1321,7 +1271,7 @@ llc:
 	if (ccg == cpu_top)
 		ccg = NULL;
 	cpu = -1;
-	mask = td->td_cpuset->cs_mask;
+	mask = &td->td_cpuset->cs_mask;
 	pri = td->td_priority;
 	/*
 	 * Try hard to keep interrupts within found LLC.  Search the LLC for
@@ -1931,7 +1881,7 @@ tdq_trysteal(struct tdq *tdq)
 	spinlock_enter();
 	TDQ_UNLOCK(tdq);
 	for (i = 1, cg = tdq->tdq_cg; ; ) {
-		cpu = sched_highest(cg, mask, steal_thresh);
+		cpu = sched_highest(cg, &mask, steal_thresh);
 		/*
 		 * If a thread was added while interrupts were disabled don't
 		 * steal one here.
@@ -3002,7 +2952,7 @@ sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
 	sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
 	    cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
 	first = TRUE;
-	for (i = 0; i < MAXCPU; i++) {
+	for (i = cg->cg_first; i <= cg->cg_last; i++) {
 		if (CPU_ISSET(i, &cg->cg_mask)) {
 			if (!first)
 				sbuf_printf(sb, ", ");
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 935fb6ee977c..bfe890d773f9 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -630,6 +630,17 @@ smp_rendezvous(void (* setup_func)(void *),
 
 static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 
+static void
+smp_topo_fill(struct cpu_group *cg)
+{
+	int c;
+
+	for (c = 0; c < cg->cg_children; c++)
+		smp_topo_fill(&cg->cg_child[c]);
+	cg->cg_first = CPU_FFS(&cg->cg_mask) - 1;
+	cg->cg_last = CPU_FLS(&cg->cg_mask) - 1;
+}
+
 struct cpu_group *
 smp_topo(void)
 {
@@ -693,6 +704,7 @@ smp_topo(void)
 		top = &top->cg_child[0];
 		top->cg_parent = NULL;
 	}
+	smp_topo_fill(top);
 	return (top);
 }
 
diff --git a/sys/sys/smp.h b/sys/sys/smp.h
index a971ffbbd91b..cee1199015a7 100644
--- a/sys/sys/smp.h
+++ b/sys/sys/smp.h
@@ -81,6 +81,8 @@ struct cpu_group {
 	struct cpu_group *cg_child;	/* Optional children groups. */
 	cpuset_t	cg_mask;	/* Mask of cpus in this group. */
 	int32_t		cg_count;	/* Count of cpus in this group. */
+	int32_t		cg_first;	/* First cpu in this group. */
+	int32_t		cg_last;	/* Last cpu in this group. */
 	int16_t		cg_children;	/* Number of children groups. */
 	int8_t		cg_level;	/* Shared cache level. */
 	int8_t		cg_flags;	/* Traversal modifiers. */

