git: 4808bab7fa6c - stable/13 - sched_ule(4): Improve long-term load balancer.

From: Alexander Motin <mav_at_FreeBSD.org>
Date: Thu, 21 Oct 2021 22:31:33 UTC
The branch stable/13 has been updated by mav:

URL: https://cgit.FreeBSD.org/src/commit/?id=4808bab7fa6c3ec49b49476b8326d7a0250a03fa

commit 4808bab7fa6c3ec49b49476b8326d7a0250a03fa
Author:     Alexander Motin <mav@FreeBSD.org>
AuthorDate: 2021-09-21 22:14:22 +0000
Commit:     Alexander Motin <mav@FreeBSD.org>
CommitDate: 2021-10-21 22:24:35 +0000

    sched_ule(4): Improve long-term load balancer.
    
    Before this change long-term load balancer was unable to migrate
    running threads, only ones waiting on run queues.  But with growing
    number of CPU cores it is quite typical now for system to not have
    many waiting threads.  But same time if due to some coincidence two
    long-running CPU-bound threads ended up sharing same physical CPU
    core, they could suffer from the SMT penalty indefinitely, and the
    load balancer couldn't help.
    
    Improve that by teaching the load balancer to hint running threads
    to migrate by marking them with TDF_NEEDRESCHED and new TDF_PICKCPU
    flag, making sched_pickcpu() to search for better CPU later, when
    it is convenient.
    
    Fix CPU search logic when balancing to limit round-robin migrations
    in case of almost equal load to the group of physical cores.  The
    previous code bounced threads across all the system, that should be
    pretty bad for caches and NUMA affinity, while additional fairness
    was almost invisible, diminishing with number of cores in the group.
    
    MFC after:      1 month
    
    (cherry picked from commit e745d729be60a47b49eb19c02a6864a747fb2744)
---
 sys/kern/sched_ule.c | 119 ++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 93 insertions(+), 26 deletions(-)

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 53d5a59a3605..e7de4a50bb93 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -196,6 +196,7 @@ _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
 #define	SCHED_SLICE_MIN_DIVISOR		6	/* DEFAULT/MIN = ~16 ms. */
 
 /* Flags kept in td_flags. */
+#define	TDF_PICKCPU	TDF_SCHED0	/* Thread should pick new CPU. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
 /*
@@ -633,15 +634,16 @@ sched_random(void)
 }
 
 struct cpu_search {
-	cpuset_t *cs_mask;
-	u_int	cs_prefer;
+	cpuset_t *cs_mask;	/* The mask of allowed CPUs to choose from. */
+	int	cs_prefer;	/* Prefer this CPU and groups including it. */
+	int	cs_running;	/* The thread is now running at cs_prefer. */
 	int	cs_pri;		/* Min priority for low. */
 	int	cs_limit;	/* Max load for low, min load for high. */
 };
 
 struct cpu_search_res {
-	int	cs_cpu;
-	int	cs_load;
+	int	cs_cpu;		/* The best CPU found. */
+	int	cs_load;	/* The load of cs_cpu. */
 };
 
 /*
@@ -657,7 +659,7 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
 {
 	struct cpu_search_res lr;
 	struct tdq *tdq;
-	int c, bload, l, load, total;
+	int c, bload, l, load, p, total;
 
 	total = 0;
 	bload = INT_MAX;
@@ -668,6 +670,17 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
 		for (c = cg->cg_children - 1; c >= 0; c--) {
 			load = cpu_search_lowest(&cg->cg_child[c], s, &lr);
 			total += load;
+
+			/*
+			 * When balancing do not prefer SMT groups with load >1.
+			 * It allows round-robin between SMT groups with equal
+			 * load within parent group for more fair scheduling.
+			 */
+			if (__predict_false(s->cs_running) &&
+			    (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) &&
+			    load >= 128 && (load & 128) != 0)
+				load += 128;
+
 			if (lr.cs_cpu >= 0 && (load < bload ||
 			    (load == bload && lr.cs_load < r->cs_load))) {
 				bload = load;
@@ -684,20 +697,40 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
 			continue;
 		tdq = TDQ_CPU(c);
 		l = tdq->tdq_load;
+		if (c == s->cs_prefer) {
+			if (__predict_false(s->cs_running))
+				l--;
+			p = 128;
+		} else
+			p = 0;
 		load = l * 256;
-		if (c == s->cs_prefer)
-			load -= 128;
-		total += load;
-		if (l > s->cs_limit || tdq->tdq_lowpri <= s->cs_pri ||
+		total += load - p;
+
+		/*
+		 * Check this CPU is acceptable.
+		 * If the threads is already on the CPU, don't look on the TDQ
+		 * priority, since it can be the priority of the thread itself.
+		 */
+		if (l > s->cs_limit || (tdq->tdq_lowpri <= s->cs_pri &&
+		     (!s->cs_running || c != s->cs_prefer)) ||
 		    !CPU_ISSET(c, s->cs_mask))
 			continue;
+
+		/*
+		 * When balancing do not prefer CPUs with load > 1.
+		 * It allows round-robin between CPUs with equal load
+		 * within the CPU group for more fair scheduling.
+		 */
+		if (__predict_false(s->cs_running) && l > 0)
+			p = 0;
+
 		load -= sched_random() % 128;
-		if (load < bload) {
-			bload = load;
+		if (bload > load - p) {
+			bload = load - p;
 			r->cs_cpu = c;
+			r->cs_load = load;
 		}
 	}
-	r->cs_load = bload;
 	return (total);
 }
 
@@ -736,9 +769,17 @@ cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
 		l = tdq->tdq_load;
 		load = l * 256;
 		total += load;
-		if (l < s->cs_limit || !tdq->tdq_transferable ||
+
+		/*
+		 * Check this CPU is acceptable.
+		 * If requested minimum load is 1, then caller must know how
+		 * to handle running threads, not counted in tdq_transferable.
+		 */
+		if (l < s->cs_limit || (tdq->tdq_transferable == 0 &&
+		    (s->cs_limit > 1 || l > 1)) ||
 		    !CPU_ISSET(c, s->cs_mask))
 			continue;
+
 		load -= sched_random() % 256;
 		if (load > bload) {
 			bload = load;
@@ -756,12 +797,13 @@ cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
  */
 static inline int
 sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload,
-    int prefer)
+    int prefer, int running)
 {
 	struct cpu_search s;
 	struct cpu_search_res r;
 
 	s.cs_prefer = prefer;
+	s.cs_running = running;
 	s.cs_mask = mask;
 	s.cs_pri = pri;
 	s.cs_limit = maxload;
@@ -788,12 +830,13 @@ static void
 sched_balance_group(struct cpu_group *cg)
 {
 	struct tdq *tdq;
+	struct thread *td;
 	cpuset_t hmask, lmask;
 	int high, low, anylow;
 
 	CPU_FILL(&hmask);
 	for (;;) {
-		high = sched_highest(cg, &hmask, 2);
+		high = sched_highest(cg, &hmask, 1);
 		/* Stop if there is no more CPU with transferrable threads. */
 		if (high == -1)
 			break;
@@ -802,10 +845,28 @@ sched_balance_group(struct cpu_group *cg)
 		/* Stop if there is no more CPU left for low. */
 		if (CPU_EMPTY(&lmask))
 			break;
-		anylow = 1;
 		tdq = TDQ_CPU(high);
+		if (tdq->tdq_load == 1) {
+			/*
+			 * There is only one running thread.  We can't move
+			 * it from here, so tell it to pick new CPU by itself.
+			 */
+			TDQ_LOCK(tdq);
+			td = pcpu_find(high)->pc_curthread;
+			if ((td->td_flags & TDF_IDLETD) == 0 &&
+			    THREAD_CAN_MIGRATE(td)) {
+				td->td_flags |= TDF_NEEDRESCHED | TDF_PICKCPU;
+				if (high != curcpu)
+					ipi_cpu(high, IPI_AST);
+			}
+			TDQ_UNLOCK(tdq);
+			break;
+		}
+		anylow = 1;
 nextlow:
-		low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high);
+		if (tdq->tdq_transferable == 0)
+			continue;
+		low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1);
 		/* Stop if we looked well and found no less loaded CPU. */
 		if (anylow && low == -1)
 			break;
@@ -1227,7 +1288,7 @@ sched_pickcpu(struct thread *td, int flags)
 	struct td_sched *ts;
 	struct tdq *tdq;
 	cpuset_t *mask;
-	int cpu, pri, self, intr;
+	int cpu, pri, r, self, intr;
 
 	self = PCPU_GET(cpuid);
 	ts = td_get_sched(td);
@@ -1305,32 +1366,33 @@ llc:
 	cpu = -1;
 	mask = &td->td_cpuset->cs_mask;
 	pri = td->td_priority;
+	r = TD_IS_RUNNING(td);
 	/*
 	 * Try hard to keep interrupts within found LLC.  Search the LLC for
 	 * the least loaded CPU we can run now.  For NUMA systems it should
 	 * be within target domain, and it also reduces scheduling overhead.
 	 */
 	if (ccg != NULL && intr) {
-		cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu);
+		cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_intrbind);
 	} else
 	/* Search the LLC for the least loaded idle CPU we can run now. */
 	if (ccg != NULL) {
 		cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE),
-		    INT_MAX, ts->ts_cpu);
+		    INT_MAX, ts->ts_cpu, r);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_affinity);
 	}
 	/* Search globally for the least loaded CPU we can run now. */
 	if (cpu < 0) {
-		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
+		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_lowest);
 	}
 	/* Search globally for the least loaded CPU. */
 	if (cpu < 0) {
-		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
+		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_lowest);
 	}
@@ -2056,7 +2118,7 @@ sched_switch(struct thread *td, int flags)
 	struct td_sched *ts;
 	struct mtx *mtx;
 	int srqflag;
-	int cpuid, preempted;
+	int cpuid, pickcpu, preempted;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
@@ -2064,11 +2126,15 @@ sched_switch(struct thread *td, int flags)
 	tdq = TDQ_SELF();
 	ts = td_get_sched(td);
 	sched_pctcpu_update(ts, 1);
-	ts->ts_rltick = ticks;
+	pickcpu = (td->td_flags & TDF_PICKCPU) != 0;
+	if (pickcpu)
+		ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS;
+	else
+		ts->ts_rltick = ticks;
 	td->td_lastcpu = td->td_oncpu;
 	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
 	    (flags & SW_PREEMPT) != 0;
-	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
+	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_PICKCPU | TDF_SLICEEND);
 	td->td_owepreempt = 0;
 	tdq->tdq_owepreempt = 0;
 	if (!TD_IS_IDLETHREAD(td))
@@ -2088,7 +2154,8 @@ sched_switch(struct thread *td, int flags)
 		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 		    SRQ_OURSELF|SRQ_YIELDING;
 #ifdef SMP
-		if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
+		if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu)
+		    || pickcpu))
 			ts->ts_cpu = sched_pickcpu(td, 0);
 #endif
 		if (ts->ts_cpu == cpuid)