PERFORCE change 137663 for review
Peter Wemm <peter at FreeBSD.org>
Fri Mar 14 01:55:21 UTC 2008
http://perforce.freebsd.org/chv.cgi?CH=137663
Change 137663 by peter at peter_overcee on 2008/03/14 01:54:48
hack job at finishing the merge
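
The bulk of the merge brings the bike_sched branch up to sched_ule.c rev 1.232, where the old tdq_group balancer is replaced by a recursive search over the cpu_group topology (cpu_search(), sched_lowest(), sched_highest(), CPUMASK_FOREACH()). As a rough, stand-alone illustration of that search -- not part of the patch, and using made-up user-space types and names (struct group, cpu_load[], search_lowest()) in place of struct cpu_group and the per-CPU tdq load fields -- the idea is to total the load in each subtree, descend into the least loaded subtree, and report the least loaded CPU found along that path:

#include <stdio.h>

#define NCPU 8

/* Stand-ins for the kernel's struct cpu_group / per-CPU tdq load. */
struct group {
    int           nchildren;   /* 0 for a leaf group */
    struct group *child;       /* array of child groups */
    unsigned int  mask;        /* CPUs covered by a leaf group */
};

static int cpu_load[NCPU] = { 3, 1, 4, 0, 2, 2, 5, 1 };

/*
 * Return the total load under 'g' and record the least loaded CPU
 * found along the least loaded path, in the spirit of
 * cpu_search(..., CPU_SEARCH_LOWEST) / sched_lowest().
 */
static int
search_lowest(const struct group *g, int *bestcpu, int *bestload)
{
    int total = 0;
    int i, cpu;

    if (g->nchildren != 0) {
        int minsum = -1;

        for (i = 0; i < g->nchildren; i++) {
            int ccpu = -1, cload = -1, sum;

            sum = search_lowest(&g->child[i], &ccpu, &cload);
            total += sum;
            /* Follow the least loaded child path. */
            if (minsum == -1 || sum < minsum) {
                minsum = sum;
                *bestcpu = ccpu;
                *bestload = cload;
            }
        }
    } else {
        for (cpu = 0; cpu < NCPU; cpu++) {
            if ((g->mask & (1u << cpu)) == 0)
                continue;
            total += cpu_load[cpu];
            if (*bestcpu == -1 || cpu_load[cpu] < *bestload) {
                *bestcpu = cpu;
                *bestload = cpu_load[cpu];
            }
        }
    }
    return (total);
}

int
main(void)
{
    /* Toy topology: two 4-CPU packages under one root. */
    struct group pkg[2] = {
        { 0, NULL, 0x0f },
        { 0, NULL, 0xf0 },
    };
    struct group root = { 2, pkg, 0xff };
    int cpu = -1, load = -1;

    search_lowest(&root, &cpu, &load);
    printf("least loaded cpu on least loaded path: %d (load %d)\n",
        cpu, load);
    return (0);
}

With the sample loads above the search picks CPU 3 (load 0) inside the lighter package; in general the CPU found this way may differ from the globally least loaded CPU, which is what spreads work across caches and busses as described in the comments below.
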
Affected files ...
.. //depot/projects/bike_sched/sys/kern/sched_ule.c#20 integrate
.. //depot/projects/bike_sched/sys/powerpc/powerpc/machdep.c#6 delete
.. //depot/projects/bike_sched/sys/powerpc/powerpc/trap.c#4 delete
.. //depot/projects/bike_sched/sys/powerpc/powerpc/vm_machdep.c#4 delete
Differences ...
==== //depot/projects/bike_sched/sys/kern/sched_ule.c#20 (text+ko) ====
@@ -36,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.217 2007/11/14 06:21:23 julian Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.232 2008/03/12 10:11:59 jeff Exp $");
#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"
@@ -59,6 +59,7 @@
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <sys/vmmeter.h>
+#include <sys/cpuset.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
@@ -85,16 +86,13 @@
struct runq *ts_runq; /* Run-queue we're queued on. */
short ts_flags; /* TSF_* flags. */
u_char ts_cpu; /* CPU that we have affinity for. */
+ int ts_rltick; /* Real last tick, for affinity. */
int ts_slice; /* Ticks of slice remaining. */
u_int ts_slptime; /* Number of ticks we vol. slept */
u_int ts_runtime; /* Number of ticks we were running */
- /* The following variables are only used for pctcpu calculation */
int ts_ltick; /* Last tick that we were running on */
int ts_ftick; /* First tick that we were running on */
int ts_ticks; /* Tick count */
-#ifdef SMP
- int ts_rltick; /* Real last tick, for affinity. */
-#endif
};
/* flags kept in ts_flags */
#define TSF_BOUND 0x0001 /* Thread can not migrate. */
@@ -102,6 +100,10 @@
#define TD_TO_TS(td) ((struct td_sched *)(&(td)[1]))
+#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
+#define THREAD_CAN_SCHED(td, cpu) \
+ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
+
static struct {
struct thread initial_thread;
struct td_sched initial_sched;
@@ -175,7 +177,7 @@
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
-static int sched_slice;
+static int sched_slice = 1;
#ifdef PREEMPTION
#ifdef FULL_PREEMPTION
static int preempt_thresh = PRI_MAX_IDLE;
@@ -185,6 +187,7 @@
#else
static int preempt_thresh = 0;
#endif
+static int static_boost = 1;
/*
* tdq - per processor runqs and statistics. All fields are protected by the
@@ -192,80 +195,51 @@
* locking in sched_pickcpu();
*/
struct tdq {
- struct mtx *tdq_lock; /* Pointer to group lock. */
+ /* Ordered to improve efficiency of cpu_search() and switch(). */
+ struct mtx tdq_lock; /* run queue lock. */
+ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */
+ int tdq_load; /* Aggregate load. */
+ int tdq_sysload; /* For loadavg, !ITHD load. */
+ int tdq_transferable; /* Transferable thread count. */
+ u_char tdq_lowpri; /* Lowest priority thread. */
+ u_char tdq_ipipending; /* IPI pending. */
+ u_char tdq_idx; /* Current insert index. */
+ u_char tdq_ridx; /* Current removal index. */
struct runq tdq_realtime; /* real-time run queue. */
struct runq tdq_timeshare; /* timeshare run queue. */
struct runq tdq_idle; /* Queue of IDLE threads. */
- int tdq_load; /* Aggregate load. */
- u_char tdq_idx; /* Current insert index. */
- u_char tdq_ridx; /* Current removal index. */
-#ifdef SMP
- u_char tdq_lowpri; /* Lowest priority thread. */
- int tdq_transferable; /* Transferable thread count. */
- LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */
- struct tdq_group *tdq_group; /* Our processor group. */
-#else
- int tdq_sysload; /* For loadavg, !ITHD load. */
-#endif
+ char tdq_name[sizeof("sched lock") + 6];
} __aligned(64);
#ifdef SMP
-/*
- * tdq groups are groups of processors which can cheaply share threads. When
- * one processor in the group goes idle it will check the runqs of the other
- * processors in its group prior to halting and waiting for an interrupt.
- * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA.
- * In a numa environment we'd want an idle bitmap per group and a two tiered
- * load balancer.
- */
-struct tdq_group {
- struct mtx tdg_lock; /* Protects all fields below. */
- int tdg_cpus; /* Count of CPUs in this tdq group. */
- cpumask_t tdg_cpumask; /* Mask of cpus in this group. */
- cpumask_t tdg_idlemask; /* Idle cpus in this group. */
- cpumask_t tdg_mask; /* Bit mask for first cpu. */
- int tdg_load; /* Total load of this group. */
- int tdg_transferable; /* Transferable load of this group. */
- LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */
- char tdg_name[16]; /* lock name. */
-} __aligned(64);
+struct cpu_group *cpu_top;
-#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300))
-#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity)
+#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000))
+#define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity))
/*
* Run-time tunables.
*/
static int rebalance = 1;
static int balance_interval = 128; /* Default set in sched_initticks(). */
-static int pick_pri = 1;
static int affinity;
-static int tryself = 1;
static int steal_htt = 1;
static int steal_idle = 1;
static int steal_thresh = 2;
-static int topology = 0;
/*
* One thread queue per processor.
*/
-static volatile cpumask_t tdq_idle;
-static int tdg_maxid;
static struct tdq tdq_cpu[MAXCPU];
-static struct tdq_group tdq_groups[MAXCPU];
static struct tdq *balance_tdq;
-static int balance_group_ticks;
static int balance_ticks;
#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
#define TDQ_CPU(x) (&tdq_cpu[(x)])
#define TDQ_ID(x) ((int)((x) - tdq_cpu))
-#define TDQ_GROUP(x) (&tdq_groups[(x)])
-#define TDG_ID(x) ((int)((x) - tdq_groups))
#else /* !SMP */
static struct tdq tdq_cpu;
-static struct mtx tdq_lock;
#define TDQ_ID(x) (0)
#define TDQ_SELF() (&tdq_cpu)
@@ -276,7 +250,7 @@
#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t)))
#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t)))
-#define TDQ_LOCKPTR(t) ((t)->tdq_lock)
+#define TDQ_LOCKPTR(t) (&(t)->tdq_lock)
static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
@@ -292,26 +266,23 @@
static void tdq_load_rem(struct tdq *, struct thread *);
static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
static __inline void tdq_runq_rem(struct tdq *, struct thread *);
+static inline int sched_shouldpreempt(int, int, int);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
static void tdq_add(struct tdq *, struct thread *, int);
#ifdef SMP
-static void tdq_move(struct tdq *, struct tdq *);
+static int tdq_move(struct tdq *, struct tdq *);
static int tdq_idled(struct tdq *);
-static void tdq_notify(struct thread *);
-static struct thread *tdq_steal(struct tdq *);
+static void tdq_notify(struct tdq *, struct thread *);
+static struct thread *tdq_steal(struct tdq *, int);
static struct thread *runq_steal(struct runq *);
static int sched_pickcpu(struct thread *, int);
static void sched_balance(void);
-static void sched_balance_groups(void);
-static void sched_balance_group(struct tdq_group *);
-static void sched_balance_pair(struct tdq *, struct tdq *);
+static int sched_balance_pair(struct tdq *, struct tdq *);
static inline struct tdq *sched_setcpu(struct thread *, int, int);
static inline struct mtx *thread_block_switch(struct thread *);
static inline void thread_unblock_switch(struct thread *, struct mtx *);
static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
-
-#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
#endif
static void sched_setup(void *dummy);
@@ -358,7 +329,8 @@
tdq = TDQ_CPU(cpu);
printf("tdq %d:\n", TDQ_ID(tdq));
- printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tlock %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tLock name: %s\n", tdq->tdq_name);
printf("\tload: %d\n", tdq->tdq_load);
printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
@@ -368,12 +340,41 @@
runq_print(&tdq->tdq_timeshare);
printf("\tidle runq:\n");
runq_print(&tdq->tdq_idle);
-#ifdef SMP
printf("\tload transferable: %d\n", tdq->tdq_transferable);
printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
- printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group));
- printf("\tLock name: %s\n", tdq->tdq_group->tdg_name);
-#endif
+}
+
+static inline int
+sched_shouldpreempt(int pri, int cpri, int remote)
+{
+ /*
+ * If the new priority is not better than the current priority there is
+ * nothing to do.
+ */
+ if (pri >= cpri)
+ return (0);
+ /*
+ * Always preempt idle.
+ */
+ if (cpri >= PRI_MIN_IDLE)
+ return (1);
+ /*
+ * If preemption is disabled don't preempt others.
+ */
+ if (preempt_thresh == 0)
+ return (0);
+ /*
+ * Preempt if we exceed the threshold.
+ */
+ if (pri <= preempt_thresh)
+ return (1);
+ /*
+ * If we're realtime or better and there is timeshare or worse running
+ * preempt only remote processors.
+ */
+ if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
+ return (1);
+ return (0);
}
#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
@@ -385,20 +386,22 @@
static __inline void
tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
{
+ u_char pri;
struct td_sched *ts = TD_TO_TS(td);
+
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
-#ifdef SMP
+
+ TD_SET_RUNQ(ts->ts_thread);
if (THREAD_CAN_MIGRATE(td)) {
tdq->tdq_transferable++;
- tdq->tdq_group->tdg_transferable++;
ts->ts_flags |= TSF_XFERABLE;
}
-#endif
- if (ts->ts_runq == &tdq->tdq_timeshare) {
- u_char pri;
-
- pri = td->td_priority;
+ pri = td->td_priority;
+ if (pri <= PRI_MAX_REALTIME) {
+ ts->ts_runq = &tdq->tdq_realtime;
+ } else if (pri <= PRI_MAX_TIMESHARE) {
+ ts->ts_runq = &tdq->tdq_timeshare;
KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
("Invalid priority %d on timeshare runq", pri));
/*
@@ -419,8 +422,10 @@
} else
pri = tdq->tdq_ridx;
runq_add_pri(ts->ts_runq, ts, pri, flags);
+ return;
} else
- runq_add(ts->ts_runq, ts, flags);
+ ts->ts_runq = &tdq->tdq_idle;
+ runq_add(ts->ts_runq, ts, flags);
}
/*
@@ -435,25 +440,15 @@
TDQ_LOCK_ASSERT(tdq, MA_OWNED);
KASSERT(ts->ts_runq != NULL,
("tdq_runq_remove: thread %p null ts_runq", td));
-#ifdef SMP
if (ts->ts_flags & TSF_XFERABLE) {
tdq->tdq_transferable--;
- tdq->tdq_group->tdg_transferable--;
ts->ts_flags &= ~TSF_XFERABLE;
}
-#endif
if (ts->ts_runq == &tdq->tdq_timeshare) {
if (tdq->tdq_idx != tdq->tdq_ridx)
runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
else
runq_remove_idx(ts->ts_runq, td, NULL);
- /*
- * For timeshare threads we update the priority here so
- * the priority reflects the time we've been sleeping.
- */
- ts->ts_ltick = ticks;
- sched_pctcpu_update(td);
- sched_priority(td);
} else
runq_remove(ts->ts_runq, ts);
}
@@ -474,11 +469,7 @@
CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
if (class != PRI_ITHD &&
(td->td_proc->p_flag & P_NOLOAD) == 0)
-#ifdef SMP
- tdq->tdq_group->tdg_load++;
-#else
tdq->tdq_sysload++;
-#endif
}
/*
@@ -495,11 +486,7 @@
class = PRI_BASE(td->td_pri_class);
if (class != PRI_ITHD &&
(td->td_proc->p_flag & P_NOLOAD) == 0)
-#ifdef SMP
- tdq->tdq_group->tdg_load--;
-#else
tdq->tdq_sysload--;
-#endif
KASSERT(tdq->tdq_load != 0,
("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
tdq->tdq_load--;
@@ -507,112 +494,282 @@
TD_TO_TS(td)->ts_runq = NULL;
}
+/*
+ * Set lowpri to its exact value by searching the run-queue and
+ * evaluating curthread. curthread may be passed as an optimization.
+ */
+static void
+tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
+{
+ struct td_sched *ts;
+ struct thread *td;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ if (ctd == NULL)
+ ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
+ ts = tdq_choose(tdq);
+ if (ts)
+ td = ts->ts_thread;
+ if (ts == NULL || td->td_priority > ctd->td_priority)
+ tdq->tdq_lowpri = ctd->td_priority;
+ else
+ tdq->tdq_lowpri = td->td_priority;
+}
+
#ifdef SMP
+struct cpu_search {
+ cpumask_t cs_mask; /* Mask of valid cpus. */
+ u_int cs_load;
+ u_int cs_cpu;
+ int cs_limit; /* Min priority for low, min load for high. */
+};
+
+#define CPU_SEARCH_LOWEST 0x1
+#define CPU_SEARCH_HIGHEST 0x2
+#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
+
+#define CPUMASK_FOREACH(cpu, mask) \
+ for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++) \
+ if ((mask) & 1 << (cpu))
+
+__inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high, const int match);
+int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high);
+
+/*
+ * This routine compares according to the match argument and should be
+ * reduced in actual instantiations via constant propagation and dead code
+ * elimination.
+ */
+static __inline int
+cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
+ const int match)
+{
+ struct tdq *tdq;
+
+ tdq = TDQ_CPU(cpu);
+ if (match & CPU_SEARCH_LOWEST)
+ if (low->cs_mask & (1 << cpu) &&
+ tdq->tdq_load < low->cs_load &&
+ tdq->tdq_lowpri > low->cs_limit) {
+ low->cs_cpu = cpu;
+ low->cs_load = tdq->tdq_load;
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if (high->cs_mask & (1 << cpu) &&
+ tdq->tdq_load >= high->cs_limit &&
+ tdq->tdq_load > high->cs_load &&
+ tdq->tdq_transferable) {
+ high->cs_cpu = cpu;
+ high->cs_load = tdq->tdq_load;
+ }
+ return (tdq->tdq_load);
+}
+
/*
- * sched_balance is a simple CPU load balancing algorithm. It operates by
- * finding the least loaded and most loaded cpu and equalizing their load
- * by migrating some processes.
+ * Search the tree of cpu_groups for the lowest or highest loaded cpu
+ * according to the match argument. This routine actually compares the
+ * load on all paths through the tree and finds the least loaded cpu on
+ * the least loaded path, which may differ from the least loaded cpu in
+ * the system. This balances work among caches and busses.
*
- * Dealing only with two CPUs at a time has two advantages. Firstly, most
- * installations will only have 2 cpus. Secondly, load balancing too much at
- * once can have an unpleasant effect on the system. The scheduler rarely has
- * enough information to make perfect decisions. So this algorithm chooses
- * simplicity and more gradual effects on load in larger systems.
- *
+ * This inline is instantiated in three forms below using constants for the
+ * match argument. It is reduced to the minimum set for each case. It is
+ * also recursive to the depth of the tree.
+ */
+static inline int
+cpu_search(struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high, const int match)
+{
+ int total;
+
+ total = 0;
+ if (cg->cg_children) {
+ struct cpu_search lgroup;
+ struct cpu_search hgroup;
+ struct cpu_group *child;
+ u_int lload;
+ int hload;
+ int load;
+ int i;
+
+ lload = -1;
+ hload = -1;
+ for (i = 0; i < cg->cg_children; i++) {
+ child = &cg->cg_child[i];
+ if (match & CPU_SEARCH_LOWEST) {
+ lgroup = *low;
+ lgroup.cs_load = -1;
+ }
+ if (match & CPU_SEARCH_HIGHEST) {
+ hgroup = *high;
+ lgroup.cs_load = 0;
+ }
+ switch (match) {
+ case CPU_SEARCH_LOWEST:
+ load = cpu_search_lowest(child, &lgroup);
+ break;
+ case CPU_SEARCH_HIGHEST:
+ load = cpu_search_highest(child, &hgroup);
+ break;
+ case CPU_SEARCH_BOTH:
+ load = cpu_search_both(child, &lgroup, &hgroup);
+ break;
+ }
+ total += load;
+ if (match & CPU_SEARCH_LOWEST)
+ if (load < lload || low->cs_cpu == -1) {
+ *low = lgroup;
+ lload = load;
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if (load > hload || high->cs_cpu == -1) {
+ hload = load;
+ *high = hgroup;
+ }
+ }
+ } else {
+ int cpu;
+
+ CPUMASK_FOREACH(cpu, cg->cg_mask)
+ total += cpu_compare(cpu, low, high, match);
+ }
+ return (total);
+}
+
+/*
+ * cpu_search instantiations must pass constants to maintain the inline
+ * optimization.
+ */
+int
+cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
+{
+ return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
+}
+
+int
+cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
+{
+ return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
+}
+
+int
+cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high)
+{
+ return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
+}
+
+/*
+ * Find the cpu with the least load via the least loaded path that has a
+ * lowpri greater than pri. A pri of -1 indicates any priority is
+ * acceptable.
+ */
+static inline int
+sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri)
+{
+ struct cpu_search low;
+
+ low.cs_cpu = -1;
+ low.cs_load = -1;
+ low.cs_mask = mask;
+ low.cs_limit = pri;
+ cpu_search_lowest(cg, &low);
+ return low.cs_cpu;
+}
+
+/*
+ * Find the cpu with the highest load via the highest loaded path.
+ */
+static inline int
+sched_highest(struct cpu_group *cg, cpumask_t mask, int minload)
+{
+ struct cpu_search high;
+
+ high.cs_cpu = -1;
+ high.cs_load = 0;
+ high.cs_mask = mask;
+ high.cs_limit = minload;
+ cpu_search_highest(cg, &high);
+ return high.cs_cpu;
+}
+
+/*
+ * Simultaneously find the highest and lowest loaded cpu reachable via
+ * cg.
*/
+static inline void
+sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu)
+{
+ struct cpu_search high;
+ struct cpu_search low;
+
+ low.cs_cpu = -1;
+ low.cs_limit = -1;
+ low.cs_load = -1;
+ low.cs_mask = mask;
+ high.cs_load = 0;
+ high.cs_cpu = -1;
+ high.cs_limit = -1;
+ high.cs_mask = mask;
+ cpu_search_both(cg, &low, &high);
+ *lowcpu = low.cs_cpu;
+ *highcpu = high.cs_cpu;
+ return;
+}
+
static void
-sched_balance()
+sched_balance_group(struct cpu_group *cg)
{
- struct tdq_group *high;
- struct tdq_group *low;
- struct tdq_group *tdg;
- struct tdq *tdq;
- int cnt;
+ cpumask_t mask;
+ int high;
+ int low;
int i;
- /*
- * Select a random time between .5 * balance_interval and
- * 1.5 * balance_interval.
- */
- balance_ticks = max(balance_interval / 2, 1);
- balance_ticks += random() % balance_interval;
- if (smp_started == 0 || rebalance == 0)
- return;
- tdq = TDQ_SELF();
- TDQ_UNLOCK(tdq);
- low = high = NULL;
- i = random() % (tdg_maxid + 1);
- for (cnt = 0; cnt <= tdg_maxid; cnt++) {
- tdg = TDQ_GROUP(i);
+ mask = -1;
+ for (;;) {
+ sched_both(cg, mask, &low, &high);
+ if (low == high || low == -1 || high == -1)
+ break;
+ if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
+ break;
/*
- * Find the CPU with the highest load that has some
- * threads to transfer.
- */
- if ((high == NULL || tdg->tdg_load > high->tdg_load)
- && tdg->tdg_transferable)
- high = tdg;
- if (low == NULL || tdg->tdg_load < low->tdg_load)
- low = tdg;
- if (++i > tdg_maxid)
- i = 0;
+ * If we failed to move any threads determine which cpu
+ * to kick out of the set and try again.
+ */
+ if (TDQ_CPU(high)->tdq_transferable == 0)
+ mask &= ~(1 << high);
+ else
+ mask &= ~(1 << low);
}
- if (low != NULL && high != NULL && high != low)
- sched_balance_pair(LIST_FIRST(&high->tdg_members),
- LIST_FIRST(&low->tdg_members));
- TDQ_LOCK(tdq);
+
+ for (i = 0; i < cg->cg_children; i++)
+ sched_balance_group(&cg->cg_child[i]);
}
-/*
- * Balance load between CPUs in a group. Will only migrate within the group.
- */
static void
-sched_balance_groups()
+sched_balance()
{
struct tdq *tdq;
- int i;
/*
* Select a random time between .5 * balance_interval and
* 1.5 * balance_interval.
*/
- balance_group_ticks = max(balance_interval / 2, 1);
- balance_group_ticks += random() % balance_interval;
+ balance_ticks = max(balance_interval / 2, 1);
+ balance_ticks += random() % balance_interval;
if (smp_started == 0 || rebalance == 0)
return;
tdq = TDQ_SELF();
TDQ_UNLOCK(tdq);
- for (i = 0; i <= tdg_maxid; i++)
- sched_balance_group(TDQ_GROUP(i));
+ sched_balance_group(cpu_top);
TDQ_LOCK(tdq);
}
/*
- * Finds the greatest imbalance between two tdqs in a group.
- */
-static void
-sched_balance_group(struct tdq_group *tdg)
-{
- struct tdq *tdq;
- struct tdq *high;
- struct tdq *low;
- int load;
-
- if (tdg->tdg_transferable == 0)
- return;
- low = NULL;
- high = NULL;
- LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
- load = tdq->tdq_load;
- if (high == NULL || load > high->tdq_load)
- high = tdq;
- if (low == NULL || load < low->tdq_load)
- low = tdq;
- }
- if (high != NULL && low != NULL && high != low)
- sched_balance_pair(high, low);
-}
-
-/*
* Lock two thread queues using their address to maintain lock order.
*/
static void
@@ -640,31 +797,22 @@
/*
* Transfer load between two imbalanced thread queues.
*/
-static void
+static int
sched_balance_pair(struct tdq *high, struct tdq *low)
{
int transferable;
int high_load;
int low_load;
+ int moved;
int move;
int diff;
int i;
tdq_lock_pair(high, low);
- /*
- * If we're transfering within a group we have to use this specific
- * tdq's transferable count, otherwise we can steal from other members
- * of the group.
- */
- if (high->tdq_group == low->tdq_group) {
- transferable = high->tdq_transferable;
- high_load = high->tdq_load;
- low_load = low->tdq_load;
- } else {
- transferable = high->tdq_group->tdg_transferable;
- high_load = high->tdq_group->tdg_load;
- low_load = low->tdq_group->tdg_load;
- }
+ transferable = high->tdq_transferable;
+ high_load = high->tdq_load;
+ low_load = low->tdq_load;
+ moved = 0;
/*
* Determine what the imbalance is and then adjust that to how many
* threads we actually have to give up (transferable).
@@ -676,7 +824,7 @@
move++;
move = min(move, transferable);
for (i = 0; i < move; i++)
- tdq_move(high, low);
+ moved += tdq_move(high, low);
/*
* IPI the target cpu to force it to reschedule with the new
* workload.
@@ -684,13 +832,13 @@
ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
}
tdq_unlock_pair(high, low);
- return;
+ return (moved);
}
/*
* Move a thread from one thread queue to another.
*/
-static void
+static int
tdq_move(struct tdq *from, struct tdq *to)
{
struct thread *td;
@@ -703,22 +851,9 @@
tdq = from;
cpu = TDQ_ID(to);
- td = tdq_steal(tdq);
- if (td == NULL) {
- struct tdq_group *tdg;
-
- tdg = tdq->tdq_group;
- LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
- if (tdq == from || tdq->tdq_transferable == 0)
- continue;
- td = tdq_steal(tdq);
- break;
- }
- if (td == NULL)
- return;
- }
- if (tdq == to)
- return;
+ td = tdq_steal(tdq, cpu);
+ if (td == NULL)
+ return (0);
/*
* Although the run queue is locked the thread may be blocked. Lock
* it to clear this and acquire the run-queue lock.
@@ -730,6 +865,7 @@
TD_TO_TS(ts)->ts_cpu = cpu;
td->td_lock = TDQ_LOCKPTR(to);
tdq_add(to, td, SRQ_YIELDING);
+ return (1);
}
/*
@@ -739,116 +875,72 @@
static int
tdq_idled(struct tdq *tdq)
{
- struct tdq_group *tdg;
+ struct cpu_group *cg;
struct tdq *steal;
- int highload;
- int highcpu;
+ cpumask_t mask;
+ int thresh;
int cpu;
if (smp_started == 0 || steal_idle == 0)
return (1);
- /* We don't want to be preempted while we're iterating over tdqs */
+ mask = -1;
+ mask &= ~PCPU_GET(cpumask);
+ /* We don't want to be preempted while we're iterating. */
spinlock_enter();
- tdg = tdq->tdq_group;
- /*
- * If we're in a cpu group, try and steal threads from another cpu in
- * the group before idling. In a HTT group all cpus share the same
- * run-queue lock, however, we still need a recursive lock to
- * call tdq_move().
- */
- if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
- TDQ_LOCK(tdq);
- LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
- if (steal == tdq || steal->tdq_transferable == 0)
- continue;
- TDQ_LOCK(steal);
- goto steal;
+ for (cg = tdq->tdq_cg; cg != NULL; ) {
+ if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0)
+ thresh = steal_thresh;
+ else
+ thresh = 1;
+ cpu = sched_highest(cg, mask, thresh);
+ if (cpu == -1) {
+ cg = cg->cg_parent;
+ continue;
+ }
+ steal = TDQ_CPU(cpu);
+ mask &= ~(1 << cpu);
+ tdq_lock_pair(tdq, steal);
+ if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
+ tdq_unlock_pair(tdq, steal);
+ continue;
}
- TDQ_UNLOCK(tdq);
- }
- /*
- * Find the least loaded CPU with a transferable thread and attempt
- * to steal it. We make a lockless pass and then verify that the
- * thread is still available after locking.
- */
- for (;;) {
- highcpu = 0;
- highload = 0;
- for (cpu = 0; cpu <= mp_maxid; cpu++) {
- if (CPU_ABSENT(cpu))
- continue;
- steal = TDQ_CPU(cpu);
- if (steal->tdq_transferable == 0)
- continue;
- if (steal->tdq_load < highload)
- continue;
- highload = steal->tdq_load;
- highcpu = cpu;
+ /*
+ * If a thread was added while interrupts were disabled don't
+ * steal one here. If we fail to acquire one due to affinity
+ * restrictions loop again with this cpu removed from the
+ * set.
+ */
+ if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
+ tdq_unlock_pair(tdq, steal);
+ continue;
}
- if (highload < steal_thresh)
- break;
- steal = TDQ_CPU(highcpu);
- if (steal == tdq)
- break;
- tdq_lock_pair(tdq, steal);
- if (steal->tdq_load >= steal_thresh && steal->tdq_transferable)
- goto steal;
- tdq_unlock_pair(tdq, steal);
+ spinlock_exit();
+ TDQ_UNLOCK(steal);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(curthread);
+
+ return (0);
}
spinlock_exit();
return (1);
-steal:
- spinlock_exit();
- tdq_move(steal, tdq);
- TDQ_UNLOCK(steal);
- mi_switch(SW_VOL, NULL);
- thread_unlock(curthread);
-
- return (0);
}
/*
* Notify a remote cpu of new work. Sends an IPI if criteria are met.
*/
static void
-tdq_notify(struct thread *td)
+tdq_notify(struct tdq *tdq, struct thread *td)
{
- struct thread *ctd;
- struct pcpu *pcpu;
int cpri;
int pri;
int cpu;
- cpu = TD_TO_TS(ts)->ts_cpu;
- pri = td->td_priority;
- pcpu = pcpu_find(cpu);
- ctd = pcpu->pc_curthread;
- cpri = ctd->td_priority;
-
- /*
- * If our priority is not better than the current priority there is
- * nothing to do.
- */
- if (pri > cpri)
+ if (tdq->tdq_ipipending)
return;
- /*
- * Always IPI idle.
- */
- if (cpri > PRI_MIN_IDLE)
- goto sendipi;
- /*
- * If we're realtime or better and there is timeshare or worse running
- * send an IPI.
- */
- if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
- goto sendipi;
- /*
- * Otherwise only IPI if we exceed the threshold.
- */
- if (pri > preempt_thresh)
+ cpri = pcpu_find(cpu)->pc_curthread->td_priority;
+ if (!sched_shouldpreempt(pri, cpri, 1))
return;
-sendipi:
- ctd->td_flags |= TDF_NEEDRESCHED;
+ tdq->tdq_ipipending = 1;
ipi_selected(1 << cpu, IPI_PREEMPT);
}
@@ -857,7 +949,7 @@
* index.
*/
static struct thread *
-runq_steal_from(struct runq *rq, u_char start)
+runq_steal_from(struct runq *rq, int cpu, u_char start)
{
struct thread *td;
struct rqbits *rqb;
@@ -886,7 +978,8 @@
pri += (i << RQB_L2BPW);
rqh = &rq->rq_queues[pri];
TAILQ_FOREACH(td, rqh, td_procq) {
- if (first && THREAD_CAN_MIGRATE(td))
+ if (first && THREAD_CAN_MIGRATE(td) &&
+ THREAD_CAN_SCHED(td, cpu))
return (td);
first = 1;
}
@@ -903,7 +996,7 @@
* Steals load from a standard linear queue.
>>> TRUNCATED FOR MAIL (1000 lines) <<<