PERFORCE change 123983 for review
Ulf Lilleengen
lulf at FreeBSD.org
Mon Jul 23 23:02:17 UTC 2007
http://perforce.freebsd.org/chv.cgi?CH=123983
Change 123983 by lulf at lulf_carrot on 2007/07/23 23:01:14
	IFC (integrate from CURRENT)
Affected files ...
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/conf/NOTES#11 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/genassym.c#3 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/swtch.s#3 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_mutex.c#6 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_resource.c#7 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_4bsd.c#4 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_ule.c#6 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/sys/mutex.h#3 integrate
Differences ...
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/conf/NOTES#11 (text+ko) ====
@@ -1,4 +1,4 @@
-# $FreeBSD: src/sys/conf/NOTES,v 1.1444 2007/07/14 21:49:23 rwatson Exp $
+# $FreeBSD: src/sys/conf/NOTES,v 1.1445 2007/07/18 02:51:21 jeff Exp $
#
# NOTES -- Lines that can be cut/pasted into kernel and hints configs.
#
@@ -176,10 +176,11 @@
# queue and no CPU affinity which makes it suboptimal for SMP. It has very
# good interactivity and priority selection.
#
-# SCHED_ULE is a new scheduler that has been designed for SMP and has some
-# advantages for UP as well. It is intended to replace the 4BSD scheduler
-# over time. NOTE: SCHED_ULE is currently considered experimental and is
-# not recommended for production use at this time.
+# SCHED_ULE provides significant performance advantages over 4BSD on many
+# workloads on SMP machines. It supports cpu-affinity, per-cpu runqueues
+# and scheduler locks. It also has a stronger notion of interactivity
+# which leads to better responsiveness even on uniprocessor machines. This
+# will eventually become the default scheduler.
#
options SCHED_4BSD
#options SCHED_ULE
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/genassym.c#3 (text+ko) ====
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.157 2007/06/06 07:35:07 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.158 2007/07/17 22:34:14 jeff Exp $");
#include "opt_apic.h"
#include "opt_compat.h"
@@ -81,6 +81,7 @@
ASSYM(P_SFLAG, offsetof(struct proc, p_sflag));
ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
ASSYM(TD_PROC, offsetof(struct thread, td_proc));
ASSYM(TD_MD, offsetof(struct thread, td_md));
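The genassym change only exports the byte offset of td_lock so the assembly in swtch.s can reach the field: each ASSYM() entry becomes a constant in the generated assym.s that swtch.s includes, and TD_LOCK(reg) is then a plain displacement off a thread pointer. A rough userland C illustration of what that constant means; struct thread_sketch, TD_LOCK_OFFSET and read_td_lock are invented names for illustration only, not kernel code.

#include <stddef.h>

struct mtx;
struct thread_sketch {                  /* stand-in for struct thread */
        int     td_flags;
        struct mtx *td_lock;
        /* ... */
};

/* ASSYM(TD_LOCK, offsetof(struct thread, td_lock)) boils down to a
 * constant like this in the generated assym.s. */
#define TD_LOCK_OFFSET  offsetof(struct thread_sketch, td_lock)

/* TD_LOCK(%ecx) in swtch.s means "TD_LOCK_OFFSET(%ecx)"; the C
 * equivalent of that offset-based access is: */
static struct mtx *
read_td_lock(void *td)
{
        return (*(struct mtx **)((char *)td + TD_LOCK_OFFSET));
}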
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/swtch.s#3 (text+ko) ====
@@ -29,15 +29,32 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.154 2007/06/06 07:35:07 davidxu Exp $
+ * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.155 2007/07/17 22:34:14 jeff Exp $
*/
#include "opt_npx.h"
+#include "opt_sched.h"
#include <machine/asmacros.h>
#include "assym.s"
+#if defined(SMP) && defined(SCHED_ULE)
+#define SETOP xchgl
+#define BLOCK_SPIN(reg) \
+ movl $blocked_lock,%eax ; \
+ 100: ; \
+ lock ; \
+ cmpxchgl %eax,TD_LOCK(reg) ; \
+ jne 101f ; \
+ pause ; \
+ jmp 100b ; \
+ 101:
+#else
+#define SETOP movl
+#define BLOCK_SPIN(reg)
+#endif
+
/*****************************************************************************/
/* Scheduling */
/*****************************************************************************/
@@ -91,6 +108,7 @@
* 0(%esp) = ret
* 4(%esp) = oldtd
* 8(%esp) = newtd
+ * 12(%esp) = newlock
*/
ENTRY(cpu_switch)
@@ -145,13 +163,14 @@
#endif
/* Save is done. Now fire up new thread. Leave old vmspace. */
+ movl 4(%esp),%edi
movl 8(%esp),%ecx /* New thread */
+ movl 12(%esp),%esi /* New lock */
#ifdef INVARIANTS
testl %ecx,%ecx /* no thread? */
jz badsw3 /* no, panic */
#endif
movl TD_PCB(%ecx),%edx
- movl PCPU(CPUID), %esi
/* switch address space */
movl PCB_CR3(%edx),%eax
@@ -160,11 +179,14 @@
#else
cmpl %eax,IdlePTD /* Kernel address space? */
#endif
- je sw1
+ je sw0
movl %cr3,%ebx /* The same address space? */
cmpl %ebx,%eax
- je sw1
+ je sw0
movl %eax,%cr3 /* new address space */
+ movl %esi,%eax
+ movl PCPU(CPUID),%esi
+ SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */
/* Release bit from old pmap->pm_active */
movl PCPU(CURPMAP), %ebx
@@ -182,8 +204,12 @@
lock
#endif
btsl %esi, PM_ACTIVE(%ebx) /* set new */
+ jmp sw1
+sw0:
+ SETOP %esi,TD_LOCK(%edi) /* Switchout td_lock */
sw1:
+ BLOCK_SPIN(%ecx)
/*
* At this point, we've switched address spaces and are ready
* to load up the rest of the next context.
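For reference, the SETOP/BLOCK_SPIN pair implements the td_lock hand-off that per-CPU scheduler locks require: cpu_switch() publishes the outgoing thread by storing the new lock into its td_lock (an atomic xchgl when SMP and SCHED_ULE are both defined, a plain movl otherwise), and then spins until the incoming thread's td_lock no longer points at blocked_lock, i.e. until the CPU that last ran it has let go (blocked_lock itself is set up as a permanently held spin mutex in kern_mutex.c below). A loose, self-contained C sketch of that protocol; cpu_switch_locks() and the *_sketch names are invented for illustration, not the kernel API.

#include <stdatomic.h>

struct mtx { int mtx_dummy; };             /* stand-in for the kernel type */
static struct mtx blocked_lock;            /* "always blocked" marker lock */

struct thread_sketch {                     /* stand-in for struct thread */
        _Atomic(struct mtx *) td_lock;
};

/* Hypothetical helper mirroring the SETOP / BLOCK_SPIN steps inside
 * cpu_switch(). */
static void
cpu_switch_locks(struct thread_sketch *oldtd, struct thread_sketch *newtd,
    struct mtx *newlock)
{
        /* SETOP (xchgl under SMP+ULE): release the old thread by
         * pointing its td_lock at the lock the scheduler passed in. */
        atomic_exchange(&oldtd->td_lock, newlock);

        /* BLOCK_SPIN: wait until the new thread's td_lock stops
         * pointing at blocked_lock, i.e. its previous CPU is done
         * with it, before loading the rest of its context. */
        while (atomic_load(&newtd->td_lock) == &blocked_lock)
                ;                          /* pause / cpu_spinwait() */
}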
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_mutex.c#6 (text+ko) ====
@@ -34,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.196 2007/06/09 18:09:37 mjacob Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.198 2007/07/18 20:46:05 jeff Exp $");
#include "opt_adaptive_mutexes.h"
#include "opt_ddb.h"
@@ -118,7 +118,6 @@
* System-wide mutexes
*/
struct mtx blocked_lock;
-struct mtx sched_lock;
struct mtx Giant;
#ifdef LOCK_PROFILING
@@ -473,9 +472,12 @@
{
struct mtx *m;
uintptr_t tid;
- int i;
+ int i, contested;
+ uint64_t waittime;
- i = 0;
+
+ contested = i = 0;
+ waittime = 0;
tid = (uintptr_t)curthread;
for (;;) {
retry:
@@ -488,6 +490,7 @@
m->mtx_recurse++;
break;
}
+ lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
/* Give interrupts a chance while we spin. */
spinlock_exit();
while (m->mtx_lock != MTX_UNOWNED) {
@@ -508,6 +511,8 @@
break;
_rel_spin_lock(m); /* does spinlock_exit() */
}
+ lock_profile_obtain_lock_success(&m->lock_object, contested,
+ waittime, (file), (line));
WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
}
@@ -769,7 +774,6 @@
* Initialize mutexes.
*/
mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
- mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN);
blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */
mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
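Besides dropping the global sched_lock initialization, this hunk brackets the spin-mutex slow path with lock profiling: lock_profile_obtain_lock_failed() notes the contention while the thread is still spinning, and lock_profile_obtain_lock_success() charges the accumulated wait once the lock is finally held. A toy userland sketch of that failed/success bracket, using made-up names rather than the kernel's lock_profile API.

#include <stdatomic.h>
#include <stdint.h>

/* Toy spinlock; initialize .locked with ATOMIC_FLAG_INIT before use. */
struct toy_lock {
        atomic_flag locked;
        uint64_t    contest_count;  /* acquisitions that had to spin */
        uint64_t    spin_cycles;    /* total iterations spent spinning */
};

static void
spin_lock_profiled(struct toy_lock *m)
{
        int contested = 0;
        uint64_t waittime = 0;

        while (atomic_flag_test_and_set(&m->locked)) {
                /* lock_profile_obtain_lock_failed() analogue: record
                 * that we contended and keep accumulating the wait. */
                contested = 1;
                waittime++;
        }
        /* lock_profile_obtain_lock_success() analogue: charge the
         * statistics only once the lock is actually held. */
        m->contest_count += contested;
        m->spin_cycles += waittime;
}

static void
spin_unlock_profiled(struct toy_lock *m)
{
        atomic_flag_clear(&m->locked);
}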
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_resource.c#7 (text+ko) ====
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.179 2007/07/12 18:01:31 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.180 2007/07/17 01:08:09 jeff Exp $");
#include "opt_compat.h"
@@ -840,6 +840,14 @@
p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
PCPU_SET(switchtime, u);
}
+ /* Make sure the per-thread stats are current. */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_runtime == 0)
+ continue;
+ thread_lock(td);
+ ruxagg(&p->p_rux, td);
+ thread_unlock(td);
+ }
calcru1(p, &p->p_rux, up, sp);
}
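The new loop makes calcru() see runtime that is still sitting in per-thread counters: each thread with pending ticks is locked and its counters are folded into the process-wide totals before calcru1() runs, which is why already-folded threads show td_runtime == 0 and are skipped. A toy sketch of that fold; the field and helper names are invented and only approximate what ruxagg() does for the real counters.

#include <stdint.h>

struct rux_sketch { uint64_t runtime; };            /* per-process total */
struct thread_rux_sketch { uint64_t td_runtime; };  /* not yet folded in */

/* Fold a thread's pending runtime into the process total and clear it,
 * so a later pass sees nothing left to aggregate. Call with the
 * thread's lock held, as the loop above does. */
static void
ruxagg_sketch(struct rux_sketch *rux, struct thread_rux_sketch *td)
{
        rux->runtime += td->td_runtime;
        td->td_runtime = 0;
}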
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_4bsd.c#4 (text+ko) ====
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.102 2007/06/12 07:47:09 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.103 2007/07/18 20:46:05 jeff Exp $");
#include "opt_hwpmc_hooks.h"
@@ -101,6 +101,7 @@
((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
static struct td_sched td_sched0;
+struct mtx sched_lock;
static int sched_tdcnt; /* Total runnable threads in the system. */
static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
@@ -578,6 +579,7 @@
thread0.td_sched = &td_sched0;
thread0.td_lock = &sched_lock;
td_sched0.ts_thread = &thread0;
+ mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
}
int
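The point of moving sched_lock into sched_4bsd.c is that the lock protecting a thread is now reached through td_lock instead of being a single global: 4BSD initializes sched_lock itself and points every thread at it, while ULE (below) points threads at a per-CPU tdq_lock, and thread_lock()/thread_unlock() simply follow the pointer. A rough self-contained sketch of that indirection; all *_sketch names are invented, and the real thread_lock() additionally retries if td_lock changes while it was waiting.

struct mtx_sketch { int mtx_owned; };      /* stand-in for struct mtx */

struct thread_sketch {
        struct mtx_sketch *td_lock;        /* lock protecting this thread */
};

/* SCHED_4BSD: one global lock, every thread's td_lock points here. */
static struct mtx_sketch sched_lock_sketch;

/* SCHED_ULE: one lock per CPU run queue; td_lock points at the queue
 * the thread is on (or at blocked_lock while it migrates). */
static struct mtx_sketch tdq_lock_sketch[4];   /* 4 stands in for MAXCPU */

static void mtx_lock_spin_sketch(struct mtx_sketch *m)   { m->mtx_owned = 1; }
static void mtx_unlock_spin_sketch(struct mtx_sketch *m) { m->mtx_owned = 0; }

/* thread_lock() does not care which scheduler installed the pointer. */
static void
thread_lock_sketch(struct thread_sketch *td)
{
        mtx_lock_spin_sketch(td->td_lock);
}

static void
thread_unlock_sketch(struct thread_sketch *td)
{
        mtx_unlock_spin_sketch(td->td_lock);
}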
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_ule.c#6 (text+ko) ====
@@ -24,8 +24,19 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/*
+ * This file implements the ULE scheduler. ULE supports independent CPU
+ * run queues and fine grain locking. It has superior interactive
+ * performance under load even on uni-processor systems.
+ *
+ * etymology:
+ * ULE is the last three letters in schedule. It owes its name to a
+ * generic user created for a scheduling system by Paul Mikesell at
+ * Isilon Systems and a general lack of creativity on the part of the author.
+ */
+
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.199 2007/06/15 19:33:58 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.200 2007/07/17 22:53:23 jeff Exp $");
#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"
@@ -64,26 +75,23 @@
#error "SCHED_ULE requires options PREEMPTION"
#endif
-/*
- * TODO:
- * Pick idle from affinity group or self group first.
- * Implement pick_score.
- */
-
-#define KTR_ULE 0x0 /* Enable for pickpri debugging. */
+#define KTR_ULE 0
/*
- * Thread scheduler specific section.
+ * Thread scheduler specific section. All fields are protected
+ * by the thread lock.
*/
struct td_sched {
- TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */
- int ts_flags; /* (j) TSF_* flags. */
- struct thread *ts_thread; /* (*) Active associated thread. */
- u_char ts_rqindex; /* (j) Run queue index. */
- int ts_slptime;
- int ts_slice;
- struct runq *ts_runq;
+ TAILQ_ENTRY(td_sched) ts_procq; /* Run queue. */
+ struct thread *ts_thread; /* Active associated thread. */
+ struct runq *ts_runq; /* Run-queue we're queued on. */
+ short ts_flags; /* TSF_* flags. */
+ u_char ts_rqindex; /* Run queue index. */
u_char ts_cpu; /* CPU that we have affinity for. */
+ int ts_slptick; /* Tick when we went to sleep. */
+ int ts_slice; /* Ticks of slice remaining. */
+ u_int ts_slptime; /* Number of ticks we vol. slept */
+ u_int ts_runtime; /* Number of ticks we were running */
/* The following variables are only used for pctcpu calculation */
int ts_ltick; /* Last tick that we were running on */
int ts_ftick; /* First tick that we were running on */
@@ -91,10 +99,6 @@
#ifdef SMP
int ts_rltick; /* Real last tick, for affinity. */
#endif
-
- /* originally from kg_sched */
- u_int skg_slptime; /* Number of ticks we vol. slept */
- u_int skg_runtime; /* Number of ticks we were running */
};
/* flags kept in ts_flags */
#define TSF_BOUND 0x0001 /* Thread can not migrate. */
@@ -165,33 +169,40 @@
* due to rounding would be unacceptably high.
* realstathz: stathz is sometimes 0 and run off of hz.
* sched_slice: Runtime of each thread before rescheduling.
+ * preempt_thresh: Priority threshold for preemption and remote IPIs.
*/
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
static int sched_slice;
+static int preempt_thresh = PRI_MIN_KERN;
+#define SCHED_BAL_SECS 2 /* How often we run the rebalance algorithm. */
+
/*
- * tdq - per processor runqs and statistics.
+ * tdq - per processor runqs and statistics. All fields are protected by the
+ * tdq_lock. The load and lowpri may be accessed without the lock to avoid
+ * excess locking in sched_pickcpu().
*/
struct tdq {
+ struct mtx tdq_lock; /* Protects all fields below. */
+ struct runq tdq_realtime; /* real-time run queue. */
+ struct runq tdq_timeshare; /* timeshare run queue. */
struct runq tdq_idle; /* Queue of IDLE threads. */
- struct runq tdq_timeshare; /* timeshare run queue. */
- struct runq tdq_realtime; /* real-time run queue. */
+ int tdq_load; /* Aggregate load. */
u_char tdq_idx; /* Current insert index. */
u_char tdq_ridx; /* Current removal index. */
- short tdq_flags; /* Thread queue flags */
- int tdq_load; /* Aggregate load. */
#ifdef SMP
- int tdq_transferable;
+ u_char tdq_lowpri; /* Lowest priority thread. */
+ int tdq_transferable; /* Transferable thread count. */
LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */
struct tdq_group *tdq_group; /* Our processor group. */
#else
int tdq_sysload; /* For loadavg, !ITHD load. */
#endif
-};
+ char tdq_name[16]; /* lock name. */
+} __aligned(64);
-#define TDQF_BUSY 0x0001 /* Queue is marked as busy */
#ifdef SMP
/*
@@ -210,9 +221,9 @@
int tdg_load; /* Total load of this group. */
int tdg_transferable; /* Transferable load of this group. */
LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */
-};
+} __aligned(64);
-#define SCHED_AFFINITY_DEFAULT (hz / 100)
+#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300))
#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity)
/*
@@ -220,28 +231,23 @@
*/
static int rebalance = 0;
static int pick_pri = 0;
+static int pick_zero = 0;
static int affinity;
static int tryself = 1;
static int tryselfidle = 1;
-static int ipi_ast = 0;
-static int ipi_preempt = 1;
-static int ipi_thresh = PRI_MIN_KERN;
-static int steal_htt = 1;
-static int steal_busy = 1;
-static int busy_thresh = 4;
+static int steal_htt = 0;
+static int steal_idle = 0;
static int topology = 0;
/*
* One thread queue per processor.
*/
static volatile cpumask_t tdq_idle;
-static volatile cpumask_t tdq_busy;
static int tdg_maxid;
static struct tdq tdq_cpu[MAXCPU];
static struct tdq_group tdq_groups[MAXCPU];
-static int bal_tick;
-static int gbal_tick;
-static int balance_groups;
+static struct callout balco;
+static struct callout gbalco;
#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
#define TDQ_CPU(x) (&tdq_cpu[(x)])
@@ -255,14 +261,18 @@
#define TDQ_CPU(x) (&tdq_cpu)
#endif
+#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type))
+#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
+#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCKPTR(t) (&(t)->tdq_lock)
+
static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
static void sched_pctcpu_update(struct td_sched *);
-static inline void sched_pin_td(struct thread *td);
-static inline void sched_unpin_td(struct thread *td);
/* Operations on per processor queues */
static struct td_sched * tdq_choose(struct tdq *);
@@ -273,19 +283,21 @@
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
+static void tdq_add(struct tdq *, struct thread *, int);
#ifdef SMP
-static int tdq_pickidle(struct tdq *, struct td_sched *);
-static int tdq_pickpri(struct tdq *, struct td_sched *, int);
+static void tdq_move(struct tdq *, struct tdq *);
+static int tdq_idled(struct tdq *);
+static void tdq_notify(struct td_sched *);
+static struct td_sched *tdq_steal(struct tdq *, int);
static struct td_sched *runq_steal(struct runq *);
-static void sched_balance(void);
-static void sched_balance_groups(void);
+static int sched_pickcpu(struct td_sched *, int);
+static void sched_balance(void *);
+static void sched_balance_groups(void *);
static void sched_balance_group(struct tdq_group *);
static void sched_balance_pair(struct tdq *, struct tdq *);
-static void sched_smp_tick(struct thread *);
-static void tdq_move(struct tdq *, int);
-static int tdq_idled(struct tdq *);
-static void tdq_notify(struct td_sched *);
-static struct td_sched *tdq_steal(struct tdq *, int);
+static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
+static inline struct mtx *thread_block_switch(struct thread *);
+static inline void thread_unblock_switch(struct thread *, struct mtx *);
#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
#endif
@@ -296,18 +308,9 @@
static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
-static inline void
-sched_pin_td(struct thread *td)
-{
- td->td_pinned++;
-}
-
-static inline void
-sched_unpin_td(struct thread *td)
-{
- td->td_pinned--;
-}
-
+/*
+ * Print the threads waiting on a run-queue.
+ */
static void
runq_print(struct runq *rq)
{
@@ -332,6 +335,9 @@
}
}
+/*
+ * Print the status of a per-cpu thread queue. Should be a ddb show cmd.
+ */
void
tdq_print(int cpu)
{
@@ -340,8 +346,10 @@
tdq = TDQ_CPU(cpu);
printf("tdq:\n");
+ printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tlock name %s\n", tdq->tdq_name);
printf("\tload: %d\n", tdq->tdq_load);
- printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
+ printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
printf("\trealtime runq:\n");
runq_print(&tdq->tdq_realtime);
@@ -351,22 +359,26 @@
runq_print(&tdq->tdq_idle);
#ifdef SMP
printf("\tload transferable: %d\n", tdq->tdq_transferable);
+ printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
#endif
}
+#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
+/*
+ * Add a thread to the actual run-queue. Keeps transferable counts up to
+ * date with what is actually on the run-queue. Selects the correct
+ * queue position for timeshare threads.
+ */
static __inline void
tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
{
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
#ifdef SMP
if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
tdq->tdq_transferable++;
tdq->tdq_group->tdg_transferable++;
ts->ts_flags |= TSF_XFERABLE;
- if (tdq->tdq_transferable >= busy_thresh &&
- (tdq->tdq_flags & TDQF_BUSY) == 0) {
- tdq->tdq_flags |= TDQF_BUSY;
- atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq));
- }
}
#endif
if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -379,7 +391,6 @@
* This queue contains only priorities between MIN and MAX
* realtime. Use the whole queue to represent these values.
*/
-#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
if ((flags & SRQ_BORROWING) == 0) {
pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
pri = (pri + tdq->tdq_idx) % RQ_NQS;
@@ -398,19 +409,22 @@
runq_add(ts->ts_runq, ts, flags);
}
+/*
+ * Remove a thread from a run-queue. This typically happens when a thread
+ * is selected to run. Running threads are not on the queue and the
+ * transferable count does not reflect them.
+ */
static __inline void
tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
{
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT(ts->ts_runq != NULL,
+ ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
#ifdef SMP
if (ts->ts_flags & TSF_XFERABLE) {
tdq->tdq_transferable--;
tdq->tdq_group->tdg_transferable--;
ts->ts_flags &= ~TSF_XFERABLE;
- if (tdq->tdq_transferable < busy_thresh &&
- (tdq->tdq_flags & TDQF_BUSY)) {
- atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq));
- tdq->tdq_flags &= ~TDQF_BUSY;
- }
}
#endif
if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -429,11 +443,17 @@
runq_remove(ts->ts_runq, ts);
}
+/*
+ * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load
+ * for this thread to the referenced thread queue.
+ */
static void
tdq_load_add(struct tdq *tdq, struct td_sched *ts)
{
int class;
- mtx_assert(&sched_lock, MA_OWNED);
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class);
tdq->tdq_load++;
CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load);
@@ -446,11 +466,17 @@
#endif
}
+/*
+ * Remove the load from a thread that is transitioning to a sleep state or
+ * exiting.
+ */
static void
tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
{
int class;
- mtx_assert(&sched_lock, MA_OWNED);
+
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class);
if (class != PRI_ITHD &&
(ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
@@ -459,27 +485,14 @@
#else
tdq->tdq_sysload--;
#endif
+ KASSERT(tdq->tdq_load != 0,
+ ("tdq_load_rem: Removing with 0 load on queue %d", (int)TDQ_ID(tdq)));
tdq->tdq_load--;
CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
ts->ts_runq = NULL;
}
#ifdef SMP
-static void
-sched_smp_tick(struct thread *td)
-{
- struct tdq *tdq;
-
- tdq = TDQ_SELF();
- if (rebalance) {
- if (ticks >= bal_tick)
- sched_balance();
- if (ticks >= gbal_tick && balance_groups)
- sched_balance_groups();
- }
- td->td_sched->ts_rltick = ticks;
-}
-
/*
* sched_balance is a simple CPU load balancing algorithm. It operates by
* finding the least loaded and most loaded cpu and equalizing their load
@@ -489,15 +502,11 @@
* installations will only have 2 cpus. Secondly, load balancing too much at
* once can have an unpleasant effect on the system. The scheduler rarely has
* enough information to make perfect decisions. So this algorithm chooses
- * algorithm simplicity and more gradual effects on load in larger systems.
+ * simplicity and more gradual effects on load in larger systems.
*
- * It could be improved by considering the priorities and slices assigned to
- * each task prior to balancing them. There are many pathological cases with
- * any approach and so the semi random algorithm below may work as well as any.
- *
*/
static void
-sched_balance(void)
+sched_balance(void *arg)
{
struct tdq_group *high;
struct tdq_group *low;
@@ -505,8 +514,9 @@
int cnt;
int i;
- bal_tick = ticks + (random() % (hz * 2));
- if (smp_started == 0)
+ callout_reset(&balco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+ sched_balance, NULL);
+ if (smp_started == 0 || rebalance == 0)
return;
low = high = NULL;
i = random() % (tdg_maxid + 1);
@@ -529,18 +539,25 @@
LIST_FIRST(&low->tdg_members));
}
+/*
+ * Balance load between CPUs in a group. Will only migrate within the group.
+ */
static void
-sched_balance_groups(void)
+sched_balance_groups(void *arg)
{
int i;
- gbal_tick = ticks + (random() % (hz * 2));
- mtx_assert(&sched_lock, MA_OWNED);
- if (smp_started)
- for (i = 0; i <= tdg_maxid; i++)
- sched_balance_group(TDQ_GROUP(i));
+ callout_reset(&gbalco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+ sched_balance_groups, NULL);
+ if (smp_started == 0 || rebalance == 0)
+ return;
+ for (i = 0; i <= tdg_maxid; i++)
+ sched_balance_group(TDQ_GROUP(i));
}
+/*
+ * Finds the greatest imbalance between two tdqs in a group.
+ */
static void
sched_balance_group(struct tdq_group *tdg)
{
@@ -564,6 +581,24 @@
sched_balance_pair(high, low);
}
+/*
+ * Lock two thread queues using their address to maintain lock order.
+ */
+static void
+tdq_lock_pair(struct tdq *one, struct tdq *two)
+{
+ if (one < two) {
+ TDQ_LOCK(one);
+ TDQ_LOCK_FLAGS(two, MTX_DUPOK);
+ } else {
+ TDQ_LOCK(two);
+ TDQ_LOCK_FLAGS(one, MTX_DUPOK);
+ }
+}
+
+/*
+ * Transfer load between two imbalanced thread queues.
+ */
static void
sched_balance_pair(struct tdq *high, struct tdq *low)
{
@@ -574,6 +609,7 @@
int diff;
int i;
+ tdq_lock_pair(high, low);
/*
* If we're transfering within a group we have to use this specific
* tdq's transferable count, otherwise we can steal from other members
@@ -588,31 +624,37 @@
high_load = high->tdq_group->tdg_load;
low_load = low->tdq_group->tdg_load;
}
- if (transferable == 0)
- return;
/*
* Determine what the imbalance is and then adjust that to how many
* threads we actually have to give up (transferable).
*/
- diff = high_load - low_load;
- move = diff / 2;
- if (diff & 0x1)
- move++;
- move = min(move, transferable);
- for (i = 0; i < move; i++)
- tdq_move(high, TDQ_ID(low));
+ if (transferable != 0) {
+ diff = high_load - low_load;
+ move = diff / 2;
+ if (diff & 0x1)
+ move++;
+ move = min(move, transferable);
+ for (i = 0; i < move; i++)
+ tdq_move(high, low);
+ }
+ TDQ_UNLOCK(high);
+ TDQ_UNLOCK(low);
return;
}
+/*
+ * Move a thread from one thread queue to another.
+ */
static void
-tdq_move(struct tdq *from, int cpu)
+tdq_move(struct tdq *from, struct tdq *to)
{
+ struct td_sched *ts;
+ struct thread *td;
struct tdq *tdq;
- struct tdq *to;
- struct td_sched *ts;
+ int cpu;
tdq = from;
- to = TDQ_CPU(cpu);
+ cpu = TDQ_ID(to);
ts = tdq_steal(tdq, 1);
if (ts == NULL) {
struct tdq_group *tdg;
@@ -625,26 +667,42 @@
break;
}
if (ts == NULL)
- panic("tdq_move: No threads available with a "
- "transferable count of %d\n",
- tdg->tdg_transferable);
+ return;
}
if (tdq == to)
return;
- sched_rem(ts->ts_thread);
+ td = ts->ts_thread;
+ /*
+ * Although the run queue is locked the thread may be blocked. Lock
+ * it to clear this.
+ */
+ thread_lock(td);
+ /* Drop recursive lock on from. */
+ TDQ_UNLOCK(from);
+ sched_rem(td);
ts->ts_cpu = cpu;
- sched_pin_td(ts->ts_thread);
- sched_add(ts->ts_thread, SRQ_YIELDING);
- sched_unpin_td(ts->ts_thread);
+ td->td_lock = TDQ_LOCKPTR(to);
+ tdq_add(to, td, SRQ_YIELDING);
}
+/*
+ * This tdq has idled. Try to steal a thread from another cpu and switch
+ * to it.
+ */
static int
tdq_idled(struct tdq *tdq)
{
struct tdq_group *tdg;
struct tdq *steal;
struct td_sched *ts;
+ struct thread *td;
+ int highload;
+ int highcpu;
+ int load;
+ int cpu;
+ /* We don't want to be preempted while we're iterating over tdqs */
+ spinlock_enter();
tdg = tdq->tdq_group;
/*
* If we're in a cpu group, try and steal threads from another cpu in
@@ -654,51 +712,59 @@
LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
if (steal == tdq || steal->tdq_transferable == 0)
continue;
+ TDQ_LOCK(steal);
ts = tdq_steal(steal, 0);
if (ts)
goto steal;
+ TDQ_UNLOCK(steal);
}
}
- if (steal_busy) {
- while (tdq_busy) {
- int cpu;
-
- cpu = ffs(tdq_busy);
- if (cpu == 0)
- break;
- cpu--;
+ for (;;) {
+ if (steal_idle == 0)
+ break;
+ highcpu = 0;
+ highload = 0;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (CPU_ABSENT(cpu))
+ continue;
steal = TDQ_CPU(cpu);
- if (steal->tdq_transferable == 0)
+ load = TDQ_CPU(cpu)->tdq_transferable;
+ if (load < highload)
continue;
- ts = tdq_steal(steal, 1);
- if (ts == NULL)
- continue;
- CTR5(KTR_ULE,
- "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X",
- ts->ts_thread, ts->ts_thread->td_proc->p_comm,
- ts->ts_thread->td_priority, cpu, tdq_busy);
+ highload = load;
+ highcpu = cpu;
+ }
+ if (highload < 2)
+ break;
+ steal = TDQ_CPU(highcpu);
+ TDQ_LOCK(steal);
+ if (steal->tdq_transferable > 1 &&
+ (ts = tdq_steal(steal, 1)) != NULL)
goto steal;
- }
+ TDQ_UNLOCK(steal);
+ break;
}
- /*
- * We only set the idled bit when all of the cpus in the group are
- * idle. Otherwise we could get into a situation where a thread bounces
- * back and forth between two idle cores on seperate physical CPUs.
- */
- tdg->tdg_idlemask |= PCPU_GET(cpumask);
- if (tdg->tdg_idlemask == tdg->tdg_cpumask)
- atomic_set_int(&tdq_idle, tdg->tdg_mask);
+ spinlock_exit();
return (1);
steal:
- sched_rem(ts->ts_thread);
- ts->ts_cpu = PCPU_GET(cpuid);
- sched_pin_td(ts->ts_thread);
- sched_add(ts->ts_thread, SRQ_YIELDING);
- sched_unpin_td(ts->ts_thread);
+ td = ts->ts_thread;
+ thread_lock(td);
+ spinlock_exit();
+ MPASS(td->td_lock == TDQ_LOCKPTR(steal));
+ TDQ_UNLOCK(steal);
+ sched_rem(td);
+ sched_setcpu(ts, PCPU_GET(cpuid), SRQ_YIELDING);
+ tdq_add(tdq, td, SRQ_YIELDING);
+ MPASS(td->td_lock == curthread->td_lock);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(curthread);
return (0);
}
+/*
+ * Notify a remote cpu of new work. Sends an IPI if criteria are met.
+ */
static void
tdq_notify(struct td_sched *ts)
{
@@ -734,29 +800,74 @@
/*
* Otherwise only IPI if we exceed the threshold.
*/
- if (pri > ipi_thresh)
+ if (pri > preempt_thresh)
return;
sendipi:
ctd->td_flags |= TDF_NEEDRESCHED;
- if (cpri < PRI_MIN_IDLE) {
- if (ipi_ast)
- ipi_selected(1 << cpu, IPI_AST);
- else if (ipi_preempt)
- ipi_selected(1 << cpu, IPI_PREEMPT);
- } else
- ipi_selected(1 << cpu, IPI_PREEMPT);
+ ipi_selected(1 << cpu, IPI_PREEMPT);
+}
+
+/*
+ * Steals load from a timeshare queue. Honors the rotating queue head
+ * index.
+ */
+static struct td_sched *
+runq_steal_from(struct runq *rq, u_char start)
+{
+ struct td_sched *ts;
+ struct rqbits *rqb;
+ struct rqhead *rqh;
+ int first;
+ int bit;
+ int pri;
+ int i;
+
+ rqb = &rq->rq_status;
+ bit = start & (RQB_BPW -1);
+ pri = 0;
+ first = 0;
+again:
+ for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
+ if (rqb->rqb_bits[i] == 0)
+ continue;
+ if (bit != 0) {
+ for (pri = bit; pri < RQB_BPW; pri++)
+ if (rqb->rqb_bits[i] & (1ul << pri))
+ break;
>>> TRUNCATED FOR MAIL (1000 lines) <<<