PERFORCE change 123983 for review
Ulf Lilleengen
lulf at FreeBSD.org
Mon Jul 23 23:02:17 UTC 2007
http://perforce.freebsd.org/chv.cgi?CH=123983
Change 123983 by lulf at lulf_carrot on 2007/07/23 23:01:14
	IFC (integrate from CURRENT)
Affected files ...
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/conf/NOTES#11 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/genassym.c#3 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/swtch.s#3 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_mutex.c#6 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_resource.c#7 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_4bsd.c#4 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_ule.c#6 integrate
.. //depot/projects/soc2007/lulf/gvinum_fixup/sys/sys/mutex.h#3 integrate
Differences ...
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/conf/NOTES#11 (text+ko) ====
@@ -1,4 +1,4 @@
-# $FreeBSD: src/sys/conf/NOTES,v 1.1444 2007/07/14 21:49:23 rwatson Exp $
+# $FreeBSD: src/sys/conf/NOTES,v 1.1445 2007/07/18 02:51:21 jeff Exp $
#
# NOTES -- Lines that can be cut/pasted into kernel and hints configs.
#
@@ -176,10 +176,11 @@
# queue and no CPU affinity which makes it suboptimal for SMP. It has very
# good interactivity and priority selection.
#
-# SCHED_ULE is a new scheduler that has been designed for SMP and has some
-# advantages for UP as well. It is intended to replace the 4BSD scheduler
-# over time. NOTE: SCHED_ULE is currently considered experimental and is
-# not recommended for production use at this time.
+# SCHED_ULE provides significant performance advantages over 4BSD on many
+# workloads on SMP machines. It supports cpu-affinity, per-cpu runqueues
+# and scheduler locks. It also has a stronger notion of interactivity
+# which leads to better responsiveness even on uniprocessor machines. This
+# will eventually become the default scheduler.
#
options SCHED_4BSD
#options SCHED_ULE
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/genassym.c#3 (text+ko) ====
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.157 2007/06/06 07:35:07 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/i386/i386/genassym.c,v 1.158 2007/07/17 22:34:14 jeff Exp $");
#include "opt_apic.h"
#include "opt_compat.h"
@@ -81,6 +81,7 @@
ASSYM(P_SFLAG, offsetof(struct proc, p_sflag));
ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(TD_LOCK, offsetof(struct thread, td_lock));
ASSYM(TD_PCB, offsetof(struct thread, td_pcb));
ASSYM(TD_PROC, offsetof(struct thread, td_proc));
ASSYM(TD_MD, offsetof(struct thread, td_md));
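The genassym change only exports the byte offset of td_lock so the assembly in swtch.s can reach the field: each ASSYM() entry becomes a constant in the generated assym.s that swtch.s includes, and TD_LOCK(reg) is then a plain displacement off a thread pointer. A rough userland C illustration of what that constant means; struct thread_sketch, TD_LOCK_OFFSET and read_td_lock are invented names for illustration only, not kernel code.

#include <stddef.h>

struct mtx;
struct thread_sketch {                  /* stand-in for struct thread */
        int     td_flags;
        struct mtx *td_lock;
        /* ... */
};

/* ASSYM(TD_LOCK, offsetof(struct thread, td_lock)) boils down to a
 * constant like this in the generated assym.s. */
#define TD_LOCK_OFFSET  offsetof(struct thread_sketch, td_lock)

/* TD_LOCK(%ecx) in swtch.s means "TD_LOCK_OFFSET(%ecx)"; the C
 * equivalent of that offset-based access is: */
static struct mtx *
read_td_lock(void *td)
{
        return (*(struct mtx **)((char *)td + TD_LOCK_OFFSET));
}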
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/i386/i386/swtch.s#3 (text+ko) ====
@@ -29,15 +29,32 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.154 2007/06/06 07:35:07 davidxu Exp $
+ * $FreeBSD: src/sys/i386/i386/swtch.s,v 1.155 2007/07/17 22:34:14 jeff Exp $
*/
#include "opt_npx.h"
+#include "opt_sched.h"
#include <machine/asmacros.h>
#include "assym.s"
+#if defined(SMP) && defined(SCHED_ULE)
+#define SETOP xchgl
+#define BLOCK_SPIN(reg) \
+ movl $blocked_lock,%eax ; \
+ 100: ; \
+ lock ; \
+ cmpxchgl %eax,TD_LOCK(reg) ; \
+ jne 101f ; \
+ pause ; \
+ jmp 100b ; \
+ 101:
+#else
+#define SETOP movl
+#define BLOCK_SPIN(reg)
+#endif
+
/*****************************************************************************/
/* Scheduling */
/*****************************************************************************/
@@ -91,6 +108,7 @@
* 0(%esp) = ret
* 4(%esp) = oldtd
* 8(%esp) = newtd
+ * 12(%esp) = newlock
*/
ENTRY(cpu_switch)
@@ -145,13 +163,14 @@
#endif
/* Save is done. Now fire up new thread. Leave old vmspace. */
+ movl 4(%esp),%edi
movl 8(%esp),%ecx /* New thread */
+ movl 12(%esp),%esi /* New lock */
#ifdef INVARIANTS
testl %ecx,%ecx /* no thread? */
jz badsw3 /* no, panic */
#endif
movl TD_PCB(%ecx),%edx
- movl PCPU(CPUID), %esi
/* switch address space */
movl PCB_CR3(%edx),%eax
@@ -160,11 +179,14 @@
#else
cmpl %eax,IdlePTD /* Kernel address space? */
#endif
- je sw1
+ je sw0
movl %cr3,%ebx /* The same address space? */
cmpl %ebx,%eax
- je sw1
+ je sw0
movl %eax,%cr3 /* new address space */
+ movl %esi,%eax
+ movl PCPU(CPUID),%esi
+ SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */
/* Release bit from old pmap->pm_active */
movl PCPU(CURPMAP), %ebx
@@ -182,8 +204,12 @@
lock
#endif
btsl %esi, PM_ACTIVE(%ebx) /* set new */
+ jmp sw1
+sw0:
+ SETOP %esi,TD_LOCK(%edi) /* Switchout td_lock */
sw1:
+ BLOCK_SPIN(%ecx)
/*
* At this point, we've switched address spaces and are ready
* to load up the rest of the next context.
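For reference, the SETOP/BLOCK_SPIN pair implements the td_lock hand-off that per-CPU scheduler locks require: cpu_switch() publishes the outgoing thread by storing the new lock into its td_lock (an atomic xchgl when SMP and SCHED_ULE are both defined, a plain movl otherwise), and then spins until the incoming thread's td_lock no longer points at blocked_lock, i.e. until the CPU that last ran it has let go (blocked_lock itself is set up as a permanently held spin mutex in kern_mutex.c below). A loose, self-contained C sketch of that protocol; cpu_switch_locks() and the *_sketch names are invented for illustration, not the kernel API.

#include <stdatomic.h>

struct mtx { int mtx_dummy; };             /* stand-in for the kernel type */
static struct mtx blocked_lock;            /* "always blocked" marker lock */

struct thread_sketch {                     /* stand-in for struct thread */
        _Atomic(struct mtx *) td_lock;
};

/* Hypothetical helper mirroring the SETOP / BLOCK_SPIN steps inside
 * cpu_switch(). */
static void
cpu_switch_locks(struct thread_sketch *oldtd, struct thread_sketch *newtd,
    struct mtx *newlock)
{
        /* SETOP (xchgl under SMP+ULE): release the old thread by
         * pointing its td_lock at the lock the scheduler passed in. */
        atomic_exchange(&oldtd->td_lock, newlock);

        /* BLOCK_SPIN: wait until the new thread's td_lock stops
         * pointing at blocked_lock, i.e. its previous CPU is done
         * with it, before loading the rest of its context. */
        while (atomic_load(&newtd->td_lock) == &blocked_lock)
                ;                          /* pause / cpu_spinwait() */
}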
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_mutex.c#6 (text+ko) ====
@@ -34,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.196 2007/06/09 18:09:37 mjacob Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.198 2007/07/18 20:46:05 jeff Exp $");
#include "opt_adaptive_mutexes.h"
#include "opt_ddb.h"
@@ -118,7 +118,6 @@
* System-wide mutexes
*/
struct mtx blocked_lock;
-struct mtx sched_lock;
struct mtx Giant;
#ifdef LOCK_PROFILING
@@ -473,9 +472,12 @@
{
struct mtx *m;
uintptr_t tid;
- int i;
+ int i, contested;
+ uint64_t waittime;
- i = 0;
+
+ contested = i = 0;
+ waittime = 0;
tid = (uintptr_t)curthread;
for (;;) {
retry:
@@ -488,6 +490,7 @@
m->mtx_recurse++;
break;
}
+ lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
/* Give interrupts a chance while we spin. */
spinlock_exit();
while (m->mtx_lock != MTX_UNOWNED) {
@@ -508,6 +511,8 @@
break;
_rel_spin_lock(m); /* does spinlock_exit() */
}
+ lock_profile_obtain_lock_success(&m->lock_object, contested,
+ waittime, (file), (line));
WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
}
@@ -769,7 +774,6 @@
* Initialize mutexes.
*/
mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
- mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN);
blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */
mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
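Besides dropping the global sched_lock initialization, this hunk brackets the spin-mutex slow path with lock profiling: lock_profile_obtain_lock_failed() notes the contention while the thread is still spinning, and lock_profile_obtain_lock_success() charges the accumulated wait once the lock is finally held. A toy userland sketch of that failed/success bracket, using made-up names rather than the kernel's lock_profile API.

#include <stdatomic.h>
#include <stdint.h>

/* Toy spinlock; initialize .locked with ATOMIC_FLAG_INIT before use. */
struct toy_lock {
        atomic_flag locked;
        uint64_t    contest_count;  /* acquisitions that had to spin */
        uint64_t    spin_cycles;    /* total iterations spent spinning */
};

static void
spin_lock_profiled(struct toy_lock *m)
{
        int contested = 0;
        uint64_t waittime = 0;

        while (atomic_flag_test_and_set(&m->locked)) {
                /* lock_profile_obtain_lock_failed() analogue: record
                 * that we contended and keep accumulating the wait. */
                contested = 1;
                waittime++;
        }
        /* lock_profile_obtain_lock_success() analogue: charge the
         * statistics only once the lock is actually held. */
        m->contest_count += contested;
        m->spin_cycles += waittime;
}

static void
spin_unlock_profiled(struct toy_lock *m)
{
        atomic_flag_clear(&m->locked);
}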
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/kern_resource.c#7 (text+ko) ====
@@ -35,7 +35,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.179 2007/07/12 18:01:31 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.180 2007/07/17 01:08:09 jeff Exp $");
#include "opt_compat.h"
@@ -840,6 +840,14 @@
p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
PCPU_SET(switchtime, u);
}
+ /* Make sure the per-thread stats are current. */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_runtime == 0)
+ continue;
+ thread_lock(td);
+ ruxagg(&p->p_rux, td);
+ thread_unlock(td);
+ }
calcru1(p, &p->p_rux, up, sp);
}
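The new loop makes calcru() see runtime that is still sitting in per-thread counters: each thread with pending ticks is locked and its counters are folded into the process-wide totals before calcru1() runs, which is why already-folded threads show td_runtime == 0 and are skipped. A toy sketch of that fold; the field and helper names are invented and only approximate what ruxagg() does for the real counters.

#include <stdint.h>

struct rux_sketch { uint64_t runtime; };            /* per-process total */
struct thread_rux_sketch { uint64_t td_runtime; };  /* not yet folded in */

/* Fold a thread's pending runtime into the process total and clear it,
 * so a later pass sees nothing left to aggregate. Call with the
 * thread's lock held, as the loop above does. */
static void
ruxagg_sketch(struct rux_sketch *rux, struct thread_rux_sketch *td)
{
        rux->runtime += td->td_runtime;
        td->td_runtime = 0;
}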
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_4bsd.c#4 (text+ko) ====
@@ -33,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.102 2007/06/12 07:47:09 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.103 2007/07/18 20:46:05 jeff Exp $");
#include "opt_hwpmc_hooks.h"
@@ -101,6 +101,7 @@
((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
static struct td_sched td_sched0;
+struct mtx sched_lock;
static int sched_tdcnt; /* Total runnable threads in the system. */
static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */
@@ -578,6 +579,7 @@
thread0.td_sched = &td_sched0;
thread0.td_lock = &sched_lock;
td_sched0.ts_thread = &thread0;
+ mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
}
int
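The point of moving sched_lock into sched_4bsd.c is that the lock protecting a thread is now reached through td_lock instead of being a single global: 4BSD initializes sched_lock itself and points every thread at it, while ULE (below) points threads at a per-CPU tdq_lock, and thread_lock()/thread_unlock() simply follow the pointer. A rough self-contained sketch of that indirection; all *_sketch names are invented, and the real thread_lock() additionally retries if td_lock changes while it was waiting.

struct mtx_sketch { int mtx_owned; };      /* stand-in for struct mtx */

struct thread_sketch {
        struct mtx_sketch *td_lock;        /* lock protecting this thread */
};

/* SCHED_4BSD: one global lock, every thread's td_lock points here. */
static struct mtx_sketch sched_lock_sketch;

/* SCHED_ULE: one lock per CPU run queue; td_lock points at the queue
 * the thread is on (or at blocked_lock while it migrates). */
static struct mtx_sketch tdq_lock_sketch[4];   /* 4 stands in for MAXCPU */

static void mtx_lock_spin_sketch(struct mtx_sketch *m)   { m->mtx_owned = 1; }
static void mtx_unlock_spin_sketch(struct mtx_sketch *m) { m->mtx_owned = 0; }

/* thread_lock() does not care which scheduler installed the pointer. */
static void
thread_lock_sketch(struct thread_sketch *td)
{
        mtx_lock_spin_sketch(td->td_lock);
}

static void
thread_unlock_sketch(struct thread_sketch *td)
{
        mtx_unlock_spin_sketch(td->td_lock);
}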
==== //depot/projects/soc2007/lulf/gvinum_fixup/sys/kern/sched_ule.c#6 (text+ko) ====
@@ -24,8 +24,19 @@
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/*
+ * This file implements the ULE scheduler. ULE supports independent CPU
+ * run queues and fine grain locking. It has superior interactive
+ * performance under load even on uni-processor systems.
+ *
+ * etymology:
+ * ULE is the last three letters in schedule. It owes its name to a
+ * generic user created for a scheduling system by Paul Mikesell at
+ * Isilon Systems and a general lack of creativity on the part of the author.
+ */
+
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.199 2007/06/15 19:33:58 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.200 2007/07/17 22:53:23 jeff Exp $");
#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"
@@ -64,26 +75,23 @@
#error "SCHED_ULE requires options PREEMPTION"
#endif
-/*
- * TODO:
- * Pick idle from affinity group or self group first.
- * Implement pick_score.
- */
-
-#define KTR_ULE 0x0 /* Enable for pickpri debugging. */
+#define KTR_ULE 0
/*
- * Thread scheduler specific section.
+ * Thread scheduler specific section. All fields are protected
+ * by the thread lock.
*/
struct td_sched {
- TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */
- int ts_flags; /* (j) TSF_* flags. */
- struct thread *ts_thread; /* (*) Active associated thread. */
- u_char ts_rqindex; /* (j) Run queue index. */
- int ts_slptime;
- int ts_slice;
- struct runq *ts_runq;
+ TAILQ_ENTRY(td_sched) ts_procq; /* Run queue. */
+ struct thread *ts_thread; /* Active associated thread. */
+ struct runq *ts_runq; /* Run-queue we're queued on. */
+ short ts_flags; /* TSF_* flags. */
+ u_char ts_rqindex; /* Run queue index. */
u_char ts_cpu; /* CPU that we have affinity for. */
+ int ts_slptick; /* Tick when we went to sleep. */
+ int ts_slice; /* Ticks of slice remaining. */
+ u_int ts_slptime; /* Number of ticks we vol. slept */
+ u_int ts_runtime; /* Number of ticks we were running */
/* The following variables are only used for pctcpu calculation */
int ts_ltick; /* Last tick that we were running on */
int ts_ftick; /* First tick that we were running on */
@@ -91,10 +99,6 @@
#ifdef SMP
int ts_rltick; /* Real last tick, for affinity. */
#endif
-
- /* originally from kg_sched */
- u_int skg_slptime; /* Number of ticks we vol. slept */
- u_int skg_runtime; /* Number of ticks we were running */
};
/* flags kept in ts_flags */
#define TSF_BOUND 0x0001 /* Thread can not migrate. */
@@ -165,33 +169,40 @@
* due to rounding would be unacceptably high.
* realstathz: stathz is sometimes 0 and run off of hz.
* sched_slice: Runtime of each thread before rescheduling.
+ * preempt_thresh: Priority threshold for preemption and remote IPIs.
*/
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
static int sched_slice;
+static int preempt_thresh = PRI_MIN_KERN;
+#define SCHED_BAL_SECS 2 /* How often we run the rebalance algorithm. */
+
/*
- * tdq - per processor runqs and statistics.
+ * tdq - per processor runqs and statistics. All fields are protected by the
+ * tdq_lock. The load and lowpri may be accessed without the lock to avoid
+ * excess locking in sched_pickcpu().
*/
struct tdq {
+ struct mtx tdq_lock; /* Protects all fields below. */
+ struct runq tdq_realtime; /* real-time run queue. */
+ struct runq tdq_timeshare; /* timeshare run queue. */
struct runq tdq_idle; /* Queue of IDLE threads. */
- struct runq tdq_timeshare; /* timeshare run queue. */
- struct runq tdq_realtime; /* real-time run queue. */
+ int tdq_load; /* Aggregate load. */
u_char tdq_idx; /* Current insert index. */
u_char tdq_ridx; /* Current removal index. */
- short tdq_flags; /* Thread queue flags */
- int tdq_load; /* Aggregate load. */
#ifdef SMP
- int tdq_transferable;
+ u_char tdq_lowpri; /* Lowest priority thread. */
+ int tdq_transferable; /* Transferable thread count. */
LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */
struct tdq_group *tdq_group; /* Our processor group. */
#else
int tdq_sysload; /* For loadavg, !ITHD load. */
#endif
-};
+ char tdq_name[16]; /* lock name. */
+} __aligned(64);
-#define TDQF_BUSY 0x0001 /* Queue is marked as busy */
#ifdef SMP
/*
@@ -210,9 +221,9 @@
int tdg_load; /* Total load of this group. */
int tdg_transferable; /* Transferable load of this group. */
LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */
-};
+} __aligned(64);
-#define SCHED_AFFINITY_DEFAULT (hz / 100)
+#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300))
#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity)
/*
@@ -220,28 +231,23 @@
*/
static int rebalance = 0;
static int pick_pri = 0;
+static int pick_zero = 0;
static int affinity;
static int tryself = 1;
static int tryselfidle = 1;
-static int ipi_ast = 0;
-static int ipi_preempt = 1;
-static int ipi_thresh = PRI_MIN_KERN;
-static int steal_htt = 1;
-static int steal_busy = 1;
-static int busy_thresh = 4;
+static int steal_htt = 0;
+static int steal_idle = 0;
static int topology = 0;
/*
* One thread queue per processor.
*/
static volatile cpumask_t tdq_idle;
-static volatile cpumask_t tdq_busy;
static int tdg_maxid;
static struct tdq tdq_cpu[MAXCPU];
static struct tdq_group tdq_groups[MAXCPU];
-static int bal_tick;
-static int gbal_tick;
-static int balance_groups;
+static struct callout balco;
+static struct callout gbalco;
#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
#define TDQ_CPU(x) (&tdq_cpu[(x)])
@@ -255,14 +261,18 @@
#define TDQ_CPU(x) (&tdq_cpu)
#endif
+#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type))
+#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
+#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCKPTR(t) (&(t)->tdq_lock)
+
static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
static void sched_pctcpu_update(struct td_sched *);
-static inline void sched_pin_td(struct thread *td);
-static inline void sched_unpin_td(struct thread *td);
/* Operations on per processor queues */
static struct td_sched * tdq_choose(struct tdq *);
@@ -273,19 +283,21 @@
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
+static void tdq_add(struct tdq *, struct thread *, int);
#ifdef SMP
-static int tdq_pickidle(struct tdq *, struct td_sched *);
-static int tdq_pickpri(struct tdq *, struct td_sched *, int);
+static void tdq_move(struct tdq *, struct tdq *);
+static int tdq_idled(struct tdq *);
+static void tdq_notify(struct td_sched *);
+static struct td_sched *tdq_steal(struct tdq *, int);
static struct td_sched *runq_steal(struct runq *);
-static void sched_balance(void);
-static void sched_balance_groups(void);
+static int sched_pickcpu(struct td_sched *, int);
+static void sched_balance(void *);
+static void sched_balance_groups(void *);
static void sched_balance_group(struct tdq_group *);
static void sched_balance_pair(struct tdq *, struct tdq *);
-static void sched_smp_tick(struct thread *);
-static void tdq_move(struct tdq *, int);
-static int tdq_idled(struct tdq *);
-static void tdq_notify(struct td_sched *);
-static struct td_sched *tdq_steal(struct tdq *, int);
+static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
+static inline struct mtx *thread_block_switch(struct thread *);
+static inline void thread_unblock_switch(struct thread *, struct mtx *);
#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
#endif
@@ -296,18 +308,9 @@
static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
-static inline void
-sched_pin_td(struct thread *td)
-{
- td->td_pinned++;
-}
-
-static inline void
-sched_unpin_td(struct thread *td)
-{
- td->td_pinned--;
-}
-
+/*
+ * Print the threads waiting on a run-queue.
+ */
static void
runq_print(struct runq *rq)
{
@@ -332,6 +335,9 @@
}
}
+/*
+ * Print the status of a per-cpu thread queue. Should be a ddb show cmd.
+ */
void
tdq_print(int cpu)
{
@@ -340,8 +346,10 @@
tdq = TDQ_CPU(cpu);
printf("tdq:\n");
+ printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tlock name %s\n", tdq->tdq_name);
printf("\tload: %d\n", tdq->tdq_load);
- printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
+ printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
printf("\trealtime runq:\n");
runq_print(&tdq->tdq_realtime);
@@ -351,22 +359,26 @@
runq_print(&tdq->tdq_idle);
#ifdef SMP
printf("\tload transferable: %d\n", tdq->tdq_transferable);
+ printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
#endif
}
+#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
+/*
+ * Add a thread to the actual run-queue. Keeps transferable counts up to
+ * date with what is actually on the run-queue. Selects the correct
+ * queue position for timeshare threads.
+ */
static __inline void
tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
{
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
#ifdef SMP
if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
tdq->tdq_transferable++;
tdq->tdq_group->tdg_transferable++;
ts->ts_flags |= TSF_XFERABLE;
- if (tdq->tdq_transferable >= busy_thresh &&
- (tdq->tdq_flags & TDQF_BUSY) == 0) {
- tdq->tdq_flags |= TDQF_BUSY;
- atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq));
- }
}
#endif
if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -379,7 +391,6 @@
* This queue contains only priorities between MIN and MAX
* realtime. Use the whole queue to represent these values.
*/
-#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
if ((flags & SRQ_BORROWING) == 0) {
pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
pri = (pri + tdq->tdq_idx) % RQ_NQS;
@@ -398,19 +409,22 @@
runq_add(ts->ts_runq, ts, flags);
}
+/*
+ * Remove a thread from a run-queue. This typically happens when a thread
+ * is selected to run. Running threads are not on the queue and the
+ * transferable count does not reflect them.
+ */
static __inline void
tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
{
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT(ts->ts_runq != NULL,
+ ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
#ifdef SMP
if (ts->ts_flags & TSF_XFERABLE) {
tdq->tdq_transferable--;
tdq->tdq_group->tdg_transferable--;
ts->ts_flags &= ~TSF_XFERABLE;
- if (tdq->tdq_transferable < busy_thresh &&
- (tdq->tdq_flags & TDQF_BUSY)) {
- atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq));
- tdq->tdq_flags &= ~TDQF_BUSY;
- }
}
#endif
if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -429,11 +443,17 @@
runq_remove(ts->ts_runq, ts);
}
+/*
+ * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load
+ * for this thread to the referenced thread queue.
+ */
static void
tdq_load_add(struct tdq *tdq, struct td_sched *ts)
{
int class;
- mtx_assert(&sched_lock, MA_OWNED);
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class);
tdq->tdq_load++;
CTR2(KTR_SCHED, "cpu %jd load: %d", TDQ_ID(tdq), tdq->tdq_load);
@@ -446,11 +466,17 @@
#endif
}
+/*
+ * Remove the load from a thread that is transitioning to a sleep state or
+ * exiting.
+ */
static void
tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
{
int class;
- mtx_assert(&sched_lock, MA_OWNED);
+
+ THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
class = PRI_BASE(ts->ts_thread->td_pri_class);
if (class != PRI_ITHD &&
(ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
@@ -459,27 +485,14 @@
#else
tdq->tdq_sysload--;
#endif
+ KASSERT(tdq->tdq_load != 0,
+ ("tdq_load_rem: Removing with 0 load on queue %d", (int)TDQ_ID(tdq)));
tdq->tdq_load--;
CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
ts->ts_runq = NULL;
}
#ifdef SMP
-static void
-sched_smp_tick(struct thread *td)
-{
- struct tdq *tdq;
-
- tdq = TDQ_SELF();
- if (rebalance) {
- if (ticks >= bal_tick)
- sched_balance();
- if (ticks >= gbal_tick && balance_groups)
- sched_balance_groups();
- }
- td->td_sched->ts_rltick = ticks;
-}
-
/*
* sched_balance is a simple CPU load balancing algorithm. It operates by
* finding the least loaded and most loaded cpu and equalizing their load
@@ -489,15 +502,11 @@
* installations will only have 2 cpus. Secondly, load balancing too much at
* once can have an unpleasant effect on the system. The scheduler rarely has
* enough information to make perfect decisions. So this algorithm chooses
- * algorithm simplicity and more gradual effects on load in larger systems.
+ * simplicity and more gradual effects on load in larger systems.
*
- * It could be improved by considering the priorities and slices assigned to
- * each task prior to balancing them. There are many pathological cases with
- * any approach and so the semi random algorithm below may work as well as any.
- *
*/
static void
-sched_balance(void)
+sched_balance(void *arg)
{
struct tdq_group *high;
struct tdq_group *low;
@@ -505,8 +514,9 @@
int cnt;
int i;
- bal_tick = ticks + (random() % (hz * 2));
- if (smp_started == 0)
+ callout_reset(&balco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+ sched_balance, NULL);
+ if (smp_started == 0 || rebalance == 0)
return;
low = high = NULL;
i = random() % (tdg_maxid + 1);
@@ -529,18 +539,25 @@
LIST_FIRST(&low->tdg_members));
}
+/*
+ * Balance load between CPUs in a group. Will only migrate within the group.
+ */
static void
-sched_balance_groups(void)
+sched_balance_groups(void *arg)
{
int i;
- gbal_tick = ticks + (random() % (hz * 2));
- mtx_assert(&sched_lock, MA_OWNED);
- if (smp_started)
- for (i = 0; i <= tdg_maxid; i++)
- sched_balance_group(TDQ_GROUP(i));
+ callout_reset(&gbalco, max(hz / 2, random() % (hz * SCHED_BAL_SECS)),
+ sched_balance_groups, NULL);
+ if (smp_started == 0 || rebalance == 0)
+ return;
+ for (i = 0; i <= tdg_maxid; i++)
+ sched_balance_group(TDQ_GROUP(i));
}
+/*
+ * Finds the greatest imbalance between two tdqs in a group.
+ */
static void
sched_balance_group(struct tdq_group *tdg)
{
@@ -564,6 +581,24 @@
sched_balance_pair(high, low);
}
+/*
+ * Lock two thread queues using their address to maintain lock order.
+ */
+static void
+tdq_lock_pair(struct tdq *one, struct tdq *two)
+{
+ if (one < two) {
+ TDQ_LOCK(one);
+ TDQ_LOCK_FLAGS(two, MTX_DUPOK);
+ } else {
+ TDQ_LOCK(two);
+ TDQ_LOCK_FLAGS(one, MTX_DUPOK);
+ }
+}
+
+/*
+ * Transfer load between two imbalanced thread queues.
+ */
static void
sched_balance_pair(struct tdq *high, struct tdq *low)
{
@@ -574,6 +609,7 @@
int diff;
int i;
+ tdq_lock_pair(high, low);
/*
* If we're transfering within a group we have to use this specific
* tdq's transferable count, otherwise we can steal from other members
@@ -588,31 +624,37 @@
high_load = high->tdq_group->tdg_load;
low_load = low->tdq_group->tdg_load;
}
- if (transferable == 0)
- return;
/*
* Determine what the imbalance is and then adjust that to how many
* threads we actually have to give up (transferable).
*/
- diff = high_load - low_load;
- move = diff / 2;
- if (diff & 0x1)
- move++;
- move = min(move, transferable);
- for (i = 0; i < move; i++)
- tdq_move(high, TDQ_ID(low));
+ if (transferable != 0) {
+ diff = high_load - low_load;
+ move = diff / 2;
+ if (diff & 0x1)
+ move++;
+ move = min(move, transferable);
+ for (i = 0; i < move; i++)
+ tdq_move(high, low);
+ }
+ TDQ_UNLOCK(high);
+ TDQ_UNLOCK(low);
return;
}
+/*
+ * Move a thread from one thread queue to another.
+ */
static void
-tdq_move(struct tdq *from, int cpu)
+tdq_move(struct tdq *from, struct tdq *to)
{
+ struct td_sched *ts;
+ struct thread *td;
struct tdq *tdq;
- struct tdq *to;
- struct td_sched *ts;
+ int cpu;
tdq = from;
- to = TDQ_CPU(cpu);
+ cpu = TDQ_ID(to);
ts = tdq_steal(tdq, 1);
if (ts == NULL) {
struct tdq_group *tdg;
@@ -625,26 +667,42 @@
break;
}
if (ts == NULL)
- panic("tdq_move: No threads available with a "
- "transferable count of %d\n",
- tdg->tdg_transferable);
+ return;
}
if (tdq == to)
return;
- sched_rem(ts->ts_thread);
+ td = ts->ts_thread;
+ /*
+ * Although the run queue is locked the thread may be blocked. Lock
+ * it to clear this.
+ */
+ thread_lock(td);
+ /* Drop recursive lock on from. */
+ TDQ_UNLOCK(from);
+ sched_rem(td);
ts->ts_cpu = cpu;
- sched_pin_td(ts->ts_thread);
- sched_add(ts->ts_thread, SRQ_YIELDING);
- sched_unpin_td(ts->ts_thread);
+ td->td_lock = TDQ_LOCKPTR(to);
+ tdq_add(to, td, SRQ_YIELDING);
}
+/*
+ * This tdq has idled. Try to steal a thread from another cpu and switch
+ * to it.
+ */
static int
tdq_idled(struct tdq *tdq)
{
struct tdq_group *tdg;
struct tdq *steal;
struct td_sched *ts;
+ struct thread *td;
+ int highload;
+ int highcpu;
+ int load;
+ int cpu;
+ /* We don't want to be preempted while we're iterating over tdqs */
+ spinlock_enter();
tdg = tdq->tdq_group;
/*
* If we're in a cpu group, try and steal threads from another cpu in
@@ -654,51 +712,59 @@
LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
if (steal == tdq || steal->tdq_transferable == 0)
continue;
+ TDQ_LOCK(steal);
ts = tdq_steal(steal, 0);
if (ts)
goto steal;
+ TDQ_UNLOCK(steal);
}
}
- if (steal_busy) {
- while (tdq_busy) {
- int cpu;
-
- cpu = ffs(tdq_busy);
- if (cpu == 0)
- break;
- cpu--;
+ for (;;) {
+ if (steal_idle == 0)
+ break;
+ highcpu = 0;
+ highload = 0;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (CPU_ABSENT(cpu))
+ continue;
steal = TDQ_CPU(cpu);
- if (steal->tdq_transferable == 0)
+ load = TDQ_CPU(cpu)->tdq_transferable;
+ if (load < highload)
continue;
- ts = tdq_steal(steal, 1);
- if (ts == NULL)
- continue;
- CTR5(KTR_ULE,
- "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X",
- ts->ts_thread, ts->ts_thread->td_proc->p_comm,
- ts->ts_thread->td_priority, cpu, tdq_busy);
+ highload = load;
+ highcpu = cpu;
+ }
+ if (highload < 2)
+ break;
+ steal = TDQ_CPU(highcpu);
+ TDQ_LOCK(steal);
+ if (steal->tdq_transferable > 1 &&
+ (ts = tdq_steal(steal, 1)) != NULL)
goto steal;
- }
+ TDQ_UNLOCK(steal);
+ break;
}
- /*
- * We only set the idled bit when all of the cpus in the group are
- * idle. Otherwise we could get into a situation where a thread bounces
- * back and forth between two idle cores on seperate physical CPUs.
- */
- tdg->tdg_idlemask |= PCPU_GET(cpumask);
- if (tdg->tdg_idlemask == tdg->tdg_cpumask)
- atomic_set_int(&tdq_idle, tdg->tdg_mask);
+ spinlock_exit();
return (1);
steal:
- sched_rem(ts->ts_thread);
- ts->ts_cpu = PCPU_GET(cpuid);
- sched_pin_td(ts->ts_thread);
- sched_add(ts->ts_thread, SRQ_YIELDING);
- sched_unpin_td(ts->ts_thread);
+ td = ts->ts_thread;
+ thread_lock(td);
+ spinlock_exit();
+ MPASS(td->td_lock == TDQ_LOCKPTR(steal));
+ TDQ_UNLOCK(steal);
+ sched_rem(td);
+ sched_setcpu(ts, PCPU_GET(cpuid), SRQ_YIELDING);
+ tdq_add(tdq, td, SRQ_YIELDING);
+ MPASS(td->td_lock == curthread->td_lock);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(curthread);
return (0);
}
+/*
+ * Notify a remote cpu of new work. Sends an IPI if criteria are met.
+ */
static void
tdq_notify(struct td_sched *ts)
{
@@ -734,29 +800,74 @@
/*
* Otherwise only IPI if we exceed the threshold.
*/
- if (pri > ipi_thresh)
+ if (pri > preempt_thresh)
return;
sendipi:
ctd->td_flags |= TDF_NEEDRESCHED;
- if (cpri < PRI_MIN_IDLE) {
- if (ipi_ast)
- ipi_selected(1 << cpu, IPI_AST);
- else if (ipi_preempt)
- ipi_selected(1 << cpu, IPI_PREEMPT);
- } else
- ipi_selected(1 << cpu, IPI_PREEMPT);
+ ipi_selected(1 << cpu, IPI_PREEMPT);
+}
+
+/*
+ * Steals load from a timeshare queue. Honors the rotating queue head
+ * index.
+ */
+static struct td_sched *
+runq_steal_from(struct runq *rq, u_char start)
+{
+ struct td_sched *ts;
+ struct rqbits *rqb;
+ struct rqhead *rqh;
+ int first;
+ int bit;
+ int pri;
+ int i;
+
+ rqb = &rq->rq_status;
+ bit = start & (RQB_BPW -1);
+ pri = 0;
+ first = 0;
+again:
+ for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
+ if (rqb->rqb_bits[i] == 0)
+ continue;
+ if (bit != 0) {
+ for (pri = bit; pri < RQB_BPW; pri++)
+ if (rqb->rqb_bits[i] & (1ul << pri))
+ break;
>>> TRUNCATED FOR MAIL (1000 lines) <<<