PERFORCE change 55771 for review
Julian Elischer
julian at FreeBSD.org
Fri Jun 25 07:33:09 GMT 2004
http://perforce.freebsd.org/chv.cgi?CH=55771
Change 55771 by julian at julian_jules1 on 2004/06/25 07:32:03
safety save before churn
nowhere near finished (it doesn't even compile)
Affected files ...
.. //depot/projects/nsched/sys/kern/sched_4bsd.c#18 edit
.. //depot/projects/nsched/sys/kern/sched_ule.c#7 edit
Differences ...
==== //depot/projects/nsched/sys/kern/sched_4bsd.c#18 (text+ko) ====
@@ -52,12 +52,6 @@
#include <sys/queue.h>
#include <machine/critical.h>
#include <sys/thr.h> /* XXXKSE */
-#if 0
-#include <vm/vm.h>
-#include <vm/vm_extern.h>
-#include <vm/pmap.h>
-#include <vm/vm_map.h>
-#endif
#include <vm/uma.h>
#include <machine/critical.h>
==== //depot/projects/nsched/sys/kern/sched_ule.c#7 (text+ko) ====
@@ -1,3 +1,4 @@
+
/*-
* Copyright (c) 2002-2003, Jeffrey Roberson <jeff at freebsd.org>
* All rights reserved.
@@ -34,6 +35,7 @@
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/queue.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
@@ -55,6 +57,10 @@
#define KTR_ULE KTR_NFS
+#include <vm/uma.h>
+#include <machine/critical.h>
+
+
/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
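(As an arithmetic check on the comment above: ccpu = exp(-1/20) ~= 0.9512, and schedcpu() applies it once per second, so after 60 seconds a contribution is scaled by exp(-60/20) = exp(-3) ~= 0.05, i.e. about 95% decayed.)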
@@ -78,45 +84,221 @@
* These datastructures are allocated within their parent datastructure but
* are scheduler specific.
*/
+/*-
+ * Description of a process.
+ *
+ * Below is a key of locks used to protect each member of struct proc. The
+ * lock is indicated by a reference to a specific character in parens in the
+ * associated comment.
+ * * - not yet protected
+ * a - only touched by curproc or parent during fork/wait
+ * b - created at fork, never changes
+ * (exception aiods switch vmspaces, but they are also
+ * marked 'P_SYSTEM' so hopefully it will be left alone)
+ * c - locked by proc mtx
+ * d - locked by allproc_lock lock
+ * e - locked by proctree_lock lock
+ * f - session mtx
+ * g - process group mtx
+ * h - callout_lock mtx
+ * i - by curproc or the master session mtx
+ * j - locked by sched_lock mtx
+ * k - only accessed by curthread
+ * l - the attaching proc or attaching proc parent
+ * m - Giant
+ * n - not locked, lazy
+ * o - ktrace lock
+ * p - select lock (sellock)
+ * q - td_contested lock
+ * r - p_peers lock
+ * x - created at fork, only changes during single threading in exec
+ * z - zombie threads/kse/ksegroup lock
+ *
+ */
+/***************
+ * In pictures:
+ With a single run queue used by all processors:
+
+ RUNQ: --->KSE---KSE--... SLEEPQ:[]---THREAD---THREAD---THREAD
+ | / []---THREAD
+ KSEG---THREAD--THREAD--THREAD []
+ []---THREAD---THREAD
+
+ (processors run THREADs from the KSEG until they are exhausted or
+ the KSEG exhausts its quantum)
+
+With PER-CPU run queues:
KSEs would be placed on the separate run queues directly.
They would be given priorities calculated from the KSEG.
+
+ *
+ *****************/
+/************************************************************************
+ * Definitions of the run queues we use here.
+ */
+
+/*
+ * Copyright (c) 2001 Jake Burkholder <jake at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/sys/runq.h,v 1.4 2002/05/25 01:12:23 jake Exp $
+ */
+
+#ifndef _RUNQ_H_
+#define _RUNQ_H_
+
+#include <machine/runq.h>
+
+struct kse;
+
+/*
+ * Run queue parameters.
+ */
+
+#define RQ_NQS (64) /* Number of run queues. */
+#define RQ_PPQ (4) /* Priorities per queue. */
-struct ke_sched {
- int ske_slice;
- struct runq *ske_runq;
+/*
+ * Head of run queues.
+ */
+TAILQ_HEAD(rqhead, kse);
+
+/*
+ * Bit array which maintains the status of a run queue. When a queue is
+ * non-empty the bit corresponding to the queue number will be set.
+ */
+struct rqbits {
+ rqb_word_t rqb_bits[RQB_LEN];
+};
+
+/*
+ * Run queue structure. Contains an array of run queues on which processes
+ * are placed, and a structure to maintain the status of each queue.
+ */
+struct runq {
+ struct rqbits rq_status;
+ struct rqhead rq_queues[RQ_NQS];
+};
+
+#endif /* end of Jake copyright file */
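
As an aside, a minimal sketch of how the rq_status bits track queue occupancy, assuming the RQB_WORD()/RQB_BIT() macros from <machine/runq.h>; this mirrors the stock runq helpers and is not part of this change:

static __inline void
runq_setbit(struct runq *rq, int pri)
{
	/* Mark queue `pri' non-empty in the status bit array. */
	rq->rq_status.rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	/* Clear the bit again once queue `pri' drains. */
	rq->rq_status.rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}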
+
+
+/*
+ * The schedulable entity that can be given a context to run.
+ * A process may have several of these. Probably one per processor
+ * but possibly a few more. In this universe they are grouped
+ * with a KSEG that contains the priority and niceness
+ * for the group.
+ */
+struct kse {
+ struct proc *ke_proc; /* (*) Associated process. */
+ struct ksegrp *ke_ksegrp; /* (*) Associated KSEG. */
+ TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of KSEs in ke_ksegrp. */
+ TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of KSEs in this state. */
+ TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
+
+#define ke_startzero ke_flags
+ int ke_flags; /* (j) KEF_* flags. */
+ struct thread *ke_thread; /* (*) Active associated thread. */
+ fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
+ u_char ke_oncpu; /* (j) Which cpu we are on. */
+ char ke_rqindex; /* (j) Run queue index. */
+ enum {
+ KES_UNUSED = 0x0,
+ KES_IDLE,
+ KES_ONRUNQ,
+ KES_UNQUEUED, /* in transit */
+ KES_THREAD /* slaved to thread state */
+ } ke_state; /* (j) KSE status. */
+	int ke_cpticks; /* (j) Ticks of cpu time. */
+#define ke_endzero ke_dummy
+ u_char ke_dummy;
+ int ke_slice;
+ struct runq *ke_runq;
/* The following variables are only used for pctcpu calculation */
- int ske_ltick; /* Last tick that we were running on */
- int ske_ftick; /* First tick that we were running on */
- int ske_ticks; /* Tick count */
+ int ke_ltick; /* Last tick that we were running on */
+ int ke_ftick; /* First tick that we were running on */
+ int ke_ticks; /* Tick count */
/* CPU that we have affinity for. */
- u_char ske_cpu;
+ u_char ke_cpu;
};
-#define ke_slice ke_sched->ske_slice
-#define ke_runq ke_sched->ske_runq
-#define ke_ltick ke_sched->ske_ltick
-#define ke_ftick ke_sched->ske_ftick
-#define ke_ticks ke_sched->ske_ticks
-#define ke_cpu ke_sched->ske_cpu
-#define ke_assign ke_procq.tqe_next
+
+/* flags kept in ke_flags */
+#define KEF_ASSIGNED 0x00001 /* KSE is being migrated. */
+#define KEF_BOUND 0x00002 /* KSE can not migrate. */
+#define KEF_DIDRUN 0x02000 /* KSE actually ran. */
+#define KEF_EXIT 0x04000 /* KSE is being killed. */
+
+#define FIRST_KSE_IN_KSEGRP(kg) TAILQ_FIRST(&(kg)->kg_kseq)
+#define FIRST_KSE_IN_PROC(p) FIRST_KSE_IN_KSEGRP(FIRST_KSEGRP_IN_PROC(p))
+
+static struct kse kse0; /* Primary kse in proc0. */
+static struct kse *kse_alloc(void);
+static void kse_free(struct kse *ke);
+static void kse_stash(struct kse *ke);
+static void kse_unlink(struct kse *ke);
+static void kse_reassign(struct kse *ke);
+static void kse_link(struct kse *ke, struct ksegrp *kg);
-#define KEF_ASSIGNED KEF_SCHED0 /* KSE is being migrated. */
-#define KEF_BOUND KEF_SCHED1 /* KSE can not migrate. */
+/*
+ * Scheduler specific extensions to various structures.
+ */
struct kg_sched {
int skg_slptime; /* Number of ticks we vol. slept */
int skg_runtime; /* Number of ticks we were running */
+ TAILQ_HEAD(, kse) skg_kseq; /* (ke_kglist) All KSEs. */
+ TAILQ_HEAD(, kse) skg_iq; /* (ke_kgrlist) All idle KSEs. */
+	struct thread *skg_last_assigned; /* (j) Last thread assigned to a KSE. */
+ int skg_runq_kses; /* (j) Num KSEs on runq. */
+ int skg_idle_kses; /* (j) Num KSEs on iq. */
+ int skg_kses; /* (j) Num KSEs in group. */
+	int skg_concurrancy; /* (j) Desired concurrency. */
};
#define kg_slptime kg_sched->skg_slptime
#define kg_runtime kg_sched->skg_runtime
+#define kg_kseq kg_sched->skg_kseq
+#define kg_iq kg_sched->skg_iq
+#define kg_last_assigned kg_sched->skg_last_assigned
+#define kg_runq_kses kg_sched->skg_runq_kses
+#define kg_idle_kses kg_sched->skg_idle_kses
+#define kg_kses kg_sched->skg_kses
+
struct td_sched {
int std_slptime;
+ struct kse *std_last_kse; /* (j) Previous value of td_kse. */
+ struct kse *std_kse; /* (j) Current KSE if running. */
};
#define td_slptime td_sched->std_slptime
+#define td_last_kse td_sched->std_last_kse
+#define td_kse td_sched->std_kse
struct td_sched td_sched;
-struct ke_sched ke_sched;
struct kg_sched kg_sched;
-struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;
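
The statics above serve proc0/thread0 only; for ordinary processes the sched_sizeof_*() functions later in this file size a single allocation to hold both the parent structure and its scheduler-private extension. A sketch of the assumed attach step (illustration only; the actual code is not shown in this diff):

/*
 * Sketch, assuming the allocation is sized by sched_sizeof_ksegrp():
 * the private part sits immediately behind the ksegrp, so linking
 * the two is just pointer arithmetic.
 */
static void
ksegrp_sched_attach(struct ksegrp *kg)
{
	kg->kg_sched = (struct kg_sched *)(kg + 1);
	bzero(kg->kg_sched, sizeof(struct kg_sched));
}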
@@ -286,6 +468,31 @@
static void kseq_notify(struct kse *ke, int cpu);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
+#endif
+
+static void runq_add(struct runq *, struct kse *);
+static int runq_check(struct runq *);
+static struct kse *runq_choose(struct runq *);
+static void runq_init(struct runq *);
+static void runq_remove(struct runq *, struct kse *);
+
+
+static void setup_runqs(void);
+static void roundrobin(void *arg);
+static void schedcpu(void);
+static void schedcpu_thread(void);
+static void maybe_resched(struct thread *td);
+static void updatepri(struct ksegrp *kg);
+static void resetpriority(struct ksegrp *kg);
+static void sched_add(struct thread *td);
+static void sched_rem(struct thread *td);
+static struct kse * sched_choose(void);
+static void adjustrunqueue(struct thread *td, int newpri);
+
+static void sched_fork_kse(struct thread *td, struct kse *child);
+static void sched_exit_kse(struct kse *ke, struct thread *td);
+
+#ifdef SMP
/*
* On P4 Xeons the round-robin interrupt delivery is broken. As a result of
* this, we can't pin interrupts to the cpu that they were delivered to,
@@ -301,6 +508,7 @@
#endif /* !__i386__ */
#endif
+#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
void
kseq_print(int cpu)
{
@@ -1123,7 +1331,7 @@
}
void
-sched_switch(struct thread *td)
+sched_switch(struct thread *td, int flags)
{
struct thread *newtd;
struct kse *ke;
@@ -1158,7 +1366,7 @@
kse_reassign(ke);
}
}
- newtd = choosethread();
+ newtd = choosethread(flags);
if (td != newtd)
cpu_switch(td, newtd);
sched_lock.mtx_lock = (uintptr_t)td;
@@ -1247,20 +1455,25 @@
* priority.
*/
void
-sched_fork(struct proc *p, struct proc *p1)
+sched_fork(struct thread *td, struct proc *p1)
{
+ struct proc *p;
+
+ p = td->td_proc;
+
mtx_assert(&sched_lock, MA_OWNED);
p1->p_nice = p->p_nice;
- sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
- sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
- sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
+ sched_fork_ksegrp(td, FIRST_KSEGRP_IN_PROC(p1));
+ sched_fork_kse(td, FIRST_KSE_IN_PROC(p1));
+ sched_fork_thread(td, FIRST_THREAD_IN_PROC(p1));
}
-void
-sched_fork_kse(struct kse *ke, struct kse *child)
+static void
+sched_fork_kse(struct thread *td, struct kse *child)
{
+ struct kse *ke = td->td_kse;
child->ke_slice = 1; /* Attempt to quickly learn interactivity. */
child->ke_cpu = ke->ke_cpu;
@@ -1273,8 +1486,10 @@
}
void
-sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
+sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
{
+ struct ksegrp *kg = td->td_ksegrp;
+
PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
child->kg_slptime = kg->kg_slptime;
@@ -1290,11 +1505,6 @@
}
void
-sched_fork_thread(struct thread *td, struct thread *child)
-{
-}
-
-void
sched_class(struct ksegrp *kg, int class)
{
struct kseq *kseq;
@@ -1348,29 +1558,31 @@
* Return some of the child's priority and interactivity to the parent.
*/
void
-sched_exit(struct proc *p, struct proc *child)
+sched_exit(struct proc *p, struct thread *td)
{
mtx_assert(&sched_lock, MA_OWNED);
- sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
- sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
+ sched_exit_kse(FIRST_KSE_IN_PROC(p), td);
+ sched_exit_ksegrp(p, td);
}
void
-sched_exit_kse(struct kse *ke, struct kse *child)
+sched_exit_kse(struct kse *ke, struct thread *td)
{
- kseq_load_rem(KSEQ_CPU(child->ke_cpu), child);
+	kseq_load_rem(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse);
}
void
-sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
+sched_exit_ksegrp(struct proc *p, struct thread *td)
{
- /* kg->kg_slptime += child->kg_slptime; */
- kg->kg_runtime += child->kg_runtime;
+ struct ksegrp *kg = FIRST_KSEGRP_IN_PROC(p);
+
+ /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
+ kg->kg_runtime += td->td_ksegrp->kg_runtime;
sched_interact_update(kg);
}
void
-sched_exit_thread(struct thread *td, struct thread *child)
+sched_exit_thread(struct proc *p, struct thread *child)
{
}
@@ -1726,12 +1938,6 @@
}
int
-sched_sizeof_kse(void)
-{
- return (sizeof(struct kse) + sizeof(struct ke_sched));
-}
-
-int
sched_sizeof_ksegrp(void)
{
return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
@@ -1748,3 +1954,1062 @@
{
return (sizeof(struct thread) + sizeof(struct td_sched));
}
+
+/*
+ * Copyright (c) 2001 Jake Burkholder <jake at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/***
+Here is the logic..
+
+If there are N processors, then there are at most N KSEs (kernel
+schedulable entities) working to process threads that belong to a
+KSEGROUP (kg). If there are X of these KSEs actually running at the
+moment in question, then there are at most M = (N - X) of these KSEs on
+the run queue, as running KSEs are not on the queue.
+
+Runnable threads are queued off the KSEGROUP in priority order.
+If there are M or more threads runnable, the top M threads
+(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
+their priority from those threads and are put on the run queue.
+
+The last thread that had a priority high enough to have a KSE associated
+with it, AND IS ON THE RUN QUEUE is pointed to by
+kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
+assigned, either because all the available KSEs are actively running or
+because there are no threads queued, that pointer is NULL.
+
+When a KSE is removed from the run queue to become runnable, we know
+it was associated with the highest priority thread in the queue (at the head
+of the queue). If it is also the last assigned we know M was 1 and must
+now be 0. Since the thread is no longer queued, that pointer must be
+removed from it. Since we know there were no more KSEs available
+(M was 1 and is now 0), and since we are not FREEING our KSE
+but using it, we know there are STILL no more KSEs available; we can prove
+that the next thread in the ksegrp list will not have a KSE to assign to
+it, so we can show that the pointer must be made 'invalid' (NULL).
+
+The pointer exists so that when a new thread is made runnable, it can
+have its priority compared with the last assigned thread to see if
+it should 'steal' its KSE or not, i.e. whether it is 'earlier'
+on the list than that thread or later. If it is earlier, then the KSE is
+removed from the last assigned (which is now not assigned a KSE)
+and reassigned to the new thread, which is placed earlier in the list.
+The pointer is then backed up to the previous thread (which may or may not
+be the new thread).
+
+When a thread sleeps or is removed, the KSE becomes available and if there
+are queued threads that are not assigned KSEs, the highest priority one of
+them is assigned the KSE, which is then placed back on the run queue at
+the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
+to point to it.
+
+The following diagram shows 2 KSEs and 3 threads from a single process.
+
+ RUNQ: --->KSE---KSE--... (KSEs queued at priorities from threads)
+ \ \____
+ \ \
+ KSEGROUP---thread--thread--thread (queued in priority order)
+ \ /
+ \_______________/
+ (last_assigned)
+
+The result of this scheme is that the M available KSEs are always
+queued at the priorities they have inherited from the M highest priority
+threads for that KSEGROUP. If this situation changes, the KSEs are
+reassigned to keep this true.
+***/
+
+
+CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
+
+
+/* END */
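
To make the invariants above concrete, here is an approximate sketch of kse_reassign() as the text describes it; the real body falls in the portion truncated below, so treat the details as assumptions:

/*
 * Approximate sketch of kse_reassign(), per the description above
 * (the real body is in the truncated part of this diff).
 */
static void
kse_reassign(struct kse *ke)
{
	struct ksegrp *kg = ke->ke_ksegrp;
	struct thread *td;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Find the next thread, in priority order, that has no KSE. */
	if (kg->kg_last_assigned != NULL)
		td = TAILQ_NEXT(kg->kg_last_assigned, td_runq);
	else
		td = TAILQ_FIRST(&kg->kg_runq);
	if (td != NULL) {
		/* Lend the KSE its priority and put it on the run queue. */
		kg->kg_last_assigned = td;
		td->td_kse = ke;
		ke->ke_thread = td;
		sched_add(td);
	} else {
		/* No unassigned runnable thread: park it on the idle queue. */
		ke->ke_state = KES_IDLE;
		ke->ke_thread = NULL;
		TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist);
		kg->kg_idle_kses++;
	}
}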
+
+void
+sched_thread_exit(struct thread *td)
+{
+ struct kse *ke;
+
+ ke = td->td_kse;
+
+ if ((td->td_proc->p_flag & P_SA) && ke != NULL) {
+ ke->ke_thread = NULL;
+ td->td_kse = NULL;
+ kse_reassign(ke);
+ }
+ if ((td->td_proc->p_flag & P_NOLOAD) == 0)
+ sched_tdcnt--;
+
+}
+
+/*
+ * Special version of the above for a thr library thread;
+ * work towards merging them.
+ * Called from:
+ * thr_exit1()
+ */
+void
+sched_thr_exit(struct thread *td)
+{
+ struct kse *ke;
+
+ ke = td->td_kse;
+
+ if ((td->td_proc->p_flag & P_NOLOAD) == 0)
+ sched_tdcnt--;
+
+ /* td is about to be freed, but keep it clean */
+ td->td_kse = NULL;
+ td->td_last_kse = NULL;
+ kse_unlink(ke); /* also frees it */
+}
+
+
+/*
+ * Allocate or set up any resources the scheduler needs
+ * for a new process at fork() time.
+ * Called from:
+ * fork1()
+ */
+void
+sched_fork(struct thread *td, struct proc *child)
+{
+ struct thread *newtd;
+ struct kse *newke;
+
+ newtd = FIRST_THREAD_IN_PROC(child);
+ newke = FIRST_KSE_IN_PROC(child);
+ bzero(&newke->ke_startzero,
+ (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero));
+ newke->ke_state = KES_THREAD;
+ newke->ke_cpticks = 0;
+ sched_fork_ksegrp(td, FIRST_KSEGRP_IN_PROC(child));
+ newke->ke_thread = newtd;
+ newtd->td_kse = newke;
+}
+
+static uma_zone_t kse_zone;
+
+static struct kg_sched kg_sched0;
+static struct td_sched td_sched0;
+
+
+extern struct mtx kse_zombie_lock;
+TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses);
+
+/*
+ * Occasionally the scheduler may need to do some GC..
+ * Called from:
+ * thread_reap()
+ */
+void
+sched_GC(void)
+{
+ struct kse *ke_first, *ke_next;
+
+ if (!TAILQ_EMPTY(&zombie_kses)) {
+ mtx_lock_spin(&kse_zombie_lock);
+ ke_first = TAILQ_FIRST(&zombie_kses);
+ if (ke_first)
+ TAILQ_INIT(&zombie_kses);
+ mtx_unlock_spin(&kse_zombie_lock);
+ while (ke_first) {
+ ke_next = TAILQ_NEXT(ke_first, ke_procq);
+ kse_free(ke_first);
+ ke_first = ke_next;
+ }
+ }
+}
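
(The unlocked TAILQ_EMPTY() check above is only an optimistic fast path: the list is detached under kse_zombie_lock before it is walked, so a kse_stash() racing with the unlocked check is simply collected on a later pass.)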
+
+/*
+ * Very early in boot, some scheduler-specific parts of proc0
+ * and some scheduler resources need to be set up.
+ * Called from:
+ * proc0_init()
+ */
+void
+schedinit(void)
+{
+ /*
+ * Set up the scheduler specific parts of proc0.
+ */
+ ksegrp0.kg_sched = &kg_sched0;
+ proc0.p_sched = NULL; /* XXX */
+ thread0.td_sched = &td_sched0;
+
+ /*
+ * and link in our own per scheduler struct
+ */
+ kse_link(&kse0, &ksegrp0);
+ /*
+ * and set it up as if BOUND and running
+ */
+ kse0.ke_thread = &thread0;
+ thread0.td_kse = &kse0; /* we are running */
+ kse0.ke_state = KES_THREAD;
+
+ kse_zone = uma_zcreate("KSE", sizeof (struct kse),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
+}
+
+/*
+ * For now, have special thr code;
+ * later on, clean these up into common code.
+ * Called from:
+ * thr_create()
+ */
+int
+sched_thr_newthread(struct thread *td, struct thread *newtd, int flags)
+{
+ struct kse *newke;
+ /* Initialize our kse structure. */
+ newke = kse_alloc();
+ bzero(&newke->ke_startzero,
+ RANGEOF(struct kse, ke_startzero, ke_endzero));
+
+ /* Link the thread and kse into the ksegrp and make it runnable. */
+ mtx_lock_spin(&sched_lock);
+
+ thread_link(newtd, td->td_ksegrp);
+ kse_link(newke, td->td_ksegrp);
+
+ /* Bind this thread and kse together. */
+ newtd->td_kse = newke;
+ newke->ke_thread = newtd;
+	newke->ke_state = KES_THREAD;
+	newke->ke_cpticks = 0;
+	sched_fork_kse(td, newke);
+
+ TD_SET_CAN_RUN(newtd);
+ if ((flags & THR_SUSPENDED) == 0)
+ setrunqueue(newtd);
+
+ mtx_unlock_spin(&sched_lock);
+ return (0); /* the API could fail but not in this case */
+}
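
(Assumed usage, for orientation only: thr_create(), after building newtd from the parent thread td, would call sched_thr_newthread(td, newtd, flags); the THR_SUSPENDED test above keeps a thread created in the suspended state off the run queue.)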
+
+/*****************************
+ * KSE zone/allocation methods.
+ */
+/*
+ * Allocate a kse.
+ */
+static struct kse *
+kse_alloc(void)
+{
+ return (uma_zalloc(kse_zone, M_WAITOK));
+}
+
+/*
+ * Deallocate a kse.
+ */
+static void
+kse_free(struct kse *ke)
+{
+	uma_zfree(kse_zone, ke);
+}
+
+/*
+ * Stash an embarrassingly extra kse into the zombie kse queue.
+ * Called from:
+ * kse_unlink() (local)
+ */
+static void
+kse_stash(struct kse *ke)
+{
+ mtx_lock_spin(&kse_zombie_lock);
+ TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq);
+ mtx_unlock_spin(&kse_zombie_lock);
+}
+
+/*
+ * KSE is linked into kse group.
+ * Called from:
+ * sched_newproc() (local)
+ * sched_thr_newthread() (local)
+ * schedinit() (local)
+ * sched_set_concurrancy() (local)
+ *
+ */
+static void
+kse_link(struct kse *ke, struct ksegrp *kg)
+{
+ TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist);
+ kg->kg_kses++;
+ ke->ke_state = KES_UNQUEUED;
+ ke->ke_proc = kg->kg_proc; /* really just a shortcut */
+ ke->ke_ksegrp = kg;
+ ke->ke_thread = NULL;
+ ke->ke_oncpu = NOCPU;
+ ke->ke_flags = 0;
+}
+
+/*
+ * Allocate scheduler specific per-process resources.
+ * The thread and ksegrp have already been linked in.
+ * Called from:
+ * proc_init() (UMA init method)
+ */
+int
+sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
+{
+ struct kse *ke;
+
+ /*
+ * For a new process, allocate a single KSE to the ksegrp.
+ */
+ ke = kse_alloc();
+ if (ke) {
+ kse_link(ke, kg);
+ td->td_kse = ke;
+ ke->ke_thread = td;
+ return (0);
+ }
+	return (ENOMEM);
+}
+
+/*
+ * Ksegrp is being either created or recycled.
+ * Fix up the per-scheduler resources associated with it.
+ * Called from:
+ * ksegrp_dtor()
+ *	ksegrp_init()
+ */
+void
+sched_init_ksegrp(struct ksegrp *kg)
+{
+
+ TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */
+ TAILQ_INIT(&kg->kg_iq); /* all idle kses in ksegrp */
+ kg->kg_kses = 0;
+ kg->kg_runq_kses = 0; /* XXXKSE change name */
+ kg->kg_idle_kses = 0;
+}
+
+/*
+ * Thread is being either created or recycled.
+ * Fix up the per-scheduler resources associated with it.
+ * Called from:
+ *	thread_dtor()
+ *	thread_init()
+ */
+/* Assumes td->td_sched is already set up */
+void
+sched_init_thread(struct thread *td)
+{
+ td->td_last_kse = NULL;
+ td->td_kse = NULL;
+}
+
+
+/*
+ * Code to take the per-scheduler KSE structure
+ * off the ksegrp it is hanging from, and free it.
+ * Called from:
+ * sched_destroyproc()
+ * sched_thr_exit()
+ * sched_set_concurrancy() via REDUCE_KSES()
+ * kse_reassign() via REDUCE_KSES()
+ */
+static void
+kse_unlink(struct kse *ke)
+{
+ struct ksegrp *kg;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ kg = ke->ke_ksegrp;
+	TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist);
+	kg->kg_kses--;
+ if (ke->ke_state == KES_IDLE) {
+ TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist);
+ kg->kg_idle_kses--;
+ }
+ /*
+ * Aggregate stats from the KSE
+ * ## none yet ##
+ */
+
+ kse_stash(ke);
+}
+
+/*
+ * Whenever we have idle KSEs and there are too many for the concurrency,
+ * then free as many as we can. Don't free too many if we have threads
+ * to run/kill.
+ */
+#define REDUCE_KSES(kg, skg) \
+do { \
+ while ((skg->skg_concurrancy < skg->skg_kses) && \
+ (skg->skg_idle_kses > 0) && \
+ (skg->skg_kses > kg->kg_numthreads)) { \
+ kse_unlink(TAILQ_FIRST(&skg->skg_iq)); \
+ } \
+} while (0)
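
For example, with skg_concurrancy = 2, skg_kses = 4, skg_idle_kses = 3 and kg_numthreads = 1, the loop unlinks two idle KSEs (skg_kses going 4 -> 3 -> 2) and stops, since skg_kses no longer exceeds the desired concurrency.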
+
+/*
+ * Called by the uma process fini routine..
+ * undo anything we may have done in the uma_init method.
+ * Panic if it's not all 1:1:1:1
+ * Called from:
+ * proc_fini() (UMA method)
+ */
+void
+sched_destroyproc(struct proc *p)
+{
+ struct ksegrp *kg;
+ struct kg_sched *skg;
+
+ KASSERT((p->p_numthreads == 1), ("Cached proc with > 1 thread "));
+ KASSERT((p->p_numksegrps == 1), ("Cached proc with > 1 ksegrp "));
+
+ kg = FIRST_KSEGRP_IN_PROC(p);
+
+ KASSERT((kg->kg_kses == 1), ("Cached proc with > 1 kse "));
+
+ skg = kg->kg_sched;
+	kse_unlink(TAILQ_FIRST(&skg->skg_iq));
+}
+
+/*
+ * (Re)assign resources to allow the ksegrp to implement
+ * the requested concurrency. At this time it means allocating
+ * or freeing KSE structures.
+ * Called from:
+ * kern_execve() (reverting to non threaded)
+ * kern_exit() (reverting to non threaded)
+ * thread_exit() (during removal of ksegrp)
+ * sched_exit_ksegrp() (local)
+ * kse_exit() (decreasing)
+ * kse_create() (increasing)
+ */
+void
+sched_set_concurrancy(struct ksegrp *kg, int concurrancy)
+{
+ struct kse *newke;
+ struct kg_sched *skg;
+
+ skg = kg->kg_sched;
+ skg->skg_concurrancy = concurrancy;
+ REDUCE_KSES(kg, skg);
+ while (skg->skg_kses < skg->skg_concurrancy) {
+ newke = kse_alloc();
+ bzero(&newke->ke_startzero, RANGEOF(struct kse,
+ ke_startzero, ke_endzero));
+#if 0
+ mtx_lock_spin(&sched_lock);
+ bcopy(&ke->ke_startcopy, &newke->ke_startcopy,
+ RANGEOF(struct kse, ke_startcopy, ke_endcopy));
+ mtx_unlock_spin(&sched_lock);
+#endif
+ mtx_lock_spin(&sched_lock);
+ kse_link(newke, kg);
+ newke->ke_state = KES_THREAD;
+ newke->ke_cpticks = 0;
+ /* Add engine */
+ kse_reassign(newke);
+ mtx_unlock_spin(&sched_lock);
+ }
+}
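
As a worked trace: a group with skg_kses = 1 raising its concurrency to 3 allocates two new KSEs; each is zeroed, linked into the group, marked KES_THREAD and handed to kse_reassign(), which either lends it to a queued thread that lacks a KSE or parks it on the group's idle queue.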
+
+#if 0
+static void runq_readjust(struct runq *rq, struct kse *ke);
+#endif
+/*
+ * Select the KSE that will be run next. From that find the thread, and
+ * remove it from the KSEGRP's run queue. If there is thread clustering,
+ * this will be what does it.
+ * XXX Change to take an argument indicating
+ * if the switch is voluntary or involuntary.
+ * Called from:
+ * thr_exit1()
+ * thread_exit()
+ * sched_switch() (local)
+ * init_secondary() (start up 2ndary processors)
+ */
+struct thread *
+choosethread(int flags)
+{
+ struct kse *ke;
+ struct thread *td;
+ struct ksegrp *kg;
+
+#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
+ if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
+ /* Shutting down, run idlethread on AP's */
+ td = PCPU_GET(idlethread);
+ ke = td->td_kse;
+ CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
+ ke->ke_flags |= KEF_DIDRUN;
+ TD_SET_RUNNING(td);
+ return (td);
+ }
+#endif
+
+retry:
+ kg = curthread->td_ksegrp;
+#if 0
+ if (flags & SW_VOL) {
+ if (kg->kg_runnable) {
+ td = TAILQ_FIRST(&kg->kg_runq);
+ }
+ }
+ if (ke == NULL)
+#endif
+ ke = sched_choose();
+ if (ke) {
>>> TRUNCATED FOR MAIL (1000 lines) <<<