scheduler CORE for RELENG_6 #2
Stepan A. Baranov
rosmir at gmail.com
Tue Oct 24 02:50:53 PDT 2006
Hello,
In my previous email I attached the patch without the .txt extension.
Stepan Baranov.
-------------- next part --------------
Index: ./sys/sys/sched.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/sched.h,v
retrieving revision 1.24
diff -u -r1.24 sched.h
--- ./sys/sys/sched.h 19 Apr 2005 04:01:25 -0000 1.24
+++ ./sys/sys/sched.h 24 Oct 2006 09:45:32 -0000
@@ -78,6 +78,8 @@
void sched_add(struct thread *td, int flags);
void sched_clock(struct thread *td);
void sched_rem(struct thread *td);
+void sched_tick(void);
+void sched_relinquish(struct thread *td);
/*
* Binding makes cpu affinity permanent while pinning is used to temporarily
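The two prototypes added here are the machine-independent hooks the rest of the patch wires up: sched_tick() is driven from hardclock() (see the kern_clock.c hunk below) and sched_relinquish() replaces the open-coded yield logic in kern_synch.c and posix4/ksched.c. A minimal sketch of the intended calling convention, assuming nothing beyond what the diff itself adds (example_yield() is a hypothetical helper, not part of the patch):

	static void
	example_yield(void)
	{
		/* sched_relinquish() takes sched_lock and switches itself. */
		mtx_assert(&sched_lock, MA_NOTOWNED);
		sched_relinquish(curthread);
	}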
Index: ./sys/sys/systm.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/systm.h,v
retrieving revision 1.234.2.5
diff -u -r1.234.2.5 systm.h
--- ./sys/sys/systm.h 6 Jul 2006 08:32:50 -0000 1.234.2.5
+++ ./sys/sys/systm.h 24 Oct 2006 09:45:35 -0000
@@ -239,6 +239,12 @@
int unsetenv(const char *name);
int testenv(const char *name);
+typedef uint64_t (cpu_tick_f)(void);
+void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var);
+extern cpu_tick_f *cpu_ticks;
+uint64_t cpu_tickrate(void);
+uint64_t cputick2usec(uint64_t tick);
+
#ifdef APM_FIXUP_CALLTODO
struct timeval;
void adjust_timeout_calltodo(struct timeval *time_change);
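These declarations export the cpu-tick interface implemented in the kern_tc.c hunk further down; the CORE scheduler uses it for cheap per-CPU timestamps. The sketch below simply mirrors sched_timestamp() in sched_core.c, so it is an illustration of how the pieces combine rather than additional API:

	/* Nanosecond timestamp derived from the raw CPU tick counter (sketch). */
	static uint64_t
	example_timestamp(void)
	{
		uint64_t usec;

		usec = cputick2usec(cpu_ticks());	/* ticks -> microseconds */
		return (usec * 1000);			/* microseconds -> nanoseconds */
	}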
Index: ./sys/kern/sched_ule.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.153.2.3
diff -u -r1.153.2.3 sched_ule.c
--- ./sys/kern/sched_ule.c 27 Sep 2005 12:00:31 -0000 1.153.2.3
+++ ./sys/kern/sched_ule.c 24 Oct 2006 09:45:38 -0000
@@ -1962,6 +1962,19 @@
return (td->td_kse->ke_flags & KEF_BOUND);
}
+void
+sched_relinquish(struct thread *td)
+{
+ struct ksegrp *kg;
+
+ kg = td->td_ksegrp;
+ mtx_lock_spin(&sched_lock);
+ if (kg->kg_pri_class == PRI_TIMESHARE)
+ sched_prio(td, PRI_MAX_TIMESHARE);
+ mi_switch(SW_VOL, NULL);
+ mtx_unlock_spin(&sched_lock);
+}
+
int
sched_load(void)
{
@@ -1995,5 +2008,10 @@
{
return (sizeof(struct thread) + sizeof(struct td_sched));
}
+
+void
+sched_tick(void)
+{
+}
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
Index: ./sys/kern/sched_core.c
===================================================================
RCS file: ./sys/kern/sched_core.c
diff -N ./sys/kern/sched_core.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ ./sys/kern/sched_core.c 24 Oct 2006 09:45:43 -0000
@@ -0,0 +1,1741 @@
+/*-
+ * Copyright (c) 2005-2006, David Xu <yfxu at corp.netease.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/sched_core.c,v 1.9 2006/06/29 12:29:20 davidxu Exp $");
+
+#include "opt_hwpmc_hooks.h"
+#include "opt_sched.h"
+
+#define kse td_sched
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/turnstile.h>
+#include <sys/unistd.h>
+#include <sys/vmmeter.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+/* get process's nice value, skip value 20 which is not supported */
+#define PROC_NICE(p) MIN((p)->p_nice, 19)
+
+/* convert nice to kernel thread priority */
+#define NICE_TO_PRI(nice) (PUSER + 20 + (nice))
+
+/* get process's static priority */
+#define PROC_PRI(p) NICE_TO_PRI(PROC_NICE(p))
+
+/* convert kernel thread priority to user priority */
+#define USER_PRI(pri) MIN((pri) - PUSER, 39)
+
+/* convert nice value to user priority */
+#define PROC_USER_PRI(p) (PROC_NICE(p) + 20)
+
+/* maximum user priority, highest prio + 1 */
+#define MAX_USER_PRI 40
+
+/* maximum kernel timeshare priority, corresponding to nice 19 */
+#define PUSER_MAX (PUSER + 39)
+
+/* ticks and nanosecond converters */
+#define NS_TO_HZ(n) ((n) / (1000000000 / hz))
+#define HZ_TO_NS(h) ((h) * (1000000000 / hz))
+
+/* ticks and microsecond converters */
+#define MS_TO_HZ(m) ((m) / (1000000 / hz))
+
+#define PRI_SCORE_RATIO 25
+#define MAX_SCORE (MAX_USER_PRI * PRI_SCORE_RATIO / 100)
+#define MAX_SLEEP_TIME (def_timeslice * MAX_SCORE)
+#define NS_MAX_SLEEP_TIME (HZ_TO_NS(MAX_SLEEP_TIME))
+#define STARVATION_TIME (MAX_SLEEP_TIME)
+
+#define CURRENT_SCORE(kg) \
+ (MAX_SCORE * NS_TO_HZ((kg)->kg_slptime) / MAX_SLEEP_TIME)
+
+#define SCALE_USER_PRI(x, upri) \
+ MAX(x * (upri + 1) / (MAX_USER_PRI/2), min_timeslice)
+
+/*
+ * For a thread whose nice is zero, the score is used to determine
+ * if it is an interactive thread.
+ */
+#define INTERACTIVE_BASE_SCORE (MAX_SCORE * 20)/100
+
+/*
+ * Calculate the score a thread must have to prove that it is
+ * an interactive thread.
+ */
+#define INTERACTIVE_SCORE(ke) \
+ (PROC_NICE((ke)->ke_proc) * MAX_SCORE / 40 + INTERACTIVE_BASE_SCORE)
+
+/* Test if a thread is an interactive thread */
+#define THREAD_IS_INTERACTIVE(ke) \
+ ((ke)->ke_ksegrp->kg_user_pri <= \
+ PROC_PRI((ke)->ke_proc) - INTERACTIVE_SCORE(ke))
+
+/*
+ * Calculate how long a thread must sleep to prove that its sleep
+ * is an interactive sleep.
+ */
+#define INTERACTIVE_SLEEP_TIME(ke) \
+ (HZ_TO_NS(MAX_SLEEP_TIME * \
+ (MAX_SCORE / 2 + INTERACTIVE_SCORE((ke)) + 1) / MAX_SCORE - 1))
+
+#define CHILD_WEIGHT 90
+#define PARENT_WEIGHT 90
+#define EXIT_WEIGHT 3
+
+#define SCHED_LOAD_SCALE 128UL
+
+#define IDLE 0
+#define IDLE_IDLE 1
+#define NOT_IDLE 2
+
+#define KQB_LEN (8) /* Number of priority status words. */
+#define KQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY)). */
+#define KQB_BPW (1<<KQB_L2BPW) /* Bits in an rqb_word_t. */
+
+#define KQB_BIT(pri) (1 << ((pri) & (KQB_BPW - 1)))
+#define KQB_WORD(pri) ((pri) >> KQB_L2BPW)
+#define KQB_FFS(word) (ffs(word) - 1)
+
+#define KQ_NQS 256
+
+/*
+ * Type of run queue status word.
+ */
+typedef u_int32_t kqb_word_t;
+
+/*
+ * Head of run queues.
+ */
+TAILQ_HEAD(krqhead, kse);
+
+/*
+ * Bit array which maintains the status of a run queue. When a queue is
+ * non-empty the bit corresponding to the queue number will be set.
+ */
+struct krqbits {
+ kqb_word_t rqb_bits[KQB_LEN];
+};
+
+/*
+ * Run queue structure. Contains an array of run queues on which processes
+ * are placed, and a structure to maintain the status of each queue.
+ */
+struct krunq {
+ struct krqbits rq_status;
+ struct krqhead rq_queues[KQ_NQS];
+};
+
+/*
+ * The following data structures are allocated within their parent structure
+ * but are scheduler specific.
+ */
+/*
+ * The schedulable entity that can be given a context to run. A process may
+ * have several of these.
+ */
+struct kse {
+ struct thread *ke_thread; /* (*) Active associated thread. */
+ TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */
+ int ke_flags; /* (j) KEF_* flags. */
+ fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */
+ u_char ke_rqindex; /* (j) Run queue index. */
+ enum {
+ KES_THREAD = 0x0, /* slaved to thread state */
+ KES_ONRUNQ
+ } ke_state; /* (j) thread sched specific status. */
+ int ke_slice; /* Time slice in ticks */
+ struct kseq *ke_kseq; /* Kseq the thread belongs to */
+ struct krunq *ke_runq; /* Associated runqueue */
+#ifdef SMP
+ int ke_cpu; /* CPU that we have affinity for. */
+ int ke_wakeup_cpu; /* CPU that has activated us. */
+#endif
+ int ke_activated; /* How is the thread activated. */
+ uint64_t ke_timestamp; /* Last timestamp dependent on state.*/
+ unsigned ke_lastran; /* Last timestamp the thread ran. */
+
+ /* The following variables are only used for pctcpu calculation */
+ int ke_ltick; /* Last tick that we were running on */
+ int ke_ftick; /* First tick that we were running on */
+ int ke_ticks; /* Tick count */
+};
+
+#define td_kse td_sched
+#define ke_proc ke_thread->td_proc
+#define ke_ksegrp ke_thread->td_ksegrp
+
+/* flags kept in ke_flags */
+#define KEF_BOUND 0x0001 /* Thread can not migrate. */
+#define KEF_PREEMPTED 0x0002 /* Thread was preempted. */
+#define KEF_MIGRATING 0x0004 /* Thread is migrating. */
+#define KEF_SLEEP 0x0008 /* Thread did sleep. */
+#define KEF_DIDRUN 0x0010 /* Thread actually ran. */
+#define KEF_EXIT 0x0020 /* Thread is being killed. */
+#define KEF_NEXTRQ 0x0400 /* Thread should be in next queue. */
+#define KEF_FIRST_SLICE 0x0800 /* Thread has first time slice left. */
+
+struct kg_sched {
+ struct thread *skg_last_assigned; /* (j) Last thread assigned to */
+ /* the system scheduler */
+ u_long skg_slptime; /* (j) Number of ticks we vol. slept */
+ u_long skg_runtime; /* (j) Temp total run time. */
+ int skg_avail_opennings; /* (j) Num unfilled slots in group.*/
+ int skg_concurrency; /* (j) Num threads requested in group.*/
+};
+#define kg_last_assigned kg_sched->skg_last_assigned
+#define kg_avail_opennings kg_sched->skg_avail_opennings
+#define kg_concurrency kg_sched->skg_concurrency
+#define kg_slptime kg_sched->skg_slptime
+#define kg_runtime kg_sched->skg_runtime
+
+#define SLOT_RELEASE(kg) (kg)->kg_avail_opennings++
+#define SLOT_USE(kg) (kg)->kg_avail_opennings--
+
+/*
+ * Cpu percentage computation macros and defines.
+ *
+ * SCHED_CPU_TIME: Number of seconds to average the cpu usage across.
+ * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across.
+ */
+
+#define SCHED_CPU_TIME 10
+#define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)
+
+/*
+ * kseq - per processor runqs and statistics.
+ */
+struct kseq {
+ struct krunq *ksq_curr; /* Current queue. */
+ struct krunq *ksq_next; /* Next timeshare queue. */
+ struct krunq ksq_timeshare[2]; /* Run queues for !IDLE. */
+ struct krunq ksq_idle; /* Queue of IDLE threads. */
+ int ksq_load;
+ uint64_t ksq_last_timestamp; /* Per-cpu last clock tick */
+ unsigned ksq_expired_tick; /* First expired tick */
+ signed char ksq_expired_nice; /* Lowest nice in nextq */
+};
+
+static struct kse kse0;
+static struct kg_sched kg_sched0;
+
+static int min_timeslice = 5;
+static int def_timeslice = 100;
+static int granularity = 10;
+static int realstathz;
+static int sched_tdcnt;
+static struct kseq kseq_global;
+
+/*
+ * One kse queue per processor.
+ */
+#ifdef SMP
+static struct kseq kseq_cpu[MAXCPU];
+
+#define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)])
+#define KSEQ_CPU(x) (&kseq_cpu[(x)])
+#define KSEQ_ID(x) ((x) - kseq_cpu)
+
+static cpumask_t cpu_sibling[MAXCPU];
+
+#else /* !SMP */
+
+#define KSEQ_SELF() (&kseq_global)
+#define KSEQ_CPU(x) (&kseq_global)
+#endif
+
+/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
+SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
+
+static void sched_setup(void *dummy);
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
+
+static void sched_initticks(void *dummy);
+SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
+
+static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
+
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "CORE", 0,
+ "Scheduler name");
+
+#ifdef SMP
+/* Enable forwarding of wakeups to all other cpus */
+SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");
+
+static int runq_fuzz = 0;
+SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
+
+static int forward_wakeup_enabled = 1;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
+ &forward_wakeup_enabled, 0,
+ "Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeups_requested = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
+ &forward_wakeups_requested, 0,
+ "Requests for Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeups_delivered = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
+ &forward_wakeups_delivered, 0,
+ "Completed Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeup_use_mask = 1;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
+ &forward_wakeup_use_mask, 0,
+ "Use the mask of idle cpus");
+
+static int forward_wakeup_use_loop = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
+ &forward_wakeup_use_loop, 0,
+ "Use a loop to find idle cpus");
+
+static int forward_wakeup_use_single = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
+ &forward_wakeup_use_single, 0,
+ "Only signal one idle cpu");
+
+static int forward_wakeup_use_htt = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
+ &forward_wakeup_use_htt, 0,
+ "account for htt");
+#endif
+
+static void slot_fill(struct ksegrp *);
+
+static void krunq_add(struct krunq *, struct kse *);
+static struct kse *krunq_choose(struct krunq *);
+static void krunq_clrbit(struct krunq *rq, int pri);
+static int krunq_findbit(struct krunq *rq);
+static void krunq_init(struct krunq *);
+static void krunq_remove(struct krunq *, struct kse *);
+
+static struct kse * kseq_choose(struct kseq *);
+static void kseq_load_add(struct kseq *, struct kse *);
+static void kseq_load_rem(struct kseq *, struct kse *);
+static void kseq_runq_add(struct kseq *, struct kse *);
+static void kseq_runq_rem(struct kseq *, struct kse *);
+static void kseq_setup(struct kseq *);
+
+static int sched_is_timeshare(struct ksegrp *kg);
+static struct kse *sched_choose(void);
+static int sched_calc_pri(struct ksegrp *kg);
+static int sched_starving(struct kseq *, unsigned, struct kse *);
+static void sched_pctcpu_update(struct kse *);
+static void sched_thread_priority(struct thread *, u_char);
+static uint64_t sched_timestamp(void);
+static int sched_recalc_pri(struct kse *ke, uint64_t now);
+static int sched_timeslice(struct kse *ke);
+static void sched_update_runtime(struct kse *ke, uint64_t now);
+static void sched_commit_runtime(struct kse *ke);
+
+/*
+ * Initialize a run structure.
+ */
+static void
+krunq_init(struct krunq *rq)
+{
+ int i;
+
+ bzero(rq, sizeof *rq);
+ for (i = 0; i < KQ_NQS; i++)
+ TAILQ_INIT(&rq->rq_queues[i]);
+}
+
+/*
+ * Clear the status bit of the queue corresponding to priority level pri,
+ * indicating that it is empty.
+ */
+static inline void
+krunq_clrbit(struct krunq *rq, int pri)
+{
+ struct krqbits *rqb;
+
+ rqb = &rq->rq_status;
+ rqb->rqb_bits[KQB_WORD(pri)] &= ~KQB_BIT(pri);
+}
+
+/*
+ * Find the index of the first non-empty run queue. This is done by
+ * scanning the status bits, a set bit indicates a non-empty queue.
+ */
+static int
+krunq_findbit(struct krunq *rq)
+{
+ struct krqbits *rqb;
+ int pri;
+ int i;
+
+ rqb = &rq->rq_status;
+ for (i = 0; i < KQB_LEN; i++) {
+ if (rqb->rqb_bits[i]) {
+ pri = KQB_FFS(rqb->rqb_bits[i]) + (i << KQB_L2BPW);
+ return (pri);
+ }
+ }
+ return (-1);
+}
+
+static int
+krunq_check(struct krunq *rq)
+{
+ struct krqbits *rqb;
+ int i;
+
+ rqb = &rq->rq_status;
+ for (i = 0; i < KQB_LEN; i++) {
+ if (rqb->rqb_bits[i])
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Set the status bit of the queue corresponding to priority level pri,
+ * indicating that it is non-empty.
+ */
+static inline void
+krunq_setbit(struct krunq *rq, int pri)
+{
+ struct krqbits *rqb;
+
+ rqb = &rq->rq_status;
+ rqb->rqb_bits[KQB_WORD(pri)] |= KQB_BIT(pri);
+}
+
+/*
+ * Add the KSE to the queue specified by its priority, and set the
+ * corresponding status bit.
+ */
+static void
+krunq_add(struct krunq *rq, struct kse *ke)
+{
+ struct krqhead *rqh;
+ int pri;
+
+ pri = ke->ke_thread->td_priority;
+ ke->ke_rqindex = pri;
+ krunq_setbit(rq, pri);
+ rqh = &rq->rq_queues[pri];
+ if (ke->ke_flags & KEF_PREEMPTED)
+ TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
+ else
+ TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
+}
+
+/*
+ * Find the highest priority process on the run queue.
+ */
+static struct kse *
+krunq_choose(struct krunq *rq)
+{
+ struct krqhead *rqh;
+ struct kse *ke;
+ int pri;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ if ((pri = krunq_findbit(rq)) != -1) {
+ rqh = &rq->rq_queues[pri];
+ ke = TAILQ_FIRST(rqh);
+ KASSERT(ke != NULL, ("krunq_choose: no thread on busy queue"));
+#ifdef SMP
+ if (pri <= PRI_MAX_ITHD || runq_fuzz <= 0)
+ return (ke);
+
+ /*
+ * In the first couple of entries, check if
+ * there is one for our CPU as a preference.
+ */
+ struct kse *ke2 = ke;
+ const int mycpu = PCPU_GET(cpuid);
+ const int mymask = 1 << mycpu;
+ int count = runq_fuzz;
+
+ while (count-- && ke2) {
+ const int cpu = ke2->ke_wakeup_cpu;
+ if (cpu_sibling[cpu] & mymask) {
+ ke = ke2;
+ break;
+ }
+ ke2 = TAILQ_NEXT(ke2, ke_procq);
+ }
+#endif
+ return (ke);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Remove the KSE from the queue specified by its priority, and clear the
+ * corresponding status bit if the queue becomes empty.
+ * Caller must set ke->ke_state afterwards.
+ */
+static void
+krunq_remove(struct krunq *rq, struct kse *ke)
+{
+ struct krqhead *rqh;
+ int pri;
+
+ KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
+ ("runq_remove: process swapped out"));
+ pri = ke->ke_rqindex;
+ rqh = &rq->rq_queues[pri];
+ KASSERT(ke != NULL, ("krunq_remove: no proc on busy queue"));
+ TAILQ_REMOVE(rqh, ke, ke_procq);
+ if (TAILQ_EMPTY(rqh))
+ krunq_clrbit(rq, pri);
+}
+
+static inline void
+kseq_runq_add(struct kseq *kseq, struct kse *ke)
+{
+ krunq_add(ke->ke_runq, ke);
+ ke->ke_kseq = kseq;
+}
+
+static inline void
+kseq_runq_rem(struct kseq *kseq, struct kse *ke)
+{
+ krunq_remove(ke->ke_runq, ke);
+ ke->ke_kseq = NULL;
+ ke->ke_runq = NULL;
+}
+
+static inline void
+kseq_load_add(struct kseq *kseq, struct kse *ke)
+{
+ kseq->ksq_load++;
+ if ((ke->ke_proc->p_flag & P_NOLOAD) == 0)
+ sched_tdcnt++;
+}
+
+static inline void
+kseq_load_rem(struct kseq *kseq, struct kse *ke)
+{
+ kseq->ksq_load--;
+ if ((ke->ke_proc->p_flag & P_NOLOAD) == 0)
+ sched_tdcnt--;
+}
+
+/*
+ * Pick the highest priority task we have and return it.
+ */
+static struct kse *
+kseq_choose(struct kseq *kseq)
+{
+ struct krunq *swap;
+ struct kse *ke;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ ke = krunq_choose(kseq->ksq_curr);
+ if (ke != NULL)
+ return (ke);
+
+ kseq->ksq_expired_nice = PRIO_MAX + 1;
+ kseq->ksq_expired_tick = 0;
+ swap = kseq->ksq_curr;
+ kseq->ksq_curr = kseq->ksq_next;
+ kseq->ksq_next = swap;
+ ke = krunq_choose(kseq->ksq_curr);
+ if (ke != NULL)
+ return (ke);
+
+ return krunq_choose(&kseq->ksq_idle);
+}
+
+static inline uint64_t
+sched_timestamp(void)
+{
+ uint64_t now = cputick2usec(cpu_ticks()) * 1000;
+ return (now);
+}
+
+static inline int
+sched_timeslice(struct kse *ke)
+{
+ struct proc *p = ke->ke_proc;
+
+ if (ke->ke_proc->p_nice < 0)
+ return SCALE_USER_PRI(def_timeslice*4, PROC_USER_PRI(p));
+ else
+ return SCALE_USER_PRI(def_timeslice, PROC_USER_PRI(p));
+}
+
+static inline int
+sched_is_timeshare(struct ksegrp *kg)
+{
+ return (kg->kg_pri_class == PRI_TIMESHARE);
+}
+
+static int
+sched_calc_pri(struct ksegrp *kg)
+{
+ int score, pri;
+
+ if (sched_is_timeshare(kg)) {
+ score = CURRENT_SCORE(kg) - MAX_SCORE / 2;
+ pri = PROC_PRI(kg->kg_proc) - score;
+ if (pri < PUSER)
+ pri = PUSER;
+ else if (pri > PUSER_MAX)
+ pri = PUSER_MAX;
+ return (pri);
+ }
+ return (kg->kg_user_pri);
+}
+
+static int
+sched_recalc_pri(struct kse *ke, uint64_t now)
+{
+ uint64_t delta;
+ unsigned int sleep_time;
+ struct ksegrp *kg;
+
+ kg = ke->ke_ksegrp;
+ delta = now - ke->ke_timestamp;
+ if (__predict_false(!sched_is_timeshare(kg)))
+ return (kg->kg_user_pri);
+
+ if (delta > NS_MAX_SLEEP_TIME)
+ sleep_time = NS_MAX_SLEEP_TIME;
+ else
+ sleep_time = (unsigned int)delta;
+ if (__predict_false(sleep_time == 0))
+ goto out;
+
+ if (ke->ke_activated != -1 &&
+ sleep_time > INTERACTIVE_SLEEP_TIME(ke)) {
+ kg->kg_slptime = HZ_TO_NS(MAX_SLEEP_TIME - def_timeslice);
+ } else {
+ sleep_time *= (MAX_SCORE - CURRENT_SCORE(kg)) ? : 1;
+
+ /*
+ * If thread is waking from uninterruptible sleep, it is
+ * unlikely an interactive sleep, limit its sleep time to
+ * prevent it from being an interactive thread.
+ */
+ if (ke->ke_activated == -1) {
+ if (kg->kg_slptime >= INTERACTIVE_SLEEP_TIME(ke))
+ sleep_time = 0;
+ else if (kg->kg_slptime + sleep_time >=
+ INTERACTIVE_SLEEP_TIME(ke)) {
+ kg->kg_slptime = INTERACTIVE_SLEEP_TIME(ke);
+ sleep_time = 0;
+ }
+ }
+
+ /*
+ * Thread gets priority boost here.
+ */
+ kg->kg_slptime += sleep_time;
+
+ /* Sleep time should never be larger than maximum */
+ if (kg->kg_slptime > NS_MAX_SLEEP_TIME)
+ kg->kg_slptime = NS_MAX_SLEEP_TIME;
+ }
+
+out:
+ return (sched_calc_pri(kg));
+}
+
+static void
+sched_update_runtime(struct kse *ke, uint64_t now)
+{
+ uint64_t runtime;
+ struct ksegrp *kg = ke->ke_ksegrp;
+
+ if (sched_is_timeshare(kg)) {
+ if ((int64_t)(now - ke->ke_timestamp) < NS_MAX_SLEEP_TIME) {
+ runtime = now - ke->ke_timestamp;
+ if ((int64_t)(now - ke->ke_timestamp) < 0)
+ runtime = 0;
+ } else {
+ runtime = NS_MAX_SLEEP_TIME;
+ }
+ runtime /= (CURRENT_SCORE(kg) ? : 1);
+ kg->kg_runtime += runtime;
+ ke->ke_timestamp = now;
+ }
+}
+
+static void
+sched_commit_runtime(struct kse *ke)
+{
+ struct ksegrp *kg = ke->ke_ksegrp;
+
+ if (kg->kg_runtime > kg->kg_slptime)
+ kg->kg_slptime = 0;
+ else
+ kg->kg_slptime -= kg->kg_runtime;
+ kg->kg_runtime = 0;
+}
+
+static void
+kseq_setup(struct kseq *kseq)
+{
+ krunq_init(&kseq->ksq_timeshare[0]);
+ krunq_init(&kseq->ksq_timeshare[1]);
+ krunq_init(&kseq->ksq_idle);
+ kseq->ksq_curr = &kseq->ksq_timeshare[0];
+ kseq->ksq_next = &kseq->ksq_timeshare[1];
+ kseq->ksq_expired_nice = PRIO_MAX + 1;
+ kseq->ksq_expired_tick = 0;
+}
+
+static void
+sched_setup(void *dummy)
+{
+#ifdef SMP
+ int i;
+#endif
+
+ /*
+ * To avoid divide-by-zero, we set realstathz to a dummy value
+ * in case sched_clock() is called before sched_initticks().
+ */
+ realstathz = hz;
+ min_timeslice = MAX(5 * hz / 1000, 1);
+ def_timeslice = MAX(100 * hz / 1000, 1);
+ granularity = MAX(10 * hz / 1000, 1);
+
+ kseq_setup(&kseq_global);
+#ifdef SMP
+ runq_fuzz = MIN(mp_ncpus * 2, 8);
+ /*
+ * Initialize the kseqs.
+ */
+ for (i = 0; i < MAXCPU; i++) {
+ struct kseq *ksq;
+
+ ksq = &kseq_cpu[i];
+ kseq_setup(&kseq_cpu[i]);
+ cpu_sibling[i] = 1 << i;
+ }
+ if (smp_topology != NULL) {
+ int i, j;
+ cpumask_t visited;
+ struct cpu_group *cg;
+
+ visited = 0;
+ for (i = 0; i < smp_topology->ct_count; i++) {
+ cg = &smp_topology->ct_group[i];
+ if (cg->cg_mask & visited)
+ panic("duplicated cpumask in ct_group.");
+ if (cg->cg_mask == 0)
+ continue;
+ visited |= cg->cg_mask;
+ for (j = 0; j < MAXCPU; j++) {
+ if ((cg->cg_mask & (1 << j)) != 0)
+ cpu_sibling[j] |= cg->cg_mask;
+ }
+ }
+ }
+#endif
+
+ mtx_lock_spin(&sched_lock);
+ kseq_load_add(KSEQ_SELF(), &kse0);
+ mtx_unlock_spin(&sched_lock);
+}
+
+/* ARGSUSED */
+static void
+sched_initticks(void *dummy)
+{
+ mtx_lock_spin(&sched_lock);
+ realstathz = stathz ? stathz : hz;
+ mtx_unlock_spin(&sched_lock);
+}
+
+/*
+ * Very early in the boot some setup of scheduler-specific
+ * parts of proc0 and of some scheduler resources needs to be done.
+ * Called from:
+ * proc0_init()
+ */
+void
+schedinit(void)
+{
+ /*
+ * Set up the scheduler specific parts of proc0.
+ */
+ proc0.p_sched = NULL; /* XXX */
+ ksegrp0.kg_sched = &kg_sched0;
+ thread0.td_sched = &kse0;
+ kse0.ke_thread = &thread0;
+ kse0.ke_state = KES_THREAD;
+ kse0.ke_slice = 100;
+ kg_sched0.skg_concurrency = 1;
+ kg_sched0.skg_avail_opennings = 0; /* we are already running */
+}
+
+/*
+ * This is only somewhat accurate since, given many processes of the same
+ * priority, they will switch when their slices run out, which will be
+ * at most one full timeslice.
+ */
+int
+sched_rr_interval(void)
+{
+ return (def_timeslice);
+}
+
+static void
+sched_pctcpu_update(struct kse *ke)
+{
+ /*
+ * Adjust counters and watermark for pctcpu calc.
+ */
+ if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
+ /*
+ * Shift the tick count out so that the divide doesn't
+ * round away our results.
+ */
+ ke->ke_ticks <<= 10;
+ ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
+ SCHED_CPU_TICKS;
+ ke->ke_ticks >>= 10;
+ } else
+ ke->ke_ticks = 0;
+ ke->ke_ltick = ticks;
+ ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
+}
+
+static void
+sched_thread_priority(struct thread *td, u_char prio)
+{
+ struct kse *ke;
+
+ ke = td->td_kse;
+ mtx_assert(&sched_lock, MA_OWNED);
+ if (__predict_false(td->td_priority == prio))
+ return;
+
+ if (TD_ON_RUNQ(td)) {
+ /*
+ * If the priority has been elevated due to priority
+ * propagation, we may have to move ourselves to a new
+ * queue. We still call adjustrunqueue below in case kse
+ * needs to fix things up.
+ */
+ if (prio < td->td_priority && ke->ke_runq != NULL &&
+ ke->ke_runq != ke->ke_kseq->ksq_curr) {
+ krunq_remove(ke->ke_runq, ke);
+ ke->ke_runq = ke->ke_kseq->ksq_curr;
+ krunq_add(ke->ke_runq, ke);
+ }
+ adjustrunqueue(td, prio);
+ } else
+ td->td_priority = prio;
+}
+
+/*
+ * Update a thread's priority when it is lent another thread's
+ * priority.
+ */
+void
+sched_lend_prio(struct thread *td, u_char prio)
+{
+
+ td->td_flags |= TDF_BORROWING;
+ sched_thread_priority(td, prio);
+}
+
+/*
+ * Restore a thread's priority when priority propagation is
+ * over. The prio argument is the minimum priority the thread
+ * needs to have to satisfy other possible priority lending
+ * requests. If the thread's regular priority is less
+ * important than prio, the thread will keep a priority boost
+ * of prio.
+ */
+void
+sched_unlend_prio(struct thread *td, u_char prio)
+{
+ u_char base_pri;
+
+ if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
+ td->td_base_pri <= PRI_MAX_TIMESHARE)
+ base_pri = td->td_ksegrp->kg_user_pri;
+ else
+ base_pri = td->td_base_pri;
+ if (prio >= base_pri) {
+ td->td_flags &= ~TDF_BORROWING;
+ sched_thread_priority(td, base_pri);
+ } else
+ sched_lend_prio(td, prio);
+}
+
+void
+sched_prio(struct thread *td, u_char prio)
+{
+ u_char oldprio;
+
+ if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE)
+ prio = MIN(prio, PUSER_MAX);
+
+ /* First, update the base priority. */
+ td->td_base_pri = prio;
+
+ /*
+ * If the thread is borrowing another thread's priority, don't
+ * ever lower the priority.
+ */
+ if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
+ return;
+
+ /* Change the real priority. */
+ oldprio = td->td_priority;
+ sched_thread_priority(td, prio);
+
+ /*
+ * If the thread is on a turnstile, then let the turnstile update
+ * its state.
+ */
+ if (TD_ON_LOCK(td) && oldprio != prio)
+ turnstile_adjust(td, oldprio);
+}
+
+void
+sched_switch(struct thread *td, struct thread *newtd, int flags)
+{
+ struct kseq *ksq;
+ struct kse *ke;
+ struct ksegrp *kg;
+ uint64_t now;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ now = sched_timestamp();
+ ke = td->td_kse;
+ kg = td->td_ksegrp;
+ ksq = KSEQ_SELF();
+
+ td->td_lastcpu = td->td_oncpu;
+ td->td_oncpu = NOCPU;
+ td->td_flags &= ~TDF_NEEDRESCHED;
+ td->td_owepreempt = 0;
+
+ if (td == PCPU_GET(idlethread)) {
+ TD_SET_CAN_RUN(td);
+ } else {
+ sched_update_runtime(ke, now);
+ /* We are ending our run so make our slot available again */
+ SLOT_RELEASE(td->td_ksegrp);
+ kseq_load_rem(ksq, ke);
+ if (TD_IS_RUNNING(td)) {
+ setrunqueue(td, (flags & SW_PREEMPT) ?
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+ SRQ_OURSELF|SRQ_YIELDING);
+ } else {
+ if ((td->td_proc->p_flag & P_HADTHREADS) &&
+ (newtd == NULL ||
+ newtd->td_ksegrp != td->td_ksegrp)) {
+ /*
+ * We will not be on the run queue.
+ * So we must be sleeping or similar.
+ * Don't use the slot if we will need it
+ * for newtd.
+ */
+ slot_fill(td->td_ksegrp);
+ }
+ ke->ke_flags &= ~KEF_NEXTRQ;
+ }
+ }
+
+ if (newtd != NULL) {
+ /*
+ * If we bring in a thread, account for it as if it had been
+ * added to the run queue and then chosen.
+ */
+ SLOT_USE(newtd->td_ksegrp);
+ newtd->td_kse->ke_flags |= KEF_DIDRUN;
+ newtd->td_kse->ke_timestamp = now;
+ TD_SET_RUNNING(newtd);
+ kseq_load_add(ksq, newtd->td_kse);
+ } else {
+ newtd = choosethread();
+ /* sched_choose sets ke_timestamp, just reuse it */
+ }
+ if (td != newtd) {
+ ke->ke_lastran = tick;
+
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
+#endif
+ cpu_switch(td, newtd);
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
+#endif
+ }
+
+ sched_lock.mtx_lock = (uintptr_t)td;
+
+ td->td_oncpu = PCPU_GET(cpuid);
+}
+
+void
+sched_nice(struct proc *p, int nice)
+{
+ struct ksegrp *kg;
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_assert(&sched_lock, MA_OWNED);
+ p->p_nice = nice;
+ FOREACH_KSEGRP_IN_PROC(p, kg) {
+ if (kg->kg_pri_class == PRI_TIMESHARE) {
+ kg->kg_user_pri = sched_calc_pri(kg);
+ FOREACH_THREAD_IN_GROUP(kg, td)
+ td->td_flags |= TDF_NEEDRESCHED;
+ }
+ }
+}
+
+void
+sched_sleep(struct thread *td)
+{
+ struct kse *ke;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ ke = td->td_kse;
+ if (td->td_flags & TDF_SINTR)
+ ke->ke_activated = 0;
+ else
+ ke->ke_activated = -1;
+ ke->ke_flags |= KEF_SLEEP;
+}
+
+void
+sched_wakeup(struct thread *td)
+{
+ struct kse *ke;
+ struct ksegrp *kg;
+ struct kseq *kseq, *mykseq;
+ uint64_t now;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ ke = td->td_kse;
+ kg = td->td_ksegrp;
+ mykseq = KSEQ_SELF();
+ if (ke->ke_flags & KEF_SLEEP) {
+ ke->ke_flags &= ~KEF_SLEEP;
+ if (sched_is_timeshare(kg)) {
+ sched_commit_runtime(ke);
+ now = sched_timestamp();
+ kseq = KSEQ_CPU(td->td_lastcpu);
+#ifdef SMP
+ if (kseq != mykseq)
+ now = now - mykseq->ksq_last_timestamp +
+ kseq->ksq_last_timestamp;
+#endif
+ kg->kg_user_pri = sched_recalc_pri(ke, now);
+ }
+ }
+ setrunqueue(td, SRQ_BORING);
+}
+
+/*
+ * Penalize the parent for creating a new child and initialize the child's
+ * priority.
+ */
+void
+sched_fork(struct thread *td, struct thread *childtd)
+{
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ sched_fork_ksegrp(td, childtd->td_ksegrp);
+ sched_fork_thread(td, childtd);
+}
+
+void
+sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
+{
+ struct ksegrp *kg = td->td_ksegrp;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ child->kg_slptime = kg->kg_slptime * CHILD_WEIGHT / 100;
+ if (child->kg_pri_class == PRI_TIMESHARE)
+ child->kg_user_pri = sched_calc_pri(child);
+ kg->kg_slptime = kg->kg_slptime * PARENT_WEIGHT / 100;
+}
+
+void
+sched_fork_thread(struct thread *td, struct thread *child)
+{
+ struct kse *ke;
+ struct kse *ke2;
+
+ sched_newthread(child);
+
+ ke = td->td_kse;
+ ke2 = child->td_kse;
+ ke2->ke_slice = (ke->ke_slice + 1) >> 1;
+ ke2->ke_flags |= KEF_FIRST_SLICE | (ke->ke_flags & KEF_NEXTRQ);
+ ke2->ke_activated = 0;
+ ke->ke_slice >>= 1;
+ if (ke->ke_slice == 0) {
+ ke->ke_slice = 1;
+ sched_tick();
+ }
+
+ /* Grab our parent's cpu estimation information. */
+ ke2->ke_ticks = ke->ke_ticks;
+ ke2->ke_ltick = ke->ke_ltick;
+ ke2->ke_ftick = ke->ke_ftick;
+}
+
+void
+sched_class(struct ksegrp *kg, int class)
+{
+ mtx_assert(&sched_lock, MA_OWNED);
+ kg->kg_pri_class = class;
+}
+
+/*
+ * Return some of the child's priority and interactivity to the parent.
+ */
+void
+sched_exit(struct proc *p, struct thread *childtd)
+{
+ mtx_assert(&sched_lock, MA_OWNED);
+ sched_exit_thread(FIRST_THREAD_IN_PROC(p), childtd);
+ sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
+}
+
+void
+sched_exit_ksegrp(struct ksegrp *parentkg, struct thread *td)
+{
+ if (td->td_ksegrp->kg_slptime < parentkg->kg_slptime) {
+ parentkg->kg_slptime = parentkg->kg_slptime /
+ (EXIT_WEIGHT) * (EXIT_WEIGHT - 1) +
+ td->td_ksegrp->kg_slptime / EXIT_WEIGHT;
+ }
+}
+
+void
+sched_exit_thread(struct thread *td, struct thread *childtd)
+{
+ struct kse *childke = childtd->td_kse;
+ struct kse *parentke = td->td_kse;
+
+ kseq_load_rem(KSEQ_SELF(), childke);
+ sched_update_runtime(childke, sched_timestamp());
+ sched_commit_runtime(childke);
+ if ((childke->ke_flags & KEF_FIRST_SLICE) &&
+ td->td_proc == childtd->td_proc->p_pptr) {
+ parentke->ke_slice += childke->ke_slice;
+ if (parentke->ke_slice > sched_timeslice(parentke))
+ parentke->ke_slice = sched_timeslice(parentke);
+ }
+}
+
+static int
+sched_starving(struct kseq *ksq, unsigned now, struct kse *ke)
+{
+ uint64_t delta;
+
+ if (ke->ke_proc->p_nice > ksq->ksq_expired_nice)
+ return (1);
+ if (ksq->ksq_expired_tick == 0)
+ return (0);
+ delta = HZ_TO_NS((uint64_t)now - ksq->ksq_expired_tick);
+ if (delta > STARVATION_TIME * ksq->ksq_load)
+ return (1);
+ return (0);
+}
+
+/*
+ * An interactive thread has a smaller time slice granularity;
+ * a cpu hog can have a larger one.
+ */
+static inline int
+sched_timeslice_split(struct kse *ke)
+{
+ int score, g;
+
+ score = (int)(MAX_SCORE - CURRENT_SCORE(ke->ke_ksegrp));
+ if (score == 0)
+ score = 1;
+#ifdef SMP
+ g = granularity * ((1 << score) - 1) * smp_cpus;
+#else
+ g = granularity * ((1 << score) - 1);
+#endif
+ return (ke->ke_slice >= g && ke->ke_slice % g == 0);
+}
+
+void
+sched_tick(void)
+{
+ struct thread *td;
+ struct proc *p;
+ struct kse *ke;
+ struct ksegrp *kg;
+ struct kseq *kseq;
+ uint64_t now;
+ int cpuid;
+ int class;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ td = curthread;
+ ke = td->td_kse;
+ kg = td->td_ksegrp;
+ p = td->td_proc;
+ class = PRI_BASE(kg->kg_pri_class);
+ now = sched_timestamp();
+ cpuid = PCPU_GET(cpuid);
+ kseq = KSEQ_CPU(cpuid);
+ kseq->ksq_last_timestamp = now;
+
+ if (class == PRI_IDLE) {
+ /*
+ * Processes of equal idle priority are run round-robin.
+ */
+ if (td != PCPU_GET(idlethread) && --ke->ke_slice <= 0) {
+ ke->ke_slice = def_timeslice;
+ td->td_flags |= TDF_NEEDRESCHED;
+ }
+ return;
+ }
+
+ if (class == PRI_REALTIME) {
+ /*
+ * Realtime scheduling, do round robin for RR class, FIFO
+ * is not affected.
+ */
+ if (PRI_NEED_RR(kg->kg_pri_class) && --ke->ke_slice <= 0) {
+ ke->ke_slice = def_timeslice;
+ td->td_flags |= TDF_NEEDRESCHED;
+ }
+ return;
+ }
+
+ /*
+ * We skip kernel threads, though they may be classified as TIMESHARE.
+ */
+ if (class != PRI_TIMESHARE || (p->p_flag & P_KTHREAD) != 0)
+ return;
+
+ if (--ke->ke_slice <= 0) {
+ td->td_flags |= TDF_NEEDRESCHED;
+ sched_update_runtime(ke, now);
+ sched_commit_runtime(ke);
+ kg->kg_user_pri = sched_calc_pri(kg);
+ ke->ke_slice = sched_timeslice(ke);
+ ke->ke_flags &= ~KEF_FIRST_SLICE;
+ if (ke->ke_flags & KEF_BOUND || td->td_pinned) {
+ if (kseq->ksq_expired_tick == 0)
+ kseq->ksq_expired_tick = tick;
+ } else {
+ if (kseq_global.ksq_expired_tick == 0)
+ kseq_global.ksq_expired_tick = tick;
+ }
+ if (!THREAD_IS_INTERACTIVE(ke) ||
+ sched_starving(kseq, tick, ke) ||
+ sched_starving(&kseq_global, tick, ke)) {
+ /* The thread has become a cpu hog, schedule it off. */
+ ke->ke_flags |= KEF_NEXTRQ;
+ if (ke->ke_flags & KEF_BOUND || td->td_pinned) {
+ if (p->p_nice < kseq->ksq_expired_nice)
+ kseq->ksq_expired_nice = p->p_nice;
+ } else {
+ if (p->p_nice < kseq_global.ksq_expired_nice)
+ kseq_global.ksq_expired_nice =
+ p->p_nice;
+ }
+ }
+ } else {
+ /*
+ * Don't allow an interactive thread with a long timeslice to
+ * monopolize the CPU; split the long timeslice into small
+ * chunks. This essentially does round-robin between
+ * interactive threads.
+ */
+ if (THREAD_IS_INTERACTIVE(ke) && sched_timeslice_split(ke))
+ td->td_flags |= TDF_NEEDRESCHED;
+ }
+}
+
+void
+sched_clock(struct thread *td)
+{
+ struct ksegrp *kg;
+ struct kse *ke;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ ke = td->td_kse;
+ kg = ke->ke_ksegrp;
+
+ /* Adjust ticks for pctcpu */
+ ke->ke_ticks++;
+ ke->ke_ltick = ticks;
+
+ /* Go up to one second beyond our max and then trim back down */
+ if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
+ sched_pctcpu_update(ke);
+}
+
+static int
+kseq_runnable(struct kseq *kseq)
+{
+ return (krunq_check(kseq->ksq_curr) ||
+ krunq_check(kseq->ksq_next) ||
+ krunq_check(&kseq->ksq_idle));
+}
+
+int
+sched_runnable(void)
+{
+#ifdef SMP
+ return (kseq_runnable(&kseq_global) || kseq_runnable(KSEQ_SELF()));
+#else
+ return (kseq_runnable(&kseq_global));
+#endif
+}
+
+void
+sched_userret(struct thread *td)
+{
+ struct ksegrp *kg;
+
+ KASSERT((td->td_flags & TDF_BORROWING) == 0,
+ ("thread with borrowed priority returning to userland"));
+ kg = td->td_ksegrp;
+ if (td->td_priority != kg->kg_user_pri) {
+ mtx_lock_spin(&sched_lock);
+ td->td_priority = kg->kg_user_pri;
+ td->td_base_pri = kg->kg_user_pri;
+ mtx_unlock_spin(&sched_lock);
+ }
+}
+
+struct kse *
+sched_choose(void)
+{
+ struct kse *ke;
+ struct kseq *kseq;
+
+#ifdef SMP
+ struct kse *kecpu;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ kseq = &kseq_global;
+ ke = kseq_choose(&kseq_global);
+ kecpu = kseq_choose(KSEQ_SELF());
+
+ if (ke == NULL ||
+ (kecpu != NULL &&
+ kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
+ ke = kecpu;
+ kseq = KSEQ_SELF();
+ }
+#else
+ kseq = &kseq_global;
+ ke = kseq_choose(kseq);
+#endif
+
+ if (ke != NULL) {
+ kseq_runq_rem(kseq, ke);
+ ke->ke_state = KES_THREAD;
+ ke->ke_flags &= ~KEF_PREEMPTED;
+ ke->ke_timestamp = sched_timestamp();
+ }
+
+ return (ke);
+}
+
+#ifdef SMP
+static int
+forward_wakeup(int cpunum, cpumask_t me)
+{
+ cpumask_t map, dontuse;
+ cpumask_t map2;
+ struct pcpu *pc;
+ cpumask_t id, map3;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ CTR0(KTR_RUNQ, "forward_wakeup()");
+
+ if ((!forward_wakeup_enabled) ||
+ (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
+ return (0);
+ if (!smp_started || cold || panicstr)
+ return (0);
+
+ forward_wakeups_requested++;
+
+ /*
+ * check the idle mask we received against what we calculated before
+ * in the old version.
+ */
+ /*
+ * Don't bother if we should be doing it ourselves.
+ */
+ if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
+ return (0);
+
+ dontuse = me | stopped_cpus | hlt_cpus_mask;
+ map3 = 0;
+ if (forward_wakeup_use_loop) {
+ SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
+ id = pc->pc_cpumask;
+ if ( (id & dontuse) == 0 &&
+ pc->pc_curthread == pc->pc_idlethread) {
+ map3 |= id;
+ }
+ }
+ }
+
+ if (forward_wakeup_use_mask) {
+ map = 0;
+ map = idle_cpus_mask & ~dontuse;
+
+ /* If they are both on, compare and use loop if different */
+ if (forward_wakeup_use_loop) {
+ if (map != map3) {
+ printf("map (%02X) != map3 (%02X)\n",
+ map, map3);
+ map = map3;
+ }
+ }
+ } else {
+ map = map3;
+ }
+ /* If we only allow a specific CPU, then mask off all the others */
+ if (cpunum != NOCPU) {
+ KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
+ map &= (1 << cpunum);
+ } else {
+ /* Try to choose an idle die. */
+ if (forward_wakeup_use_htt) {
+ map2 = (map & (map >> 1)) & 0x5555;
+ if (map2) {
+ map = map2;
+ }
+ }
+
+ /* set only one bit */
+ if (forward_wakeup_use_single) {
+ map = map & ((~map) + 1);
+ }
+ }
+ if (map) {
+ forward_wakeups_delivered++;
+ ipi_selected(map, IPI_AST);
+ return (1);
+ }
+ return (0);
+}
+#endif
+
+void
+sched_add(struct thread *td, int flags)
+{
+ struct kseq *ksq;
+ struct ksegrp *kg;
+ struct kse *ke;
+ struct thread *mytd;
+ int class;
+ int nextrq;
+ int need_resched = 0;
+#ifdef SMP
+ int cpu;
+ int mycpu;
+ int pinned;
+ struct kseq *myksq;
+#endif
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ mytd = curthread;
+ ke = td->td_kse;
+ kg = td->td_ksegrp;
+ KASSERT(ke->ke_state != KES_ONRUNQ,
+ ("sched_add: kse %p (%s) already in run queue", ke,
+ ke->ke_proc->p_comm));
+ KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
+ ("sched_add: process swapped out"));
+ KASSERT(ke->ke_runq == NULL,
+ ("sched_add: KSE %p is still assigned to a run queue", ke));
+
+ class = PRI_BASE(kg->kg_pri_class);
+#ifdef SMP
+ mycpu = PCPU_GET(cpuid);
+ myksq = KSEQ_CPU(mycpu);
+ ke->ke_wakeup_cpu = mycpu;
+#endif
+ nextrq = (ke->ke_flags & KEF_NEXTRQ);
+ ke->ke_flags &= ~KEF_NEXTRQ;
+ if (flags & SRQ_PREEMPTED)
+ ke->ke_flags |= KEF_PREEMPTED;
+ ksq = &kseq_global;
+#ifdef SMP
+ if (td->td_pinned != 0) {
+ cpu = td->td_lastcpu;
+ ksq = KSEQ_CPU(cpu);
+ pinned = 1;
+ } else if ((ke)->ke_flags & KEF_BOUND) {
+ cpu = ke->ke_cpu;
+ ksq = KSEQ_CPU(cpu);
+ pinned = 1;
+ } else {
+ pinned = 0;
+ cpu = NOCPU;
+ }
+#endif
+ switch (class) {
+ case PRI_ITHD:
+ case PRI_REALTIME:
+ ke->ke_runq = ksq->ksq_curr;
+ break;
+ case PRI_TIMESHARE:
+ if ((td->td_flags & TDF_BORROWING) == 0 && nextrq)
+ ke->ke_runq = ksq->ksq_next;
+ else
+ ke->ke_runq = ksq->ksq_curr;
+ break;
+ case PRI_IDLE:
+ /*
+ * This is for priority propagation.
+ */
+ if (td->td_priority < PRI_MIN_IDLE)
+ ke->ke_runq = ksq->ksq_curr;
+ else
+ ke->ke_runq = &ksq->ksq_idle;
+ break;
+ default:
+ panic("Unknown pri class.");
+ break;
+ }
+
+#ifdef SMP
+ if ((ke->ke_runq == kseq_global.ksq_curr ||
+ ke->ke_runq == myksq->ksq_curr) &&
+ td->td_priority < mytd->td_priority) {
+#else
+ if (ke->ke_runq == kseq_global.ksq_curr &&
+ td->td_priority < mytd->td_priority) {
+#endif
+ struct krunq *rq;
+
+ rq = ke->ke_runq;
+ ke->ke_runq = NULL;
+ if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
+ return;
+ ke->ke_runq = rq;
+ need_resched = TDF_NEEDRESCHED;
+ }
+
+ SLOT_USE(kg);
+ ke->ke_state = KES_ONRUNQ;
+ kseq_runq_add(ksq, ke);
+ kseq_load_add(ksq, ke);
+
+#ifdef SMP
+ if (pinned) {
+ if (cpu != mycpu) {
+ struct thread *running = pcpu_find(cpu)->pc_curthread;
+ if (ksq->ksq_curr == ke->ke_runq &&
+ running->td_priority < td->td_priority) {
+ if (td->td_priority <= PRI_MAX_ITHD)
+ ipi_selected(1 << cpu, IPI_PREEMPT);
+ else {
+ running->td_flags |= TDF_NEEDRESCHED;
+ ipi_selected(1 << cpu, IPI_AST);
+ }
+ }
+ } else
+ curthread->td_flags |= need_resched;
+ } else {
+ cpumask_t me = 1 << mycpu;
+ cpumask_t idle = idle_cpus_mask & me;
+ int forwarded = 0;
+
+ if (!idle && ((flags & SRQ_INTR) == 0) &&
+ (idle_cpus_mask & ~(hlt_cpus_mask | me)))
+ forwarded = forward_wakeup(cpu, me);
+ if (forwarded == 0)
+ curthread->td_flags |= need_resched;
+ }
+#else
+ mytd->td_flags |= need_resched;
+#endif
+}
+
+void
+sched_rem(struct thread *td)
+{
+ struct kseq *kseq;
+ struct kse *ke;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ ke = td->td_kse;
+ KASSERT((ke->ke_state == KES_ONRUNQ),
+ ("sched_rem: KSE not on run queue"));
+
+ kseq = ke->ke_kseq;
+ SLOT_RELEASE(td->td_ksegrp);
+ kseq_runq_rem(kseq, ke);
+ kseq_load_rem(kseq, ke);
+ ke->ke_state = KES_THREAD;
+}
+
+fixpt_t
+sched_pctcpu(struct thread *td)
+{
+ fixpt_t pctcpu;
+ struct kse *ke;
+
+ pctcpu = 0;
+ ke = td->td_kse;
+ if (ke == NULL)
+ return (0);
+
+ mtx_lock_spin(&sched_lock);
+ if (ke->ke_ticks) {
+ int rtick;
+
+ /*
+ * Don't update more frequently than twice a second. Allowing
+ * this causes the cpu usage to decay away too quickly due to
+ * rounding errors.
+ */
+ if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
+ ke->ke_ltick < (ticks - (hz / 2)))
+ sched_pctcpu_update(ke);
+ /* How many rtick per second ? */
+ rtick = MIN(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
+ pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
+ }
+
+ ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
+ mtx_unlock_spin(&sched_lock);
+
+ return (pctcpu);
+}
+
+void
+sched_bind(struct thread *td, int cpu)
+{
+ struct kse *ke;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+ ke = td->td_kse;
+ ke->ke_flags |= KEF_BOUND;
+#ifdef SMP
+ ke->ke_cpu = cpu;
+ if (PCPU_GET(cpuid) == cpu)
+ return;
+ mi_switch(SW_VOL, NULL);
+#endif
+}
+
+void
+sched_unbind(struct thread *td)
+{
+ mtx_assert(&sched_lock, MA_OWNED);
+ td->td_kse->ke_flags &= ~KEF_BOUND;
+}
+
+int
+sched_is_bound(struct thread *td)
+{
+ mtx_assert(&sched_lock, MA_OWNED);
+ return (td->td_kse->ke_flags & KEF_BOUND);
+}
+
+int
+sched_load(void)
+{
+ return (sched_tdcnt);
+}
+
+void
+sched_relinquish(struct thread *td)
+{
+ struct ksegrp *kg;
+
+ kg = td->td_ksegrp;
+ mtx_lock_spin(&sched_lock);
+ if (sched_is_timeshare(kg)) {
+ sched_prio(td, PRI_MAX_TIMESHARE);
+ td->td_kse->ke_flags |= KEF_NEXTRQ;
+ }
+ mi_switch(SW_VOL, NULL);
+ mtx_unlock_spin(&sched_lock);
+}
+
+int
+sched_sizeof_ksegrp(void)
+{
+ return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
+}
+
+int
+sched_sizeof_proc(void)
+{
+ return (sizeof(struct proc));
+}
+
+int
+sched_sizeof_thread(void)
+{
+ return (sizeof(struct thread) + sizeof(struct td_sched));
+}
+#define KERN_SWITCH_INCLUDE 1
+#include "kern/kern_switch.c"
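To make the priority and interactivity macros near the top of sched_core.c easier to follow, here is a worked example of the arithmetic. It assumes hz = 1000, so def_timeslice = 100 ticks and MAX_SLEEP_TIME = 1000 ticks; the numbers follow directly from the macro definitions and are only illustrative:

	/*
	 * MAX_SCORE = 40 * 25 / 100 = 10
	 *
	 * nice 0 thread that slept 0.5 s:
	 *	CURRENT_SCORE  = 10 * 500 / 1000 = 5
	 *	sched_calc_pri = (PUSER + 20) - (5 - 10/2) = PUSER + 20
	 *
	 * nice 0 thread that slept ~1 s (the maximum):
	 *	CURRENT_SCORE  = 10, so the priority is boosted to PUSER + 15
	 *
	 * nice 0 interactivity threshold:
	 *	INTERACTIVE_SCORE = 0 * 10 / 40 + 2 = 2, so the thread counts
	 *	as interactive once kg_user_pri <= PUSER + 18
	 */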
Index: ./sys/kern/sched_4bsd.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/sched_4bsd.c,v
retrieving revision 1.77.2.1
diff -u -r1.77.2.1 sched_4bsd.c
--- ./sys/kern/sched_4bsd.c 16 Jun 2006 22:11:55 -0000 1.77.2.1
+++ ./sys/kern/sched_4bsd.c 24 Oct 2006 09:45:51 -0000
@@ -1354,6 +1354,19 @@
return (td->td_kse->ke_flags & KEF_BOUND);
}
+void
+sched_relinquish(struct thread *td)
+{
+ struct ksegrp *kg;
+
+ kg = td->td_ksegrp;
+ mtx_lock_spin(&sched_lock);
+ if (kg->kg_pri_class == PRI_TIMESHARE)
+ sched_prio(td, PRI_MAX_TIMESHARE);
+ mi_switch(SW_VOL, NULL);
+ mtx_unlock_spin(&sched_lock);
+}
+
int
sched_load(void)
{
@@ -1386,5 +1399,10 @@
return (0);
}
+
+void
+sched_tick(void)
+{
+}
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"
Index: ./sys/kern/kern_switch.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_switch.c,v
retrieving revision 1.116.2.1
diff -u -r1.116.2.1 kern_switch.c
--- ./sys/kern/kern_switch.c 6 Aug 2005 03:06:25 -0000 1.116.2.1
+++ ./sys/kern/kern_switch.c 24 Oct 2006 09:45:54 -0000
@@ -303,7 +303,12 @@
if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
/* We only care about the kse in the run queue. */
td->td_priority = newpri;
- if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
+#ifndef SCHED_CORE
+ if (ke->ke_rqindex != (newpri / RQ_PPQ))
+#else
+ if (ke->ke_rqindex != newpri)
+#endif
+ {
sched_rem(td);
sched_add(td, SRQ_BORING);
}
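The #ifdef above is needed because sched_4bsd and ULE hash thread priorities into RQ_PPQ-wide buckets of a 64-entry runq, while the CORE run queues defined in sched_core.c keep one list per priority (KQ_NQS is 256) and store the priority itself in ke_rqindex. A small illustration, assuming the stock RQ_PPQ value of 4:

	int newpri   = 181;			/* example priority */
	int idx_4bsd = newpri / RQ_PPQ;		/* 4BSD/ULE: 181 / 4 -> queue 45 */
	int idx_core = newpri;			/* CORE: one queue per priority -> queue 181 */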
Index: ./sys/kern/kern_clock.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.178.2.3
diff -u -r1.178.2.3 kern_clock.c
--- ./sys/kern/kern_clock.c 10 Mar 2006 19:37:33 -0000 1.178.2.3
+++ ./sys/kern/kern_clock.c 24 Oct 2006 09:45:59 -0000
@@ -204,6 +204,7 @@
* Run current process's virtual and profile time, as needed.
*/
mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
+ sched_tick();
if (p->p_flag & P_SA) {
/* XXXKSE What to do? */
} else {
Index: ./sys/kern/kern_tc.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_tc.c,v
retrieving revision 1.164
diff -u -r1.164 kern_tc.c
--- ./sys/kern/kern_tc.c 26 Mar 2005 20:04:28 -0000 1.164
+++ ./sys/kern/kern_tc.c 24 Oct 2006 09:46:02 -0000
@@ -8,7 +8,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.164 2005/03/26 20:04:28 phk Exp $");
+__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/kern_tc.c,v 1.177 2006/08/04 07:56:35 yar Exp $");
#include "opt_ntp.h"
@@ -61,7 +61,7 @@
struct timehands *th_next;
};
-extern struct timehands th0;
+static struct timehands th0;
static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0};
static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9};
static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8};
@@ -88,7 +88,7 @@
static struct timecounter *timecounters = &dummy_timecounter;
time_t time_second = 1;
-time_t time_uptime = 0;
+time_t time_uptime = 1;
static struct bintime boottimebin;
struct timeval boottime;
@@ -97,6 +97,7 @@
NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");
SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");
static int timestepwarnings;
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
@@ -116,6 +117,7 @@
#undef TC_STATS
static void tc_windup(void);
+static void cpu_tick_calibrate(int);
static int
sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
@@ -131,6 +133,27 @@
#endif
return SYSCTL_OUT(req, &boottime, sizeof(boottime));
}
+
+static int
+sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
+{
+ u_int ncount;
+ struct timecounter *tc = arg1;
+
+ ncount = tc->tc_get_timecount(tc);
+ return sysctl_handle_int(oidp, &ncount, sizeof(ncount), req);
+}
+
+static int
+sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
+{
+ u_int64_t freq;
+ struct timecounter *tc = arg1;
+
+ freq = tc->tc_frequency;
+ return sysctl_handle_int(oidp, &freq, sizeof(freq), req);
+}
+
/*
* Return the difference between the timehands' counter value now and what
* was when we copied it to the timehands' offset_count.
@@ -307,6 +330,7 @@
tc_init(struct timecounter *tc)
{
u_int u;
+ struct sysctl_oid *tc_root;
u = tc->tc_frequency / tc->tc_counter_mask;
/* XXX: We need some margin here, 10% is a guess */
@@ -328,6 +352,24 @@
tc->tc_next = timecounters;
timecounters = tc;
/*
+ * Set up sysctl tree for this counter.
+ */
+ tc_root = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
+ CTLFLAG_RW, 0, "timecounter description");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
+ "mask for implemented bits");
+ SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
+ sysctl_kern_timecounter_get, "IU", "current timecounter value");
+ SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "frequency", CTLTYPE_QUAD | CTLFLAG_RD, tc, sizeof(*tc),
+ sysctl_kern_timecounter_freq, "IU", "timecounter frequency");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
+ "goodness of time counter");
+ /*
* Never automatically use a timecounter with negative quality.
* Even though we run on the dummy counter, switching here may be
* worse since this timecounter may not be monotonous.
@@ -360,12 +402,14 @@
void
tc_setclock(struct timespec *ts)
{
- struct timespec ts2;
+ struct timespec tbef, taft;
struct bintime bt, bt2;
+ cpu_tick_calibrate(1);
nsetclock++;
- binuptime(&bt2);
+ nanotime(&tbef);
timespec2bintime(ts, &bt);
+ binuptime(&bt2);
bintime_sub(&bt, &bt2);
bintime_add(&bt2, &boottimebin);
boottimebin = bt;
@@ -373,12 +417,15 @@
/* XXX fiddle all the little crinkly bits around the fiords... */
tc_windup();
+ nanotime(&taft);
if (timestepwarnings) {
- bintime2timespec(&bt2, &ts2);
- log(LOG_INFO, "Time stepped from %jd.%09ld to %jd.%09ld\n",
- (intmax_t)ts2.tv_sec, ts2.tv_nsec,
+ log(LOG_INFO,
+ "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
+ (intmax_t)tbef.tv_sec, tbef.tv_nsec,
+ (intmax_t)taft.tv_sec, taft.tv_nsec,
(intmax_t)ts->tv_sec, ts->tv_nsec);
}
+ cpu_tick_calibrate(1);
}
/*
@@ -475,8 +522,8 @@
* x = a * 2^32 / 10^9 = a * 4.294967296
*
* The range of th_adjustment is +/- 5000PPM so inside a 64bit int
- * we can only multiply by about 850 without overflowing, but that
- * leaves suitably precise fractions for multiply before divide.
+ * we can only multiply by about 850 without overflowing, that
+ * leaves no suitably precise fractions for multiply before divide.
*
* Divide before multiply with a fraction of 2199/512 results in a
* systematic undercompensation of 10PPM of th_adjustment. On a
@@ -749,11 +796,16 @@
tc_ticktock(void)
{
static int count;
+ static time_t last_calib;
if (++count < tc_tick)
return;
count = 0;
tc_windup();
+ if (time_uptime != last_calib && !(time_uptime & 0xf)) {
+ cpu_tick_calibrate(0);
+ last_calib = time_uptime;
+ }
}
static void
@@ -782,3 +834,147 @@
}
SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL)
+
+/* Cpu tick handling -------------------------------------------------*/
+
+static int cpu_tick_variable;
+static uint64_t cpu_tick_frequency;
+
+static uint64_t
+tc_cpu_ticks(void)
+{
+ static uint64_t base;
+ static unsigned last;
+ unsigned u;
+ struct timecounter *tc;
+
+ tc = timehands->th_counter;
+ u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
+ if (u < last)
+ base += (uint64_t)tc->tc_counter_mask + 1;
+ last = u;
+ return (u + base);
+}
+
+/*
+ * This function gets called every 16 seconds on only one designated
+ * CPU in the system from hardclock() via tc_ticktock().
+ *
+ * Whenever the real time clock is stepped we get called with reset=1
+ * to make sure we handle suspend/resume and similar events correctly.
+ */
+
+static void
+cpu_tick_calibrate(int reset)
+{
+ static uint64_t c_last;
+ uint64_t c_this, c_delta;
+ static struct bintime t_last;
+ struct bintime t_this, t_delta;
+ uint32_t divi;
+
+ if (reset) {
+ /* The clock was stepped, abort & reset */
+ t_last.sec = 0;
+ return;
+ }
+
+ /* we don't calibrate fixed rate cputicks */
+ if (!cpu_tick_variable)
+ return;
+
+ getbinuptime(&t_this);
+ c_this = cpu_ticks();
+ if (t_last.sec != 0) {
+ c_delta = c_this - c_last;
+ t_delta = t_this;
+ bintime_sub(&t_delta, &t_last);
+ /*
+ * Validate that 16 +/- 1/256 seconds passed.
+ * After division by 16 this gives us a precision of
+ * roughly 250PPM which is sufficient
+ */
+ if (t_delta.sec > 16 || (
+ t_delta.sec == 16 && t_delta.frac >= (0x01LL << 56))) {
+ /* too long */
+ if (bootverbose)
+ printf("%ju.%016jx too long\n",
+ (uintmax_t)t_delta.sec,
+ (uintmax_t)t_delta.frac);
+ } else if (t_delta.sec < 15 ||
+ (t_delta.sec == 15 && t_delta.frac <= (0xffLL << 56))) {
+ /* too short */
+ if (bootverbose)
+ printf("%ju.%016jx too short\n",
+ (uintmax_t)t_delta.sec,
+ (uintmax_t)t_delta.frac);
+ } else {
+ /* just right */
+ /*
+ * Headroom:
+ * 2^(64-20) / 16[s] =
+ * 2^(44) / 16[s] =
+ * 17.592.186.044.416 / 16 =
+ * 1.099.511.627.776 [Hz]
+ */
+ divi = t_delta.sec << 20;
+ divi |= t_delta.frac >> (64 - 20);
+ c_delta <<= 20;
+ c_delta /= divi;
+ if (c_delta > cpu_tick_frequency) {
+ if (0 && bootverbose)
+ printf("cpu_tick increased to %ju Hz\n",
+ c_delta);
+ cpu_tick_frequency = c_delta;
+ }
+ }
+ }
+ c_last = c_this;
+ t_last = t_this;
+}
+
+void
+set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
+{
+
+ if (func == NULL) {
+ cpu_ticks = tc_cpu_ticks;
+ } else {
+ cpu_tick_frequency = freq;
+ cpu_tick_variable = var;
+ cpu_ticks = func;
+ }
+}
+
+uint64_t
+cpu_tickrate(void)
+{
+
+ if (cpu_ticks == tc_cpu_ticks)
+ return (tc_getfrequency());
+ return (cpu_tick_frequency);
+}
+
+/*
+ * We need to be slightly careful converting cputicks to microseconds.
+ * There is plenty of margin in 64 bits of microseconds (half a million
+ * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply
+ * before divide conversion (to retain precision) we find that the
+ * margin shrinks to 1.5 hours (one millionth of 146y).
+ * With a three-pronged approach we never lose significant bits, no
+ * matter what the cputick rate and the length of the time interval are.
+ */
+
+uint64_t
+cputick2usec(uint64_t tick)
+{
+
+ if (tick > 18446744073709551LL) /* floor(2^64 / 1000) */
+ return (tick / (cpu_tickrate() / 1000000LL));
+ else if (tick > 18446744073709LL) /* floor(2^64 / 1000000) */
+ return ((tick * 1000LL) / (cpu_tickrate() / 1000LL));
+ else
+ return ((tick * 1000000LL) / cpu_tickrate());
+}
+
+cpu_tick_f *cpu_ticks = tc_cpu_ticks;
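To illustrate the three-pronged conversion in cputick2usec(), assume a cpu_tickrate() of 2 GHz; which branch runs depends only on how large the tick count has grown:

	/*
	 * tick = 2e9  (about 1 second)  -> last branch:
	 *	2e9 * 1000000 / 2e9            = 1e6 us
	 * tick = 2e13 (about 2.8 hours) -> middle branch:
	 *	(2e13 * 1000) / (2e9 / 1000)   = 1e10 us
	 * tick = 2e16 (about 116 days)  -> first branch:
	 *	2e16 / (2e9 / 1000000)         = 1e13 us
	 *
	 * Each range keeps the intermediate product below 2^64 while losing
	 * no more precision than the comment above allows for.
	 */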
Index: ./sys/kern/kern_synch.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.270.2.6
diff -u -r1.270.2.6 kern_synch.c
--- ./sys/kern/kern_synch.c 6 Jul 2006 08:32:50 -0000 1.270.2.6
+++ ./sys/kern/kern_synch.c 24 Oct 2006 09:46:06 -0000
@@ -537,14 +537,7 @@
int
yield(struct thread *td, struct yield_args *uap)
{
- struct ksegrp *kg;
-
- kg = td->td_ksegrp;
mtx_assert(&Giant, MA_NOTOWNED);
- mtx_lock_spin(&sched_lock);
- sched_prio(td, PRI_MAX_TIMESHARE);
- mi_switch(SW_VOL, NULL);
- mtx_unlock_spin(&sched_lock);
- td->td_retval[0] = 0;
+ sched_relinquish(td);
return (0);
}
Index: ./sys/conf/options
===================================================================
RCS file: /home/ncvs/src/sys/conf/options,v
retrieving revision 1.510.2.19
diff -u -r1.510.2.19 options
--- ./sys/conf/options 2 Sep 2006 13:12:08 -0000 1.510.2.19
+++ ./sys/conf/options 24 Oct 2006 09:46:09 -0000
@@ -127,6 +127,7 @@
PUC_FASTINTR opt_puc.h
QUOTA
SCHED_4BSD opt_sched.h
+SCHED_CORE opt_sched.h
SCHED_ULE opt_sched.h
SHOW_BUSYBUFS
SLEEPQUEUE_PROFILING
Index: ./sys/conf/files
===================================================================
RCS file: /home/ncvs/src/sys/conf/files,v
retrieving revision 1.1031.2.45
diff -u -r1.1031.2.45 files
--- ./sys/conf/files 21 Oct 2006 05:28:50 -0000 1.1031.2.45
+++ ./sys/conf/files 24 Oct 2006 09:46:12 -0000
@@ -1302,6 +1302,7 @@
kern/md4c.c optional netsmb
kern/md5c.c standard
kern/sched_4bsd.c optional sched_4bsd
+kern/sched_core.c optional sched_core
kern/sched_ule.c optional sched_ule
kern/subr_autoconf.c standard
kern/subr_blist.c standard
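With these options and files entries in place, the new scheduler is selected at kernel config time just like the existing ones. A RELENG_6 config would carry a line along these lines in place of the usual SCHED_4BSD entry (only one SCHED_* option should be enabled at a time):

	options 	SCHED_CORE		# CORE scheduler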
Index: ./sys/posix4/ksched.c
===================================================================
RCS file: /home/ncvs/src/sys/posix4/ksched.c,v
retrieving revision 1.26
diff -u -r1.26 ksched.c
--- ./sys/posix4/ksched.c 7 Jan 2005 02:29:19 -0000 1.26
+++ ./sys/posix4/ksched.c 24 Oct 2006 09:46:15 -0000
@@ -246,9 +246,7 @@
*/
int ksched_yield(register_t *ret, struct ksched *ksched)
{
- mtx_lock_spin(&sched_lock);
- curthread->td_flags |= TDF_NEEDRESCHED;
- mtx_unlock_spin(&sched_lock);
+ sched_relinquish(curthread);
return 0;
}