PERFORCE change 165014 for review
Prashant Vaibhav
pvaibhav at FreeBSD.org
Tue Jun 23 23:13:34 UTC 2009
http://perforce.freebsd.org/chv.cgi?CH=165014
Change 165014 by pvaibhav at pvaibhav_matrix on 2009/06/23 23:12:45
Syncing branch with local copy. Implemented in this commit are:
1. A generic priority queue implementation based on binary heaps.
   See the comments in sys/bheap.h for more information; a minimal
   sketch of the idea follows below.
2. The callout/timeout scheme, modified to use the new binary
   heap-based priority queue instead of the old "callout wheel"
   data structure.
Not everything has been heavily tested yet, however. Calling for help
with testing on SMP systems.
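For readers who do not want to dig into sys/bheap.h (the header is added
by this change but its contents are not shown in the diff below), here is
a minimal sketch of the kind of array-backed binary min-heap the new code
relies on. The names and layout here are illustrative assumptions, not
the actual MINHEAP_* macros: the real implementation is macro-based and
intrusive, keyed on each callout's c_time.

#include <stddef.h>

struct item {
	int	key;		/* expiry time; smaller = sooner */
	size_t	idx;		/* back-pointer: current slot in the array */
};

struct minheap {
	struct item	**slots;	/* caller-supplied pointer array */
	size_t		size;		/* number of queued items */
};

static void
heap_swap(struct minheap *h, size_t a, size_t b)
{
	struct item *tmp = h->slots[a];

	h->slots[a] = h->slots[b];
	h->slots[b] = tmp;
	h->slots[a]->idx = a;
	h->slots[b]->idx = b;
}

/* Restore the heap property upward from slot i (after an insert). */
static void
heap_sift_up(struct minheap *h, size_t i)
{
	while (i > 0 && h->slots[(i - 1) / 2]->key > h->slots[i]->key) {
		heap_swap(h, (i - 1) / 2, i);
		i = (i - 1) / 2;
	}
}

/* Restore the heap property downward from slot i (after a removal). */
static void
heap_sift_down(struct minheap *h, size_t i)
{
	for (;;) {
		size_t l = 2 * i + 1, r = l + 1, min = i;

		if (l < h->size && h->slots[l]->key < h->slots[min]->key)
			min = l;
		if (r < h->size && h->slots[r]->key < h->slots[min]->key)
			min = r;
		if (min == i)
			break;
		heap_swap(h, i, min);
		i = min;
	}
}

void
heap_insert(struct minheap *h, struct item *it)		/* O(log n) */
{
	h->slots[h->size] = it;
	it->idx = h->size++;
	heap_sift_up(h, it->idx);
}

void
heap_remove(struct minheap *h, struct item *it)		/* O(log n) */
{
	size_t i = it->idx;

	h->size--;
	if (i != h->size) {
		/* Move the last item into the hole, then re-heapify. */
		h->slots[i] = h->slots[h->size];
		h->slots[i]->idx = i;
		heap_sift_up(h, i);
		heap_sift_down(h, i);
	}
}

With this layout, softclock() only ever has to look at slots[0]: it pops
items while slots[0]->key <= curticks and stops at the first item whose
expiry is still in the future, which is exactly the shape of the new
drain loop in kern_timeout.c. The idx back-pointer is what allows
removal of an arbitrary callout (not just the earliest one) in O(log n),
which is the role the new c_qe entry field plays in struct callout below.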
Affected files ...
.. //depot/projects/soc2009/calloutapi/src/sys/kern/kern_timeout.c#2 edit
.. //depot/projects/soc2009/calloutapi/src/sys/sys/bheap.h#1 add
.. //depot/projects/soc2009/calloutapi/src/sys/sys/callout.h#2 edit
Differences ...
==== //depot/projects/soc2009/calloutapi/src/sys/kern/kern_timeout.c#2 (text+ko) ====
@@ -64,9 +64,6 @@
SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0,
"struct callout *");
-static int avg_depth;
-SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
- "Average number of items examined per softclock call. Units = 1/1000");
static int avg_gcalls;
SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
"Average number of Giant callouts made per softclock call. Units = 1/1000");
@@ -80,17 +77,15 @@
* TODO:
* allocate more timeout table slots when table overflows.
*/
-int callwheelsize, callwheelbits, callwheelmask;
-
struct callout_cpu {
struct mtx cc_lock;
- struct callout *cc_callout;
- struct callout_tailq *cc_callwheel;
- struct callout_list cc_callfree;
- struct callout *cc_next;
+ struct callout *cc_callout; /* pointer to local callouts */
+ struct callout** cc_pq_mem; /* memory for the priority Q */
+ struct callout_queue cc_callqueue; /* master priority queue */
+ struct callout_list cc_callfree; /* available local callouts */
struct callout *cc_curr;
void *cc_cookie;
- int cc_softticks;
+ int cc_monoticks; /* monotonic increasing tick */
int cc_cancel;
int cc_waiting;
};
@@ -141,19 +136,15 @@
timeout_cpu = PCPU_GET(cpuid);
cc = CC_CPU(timeout_cpu);
- /*
- * Calculate callout wheel size
- */
- for (callwheelsize = 1, callwheelbits = 0;
- callwheelsize < ncallout;
- callwheelsize <<= 1, ++callwheelbits)
- ;
- callwheelmask = callwheelsize - 1;
+ /* Reserve space for locally allocated callouts (timeout()) */
cc->cc_callout = (struct callout *)v;
v = (caddr_t)(cc->cc_callout + ncallout);
- cc->cc_callwheel = (struct callout_tailq *)v;
- v = (caddr_t)(cc->cc_callwheel + callwheelsize);
+
+ /* Reserve space for callout queue's internal array */
+ cc->cc_pq_mem = (struct callout**)v;
+ v = (caddr_t)(cc->cc_pq_mem + ncallout);
+
return(v);
}
@@ -164,17 +155,26 @@
int i;
mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
+
+ /*
+ * Initialize the priority queue
+ */
+ MINHEAP_INIT(&cc->cc_callqueue, callout, ncallout, cc->cc_pq_mem);
+
+ /* Set monoticks to 0 */
+ cc->cc_monoticks = 0;
+
+ /*
+ * Initialize list of available local callouts
+ */
SLIST_INIT(&cc->cc_callfree);
- for (i = 0; i < callwheelsize; i++) {
- TAILQ_INIT(&cc->cc_callwheel[i]);
- }
if (cc->cc_callout == NULL)
return;
for (i = 0; i < ncallout; i++) {
c = &cc->cc_callout[i];
callout_init(c, 0);
c->c_flags = CALLOUT_LOCAL_ALLOC;
- SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+ SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links);
}
}
@@ -220,8 +220,8 @@
INTR_MPSAFE, &cc->cc_cookie))
panic("died while creating standard software ithreads");
cc->cc_callout = NULL; /* Only cpu0 handles timeout(). */
- cc->cc_callwheel = malloc(
- sizeof(struct callout_tailq) * callwheelsize, M_CALLOUT,
+ cc->cc_pq_mem = (struct callout**) malloc(
+ sizeof(struct callout*) * ncallout, M_CALLOUT,
M_WAITOK);
callout_cpu_init(cc);
}
@@ -235,29 +235,34 @@
{
struct callout_cpu *cc;
int need_softclock;
- int bucket;
/*
* Process callouts at a very low cpu priority, so we don't keep the
* relatively high clock interrupt priority any longer than necessary.
*/
- need_softclock = 0;
cc = CC_SELF();
mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
- for (; (cc->cc_softticks - ticks) < 0; cc->cc_softticks++) {
- bucket = cc->cc_softticks & callwheelmask;
- if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) {
- need_softclock = 1;
- break;
- }
- }
- mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
+
+ cc->cc_monoticks++; /* XXX: Can we just use the global 'ticks'? */
+ need_softclock = 0;
+
+ /*
+ * Check if the first pending callout's time has already arrived (or
+ * has passed). If yes, schedule the softclock() which will then
+ * expire this (and any remaining) callout as appropriate.
+ */
+ if (MINHEAP_EMPTY(&cc->cc_callqueue))
+ need_softclock = 0;
+ else if (MINHEAP_FIRST(&cc->cc_callqueue)->c_time <= cc->cc_monoticks)
+ need_softclock = 1;
/*
- * swi_sched acquires the thread lock, so we don't want to call it
- * with cc_lock held; incorrect locking order.
+ * swi_sched acquires the thread lock, so we don't want to call
+ * it with cc_lock held; incorrect locking order.
*/
- if (need_softclock)
+ if (need_softclock == 1) {
+ mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
swi_sched(cc->cc_cookie, 0);
+ }
}
static struct callout_cpu *
@@ -278,29 +283,29 @@
}
/*
- * The callout mechanism is based on the work of Adam M. Costello and
- * George Varghese, published in a technical report entitled "Redesigning
- * the BSD Callout and Timer Facilities" and modified slightly for inclusion
- * in FreeBSD by Justin T. Gibbs. The original work on the data structures
- * used in this implementation was published by G. Varghese and T. Lauck in
- * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
- * the Efficient Implementation of a Timer Facility" in the Proceedings of
- * the 11th ACM Annual Symposium on Operating Systems Principles,
- * Austin, Texas Nov 1987.
+ * The callout mechanism is a work-in-progress, based on the idea of using
+ * binary heap based priority queues containing callouts to expire in the
+ * future. The expiry time is (currently) specified as the number of ticks
+ * of the hardware clock (HZ/sec). This is an absolute value.
+ *
+ * The softclock() function checks that the first pending (ie. highest priority
+ * callout) in the queue has an expiry time that has either arrived or already
+ * passed. If yes, that callout is handled. Following this, the callout is
+ * removed from the queue and the next callout is checked in a similar fashion,
+ * until the expiry time of the next-in-line callout is in the future.
+ *
+ * For more information, ref: http://wiki.freebsd.org/SOC2009PrashantVaibhav
*/
/*
* Software (low priority) clock interrupt.
- * Run periodic events from timeout queue.
*/
void
softclock(void *arg)
{
struct callout_cpu *cc;
struct callout *c;
- struct callout_tailq *bucket;
int curticks;
- int steps; /* #steps since we last allowed interrupts */
int depth;
int mpcalls;
int lockcalls;
@@ -320,159 +325,144 @@
lockcalls = 0;
gcalls = 0;
depth = 0;
- steps = 0;
cc = (struct callout_cpu *)arg;
CC_LOCK(cc);
- while (cc->cc_softticks != ticks) {
- /*
- * cc_softticks may be modified by hard clock, so cache
- * it while we work on a given bucket.
- */
- curticks = cc->cc_softticks;
- cc->cc_softticks++;
- bucket = &cc->cc_callwheel[curticks & callwheelmask];
- c = TAILQ_FIRST(bucket);
- while (c) {
- depth++;
- if (c->c_time != curticks) {
- c = TAILQ_NEXT(c, c_links.tqe);
- ++steps;
- if (steps >= MAX_SOFTCLOCK_STEPS) {
- cc->cc_next = c;
- /* Give interrupts a chance. */
- CC_UNLOCK(cc);
- ; /* nothing */
- CC_LOCK(cc);
- c = cc->cc_next;
- steps = 0;
- }
- } else {
- void (*c_func)(void *);
- void *c_arg;
- struct lock_class *class;
- struct lock_object *c_lock;
- int c_flags, sharedlock;
+ /*
+ * ticks may be modified by hard clock, so cache it
+ */
+ curticks = cc->cc_monoticks;
+ CTR1(KTR_CALLOUT, "softclock run at %d", curticks);
+ while (!MINHEAP_EMPTY(&cc->cc_callqueue)) {
+ c = MINHEAP_FIRST(&cc->cc_callqueue);
+ if (c->c_time > curticks) {
+ /*
+ * We've reached a callout whose expiry time is
+ * in the future, so we stop.
+ */
+ break;
+ } else {
+ /*
+ * This callout needs to be expired now
+ */
+ void (*c_func)(void *);
+ void *c_arg;
+ struct lock_class *class;
+ struct lock_object *c_lock;
+ int c_flags, sharedlock;
- cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
- TAILQ_REMOVE(bucket, c, c_links.tqe);
- class = (c->c_lock != NULL) ?
- LOCK_CLASS(c->c_lock) : NULL;
- sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ?
- 0 : 1;
- c_lock = c->c_lock;
- c_func = c->c_func;
- c_arg = c->c_arg;
- c_flags = c->c_flags;
- if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
- c->c_flags = CALLOUT_LOCAL_ALLOC;
- } else {
- c->c_flags =
- (c->c_flags & ~CALLOUT_PENDING);
+ MINHEAP_REMOVE(&cc->cc_callqueue, c, c_qe, c_time);
+ class = (c->c_lock != NULL) ?
+ LOCK_CLASS(c->c_lock) : NULL;
+ sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
+ c_lock = c->c_lock;
+ c_func = c->c_func;
+ c_arg = c->c_arg;
+ c_flags = c->c_flags;
+
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+ c->c_flags = CALLOUT_LOCAL_ALLOC;
+ else
+ c->c_flags = (c->c_flags & ~CALLOUT_PENDING);
+ cc->cc_curr = c;
+ cc->cc_cancel = 0;
+ CC_UNLOCK(cc);
+ if (c_lock != NULL) {
+ class->lc_lock(c_lock, sharedlock);
+ /*
+ * The callout may have been cancelled
+ * while we switched locks.
+ */
+ if (cc->cc_cancel) {
+ class->lc_unlock(c_lock);
+ goto skip;
}
- cc->cc_curr = c;
- cc->cc_cancel = 0;
- CC_UNLOCK(cc);
- if (c_lock != NULL) {
- class->lc_lock(c_lock, sharedlock);
- /*
- * The callout may have been cancelled
- * while we switched locks.
- */
- if (cc->cc_cancel) {
- class->lc_unlock(c_lock);
- goto skip;
- }
- /* The callout cannot be stopped now. */
- cc->cc_cancel = 1;
+ /* The callout cannot be stopped now. */
+ cc->cc_cancel = 1;
- if (c_lock == &Giant.lock_object) {
- gcalls++;
- CTR3(KTR_CALLOUT,
- "callout %p func %p arg %p",
- c, c_func, c_arg);
- } else {
- lockcalls++;
- CTR3(KTR_CALLOUT, "callout lock"
- " %p func %p arg %p",
- c, c_func, c_arg);
- }
+ if (c_lock == &Giant.lock_object) {
+ gcalls++;
+ CTR3(KTR_CALLOUT,
+ "callout %p func %p arg %p",
+ c, c_func, c_arg);
} else {
- mpcalls++;
- CTR3(KTR_CALLOUT,
- "callout mpsafe %p func %p arg %p",
- c, c_func, c_arg);
+ lockcalls++;
+ CTR3(KTR_CALLOUT, "callout lock"
+ " %p func %p arg %p",
+ c, c_func, c_arg);
}
+ } else {
+ mpcalls++;
+ CTR3(KTR_CALLOUT,
+ "callout mpsafe %p func %p arg %p",
+ c, c_func, c_arg);
+ }
#ifdef DIAGNOSTIC
- binuptime(&bt1);
+ binuptime(&bt1);
#endif
- THREAD_NO_SLEEPING();
- SDT_PROBE(callout_execute, kernel, ,
- callout_start, c, 0, 0, 0, 0);
- c_func(c_arg);
- SDT_PROBE(callout_execute, kernel, ,
- callout_end, c, 0, 0, 0, 0);
- THREAD_SLEEPING_OK();
+ THREAD_NO_SLEEPING();
+ SDT_PROBE(callout_execute, kernel, ,
+ callout_start, c, 0, 0, 0, 0);
+ c_func(c_arg);
+ SDT_PROBE(callout_execute, kernel, ,
+ callout_end, c, 0, 0, 0, 0);
+ THREAD_SLEEPING_OK();
#ifdef DIAGNOSTIC
- binuptime(&bt2);
- bintime_sub(&bt2, &bt1);
- if (bt2.frac > maxdt) {
- if (lastfunc != c_func ||
- bt2.frac > maxdt * 2) {
- bintime2timespec(&bt2, &ts2);
- printf(
- "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
- c_func, c_arg,
- (intmax_t)ts2.tv_sec,
- ts2.tv_nsec);
- }
- maxdt = bt2.frac;
- lastfunc = c_func;
+ binuptime(&bt2);
+ bintime_sub(&bt2, &bt1);
+ if (bt2.frac > maxdt) {
+ if (lastfunc != c_func ||
+ bt2.frac > maxdt * 2)
+ {
+ bintime2timespec(&bt2, &ts2);
+ printf(
+ "Expensive timeout(9) function:"
+ "%p(%p) %jd.%09ld s\n",
+ c_func, c_arg,
+ (intmax_t)ts2.tv_sec,
+ ts2.tv_nsec);
}
+ maxdt = bt2.frac;
+ lastfunc = c_func;
+ }
#endif
- CTR1(KTR_CALLOUT, "callout %p finished", c);
- if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
- class->lc_unlock(c_lock);
- skip:
- CC_LOCK(cc);
+ CTR1(KTR_CALLOUT, "callout %p finished", c);
+ if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
+ class->lc_unlock(c_lock);
+skip:
+ CC_LOCK(cc);
+ /*
+ * If the current callout is locally
+ * allocated (from timeout(9))
+ * then put it on the freelist.
+ *
+ * Note: we need to check the cached
+ * copy of c_flags because if it was not
+ * local, then it's not safe to deref the
+ * callout pointer.
+ */
+ if (c_flags & CALLOUT_LOCAL_ALLOC) {
+ KASSERT(c->c_flags == CALLOUT_LOCAL_ALLOC,
+ ("corrupted callout"));
+ c->c_func = NULL;
+ SLIST_INSERT_HEAD(&cc->cc_callfree, c,
+ c_links);
+ }
+ cc->cc_curr = NULL;
+ if (cc->cc_waiting) {
/*
- * If the current callout is locally
- * allocated (from timeout(9))
- * then put it on the freelist.
- *
- * Note: we need to check the cached
- * copy of c_flags because if it was not
- * local, then it's not safe to deref the
- * callout pointer.
+ * There is someone waiting
+ * for the callout to complete.
*/
- if (c_flags & CALLOUT_LOCAL_ALLOC) {
- KASSERT(c->c_flags ==
- CALLOUT_LOCAL_ALLOC,
- ("corrupted callout"));
- c->c_func = NULL;
- SLIST_INSERT_HEAD(&cc->cc_callfree, c,
- c_links.sle);
- }
- cc->cc_curr = NULL;
- if (cc->cc_waiting) {
- /*
- * There is someone waiting
- * for the callout to complete.
- */
- cc->cc_waiting = 0;
- CC_UNLOCK(cc);
- wakeup(&cc->cc_waiting);
- CC_LOCK(cc);
- }
- steps = 0;
- c = cc->cc_next;
+ cc->cc_waiting = 0;
+ CC_UNLOCK(cc);
+ wakeup(&cc->cc_waiting);
+ CC_LOCK(cc);
}
}
- }
- avg_depth += (depth * 1000 - avg_depth) >> 8;
+ };
avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
- cc->cc_next = NULL;
CC_UNLOCK(cc);
}
@@ -509,7 +499,7 @@
if (new == NULL)
/* XXX Attempt to malloc first */
panic("timeout table full");
- SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
+ SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links);
callout_reset(new, to_ticks, ftn, arg);
handle.callout = new;
CC_UNLOCK(cc);
@@ -597,12 +587,10 @@
}
}
if (c->c_flags & CALLOUT_PENDING) {
- if (cc->cc_next == c) {
- cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
- }
- TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c,
- c_links.tqe);
-
+ /* TODO: Just modify the key of existing callouts instead of
+ * removing and re-inserting it
+ */
+ MINHEAP_REMOVE(&cc->cc_callqueue, c, c_qe, c_time);
cancelled = 1;
c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
}
@@ -622,11 +610,12 @@
c->c_arg = arg;
c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
c->c_func = ftn;
- c->c_time = ticks + to_ticks;
- TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask],
- c, c_links.tqe);
- CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d",
- cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks);
+ c->c_time = cc->cc_monoticks + to_ticks;
+
+ MINHEAP_INSERT(&cc->cc_callqueue, c, c_qe, c_time);
+
+ CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p at %d",
+ cancelled ? "re" : "", c, c->c_func, c->c_arg, c->c_time);
CC_UNLOCK(cc);
return (cancelled);
@@ -766,18 +755,14 @@
c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
- if (cc->cc_next == c) {
- cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
- }
- TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c,
- c_links.tqe);
+ MINHEAP_REMOVE(&cc->cc_callqueue, c, c_qe, c_time);
CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
c, c->c_func, c->c_arg);
if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
c->c_func = NULL;
- SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+ SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links);
}
CC_UNLOCK(cc);
return (1);
@@ -838,48 +823,13 @@
adjust_timeout_calltodo(time_change)
struct timeval *time_change;
{
- register struct callout *p;
- unsigned long delta_ticks;
-
- /*
- * How many ticks were we asleep?
- * (stolen from tvtohz()).
- */
-
- /* Don't do anything */
- if (time_change->tv_sec < 0)
- return;
- else if (time_change->tv_sec <= LONG_MAX / 1000000)
- delta_ticks = (time_change->tv_sec * 1000000 +
- time_change->tv_usec + (tick - 1)) / tick + 1;
- else if (time_change->tv_sec <= LONG_MAX / hz)
- delta_ticks = time_change->tv_sec * hz +
- (time_change->tv_usec + (tick - 1)) / tick + 1;
- else
- delta_ticks = LONG_MAX;
-
- if (delta_ticks > INT_MAX)
- delta_ticks = INT_MAX;
-
- /*
- * Now rip through the timer calltodo list looking for timers
- * to expire.
+ /*
+ * pvaibhav: This function is no longer required because all timers
+ * whose expiry times are in the past or present are automatically
+ * going to be expired on wake up when callout_tick() runs and then
+ * subsequently calls softclock().
*/
- /* don't collide with softclock() */
- CC_LOCK(cc);
- for (p = calltodo.c_next; p != NULL; p = p->c_next) {
- p->c_time -= delta_ticks;
-
- /* Break if the timer had more time on it than delta_ticks */
- if (p->c_time > 0)
- break;
-
- /* take back the ticks the timer didn't use (p->c_time <= 0) */
- delta_ticks = -p->c_time;
- }
- CC_UNLOCK(cc);
-
return;
}
#endif /* APM_FIXUP_CALLTODO */
==== //depot/projects/soc2009/calloutapi/src/sys/sys/callout.h#2 (text+ko) ====
@@ -39,23 +39,23 @@
#define _SYS_CALLOUT_H_
#include <sys/queue.h>
+#include <sys/bheap.h>
struct lock_object;
SLIST_HEAD(callout_list, callout);
-TAILQ_HEAD(callout_tailq, callout);
+MINHEAP_HEAD(callout_queue, callout);
+MINHEAP_ENTRY(callout_queue_entry);
struct callout {
- union {
- SLIST_ENTRY(callout) sle;
- TAILQ_ENTRY(callout) tqe;
- } c_links;
int c_time; /* ticks to the event */
void *c_arg; /* function argument */
void (*c_func)(void *); /* function to call */
struct lock_object *c_lock; /* lock to handle */
int c_flags; /* state of this entry */
volatile int c_cpu; /* CPU we're scheduled on */
+ SLIST_ENTRY(callout) c_links;
+ struct callout_queue_entry c_qe; /* data for priority queue */
};
#define CALLOUT_LOCAL_ALLOC 0x0001 /* was allocated from callfree */
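To summarize the new API surface as it is used in this change: since
sys/bheap.h is an added file whose contents do not appear in this diff,
the shapes below are inferred from the call sites in kern_timeout.c and
callout.h above, and should be treated as a summary rather than as the
authoritative interface.

/*
 * Inferred from the call sites in this diff, not quoted from bheap.h.
 */
MINHEAP_HEAD(callout_queue, callout);	/* declare the queue head type */
MINHEAP_ENTRY(callout_queue_entry);	/* declare the per-item entry type */

MINHEAP_INIT(&cc->cc_callqueue, callout, ncallout, cc->cc_pq_mem);
					/* bind queue to a backing array */
MINHEAP_INSERT(&cc->cc_callqueue, c, c_qe, c_time);
					/* enqueue, keyed on c->c_time */
c = MINHEAP_FIRST(&cc->cc_callqueue);	/* peek the earliest expiry */
MINHEAP_REMOVE(&cc->cc_callqueue, c, c_qe, c_time);
					/* remove an arbitrary item */
empty = MINHEAP_EMPTY(&cc->cc_callqueue);	/* no pending callouts? */

Note that MINHEAP_REMOVE takes both the entry field and the key field,
consistent with the sift-up/sift-down removal of an arbitrary element
sketched at the top of this message.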