svn commit: r238907 - projects/calloutng/sys/kern
Attilio Rao
attilio at freebsd.org
Mon Jul 30 14:24:28 UTC 2012
On 7/30/12, Davide Italiano <davide at freebsd.org> wrote:
> On Mon, Jul 30, 2012 at 4:02 PM, Attilio Rao <attilio at freebsd.org> wrote:
>> On 7/30/12, Davide Italiano <davide at freebsd.org> wrote:
>>> Author: davide
>>> Date: Mon Jul 30 13:50:37 2012
>>> New Revision: 238907
>>> URL: http://svn.freebsd.org/changeset/base/238907
>>>
>>> Log:
>>> - Fix a LOR deadlock dropping the callout lock while executing the
>>> handler
>>> directly from hardware interrupt context.
>>>
>>> - Add migration support for callouts which runs from hw interrupt
>>> context.
>>>
>>> - Refactor a couple of comments to reflect the new world order.
>>>
>>> TODO: implement proper stats for direct callouts.
>>>
>>> Just a note: I'd like to thank flo@ for his invaluable help in testing
>>> and
>>> reporting issues, mav@ who helped me in tackling the problem and Yahoo!
>>> which provided access to the machines on which tests were run.
>>>
>>> Reported by: avg, flo [1]
>>> Discussed with: mav
>>>
>>> Modified:
>>> projects/calloutng/sys/kern/kern_timeout.c
>>>
>>> Modified: projects/calloutng/sys/kern/kern_timeout.c
>>> ==============================================================================
>>> --- projects/calloutng/sys/kern/kern_timeout.c Mon Jul 30
>>> 12:25:20
>>> 2012 (r238906)
>>> +++ projects/calloutng/sys/kern/kern_timeout.c Mon Jul 30
>>> 13:50:37
>>> 2012 (r238907)
>>> @@ -91,45 +91,62 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpca
>>> int callwheelsize, callwheelmask;
>>>
>>> /*
>>> - * The callout cpu migration entity represents informations necessary
>>> for
>>> - * describing the migrating callout to the new callout cpu.
>>> + * The callout cpu exec entities represent informations necessary for
>>> + * describing the state of callouts currently running on the CPU and
>>> the
>>> ones
>>> + * necessary for migrating callouts to the new callout cpu. In
>>> particular,
>>> + * the first entry of the array cc_exec_entity holds informations for
>>> callout
>>> + * running in SWI thread context, while the second one holds
>>> informations
>>> + * for callout running directly from hardware interrupt context.
>>> * The cached informations are very important for deferring migration
>>> when
>>> * the migrating callout is already running.
>>> */
>>> -struct cc_mig_ent {
>>> +struct cc_exec {
>>> + struct callout *cc_next;
>>> + struct callout *cc_curr;
>>> #ifdef SMP
>>> void (*ce_migration_func)(void *);
>>> void *ce_migration_arg;
>>> int ce_migration_cpu;
>>> struct bintime ce_migration_time;
>>> #endif
>>> + int cc_cancel;
>>> + int cc_waiting;
>>> };
>>>
>>> /*
>>> - * There is one struct callout_cpu per cpu, holding all relevant
>>> + * There is one struct callou_cpu per cpu, holding all relevant
>>> * state for the callout processing thread on the individual CPU.
>>> */
>>> struct callout_cpu {
>>> - struct cc_mig_ent cc_migrating_entity;
>>> + struct cc_exec cc_exec_entity[2];
>>> struct mtx cc_lock;
>>> struct callout *cc_callout;
>>> struct callout_tailq *cc_callwheel;
>>> struct callout_tailq cc_expireq;
>>> struct callout_list cc_callfree;
>>> - struct callout *cc_next;
>>> - struct callout *cc_curr;
>>> struct bintime cc_firstevent;
>>> struct bintime cc_lastscan;
>>> void *cc_cookie;
>>> - int cc_cancel;
>>> - int cc_waiting;
>>> };
>>>
>>> +#define cc_exec_curr cc_exec_entity[0].cc_curr
>>> +#define cc_exec_next cc_exec_entity[0].cc_next
>>> +#define cc_exec_cancel cc_exec_entity[0].cc_cancel
>>> +#define cc_exec_waiting cc_exec_entity[0].cc_waiting
>>> +#define cc_exec_curr_dir cc_exec_entity[1].cc_curr
>>> +#define cc_exec_next_dir cc_exec_entity[1].cc_next
>>> +#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel
>>> +#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting
>>> +
>>> #ifdef SMP
>>> -#define cc_migration_func
>>> cc_migrating_entity.ce_migration_func
>>> -#define cc_migration_arg
>>> cc_migrating_entity.ce_migration_arg
>>> -#define cc_migration_cpu
>>> cc_migrating_entity.ce_migration_cpu
>>> -#define cc_migration_time
>>> cc_migrating_entity.ce_migration_time
>>> +#define cc_migration_func
>>> cc_exec_entity[0].ce_migration_func
>>> +#define cc_migration_arg cc_exec_entity[0].ce_migration_arg
>>> +#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu
>>> +#define cc_migration_time
>>> cc_exec_entity[0].ce_migration_time
>>> +#define cc_migration_func_dir
>>> cc_exec_entity[1].ce_migration_func
>>> +#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg
>>> +#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu
>>> +#define cc_migration_time_dir
>>> cc_exec_entity[1].ce_migration_time
>>>
>>> struct callout_cpu cc_cpu[MAXCPU];
>>> #define CPUBLOCK MAXCPU
>>> @@ -156,6 +173,9 @@ struct callout_cpu cc_cpu;
>>>
>>> static int timeout_cpu;
>>> void (*callout_new_inserted)(int cpu, struct bintime bt) = NULL;
>>> +static struct callout *
>>> +softclock_call_cc(struct callout *c, struct callout_cpu *cc, int
>>> *mpcalls,
>>> + int *lockcalls, int *gcalls, int direct);
>>>
>>> static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
>>>
>>> @@ -180,15 +200,18 @@ static MALLOC_DEFINE(M_CALLOUT, "callout
>>> * Resets the migration entity tied to a specific callout cpu.
>>> */
>>> static void
>>> -cc_cme_cleanup(struct callout_cpu *cc)
>>> +cc_cme_cleanup(struct callout_cpu *cc, int direct)
>>> {
>>> -
>>> +
>>> + cc->cc_exec_entity[direct].cc_curr = NULL;
>>> + cc->cc_exec_entity[direct].cc_next = NULL;
>>> + cc->cc_exec_entity[direct].cc_cancel = 0;
>>> + cc->cc_exec_entity[direct].cc_waiting = 0;
>>> #ifdef SMP
>>> - cc->cc_migration_cpu = CPUBLOCK;
>>> - cc->cc_migration_time.sec = 0;
>>> - cc->cc_migration_time.frac = 0;
>>> - cc->cc_migration_func = NULL;
>>> - cc->cc_migration_arg = NULL;
>>> + cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
>>> + bintime_clear(&cc->cc_exec_entity[direct].ce_migration_time);
>>> + cc->cc_exec_entity[direct].ce_migration_func = NULL;
>>> + cc->cc_exec_entity[direct].ce_migration_arg = NULL;
>>> #endif
>>> }
>>>
>>> @@ -196,11 +219,12 @@ cc_cme_cleanup(struct callout_cpu *cc)
>>> * Checks if migration is requested by a specific callout cpu.
>>> */
>>> static int
>>> -cc_cme_migrating(struct callout_cpu *cc)
>>> +cc_cme_migrating(struct callout_cpu *cc, int direct)
>>> {
>>>
>>> #ifdef SMP
>>> - return (cc->cc_migration_cpu != CPUBLOCK);
>>> +
>>> + return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
>>> #else
>>> return (0);
>>> #endif
>>> @@ -246,7 +270,8 @@ callout_cpu_init(struct callout_cpu *cc)
>>> TAILQ_INIT(&cc->cc_callwheel[i]);
>>> }
>>> TAILQ_INIT(&cc->cc_expireq);
>>> - cc_cme_cleanup(cc);
>>> + for (i = 0; i < 2; i++)
>>> + cc_cme_cleanup(cc, i);
>>> if (cc->cc_callout == NULL)
>>> return;
>>> for (i = 0; i < ncallout; i++) {
>>> @@ -355,7 +380,8 @@ callout_process(struct bintime *now)
>>> struct callout *tmp;
>>> struct callout_cpu *cc;
>>> struct callout_tailq *sc;
>>> - int cpu, first, future, last, need_softclock;
>>> + int cpu, first, future, gcalls, mpcalls, last, lockcalls,
>>> + need_softclock;
>>>
>>> /*
>>> * Process callouts at a very low cpu priority, so we don't keep
>>> the
>>> @@ -376,7 +402,8 @@ callout_process(struct bintime *now)
>>> first &= callwheelmask;
>>> for (;;) {
>>> sc = &cc->cc_callwheel[first];
>>> - TAILQ_FOREACH(tmp, sc, c_links.tqe) {
>>> + tmp = TAILQ_FIRST(sc);
>>> + while (tmp != NULL) {
>>> next = tmp->c_time;
>>> bintime_sub(&next, &tmp->c_precision);
>>> if (bintime_cmp(&next, now, <=)) {
>>> @@ -385,17 +412,20 @@ callout_process(struct bintime *now)
>>> * directly from hardware interrupt
>>> context.
>>> */
>>> if (tmp->c_flags & CALLOUT_DIRECT) {
>>> - tmp->c_func(tmp->c_arg);
>>> TAILQ_REMOVE(sc, tmp,
>>> c_links.tqe);
>>> - tmp->c_flags &= ~CALLOUT_PENDING;
>>> + tmp = softclock_call_cc(tmp, cc,
>>> + &mpcalls, &lockcalls, &gcalls,
>>> 1);
>>> } else {
>>> TAILQ_INSERT_TAIL(&cc->cc_expireq,
>>> tmp, c_staiter);
>>> TAILQ_REMOVE(sc, tmp,
>>> c_links.tqe);
>>> tmp->c_flags |= CALLOUT_PROCESSED;
>>> need_softclock = 1;
>>> + tmp = TAILQ_NEXT(tmp,
>>> c_links.tqe);
>>> }
>>> }
>>> + else
>>> + tmp = TAILQ_NEXT(tmp, c_links.tqe);
>>> }
>>> if (first == last)
>>> break;
>>> @@ -561,11 +591,12 @@ callout_cc_add(struct callout *c, struct
>>> }
>>>
>>> static void
>>> -callout_cc_del(struct callout *c, struct callout_cpu *cc)
>>> +callout_cc_del(struct callout *c, struct callout_cpu *cc, int direct)
>>> {
>>> -
>>> - if (cc->cc_next == c)
>>> - cc->cc_next = TAILQ_NEXT(c, c_staiter);
>>> + if (direct && cc->cc_exec_next_dir == c)
>>> + cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
>>> + else if (!direct && cc->cc_exec_next == c)
>>> + cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>>> if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
>>> c->c_func = NULL;
>>> SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
>>> @@ -574,13 +605,13 @@ callout_cc_del(struct callout *c, struct
>>>
>>> static struct callout *
>>> softclock_call_cc(struct callout *c, struct callout_cpu *cc, int
>>> *mpcalls,
>>> - int *lockcalls, int *gcalls)
>>> + int *lockcalls, int *gcalls, int direct)
>>> {
>>> void (*c_func)(void *);
>>> void *c_arg;
>>> struct lock_class *class;
>>> struct lock_object *c_lock;
>>> - int c_flags, sharedlock;
>>> + int c_flags, flags, sharedlock;
>>> #ifdef SMP
>>> struct callout_cpu *new_cc;
>>> void (*new_func)(void *);
>>> @@ -595,7 +626,14 @@ softclock_call_cc(struct callout *c, str
>>> static timeout_t *lastfunc;
>>> #endif
>>>
>>> - cc->cc_next = TAILQ_NEXT(c, c_staiter);
>>> + if (direct)
>>> + cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
>>> + else {
>>> + if ((c->c_flags & CALLOUT_PROCESSED) == 0)
>>> + cc->cc_exec_next = TAILQ_NEXT(c, c_links.tqe);
>>> + else
>>> + cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>>> + }
>>> class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
>>> sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
>>> c_lock = c->c_lock;
>>> @@ -606,8 +644,8 @@ softclock_call_cc(struct callout *c, str
>>> c->c_flags = CALLOUT_LOCAL_ALLOC;
>>> else
>>> c->c_flags &= ~CALLOUT_PENDING;
>>> - cc->cc_curr = c;
>>> - cc->cc_cancel = 0;
>>> + cc->cc_exec_entity[direct].cc_curr = c;
>>> + cc->cc_exec_entity[direct].cc_cancel = 0;
>>> CC_UNLOCK(cc);
>>> if (c_lock != NULL) {
>>> class->lc_lock(c_lock, sharedlock);
>>> @@ -615,13 +653,12 @@ softclock_call_cc(struct callout *c, str
>>> * The callout may have been cancelled
>>> * while we switched locks.
>>> */
>>> - if (cc->cc_cancel) {
>>> + if (cc->cc_exec_entity[direct].cc_cancel) {
>>> class->lc_unlock(c_lock);
>>> goto skip;
>>> }
>>> /* The callout cannot be stopped now. */
>>> - cc->cc_cancel = 1;
>>> -
>>> + cc->cc_exec_entity[direct].cc_cancel = 1;
>>> if (c_lock == &Giant.lock_object) {
>>> (*gcalls)++;
>>> CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
>>> @@ -639,11 +676,13 @@ softclock_call_cc(struct callout *c, str
>>> #ifdef DIAGNOSTIC
>>> binuptime(&bt1);
>>> #endif
>>> - THREAD_NO_SLEEPING();
>>> + if (!direct)
>>> + THREAD_NO_SLEEPING();
>>> SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0,
>>> 0);
>>> c_func(c_arg);
>>> SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
>>> - THREAD_SLEEPING_OK();
>>> + if (!direct)
>>> + THREAD_SLEEPING_OK();
>>
>> I didn't look into the full patch, however I have a question about
>> this: was it necessary because now this is running in interrupt
>> context so it can catch the nested sleeping check case? Or there is
>> something else?
>> I think it would be a good thing to keep up THREAD_NO_SLEEPING() also
>> in the direct dispatch case.
>>
>> Attilio
>>
>>
>> --
>> Peace can only be achieved by understanding - A. Einstein
>
> Thanks for the comment, Attilio.
> Yes, it's exactly what you thought. If direct flag is equal to one
> you're sure you're processing a callout which runs directly from
> hardware interrupt context. In this case, the running thread cannot
> sleep and it's likely you have TDP_NOSLEEPING flags set, failing the
> KASSERT() in THREAD_NO_SLEEPING() and leading to panic if kernel is
> compiled with INVARIANTS.
> In case you're running from SWI context (direct equals to zero) code
> remains the same as before.
> I think what I'm doing works due to the assumption that the running
> thread never sleeps. Do you suggest some other way to handle this?
Possibly the quicker way to do this is to have a way to deal with the
TDP_NOSLEEPING flag in a recursive way, thus implementing the same logic
as VFS_LOCK_GIANT() does, for example.
You will need to change the few callers of THREAD_NO_SLEEPING(), but
the patch should be no longer than 10/15 lines.
Attilio
--
Peace can only be achieved by understanding - A. Einstein
More information about the svn-src-projects
mailing list