svn commit: r238907 - projects/calloutng/sys/kern

Davide Italiano davide at freebsd.org
Mon Jul 30 14:17:22 UTC 2012


On Mon, Jul 30, 2012 at 4:02 PM, Attilio Rao <attilio at freebsd.org> wrote:
> On 7/30/12, Davide Italiano <davide at freebsd.org> wrote:
>> Author: davide
>> Date: Mon Jul 30 13:50:37 2012
>> New Revision: 238907
>> URL: http://svn.freebsd.org/changeset/base/238907
>>
>> Log:
>>   - Fix a LOR deadlock by dropping the callout lock while executing the
>>     handler directly from hardware interrupt context.
>>
>>   - Add migration support for callouts which run from hw interrupt
>>     context.
>>
>>   - Refactor a couple of comments to reflect the new world order.
>>
>>   TODO: implement proper stats for direct callouts.
>>
>>   Just a note: I'd like to thank flo@ for his invaluable help in testing
>>   and reporting issues, mav@ who helped me tackle the problem, and Yahoo!,
>>   which provided access to the machines on which the tests were run.
>>
>>   Reported by:                avg, flo [1]
>> Discussed with:             mav
>>
>> Modified:
>>   projects/calloutng/sys/kern/kern_timeout.c
>>
>> Modified: projects/calloutng/sys/kern/kern_timeout.c
>> ==============================================================================
>> --- projects/calloutng/sys/kern/kern_timeout.c        Mon Jul 30 12:25:20 2012  (r238906)
>> +++ projects/calloutng/sys/kern/kern_timeout.c        Mon Jul 30 13:50:37 2012  (r238907)
>> @@ -91,45 +91,62 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpca
>>  int callwheelsize, callwheelmask;
>>
>>  /*
>> - * The callout cpu migration entity represents informations necessary for
>> - * describing the migrating callout to the new callout cpu.
>> + * The callout cpu exec entities represent informations necessary for
>> + * describing the state of callouts currently running on the CPU and the ones
>> + * necessary for migrating callouts to the new callout cpu. In particular,
>> + * the first entry of the array cc_exec_entity holds informations for callout
>> + * running in SWI thread context, while the second one holds informations
>> + * for callout running directly from hardware interrupt context.
>>   * The cached informations are very important for deferring migration when
>>   * the migrating callout is already running.
>>   */
>> -struct cc_mig_ent {
>> +struct cc_exec {
>> +     struct callout          *cc_next;
>> +     struct callout          *cc_curr;
>>  #ifdef SMP
>>       void                    (*ce_migration_func)(void *);
>>       void                    *ce_migration_arg;
>>       int                     ce_migration_cpu;
>>       struct bintime          ce_migration_time;
>>  #endif
>> +     int                     cc_cancel;
>> +     int                     cc_waiting;
>>  };
>>
>>  /*
>> - * There is one struct callout_cpu per cpu, holding all relevant
>> + * There is one struct callou_cpu per cpu, holding all relevant
>>   * state for the callout processing thread on the individual CPU.
>>   */
>>  struct callout_cpu {
>> -     struct cc_mig_ent       cc_migrating_entity;
>> +     struct cc_exec          cc_exec_entity[2];
>>       struct mtx              cc_lock;
>>       struct callout          *cc_callout;
>>       struct callout_tailq    *cc_callwheel;
>>       struct callout_tailq    cc_expireq;
>>       struct callout_list     cc_callfree;
>> -     struct callout          *cc_next;
>> -     struct callout          *cc_curr;
>>       struct bintime          cc_firstevent;
>>       struct bintime          cc_lastscan;
>>       void                    *cc_cookie;
>> -     int                     cc_cancel;
>> -     int                     cc_waiting;
>>  };
>>
>> +#define      cc_exec_curr            cc_exec_entity[0].cc_curr
>> +#define      cc_exec_next            cc_exec_entity[0].cc_next
>> +#define      cc_exec_cancel          cc_exec_entity[0].cc_cancel
>> +#define      cc_exec_waiting         cc_exec_entity[0].cc_waiting
>> +#define      cc_exec_curr_dir        cc_exec_entity[1].cc_curr
>> +#define      cc_exec_next_dir        cc_exec_entity[1].cc_next
>> +#define      cc_exec_cancel_dir      cc_exec_entity[1].cc_cancel
>> +#define      cc_exec_waiting_dir     cc_exec_entity[1].cc_waiting
>> +
>>  #ifdef SMP
>> -#define      cc_migration_func       cc_migrating_entity.ce_migration_func
>> -#define      cc_migration_arg        cc_migrating_entity.ce_migration_arg
>> -#define      cc_migration_cpu        cc_migrating_entity.ce_migration_cpu
>> -#define      cc_migration_time       cc_migrating_entity.ce_migration_time
>> +#define      cc_migration_func       cc_exec_entity[0].ce_migration_func
>> +#define      cc_migration_arg        cc_exec_entity[0].ce_migration_arg
>> +#define      cc_migration_cpu        cc_exec_entity[0].ce_migration_cpu
>> +#define      cc_migration_time       cc_exec_entity[0].ce_migration_time
>> +#define      cc_migration_func_dir   cc_exec_entity[1].ce_migration_func
>> +#define      cc_migration_arg_dir    cc_exec_entity[1].ce_migration_arg
>> +#define      cc_migration_cpu_dir    cc_exec_entity[1].ce_migration_cpu
>> +#define      cc_migration_time_dir   cc_exec_entity[1].ce_migration_time
>>
>>  struct callout_cpu cc_cpu[MAXCPU];
>>  #define      CPUBLOCK        MAXCPU
>> @@ -156,6 +173,9 @@ struct callout_cpu cc_cpu;
>>
>>  static int timeout_cpu;
>>  void (*callout_new_inserted)(int cpu, struct bintime bt) = NULL;
>> +static struct callout *
>> +softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
>> +    int *lockcalls, int *gcalls, int direct);
>>
>>  static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
>>
>> @@ -180,15 +200,18 @@ static MALLOC_DEFINE(M_CALLOUT, "callout
>>   * Resets the migration entity tied to a specific callout cpu.
>>   */
>>  static void
>> -cc_cme_cleanup(struct callout_cpu *cc)
>> +cc_cme_cleanup(struct callout_cpu *cc, int direct)
>>  {
>> -
>> +
>> +     cc->cc_exec_entity[direct].cc_curr = NULL;
>> +     cc->cc_exec_entity[direct].cc_next = NULL;
>> +     cc->cc_exec_entity[direct].cc_cancel = 0;
>> +     cc->cc_exec_entity[direct].cc_waiting = 0;
>>  #ifdef SMP
>> -     cc->cc_migration_cpu = CPUBLOCK;
>> -     cc->cc_migration_time.sec = 0;
>> -     cc->cc_migration_time.frac = 0;
>> -     cc->cc_migration_func = NULL;
>> -     cc->cc_migration_arg = NULL;
>> +     cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
>> +     bintime_clear(&cc->cc_exec_entity[direct].ce_migration_time);
>> +     cc->cc_exec_entity[direct].ce_migration_func = NULL;
>> +     cc->cc_exec_entity[direct].ce_migration_arg = NULL;
>>  #endif
>>  }
>>
>> @@ -196,11 +219,12 @@ cc_cme_cleanup(struct callout_cpu *cc)
>>   * Checks if migration is requested by a specific callout cpu.
>>   */
>>  static int
>> -cc_cme_migrating(struct callout_cpu *cc)
>> +cc_cme_migrating(struct callout_cpu *cc, int direct)
>>  {
>>
>>  #ifdef SMP
>> -     return (cc->cc_migration_cpu != CPUBLOCK);
>> +
>> +     return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
>>  #else
>>       return (0);
>>  #endif
>> @@ -246,7 +270,8 @@ callout_cpu_init(struct callout_cpu *cc)
>>               TAILQ_INIT(&cc->cc_callwheel[i]);
>>       }
>>       TAILQ_INIT(&cc->cc_expireq);
>> -     cc_cme_cleanup(cc);
>> +     for (i = 0; i < 2; i++)
>> +             cc_cme_cleanup(cc, i);
>>       if (cc->cc_callout == NULL)
>>               return;
>>       for (i = 0; i < ncallout; i++) {
>> @@ -355,7 +380,8 @@ callout_process(struct bintime *now)
>>       struct callout *tmp;
>>       struct callout_cpu *cc;
>>       struct callout_tailq *sc;
>> -     int cpu, first, future, last, need_softclock;
>> +     int cpu, first, future, gcalls, mpcalls, last, lockcalls,
>> +         need_softclock;
>>
>>       /*
>>        * Process callouts at a very low cpu priority, so we don't keep the
>> @@ -376,7 +402,8 @@ callout_process(struct bintime *now)
>>       first &= callwheelmask;
>>       for (;;) {
>>               sc = &cc->cc_callwheel[first];
>> -             TAILQ_FOREACH(tmp, sc, c_links.tqe) {
>> +             tmp = TAILQ_FIRST(sc);
>> +             while (tmp != NULL) {
>>                       next = tmp->c_time;
>>                       bintime_sub(&next, &tmp->c_precision);
>>                       if (bintime_cmp(&next, now, <=)) {
>> @@ -385,17 +412,20 @@ callout_process(struct bintime *now)
>>                                * directly from hardware interrupt context.
>>                                */
>>                               if (tmp->c_flags & CALLOUT_DIRECT) {
>> -                                     tmp->c_func(tmp->c_arg);
>>                                       TAILQ_REMOVE(sc, tmp, c_links.tqe);
>> -                                     tmp->c_flags &= ~CALLOUT_PENDING;
>> +                                     tmp = softclock_call_cc(tmp, cc,
>> +                                         &mpcalls, &lockcalls, &gcalls, 1);
>>                               } else {
>>                                       TAILQ_INSERT_TAIL(&cc->cc_expireq,
>>                                           tmp, c_staiter);
>>                                       TAILQ_REMOVE(sc, tmp, c_links.tqe);
>>                                       tmp->c_flags |= CALLOUT_PROCESSED;
>>                                       need_softclock = 1;
>> +                                     tmp = TAILQ_NEXT(tmp, c_links.tqe);
>>                               }
>>                       }
>> +                     else
>> +                             tmp = TAILQ_NEXT(tmp, c_links.tqe);
>>               }
>>               if (first == last)
>>                       break;
>> @@ -561,11 +591,12 @@ callout_cc_add(struct callout *c, struct
>>  }
>>
>>  static void
>> -callout_cc_del(struct callout *c, struct callout_cpu *cc)
>> +callout_cc_del(struct callout *c, struct callout_cpu *cc, int direct)
>>  {
>> -
>> -     if (cc->cc_next == c)
>> -             cc->cc_next = TAILQ_NEXT(c, c_staiter);
>> +     if (direct && cc->cc_exec_next_dir == c)
>> +             cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
>> +     else if (!direct && cc->cc_exec_next == c)
>> +             cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>>       if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
>>               c->c_func = NULL;
>>               SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
>> @@ -574,13 +605,13 @@ callout_cc_del(struct callout *c, struct
>>
>>  static struct callout *
>>  softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
>> -    int *lockcalls, int *gcalls)
>> +    int *lockcalls, int *gcalls, int direct)
>>  {
>>       void (*c_func)(void *);
>>       void *c_arg;
>>       struct lock_class *class;
>>       struct lock_object *c_lock;
>> -     int c_flags, sharedlock;
>> +     int c_flags, flags, sharedlock;
>>  #ifdef SMP
>>       struct callout_cpu *new_cc;
>>       void (*new_func)(void *);
>> @@ -595,7 +626,14 @@ softclock_call_cc(struct callout *c, str
>>       static timeout_t *lastfunc;
>>  #endif
>>
>> -     cc->cc_next = TAILQ_NEXT(c, c_staiter);
>> +     if (direct)
>> +             cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
>> +     else {
>> +             if ((c->c_flags & CALLOUT_PROCESSED) == 0)
>> +                     cc->cc_exec_next = TAILQ_NEXT(c, c_links.tqe);
>> +             else
>> +                     cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>> +     }
>>       class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
>>       sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
>>       c_lock = c->c_lock;
>> @@ -606,8 +644,8 @@ softclock_call_cc(struct callout *c, str
>>               c->c_flags = CALLOUT_LOCAL_ALLOC;
>>       else
>>               c->c_flags &= ~CALLOUT_PENDING;
>> -     cc->cc_curr = c;
>> -     cc->cc_cancel = 0;
>> +     cc->cc_exec_entity[direct].cc_curr = c;
>> +     cc->cc_exec_entity[direct].cc_cancel = 0;
>>       CC_UNLOCK(cc);
>>       if (c_lock != NULL) {
>>               class->lc_lock(c_lock, sharedlock);
>> @@ -615,13 +653,12 @@ softclock_call_cc(struct callout *c, str
>>                * The callout may have been cancelled
>>                * while we switched locks.
>>                */
>> -             if (cc->cc_cancel) {
>> +             if (cc->cc_exec_entity[direct].cc_cancel) {
>>                       class->lc_unlock(c_lock);
>>                       goto skip;
>>               }
>>               /* The callout cannot be stopped now. */
>> -             cc->cc_cancel = 1;
>> -
>> +             cc->cc_exec_entity[direct].cc_cancel = 1;
>>               if (c_lock == &Giant.lock_object) {
>>                       (*gcalls)++;
>>                       CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
>> @@ -639,11 +676,13 @@ softclock_call_cc(struct callout *c, str
>>  #ifdef DIAGNOSTIC
>>       binuptime(&bt1);
>>  #endif
>> -     THREAD_NO_SLEEPING();
>> +     if (!direct)
>> +             THREAD_NO_SLEEPING();
>>       SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
>>       c_func(c_arg);
>>       SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
>> -     THREAD_SLEEPING_OK();
>> +     if (!direct)
>> +             THREAD_SLEEPING_OK();
>
> I didn't look into the full patch; however, I have a question about
> this: was it necessary because this now runs in interrupt context, so
> it can trip the nested-sleeping check? Or is there something else?
> I think it would be good to keep THREAD_NO_SLEEPING() in the direct
> dispatch case as well.
>
> Attilio
>
>
> --
> Peace can only be achieved by understanding - A. Einstein

Thanks for the comment, Attilio.
Yes, it's exactly what you thought. If the direct flag is equal to one,
you know you are processing a callout that runs directly from hardware
interrupt context. In that case the running thread cannot sleep, and it
is likely that the TDP_NOSLEEPING flag is already set, so the KASSERT()
in THREAD_NO_SLEEPING() fails and the kernel panics if it is compiled
with INVARIANTS.
When running from SWI context (direct equal to zero) the code remains
the same as before.
I think what I'm doing works because of the assumption that the running
thread never sleeps. Do you suggest some other way to handle this?
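
For reference, here is a minimal sketch of the flag-based check involved,
simplified from how the THREAD_NO_SLEEPING()/THREAD_SLEEPING_OK() pair in
sys/proc.h looked around this time; the exact definitions and assertion
messages may differ:

/*
 * Simplified sketch (not the verbatim sys/proc.h definitions): the
 * macro refuses to nest.  When a handler is dispatched directly from
 * hardware interrupt context, it runs on top of whatever thread was
 * interrupted; if that thread already has TDP_NOSLEEPING set, the
 * KASSERT() below fires and panics a kernel built with INVARIANTS.
 */
#define THREAD_NO_SLEEPING() do {                                       \
        KASSERT((curthread->td_pflags & TDP_NOSLEEPING) == 0,           \
            ("nested no sleeping"));                                    \
        curthread->td_pflags |= TDP_NOSLEEPING;                         \
} while (0)

#define THREAD_SLEEPING_OK() do {                                       \
        KASSERT((curthread->td_pflags & TDP_NOSLEEPING) != 0,           \
            ("nested sleeping ok"));                                    \
        curthread->td_pflags &= ~TDP_NOSLEEPING;                        \
} while (0)

Skipping the pair when direct is non-zero avoids asserting on the
borrowed thread's flag, while the SWI path keeps the check as before.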

Davide

