svn commit: r238907 - projects/calloutng/sys/kern
Attilio Rao
attilio at freebsd.org
Mon Jul 30 14:24:28 UTC 2012
On 7/30/12, Davide Italiano <davide at freebsd.org> wrote:
> On Mon, Jul 30, 2012 at 4:02 PM, Attilio Rao <attilio at freebsd.org> wrote:
>> On 7/30/12, Davide Italiano <davide at freebsd.org> wrote:
>>> Author: davide
>>> Date: Mon Jul 30 13:50:37 2012
>>> New Revision: 238907
>>> URL: http://svn.freebsd.org/changeset/base/238907
>>>
>>> Log:
>>> - Fix a LOR deadlock dropping the callout lock while executing the
>>> handler
>>> directly from hardware interrupt context.
>>>
>>> - Add migration support for callouts which runs from hw interrupt
>>> context.
>>>
>>> - Refactor a couple of comments to reflect the new world order.
>>>
>>> TODO: implement proper stats for direct callouts.
>>>
>>> Just a note: I'd like to thank flo@ for his invaluable help in testing
>>> and
>>> reporting issues, mav@ who helped me in tackling the problem and Yahoo!
>>> which provided access to the machines on which tests were run.
>>>
>>> Reported by: avg, flo [1]
>>> Discussed with: mav
>>>
>>> Modified:
>>> projects/calloutng/sys/kern/kern_timeout.c
>>>
>>> Modified: projects/calloutng/sys/kern/kern_timeout.c
>>> ==============================================================================
>>> --- projects/calloutng/sys/kern/kern_timeout.c Mon Jul 30
>>> 12:25:20
>>> 2012 (r238906)
>>> +++ projects/calloutng/sys/kern/kern_timeout.c Mon Jul 30
>>> 13:50:37
>>> 2012 (r238907)
>>> @@ -91,45 +91,62 @@ SYSCTL_INT(_debug, OID_AUTO, to_avg_mpca
>>> int callwheelsize, callwheelmask;
>>>
>>> /*
>>> - * The callout cpu migration entity represents informations necessary
>>> for
>>> - * describing the migrating callout to the new callout cpu.
>>> + * The callout cpu exec entities represent informations necessary for
>>> + * describing the state of callouts currently running on the CPU and
>>> the
>>> ones
>>> + * necessary for migrating callouts to the new callout cpu. In
>>> particular,
>>> + * the first entry of the array cc_exec_entity holds informations for
>>> callout
>>> + * running in SWI thread context, while the second one holds
>>> informations
>>> + * for callout running directly from hardware interrupt context.
>>> * The cached informations are very important for deferring migration
>>> when
>>> * the migrating callout is already running.
>>> */
>>> -struct cc_mig_ent {
>>> +struct cc_exec {
>>> + struct callout *cc_next;
>>> + struct callout *cc_curr;
>>> #ifdef SMP
>>> void (*ce_migration_func)(void *);
>>> void *ce_migration_arg;
>>> int ce_migration_cpu;
>>> struct bintime ce_migration_time;
>>> #endif
>>> + int cc_cancel;
>>> + int cc_waiting;
>>> };
>>>
>>> /*
>>> - * There is one struct callout_cpu per cpu, holding all relevant
>>> + * There is one struct callou_cpu per cpu, holding all relevant
>>> * state for the callout processing thread on the individual CPU.
>>> */
>>> struct callout_cpu {
>>> - struct cc_mig_ent cc_migrating_entity;
>>> + struct cc_exec cc_exec_entity[2];
>>> struct mtx cc_lock;
>>> struct callout *cc_callout;
>>> struct callout_tailq *cc_callwheel;
>>> struct callout_tailq cc_expireq;
>>> struct callout_list cc_callfree;
>>> - struct callout *cc_next;
>>> - struct callout *cc_curr;
>>> struct bintime cc_firstevent;
>>> struct bintime cc_lastscan;
>>> void *cc_cookie;
>>> - int cc_cancel;
>>> - int cc_waiting;
>>> };
>>>
>>> +#define cc_exec_curr cc_exec_entity[0].cc_curr
>>> +#define cc_exec_next cc_exec_entity[0].cc_next
>>> +#define cc_exec_cancel cc_exec_entity[0].cc_cancel
>>> +#define cc_exec_waiting cc_exec_entity[0].cc_waiting
>>> +#define cc_exec_curr_dir cc_exec_entity[1].cc_curr
>>> +#define cc_exec_next_dir cc_exec_entity[1].cc_next
>>> +#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel
>>> +#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting
>>> +
>>> #ifdef SMP
>>> -#define cc_migration_func
>>> cc_migrating_entity.ce_migration_func
>>> -#define cc_migration_arg
>>> cc_migrating_entity.ce_migration_arg
>>> -#define cc_migration_cpu
>>> cc_migrating_entity.ce_migration_cpu
>>> -#define cc_migration_time
>>> cc_migrating_entity.ce_migration_time
>>> +#define cc_migration_func
>>> cc_exec_entity[0].ce_migration_func
>>> +#define cc_migration_arg cc_exec_entity[0].ce_migration_arg
>>> +#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu
>>> +#define cc_migration_time
>>> cc_exec_entity[0].ce_migration_time
>>> +#define cc_migration_func_dir
>>> cc_exec_entity[1].ce_migration_func
>>> +#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg
>>> +#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu
>>> +#define cc_migration_time_dir
>>> cc_exec_entity[1].ce_migration_time
>>>
>>> struct callout_cpu cc_cpu[MAXCPU];
>>> #define CPUBLOCK MAXCPU
>>> @@ -156,6 +173,9 @@ struct callout_cpu cc_cpu;
>>>
>>> static int timeout_cpu;
>>> void (*callout_new_inserted)(int cpu, struct bintime bt) = NULL;
>>> +static struct callout *
>>> +softclock_call_cc(struct callout *c, struct callout_cpu *cc, int
>>> *mpcalls,
>>> + int *lockcalls, int *gcalls, int direct);
>>>
>>> static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
>>>
>>> @@ -180,15 +200,18 @@ static MALLOC_DEFINE(M_CALLOUT, "callout
>>> * Resets the migration entity tied to a specific callout cpu.
>>> */
>>> static void
>>> -cc_cme_cleanup(struct callout_cpu *cc)
>>> +cc_cme_cleanup(struct callout_cpu *cc, int direct)
>>> {
>>> -
>>> +
>>> + cc->cc_exec_entity[direct].cc_curr = NULL;
>>> + cc->cc_exec_entity[direct].cc_next = NULL;
>>> + cc->cc_exec_entity[direct].cc_cancel = 0;
>>> + cc->cc_exec_entity[direct].cc_waiting = 0;
>>> #ifdef SMP
>>> - cc->cc_migration_cpu = CPUBLOCK;
>>> - cc->cc_migration_time.sec = 0;
>>> - cc->cc_migration_time.frac = 0;
>>> - cc->cc_migration_func = NULL;
>>> - cc->cc_migration_arg = NULL;
>>> + cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
>>> + bintime_clear(&cc->cc_exec_entity[direct].ce_migration_time);
>>> + cc->cc_exec_entity[direct].ce_migration_func = NULL;
>>> + cc->cc_exec_entity[direct].ce_migration_arg = NULL;
>>> #endif
>>> }
>>>
>>> @@ -196,11 +219,12 @@ cc_cme_cleanup(struct callout_cpu *cc)
>>> * Checks if migration is requested by a specific callout cpu.
>>> */
>>> static int
>>> -cc_cme_migrating(struct callout_cpu *cc)
>>> +cc_cme_migrating(struct callout_cpu *cc, int direct)
>>> {
>>>
>>> #ifdef SMP
>>> - return (cc->cc_migration_cpu != CPUBLOCK);
>>> +
>>> + return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
>>> #else
>>> return (0);
>>> #endif
>>> @@ -246,7 +270,8 @@ callout_cpu_init(struct callout_cpu *cc)
>>> TAILQ_INIT(&cc->cc_callwheel[i]);
>>> }
>>> TAILQ_INIT(&cc->cc_expireq);
>>> - cc_cme_cleanup(cc);
>>> + for (i = 0; i < 2; i++)
>>> + cc_cme_cleanup(cc, i);
>>> if (cc->cc_callout == NULL)
>>> return;
>>> for (i = 0; i < ncallout; i++) {
>>> @@ -355,7 +380,8 @@ callout_process(struct bintime *now)
>>> struct callout *tmp;
>>> struct callout_cpu *cc;
>>> struct callout_tailq *sc;
>>> - int cpu, first, future, last, need_softclock;
>>> + int cpu, first, future, gcalls, mpcalls, last, lockcalls,
>>> + need_softclock;
>>>
>>> /*
>>> * Process callouts at a very low cpu priority, so we don't keep
>>> the
>>> @@ -376,7 +402,8 @@ callout_process(struct bintime *now)
>>> first &= callwheelmask;
>>> for (;;) {
>>> sc = &cc->cc_callwheel[first];
>>> - TAILQ_FOREACH(tmp, sc, c_links.tqe) {
>>> + tmp = TAILQ_FIRST(sc);
>>> + while (tmp != NULL) {
>>> next = tmp->c_time;
>>> bintime_sub(&next, &tmp->c_precision);
>>> if (bintime_cmp(&next, now, <=)) {
>>> @@ -385,17 +412,20 @@ callout_process(struct bintime *now)
>>> * directly from hardware interrupt
>>> context.
>>> */
>>> if (tmp->c_flags & CALLOUT_DIRECT) {
>>> - tmp->c_func(tmp->c_arg);
>>> TAILQ_REMOVE(sc, tmp,
>>> c_links.tqe);
>>> - tmp->c_flags &= ~CALLOUT_PENDING;
>>> + tmp = softclock_call_cc(tmp, cc,
>>> + &mpcalls, &lockcalls, &gcalls,
>>> 1);
>>> } else {
>>> TAILQ_INSERT_TAIL(&cc->cc_expireq,
>>> tmp, c_staiter);
>>> TAILQ_REMOVE(sc, tmp,
>>> c_links.tqe);
>>> tmp->c_flags |= CALLOUT_PROCESSED;
>>> need_softclock = 1;
>>> + tmp = TAILQ_NEXT(tmp,
>>> c_links.tqe);
>>> }
>>> }
>>> + else
>>> + tmp = TAILQ_NEXT(tmp, c_links.tqe);
>>> }
>>> if (first == last)
>>> break;
>>> @@ -561,11 +591,12 @@ callout_cc_add(struct callout *c, struct
>>> }
>>>
>>> static void
>>> -callout_cc_del(struct callout *c, struct callout_cpu *cc)
>>> +callout_cc_del(struct callout *c, struct callout_cpu *cc, int direct)
>>> {
>>> -
>>> - if (cc->cc_next == c)
>>> - cc->cc_next = TAILQ_NEXT(c, c_staiter);
>>> + if (direct && cc->cc_exec_next_dir == c)
>>> + cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
>>> + else if (!direct && cc->cc_exec_next == c)
>>> + cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>>> if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
>>> c->c_func = NULL;
>>> SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
>>> @@ -574,13 +605,13 @@ callout_cc_del(struct callout *c, struct
>>>
>>> static struct callout *
>>> softclock_call_cc(struct callout *c, struct callout_cpu *cc, int
>>> *mpcalls,
>>> - int *lockcalls, int *gcalls)
>>> + int *lockcalls, int *gcalls, int direct)
>>> {
>>> void (*c_func)(void *);
>>> void *c_arg;
>>> struct lock_class *class;
>>> struct lock_object *c_lock;
>>> - int c_flags, sharedlock;
>>> + int c_flags, flags, sharedlock;
>>> #ifdef SMP
>>> struct callout_cpu *new_cc;
>>> void (*new_func)(void *);
>>> @@ -595,7 +626,14 @@ softclock_call_cc(struct callout *c, str
>>> static timeout_t *lastfunc;
>>> #endif
>>>
>>> - cc->cc_next = TAILQ_NEXT(c, c_staiter);
>>> + if (direct)
>>> + cc->cc_exec_next_dir = TAILQ_NEXT(c, c_links.tqe);
>>> + else {
>>> + if ((c->c_flags & CALLOUT_PROCESSED) == 0)
>>> + cc->cc_exec_next = TAILQ_NEXT(c, c_links.tqe);
>>> + else
>>> + cc->cc_exec_next = TAILQ_NEXT(c, c_staiter);
>>> + }
>>> class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
>>> sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
>>> c_lock = c->c_lock;
>>> @@ -606,8 +644,8 @@ softclock_call_cc(struct callout *c, str
>>> c->c_flags = CALLOUT_LOCAL_ALLOC;
>>> else
>>> c->c_flags &= ~CALLOUT_PENDING;
>>> - cc->cc_curr = c;
>>> - cc->cc_cancel = 0;
>>> + cc->cc_exec_entity[direct].cc_curr = c;
>>> + cc->cc_exec_entity[direct].cc_cancel = 0;
>>> CC_UNLOCK(cc);
>>> if (c_lock != NULL) {
>>> class->lc_lock(c_lock, sharedlock);
>>> @@ -615,13 +653,12 @@ softclock_call_cc(struct callout *c, str
>>> * The callout may have been cancelled
>>> * while we switched locks.
>>> */
>>> - if (cc->cc_cancel) {
>>> + if (cc->cc_exec_entity[direct].cc_cancel) {
>>> class->lc_unlock(c_lock);
>>> goto skip;
>>> }
>>> /* The callout cannot be stopped now. */
>>> - cc->cc_cancel = 1;
>>> -
>>> + cc->cc_exec_entity[direct].cc_cancel = 1;
>>> if (c_lock == &Giant.lock_object) {
>>> (*gcalls)++;
>>> CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
>>> @@ -639,11 +676,13 @@ softclock_call_cc(struct callout *c, str
>>> #ifdef DIAGNOSTIC
>>> binuptime(&bt1);
>>> #endif
>>> - THREAD_NO_SLEEPING();
>>> + if (!direct)
>>> + THREAD_NO_SLEEPING();
>>> SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0,
>>> 0);
>>> c_func(c_arg);
>>> SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
>>> - THREAD_SLEEPING_OK();
>>> + if (!direct)
>>> + THREAD_SLEEPING_OK();
>>
>> I didn't look into the full patch, however I have a question about
>> this: was it necessary because now this is running in interrupt
>> context so it can catch the nested sleeping check case? Or there is
>> something else?
>> I think it would be a good thing to keep up THREAD_NO_SLEEPING() also
>> in the direct dispatch case.
>>
>> Attilio
>>
>>
>> --
>> Peace can only be achieved by understanding - A. Einstein
>
> Thanks for the comment, Attilio.
> Yes, it's exactly what you thought. If direct flag is equal to one
> you're sure you're processing a callout which runs directly from
> hardware interrupt context. In this case, the running thread cannot
> sleep and it's likely you have TDP_NOSLEEPING flags set, failing the
> KASSERT() in THREAD_NO_SLEEPING() and leading to panic if kernel is
> compiled with INVARIANTS.
> In case you're running from SWI context (direct equals to zero) code
> remains the same as before.
> I think what I'm doing works due to the assumption that the running
> thread never sleeps. Do you suggest some other way to handle this?
Possibly the quicker way to do this is to have a way to deal with the
TDP_NOSLEEPING flag in a recursive way, thus implementing the same logic
as VFS_LOCK_GIANT() does, for example.
You will need to change the few callers of THREAD_NO_SLEEPING(), but
the patch should be no longer than 10/15 lines.
Attilio
--
Peace can only be achieved by understanding - A. Einstein
More information about the svn-src-projects
mailing list