USB-related problem
Adrian Chadd
adrian at freebsd.org
Tue Nov 10 18:57:07 UTC 2015
+ benno, jeff, kib
cool, so now we know where syncer is hanging. nice work!
That syncer loop that calls it while waiting for the list to be empty is
also a bit suspicious; it looks like it could also get stuck in a loop and
never yield.
-a
On 10 November 2015 at 10:54, Eugene Grosbein <eugen at grosbein.net> wrote:
> On 11.11.2015 01:05, Eugene Grosbein wrote:
>> On 10.11.2015 04:43, Adrian Chadd wrote:
>>> Hi,
>>>
>>> How much C do you know? Are you able to add some debugging to
>>> sched_sync() (which IIRC is the syncer thread) and work out where it's
>>> getting stuck and spinning?
>>
>> I did. I've added new global kernel symbol, variable schedstat
>> so I can check its value using ddb. And it is always equal to 5
>> when the deadlock occurs, revealing the loop
>
> I've dug into it a bit more. I tried to make doubly sure it loops
> and added a loop counter "schedloop", hoping to see some insanely large value.
> But it does not loop this way and schedloop is always 1.
>
> It just sits inside sync_vnode(). So I've added another variable "svnodestate"
> to trace sync_vnode() and svnodestate is 9.
>
> That is, VOP_FSYNC(vp, MNT_LAZY, td) does not return:
>
> --- sys/kern/vfs_subr.c.orig 2015-10-08 23:33:53.935440000 +0700
> +++ /sys/kern/vfs_subr.c 2015-11-11 01:39:09.450390000 +0700
> @@ -1738,217 +1738,272 @@ sysctl_vfs_worklist_len(SYSCTL_HANDLER_A
> return (error);
> }
>
> SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
> sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
>
> static struct proc *updateproc;
> static void sched_sync(void);
> static struct kproc_desc up_kp = {
> "syncer",
> sched_sync,
> &updateproc
> };
> SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
>
> +volatile unsigned svnodestate = 0;
> +
> static int
> sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
> {
> struct vnode *vp;
> struct mount *mp;
>
> + svnodestate = 0;
> *bo = LIST_FIRST(slp);
> if (*bo == NULL)
> return (0);
> +
> + svnodestate = 1;
> vp = (*bo)->__bo_vnode; /* XXX */
> if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
> return (1);
> /*
> * We use vhold in case the vnode does not
> * successfully sync. vhold prevents the vnode from
> * going away when we unlock the sync_mtx so that
> * we can acquire the vnode interlock.
> */
> + svnodestate = 2;
> vholdl(vp);
> + svnodestate = 3;
> mtx_unlock(&sync_mtx);
> VI_UNLOCK(vp);
> + svnodestate = 4;
> if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
> + svnodestate = 5;
> vdrop(vp);
> + svnodestate = 6;
> mtx_lock(&sync_mtx);
> + svnodestate = 7;
> return (*bo == LIST_FIRST(slp));
> }
> + svnodestate = 8;
> vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
> + svnodestate = 9;
> (void) VOP_FSYNC(vp, MNT_LAZY, td);
> + svnodestate = 10;
> VOP_UNLOCK(vp, 0);
> + svnodestate = 11;
> vn_finished_write(mp);
> + svnodestate = 12;
> BO_LOCK(*bo);
> + svnodestate = 13;
> if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
> /*
> * Put us back on the worklist. The worklist
> * routine will remove us from our current
> * position and then add us back in at a later
> * position.
> */
> + svnodestate = 14;
> vn_syncer_add_to_worklist(*bo, syncdelay);
> }
> + svnodestate = 15;
> BO_UNLOCK(*bo);
> + svnodestate = 16;
> vdrop(vp);
> + svnodestate = 17;
> mtx_lock(&sync_mtx);
> + svnodestate = 18;
> return (0);
> }
>
> static int first_printf = 1;
>
> +volatile unsigned schedstate = 0;
> +volatile unsigned schedloop = 0;
> +
> /*
> * System filesystem synchronizer daemon.
> */
> static void
> sched_sync(void)
> {
> struct synclist *next, *slp;
> struct bufobj *bo;
> long starttime;
> struct thread *td = curthread;
> int last_work_seen;
> int net_worklist_len;
> int syncer_final_iter;
> int error;
>
> last_work_seen = 0;
> syncer_final_iter = 0;
> syncer_state = SYNCER_RUNNING;
> starttime = time_uptime;
> td->td_pflags |= TDP_NORUNNINGBUF;
>
> EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
> SHUTDOWN_PRI_LAST);
>
> + schedstate = 0;
> mtx_lock(&sync_mtx);
> for (;;) {
> + schedstate = 1;
> +
> if (syncer_state == SYNCER_FINAL_DELAY &&
> syncer_final_iter == 0) {
> mtx_unlock(&sync_mtx);
> kproc_suspend_check(td->td_proc);
> mtx_lock(&sync_mtx);
> }
> + schedstate = 2;
> +
> net_worklist_len = syncer_worklist_len - sync_vnode_count;
> if (syncer_state != SYNCER_RUNNING &&
> starttime != time_uptime) {
> if (first_printf) {
> printf("\nSyncing disks, vnodes remaining...");
> first_printf = 0;
> }
> printf("%d ", net_worklist_len);
> }
> starttime = time_uptime;
>
> /*
> * Push files whose dirty time has expired. Be careful
> * of interrupt race on slp queue.
> *
> * Skip over empty worklist slots when shutting down.
> */
> do {
> + schedstate = 3;
> slp = &syncer_workitem_pending[syncer_delayno];
> syncer_delayno += 1;
> if (syncer_delayno == syncer_maxdelay)
> syncer_delayno = 0;
> next = &syncer_workitem_pending[syncer_delayno];
> /*
> * If the worklist has wrapped since the
> * it was emptied of all but syncer vnodes,
> * switch to the FINAL_DELAY state and run
> * for one more second.
> */
> if (syncer_state == SYNCER_SHUTTING_DOWN &&
> net_worklist_len == 0 &&
> last_work_seen == syncer_delayno) {
> syncer_state = SYNCER_FINAL_DELAY;
> syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
> }
> } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
> syncer_worklist_len > 0);
>
> + schedstate = 4;
> +
> /*
> * Keep track of the last time there was anything
> * on the worklist other than syncer vnodes.
> * Return to the SHUTTING_DOWN state if any
> * new work appears.
> */
> if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
> last_work_seen = syncer_delayno;
> if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
> syncer_state = SYNCER_SHUTTING_DOWN;
> + schedloop=0;
> while (!LIST_EMPTY(slp)) {
> + schedstate = 5;
> + schedloop++;
> +
> error = sync_vnode(slp, &bo, td);
> if (error == 1) {
> LIST_REMOVE(bo, bo_synclist);
> LIST_INSERT_HEAD(next, bo, bo_synclist);
> continue;
> }
> + schedstate = 6;
>
> if (first_printf == 0) {
> + schedstate = 7;
> /*
> * Drop the sync mutex, because some watchdog
> * drivers need to sleep while patting
> */
> mtx_unlock(&sync_mtx);
> wdog_kern_pat(WD_LASTVAL);
> mtx_lock(&sync_mtx);
> }
>
> + schedstate = 8;
> }
> + schedstate = 9;
> +
> if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
> syncer_final_iter--;
> /*
> * The variable rushjob allows the kernel to speed up the
> * processing of the filesystem syncer process. A rushjob
> * value of N tells the filesystem syncer to process the next
> * N seconds worth of work on its queue ASAP. Currently rushjob
> * is used by the soft update code to speed up the filesystem
> * syncer process when the incore state is getting so far
> * ahead of the disk that the kernel memory pool is being
> * threatened with exhaustion.
> */
> if (rushjob > 0) {
> rushjob -= 1;
> continue;
> }
> /*
> * Just sleep for a short period of time between
> * iterations when shutting down to allow some I/O
> * to happen.
> *
> * If it has taken us less than a second to process the
> * current work, then wait. Otherwise start right over
> * again. We can still lose time if any single round
> * takes more than two seconds, but it does not really
> * matter as we are just trying to generally pace the
> * filesystem activity.
> */
> +
> + schedstate = 10;
> +
> if (syncer_state != SYNCER_RUNNING ||
> time_uptime == starttime) {
> + schedstate = 11;
> thread_lock(td);
> sched_prio(td, PPAUSE);
> thread_unlock(td);
> }
> - if (syncer_state != SYNCER_RUNNING)
> + schedstate = 12;
> +
> + if (syncer_state != SYNCER_RUNNING) {
> + schedstate = 13;
> +
> cv_timedwait(&sync_wakeup, &sync_mtx,
> hz / SYNCER_SHUTDOWN_SPEEDUP);
> - else if (time_uptime == starttime)
> + }
> + else if (time_uptime == starttime) {
> + schedstate = 14;
> cv_timedwait(&sync_wakeup, &sync_mtx, hz);
> + }
> + schedstate = 15;
> }
> + schedstate = 16;
> }
>
> /*
> * Request the syncer daemon to speed up its work.
> * We never push it to speed up more than half of its
> * normal turn time, otherwise it could take over the cpu.
> */
> int
> speedup_syncer(void)
> {
> int ret = 0;
>
> mtx_lock(&sync_mtx);
> if (rushjob < syncdelay / 2) {
> rushjob += 1;
>
More information about the freebsd-mips
mailing list