svn commit: r349029 - head/sys/kern

Fri Jun 14 12:58:46 UTC 2019

On Fri, 14 Jun 2019, Alexander Motin wrote:

> Log:
>  Update td_runtime of running thread on each statclock().
>
>  Normally td_runtime is updated on context switch, but there are some kernel
>  threads that due to high absolute priority may run for many seconds without
>  context switches (yes, that is bad, but that is true), which means their
>  td_runtime was not updated all that time, that made them invisible for top
>  other then as some general CPU usage.
>
>  MFC after:	1 week
>  Sponsored by:	iXsystems, Inc.

This and more is supposed to be done in calcru().  It is also necessary to
adjust for the current timeslice.

I thought that calcru() was fixed, but the fix seems to be only in my
version of FreeBSD-5.2.

The bug seems to be in fill_kinfo_proc_only().  calcru() should update
td_runtime for all threads in the proc (and sum into rux_rutime), but
fill_kinfo_proc_only() uses rux_runtime to initialize ki_runtime before
calling calcru(), so sees a stale value.  This was broken in r132855
(the initialization of ki_runtime was moved too early as part of a fix
for zombies).

Using calcru() for its side effect is a bit magic.  get_thread_cputime()
does this less hackishly for curthread (by adding the timeslice without
updating td_runtime), but is broken for other threads (by using the
stale td_runtime).

td_runtime is unavailable in userland.  top and ps can only see ki_runtime
which is per-process.

Actually, calcru() seems to have the same bug as get_thread_cputime().
This is fixed in my version of FreeBSD-5-2, but is clearly broken in
plain FreeBSD-FreeBSD-5.2, and -current seems to be even more broken
than plain 5.2 -- it is missing the large comment about this bug that
I added in r130858.  -current first updates td_runtime for curthread.
Then it loops over all threads in the process, but does nothing unless
td_incruntime != 0, and when td_incruntime != 0 it updates rux_runtime
using the value in td_incruntime, and this value is stale except for
curthread.

td_runtime is updated in one other place: in rufetchtd(), but this function
has the same bug eas everywhere else -- it only updates the runtimes for
curthread.

Here is my fix for this in FreeBSD-5.2:

XX Index: kern_resource.c
XX ===================================================================
XX RCS file: /home/ncvs/src/sys/kern/kern_resource.c,v
XX retrieving revision 1.140
XX diff -u -2 -r1.140 kern_resource.c
XX --- kern_resource.c	21 Jun 2004 17:46:27 -0000	1.140
XX @@ -702,18 +720,80 @@
XX  	struct timeval *ip;
XX  {
XX -	struct bintime bt, rt;
XX -	struct timeval tv;
XX +	struct bintime bt;
XX +	struct rusage_ext rux;

-current already has most of the infrastructure for this -- I invented
rusage_ext partly to fix this problem, and -current has a more sophisticated
version of rusage_ext.

-current also uses cputicks instead of bintimes.

XX  	struct thread *td;
XX -	/* {user, system, interrupt, total} {ticks, usec}; previous tu: */
XX -	u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
XX -	int problemcase;
XX +	int bt_valid, proc_locked, sched_locked;
XX 
XX -	mtx_assert(&sched_lock, MA_OWNED);
XX -	/* XXX: why spl-protect ?  worst case is an off-by-one report */
XX +	proc_locked = mtx_owned(&p->p_mtx);
XX +	sched_locked = mtx_owned(&sched_lock);
XX +	if (!proc_locked && !sched_locked)
XX +		PROC_LOCK(p);
XX +	if (!sched_locked)
XX +		mtx_lock_spin(&sched_lock);

This also has some restructuring that -current already has in a more
sophisticated way.

XX +	rux = p->p_rux;
XX +	bt_valid = 0;
XX +	FOREACH_THREAD_IN_PROC(p, td) {
XX +		if (TD_IS_RUNNING(td)) {
XX +			/*
XX +			 * Adjust for the current time slice.  This is
XX +			 * important since the adjustment is on the order
XX +			 * of a time quantum, which is much greater than
XX +			 * the precision of binuptime().
XX +			 */
XX +			KASSERT(td->td_oncpu != NOCPU,
XX +			    ("calcru: running thread has no CPU"));
XX +			if (!bt_valid) {
XX +				binuptime(&bt);
XX +				bt_valid = 1;
XX +			}
XX +			bintime_sub(&bt,
XX +			    &pcpu_find(td->td_oncpu)->pc_switchtime);
XX +			if (bt.sec < 0) {
XX +				printf(
XX +    "calcru: negative delta-runtime of %jd sec + %jd wsec for pid %d (%s)\n",
XX +				    (intmax_t)bt.sec, bt.frac,
XX +				    p->p_pid, p->p_comm);
XX +				bt.sec = 0;
XX +				bt.frac = 0;
XX +			}
XX +			bintime_add(&rux.rux_runtime, &bt);
XX +		}
XX +	}
XX +	if (!sched_locked)
XX +		mtx_unlock_spin(&sched_lock);

That is the main part of the fix.  It is necessary to loop over all threads
in the process.

XX +	calcru1(p, &rux, up, sp, ip);
XX +	p->p_rux.rux_uu = rux.rux_uu;
XX +	p->p_rux.rux_su = rux.rux_su;
XX +	p->p_rux.rux_iu = rux.rux_iu;
XX +	if (!proc_locked && !sched_locked)
XX +		PROC_UNLOCK(p);
XX +}
XX +
XX +void
XX +calccru(p, up, sp)
XX +	struct proc *p;
XX +	struct timeval *up;
XX +	struct timeval *sp;
XX +{
XX 
XX -	ut = p->p_uticks;
XX -	st = p->p_sticks;
XX -	it = p->p_iticks;
XX +	PROC_LOCK_ASSERT(p, MA_OWNED);
XX +	calcru1(p, &p->p_crux, up, sp, NULL);
XX +}
XX 
XX +static void
XX +calcru1(p, ruxp, up, sp, ip)
XX +	struct proc *p;
XX +	struct rusage_ext *ruxp;
XX +	struct timeval *up;
XX +	struct timeval *sp;
XX +	struct timeval *ip;
XX +{
XX +	struct timeval tv;
XX +	/* {user, system, interrupt, total} {ticks, usec}; previous tu: */
XX +	u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
XX +
XX +	ut = ruxp->rux_uticks;
XX +	st = ruxp->rux_sticks;
XX +	it = ruxp->rux_iticks;
XX  	tt = ut + st + it;
XX  	if (tt == 0) {
XX @@ -721,42 +801,20 @@
XX  		tt = 1;
XX  	}
XX -	rt = p->p_runtime;
XX -	problemcase = 0;
XX -	FOREACH_THREAD_IN_PROC(p, td) {
XX -		/*
XX -		 * Adjust for the current time slice.  This is actually fairly
XX -		 * important since the error here is on the order of a time
XX -		 * quantum, which is much greater than the sampling error.
XX -		 */
XX -		if (td == curthread) {
XX -			binuptime(&bt);
XX -			bintime_sub(&bt, PCPU_PTR(switchtime));
XX -			bintime_add(&rt, &bt);
XX -		} else if (TD_IS_RUNNING(td)) {
XX -			/*
XX -			 * XXX: this case should add the difference between
XX -			 * the current time and the switch time as above,
XX -			 * but the switch time is inaccessible, so we can't
XX -			 * do the adjustment and will end up with a wrong
XX -			 * runtime.  A previous call with a different
XX -			 * curthread may have obtained a (right or wrong)
XX -			 * runtime that is in advance of ours.  Just set a
XX -			 * flag to avoid warning about this known problem.
XX -			 */
XX -			problemcase = 1;

This is the large comment about the bug.  It was broken by removing it in
r136152.  r136152 was mostly by me to introduce rusage_ext, but it seems
to have been incomplete -- it even removed the adjustment of td_runtime
for curthread.  The latter wasn't restored until 7 years later in r244188.

XX -		}
XX -	}
XX -	bintime2timeval(&rt, &tv);
XX +	bintime2timeval(&ruxp->rux_runtime, &tv);
XX  	tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
XX -	ptu = p->p_uu + p->p_su + p->p_iu;
XX +	ptu = ruxp->rux_uu + ruxp->rux_su + ruxp->rux_iu;
XX  	if (tu < ptu) {
XX -		if (!problemcase)
XX -			printf(
XX -"calcru: runtime went backwards from %ju usec to %ju usec for pid %d (%s)\n",
XX -			    (uintmax_t)ptu, (uintmax_t)tu, p->p_pid, p->p_comm);
XX +		printf(
XX +"calcru1: runtime went backwards from %ju usec (uu = %ju, su = %ju, iu = %ju) to %ju usec for pid %d (%s)\n",
XX +		    (uintmax_t)ptu,
XX +		    (uintmax_t)ruxp->rux_uu, (uintmax_t)ruxp->rux_su,
XX +		    (uintmax_t)ruxp->rux_iu,
XX +		    (uintmax_t)tu, p->p_pid, p->p_comm);
XX  		tu = ptu;
XX  	}
XX  	if ((int64_t)tu < 0) {
XX -		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
XX +		/* XXX p is only needed to print these diagnostics. */
XX +		printf(
XX +		    "calcru1: negative runtime of %jd usec for pid %d (%s)\n",
XX  		    (intmax_t)tu, p->p_pid, p->p_comm);
XX  		tu = ptu;

-current still has many bugs that let the times go backwards.  I don't
remember if 'problemcase' gave negative times, but it still occurs and
gives garbage times.  Userland could do a related negative times check:
assert(ki_runtime >= sum of user, sys and interrupt times) often fails
because ki_runtime uses a stale rux_runtime time while the sum equals
the non-stale rux_runtime (after scaling).

Bruce