"process slock" vs. "scrlock" lock order

Bruce Evans brde at optusnet.com.au
Sat Oct 2 20:03:26 UTC 2010


On Sat, 2 Oct 2010, Kostik Belousov wrote:

> On Sat, Oct 02, 2010 at 12:41:07AM -0700, Don Lewis wrote:
>> The hard coded lock order list in subr_witness.c has "scrlock" listed
>> before "process slock".  This causes a lock order reversal when
>> calcru1(), which requires "process slock" to be held, calls printf() to
>> report unexpected runtime problems.  The call to printf() eventually
>> gets into the console code which locks "scrlock".  This normally isn't
>> slock" for the duration of the printf() calls.
>> ...
>> Thoughts?
>
> Yes, printing from under a spinlock is somewhat epidemic. Moving the printf
> out of the process slock looks like the right solution.

No, it is shooting the messenger.  printf() (and console functions)
may be called with any locks held (except ones related to printf and
console functions themselves, and even those must be blown open for
printfs from panics and possibly from debuggers (if you have a reentrant
debugger)).
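
For reference, the hard-coded order Don is talking about lives in the
static order_lists[] table in subr_witness.c.  A sketch of the relevant
shape (the real table has many more entries, and I am quoting the two
names from Don's mail, not from the source):

% /*
%  * Sketch only: the spin-lock section of order_lists[] in
%  * subr_witness.c.  Earlier entries must be acquired before later
%  * ones, so taking "scrlock" while already holding "process slock"
%  * is reported as a reversal.
%  */
% static struct witness_order_list_entry order_lists[] = {
% 	{ "scrlock", &lock_class_mtx_spin },
% 	{ "process slock", &lock_class_mtx_spin },
% 	{ NULL, NULL },
% 	{ NULL, NULL }
% };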

> On the other hand, all
> calcru() callers unlock the slock immediately after calcru(), and calcru1()
> is sometimes called with only the thread lock held, not the process slock.
>
> I propose the following refinement; it does not need to relock the process
> slock at all. Let's drop the slock in calcru() and do the necessary print
> after that. No need to reacquire the slock.

This might be cleaner for other reasons.

> diff --git a/sys/compat/linux/linux_misc.c b/sys/compat/linux/linux_misc.c
> index d2cf6b6..6a599f6 100644
> --- a/sys/compat/linux/linux_misc.c
> +++ b/sys/compat/linux/linux_misc.c
> @@ -691,7 +691,6 @@ linux_times(struct thread *td, struct linux_times_args *args)
> 		PROC_LOCK(p);
> 		PROC_SLOCK(p);
> 		calcru(p, &utime, &stime);
> -		PROC_SUNLOCK(p);
> 		calccru(p, &cutime, &cstime);
> 		PROC_UNLOCK(p);
>

It is clean to remove lots of these.

> diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
> index ec2d6b6..13cc50c 100644
> --- a/sys/kern/kern_resource.c
> +++ b/sys/kern/kern_resource.c
> @@ -72,8 +72,15 @@ static struct rwlock uihashtbl_lock;
> static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
> static u_long uihash;		/* size of hash table - 1 */
>
> +struct calcru1_warn {
> +	int64_t neg_runtime;
> +	int64_t new_runtime;
> +	int64_t old_runtime;
> +};
> +
> static void	calcru1(struct proc *p, struct rusage_ext *ruxp,
> -		    struct timeval *up, struct timeval *sp);
> +		    struct timeval *up, struct timeval *sp,
> +		    struct calcru1_warn *w);
> static int	donice(struct thread *td, struct proc *chgp, int n);
> static struct uidinfo *uilookup(uid_t uid);
> static void	ruxagg_locked(struct rusage_ext *rux, struct thread *td);
> @@ -797,6 +804,20 @@ getrlimit(td, uap)
> 	return (error);
> }
>
> +static void
> +print_calcru1_warn(struct proc *p, const struct calcru1_warn *w)
> +{
> +
> +	if (w->neg_runtime > 0)
> +		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
> +		    (intmax_t)w->neg_runtime, p->p_pid, p->p_comm);
> +	if (w->new_runtime > 0)
> +		printf("calcru: runtime went backwards from %ju usec "
> +		    "to %ju usec for pid %d (%s)\n",
> +		    (uintmax_t)w->old_runtime, (uintmax_t)w->new_runtime,
> +		    p->p_pid, p->p_comm);
> +}
> +
> /*
>  * Transform the running time and tick information for children of proc p
>  * into user and system time usage.
> @@ -807,24 +828,33 @@ calccru(p, up, sp)
> 	struct timeval *up;
> 	struct timeval *sp;
> {
> +	struct calcru1_warn w;
>
> 	PROC_LOCK_ASSERT(p, MA_OWNED);
> -	calcru1(p, &p->p_crux, up, sp);
> +	bzero(&w, sizeof(w));
> +	calcru1(p, &p->p_crux, up, sp, &w);
> +	print_calcru1_warn(p, &w);
> }
>
> /*
>  * Transform the running time and tick information in proc p into user
>  * and system time usage.  If appropriate, include the current time slice
>  * on this CPU.
> + *
> + * The process slock must be held on entry; it is unlocked before
> + * the function returns.
>  */
> void
> calcru(struct proc *p, struct timeval *up, struct timeval *sp)
> {
> 	struct thread *td;
> 	uint64_t u;
> +	struct calcru1_warn w;
>
> 	PROC_LOCK_ASSERT(p, MA_OWNED);
> 	PROC_SLOCK_ASSERT(p, MA_OWNED);
> +
> +	bzero(&w, sizeof(w));
> 	/*
> 	 * If we are getting stats for the current process, then add in the
> 	 * stats that this thread has accumulated in its current time slice.
> @@ -843,12 +873,14 @@ calcru(struct proc *p, struct timeval *up, struct timeval *sp)
> 			continue;
> 		ruxagg(p, td);
> 	}
> -	calcru1(p, &p->p_rux, up, sp);
> +	calcru1(p, &p->p_rux, up, sp, &w);
> +	PROC_SUNLOCK(p);
> +	print_calcru1_warn(p, &w);
> }
>
> static void
> calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
> -    struct timeval *sp)
> +    struct timeval *sp, struct calcru1_warn *w)
> {
> 	/* {user, system, interrupt, total} {ticks, usec}: */
> 	uint64_t ut, uu, st, su, it, tt, tu;
> @@ -865,8 +897,7 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
> 	tu = cputick2usec(ruxp->rux_runtime);
> 	if ((int64_t)tu < 0) {
> 		/* XXX: this should be an assert /phk */
> -		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
> -		    (intmax_t)tu, p->p_pid, p->p_comm);
> +		w->neg_runtime = tu;
> 		tu = ruxp->rux_tu;
> 	}
>
> @@ -903,10 +934,8 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
> 		 * serious, so lets keep it and hope laptops can be made
> 		 * more truthful about their CPU speed via ACPI.
> 		 */
> -		printf("calcru: runtime went backwards from %ju usec "
> -		    "to %ju usec for pid %d (%s)\n",
> -		    (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
> -		    p->p_pid, p->p_comm);
> +		w->new_runtime = tu;
> +		w->old_runtime = ruxp->rux_tu;
> 		uu = (tu * ut) / tt;
> 		su = (tu * st) / tt;
> 	}
> @@ -946,6 +975,7 @@ kern_getrusage(struct thread *td, int who, struct rusage *rup)
> {
> 	struct proc *p;
> 	int error;
> +	struct calcru1_warn w;
>
> 	error = 0;
> 	p = td->td_proc;
> @@ -962,13 +992,15 @@ kern_getrusage(struct thread *td, int who, struct rusage *rup)
> 		break;
>
> 	case RUSAGE_THREAD:
> +		bzero(&w, sizeof(w));
> 		PROC_SLOCK(p);
> 		ruxagg(p, td);
> 		PROC_SUNLOCK(p);
> 		thread_lock(td);
> 		*rup = td->td_ru;
> -		calcru1(p, &td->td_rux, &rup->ru_utime, &rup->ru_stime);
> +		calcru1(p, &td->td_rux, &rup->ru_utime, &rup->ru_stime, &w);
> 		thread_unlock(td);
> +		print_calcru1_warn(p, &w);
> 		break;
>
> 	default:
> @@ -1069,7 +1101,6 @@ rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
> 	PROC_SLOCK(p);
> 	rufetch(p, ru);
> 	calcru(p, up, sp);
> -	PROC_SUNLOCK(p);
> }
>
> /*

Not clean to add mounds of code to defer a couple of normal printfs.  I
think the only relationship of calcru() to the problem is that it has
a printf that is actually executed quite often.  Just about any printf
within a locked region may become a messenger for the problem if the
printf is actually executed.
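
To make the pattern concrete, here is an illustration (not proposed
code; report_runtime() is a made-up name) of the kind of diagnostic
that becomes a messenger:

% /*
%  * Illustration only: a printf issued while a spin lock is held.
%  * printf() eventually reaches the console driver, which for syscons
%  * takes "scrlock", so witness sees the reversal whenever the message
%  * actually fires.
%  */
% static void
% report_runtime(struct proc *p)
% {
% 	uint64_t tu;
%
% 	PROC_SLOCK(p);
% 	tu = cputick2usec(p->p_rux.rux_runtime);
% 	if ((int64_t)tu < 0)
% 		printf("negative runtime for pid %d\n", p->p_pid);
% 	PROC_SUNLOCK(p);
% }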

To see lots more bugs in console drivers, put printfs in lots of critical
places and arrange for them to be executed frequently.  Ones near malloc
might be good.  I once used the one in the following timeout handler to
demonstrate the missing locking in syscons:

% /* Handle returned by timeout(9), used to reschedule the handler. */
% static struct callout_handle timeout_handle;
%
% static void
% foo(void *arg)
% {
% #if 0
%     sccnputc(0, '*');
%     timeout_handle = timeout(foo, NULL, 1);
% #else
%     /*
%      * Fills up log if done every tick so only do it every 10 ticks and
%      * wait a bit longer for races.
%      */
%     printf("*");
%     timeout_handle = timeout(foo, NULL, 10);
% #endif
% }

This printf can contend with write(2) or perhaps another printf.  Panics
were easiest to demonstrate with write(2).  Timeouts can easily interrupt
write(2), so the above printf contended with write(2) any time the
timeout was scheduled while syscons was active with write(2).  (Some
console drivers have locking to prevent the contention, but they must
be careful about deadlock.  The printf cannot be blocked.)  Giant
locking reduced this problem a bit.  It makes the above timeout handler
Giant-locked.  Syscons remains Giant-locked, so there is enough locking
to prevent contention from the above, but the above printf is broken
(it blocks).  The blocking doesn't matter here, but it would in a more
critical context.  More critical contexts wouldn't be Giant-locked
anyway, so they would contend.  I never got around to changing the
above to be an MPSAFE callout handler so that Giant locking doesn't
help.  It is in fact not MPSAFE, but only because the console driver
is not even UPSAFE.  You can see how old the above is from its non-KNF
style, which I once preferred.
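
For completeness, a rough sketch (untested) of the MPSAFE version of
the handler using the callout(9) interface; foo_callout and foo_init()
are just names for the sketch:

% static struct callout foo_callout;
%
% static void
% foo(void *arg)
% {
%
% 	printf("*");
% 	callout_reset(&foo_callout, 10, foo, NULL);
% }
%
% static void
% foo_init(void)
% {
%
% 	/* CALLOUT_MPSAFE: run the handler without Giant. */
% 	callout_init(&foo_callout, CALLOUT_MPSAFE);
% 	callout_reset(&foo_callout, 10, foo, NULL);
% }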

Bruce

