"process slock" vs. "scrlock" lock order

Kostik Belousov kostikbel at gmail.com
Sat Oct 2 10:03:03 UTC 2010


On Sat, Oct 02, 2010 at 12:41:07AM -0700, Don Lewis wrote:
> The hard coded lock order list in subr_witness.c has "scrlock" listed
> before "process slock".  This causes a lock order reversal when
> calcru1(), which requires "process slock" to be held, calls printf() to
> report unexpected runtime problems.  The call to printf() eventually
> gets into the console code which locks "scrlock".  This normally isn't
> noticed because both of these are spin locks, and hardly anyone uses
> WITNESS without disabling the checking of spinlocks with
> WITNESS_SKIPSPIN.  If spin lock checking is not disabled, the result is
> a silent reset because witness catches the LOR, which recurses into
> printf(), which ends up causing a panic in cnputs().
> 
> One obvious fix would be to move "scrlock" to a later spot in the list,
> but I suspect the same problem could occur with the "sio" or "uart"
> locks if a serial console is being used.  It might not be possible to
> fix them the same way because there might be cases where they are in the
> input path and get locked before "process slock" or other spin locks
> that can be held when calling printf().
> 
> Another fix for this particular case would be to rearrange the code in
> calcru1() so that the calls to printf() occur after ruxp->rux_* are
> updated and where I assume it would be safe to temporarily drop "process
> slock" for the duration of the printf() calls.
> 
> Thoughts?

Yes, printing from under a spinlock is somewhat epidemic. Moving the printf
out of process slock looks as the right solution. On the other hand, all
calcru() callers unlock slock immediately after calcru(), and calcru1()
sometimes only called with thread lock held, not process slock.

I propose the following refinement, it does not need relock of process slock
at all. Lets drop slock in calcru(), and do neccessary print after that.
No need to reacquire the slock.

diff --git a/sys/compat/linux/linux_misc.c b/sys/compat/linux/linux_misc.c
index d2cf6b6..6a599f6 100644
--- a/sys/compat/linux/linux_misc.c
+++ b/sys/compat/linux/linux_misc.c
@@ -691,7 +691,6 @@ linux_times(struct thread *td, struct linux_times_args *args)
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		calcru(p, &utime, &stime);
-		PROC_SUNLOCK(p);
 		calccru(p, &cutime, &cstime);
 		PROC_UNLOCK(p);
 
diff --git a/sys/compat/svr4/svr4_misc.c b/sys/compat/svr4/svr4_misc.c
index 6f80fe6..554eb44 100644
--- a/sys/compat/svr4/svr4_misc.c
+++ b/sys/compat/svr4/svr4_misc.c
@@ -865,7 +865,6 @@ svr4_sys_times(td, uap)
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 	calcru(p, &utime, &stime);
-	PROC_SUNLOCK(p);
 	calccru(p, &cutime, &cstime);
 	PROC_UNLOCK(p);
 
@@ -1278,7 +1277,6 @@ loop:
 			ru = p->p_ru;
 			PROC_SLOCK(p);
 			calcru(p, &ru.ru_utime, &ru.ru_stime);
-			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 			sx_sunlock(&proctree_lock);
 
@@ -1305,7 +1303,6 @@ loop:
 			ru = p->p_ru;
 			PROC_SLOCK(p);
 			calcru(p, &ru.ru_utime, &ru.ru_stime);
-			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 
 		        if (((uap->options & SVR4_WNOWAIT)) == 0) {
@@ -1329,7 +1326,6 @@ loop:
 			status = SIGCONT;
 			PROC_SLOCK(p);
 			calcru(p, &ru.ru_utime, &ru.ru_stime);
-			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 
 		        if (((uap->options & SVR4_WNOWAIT)) == 0) {
diff --git a/sys/fs/procfs/procfs_status.c b/sys/fs/procfs/procfs_status.c
index 7850504..12f08f6 100644
--- a/sys/fs/procfs/procfs_status.c
+++ b/sys/fs/procfs/procfs_status.c
@@ -125,7 +125,6 @@ procfs_doprocstatus(PFS_FILL_ARGS)
 
 		PROC_SLOCK(p);
 		calcru(p, &ut, &st);
-		PROC_SUNLOCK(p);
 		start = p->p_stats->p_start;
 		timevaladd(&start, &boottime);
 		sbuf_printf(sb, " %jd,%ld %jd,%ld %jd,%ld",
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 8358f75..7819d7b 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -703,8 +703,8 @@ proc_reap(struct thread *td, struct proc *p, int *status, int options,
 	if (rusage) {
 		*rusage = p->p_ru;
 		calcru(p, &rusage->ru_utime, &rusage->ru_stime);
-	}
-	PROC_SUNLOCK(p);
+	} else
+		PROC_SUNLOCK(p);
 	td->td_retval[0] = p->p_pid;
 	if (status)
 		*status = p->p_xstat;	/* convert to int */
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 4899946..fb0be15 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -783,7 +783,6 @@ fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
 		timevaladd(&kp->ki_start, &boottime);
 		PROC_SLOCK(p);
 		calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
-		PROC_SUNLOCK(p);
 		calccru(p, &kp->ki_childutime, &kp->ki_childstime);
 
 		/* Some callers want child-times in a single value */
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index ec2d6b6..13cc50c 100644
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -72,8 +72,15 @@ static struct rwlock uihashtbl_lock;
 static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
 static u_long uihash;		/* size of hash table - 1 */
 
+struct calcru1_warn {
+	int64_t neg_runtime;
+	int64_t new_runtime;
+	int64_t old_runtime;
+};
+
 static void	calcru1(struct proc *p, struct rusage_ext *ruxp,
-		    struct timeval *up, struct timeval *sp);
+		    struct timeval *up, struct timeval *sp,
+		    struct calcru1_warn *w);
 static int	donice(struct thread *td, struct proc *chgp, int n);
 static struct uidinfo *uilookup(uid_t uid);
 static void	ruxagg_locked(struct rusage_ext *rux, struct thread *td);
@@ -797,6 +804,20 @@ getrlimit(td, uap)
 	return (error);
 }
 
+static void
+print_calcru1_warn(struct proc *p, const struct calcru1_warn *w)
+{
+
+	if (w->neg_runtime > 0)
+		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
+		    (intmax_t)w->neg_runtime, p->p_pid, p->p_comm);
+	if (w->new_runtime > 0)
+		printf("calcru: runtime went backwards from %ju usec "
+		    "to %ju usec for pid %d (%s)\n",
+		    (uintmax_t)w->old_runtime, (uintmax_t)w->new_runtime,
+		    p->p_pid, p->p_comm);
+}
+
 /*
  * Transform the running time and tick information for children of proc p
  * into user and system time usage.
@@ -807,24 +828,33 @@ calccru(p, up, sp)
 	struct timeval *up;
 	struct timeval *sp;
 {
+	struct calcru1_warn w;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	calcru1(p, &p->p_crux, up, sp);
+	bzero(&w, sizeof(w));
+	calcru1(p, &p->p_crux, up, sp, &w);
+	print_calcru1_warn(p, &w);
 }
 
 /*
  * Transform the running time and tick information in proc p into user
  * and system time usage.  If appropriate, include the current time slice
  * on this CPU.
+ *
+ * The process slock shall be locked on entry, and it is unlocked
+ * after function returned.
  */
 void
 calcru(struct proc *p, struct timeval *up, struct timeval *sp)
 {
 	struct thread *td;
 	uint64_t u;
+	struct calcru1_warn w;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+	bzero(&w, sizeof(w));
 	/*
 	 * If we are getting stats for the current process, then add in the
 	 * stats that this thread has accumulated in its current time slice.
@@ -843,12 +873,14 @@ calcru(struct proc *p, struct timeval *up, struct timeval *sp)
 			continue;
 		ruxagg(p, td);
 	}
-	calcru1(p, &p->p_rux, up, sp);
+	calcru1(p, &p->p_rux, up, sp, &w);
+	PROC_SUNLOCK(p);
+	print_calcru1_warn(p, &w);
 }
 
 static void
 calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
-    struct timeval *sp)
+    struct timeval *sp, struct calcru1_warn *w)
 {
 	/* {user, system, interrupt, total} {ticks, usec}: */
 	uint64_t ut, uu, st, su, it, tt, tu;
@@ -865,8 +897,7 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
 	tu = cputick2usec(ruxp->rux_runtime);
 	if ((int64_t)tu < 0) {
 		/* XXX: this should be an assert /phk */
-		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
-		    (intmax_t)tu, p->p_pid, p->p_comm);
+		w->neg_runtime = tu;
 		tu = ruxp->rux_tu;
 	}
 
@@ -903,10 +934,8 @@ calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
 		 * serious, so lets keep it and hope laptops can be made
 		 * more truthful about their CPU speed via ACPI.
 		 */
-		printf("calcru: runtime went backwards from %ju usec "
-		    "to %ju usec for pid %d (%s)\n",
-		    (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
-		    p->p_pid, p->p_comm);
+		w->new_runtime = tu;
+		w->old_runtime = ruxp->rux_tu;
 		uu = (tu * ut) / tt;
 		su = (tu * st) / tt;
 	}
@@ -946,6 +975,7 @@ kern_getrusage(struct thread *td, int who, struct rusage *rup)
 {
 	struct proc *p;
 	int error;
+	struct calcru1_warn w;
 
 	error = 0;
 	p = td->td_proc;
@@ -962,13 +992,15 @@ kern_getrusage(struct thread *td, int who, struct rusage *rup)
 		break;
 
 	case RUSAGE_THREAD:
+		bzero(&w, sizeof(w));
 		PROC_SLOCK(p);
 		ruxagg(p, td);
 		PROC_SUNLOCK(p);
 		thread_lock(td);
 		*rup = td->td_ru;
-		calcru1(p, &td->td_rux, &rup->ru_utime, &rup->ru_stime);
+		calcru1(p, &td->td_rux, &rup->ru_utime, &rup->ru_stime, &w);
 		thread_unlock(td);
+		print_calcru1_warn(p, &w);
 		break;
 
 	default:
@@ -1069,7 +1101,6 @@ rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
 	PROC_SLOCK(p);
 	rufetch(p, ru);
 	calcru(p, up, sp);
-	PROC_SUNLOCK(p);
 }
 
 /*
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
index 3aea2bd..d603958 100644
--- a/sys/kern/kern_time.c
+++ b/sys/kern/kern_time.c
@@ -204,7 +204,6 @@ kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		calcru(p, &user, &sys);
-		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
 		break;
@@ -212,7 +211,6 @@ kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		calcru(p, &user, &sys);
-		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		timevaladd(&user, &sys);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 196 bytes
Desc: not available
Url : http://lists.freebsd.org/pipermail/freebsd-arch/attachments/20101002/87ee586e/attachment.pgp


More information about the freebsd-arch mailing list