[PATCH] Deadlock detection in production environments

Attilio Rao attilio at freebsd.org
Fri Aug 15 16:10:41 UTC 2008


Below is a patch which should identify deadlocks (or, better, a class
of them) on systems meant to run in production environments (so with
no additional overhead and no additional debugging options like
WITNESS).

The main idea is to check the time threads have spent sleeping on a
sleepqueue or blocked on a turnstile. If that time exceeds a
deliberately generous threshold, we can assume the threads have
deadlocked (a rough sketch of the check follows the list below). This
seems a very effective way of identifying deadlocks, in particular when
considering the following:
- A LOR can only happen between locks within the same class (for
example, sleepqueue-backed locks (sx, lockmgr), turnstile-backed locks
(mtx, rwlock) and spinlocks).
- Spinlocks already have a way to identify deadlocks among themselves.
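
As a rough illustration, the per-thread check boils down to something
like the following sketch (not the actual patch code; it assumes the
td_blktick field introduced by the patch, the existing td_slptick
field, and thresholds already converted to ticks):

/*
 * Sketch only: flag a thread which has been blocked on a turnstile,
 * or sleeping on a sleepqueue, for longer than the given thresholds.
 * The real code additionally verifies, via sleepq_type(), that the
 * sleepqueue belongs to a lock (sx, lockmgr) and not to a plain
 * wait channel.
 */
static void
check_thread(struct thread *td, int slpticks, int blkticks)
{

	if (TD_ON_LOCK(td) && (ticks - td->td_blktick) > blkticks)
		panic("deadlock: %p blocked on a turnstile too long", td);
	else if (TD_ON_SLEEPQ(td) && (ticks - td->td_slptick) > slpticks)
		panic("deadlock: %p sleeping on a lock too long", td);
}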

As scanning all the threads in the system requires some locking, the
scan is deferred to a dedicated context, the "deadlock resolver", which
periodically (with a 1 second interval) checks the sleep and block
times of every thread and reports any threshold violation.
In order to implement this idea I had to fill some gaps in the basic
support. For example, the sleep tick count was already saved when a
thread entered a sleepqueue, but the turnstiles did not record blocking
ticks; the patch therefore makes the turnstiles save / clear the block
time. There was also no way to tell whether a thread sleeping on a
sleepqueue was waiting on a lock or on a plain wait channel (msleep,
condvars, etc.); to cope with this, I added sleepq_type(), which
returns the type of the wait channel.
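
For example, the resolver can consume sleepq_type() more or less like
this (illustrative helper only, not part of the patch; SLEEPQ_SX and
SLEEPQ_LK are the lock-backed queue types):

/*
 * Sketch only: decide whether a sleeping thread is a deadlock
 * candidate, i.e. whether its wait channel belongs to a sleepable
 * lock (sx, lockmgr) rather than to msleep()/condvars.
 */
static int
wchan_is_lock(void *wchan)
{
	int type;

	type = sleepq_type(wchan);
	if (type == -1)
		return (0);	/* No sleepqueue on this channel anymore. */
	return (type == SLEEPQ_SX || type == SLEEPQ_LK);
}
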
The implementation has been pretty much straightforward except for a
LOR between the thread lock and the sleepqueue chain lock taken by
sleepq_type(); it is now avoided by dropping the thread lock around the
sleepq_type() call and re-validating the thread's state afterwards (see
deadlkres() below). Note that the current default thresholds are 30
minutes for sleepqueues and 15 minutes for turnstiles.

This approach doesn't catch all kinds of deadlocks. For example, it
won't catch the problem of missed wakeups on a wait channel.
Theoretically, we could also check sleepqueues linked to plain wait
channels, but it would not be simple to differentiate between a
voluntary, long sleep (e.g. waiting for user input) and a deadlock. If
you have good ideas about it, please let me know.
A case where this approach will lead to a false positive is buggy
code. For example, a thread that owns an sx lock and then performs a
very long sleep on a wait channel (again, waiting for user input) with
the sx held: the lock should have been dropped before sleeping, and
there is no way to cope with badly written code.
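
A hypothetical sketch of such buggy code (foo_lock and input_ready are
made-up names) would be:

/*
 * Buggy code, sketch only: the sx lock is kept held across an
 * unbounded sleep.  Any other thread contending on foo_lock will
 * then sit on a SLEEPQ_SX sleepqueue past the threshold and be
 * reported as deadlocked, even though the owner is simply waiting
 * for input.
 */
static struct sx foo_lock;
static int input_ready;

static void
buggy_wait_for_input(void)
{

	sx_xlock(&foo_lock);
	while (!input_ready)
		tsleep(&input_ready, PPAUSE, "input", 0);
	sx_xunlock(&foo_lock);
}
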
We should also cope with the global ticks counter overflowing. What
happens if the ticks counter wraps around? Should we add a heuristic in
order to identify such a case and work around it?
For the moment I have left it out.
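
If we wanted to be defensive about it, one possibility (again, just a
sketch, not in the patch) is to do the subtraction in unsigned
arithmetic, which stays correct across a wrap of the counter as long
as the real elapsed time fits in an int:

/*
 * Sketch only: wrap-tolerant elapsed-tick computation.  The unsigned
 * subtraction yields the right difference even if the global ticks
 * counter wrapped in the meantime, provided the real elapsed time is
 * below INT_MAX ticks (about 24 days at hz = 1000).
 */
static __inline int
elapsed_ticks(u_int since)
{

	return ((int)((u_int)ticks - since));
}
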
The final concern is about a possible deadlock of the deadlock
resolver itself when acquiring the allproc_lock.
The solution for this should be mainly MD, I guess, and it should
involve the NMI (or an equivalent concept on architectures other than
IA32). On IA32 systems, for example, we could use an NMI handler,
driven by a watchdog, which performs appropriate checks on the
resolver's run times in order to verify that the resolver itself has
not stopped making progress.
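
To make that a bit more concrete, the MI side could be as simple as the
resolver bumping a timestamp at every iteration, with the MD NMI /
watchdog handler checking that it is not stale (hypothetical sketch,
all names are made up and none of this is in the patch):

/* Updated by the resolver at every loop iteration. */
static volatile u_int deadlkres_lastrun;

static void
deadlkres_heartbeat(void)
{

	deadlkres_lastrun = ticks;
}

/* Called from the MD NMI / watchdog handler. */
static void
deadlkres_watchdog_check(void)
{

	if ((u_int)ticks - deadlkres_lastrun > (u_int)(10 * hz))
		panic("deadlock resolver has not run in the last 10 seconds");
}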

The patch has been reviewed and discussed by jeff@ and has been
developed on behalf of Nokia.

Thanks,
Attilio
==================================
--- /usr/src/sys/kern/kern_clock.c	2008-05-31 23:42:15.000000000 +0200
+++ src/sys/kern/kern_clock.c	2008-08-15 15:34:51.000000000 +0200
@@ -48,14 +48,16 @@
 #include <sys/callout.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
-#include <sys/lock.h>
+#include <sys/kthread.h>
 #include <sys/ktr.h>
+#include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
@@ -159,6 +161,117 @@
 SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD,
     0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics");

+static int slptime_threshold = 1800;
+static int blktime_threshold = 900;
+
+static void
+deadlkres(void)
+{
+	struct proc *p;
+	struct thread *td;
+	void *wchan;
+	int blkticks, slpticks, slptype, tticks;
+
+	for (;;) {
+		blkticks = blktime_threshold * hz;
+		slpticks = slptime_threshold * hz;
+		sx_slock(&allproc_lock);
+		FOREACH_PROC_IN_SYSTEM(p) {
+			PROC_SLOCK(p);
+			FOREACH_THREAD_IN_PROC(p, td) {
+				thread_lock(td);
+				if (TD_ON_LOCK(td)) {
+
+					/*
+					 * The thread should be blocked on a
+					 * turnstile, so just check if the
+					 * turnstile channel is in good state.
+					 */
+					MPASS(td->td_blocked != NULL);
+					tticks = ticks - td->td_blktick;
+					if (tticks > blkticks) {
+
+						/*
+						 * A thread stuck for too long
+						 * on a turnstile.  Act as
+						 * appropriate.
+						 */
+						thread_unlock(td);
+						PROC_SUNLOCK(p);
+						sx_sunlock(&allproc_lock);
+		panic("%s: deadlock detected for %p, blocked for %d ticks\n",
+						    __func__, td, tticks);
+					}
+				} else if (TD_ON_SLEEPQ(td)) {
+
+					/*
+					 * Check if the thread is sleeping
+					 * on a lock, otherwise skip the check.
+					 * Drop the thread lock in order to
+					 * avoid a LOR with sq_chain spinlock.
+					 * Note that the proc spinlock will
+					 * prevent the thread from exiting.
+					 */
+					wchan = td->td_wchan;
+					thread_unlock(td);
+					slptype = sleepq_type(wchan);
+					thread_lock(td);
+
+					/*
+					 * Check if we lost the race for
+					 * accessing again the thread.
+					 */
+					if (!TD_ON_SLEEPQ(td) ||
+					    wchan != td->td_wchan ||
+					    slptype == -1) {
+						thread_unlock(td);
+						continue;
+					}
+					tticks = ticks - td->td_slptick;
+					if ((slptype == SLEEPQ_SX ||
+					    slptype == SLEEPQ_LK) &&
+					    tticks > slpticks) {
+
+						/*
+						 * A thread stuck for too long
+						 * on a sleepqueue linked to
+						 * a lock.  Act as appropriate.
+						 */
+						thread_unlock(td);
+						PROC_SUNLOCK(p);
+						sx_sunlock(&allproc_lock);
+		panic("%s: deadlock detected for %p, blocked for %d ticks\n",
+						    __func__, td, tticks);
+					}
+				}
+				thread_unlock(td);
+			}
+			PROC_SUNLOCK(p);
+		}
+		sx_sunlock(&allproc_lock);
+
+		/*
+		 * Sleep for one second, then try again to find deadlocks.
+		 */
+		pause("deadlkres", 1 * hz);
+	}
+}
+
+static struct kthread_desc deadlkres_kd = {
+	"deadlkres",
+	deadlkres,
+	(struct thread **)NULL
+};
+SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd);
+
+SYSCTL_NODE(_debug, OID_AUTO, deadlock, CTLFLAG_RW, 0, "Deadlock detection");
+SYSCTL_INT(_debug_deadlock, OID_AUTO, slptime_threshold, CTLFLAG_RW,
+    &slptime_threshold, 0,
+    "Number of seconds within which sleeping on a sleepqueue is valid");
+SYSCTL_INT(_debug_deadlock, OID_AUTO, blktime_threshold, CTLFLAG_RW,
+    &blktime_threshold, 0,
+    "Number of seconds within which blocking on a turnstile is valid");
+
 void
 read_cpu_time(long *cp_time)
 {
--- /usr/src/sys/kern/subr_sleepqueue.c	2008-08-13 18:41:11.000000000 +0200
+++ src/sys/kern/subr_sleepqueue.c	2008-08-15 15:30:10.000000000 +0200
@@ -121,8 +121,8 @@
 	LIST_ENTRY(sleepqueue) sq_hash;		/* (c) Chain and free list. */
 	LIST_HEAD(, sleepqueue) sq_free;	/* (c) Free queues. */
 	void	*sq_wchan;			/* (c) Wait channel. */
-#ifdef INVARIANTS
 	int	sq_type;			/* (c) Queue type. */
+#ifdef INVARIANTS
 	struct lock_object *sq_lock;		/* (c) Associated lock. */
 #endif
 };
@@ -313,7 +313,6 @@
 		    ("thread's sleep queue has a non-empty free list"));
 		KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
 		sq->sq_lock = lock;
-		sq->sq_type = flags & SLEEPQ_TYPE;
 #endif
 #ifdef SLEEPQUEUE_PROFILING
 		sc->sc_depth++;
@@ -326,6 +325,7 @@
 		sq = td->td_sleepqueue;
 		LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
 		sq->sq_wchan = wchan;
+		sq->sq_type = flags & SLEEPQ_TYPE;
 	} else {
 		MPASS(wchan == sq->sq_wchan);
 		MPASS(lock == sq->sq_lock);
@@ -644,6 +644,29 @@
 }

 /*
+ * Returns the type of sleepqueue given a waitchannel.
+ */
+int
+sleepq_type(void *wchan)
+{
+	struct sleepqueue *sq;
+	int type;
+
+	MPASS(wchan != NULL);
+
+	sleepq_lock(wchan);
+	sq = sleepq_lookup(wchan);
+	if (sq == NULL) {
+		sleepq_release(wchan);
+		return (-1);
+	}
+	type = sq->sq_type;
+	sleepq_release(wchan);
+
+	return (type);
+}
+
+/*
  * Removes a thread from a sleep queue and makes it
  * runnable.
  */
@@ -1144,8 +1167,8 @@
 	return;
 found:
 	db_printf("Wait channel: %p\n", sq->sq_wchan);
-#ifdef INVARIANTS
 	db_printf("Queue type: %d\n", sq->sq_type);
+#ifdef INVARIANTS
 	if (sq->sq_lock) {
 		lock = sq->sq_lock;
 		db_printf("Associated Interlock: %p - (%s) %s\n", lock,
--- /usr/src/sys/kern/subr_turnstile.c	2008-05-31 23:42:15.000000000 +0200
+++ src/sys/kern/subr_turnstile.c	2008-08-15 15:30:10.000000000 +0200
@@ -732,6 +732,7 @@
 	td->td_tsqueue = queue;
 	td->td_blocked = ts;
 	td->td_lockname = lock->lo_name;
+	td->td_blktick = ticks;
 	TD_SET_LOCK(td);
 	mtx_unlock_spin(&tc->tc_lock);
 	propagate_priority(td);
@@ -924,6 +925,7 @@
 		MPASS(TD_CAN_RUN(td));
 		td->td_blocked = NULL;
 		td->td_lockname = NULL;
+		td->td_blktick = 0;
 #ifdef INVARIANTS
 		td->td_tsqueue = 0xff;
 #endif
--- /usr/src/sys/sys/proc.h	2008-08-13 18:42:06.000000000 +0200
+++ src/sys/sys/proc.h	2008-08-15 15:30:10.000000000 +0200
@@ -213,6 +213,7 @@
 	struct ucred	*td_ucred;	/* (k) Reference to credentials. */
 	u_int		td_estcpu;	/* (t) estimated cpu utilization */
 	u_int		td_slptick;	/* (t) Time at sleep. */
+	u_int		td_blktick;	/* (t) Time at block. */
 	struct rusage	td_ru;		/* (t) rusage information */
 	uint64_t	td_incruntime;	/* (t) Cpu ticks to transfer to proc. */
 	uint64_t	td_runtime;	/* (t) How many cpu ticks we've run. */
--- /usr/src/sys/sys/sleepqueue.h	2008-08-13 18:42:06.000000000 +0200
+++ src/sys/sys/sleepqueue.h	2008-08-15 15:30:10.000000000 +0200
@@ -109,6 +109,7 @@
 void	sleepq_set_timeout(void *wchan, int timo);
 int	sleepq_timedwait(void *wchan, int pri);
 int	sleepq_timedwait_sig(void *wchan, int pri);
+int	sleepq_type(void *wchan);
 void	sleepq_wait(void *wchan, int pri);
 int	sleepq_wait_sig(void *wchan, int pri);

