deadlocks with intr NFS mounts and ^Z (or: PCATCH and sleepable locks)

Kostik Belousov kostikbel at gmail.com
Sat Jun 20 16:15:50 UTC 2009


On Fri, Jun 19, 2009 at 06:23:28PM +0200, Jilles Tjoelker wrote:
> I have been having trouble with deadlocks with NFS mounts for a while,
> and I have found at least one way it can deadlock. It seems an issue
> with the sleep/lock system.
> 
> NFS sleeps while holding a lockmgr lock, waiting for a reply from the
> server. When the mount is set intr, this is an interruptible sleep, so
> that interrupting signals can abort the sleep. However, this also means
> that SIGSTOP etc will suspend the thread without waking it up first, so
> it will be suspended with a lock held.
> 
> If it holds the wrong locks, it is possible that the shell will not be
> able to run, and the process cannot be continued in the normal manner.
> 
> Due to some other things I do not understand, it is then possible that
> the process cannot be continued at all (SIGCONT seems ignored), but in
> simple cases SIGCONT works, and things go back to normal.
> 
> In any case, this situation is undesirable, as even 'umount -f' doesn't
> work while the thread is suspended.
> 
> Of course, this reasoning applies to any code that goes to sleep
> interruptibly while holding a lock (sx or lockmgr). Is this supposed to
> be possible (likely useful)? If so, a third type of sleep would be
> needed that is interrupted by signals but not suspended? If not,
> something should check that it doesn't happen and NFS intr mounts may
> need to check for signals periodically or find a way to avoid sleeping
> with locks held.
> 
> The td_locks field is only accessible for the current thread, so it
> cannot be used to check if suspending is safe.
> 
> Also, making SIGSTOP and the like interrupt/restart syscalls is not
> acceptable unless you find some way to do it such that userland won't
> notice. For example, a read of 10 megabytes from a regular file with
> that much available must not return less then 10 megabytes.

Note that NFS does check for the signals during i/o, so you may get
short reads anyway.

I do think that the right solution both there and with SINGLE_NO_EXIT
case for thread_single is to stop at the usermode boundary instead of
suspending a thread in the interruptible sleep state.

I set error code returned from interrupted msleep() to ERESTART,
that seems to be the right thing, at least to restart the i/o that
transferred no data upon receiving SIGSTOP.

My current patch is below. It contains some not strictly related
changes, e.g. for wakeup().

diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 5c1d553..28f4f4f 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -2310,18 +2310,22 @@ static void
 sig_suspend_threads(struct thread *td, struct proc *p, int sending)
 {
 	struct thread *td2;
+	int wakeup_swapper;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
+	wakeup_swapper = 0;
 	FOREACH_THREAD_IN_PROC(p, td2) {
 		thread_lock(td2);
 		td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 		if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
-		    (td2->td_flags & TDF_SINTR) &&
-		    !TD_IS_SUSPENDED(td2)) {
-			thread_suspend_one(td2);
-		} else {
+		    (td2->td_flags & TDF_SINTR)) {
+			if (TD_IS_SUSPENDED(td2))
+				wakeup_swapper |= thread_unsuspend_one(td2);
+			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR))
+				wakeup_swapper |= sleepq_abort(td2, ERESTART);
+		} else if (!TD_IS_SUSPENDED(td2)) {
 			if (sending || td != td2)
 				td2->td_flags |= TDF_ASTPENDING;
 #ifdef SMP
@@ -2331,6 +2335,8 @@ sig_suspend_threads(struct thread *td, struct proc *p, int sending)
 		}
 		thread_unlock(td2);
 	}
+	if (wakeup_swapper)
+		kick_proc0();
 }
 
 int
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index b91c1a5..d27d027 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -344,11 +344,16 @@ wakeup(void *ident)
 {
 	int wakeup_swapper;
 
+ repeat:
 	sleepq_lock(ident);
 	wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
 	sleepq_release(ident);
-	if (wakeup_swapper)
-		kick_proc0();
+	if (wakeup_swapper) {
+		if (ident == &proc0)
+			goto repeat;
+		else
+			kick_proc0();
+	}
 }
 
 /*
@@ -361,11 +366,16 @@ wakeup_one(void *ident)
 {
 	int wakeup_swapper;
 
+ repeat:
 	sleepq_lock(ident);
 	wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
 	sleepq_release(ident);
-	if (wakeup_swapper)
-		kick_proc0();
+	if (wakeup_swapper) {
+		if (ident == &proc0)
+			goto repeat;
+		else
+			kick_proc0();
+	}
 }
 
 static void
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index bb8779b..800a1d1 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -504,6 +504,22 @@ thread_unlink(struct thread *td)
 	/* Must  NOT clear links to proc! */
 }
 
+static int
+recalc_remaining(struct proc *p, int mode)
+{
+	int remaining;
+
+	if (mode == SINGLE_EXIT)
+		remaining = p->p_numthreads;
+	else if (mode == SINGLE_BOUNDARY)
+		remaining = p->p_numthreads - p->p_boundary_count;
+	else if (mode == SINGLE_NO_EXIT)
+		remaining = p->p_numthreads - p->p_suspcount;
+	else
+		panic("recalc_remaining: wrong mode %d", mode);
+	return (remaining);
+}
+
 /*
  * Enforce single-threading.
  *
@@ -551,12 +567,7 @@ thread_single(int mode)
 	p->p_flag |= P_STOPPED_SINGLE;
 	PROC_SLOCK(p);
 	p->p_singlethread = td;
-	if (mode == SINGLE_EXIT)
-		remaining = p->p_numthreads;
-	else if (mode == SINGLE_BOUNDARY)
-		remaining = p->p_numthreads - p->p_boundary_count;
-	else
-		remaining = p->p_numthreads - p->p_suspcount;
+	remaining = recalc_remaining(p, mode);
 	while (remaining != 1) {
 		if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
 			goto stopme;
@@ -587,18 +598,17 @@ thread_single(int mode)
 						wakeup_swapper |=
 						    sleepq_abort(td2, ERESTART);
 					break;
+				case SINGLE_NO_EXIT:
+					if (TD_IS_SUSPENDED(td2) &&
+					    !(td2->td_flags & TDF_BOUNDARY))
+						wakeup_swapper |=
+						    thread_unsuspend_one(td2);
+					if (TD_ON_SLEEPQ(td2) &&
+					    (td2->td_flags & TDF_SINTR))
+						wakeup_swapper |=
+						    sleepq_abort(td2, ERESTART);
+					break;
 				default:
-					if (TD_IS_SUSPENDED(td2)) {
-						thread_unlock(td2);
-						continue;
-					}
-					/*
-					 * maybe other inhibited states too?
-					 */
-					if ((td2->td_flags & TDF_SINTR) &&
-					    (td2->td_inhibitors &
-					    (TDI_SLEEPING | TDI_SWAPPED)))
-						thread_suspend_one(td2);
 					break;
 				}
 			}
@@ -611,12 +621,7 @@ thread_single(int mode)
 		}
 		if (wakeup_swapper)
 			kick_proc0();
-		if (mode == SINGLE_EXIT)
-			remaining = p->p_numthreads;
-		else if (mode == SINGLE_BOUNDARY)
-			remaining = p->p_numthreads - p->p_boundary_count;
-		else
-			remaining = p->p_numthreads - p->p_suspcount;
+		remaining = recalc_remaining(p, mode);
 
 		/*
 		 * Maybe we suspended some threads.. was it enough?
@@ -630,12 +635,7 @@ stopme:
 		 * In the mean time we suspend as well.
 		 */
 		thread_suspend_switch(td);
-		if (mode == SINGLE_EXIT)
-			remaining = p->p_numthreads;
-		else if (mode == SINGLE_BOUNDARY)
-			remaining = p->p_numthreads - p->p_boundary_count;
-		else
-			remaining = p->p_numthreads - p->p_suspcount;
+		remaining = recalc_remaining(p, mode);
 	}
 	if (mode == SINGLE_EXIT) {
 		/*
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 195 bytes
Desc: not available
Url : http://lists.freebsd.org/pipermail/freebsd-arch/attachments/20090620/345bdaba/attachment.pgp


More information about the freebsd-arch mailing list