adaptive rwlock deadlock
Philippe Jalaber
pjalaber at gmail.com
Tue Oct 13 13:28:08 UTC 2015
I am able to reproduce the freeze with a simple kernel module modcrash.c.
It freezes with a Freebsd 9.3 and 10.0 VM, but not with a Freebsd 10.1,
10.2 or 11.0 VM.
It also freezes on a VIA appliance running Freebsd 9.3 and 10.2.
All freezes can be reproduced with one or more cpus.
Here the modcrash.c source code. Just comment the #define CRASHME (which
moves a pause call) and it does not freeze any more.
Please note that you can't klunload the module, it crashes the system.
Philippe
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/kthread.h>
static void modcrash_attach(void);
static void modcrash_detach(void);
static void modcrash_thread1(void *);
static void modcrash_thread2(void *);
static void modcrash_thread3(void *);
struct thread *thread1_td, *thread2_td, *thread3_td;
static struct rwlock inplock;
RW_SYSINIT(inplock, &inplock, "inplock");
static struct rwlock glock;
RW_SYSINIT(glock, &glock, "glock");
static int volatile global_busy = 0;
#define MILLISECONDS(ms) ((hz/1000)*(ms))
static void
modcrash_attach(void)
{
if ( kthread_add(modcrash_thread1, NULL, NULL, &thread1_td, 0, 0,
"modcrash1") != 0 )
printf("error\n");
if ( kthread_add(modcrash_thread2, NULL, NULL, &thread2_td, 0, 0,
"modcrash2") != 0 )
printf("error\n");
if ( kthread_add(modcrash_thread3, NULL, NULL, &thread3_td, 0, 0,
"modcrash3") != 0 )
printf("error\n");
}
static void
modcrash_detach(void)
{
}
static void
modcrash_thread1(void *arg)
{
int busy;
printf("thread 1 running\n");
while ( 1 )
{
rw_wlock(&inplock);
do
{
rw_rlock(&glock);
busy = global_busy;
if ( busy )
{
rw_runlock(&glock);
}
} while ( busy );
rw_runlock(&glock);
rw_wunlock(&inplock);
pause("thread 1", MILLISECONDS(10));
}
kthread_exit();
}
static void
modcrash_thread2(void *arg)
{
int busy;
printf("thread 2 running\n");
while ( 1 )
{
rw_wlock(&glock);
if ( (random() & 1) == 0 )
{
global_busy = 1;
busy = 1;
}
else
{
busy = 0;
}
rw_wunlock(&glock);
#define CRASHME
#ifdef CRASHME
pause("thread 2", MILLISECONDS(10));
#endif
if ( busy )
{
rw_wlock(&glock);
global_busy = 0;
rw_wunlock(&glock);
}
#ifndef CRASHME
pause("thread 2", MILLISECONDS(10));
#endif
}
kthread_exit();
}
static void
modcrash_thread3(void *context)
{
printf("thread 3 running\n");
while ( 1 )
{
rw_wlock(&inplock);
rw_wunlock(&inplock);
pause("thread 3", MILLISECONDS(10));
}
kthread_exit();
}
static int
modcrashmodevent(module_t mod, int type, void *data)
{
switch (type) {
case MOD_LOAD:
modcrash_attach();
break;
case MOD_UNLOAD:
modcrash_detach();
break;
default:
return EOPNOTSUPP;
}
return 0;
}
static moduledata_t modcrash_mod = {
"modcrash",
modcrashmodevent,
0
};
DECLARE_MODULE(modcrash, modcrash_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(modcrash, 1);
2015-08-06 10:09 GMT+02:00 Philippe Jalaber <pjalaber at gmail.com>:
> 2015-08-05 17:41 GMT+02:00 John Baldwin <jhb at freebsd.org>:
>
>> On Wednesday, August 05, 2015 04:27:53 PM Philippe Jalaber wrote:
>> > 2015-08-04 22:10 GMT+02:00 John Baldwin <jhb at freebsd.org>:
>> >
>> > > On Tuesday, July 07, 2015 12:10:19 PM Philippe Jalaber wrote:
>> > > > Hi,
>> > > >
>> > > > I am facing a strange problem using the network stack and adaptive
>> > > rwlocks
>> > > > running Freebsd 9.3.
>> > > > Basically I can reproduce the problem with 3 threads:
>> > > >
>> > > > 1) thread 1 has taken the rwlock of structure inpcb in exclusive
>> mode in
>> > > > tcp_input.c. This thread also runs my own code and repeatedly takes
>> a
>> > > > rwlock (called g_rwlock) in shared mode and releases it, until a
>> shared
>> > > > object is marked not "busy" any more:
>> > > >
>> > > > rwlock(inp_lock);
>> > > > ....
>> > > > do { // thread is active waiting in the loop
>> > > > rlock(g_rwlock);
>> > > > o = find();
>> > > > if ( o == NULL )
>> > > > break;
>> > > > busy = o.busy;
>> > > > if (o != NULL && busy)
>> > > > runlock(g_rwlock);
>> > > > } while ( busy );
>> > > >
>> > > > if ( o != NULL )
>> > > > {
>> > > > // do something with o
>> > > > ....
>> > > > }
>> > > > runlock(g_rwlock);
>> > > > ....
>> > > >
>> > > > 2) thread 2 wants to set the shared object as "ready". So it tries
>> to
>> > > take
>> > > > g_rwlock in exclusive mode and is blocked in
>> _rw_wlock_hard at kern_rwlock.c
>> > > :815
>> > > > "turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE)" because
>> thread 1
>> > > has
>> > > > already taken it in shared mode:
>> > > >
>> > > > wlock(g_rwlock);
>> > > > o = find();
>> > > > if ( o != NULL )
>> > > > o.busy = 1;
>> > > > wunlock(g_rwlock);
>> > > >
>> > > > // o is busy so work on it without any lock
>> > > > ....
>> > > >
>> > > > wlock(g_rwlock); // thread is blocked here
>> > > > o.busy = 0;
>> > > > maybe_delete(o);
>> > > > wunlock(g_rwlock);
>> > > >
>> > > > 3) thread 3 spins on the same inpcb rwlock than thread 1 in
>> > > > _rw_wlock_hard at kern_rwlock.c:721 "while ((struct
>> > > > thread*)RW_OWNER(rw->rw_lock) == owner && TD_IS_RUNNING(owner)) "
>> > > >
>> > > >
>> > > > My target machine has two cpus.
>> > > > Thread 1 is pinned to cpu 0.
>> > > > Thread 2 and Thread 3 are pinned to cpu 1.
>> > > > Thread 1 and Thread 2 have a priority of 28.
>> > > > Thread 3 has a priority of 127
>> > > >
>> > > > Now what seems to happen is that when thread 1 calls
>> runlock(g_rwlock),
>> > > it
>> > > > calls turnstile_broadcast at kern_rwlock.c:650, but thread 2 never
>> regains
>> > > > control because thread 3 is spinning on the inpcb rwlock. Also the
>> > > > condition TD_IS_RUNNING(owner) is always true because thread 1 is
>> active
>> > > > waiting in a loop. So the 3 threads deadlock.
>> > > > Note that if I compile the kernel without adaptive rwlocks it works
>> > > without
>> > > > any problem.
>> > > > A workaround is to add a call to "sched_relinquish(curthread)" in
>> thread
>> > > 1
>> > > > in the loop just after the call to runlock.
>> > >
>> > > It sounds like we are not forcing a preemption on CPU 1 in this case
>> via
>> > > sched_add().
>> > >
>> > > For SCHED_4BSD you could try the 'FULL_PREEMPTION' kernel option.
>> > > For ULE you can adjust 'preempt_thresh' on the fly, though I think the
>> > > default setting should actually still work.
>> > >
>> > > Can you use KTR or some such to determine if IPI_PREEMPT is being
>> sent by
>> > > CPU 0 to CPU 1 in this case?
>> > >
>> > > > I am also wondering about the code in _rw_runlock after
>> > > > "turnstile_broadcast(ts, queue)". Isn't the flag
>> RW_LOCK_WRITE_WAITERS
>> > > > definitely lost if the other thread which is blocked in
>> turnstile_wait
>> > > > never regains control ?
>> > >
>> > > All the write waiters are awakened by a broadcast (as opposed to a
>> signal
>> > > operation). They are on the run queue, not on the turnstile queue
>> anymore,
>> > > so there aren't any write waiters left (the bit only tracks if there
>> are
>> > > waiters on the turnstile).
>> > >
>> > > --
>> > > John Baldwin
>> > >
>> >
>> > I Use ULE scheduler.
>> > Here's the KTR output using ktrdump on a vmcore after watchdog.
>> >
>> > 75447 ipi_selected: cpu: 1 ipi: fc
>> > 75446 stop_cpus() with 252 type
>> > 75445 ipi_cpu: cpu: 1 ipi: 2
>> > 75444 ipi_cpu: cpu: 1 ipi: 2
>> > 75443 ipi_cpu: cpu: 1 ipi: 2
>> > 75442 ipi_cpu: cpu: 1 ipi: 2
>> > 75441 ipi_cpu: cpu: 1 ipi: 2
>> > ....
>> > 3862 ipi_cpu: cpu: 1 ipi: 2
>> > 3861 ipi_cpu: cpu: 1 ipi: 2
>> > 3860 ipi_cpu: cpu: 1 ipi: 2
>> > 3859 ipi_cpu: cpu: 1 ipi: 2
>> > 3858 ipi_cpu: cpu: 1 ipi: 2
>> > 3857 ipi_selected: cpu: 1 ipi: f3
>> > 3856 ipi_cpu: cpu: 1 ipi: 2
>> > 3855 ipi_cpu: cpu: 1 ipi: 2
>> > 3854 ipi_cpu: cpu: 1 ipi: 2
>> > 3853 ipi_selected: cpu: 0 ipi: f3
>> > 3852 ipi_cpu: cpu: 1 ipi: 2
>> > 3851 ipi_selected: cpu: 1 ipi: f3
>> > 3850 ipi_cpu: cpu: 1 ipi: 2
>> > 3849 ipi_cpu: cpu: 1 ipi: 2
>> > 3848 ipi_selected: cpu: 0 ipi: f3
>> > 3847 ipi_cpu: cpu: 1 ipi: 2
>> > 3846 ipi_cpu: cpu: 1 ipi: 2
>> > 3845 ipi_cpu: cpu: 1 ipi: 2
>> > 3844 ipi_cpu: cpu: 1 ipi: 2
>> > 3843 ipi_cpu: cpu: 1 ipi: 2
>> > 3842 ipi_cpu: cpu: 1 ipi: 2
>> > 3841 ipi_cpu: cpu: 1 ipi: 2
>> > 3840 ipi_cpu: cpu: 1 ipi: 2
>> > 3839 ipi_cpu: cpu: 1 ipi: 2
>> > 3838 ipi_cpu: cpu: 1 ipi: 2
>> > 3837 ipi_cpu: cpu: 1 ipi: 2
>> > 3836 ipi_cpu: cpu: 1 ipi: 2
>> > 3835 ipi_cpu: cpu: 0 ipi: 1
>> > 3834 ipi_cpu: cpu: 0 ipi: 1
>> > 3833 ipi_cpu: cpu: 0 ipi: 1
>> > 3832 ipi_cpu: cpu: 0 ipi: 1
>> > 3831 ipi_cpu: cpu: 0 ipi: 1
>> > 3830 ipi_cpu: cpu: 0 ipi: 1
>>
>> Unfortunately this has a lot of other noise. Can you add some
>> traces specifically in sched_ule in tdq_notify to note that it
>> is deciding to notify a CPU due to scheduling a thread?
>>
>> --
>> John Baldwin
>>
>
> OK. Here's the patch I have used in tdq_notify on Freebsd 9.3:
>
> --- kern/sched_ule.c 2015-08-06 09:03:49.000000000 +0200
> +++ kern/sched_ule.c 2015-08-06 09:03:38.000000000 +0200
> @@ -1020,6 +1020,7 @@
> return;
> }
> tdq->tdq_ipipending = 1;
> + CTR3(KTR_SMP, "%s: cpu: %d IPI_PREEMPT=%x", __func__, cpu,
> IPI_PREEMPT);
> ipi_cpu(cpu, IPI_PREEMPT);
> }
>
> Please find the output of ktrdump in attachment.
> The last IPI_PREEMPT sent is at index #9424. Hope it helps.
>
>
>
More information about the freebsd-hackers
mailing list