scheduler IPIs during shutdown (was: svn commit: r193170 - projects/pnet/sys/kern (fwd))

Mon Jun 1 14:56:48 UTC 2009

On Sun, 31 May 2009, Robert Watson wrote:

> - System shutdown kicks off, and CPU 0 starts IPIing CPUs to shut them down,
>  which apparently can occur while holding the thread lock:

I just ran into a very similar issue on another i386 box, this one a VMware VM
-- hand-transcribed, I'm afraid:

   db> bt
   Tracing pid 1 tid 100002 td 0xc456fd80
   kdb_enter()
   panic()
   _mtx_lock_spin_failed()
   _thread_lock_flags()
   hardclock_cpu()
   hardclock()
   lapic_handle_timer()
   Xtimerint()
   --- interrupt
   DELAY()
   cpu_reset()
   shutdown_reset()
   boot()
   reboot()
   syscall()
   Xint0x80_syscall()
   db> show alllocks
   Process 1 (init) thread 0xc456fd80 (100002)
   exclusive sleep mutex Giant (Giant) ...
   db> trace 100003                         <-- idle thread on cpu 1
   cpustop_handler()
   ipi_nmi_handler()
   trap()
   calltrap()
   --- trap 0x13
   tdq_move()
   sched_idletd()
   fork_exit()
   fork_trampoline()

It looks like (a) we should really be disabling interrupts before the DELAY() 
so we don't have to deal with hardclock, and (b) perhaps an NMI for cpustop 
isn't the right thing, since it stops CPUs while they hold spinlocks?

Robert N M Watson
Computer Laboratory
University of Cambridge

>
> db> trace 100005
> Tracing pid 11 tid 100005 td 0xc4d546c0
> cpustop_handler(2,c4a53b64,c0b70cad,c0d9c680,c4a53ae4,...) at 
> cpustop_handler+0x32
> ipi_nmi_handler(c0d9c680,c4a53ae4,2,f,c4d52a90,...) at ipi_nmi_handler+0x2f
> trap(c4a53b70) at trap+0x2d
> calltrap() at calltrap+0x6
> --- trap 0x13, eip = 0xc0887a61, esp = 0xc4a53bb0, ebp = 0xc4a53bdc ---
> sched_switch(c4d546c0,0,60c,187,5dd9d2a1,...) at sched_switch+0x401
> mi_switch(60c,0,c0c3d1ae,813,1,...) at mi_switch+0x200
> sched_preempt(c4d546c0,1,c4d54d80,c4a53cb4,c0b5428e,...) at 
> sched_preempt+0x9f
> ipi_bitmap_handler(8,28,28,31,c0d8e500,...) at ipi_bitmap_handler+0x34
> Xipi_intr_bitmap_handler() at Xipi_intr_bitmap_handler+0x2e
> --- interrupt, eip = 0xc0856a17, esp = 0xc4a53c8c, ebp = 0xc4a53cb4 ---
> _thread_lock_flags(c4d546c0,0,c0c3d1ae,a04,c4d546c0,...) at 
> _thread_lock_flags+0x177
> sched_idletd(0,c4a53d38,c0c36ad3,335,c4d52a90,...) at sched_idletd+0x251
> fork_exit(c0888510,0,c4a53d38) at fork_exit+0xb8
> fork_trampoline() at fork_trampoline+0x8
> --- trap 0, eip = 0, esp = 0xc4a53d70, ebp = 0 ---
>
>
> - hardclock fires on CPU 0 and tries to schedule a SWI -- typically softclock
>  or netisr, which bumps into the "leaked" thread lock:
>
> db> bt
> Tracing pid 1 tid 100002 td 0xc4d54d80
> kdb_enter(c0c3b5e5,c0c3b5e5,c0c39ccd,c4a49ad0,0,...) at kdb_enter+0x3a
> panic(c0c39ccd,c4d546c0,c0d8eb44,c4d546c0,186a5,...) at panic+0x136
> _mtx_lock_spin_failed(1,9,c0c36d52,320,0,...) at _mtx_lock_spin_failed+0x51
> _thread_lock_flags(c4d54240,0,c0c36d52,320,dd313180,...) at 
> _thread_lock_flags+0x175
> intr_event_schedule_thread(c4a49b58,c4a49b68,c0915dc4,c4d3a180,0,...) at 
> intr_event_schedule_thread+0xc7
> swi_sched(c4d3a180,0,c4a49b74,c0859a2f,c0d893d4,...) at swi_sched+0x28
> netisr_sched_poll(c0d893d4,c4a49b88,c0824064,0,2,...) at 
> netisr_sched_poll+0x24
> hardclock_device_poll(0,2) at hardclock_device_poll+0xcf
> hardclock(0,c0b74a49,0,27f,5dd9e8a2,...) at hardclock+0x44
> lapic_handle_timer(c4a49bb0) at lapic_handle_timer+0x9f
> Xtimerint() at Xtimerint+0x1f
> --- interrupt, eip = 0xc0b74a49, esp = 0xc4a49bf0, ebp = 0xc4a49c0c ---
> DELAY(f4240,0,c4d40980,c4a49c2c,c08653d3,...) at DELAY+0x89
> cpu_reset(f4240,c4a49c68,c0865eaf,0,0,...) at cpu_reset+0xd5
> shutdown_reset(0,0,c0c3b4a3,1a7,0,...) at shutdown_reset+0x23
> boot(c0d89210,0,c0c3b4a3,ad,c4a49d2c,...) at boot+0x75f
> reboot(c4d54d80,c4a49cf8,4,c0c428a4,c0d1dc28,...) at reboot+0x4b
> syscall(c4a49d38) at syscall+0x2a3
> Xint0x80_syscall() at Xint0x80_syscall+0x20
>
> Possibly hardclock should simply know not to kick off softclock at that point 
> in the shutdown (just like I've taught it not to kick off netisr in the 
> device_polling code in my branch), but more generally, sched_ule probably 
> needs to not hold the per-cpu runqueue lock, and/or sched_ule needs to not 
> try to start work on CPUs that aren't running.  If the SWI is pinned on or 
> bound to an already shut down CPU, then we should in fact panic, but 
> otherwise it seems it should preempt the current thread and run it on the CPU 
> managing the shutdown.
>
> Or, we could go with the old-world order view and not allow interrupts to 
> fire, but I wonder if all our device drivers would be comfortable with that.
>
> Robert N M Watson
> Computer Laboratory
> University of Cambridge
>
> ---------- Forwarded message ----------
> Date: Sun, 31 May 2009 13:52:17 +0000 (UTC)
> From: Robert Watson <rwatson at FreeBSD.org>
> To: src-committers at freebsd.org, svn-src-projects at freebsd.org
> Subject: svn commit: r193170 - projects/pnet/sys/kern
>
> Author: rwatson
> Date: Sun May 31 13:52:17 2009
> New Revision: 193170
> URL: http://svn.freebsd.org/changeset/base/193170
>
> Log:
>  Add a shutdown handler for device polling -- don't issue wakeups to the
>  netisr once file systems are done syncing, otherwise the scheduler may
>  generate IPIs to CPUs that have already been shut down, leading to a
>  panic.
>
>  As similar panics (spin lock 0xc0d8e500 (sched lock 1) held by
>  0xc4d546c0 (tid 100005) too long) have been reported on both 7.x and 8.x
>  in other code, we might want to think about whether there's some missing
>  scheduler shutdown logic to handle this case for unpinned/unbound
>  threads by migrating them to the CPU managing the shutdown and allowing
>  them to preempt.
>
> Modified:
>  projects/pnet/sys/kern/kern_poll.c
>
> Modified: projects/pnet/sys/kern/kern_poll.c
> ==============================================================================
> --- projects/pnet/sys/kern/kern_poll.c	Sun May 31 12:36:14 2009 
> (r193169)
> +++ projects/pnet/sys/kern/kern_poll.c	Sun May 31 13:52:17 2009 
> (r193170)
> @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
> #include <sys/kernel.h>
> #include <sys/kthread.h>
> #include <sys/proc.h>
> +#include <sys/eventhandler.h>
> #include <sys/resourcevar.h>
> #include <sys/socket.h>			/* needed by net/if.h 
> */
> #include <sys/sockio.h>
> @@ -110,6 +111,7 @@ SYSCTL_UINT(_kern_polling, OID_AUTO, bur
>
> static int	netisr_poll_scheduled;
> static int	netisr_pollmore_scheduled;
> +static int	poll_shutting_down;
>
> static int poll_burst_max_sysctl(SYSCTL_HANDLER_ARGS)
> {
> @@ -261,10 +263,19 @@ struct pollrec {
> static struct pollrec pr[POLL_LIST_LEN];
>
> static void
> +poll_shutdown(void *arg, int howto)
> +{
> +
> +	poll_shutting_down = 1;
> +}
> +
> +static void
> init_device_poll(void)
> {
>
> 	mtx_init(&poll_mtx, "polling", NULL, MTX_DEF);
> +	EVENTHANDLER_REGISTER(shutdown_post_sync, poll_shutdown, NULL,
> +	    SHUTDOWN_PRI_LAST);
> }
> SYSINIT(device_poll, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, init_device_poll, 
> NULL);
>
> @@ -288,7 +299,7 @@ hardclock_device_poll(void)
> 	static struct timeval prev_t, t;
> 	int delta;
>
> -	if (poll_handlers == 0)
> +	if (poll_handlers == 0 || poll_shutting_down)
> 		return;
>
> 	microuptime(&t);
> _______________________________________________
> freebsd-current at freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-current
> To unsubscribe, send any mail to "freebsd-current-unsubscribe at freebsd.org"
>