Re: panic: data abort in critical section or under mutex (was: Re: panic: Unknown kernel exception 0 esr_el1 2000000 (on 14-CURRENT/aarch64 Feb 28))

From: Mark Johnston <markj_at_freebsd.org>
Date: Mon, 07 Mar 2022 15:13:37 UTC
On Mon, Mar 07, 2022 at 02:46:09PM +0100, Ronald Klop wrote:
> Dear Mark Johnston,
> 
> I did some binary search in the kernels and came to the conclusion that https://cgit.freebsd.org/src/commit/?id=1517b8d5a7f58897200497811de1b18809c07d3e still works and https://cgit.freebsd.org/src/commit/?id=407c34e735b5d17e2be574808a09e6d729b0a45a panics.
> 
> I suspect your commit in https://cgit.freebsd.org/src/commit/?id=c84bb8cd771ce4bed58152e47a32dda470bef23a.
> 
> Last panic:
> 
> panic: vm_fault failed: ffff00000046e708 error 1
> cpuid = 1
> time = 1646660058
> KDB: stack backtrace:
> db_trace_self() at db_trace_self
> db_trace_self_wrapper() at db_trace_self_wrapper+0x30
> vpanic() at vpanic+0x174
> panic() at panic+0x44
> data_abort() at data_abort+0x2e8
> handle_el1h_sync() at handle_el1h_sync+0x10
> --- exception, esr 0x96000004
> _rm_rlock_debug() at _rm_rlock_debug+0x8c
> osd_get() at osd_get+0x5c
> zio_execute() at zio_execute+0xf8
> taskqueue_run_locked() at taskqueue_run_locked+0x178
> taskqueue_thread_loop() at taskqueue_thread_loop+0xc8
> fork_exit() at fork_exit+0x74
> fork_trampoline() at fork_trampoline+0x14
> KDB: enter: panic
> [ thread pid 0 tid 100129 ]
> Stopped at      kdb_enter+0x44: undefined       f902011f
> db>
> 
> A more recent kernel (912df91) still panics. See below.
> 
> Do you have time to look into this? What can I provide in information to help?

Hmm.  So after my rmlock commits, we have the following disassembly for
_rm_rlock() (with a few annotations added by me).  Note that the pcpu
pointer is stored in register x18 by convention.

   0xffff00000046e304 <+0>:     stp     x29, x30, [sp, #-16]!
   0xffff00000046e308 <+4>:     mov     x29, sp
   0xffff00000046e30c <+8>:     ldr     x8, [x18]
   0xffff00000046e310 <+12>:    ldr     x9, [x18]
   0xffff00000046e314 <+16>:    ldr     x10, [x18]
   0xffff00000046e318 <+20>:    cmp     x9, x10
   0xffff00000046e31c <+24>:    b.ne    0xffff00000046e3cc <_rm_rlock+200>  // b.any
   0xffff00000046e320 <+28>:    ldr     x9, [x18]
   0xffff00000046e324 <+32>:    ldrh    w9, [x9, #314]
   0xffff00000046e328 <+36>:    cbnz    w9, 0xffff00000046e3c0 <_rm_rlock+188>
   0xffff00000046e32c <+40>:    str     wzr, [x1, #32]
   0xffff00000046e330 <+44>:    stp     x0, x8, [x1, #16]
   0xffff00000046e334 <+48>:    ldrb    w9, [x0, #10]
   0xffff00000046e338 <+52>:    tbz     w9, #4, 0xffff00000046e358 <_rm_rlock+84>
   0xffff00000046e33c <+56>:    ldr     x9, [x18]
   0xffff00000046e340 <+60>:    ldr     w10, [x9, #888]
   0xffff00000046e344 <+64>:    add     w10, w10, #0x1
   0xffff00000046e348 <+68>:    str     w10, [x9, #888]
   0xffff00000046e34c <+72>:    ldr     x9, [x18]
   0xffff00000046e350 <+76>:    ldr     w9, [x9, #888]
   0xffff00000046e354 <+80>:    cbz     w9, 0xffff00000046e3f4 <_rm_rlock+240>
   0xffff00000046e358 <+84>:    ldr     w9, [x8, #1212]
   0xffff00000046e35c <+88>:    add     x10, x18, #0x90
   0xffff00000046e360 <+92>:    add     w9, w9, #0x1
   0xffff00000046e364 <+96>:    str     w9, [x8, #1212]  <------- critical_enter
   0xffff00000046e368 <+100>:   str     x10, [x1, #8]
   0xffff00000046e36c <+104>:   ldr     x9, [x18, #144]
   0xffff00000046e370 <+108>:   str     x9, [x1]
   0xffff00000046e374 <+112>:   str     x1, [x9, #8]
   0xffff00000046e378 <+116>:   str     x1, [x18, #144]
   0xffff00000046e37c <+120>:   ldr     x9, [x18]
   0xffff00000046e380 <+124>:   ldr     w10, [x9, #356]
   0xffff00000046e384 <+128>:   add     w10, w10, #0x1
   0xffff00000046e388 <+132>:   str     w10, [x9, #356]
   0xffff00000046e38c <+136>:   ldr     w9, [x8, #1212]
   0xffff00000046e390 <+140>:   sub     w9, w9, #0x1
   0xffff00000046e394 <+144>:   str     w9, [x8, #1212]  <------- critical_exit
   0xffff00000046e398 <+148>:   ldrb    w8, [x8, #304]
   0xffff00000046e39c <+152>:   ldr     w9, [x18, #60]   <------- loading &pc->pc_cpuid
   ...

A (the?) problem is that the compiler is treating "pc" as an alias
for x18, but the rmlock code assumes that the pcpu pointer is loaded
once, as it dereferences "pc" outside of the critical section.  On
arm64, if a context switch occurs between the store at _rm_rlock+144 and
the load at +152, and the thread is migrated to another CPU, then we'll
end up using the wrong CPU ID in the rm->rm_writecpus test.

I suspect the problem is unique to arm64 as its get_pcpu()
implementation is different from the others in that it doesn't use
volatile-qualified inline assembly.  This has been the case since
https://cgit.freebsd.org/src/commit/?id=63c858a04d56529eddbddf85ad04fc8e99e73762
.

I haven't been able to reproduce any crashes running poudriere in an
arm64 AWS instance, though.  Could you please try the patch below and
confirm whether it fixes your panics?  I verified that the apparent
problem described above is gone with the patch.

diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c
index 0cdcfb8fec62..e51c25136ae0 100644
--- a/sys/kern/kern_rmlock.c
+++ b/sys/kern/kern_rmlock.c
@@ -437,6 +437,7 @@ _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
 {
 	struct thread *td = curthread;
 	struct pcpu *pc;
+	int cpuid;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
@@ -452,6 +453,7 @@ _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
 	atomic_interrupt_fence();
 
 	pc = get_pcpu();
+	cpuid = pc->pc_cpuid;
 	rm_tracker_add(pc, tracker);
 	sched_pin();
 
@@ -463,7 +465,7 @@ _rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
 	 * conditional jump.
 	 */
 	if (__predict_true(0 == (td->td_owepreempt |
-	    CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))))
+	    CPU_ISSET(cpuid, &rm->rm_writecpus))))
 		return (1);
 
 	/* We do not have a read token and need to acquire one. */