seems I finally found what upset kqemu on amd64 SMP... shared gdt! (please test patch :)

Juergen Lock nox at jelal.kn-bremen.de
Thu May 1 10:54:22 UTC 2008


On Wed, Apr 30, 2008 at 12:24:58AM +0200, Juergen Lock wrote:
> Yeah, the amd64 kernel reuses the same gdt to setup all cpus, causing
> kqemu to end up restoring the interrupt stackpointer (after running
> guest code using its own cpu state) from the tss of the last cpu,
> regardless which cpu it happened to run on.  And that then causes the last
> cpu's (usually) idle thread's stack to get smashed and the host doing
> multiple panics...  (Which also explains why pinning qemu onto cpu 1
> worked on a 2-way host.)
> 
Hmm maybe the following is a little more clear:  kqemu sets up its own
cpu state and has to save and restore the original state because of that,
so among other things it does an str insn (store task register), and later
an ltr insn (load task register) using the value it got from the first
str insn.  That ltr insn loads the selector for the tss which is stored
in the gdt, and that entry in the gdt is different for each cpu, but since
a single gdt was reused to setup the cpus at boot (in init_secondary() in
/sys/amd64/amd64/mp_machdep.c), it still points to the tss for the last
cpu, instead of to the right one for the cpu the ltr insn gets executed on.
That is what the kqemu_tss_workaround() in the patch `fixes'...

>  Here's the patch I just tested, of course you'd want to disable this
> once the gdt is no longer shared, so assuming someone wants to fix this,
> please also do an OSVERSION bump...

 The patch applied with offsets (I still had debug code in when I made it),
here is a rebased version:

Index: kqemu-freebsd.c
@@ -33,6 +33,11 @@
 
 #include <machine/vmparam.h>
 #include <machine/stdarg.h>
+#ifdef __x86_64__
+#include <sys/pcpu.h>
+#include <machine/segments.h>
+#include <machine/tss.h>
+#endif
 
 #include "kqemu-kernel.h"
 
@@ -234,6 +239,19 @@
     va_end(ap);
 }
 
+#ifdef __x86_64__
+/* called with interrupts disabled */
+void CDECL kqemu_tss_workaround(void)
+{
+    int gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
+
+    gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[PCPU_GET(cpuid)];
+    ssdtosyssd(&gdt_segs[GPROC0_SEL],
+       (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
+    ltr(gsel_tss);
+}
+#endif
+
 struct kqemu_instance { 
 #if __FreeBSD_version >= 500000
     TAILQ_ENTRY(kqemu_instance) kqemu_ent;
Index: common/kernel.c
@@ -1025,6 +1025,9 @@
 #ifdef __x86_64__
     uint16_t saved_ds, saved_es;
     unsigned long fs_base, gs_base;
+#ifdef __FreeBSD__
+    struct kqemu_global_state *g = s->global_state;
+#endif
 #endif
     
 #ifdef PROFILE
@@ -1188,6 +1191,13 @@
             apic_restore_nmi(s, apic_nmi_mask);
         }
         profile_record(s);
+#ifdef __FreeBSD__
+#ifdef __x86_64__
+        spin_lock(&g->lock);
+        kqemu_tss_workaround();
+        spin_unlock(&g->lock);
+#endif
+#endif
 
         if (s->mon_req == MON_REQ_IRQ) {
             struct kqemu_exception_regs *r;
Index: kqemu-kernel.h
@@ -44,4 +44,10 @@
 
 void CDECL kqemu_log(const char *fmt, ...);
 
+#ifdef __FreeBSD__
+#ifdef __x86_64__
+void CDECL kqemu_tss_workaround(void);
+#endif
+#endif
+
 #endif /* KQEMU_KERNEL_H */


More information about the freebsd-amd64 mailing list