git: 72f9c9d82fce - main - bhyve: Split vmexit handling into a separate file

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Wed, 04 Oct 2023 16:53:59 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=72f9c9d82fce84fcb68c9aa1f32fabcf0c0038e9

commit 72f9c9d82fce84fcb68c9aa1f32fabcf0c0038e9
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2023-10-04 16:22:56 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2023-10-04 16:53:16 +0000

    bhyve: Split vmexit handling into a separate file
    
    Put it in amd64, since most of it is MD and won't be used on arm64.  Add
    a bit of glue to bhyverun.h to make CPU startup and shutdown work
    without having to export more global variables.  AP startup will be
    reworked further in a future revision.
    
    This makes bhyverun.c much more machine-independent.
    
    No functional change intended.
    
    Reviewed by:    corvink, jhb
    MFC after:      1 week
    Sponsored by:   Innovate UK
    Differential Revision:  https://reviews.freebsd.org/D40556
---
 usr.sbin/bhyve/Makefile       |   1 +
 usr.sbin/bhyve/amd64/vmexit.c | 493 ++++++++++++++++++++++++++++++++++++++++++
 usr.sbin/bhyve/bhyverun.c     | 468 ++-------------------------------------
 usr.sbin/bhyve/bhyverun.h     |   7 +
 usr.sbin/bhyve/vmexit.h       |  34 +++
 5 files changed, 551 insertions(+), 452 deletions(-)

diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index c2ccd60790ae..5f13d457d914 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -75,6 +75,7 @@ SRCS=	\
 	usb_mouse.c		\
 	vga.c			\
 	virtio.c		\
+	vmexit.c		\
 	vmgenc.c		\
 	xmsr.c
 
diff --git a/usr.sbin/bhyve/amd64/vmexit.c b/usr.sbin/bhyve/amd64/vmexit.c
new file mode 100644
index 000000000000..152fecaf424c
--- /dev/null
+++ b/usr.sbin/bhyve/amd64/vmexit.c
@@ -0,0 +1,493 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+#include <machine/vmm_instruction_emul.h>
+#include <amd64/vmm/intel/vmcs.h>
+#include <x86/apicreg.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "config.h"
+#include "debug.h"
+#include "gdb.h"
+#include "inout.h"
+#include "mem.h"
+#ifdef BHYVE_SNAPSHOT
+#include "snapshot.h"
+#endif
+#include "spinup_ap.h"
+#include "vmexit.h"
+#include "xmsr.h"
+
+static int
+vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+	int error;
+	int bytes, port, in;
+
+	vme = vmrun->vm_exit;
+	port = vme->u.inout.port;
+	bytes = vme->u.inout.bytes;
+	in = vme->u.inout.in;
+
+	error = emulate_inout(ctx, vcpu, vme);
+	if (error) {
+		fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
+		    in ? "in" : "out",
+		    bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
+		    port, vme->rip);
+		return (VMEXIT_ABORT);
+	} else {
+		return (VMEXIT_CONTINUE);
+	}
+}
+
+static int
+vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
+    struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+	uint64_t val;
+	uint32_t eax, edx;
+	int error;
+
+	vme = vmrun->vm_exit;
+
+	val = 0;
+	error = emulate_rdmsr(vcpu, vme->u.msr.code, &val);
+	if (error != 0) {
+		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
+		    vme->u.msr.code, vcpu_id(vcpu));
+		if (get_config_bool("x86.strictmsr")) {
+			vm_inject_gp(vcpu);
+			return (VMEXIT_CONTINUE);
+		}
+	}
+
+	eax = val;
+	error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax);
+	assert(error == 0);
+
+	edx = val >> 32;
+	error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx);
+	assert(error == 0);
+
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
+    struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+	int error;
+
+	vme = vmrun->vm_exit;
+
+	error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval);
+	if (error != 0) {
+		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
+		    vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu));
+		if (get_config_bool("x86.strictmsr")) {
+			vm_inject_gp(vcpu);
+			return (VMEXIT_CONTINUE);
+		}
+	}
+	return (VMEXIT_CONTINUE);
+}
+
+static const char * const vmx_exit_reason_desc[] = {
+	[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
+	[EXIT_REASON_EXT_INTR] = "External interrupt",
+	[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
+	[EXIT_REASON_INIT] = "INIT signal",
+	[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
+	[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
+	[EXIT_REASON_SMI] = "Other SMI",
+	[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
+	[EXIT_REASON_NMI_WINDOW] = "NMI window",
+	[EXIT_REASON_TASK_SWITCH] = "Task switch",
+	[EXIT_REASON_CPUID] = "CPUID",
+	[EXIT_REASON_GETSEC] = "GETSEC",
+	[EXIT_REASON_HLT] = "HLT",
+	[EXIT_REASON_INVD] = "INVD",
+	[EXIT_REASON_INVLPG] = "INVLPG",
+	[EXIT_REASON_RDPMC] = "RDPMC",
+	[EXIT_REASON_RDTSC] = "RDTSC",
+	[EXIT_REASON_RSM] = "RSM",
+	[EXIT_REASON_VMCALL] = "VMCALL",
+	[EXIT_REASON_VMCLEAR] = "VMCLEAR",
+	[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
+	[EXIT_REASON_VMPTRLD] = "VMPTRLD",
+	[EXIT_REASON_VMPTRST] = "VMPTRST",
+	[EXIT_REASON_VMREAD] = "VMREAD",
+	[EXIT_REASON_VMRESUME] = "VMRESUME",
+	[EXIT_REASON_VMWRITE] = "VMWRITE",
+	[EXIT_REASON_VMXOFF] = "VMXOFF",
+	[EXIT_REASON_VMXON] = "VMXON",
+	[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
+	[EXIT_REASON_DR_ACCESS] = "MOV DR",
+	[EXIT_REASON_INOUT] = "I/O instruction",
+	[EXIT_REASON_RDMSR] = "RDMSR",
+	[EXIT_REASON_WRMSR] = "WRMSR",
+	[EXIT_REASON_INVAL_VMCS] =
+	    "VM-entry failure due to invalid guest state",
+	[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
+	[EXIT_REASON_MWAIT] = "MWAIT",
+	[EXIT_REASON_MTF] = "Monitor trap flag",
+	[EXIT_REASON_MONITOR] = "MONITOR",
+	[EXIT_REASON_PAUSE] = "PAUSE",
+	[EXIT_REASON_MCE_DURING_ENTRY] =
+	    "VM-entry failure due to machine-check event",
+	[EXIT_REASON_TPR] = "TPR below threshold",
+	[EXIT_REASON_APIC_ACCESS] = "APIC access",
+	[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
+	[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
+	[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
+	[EXIT_REASON_EPT_FAULT] = "EPT violation",
+	[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
+	[EXIT_REASON_INVEPT] = "INVEPT",
+	[EXIT_REASON_RDTSCP] = "RDTSCP",
+	[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
+	[EXIT_REASON_INVVPID] = "INVVPID",
+	[EXIT_REASON_WBINVD] = "WBINVD",
+	[EXIT_REASON_XSETBV] = "XSETBV",
+	[EXIT_REASON_APIC_WRITE] = "APIC write",
+	[EXIT_REASON_RDRAND] = "RDRAND",
+	[EXIT_REASON_INVPCID] = "INVPCID",
+	[EXIT_REASON_VMFUNC] = "VMFUNC",
+	[EXIT_REASON_ENCLS] = "ENCLS",
+	[EXIT_REASON_RDSEED] = "RDSEED",
+	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
+	[EXIT_REASON_XSAVES] = "XSAVES",
+	[EXIT_REASON_XRSTORS] = "XRSTORS"
+};
+
+static const char *
+vmexit_vmx_desc(uint32_t exit_reason)
+{
+
+	if (exit_reason >= nitems(vmx_exit_reason_desc) ||
+	    vmx_exit_reason_desc[exit_reason] == NULL)
+		return ("Unknown");
+	return (vmx_exit_reason_desc[exit_reason]);
+}
+
+#define	DEBUG_EPT_MISCONFIG
+#ifdef DEBUG_EPT_MISCONFIG
+#define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400
+
+static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
+static int ept_misconfig_ptenum;
+#endif
+
+static int
+vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+
+	vme = vmrun->vm_exit;
+
+	fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
+	fprintf(stderr, "\treason\t\tVMX\n");
+	fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip);
+	fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length);
+	fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status);
+	fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason,
+	    vmexit_vmx_desc(vme->u.vmx.exit_reason));
+	fprintf(stderr, "\tqualification\t0x%016lx\n",
+	    vme->u.vmx.exit_qualification);
+	fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type);
+	fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error);
+#ifdef DEBUG_EPT_MISCONFIG
+	if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+		vm_get_register(vcpu,
+		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
+		    &ept_misconfig_gpa);
+		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
+		    &ept_misconfig_ptenum);
+		fprintf(stderr, "\tEPT misconfiguration:\n");
+		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
+		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
+		    ept_misconfig_ptenum, ept_misconfig_pte[0],
+		    ept_misconfig_pte[1], ept_misconfig_pte[2],
+		    ept_misconfig_pte[3]);
+	}
+#endif	/* DEBUG_EPT_MISCONFIG */
+	return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+
+	vme = vmrun->vm_exit;
+
+	fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
+	fprintf(stderr, "\treason\t\tSVM\n");
+	fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip);
+	fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length);
+	fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode);
+	fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1);
+	fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2);
+	return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
+    struct vm_run *vmrun)
+{
+	assert(vmrun->vm_exit->inst_length == 0);
+
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
+    struct vm_run *vmrun)
+{
+	assert(vmrun->vm_exit->inst_length == 0);
+
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
+    struct vm_run *vmrun __unused)
+{
+	/*
+	 * Just continue execution with the next instruction. We use
+	 * the HLT VM exit as a way to be friendly with the host
+	 * scheduler.
+	 */
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
+    struct vm_run *vmrun __unused)
+{
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu,
+    struct vm_run *vmrun)
+{
+	assert(vmrun->vm_exit->inst_length == 0);
+
+#ifdef BHYVE_SNAPSHOT
+	checkpoint_cpu_suspend(vcpu_id(vcpu));
+#endif
+	gdb_cpu_mtrap(vcpu);
+#ifdef BHYVE_SNAPSHOT
+	checkpoint_cpu_resume(vcpu_id(vcpu));
+#endif
+
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu,
+    struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+	struct vie *vie;
+	int err, i, cs_d;
+	enum vm_cpu_mode mode;
+
+	vme = vmrun->vm_exit;
+
+	vie = &vme->u.inst_emul.vie;
+	if (!vie->decoded) {
+		/*
+		 * Attempt to decode in userspace as a fallback.  This allows
+		 * updating instruction decode in bhyve without rebooting the
+		 * kernel (rapid prototyping), albeit with much slower
+		 * emulation.
+		 */
+		vie_restart(vie);
+		mode = vme->u.inst_emul.paging.cpu_mode;
+		cs_d = vme->u.inst_emul.cs_d;
+		if (vmm_decode_instruction(mode, cs_d, vie) != 0)
+			goto fail;
+		if (vm_set_register(vcpu, VM_REG_GUEST_RIP,
+		    vme->rip + vie->num_processed) != 0)
+			goto fail;
+	}
+
+	err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie,
+	    &vme->u.inst_emul.paging);
+	if (err) {
+		if (err == ESRCH) {
+			EPRINTLN("Unhandled memory access to 0x%lx\n",
+			    vme->u.inst_emul.gpa);
+		}
+		goto fail;
+	}
+
+	return (VMEXIT_CONTINUE);
+
+fail:
+	fprintf(stderr, "Failed to emulate instruction sequence [ ");
+	for (i = 0; i < vie->num_valid; i++)
+		fprintf(stderr, "%02x", vie->inst[i]);
+	FPRINTLN(stderr, " ] at 0x%lx", vme->rip);
+	return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+	enum vm_suspend_how how;
+	int vcpuid = vcpu_id(vcpu);
+
+	vme = vmrun->vm_exit;
+
+	how = vme->u.suspended.how;
+
+	fbsdrun_deletecpu(vcpuid);
+
+	switch (how) {
+	case VM_SUSPEND_RESET:
+		exit(0);
+	case VM_SUSPEND_POWEROFF:
+		if (get_config_bool_default("destroy_on_poweroff", false))
+			vm_destroy(ctx);
+		exit(1);
+	case VM_SUSPEND_HALT:
+		exit(2);
+	case VM_SUSPEND_TRIPLEFAULT:
+		exit(3);
+	default:
+		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
+		exit(100);
+	}
+	return (0);	/* NOTREACHED */
+}
+
+static int
+vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu,
+    struct vm_run *vmrun __unused)
+{
+
+#ifdef BHYVE_SNAPSHOT
+	checkpoint_cpu_suspend(vcpu_id(vcpu));
+#endif
+	gdb_cpu_suspend(vcpu);
+#ifdef BHYVE_SNAPSHOT
+	checkpoint_cpu_resume(vcpu_id(vcpu));
+#endif
+	/*
+	 * XXX-MJ sleep for a short period to avoid chewing up the CPU in the
+	 * window between activation of the vCPU thread and the STARTUP IPI.
+	 */
+	usleep(1000);
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_breakpoint(struct vmctx *ctx __unused, struct vcpu *vcpu,
+    struct vm_run *vmrun)
+{
+	gdb_cpu_breakpoint(vcpu, vmrun->vm_exit);
+	return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_ipi(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
+    struct vm_run *vmrun)
+{
+	struct vm_exit *vme;
+	cpuset_t *dmask;
+	int error = -1;
+	int i;
+
+	dmask = vmrun->cpuset;
+	vme = vmrun->vm_exit;
+
+	switch (vme->u.ipi.mode) {
+	case APIC_DELMODE_INIT:
+		CPU_FOREACH_ISSET(i, dmask) {
+			error = fbsdrun_suspendcpu(i);
+			if (error) {
+				warnx("failed to suspend cpu %d", i);
+				break;
+			}
+		}
+		break;
+	case APIC_DELMODE_STARTUP:
+		CPU_FOREACH_ISSET(i, dmask) {
+			spinup_ap(fbsdrun_vcpu(i),
+			    vme->u.ipi.vector << PAGE_SHIFT);
+		}
+		error = 0;
+		break;
+	default:
+		break;
+	}
+
+	return (error);
+}
+
+int vmexit_task_switch(struct vmctx *, struct vcpu *, struct vm_run *);
+
+const vmexit_handler_t vmexit_handlers[VM_EXITCODE_MAX] = {
+	[VM_EXITCODE_INOUT]  = vmexit_inout,
+	[VM_EXITCODE_INOUT_STR]  = vmexit_inout,
+	[VM_EXITCODE_VMX]    = vmexit_vmx,
+	[VM_EXITCODE_SVM]    = vmexit_svm,
+	[VM_EXITCODE_BOGUS]  = vmexit_bogus,
+	[VM_EXITCODE_REQIDLE] = vmexit_reqidle,
+	[VM_EXITCODE_RDMSR]  = vmexit_rdmsr,
+	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
+	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
+	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
+	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
+	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
+	[VM_EXITCODE_DEBUG] = vmexit_debug,
+	[VM_EXITCODE_BPT] = vmexit_breakpoint,
+	[VM_EXITCODE_IPI] = vmexit_ipi,
+	[VM_EXITCODE_HLT] = vmexit_hlt,
+	[VM_EXITCODE_PAUSE] = vmexit_pause,
+};
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index df40c7a7cc2a..1cf7fc5bcc24 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -41,11 +41,7 @@
 #include <sys/un.h>
 #endif
 
-#include <amd64/vmm/intel/vmcs.h>
-#include <x86/apicreg.h>
-
 #include <machine/atomic.h>
-#include <machine/segments.h>
 
 #ifndef WITHOUT_CAPSICUM
 #include <capsicum_helpers.h>
@@ -73,11 +69,6 @@
 #include <libxo/xo.h>
 #endif
 
-#include <machine/vmm.h>
-#ifndef WITHOUT_CAPSICUM
-#include <machine/vmm_dev.h>
-#endif
-#include <machine/vmm_instruction_emul.h>
 #include <vmmapi.h>
 
 #include "bhyverun.h"
@@ -112,83 +103,14 @@
 #include "snapshot.h"
 #endif
 #include "tpm_device.h"
-#include "xmsr.h"
-#include "spinup_ap.h"
 #include "rtc.h"
 #include "vmgenc.h"
+#include "vmexit.h"
+#include "xmsr.h"
 
 #define MB		(1024UL * 1024)
 #define GB		(1024UL * MB)
 
-static const char * const vmx_exit_reason_desc[] = {
-	[EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)",
-	[EXIT_REASON_EXT_INTR] = "External interrupt",
-	[EXIT_REASON_TRIPLE_FAULT] = "Triple fault",
-	[EXIT_REASON_INIT] = "INIT signal",
-	[EXIT_REASON_SIPI] = "Start-up IPI (SIPI)",
-	[EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)",
-	[EXIT_REASON_SMI] = "Other SMI",
-	[EXIT_REASON_INTR_WINDOW] = "Interrupt window",
-	[EXIT_REASON_NMI_WINDOW] = "NMI window",
-	[EXIT_REASON_TASK_SWITCH] = "Task switch",
-	[EXIT_REASON_CPUID] = "CPUID",
-	[EXIT_REASON_GETSEC] = "GETSEC",
-	[EXIT_REASON_HLT] = "HLT",
-	[EXIT_REASON_INVD] = "INVD",
-	[EXIT_REASON_INVLPG] = "INVLPG",
-	[EXIT_REASON_RDPMC] = "RDPMC",
-	[EXIT_REASON_RDTSC] = "RDTSC",
-	[EXIT_REASON_RSM] = "RSM",
-	[EXIT_REASON_VMCALL] = "VMCALL",
-	[EXIT_REASON_VMCLEAR] = "VMCLEAR",
-	[EXIT_REASON_VMLAUNCH] = "VMLAUNCH",
-	[EXIT_REASON_VMPTRLD] = "VMPTRLD",
-	[EXIT_REASON_VMPTRST] = "VMPTRST",
-	[EXIT_REASON_VMREAD] = "VMREAD",
-	[EXIT_REASON_VMRESUME] = "VMRESUME",
-	[EXIT_REASON_VMWRITE] = "VMWRITE",
-	[EXIT_REASON_VMXOFF] = "VMXOFF",
-	[EXIT_REASON_VMXON] = "VMXON",
-	[EXIT_REASON_CR_ACCESS] = "Control-register accesses",
-	[EXIT_REASON_DR_ACCESS] = "MOV DR",
-	[EXIT_REASON_INOUT] = "I/O instruction",
-	[EXIT_REASON_RDMSR] = "RDMSR",
-	[EXIT_REASON_WRMSR] = "WRMSR",
-	[EXIT_REASON_INVAL_VMCS] =
-	    "VM-entry failure due to invalid guest state",
-	[EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading",
-	[EXIT_REASON_MWAIT] = "MWAIT",
-	[EXIT_REASON_MTF] = "Monitor trap flag",
-	[EXIT_REASON_MONITOR] = "MONITOR",
-	[EXIT_REASON_PAUSE] = "PAUSE",
-	[EXIT_REASON_MCE_DURING_ENTRY] =
-	    "VM-entry failure due to machine-check event",
-	[EXIT_REASON_TPR] = "TPR below threshold",
-	[EXIT_REASON_APIC_ACCESS] = "APIC access",
-	[EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI",
-	[EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR",
-	[EXIT_REASON_LDTR_TR] = "Access to LDTR or TR",
-	[EXIT_REASON_EPT_FAULT] = "EPT violation",
-	[EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration",
-	[EXIT_REASON_INVEPT] = "INVEPT",
-	[EXIT_REASON_RDTSCP] = "RDTSCP",
-	[EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired",
-	[EXIT_REASON_INVVPID] = "INVVPID",
-	[EXIT_REASON_WBINVD] = "WBINVD",
-	[EXIT_REASON_XSETBV] = "XSETBV",
-	[EXIT_REASON_APIC_WRITE] = "APIC write",
-	[EXIT_REASON_RDRAND] = "RDRAND",
-	[EXIT_REASON_INVPCID] = "INVPCID",
-	[EXIT_REASON_VMFUNC] = "VMFUNC",
-	[EXIT_REASON_ENCLS] = "ENCLS",
-	[EXIT_REASON_RDSEED] = "RDSEED",
-	[EXIT_REASON_PM_LOG_FULL] = "Page-modification log full",
-	[EXIT_REASON_XSAVES] = "XSAVES",
-	[EXIT_REASON_XRSTORS] = "XRSTORS"
-};
-
-typedef int (*vmexit_handler_t)(struct vmctx *, struct vcpu *, struct vm_run *);
-
 int guest_ncpus;
 uint16_t cpu_cores, cpu_sockets, cpu_threads;
 
@@ -508,6 +430,12 @@ fbsdrun_virtio_msix(void)
 	return (get_config_bool_default("virtio_msix", true));
 }
 
+struct vcpu *
+fbsdrun_vcpu(int vcpuid)
+{
+	return (vcpu_info[vcpuid].vcpu);
+}
+
 static void *
 fbsdrun_start_thread(void *param)
 {
@@ -554,7 +482,7 @@ fbsdrun_addcpu(struct vcpu_info *vi)
 	assert(error == 0);
 }
 
-static void
+void
 fbsdrun_deletecpu(int vcpu)
 {
 	static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER;
@@ -581,376 +509,12 @@ fbsdrun_deletecpu(int vcpu)
 	pthread_mutex_unlock(&resetcpu_mtx);
 }
 
-static int
-vmexit_inout(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
-{
-	struct vm_exit *vme;
-	int error;
-	int bytes, port, in;
-
-	vme = vmrun->vm_exit;
-	port = vme->u.inout.port;
-	bytes = vme->u.inout.bytes;
-	in = vme->u.inout.in;
-
-	error = emulate_inout(ctx, vcpu, vme);
-	if (error) {
-		fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
-		    in ? "in" : "out",
-		    bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
-		    port, vme->rip);
-		return (VMEXIT_ABORT);
-	} else {
-		return (VMEXIT_CONTINUE);
-	}
-}
-
-static int
-vmexit_rdmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
-    struct vm_run *vmrun)
-{
-	struct vm_exit *vme;
-	uint64_t val;
-	uint32_t eax, edx;
-	int error;
-
-	vme = vmrun->vm_exit;
-
-	val = 0;
-	error = emulate_rdmsr(vcpu, vme->u.msr.code, &val);
-	if (error != 0) {
-		fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
-		    vme->u.msr.code, vcpu_id(vcpu));
-		if (get_config_bool("x86.strictmsr")) {
-			vm_inject_gp(vcpu);
-			return (VMEXIT_CONTINUE);
-		}
-	}
-
-	eax = val;
-	error = vm_set_register(vcpu, VM_REG_GUEST_RAX, eax);
-	assert(error == 0);
-
-	edx = val >> 32;
-	error = vm_set_register(vcpu, VM_REG_GUEST_RDX, edx);
-	assert(error == 0);
-
-	return (VMEXIT_CONTINUE);
-}
-
-static int
-vmexit_wrmsr(struct vmctx *ctx __unused, struct vcpu *vcpu,
-    struct vm_run *vmrun)
-{
-	struct vm_exit *vme;
-	int error;
-
-	vme = vmrun->vm_exit;
-
-	error = emulate_wrmsr(vcpu, vme->u.msr.code, vme->u.msr.wval);
-	if (error != 0) {
-		fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
-		    vme->u.msr.code, vme->u.msr.wval, vcpu_id(vcpu));
-		if (get_config_bool("x86.strictmsr")) {
-			vm_inject_gp(vcpu);
-			return (VMEXIT_CONTINUE);
-		}
-	}
-	return (VMEXIT_CONTINUE);
-}
-
-#define	DEBUG_EPT_MISCONFIG
-#ifdef DEBUG_EPT_MISCONFIG
-#define	VMCS_GUEST_PHYSICAL_ADDRESS	0x00002400
-
-static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4];
-static int ept_misconfig_ptenum;
-#endif
-
-static const char *
-vmexit_vmx_desc(uint32_t exit_reason)
-{
-
-	if (exit_reason >= nitems(vmx_exit_reason_desc) ||
-	    vmx_exit_reason_desc[exit_reason] == NULL)
-		return ("Unknown");
-	return (vmx_exit_reason_desc[exit_reason]);
-}
-
-static int
-vmexit_vmx(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
-{
-	struct vm_exit *vme;
-
-	vme = vmrun->vm_exit;
-
-	fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
-	fprintf(stderr, "\treason\t\tVMX\n");
-	fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip);
-	fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length);
-	fprintf(stderr, "\tstatus\t\t%d\n", vme->u.vmx.status);
-	fprintf(stderr, "\texit_reason\t%u (%s)\n", vme->u.vmx.exit_reason,
-	    vmexit_vmx_desc(vme->u.vmx.exit_reason));
-	fprintf(stderr, "\tqualification\t0x%016lx\n",
-	    vme->u.vmx.exit_qualification);
-	fprintf(stderr, "\tinst_type\t\t%d\n", vme->u.vmx.inst_type);
-	fprintf(stderr, "\tinst_error\t\t%d\n", vme->u.vmx.inst_error);
-#ifdef DEBUG_EPT_MISCONFIG
-	if (vme->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) {
-		vm_get_register(vcpu,
-		    VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS),
-		    &ept_misconfig_gpa);
-		vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte,
-		    &ept_misconfig_ptenum);
-		fprintf(stderr, "\tEPT misconfiguration:\n");
-		fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa);
-		fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n",
-		    ept_misconfig_ptenum, ept_misconfig_pte[0],
-		    ept_misconfig_pte[1], ept_misconfig_pte[2],
-		    ept_misconfig_pte[3]);
-	}
-#endif	/* DEBUG_EPT_MISCONFIG */
-	return (VMEXIT_ABORT);
-}
-
-static int
-vmexit_svm(struct vmctx *ctx __unused, struct vcpu *vcpu, struct vm_run *vmrun)
-{
-	struct vm_exit *vme;
-
-	vme = vmrun->vm_exit;
-
-	fprintf(stderr, "vm exit[%d]\n", vcpu_id(vcpu));
-	fprintf(stderr, "\treason\t\tSVM\n");
-	fprintf(stderr, "\trip\t\t0x%016lx\n", vme->rip);
-	fprintf(stderr, "\tinst_length\t%d\n", vme->inst_length);
-	fprintf(stderr, "\texitcode\t%#lx\n", vme->u.svm.exitcode);
-	fprintf(stderr, "\texitinfo1\t%#lx\n", vme->u.svm.exitinfo1);
-	fprintf(stderr, "\texitinfo2\t%#lx\n", vme->u.svm.exitinfo2);
-	return (VMEXIT_ABORT);
-}
-
-static int
-vmexit_bogus(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
-    struct vm_run *vmrun)
-{
-	assert(vmrun->vm_exit->inst_length == 0);
-
-	return (VMEXIT_CONTINUE);
-}
-
-static int
-vmexit_reqidle(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
-    struct vm_run *vmrun)
-{
-	assert(vmrun->vm_exit->inst_length == 0);
-
-	return (VMEXIT_CONTINUE);
-}
-
-static int
-vmexit_hlt(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
-    struct vm_run *vmrun __unused)
-{
-	/*
-	 * Just continue execution with the next instruction. We use
-	 * the HLT VM exit as a way to be friendly with the host
-	 * scheduler.
-	 */
-	return (VMEXIT_CONTINUE);
-}
-
-static int
-vmexit_pause(struct vmctx *ctx __unused, struct vcpu *vcpu __unused,
-    struct vm_run *vmrun __unused)
-{
-	return (VMEXIT_CONTINUE);
-}
-
-static int
-vmexit_mtrap(struct vmctx *ctx __unused, struct vcpu *vcpu,
-    struct vm_run *vmrun)
-{
-	assert(vmrun->vm_exit->inst_length == 0);
-
-#ifdef BHYVE_SNAPSHOT
-	checkpoint_cpu_suspend(vcpu_id(vcpu));
-#endif
-	gdb_cpu_mtrap(vcpu);
-#ifdef BHYVE_SNAPSHOT
-	checkpoint_cpu_resume(vcpu_id(vcpu));
-#endif
-
-	return (VMEXIT_CONTINUE);
-}
-
-static int
-vmexit_inst_emul(struct vmctx *ctx __unused, struct vcpu *vcpu,
-    struct vm_run *vmrun)
-{
-	struct vm_exit *vme;
-	struct vie *vie;
-	int err, i, cs_d;
-	enum vm_cpu_mode mode;
-
-	vme = vmrun->vm_exit;
-
-	vie = &vme->u.inst_emul.vie;
-	if (!vie->decoded) {
-		/*
-		 * Attempt to decode in userspace as a fallback.  This allows
-		 * updating instruction decode in bhyve without rebooting the
-		 * kernel (rapid prototyping), albeit with much slower
-		 * emulation.
-		 */
-		vie_restart(vie);
-		mode = vme->u.inst_emul.paging.cpu_mode;
-		cs_d = vme->u.inst_emul.cs_d;
-		if (vmm_decode_instruction(mode, cs_d, vie) != 0)
-			goto fail;
-		if (vm_set_register(vcpu, VM_REG_GUEST_RIP,
-		    vme->rip + vie->num_processed) != 0)
-			goto fail;
-	}
-
-	err = emulate_mem(vcpu, vme->u.inst_emul.gpa, vie,
-	    &vme->u.inst_emul.paging);
-	if (err) {
-		if (err == ESRCH) {
-			EPRINTLN("Unhandled memory access to 0x%lx\n",
-			    vme->u.inst_emul.gpa);
-		}
-		goto fail;
-	}
-
-	return (VMEXIT_CONTINUE);
-
-fail:
-	fprintf(stderr, "Failed to emulate instruction sequence [ ");
-	for (i = 0; i < vie->num_valid; i++)
-		fprintf(stderr, "%02x", vie->inst[i]);
-	FPRINTLN(stderr, " ] at 0x%lx", vme->rip);
-	return (VMEXIT_ABORT);
-}
-
-static int
-vmexit_suspend(struct vmctx *ctx, struct vcpu *vcpu, struct vm_run *vmrun)
-{
-	struct vm_exit *vme;
-	enum vm_suspend_how how;
-	int vcpuid = vcpu_id(vcpu);
-
-	vme = vmrun->vm_exit;
-
-	how = vme->u.suspended.how;
-
-	fbsdrun_deletecpu(vcpuid);
-
-	switch (how) {
-	case VM_SUSPEND_RESET:
-		exit(0);
-	case VM_SUSPEND_POWEROFF:
-		if (get_config_bool_default("destroy_on_poweroff", false))
-			vm_destroy(ctx);
-		exit(1);
-	case VM_SUSPEND_HALT:
-		exit(2);
-	case VM_SUSPEND_TRIPLEFAULT:
-		exit(3);
-	default:
-		fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how);
-		exit(100);
-	}
-	return (0);	/* NOTREACHED */
-}
-
-static int
-vmexit_debug(struct vmctx *ctx __unused, struct vcpu *vcpu,
-    struct vm_run *vmrun __unused)
-{
-
-#ifdef BHYVE_SNAPSHOT
-	checkpoint_cpu_suspend(vcpu_id(vcpu));
-#endif
-	gdb_cpu_suspend(vcpu);
-#ifdef BHYVE_SNAPSHOT
-	checkpoint_cpu_resume(vcpu_id(vcpu));
-#endif
-	/*
-	 * XXX-MJ sleep for a short period to avoid chewing up the CPU in the
-	 * window between activation of the vCPU thread and the STARTUP IPI.
-	 */
-	usleep(1000);
-	return (VMEXIT_CONTINUE);
-}
*** 150 LINES SKIPPED ***