svn commit: r234761 - in projects/bhyve: lib/libvmmapi
sys/amd64/include sys/amd64/vmm sys/amd64/vmm/intel
sys/amd64/vmm/io usr.sbin/bhyve
Peter Grehan
grehan at FreeBSD.org
Sat Apr 28 16:28:01 UTC 2012
Author: grehan
Date: Sat Apr 28 16:28:00 2012
New Revision: 234761
URL: http://svn.freebsd.org/changeset/base/234761
Log:
MSI-x interrupt support for PCI pass-thru devices.
Includes instruction emulation for memory r/w access. This
opens the door for io-apic, local apic, hpet timer, and
legacy device emulation.
Submitted by: ryan dot berryhill at sandvine dot com
Reviewed by: grehan
Obtained from: Sandvine
Added:
projects/bhyve/usr.sbin/bhyve/instruction_emul.c (contents, props changed)
projects/bhyve/usr.sbin/bhyve/instruction_emul.h (contents, props changed)
Modified:
projects/bhyve/lib/libvmmapi/vmmapi.c
projects/bhyve/lib/libvmmapi/vmmapi.h
projects/bhyve/sys/amd64/include/vmm.h
projects/bhyve/sys/amd64/include/vmm_dev.h
projects/bhyve/sys/amd64/vmm/intel/vmcs.h
projects/bhyve/sys/amd64/vmm/intel/vmx.c
projects/bhyve/sys/amd64/vmm/io/ppt.c
projects/bhyve/sys/amd64/vmm/io/ppt.h
projects/bhyve/sys/amd64/vmm/io/vlapic.c
projects/bhyve/sys/amd64/vmm/vmm_dev.c
projects/bhyve/usr.sbin/bhyve/Makefile
projects/bhyve/usr.sbin/bhyve/fbsdrun.c
projects/bhyve/usr.sbin/bhyve/pci_emul.c
projects/bhyve/usr.sbin/bhyve/pci_emul.h
projects/bhyve/usr.sbin/bhyve/pci_passthru.c
Modified: projects/bhyve/lib/libvmmapi/vmmapi.c
==============================================================================
--- projects/bhyve/lib/libvmmapi/vmmapi.c Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/lib/libvmmapi/vmmapi.c Sat Apr 28 16:28:00 2012 (r234761)
@@ -454,6 +454,25 @@ vm_setup_msi(struct vmctx *ctx, int vcpu
return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
}
+/*
+ * Program MSI-X table entry 'idx' of the passthru device at bus/slot/func.
+ * The arguments are packed into a struct vm_pptdev_msix and handed to the
+ * kernel with the VM_PPTDEV_MSIX ioctl; the ioctl's return value is passed
+ * back to the caller unchanged.
+ */
+int
+vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+    int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+	struct vm_pptdev_msix req;
+	int rc;
+
+	bzero(&req, sizeof(req));
+
+	/* Which device, and on behalf of which vcpu */
+	req.bus = bus;
+	req.slot = slot;
+	req.func = func;
+	req.vcpu = vcpu;
+
+	/* Contents of the MSI-X table entry */
+	req.idx = idx;
+	req.addr = addr;
+	req.msg = msg;
+	req.vector_control = vector_control;
+
+	rc = ioctl(ctx->fd, VM_PPTDEV_MSIX, &req);
+	return (rc);
+}
+
uint64_t *
vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
int *ret_entries)
Modified: projects/bhyve/lib/libvmmapi/vmmapi.h
==============================================================================
--- projects/bhyve/lib/libvmmapi/vmmapi.h Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/lib/libvmmapi/vmmapi.h Sat Apr 28 16:28:00 2012 (r234761)
@@ -77,6 +77,8 @@ int vm_map_pptdev_mmio(struct vmctx *ctx
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_setup_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
int dest, int vector, int numvec);
+int vm_setup_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
/*
* Return a pointer to the statistics buffer. Note that this is not MT-safe.
Modified: projects/bhyve/sys/amd64/include/vmm.h
==============================================================================
--- projects/bhyve/sys/amd64/include/vmm.h Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/sys/amd64/include/vmm.h Sat Apr 28 16:28:00 2012 (r234761)
@@ -227,7 +227,8 @@ enum vm_exitcode {
VM_EXITCODE_HLT,
VM_EXITCODE_MTRAP,
VM_EXITCODE_PAUSE,
- VM_EXITCODE_MAX,
+ VM_EXITCODE_PAGING,
+ VM_EXITCODE_MAX
};
struct vm_exit {
@@ -243,6 +244,9 @@ struct vm_exit {
uint16_t port;
uint32_t eax; /* valid for out */
} inout;
+ struct {
+ uint64_t cr3;
+ } paging;
/*
* VMX specific payload. Used when there is no "better"
* exitcode to represent the VM-exit.
Modified: projects/bhyve/sys/amd64/include/vmm_dev.h
==============================================================================
--- projects/bhyve/sys/amd64/include/vmm_dev.h Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/sys/amd64/include/vmm_dev.h Sat Apr 28 16:28:00 2012 (r234761)
@@ -108,6 +108,17 @@ struct vm_pptdev_msi {
int destcpu;
};
+struct vm_pptdev_msix { /* argument of the VM_PPTDEV_MSIX ioctl */
+ int vcpu; /* forwarded to ppt_setup_msix() */
+ int bus; /* bus/slot/func: PCI location of the passthru device */
+ int slot;
+ int func;
+ int idx; /* index of the MSI-X table entry being programmed */
+ uint32_t msg; /* message data of the entry */
+ uint32_t vector_control; /* vector control word; tested against PCIM_MSIX_VCTRL_MASK */
+ uint64_t addr; /* message address of the entry */
+};
+
struct vm_nmi {
int cpuid;
};
@@ -143,6 +154,7 @@ enum {
IOCNUM_UNBIND_PPTDEV,
IOCNUM_MAP_PPTDEV_MMIO,
IOCNUM_PPTDEV_MSI,
+ IOCNUM_PPTDEV_MSIX,
IOCNUM_INJECT_NMI,
IOCNUM_VM_STATS,
IOCNUM_VM_STAT_DESC,
@@ -182,6 +194,8 @@ enum {
_IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
#define VM_PPTDEV_MSI \
_IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_PPTDEV_MSIX \
+ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
#define VM_INJECT_NMI \
_IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
#define VM_STATS \
Modified: projects/bhyve/sys/amd64/vmm/intel/vmcs.h
==============================================================================
--- projects/bhyve/sys/amd64/vmm/intel/vmcs.h Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/sys/amd64/vmm/intel/vmcs.h Sat Apr 28 16:28:00 2012 (r234761)
@@ -65,6 +65,7 @@ uint64_t vmcs_read(uint32_t encoding);
#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
#endif /* _KERNEL */
Modified: projects/bhyve/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/intel/vmx.c Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/sys/amd64/vmm/intel/vmx.c Sat Apr 28 16:28:00 2012 (r234761)
@@ -1185,6 +1185,10 @@ vmx_exit_process(struct vmx *vmx, int vc
case EXIT_REASON_CPUID:
handled = vmx_handle_cpuid(vcpu, vmxctx);
break;
+ case EXIT_REASON_EPT_FAULT:
+ vmexit->exitcode = VM_EXITCODE_PAGING;
+ vmexit->u.paging.cr3 = vmcs_guest_cr3();
+ break;
default:
break;
}
Modified: projects/bhyve/sys/amd64/vmm/io/ppt.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/io/ppt.c Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/sys/amd64/vmm/io/ppt.c Sat Apr 28 16:28:00 2012 (r234761)
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/pciio.h>
@@ -56,9 +57,12 @@ __FBSDID("$FreeBSD$");
#define MAX_MMIOSEGS (PCIR_MAX_BAR_0 + 1)
#define MAX_MSIMSGS 32
+MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
+
struct pptintr_arg { /* pptintr(pptintr_arg) */
struct pptdev *pptdev;
- int msg;
+ int vec;
+ int vcpu;
};
static struct pptdev {
@@ -75,6 +79,16 @@ static struct pptdev {
void *cookie[MAX_MSIMSGS];
struct pptintr_arg arg[MAX_MSIMSGS];
} msi;
+
+ struct {
+ int num_msgs;
+ int startrid;
+ int msix_table_rid;
+ struct resource *msix_table_res;
+ struct resource **res;
+ void **cookie;
+ struct pptintr_arg *arg;
+ } msix;
} pptdevs[32];
static int num_pptdevs;
@@ -209,6 +223,57 @@ ppt_teardown_msi(struct pptdev *ppt)
ppt->msi.num_msgs = 0;
}
+/*
+ * Undo the setup of a single MSI-X vector: remove the interrupt handler if
+ * one is installed, release the IRQ resource if one is held, and clear both
+ * bookkeeping slots for 'idx'.
+ */
+static void
+ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
+{
+	struct resource **resp;
+	void **cookiep;
+
+	resp = &ppt->msix.res[idx];
+	cookiep = &ppt->msix.cookie[idx];
+
+	if (*cookiep != NULL) {
+		bus_teardown_intr(ppt->dev, *resp, *cookiep);
+		*cookiep = NULL;
+	}
+
+	if (*resp != NULL) {
+		bus_release_resource(ppt->dev, SYS_RES_IRQ,
+		    ppt->msix.startrid + idx, *resp);
+		*resp = NULL;
+	}
+}
+
+/*
+ * Release every MSI-X resource held for a passthru device: the per-vector
+ * handlers and IRQ resources, the MSI-X table BAR mapping, the bookkeeping
+ * arrays and finally the MSI-X allocation itself.
+ *
+ * May be called when nothing, or only the table BAR, has been set up.
+ */
+static void
+ppt_teardown_msix(struct pptdev *ppt)
+{
+	int i, error;
+
+	for (i = 0; i < ppt->msix.num_msgs; i++)
+		ppt_teardown_msix_intr(ppt, i);
+
+	/*
+	 * The table BAR is mapped before any vector is allocated, so it can
+	 * be held while num_msgs is still 0 (setup failed part-way).  Release
+	 * it unconditionally instead of leaking it.
+	 */
+	if (ppt->msix.msix_table_res) {
+		bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+		    ppt->msix.msix_table_rid,
+		    ppt->msix.msix_table_res);
+		ppt->msix.msix_table_res = NULL;
+		ppt->msix.msix_table_rid = 0;
+	}
+
+	if (ppt->msix.num_msgs == 0)
+		return;
+
+	/* Clear the pointers so stale values can never be reused or re-freed */
+	free(ppt->msix.res, M_PPTMSIX);
+	free(ppt->msix.cookie, M_PPTMSIX);
+	free(ppt->msix.arg, M_PPTMSIX);
+	ppt->msix.res = NULL;
+	ppt->msix.cookie = NULL;
+	ppt->msix.arg = NULL;
+
+	error = pci_release_msi(ppt->dev);
+	if (error)
+		printf("ppt_teardown_msix: Failed to release MSI-X resources (error %i)\n", error);
+
+	ppt->msix.num_msgs = 0;
+}
+
int
ppt_assign_device(struct vm *vm, int bus, int slot, int func)
{
@@ -244,6 +309,7 @@ ppt_unassign_device(struct vm *vm, int b
return (EBUSY);
ppt_unmap_mmio(vm, ppt);
ppt_teardown_msi(ppt);
+ ppt_teardown_msix(ppt);
iommu_remove_device(vm_iommu_domain(vm), bus, slot, func);
ppt->vm = NULL;
return (0);
@@ -309,10 +375,10 @@ pptintr(void *arg)
pptarg = arg;
ppt = pptarg->pptdev;
- vec = ppt->msi.vector + pptarg->msg;
+ vec = pptarg->vec;
if (ppt->vm != NULL)
- (void) lapic_set_intr(ppt->vm, ppt->msi.vcpu, vec);
+ (void) lapic_set_intr(ppt->vm, pptarg->vcpu, vec);
else {
/*
* XXX
@@ -431,7 +497,7 @@ ppt_setup_msi(struct vm *vm, int vcpu, i
break;
ppt->msi.arg[i].pptdev = ppt;
- ppt->msi.arg[i].msg = i;
+ ppt->msi.arg[i].vec = vector + i;
error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
INTR_TYPE_NET | INTR_MPSAFE,
@@ -448,3 +514,110 @@ ppt_setup_msi(struct vm *vm, int vcpu, i
return (0);
}
+
+/*
+ * Configure one MSI-X table entry of a passthru device on behalf of a guest.
+ *
+ *	vm		VM that must own the device (EBUSY otherwise)
+ *	vcpu		not used here; the target vcpu is derived from 'addr'
+ *	bus/slot/func	PCI location of the device (ENOENT if not a ppt device)
+ *	idx		MSI-X table index; EINVAL unless 0 <= idx < vector count
+ *	msg		stored as the vector that pptintr() injects
+ *	vector_control	if PCIM_MSIX_VCTRL_MASK is set the entry is only torn down
+ *	addr		MSI address; bits 12-19 select the destination vcpu
+ *
+ * The first call maps the MSI-X table BAR, allocates all of the device's
+ * MSI-X vectors and allocates the per-vector bookkeeping arrays.  If that
+ * first-time setup fails, everything acquired so far is released again so a
+ * later call starts from scratch.
+ *
+ * Returns 0 on success or an errno value.
+ */
+int
+ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+    int idx, uint32_t msg, uint32_t vector_control, uint64_t addr)
+{
+	struct pptdev *ppt;
+	struct pci_devinfo *dinfo;
+	int numvec, vector_count, rid, error;
+	size_t res_size, cookie_size, arg_size;
+
+	ppt = ppt_find(bus, slot, func);
+	if (ppt == NULL)
+		return (ENOENT);
+	if (ppt->vm != vm)		/* Make sure we own this device */
+		return (EBUSY);
+
+	dinfo = device_get_ivars(ppt->dev);
+	if (!dinfo)
+		return (ENXIO);
+
+	/*
+	 * First-time configuration:
+	 * 	Allocate the MSI-X table
+	 * 	Allocate the IRQ resources
+	 * 	Set up some variables in ppt->msix
+	 */
+	if (!ppt->msix.msix_table_res) {
+		ppt->msix.res = NULL;
+		ppt->msix.cookie = NULL;
+		ppt->msix.arg = NULL;
+
+		rid = dinfo->cfg.msix.msix_table_bar;
+		ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
+		    SYS_RES_MEMORY, &rid, RF_ACTIVE);
+		if (ppt->msix.msix_table_res == NULL)
+			return (ENOSPC);
+
+		ppt->msix.msix_table_rid = rid;
+
+		vector_count = numvec = pci_msix_count(ppt->dev);
+
+		error = pci_alloc_msix(ppt->dev, &numvec);
+		if (error == 0 && vector_count != numvec) {
+			(void) pci_release_msi(ppt->dev);
+			error = ENOSPC;
+		}
+		if (error) {
+			/*
+			 * Drop the table mapping too.  Leaving it set would
+			 * make the next call skip this block and index the
+			 * still-NULL bookkeeping arrays.
+			 */
+			bus_release_resource(ppt->dev, SYS_RES_MEMORY,
+			    ppt->msix.msix_table_rid,
+			    ppt->msix.msix_table_res);
+			ppt->msix.msix_table_res = NULL;
+			ppt->msix.msix_table_rid = 0;
+			return (error);
+		}
+
+		res_size = numvec * sizeof(ppt->msix.res[0]);
+		cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
+		arg_size = numvec * sizeof(ppt->msix.arg[0]);
+
+		/* M_WAITOK allocations do not fail; M_ZERO replaces bzero() */
+		ppt->msix.res = malloc(res_size, M_PPTMSIX,
+		    M_WAITOK | M_ZERO);
+		ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
+		    M_WAITOK | M_ZERO);
+		ppt->msix.arg = malloc(arg_size, M_PPTMSIX,
+		    M_WAITOK | M_ZERO);
+
+		ppt->msix.num_msgs = numvec;
+		ppt->msix.startrid = 1;
+	}
+
+	/* idx arrives from userland via ioctl; never index past the arrays */
+	if (idx < 0 || idx >= ppt->msix.num_msgs)
+		return (EINVAL);
+
+	/* Masked or not, any handler already set up for this entry goes away */
+	ppt_teardown_msix_intr(ppt, idx);
+
+	if ((vector_control & PCIM_MSIX_VCTRL_MASK) != 0)
+		return (0);		/* Masked: nothing more to do */
+
+	/* Allocate the IRQ resource */
+	rid = ppt->msix.startrid + idx;
+	ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
+	    &rid, RF_ACTIVE);
+	if (ppt->msix.res[idx] == NULL)
+		return (ENXIO);
+
+	ppt->msix.arg[idx].pptdev = ppt;
+	ppt->msix.arg[idx].vec = msg;
+	ppt->msix.arg[idx].vcpu = (addr >> 12) & 0xFF;
+
+	/* Setup the MSI-X interrupt */
+	error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
+	    INTR_TYPE_NET | INTR_MPSAFE,
+	    pptintr, NULL, &ppt->msix.arg[idx],
+	    &ppt->msix.cookie[idx]);
+	if (error != 0) {
+		/*
+		 * No handler was installed, so there is nothing to hand to
+		 * bus_teardown_intr(); only the IRQ resource is released.
+		 */
+		bus_release_resource(ppt->dev, SYS_RES_IRQ, rid,
+		    ppt->msix.res[idx]);
+		ppt->msix.cookie[idx] = NULL;
+		ppt->msix.res[idx] = NULL;
+		return (ENXIO);
+	}
+
+	return (0);
+}
+
Modified: projects/bhyve/sys/amd64/vmm/io/ppt.h
==============================================================================
--- projects/bhyve/sys/amd64/vmm/io/ppt.h Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/sys/amd64/vmm/io/ppt.h Sat Apr 28 16:28:00 2012 (r234761)
@@ -36,5 +36,6 @@ int ppt_map_mmio(struct vm *vm, int bus,
vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
int destcpu, int vector, int numvec);
-
+int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
+ int idx, uint32_t msg, uint32_t vector_control, uint64_t addr);
#endif
Modified: projects/bhyve/sys/amd64/vmm/io/vlapic.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/io/vlapic.c Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/sys/amd64/vmm/io/vlapic.c Sat Apr 28 16:28:00 2012 (r234761)
@@ -778,6 +778,7 @@ vlapic_init(struct vm *vm, int vcpuid)
void
vlapic_cleanup(struct vlapic *vlapic)
{
+ vlapic_op_halt(vlapic);
vdev_unregister(vlapic);
free(vlapic, M_VLAPIC);
}
Modified: projects/bhyve/sys/amd64/vmm/vmm_dev.c
==============================================================================
--- projects/bhyve/sys/amd64/vmm/vmm_dev.c Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/sys/amd64/vmm/vmm_dev.c Sat Apr 28 16:28:00 2012 (r234761)
@@ -158,6 +158,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long c
struct vm_pptdev *pptdev;
struct vm_pptdev_mmio *pptmmio;
struct vm_pptdev_msi *pptmsi;
+ struct vm_pptdev_msix *pptmsix;
struct vm_nmi *vmnmi;
struct vm_stats *vmstats;
struct vm_stat_desc *statdesc;
@@ -240,6 +241,14 @@ vmmdev_ioctl(struct cdev *cdev, u_long c
pptmsi->destcpu, pptmsi->vector,
pptmsi->numvec);
break;
+ case VM_PPTDEV_MSIX:
+ pptmsix = (struct vm_pptdev_msix *)data;
+ error = ppt_setup_msix(sc->vm, pptmsix->vcpu,
+ pptmsix->bus, pptmsix->slot,
+ pptmsix->func, pptmsix->idx,
+ pptmsix->msg, pptmsix->vector_control,
+ pptmsix->addr);
+ break;
case VM_MAP_PPTDEV_MMIO:
pptmmio = (struct vm_pptdev_mmio *)data;
error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
Modified: projects/bhyve/usr.sbin/bhyve/Makefile
==============================================================================
--- projects/bhyve/usr.sbin/bhyve/Makefile Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/usr.sbin/bhyve/Makefile Sat Apr 28 16:28:00 2012 (r234761)
@@ -4,7 +4,8 @@
PROG= bhyve
-SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c mevent.c
+SRCS= atpic.c consport.c dbgport.c elcr.c fbsdrun.c inout.c
+SRCS+= instruction_emul.c mevent.c
SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
SRCS+= pci_virtio_net.c pit_8254.c post.c rtc.c uart.c xmsr.c
Modified: projects/bhyve/usr.sbin/bhyve/fbsdrun.c
==============================================================================
--- projects/bhyve/usr.sbin/bhyve/fbsdrun.c Sat Apr 28 14:42:49 2012 (r234760)
+++ projects/bhyve/usr.sbin/bhyve/fbsdrun.c Sat Apr 28 16:28:00 2012 (r234761)
@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
#include "mevent.h"
#include "pci_emul.h"
#include "xmsr.h"
+#include "instruction_emul.h"
#define DEFAULT_GUEST_HZ 100
#define DEFAULT_GUEST_TSLICE 200
@@ -108,6 +109,7 @@ struct fbsdstats {
uint64_t vmexit_hlt;
uint64_t vmexit_pause;
uint64_t vmexit_mtrap;
+ uint64_t vmexit_paging;
uint64_t cpu_switch_rotate;
uint64_t cpu_switch_direct;
int io_reset;
@@ -412,6 +414,20 @@ vmexit_mtrap(struct vmctx *ctx, struct v
return (VMEXIT_RESTART);
}
+/*
+ * Handler for VM_EXITCODE_PAGING exits.  Counts the exit, then asks the
+ * instruction emulator to emulate the instruction at the exit's rip using
+ * the guest cr3 reported with the exit.  Continues the vcpu on success and
+ * aborts it when emulation fails.
+ */
+static int
+vmexit_paging(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	int error;
+
+	stats.vmexit_paging++;
+
+	error = emulate_instruction(ctx, *pvcpu, vmexit->rip,
+	    vmexit->u.paging.cr3);
+	if (error == 0)
+		return (VMEXIT_CONTINUE);
+
+	printf("Failed to emulate instruction at 0x%lx\n", vmexit->rip);
+	return (VMEXIT_ABORT);
+}
+
static void
sigalrm(int sig)
{
@@ -446,12 +462,13 @@ setup_timeslice(void)
}
static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
- [VM_EXITCODE_INOUT] = vmexit_inout,
- [VM_EXITCODE_VMX] = vmexit_vmx,
- [VM_EXITCODE_BOGUS] = vmexit_bogus,
- [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
- [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
- [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_PAGING] = vmexit_paging
};
static void
Added: projects/bhyve/usr.sbin/bhyve/instruction_emul.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ projects/bhyve/usr.sbin/bhyve/instruction_emul.c Sat Apr 28 16:28:00 2012 (r234761)
@@ -0,0 +1,555 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <strings.h>
+#include <unistd.h>
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "fbsdrun.h"
+#include "instruction_emul.h"
+
+#define PREFIX_LOCK 0xF0
+#define PREFIX_REPNE 0xF2
+#define PREFIX_REPE 0xF3
+#define PREFIX_CS_OVERRIDE 0x2E
+#define PREFIX_SS_OVERRIDE 0x36
+#define PREFIX_DS_OVERRIDE 0x3E
+#define PREFIX_ES_OVERRIDE 0x26
+#define PREFIX_FS_OVERRIDE 0x64
+#define PREFIX_GS_OVERRIDE 0x65
+#define PREFIX_BRANCH_NOT_TAKEN 0x2E
+#define PREFIX_BRANCH_TAKEN 0x3E
+#define PREFIX_OPSIZE 0x66
+#define PREFIX_ADDRSIZE 0x67
+
+#define OPCODE_2BYTE_ESCAPE 0x0F
+#define OPCODE_3BYTE_ESCAPE 0x38
+
+#define MODRM_MOD_MASK 0xC0
+#define MODRM_MOD_SHIFT 6
+#define MODRM_RM_MASK 0x07
+#define MODRM_RM_SHIFT 0
+#define MODRM_REG_MASK 0x38
+#define MODRM_REG_SHIFT 3
+
+#define MOD_INDIRECT 0x0
+#define MOD_INDIRECT_DISP8 0x1
+#define MOD_INDIRECT_DISP32 0x2
+#define MOD_DIRECT 0x3
+
+#define RM_EAX 0x0
+#define RM_ECX 0x1
+#define RM_EDX 0x2
+#define RM_EBX 0x3
+#define RM_SIB 0x4
+#define RM_DISP32 0x5
+#define RM_EBP RM_DISP32
+#define RM_ESI 0x6
+#define RM_EDI 0x7
+
+#define REG_EAX 0x0
+#define REG_ECX 0x1
+#define REG_EDX 0x2
+#define REG_EBX 0x3
+#define REG_ESP 0x4
+#define REG_EBP 0x5
+#define REG_ESI 0x6
+#define REG_EDI 0x7
+#define REG_R8 0x8
+#define REG_R9 0x9
+#define REG_R10 0xA
+#define REG_R11 0xB
+#define REG_R12 0xC
+#define REG_R13 0xD
+#define REG_R14 0xE
+#define REG_R15 0xF
+
+#define HAS_MODRM 1
+#define FROM_RM (1<<1)
+#define FROM_REG (1<<2)
+#define TO_RM (1<<3)
+#define TO_REG (1<<4)
+
+#define REX_MASK 0xF0
+#define REX_PREFIX 0x40
+#define is_rex_prefix(x) ( ((x) & REX_MASK) == REX_PREFIX )
+#define REX_W_MASK 0x8
+#define REX_R_MASK 0x4
+#define REX_X_MASK 0x2
+#define REX_B_MASK 0x1
+
+#define is_prefix(x) ((x) == PREFIX_LOCK || (x) == PREFIX_REPNE || \
+ (x) == PREFIX_REPE || (x) == PREFIX_CS_OVERRIDE || \
+ (x) == PREFIX_SS_OVERRIDE || (x) == PREFIX_DS_OVERRIDE || \
+ (x) == PREFIX_ES_OVERRIDE || (x) == PREFIX_FS_OVERRIDE || \
+ (x) == PREFIX_GS_OVERRIDE || (x) == PREFIX_BRANCH_NOT_TAKEN || \
+ (x) == PREFIX_BRANCH_TAKEN || (x) == PREFIX_OPSIZE || \
+ (x) == PREFIX_ADDRSIZE || is_rex_prefix((x)))
+
+#define PAGE_FRAME_MASK 0x80 /* entry bit 7; gla2gpa() stops walking when it is set */
+#define PAGE_OFFSET_MASK 0xFFF /* low 12 bits of an address */
+#define PAGE_TABLE_ENTRY_MASK (~PAGE_OFFSET_MASK) /* clears the low 12 bits; used by gla2gpa() */
+#define PML4E_OFFSET_MASK 0x0000FF8000000000 /* address bits 39-47: index into the top-level table */
+#define PML4E_SHIFT 39 /* shift that turns the masked bits into an index */
+
+#define MAX_EMULATED_REGIONS 8
+int registered_regions = 0;
+struct memory_region /* one address range whose accesses are emulated via callbacks */
+{
+ uintptr_t start; /* first address of the range (find_region() compares inclusively) */
+ uintptr_t end; /* last address of the range, inclusive */
+ emulated_read_func_t memread; /* called by get_operand() for loads from the range */
+ emulated_write_func_t memwrite; /* called by perform_write() for stores to the range */
+ void *arg; /* passed as the last argument to both callbacks */
+} emulated_regions[MAX_EMULATED_REGIONS];
+
+struct decoded_instruction /* result of decode_instruction(); zeroed before decoding starts */
+{
+ void *instruction; /* first byte of the instruction */
+ uint8_t *opcode; /* opcode byte, after an optional REX prefix */
+ uint8_t *modrm; /* ModR/M byte, or NULL if the opcode has none */
+ uint8_t *sib; /* set by decode_mod_rm() when it detects a SIB byte, else NULL */
+ uint8_t *displacement; /* displacement bytes for the disp8/disp32 modes, else NULL */
+ uint8_t *immediate; /* never assigned in the code shown; non-NULL is rejected */
+
+ uint8_t opcode_flags; /* one_byte_opcodes[] entry for the opcode */
+
+ uint8_t addressing_mode; /* ModR/M mod field, one of the MOD_* values */
+ uint8_t rm; /* ModR/M rm field, with bit 3 set when REX.B is present */
+ uint8_t reg; /* ModR/M reg field, with bit 3 set when REX.R is present */
+ uint8_t rex_r; /* REX bits, stored masked: non-zero means set */
+ uint8_t rex_w;
+ uint8_t rex_b;
+ uint8_t rex_x;
+
+ int32_t disp; /* displacement value read by decode_extension_operands() */
+};
+
+static enum vm_reg_name vm_reg_name_mappings[] = { /* indexed by REG_* number, see get_vm_reg_name() */
+ [REG_EAX] = VM_REG_GUEST_RAX,
+ [REG_EBX] = VM_REG_GUEST_RBX,
+ [REG_ECX] = VM_REG_GUEST_RCX,
+ [REG_EDX] = VM_REG_GUEST_RDX,
+ [REG_ESP] = VM_REG_GUEST_RSP,
+ [REG_EBP] = VM_REG_GUEST_RBP,
+ [REG_ESI] = VM_REG_GUEST_RSI,
+ [REG_EDI] = VM_REG_GUEST_RDI,
+ [REG_R8] = VM_REG_GUEST_R8, /* REG_R8..REG_R15 are reached when a REX bit extends rm/reg */
+ [REG_R9] = VM_REG_GUEST_R9,
+ [REG_R10] = VM_REG_GUEST_R10,
+ [REG_R11] = VM_REG_GUEST_R11,
+ [REG_R12] = VM_REG_GUEST_R12,
+ [REG_R13] = VM_REG_GUEST_R13,
+ [REG_R14] = VM_REG_GUEST_R14,
+ [REG_R15] = VM_REG_GUEST_R15
+};
+
+uint8_t one_byte_opcodes[256] = { /* per-opcode flags; 0 means the opcode is not emulated */
+ [0x89] = HAS_MODRM | FROM_REG | TO_RM, /* x86 MOV r/m, r: store the reg operand to r/m */
+ [0x8B] = HAS_MODRM | FROM_RM | TO_REG, /* x86 MOV r, r/m: load the r/m operand into reg */
+};
+
+/*
+ * Translate a guest linear address into a guest physical address by walking
+ * the guest's four-level page tables rooted at guest_cr3.
+ *
+ * The walk ends at the last level, or earlier at an entry that has bit 7
+ * (PAGE_FRAME_MASK) set, which is treated as mapping a large page.  Entries
+ * are not checked for being present.
+ */
+static uintptr_t
+gla2gpa(uint64_t gla, uint64_t guest_cr3)
+{
+	/*
+	 * Bits 12-51 of a paging entry hold the physical address.  The bits
+	 * above that (e.g. bit 63, execute-disable) are attributes and must
+	 * not leak into the address.
+	 */
+	const uint64_t entry_addr_mask = 0x000FFFFFFFFFF000ULL;
+	uint64_t *table;
+	uint64_t mask, entry;
+	int level, shift;
+	uintptr_t page_frame;
+
+	table = paddr_guest2host(guest_cr3 & PAGE_TABLE_ENTRY_MASK);
+	mask = PML4E_OFFSET_MASK;
+	shift = PML4E_SHIFT;
+	for (level = 0; level < 4; ++level)
+	{
+		entry = table[(gla & mask) >> shift];
+		table = (uint64_t*)(entry & entry_addr_mask);
+
+		/* This entry does not point to another page table */
+		if (entry & PAGE_FRAME_MASK || level >= 3)
+			break;
+
+		table = paddr_guest2host((uintptr_t)table);
+		mask >>= 9;
+		shift -= 9;
+	}
+
+	/*
+	 * shift is 39 if the walk stopped at the top level, so the constant
+	 * must be 64 bits wide; shifting a plain int that far is undefined.
+	 */
+	mask = ((uint64_t)1 << shift) - 1;
+	page_frame = ((uintptr_t)table & ~mask);
+	return (page_frame | (gla & mask));
+}
+
+/*
+ * Translate a guest linear address into a host pointer: walk the guest page
+ * tables to get the guest physical address, then map that into the host.
+ */
+static void *
+gla2hla(uint64_t gla, uint64_t guest_cr3)
+{
+
+	return (paddr_guest2host(gla2gpa(gla, guest_cr3)));
+}
+
+/*
+ * Handles the bytes in front of the opcode.  One leading REX prefix is
+ * accepted and its W/R/X/B bits are recorded; any other recognised prefix
+ * byte is unsupported and makes the function return -1.  On success the
+ * opcode pointer is left on the byte after the REX prefix (or on the first
+ * instruction byte when there is none) and 0 is returned.
+ */
+static int
+decode_prefixes(struct decoded_instruction *decoded)
+{
+	uint8_t *cursor;
+	uint8_t first;
+
+	cursor = decoded->instruction;
+	first = *cursor;
+
+	if (!is_rex_prefix(first)) {
+		if (is_prefix(first))
+			return (-1);
+	} else {
+		decoded->rex_b = first & REX_B_MASK;
+		decoded->rex_x = first & REX_X_MASK;
+		decoded->rex_r = first & REX_R_MASK;
+		decoded->rex_w = first & REX_W_MASK;
+		cursor++;
+	}
+
+	decoded->opcode = cursor;
+	return (0);
+}
+
+/*
+ * Looks the opcode byte up in one_byte_opcodes[].  An opcode without an
+ * entry is not understood and yields -1.  Otherwise the opcode's flags are
+ * saved and, if the opcode takes a ModR/M byte, the modrm pointer is set to
+ * the byte following the opcode.  Returns 0 on success.
+ */
+static int
+decode_opcode(struct decoded_instruction *decoded)
+{
+	uint8_t attrs;
+
+	attrs = one_byte_opcodes[*decoded->opcode];
+	if (attrs == 0)
+		return (-1);
+
+	decoded->opcode_flags = attrs;
+	if ((attrs & HAS_MODRM) != 0)
+		decoded->modrm = decoded->opcode + 1;
+
+	return (0);
+}
+
+/*
+ * Decodes the instruction's ModR/M field, if it has one: splits it into
+ * addressing mode, rm and reg (extending rm/reg with REX.B/REX.R), records
+ * where a SIB byte sits when the encoding calls for one, and records where
+ * the displacement bytes start for the disp8/disp32 modes.
+ *
+ * Returns 0 on success, or -1 for the mod=0, rm=5 form (a bare disp32 /
+ * RIP-relative operand), which this emulator does not handle.
+ */
+static int
+decode_mod_rm(struct decoded_instruction *decoded)
+{
+	uint8_t modrm, raw_rm;
+	uint8_t *extension_operands;
+
+	if (decoded->modrm == NULL)
+		return (0);
+
+	modrm = *decoded->modrm;
+
+	decoded->addressing_mode = (modrm & MODRM_MOD_MASK) >> MODRM_MOD_SHIFT;
+	raw_rm = (modrm & MODRM_RM_MASK) >> MODRM_RM_SHIFT;
+	decoded->rm = raw_rm;
+	decoded->reg = (modrm & MODRM_REG_MASK) >> MODRM_REG_SHIFT;
+
+	if (decoded->rex_b)
+		decoded->rm |= (1<<3);
+
+	if (decoded->rex_r)
+		decoded->reg |= (1<<3);
+
+	extension_operands = decoded->modrm + 1;
+
+	/*
+	 * The SIB and disp32 escapes are selected by the three rm bits of the
+	 * ModR/M byte alone; REX.B does not take part in that decision.  They
+	 * also exist only for the memory forms: with mod=3, rm=4 is simply
+	 * the RSP/R12 register.
+	 */
+	if (decoded->addressing_mode != MOD_DIRECT) {
+		if (raw_rm == RM_SIB) {
+			decoded->sib = decoded->modrm + 1;
+			extension_operands = decoded->sib + 1;
+		} else if (raw_rm == RM_DISP32 &&
+		    decoded->addressing_mode == MOD_INDIRECT) {
+			/* Not [rbp]: there is no base register at all */
+			return (-1);
+		}
+	}
+
+	switch (decoded->addressing_mode) {
+	case MOD_INDIRECT:
+	case MOD_DIRECT:
+		decoded->displacement = 0;
+		break;
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		decoded->displacement = extension_operands;
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * SIB-based addressing is not emulated.  Reports -1 when the ModR/M decode
+ * found a SIB byte and 0 when it did not; nothing is actually decoded.
+ */
+static int
+decode_sib(struct decoded_instruction *decoded)
+{
+
+	return (decoded->sib != NULL ? -1 : 0);
+}
+
+/*
+ * Reads the instruction's displacement, if it has one, into decoded->disp.
+ * A disp8 is a signed byte and is sign-extended; a disp32 is read as is.
+ * Immediates are not currently supported, so if an immediate is present
+ * -1 is returned to indicate an error; otherwise 0.
+ */
+static int
+decode_extension_operands(struct decoded_instruction *decoded)
+{
+
+	if (decoded->displacement) {
+		if (decoded->addressing_mode == MOD_INDIRECT_DISP8) {
+			/*
+			 * Go through int8_t first: widening the raw uint8_t
+			 * would turn e.g. -8 (0xF8) into +248.
+			 */
+			decoded->disp = (int32_t)(int8_t)*decoded->displacement;
+		} else if (decoded->addressing_mode == MOD_INDIRECT_DISP32) {
+			decoded->disp = *((int32_t*)decoded->displacement);
+		}
+	}
+
+	if (decoded->immediate) {
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Decodes the instruction at 'instr' into 'decoded'.  The structure is
+ * zeroed, then each decode stage runs in order; the first stage that fails
+ * ends the decode and its error is returned.  Returns 0 when every stage
+ * succeeds.
+ */
+static int
+decode_instruction(void *instr, struct decoded_instruction *decoded)
+{
+	static int (* const stages[])(struct decoded_instruction *) = {
+		decode_prefixes,
+		decode_opcode,
+		decode_mod_rm,
+		decode_sib,
+		decode_extension_operands
+	};
+	size_t i;
+	int error;
+
+	bzero(decoded, sizeof(*decoded));
+	decoded->instruction = instr;
+
+	for (i = 0; i < sizeof(stages) / sizeof(stages[0]); i++) {
+		error = stages[i](decoded);
+		if (error)
+			return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Returns the first registered emulated region whose [start, end] range
+ * contains 'addr', or a null pointer when no region does.
+ */
+static struct memory_region *
+find_region(uintptr_t addr)
+{
+	struct memory_region *candidate;
+	int n;
+
+	n = 0;
+	while (n < registered_regions) {
+		candidate = &emulated_regions[n];
+		if (!(addr < candidate->start || addr > candidate->end))
+			return (candidate);
+		n++;
+	}
+
+	return (0);
+}
+
+/*
+ * Maps a decoded register number (a REG_* value) to the register name used
+ * by the vmm API, by table lookup.
+ */
+static enum vm_reg_name
+get_vm_reg_name(uint8_t reg)
+{
+	enum vm_reg_name name;
+
+	name = vm_reg_name_mappings[reg];
+	return (name);
+}
+
+/*
+ * Fetches the source operand of a decoded instruction into *operand.
+ *
+ * The source is the r/m operand (FROM_RM) or the reg operand (FROM_REG).
+ * A register source yields the register's value.  A memory source is
+ * resolved to a guest physical address and read through the memread
+ * callback of the emulated region containing it.
+ *
+ * Returns 0 on success, the error from vm_get_register() or the callback,
+ * or -1 if the operand form is unsupported or the address is not inside an
+ * emulated region.
+ *
+ * NOTE(review): the access size passed to the callback is always 4,
+ * whatever REX.W says.
+ */
+static int
+get_operand(struct vmctx *vm, int vcpu, uint64_t guest_cr3,
+    const struct decoded_instruction *instruction, uint64_t *operand)
+{
+	enum vm_reg_name regname;
+	uint64_t base_val, gla;
+	uintptr_t target;
+	int error;
+	uint8_t rm, addressing_mode;
+	struct memory_region *emulated_memory;
+
+	if (instruction->opcode_flags & FROM_RM) {
+		rm = instruction->rm;
+		addressing_mode = instruction->addressing_mode;
+	} else if (instruction->opcode_flags & FROM_REG) {
+		rm = instruction->reg;
+		addressing_mode = MOD_DIRECT;
+	} else
+		return (-1);
+
+	regname = get_vm_reg_name(rm);
+	error = vm_get_register(vm, vcpu, regname, &base_val);
+	if (error)
+		return (error);
+
+	switch (addressing_mode) {
+	case MOD_DIRECT:
+		*operand = base_val;
+		return (0);
+	case MOD_INDIRECT:
+		gla = base_val;
+		break;
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		/*
+		 * The displacement belongs to the linear address.  Adding it
+		 * after translation gives the wrong physical address whenever
+		 * base and base+disp are on different pages.
+		 */
+		gla = base_val + (uint64_t)(int64_t)instruction->disp;
+		break;
+	default:
+		return (-1);
+	}
+
+	target = gla2gpa(gla, guest_cr3);
+	emulated_memory = find_region(target);
+	if (emulated_memory == NULL)
+		return (-1);
+
+	return emulated_memory->memread(vm, vcpu, target,
+	    4, operand,
+	    emulated_memory->arg);
+}
+
+/*
+ * Stores 'operand' into the destination of a decoded instruction.
+ *
+ * The destination is the r/m operand (TO_RM) or the reg operand (TO_REG).
+ * A register destination is written with vm_set_register().  A memory
+ * destination is resolved to a guest physical address and written through
+ * the memwrite callback of the emulated region containing it.  The same
+ * memory addressing modes as in get_operand() are accepted.
+ *
+ * Returns 0 on success, the error from the vmm register calls or the
+ * callback, or -1 if the operand form is unsupported or the address is not
+ * inside an emulated region.
+ *
+ * NOTE(review): the access size passed to the callback is always 4,
+ * whatever REX.W says.
+ */
+static int
+perform_write(struct vmctx *vm, int vcpu, uint64_t guest_cr3,
+    const struct decoded_instruction *instruction, uint64_t operand)
+{
+	enum vm_reg_name regname;
+	uintptr_t target;
+	int error;
+	uint64_t base_val, gla;
+	struct memory_region *emulated_memory;
+	uint8_t regnum, addressing_mode;
+
+	if (instruction->opcode_flags & TO_RM) {
+		regnum = instruction->rm;
+		addressing_mode = instruction->addressing_mode;
+	} else if (instruction->opcode_flags & TO_REG) {
+		regnum = instruction->reg;
+		addressing_mode = MOD_DIRECT;
+	} else
+		return (-1);
+
+	regname = get_vm_reg_name(regnum);
+	error = vm_get_register(vm, vcpu, regname, &base_val);
+	if (error)
+		return (error);
+
+	switch(addressing_mode) {
+	case MOD_DIRECT:
+		return vm_set_register(vm, vcpu, regname, operand);
+	case MOD_INDIRECT:
+		gla = base_val;
+		break;
+	case MOD_INDIRECT_DISP8:
+	case MOD_INDIRECT_DISP32:
+		/* Displacement is applied to the linear address */
+		gla = base_val + (uint64_t)(int64_t)instruction->disp;
+		break;
+	default:
+		return (-1);
+	}
+
+	target = gla2gpa(gla, guest_cr3);
+	emulated_memory = find_region(target);
+	if (emulated_memory == NULL)
+		return (-1);
+
+	return emulated_memory->memwrite(vm, vcpu, target,
+	    4, operand,
+	    emulated_memory->arg);
+}
+
+/*
+ * Executes an already decoded instruction: fetch the source operand, then
+ * store it to the destination.  Returns 0 on success or the error of the
+ * step that failed; the store is not attempted if the fetch fails.
+ */
+static int
+emulate_decoded_instruction(struct vmctx *vm, int vcpu, uint64_t cr3,
+    const struct decoded_instruction *instruction)
+{
+	uint64_t value;
+	int rc;
+
+	rc = get_operand(vm, vcpu, cr3, instruction, &value);
+	if (rc == 0)
+		rc = perform_write(vm, vcpu, cr3, instruction, value);
+
+	return (rc);
+}
+
+int
+emulate_instruction(struct vmctx *vm, int vcpu, uint64_t rip, uint64_t cr3)
+{
+ struct decoded_instruction instr;
+ int error;
+ void *instruction = gla2hla(rip, cr3);
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-projects
mailing list