svn commit: r284899 - in stable/10: lib/libvmmapi share/examples/bhyve sys/amd64/include sys/amd64/vmm sys/amd64/vmm/amd sys/amd64/vmm/intel sys/amd64/vmm/io sys/kern usr.sbin/bhyve usr.sbin/bhyvectl

Neel Natu neel at FreeBSD.org
Sun Jun 28 01:21:59 UTC 2015


Author: neel
Date: Sun Jun 28 01:21:55 2015
New Revision: 284899
URL: https://svnweb.freebsd.org/changeset/base/284899

Log:
  MFC r279444:
  Allow passthrough devices to be hinted.
  
  MFC r279683:
  When ICW1 is issued the edge sense circuit is reset, which means that
  following an initialization a low-to-high transition is necessary to
  generate an interrupt.
  
  MFC r279925:
  Add a -p parameter to list PCI devices to pass through to the guest.
  
  MFC r281559:
  Fix handling of BUS_PROBE_NOWILDCARD in 'device_probe_child()'.
  
  MFC r280447:
  When fetching an instruction in non-64bit mode, consider the value of the
  code segment base address.
  
  MFC r280725:
  Move legacy interrupt allocation for virtio devices to common code.
  
  MFC r280775:
  Fix the RTC device model to operate correctly in 12-hour mode.
  
  MFC r280929:
  Fix "MOVS" instruction memory to MMIO emulation.
  
  MFC r280968:
  Display instruction bytes and %rip prior to aborting due to an instruction
  emulation error.
  
  MFC r281145:
  Enhance the support for Group 1 Extended opcodes for CMP, AND, OR instructions.
  
  MFC r281542:
  Initialize 'error' before use (Coverity IDs 1249748, 1249747, 1249751, 1249749)
  
  MFC r281561:
  Prior to aborting due to an ioport error, it is always interesting to see what
  the guest's %rip is.
  
  MFC r281611:
  If the number of guest vcpus is less than '1' then flag it as an error.
  
  MFC r281612:
  Prefer 'vcpu_should_yield()' over checking 'curthread->td_flags' directly.
  
  MFC r281630:
  Relax the check on which vectors can be delivered through the APIC. According
  to the Intel SDM vectors 16 through 255 are allowed to be delivered via the
  local APIC.
  
  MFC r281879:
  Missing break in switch case (Coverity ID 1292499)
  
  MFC r281946:
  Don't allow guest to modify readonly bits in the PCI config 'status' register.
  
  MFC r281987:
  STOS/STOSB/STOSW/STOSD/STOSQ instruction emulation.
  
  MFC r282206:
  Implement the century byte in the RTC.

Modified:
  stable/10/lib/libvmmapi/vmmapi.c
  stable/10/lib/libvmmapi/vmmapi.h
  stable/10/share/examples/bhyve/vmrun.sh
  stable/10/sys/amd64/include/vmm.h
  stable/10/sys/amd64/include/vmm_instruction_emul.h
  stable/10/sys/amd64/vmm/amd/svm.c
  stable/10/sys/amd64/vmm/intel/vmx.c
  stable/10/sys/amd64/vmm/io/ppt.c
  stable/10/sys/amd64/vmm/io/vatpic.c
  stable/10/sys/amd64/vmm/io/vrtc.c
  stable/10/sys/amd64/vmm/vmm.c
  stable/10/sys/amd64/vmm/vmm_dev.c
  stable/10/sys/amd64/vmm/vmm_instruction_emul.c
  stable/10/sys/amd64/vmm/vmm_lapic.c
  stable/10/sys/kern/subr_bus.c
  stable/10/usr.sbin/bhyve/acpi.c
  stable/10/usr.sbin/bhyve/bhyverun.c
  stable/10/usr.sbin/bhyve/pci_emul.c
  stable/10/usr.sbin/bhyve/pci_virtio_block.c
  stable/10/usr.sbin/bhyve/pci_virtio_net.c
  stable/10/usr.sbin/bhyve/virtio.c
  stable/10/usr.sbin/bhyvectl/bhyvectl.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/lib/libvmmapi/vmmapi.c
==============================================================================
--- stable/10/lib/libvmmapi/vmmapi.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/lib/libvmmapi/vmmapi.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -979,6 +979,18 @@ gla2gpa(struct vmctx *ctx, int vcpu, str
 	return (error);
 }
 
+int
+vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+    uint64_t gla, int prot, uint64_t *gpa)
+{
+	int error, fault;
+
+	error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa);
+	if (fault)
+		error = fault;
+	return (error);
+}
+
 #ifndef min
 #define	min(a,b)	(((a) < (b)) ? (a) : (b))
 #endif

Modified: stable/10/lib/libvmmapi/vmmapi.h
==============================================================================
--- stable/10/lib/libvmmapi/vmmapi.h	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/lib/libvmmapi/vmmapi.h	Sun Jun 28 01:21:55 2015	(r284899)
@@ -63,6 +63,8 @@ int	vm_get_memory_seg(struct vmctx *ctx,
 int	vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
 void	*vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
 int	vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
+int	vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
+		   uint64_t gla, int prot, uint64_t *gpa);
 uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
 void	vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
 void	vm_set_memflags(struct vmctx *ctx, int flags);

Modified: stable/10/share/examples/bhyve/vmrun.sh
==============================================================================
--- stable/10/share/examples/bhyve/vmrun.sh	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/share/examples/bhyve/vmrun.sh	Sun Jun 28 01:21:55 2015	(r284899)
@@ -62,6 +62,7 @@ usage() {
 	echo "       -i: force boot of the Installation CDROM image"
 	echo "       -I: Installation CDROM image location (default is ${DEFAULT_ISOFILE})"
 	echo "       -m: memory size (default is ${DEFAULT_MEMSIZE})"
+	echo "       -p: pass-through a host PCI device at bus/slot/func (e.g. 10/0/0)"
 	echo "       -t: tap device for virtio-net (default is $DEFAULT_TAPDEV)"
 	echo ""
 	[ -n "$msg" ] && errmsg "$msg"
@@ -89,8 +90,9 @@ disk_total=0
 apic_opt=""
 gdbport=0
 loader_opt=""
+pass_total=0
 
-while getopts ac:C:d:e:g:hH:iI:m:t: c ; do
+while getopts ac:C:d:e:g:hH:iI:m:p:t: c ; do
 	case $c in
 	a)
 		apic_opt="-a"
@@ -126,6 +128,10 @@ while getopts ac:C:d:e:g:hH:iI:m:t: c ; 
 	m)
 		memsize=${OPTARG}
 		;;
+	p)
+		eval "pass_dev${pass_total}=\"${OPTARG}\""
+		pass_total=$(($pass_total + 1))
+		;;
 	t)
 		eval "tap_dev${tap_total}=\"${OPTARG}\""
 		tap_total=$(($tap_total + 1))
@@ -249,6 +255,14 @@ while [ 1 ]; do
 	    i=$(($i + 1))
 	done
 
+	i=0
+	while [ $i -lt $pass_total ] ; do
+	    eval "pass=\$pass_dev${i}"
+	    devargs="$devargs -s $nextslot:0,passthru,${pass} "
+	    nextslot=$(($nextslot + 1))
+	    i=$(($i + 1))
+        done
+
 	${FBSDRUN} -c ${cpus} -m ${memsize} ${apic_opt} -A -H -P	\
 		-g ${gdbport}						\
 		-s 0:0,hostbridge					\

Modified: stable/10/sys/amd64/include/vmm.h
==============================================================================
--- stable/10/sys/amd64/include/vmm.h	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/include/vmm.h	Sun Jun 28 01:21:55 2015	(r284899)
@@ -551,6 +551,7 @@ struct vm_exit {
 		struct {
 			uint64_t	gpa;
 			uint64_t	gla;
+			uint64_t	cs_base;
 			int		cs_d;		/* CS.D */
 			struct vm_guest_paging paging;
 			struct vie	vie;

Modified: stable/10/sys/amd64/include/vmm_instruction_emul.h
==============================================================================
--- stable/10/sys/amd64/include/vmm_instruction_emul.h	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/include/vmm_instruction_emul.h	Sun Jun 28 01:21:55 2015	(r284899)
@@ -90,7 +90,7 @@ int vmm_fetch_instruction(struct vm *vm,
  * Returns 1 if an exception was injected into the guest.
  * Returns -1 otherwise.
  */
-int vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
     uint64_t gla, int prot, uint64_t *gpa);
 
 void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);

Modified: stable/10/sys/amd64/vmm/amd/svm.c
==============================================================================
--- stable/10/sys/amd64/vmm/amd/svm.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/vmm/amd/svm.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -799,8 +799,14 @@ svm_handle_inst_emul(struct vmcb *vmcb, 
 	KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
 
 	switch(paging->cpu_mode) {
+	case CPU_MODE_REAL:
+		vmexit->u.inst_emul.cs_base = seg.base;
+		vmexit->u.inst_emul.cs_d = 0;
+		break;
 	case CPU_MODE_PROTECTED:
 	case CPU_MODE_COMPATIBILITY:
+		vmexit->u.inst_emul.cs_base = seg.base;
+
 		/*
 		 * Section 4.8.1 of APM2, Default Operand Size or D bit.
 		 */
@@ -808,6 +814,7 @@ svm_handle_inst_emul(struct vmcb *vmcb, 
 		    1 : 0;
 		break;
 	default:
+		vmexit->u.inst_emul.cs_base = 0;
 		vmexit->u.inst_emul.cs_d = 0;
 		break;	
 	}
@@ -1911,7 +1918,7 @@ svm_vmrun(void *arg, int vcpu, register_
 		}
 
 		/* We are asked to give the cpu by scheduler. */
-		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
+		if (vcpu_should_yield(vm, vcpu)) {
 			enable_gintr();
 			vm_exit_astpending(vm, vcpu, state->rip);
 			break;

Modified: stable/10/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- stable/10/sys/amd64/vmm/intel/vmx.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/vmm/intel/vmx.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -1785,12 +1785,18 @@ vmexit_inst_emul(struct vm_exit *vmexit,
 	vmexit->u.inst_emul.gla = gla;
 	vmx_paging_info(paging);
 	switch (paging->cpu_mode) {
+	case CPU_MODE_REAL:
+		vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
+		vmexit->u.inst_emul.cs_d = 0;
+		break;
 	case CPU_MODE_PROTECTED:
 	case CPU_MODE_COMPATIBILITY:
+		vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
 		csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 		vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
 		break;
 	default:
+		vmexit->u.inst_emul.cs_base = 0;
 		vmexit->u.inst_emul.cs_d = 0;
 		break;
 	}

Modified: stable/10/sys/amd64/vmm/io/ppt.c
==============================================================================
--- stable/10/sys/amd64/vmm/io/ppt.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/vmm/io/ppt.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -56,7 +56,6 @@ __FBSDID("$FreeBSD$");
 
 /* XXX locking */
 
-#define	MAX_PPTDEVS	(sizeof(pptdevs) / sizeof(pptdevs[0]))
 #define	MAX_MSIMSGS	32
 
 /*
@@ -77,9 +76,10 @@ struct pptintr_arg {				/* pptintr(pptin
 	uint64_t	msg_data;
 };
 
-static struct pptdev {
+struct pptdev {
 	device_t	dev;
 	struct vm	*vm;			/* owner of this device */
+	TAILQ_ENTRY(pptdev)	next;
 	struct vm_memory_segment mmio[MAX_MMIOSEGS];
 	struct {
 		int	num_msgs;		/* guest state */
@@ -99,7 +99,7 @@ static struct pptdev {
 		void **cookie;
 		struct pptintr_arg *arg;
 	} msix;
-} pptdevs[64];
+};
 
 SYSCTL_DECL(_hw_vmm);
 SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");
@@ -108,6 +108,8 @@ static int num_pptdevs;
 SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
     "number of pci passthru devices");
 
+static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
+
 static int
 ppt_probe(device_t dev)
 {
@@ -125,26 +127,30 @@ ppt_probe(device_t dev)
 	 * - be allowed by administrator to be used in this role
 	 * - be an endpoint device
 	 */
-	if (vmm_is_pptdev(bus, slot, func) &&
-	    (dinfo->cfg.hdrtype & PCIM_HDRTYPE) == PCIM_HDRTYPE_NORMAL)
+	if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
+		return (ENXIO);
+	else if (vmm_is_pptdev(bus, slot, func))
 		return (0);
 	else
-		return (ENXIO);
+		/*
+		 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
+		 * SR-IOV infrastructure specified as "ppt" passthrough devices.
+		 * All normal devices that did not have "ppt" specified as their
+		 * driver will not be matched by this.
+		 */
+		return (BUS_PROBE_NOWILDCARD);
 }
 
 static int
 ppt_attach(device_t dev)
 {
-	int n;
+	struct pptdev *ppt;
 
-	if (num_pptdevs >= MAX_PPTDEVS) {
-		printf("ppt_attach: maximum number of pci passthrough devices "
-		       "exceeded\n");
-		return (ENXIO);
-	}
+	ppt = device_get_softc(dev);
 
-	n = num_pptdevs++;
-	pptdevs[n].dev = dev;
+	num_pptdevs++;
+	TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
+	ppt->dev = dev;
 
 	if (bootverbose)
 		device_printf(dev, "attached\n");
@@ -155,10 +161,14 @@ ppt_attach(device_t dev)
 static int
 ppt_detach(device_t dev)
 {
-	/*
-	 * XXX check whether there are any pci passthrough devices assigned
-	 * to guests before we allow this driver to detach.
-	 */
+	struct pptdev *ppt;
+
+	ppt = device_get_softc(dev);
+
+	if (ppt->vm != NULL)
+		return (EBUSY);
+	num_pptdevs--;
+	TAILQ_REMOVE(&pptdev_list, ppt, next);
 
 	return (0);
 }
@@ -172,22 +182,23 @@ static device_method_t ppt_methods[] = {
 };
 
 static devclass_t ppt_devclass;
-DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, 0);
+DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
 DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
 
 static struct pptdev *
 ppt_find(int bus, int slot, int func)
 {
 	device_t dev;
-	int i, b, s, f;
+	struct pptdev *ppt;
+	int b, s, f;
 
-	for (i = 0; i < num_pptdevs; i++) {
-		dev = pptdevs[i].dev;
+	TAILQ_FOREACH(ppt, &pptdev_list, next) {
+		dev = ppt->dev;
 		b = pci_get_bus(dev);
 		s = pci_get_slot(dev);
 		f = pci_get_function(dev);
 		if (bus == b && slot == s && func == f)
-			return (&pptdevs[i]);
+			return (ppt);
 	}
 	return (NULL);
 }
@@ -297,11 +308,12 @@ ppt_avail_devices(void)
 int
 ppt_assigned_devices(struct vm *vm)
 {
-	int i, num;
+	struct pptdev *ppt;
+	int num;
 
 	num = 0;
-	for (i = 0; i < num_pptdevs; i++) {
-		if (pptdevs[i].vm == vm)
+	TAILQ_FOREACH(ppt, &pptdev_list, next) {
+		if (ppt->vm == vm)
 			num++;
 	}
 	return (num);
@@ -310,12 +322,11 @@ ppt_assigned_devices(struct vm *vm)
 boolean_t
 ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
 {
-	int i, n;
+	int i;
 	struct pptdev *ppt;
 	struct vm_memory_segment *seg;
 
-	for (n = 0; n < num_pptdevs; n++) {
-		ppt = &pptdevs[n];
+	TAILQ_FOREACH(ppt, &pptdev_list, next) {
 		if (ppt->vm != vm)
 			continue;
 
@@ -377,12 +388,13 @@ ppt_unassign_device(struct vm *vm, int b
 int
 ppt_unassign_all(struct vm *vm)
 {
-	int i, bus, slot, func;
+	struct pptdev *ppt;
+	int bus, slot, func;
 	device_t dev;
 
-	for (i = 0; i < num_pptdevs; i++) {
-		if (pptdevs[i].vm == vm) {
-			dev = pptdevs[i].dev;
+	TAILQ_FOREACH(ppt, &pptdev_list, next) {
+		if (ppt->vm == vm) {
+			dev = ppt->dev;
 			bus = pci_get_bus(dev);
 			slot = pci_get_slot(dev);
 			func = pci_get_function(dev);

Modified: stable/10/sys/amd64/vmm/io/vatpic.c
==============================================================================
--- stable/10/sys/amd64/vmm/io/vatpic.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/vmm/io/vatpic.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -275,6 +275,7 @@ vatpic_icw1(struct vatpic *vatpic, struc
 	atpic->ready = false;
 
 	atpic->icw_num = 1;
+	atpic->request = 0;
 	atpic->mask = 0;
 	atpic->lowprio = 7;
 	atpic->rd_cmd_reg = 0;

Modified: stable/10/sys/amd64/vmm/io/vrtc.c
==============================================================================
--- stable/10/sys/amd64/vmm/io/vrtc.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/vmm/io/vrtc.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -63,9 +63,12 @@ struct rtcdev {
 	uint8_t	reg_b;
 	uint8_t	reg_c;
 	uint8_t	reg_d;
-	uint8_t	nvram[128 - 14];
+	uint8_t	nvram[36];
+	uint8_t	century;
+	uint8_t	nvram2[128 - 51];
 } __packed;
 CTASSERT(sizeof(struct rtcdev) == 128);
+CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY);
 
 struct vrtc {
 	struct vm	*vm;
@@ -214,9 +217,27 @@ secs_to_rtc(time_t rtctime, struct vrtc 
 	rtc->sec = rtcset(rtc, ct.sec);
 	rtc->min = rtcset(rtc, ct.min);
 
-	hour = ct.hour;
-	if ((rtc->reg_b & RTCSB_24HR) == 0)
-		hour = (hour % 12) + 1;	    /* convert to a 12-hour format */
+	if (rtc->reg_b & RTCSB_24HR) {
+		hour = ct.hour;
+	} else {
+		/*
+		 * Convert to the 12-hour format.
+		 */
+		switch (ct.hour) {
+		case 0:			/* 12 AM */
+		case 12:		/* 12 PM */
+			hour = 12;
+			break;
+		default:
+			/*
+			 * The remaining 'ct.hour' values are interpreted as:
+			 * [1  - 11] ->  1 - 11 AM
+			 * [13 - 23] ->  1 - 11 PM
+			 */
+			hour = ct.hour % 12;
+			break;
+		}
+	}
 
 	rtc->hour = rtcset(rtc, hour);
 
@@ -227,6 +248,7 @@ secs_to_rtc(time_t rtctime, struct vrtc 
 	rtc->day_of_month = rtcset(rtc, ct.day);
 	rtc->month = rtcset(rtc, ct.mon);
 	rtc->year = rtcset(rtc, ct.year % 100);
+	rtc->century = rtcset(rtc, ct.year / 100);
 }
 
 static int
@@ -256,7 +278,7 @@ rtc_to_secs(struct vrtc *vrtc)
 	struct timespec ts;
 	struct rtcdev *rtc;
 	struct vm *vm;
-	int error, hour, pm, year;
+	int century, error, hour, pm, year;
 
 	KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__));
 
@@ -287,9 +309,26 @@ rtc_to_secs(struct vrtc *vrtc)
 	}
 	error = rtcget(rtc, hour, &ct.hour);
 	if ((rtc->reg_b & RTCSB_24HR) == 0) {
-		ct.hour -= 1;
-		if (pm)
-			ct.hour += 12;
+		if (ct.hour >= 1 && ct.hour <= 12) {
+			/*
+			 * Convert from 12-hour format to internal 24-hour
+			 * representation as follows:
+			 *
+			 *    12-hour format		ct.hour
+			 *	12	AM		0
+			 *	1 - 11	AM		1 - 11
+			 *	12	PM		12
+			 *	1 - 11	PM		13 - 23
+			 */
+			if (ct.hour == 12)
+				ct.hour = 0;
+			if (pm)
+				ct.hour += 12;
+		} else {
+			VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d",
+			    rtc->hour, ct.hour);
+			goto fail;
+		}
 	}
 
 	if (error || ct.hour < 0 || ct.hour > 23) {
@@ -323,10 +362,14 @@ rtc_to_secs(struct vrtc *vrtc)
 		VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year);
 		goto fail;
 	}
-	if (year >= 70)
-		ct.year = 1900 + year;
-	else
-		ct.year = 2000 + year;
+
+	error = rtcget(rtc, rtc->century, &century);
+	ct.year = century * 100 + year;
+	if (error || ct.year < POSIX_BASE_YEAR) {
+		VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century,
+		    ct.year);
+		goto fail;
+	}
 
 	error = clock_ct_to_ts(&ct, &ts);
 	if (error || ts.tv_sec < 0) {
@@ -338,7 +381,12 @@ rtc_to_secs(struct vrtc *vrtc)
 	}
 	return (ts.tv_sec);		/* success */
 fail:
-	return (VRTC_BROKEN_TIME);	/* failure */
+	/*
+	 * Stop updating the RTC if the date/time fields programmed by
+	 * the guest are invalid.
+	 */
+	VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected");
+	return (VRTC_BROKEN_TIME);
 }
 
 static int
@@ -593,13 +641,6 @@ vrtc_set_reg_b(struct vrtc *vrtc, uint8_
 		if ((newval & RTCSB_HALT) == 0) {
 			rtctime = rtc_to_secs(vrtc);
 			if (rtctime == VRTC_BROKEN_TIME) {
-				/*
-				 * Stop updating the RTC if the date/time
-				 * programmed by the guest is not correct.
-				 */
-				VM_CTR0(vrtc->vm, "Invalid RTC date/time "
-				    "programming detected");
-
 				if (rtc_flag_broken_time)
 					return (-1);
 			}
@@ -742,7 +783,7 @@ vrtc_nvram_write(struct vm *vm, int offs
 	 * Don't allow writes to RTC control registers or the date/time fields.
 	 */
 	if (offset < offsetof(struct rtcdev, nvram[0]) ||
-	    offset >= sizeof(struct rtcdev)) {
+	    offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) {
 		VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d",
 		    offset);
 		return (EINVAL);
@@ -776,7 +817,7 @@ vrtc_nvram_read(struct vm *vm, int offse
 	/*
 	 * Update RTC date/time fields if necessary.
 	 */
-	if (offset < 10) {
+	if (offset < 10 || offset == RTC_CENTURY) {
 		curtime = vrtc_curtime(vrtc);
 		secs_to_rtc(curtime, vrtc, 0);
 	}
@@ -837,13 +878,17 @@ vrtc_data_handler(struct vm *vm, int vcp
 	curtime = vrtc_curtime(vrtc);
 	vrtc_time_update(vrtc, curtime);
 
-	if (in) {
-		/*
-		 * Update RTC date/time fields if necessary.
-		 */
-		if (offset < 10)
-			secs_to_rtc(curtime, vrtc, 0);
+	/*
+	 * Update RTC date/time fields if necessary.
+	 *
+	 * This is not just for reads of the RTC. The side-effect of writing
+	 * the century byte requires other RTC date/time fields (e.g. sec)
+	 * to be updated here.
+	 */
+	if (offset < 10 || offset == RTC_CENTURY)
+		secs_to_rtc(curtime, vrtc, 0);
 
+	if (in) {
 		if (offset == 12) {
 			/*
 			 * XXX
@@ -887,6 +932,18 @@ vrtc_data_handler(struct vm *vm, int vcp
 			*((uint8_t *)rtc + offset) = *val;
 			break;
 		}
+
+		/*
+		 * XXX some guests (e.g. OpenBSD) write the century byte
+		 * outside of RTCSB_HALT so re-calculate the RTC date/time.
+		 */
+		if (offset == RTC_CENTURY && !rtc_halted(vrtc)) {
+			curtime = rtc_to_secs(vrtc);
+			error = vrtc_time_update(vrtc, curtime);
+			KASSERT(!error, ("vrtc_time_update error %d", error));
+			if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time)
+				error = -1;
+		}
 	}
 	VRTC_UNLOCK(vrtc);
 	return (error);

Modified: stable/10/sys/amd64/vmm/vmm.c
==============================================================================
--- stable/10/sys/amd64/vmm/vmm.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/vmm/vmm.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -218,6 +218,11 @@ SYSCTL_INT(_hw_vmm, OID_AUTO, trace_gues
     &trace_guest_exceptions, 0,
     "Trap into hypervisor on all guest exceptions and reflect them back");
 
+static int vmm_force_iommu = 0;
+TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu);
+SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0,
+    "Force use of I/O MMU even if no passthrough devices were found.");
+
 static void
 vcpu_cleanup(struct vm *vm, int i, bool destroy)
 {
@@ -322,7 +327,7 @@ vmm_handler(module_t mod, int what, void
 	switch (what) {
 	case MOD_LOAD:
 		vmmdev_init();
-		if (ppt_avail_devices() > 0)
+		if (vmm_force_iommu || ppt_avail_devices() > 0)
 			iommu_init();
 		error = vmm_init();
 		if (error == 0)
@@ -1248,7 +1253,7 @@ vm_handle_inst_emul(struct vm *vm, int v
 	struct vie *vie;
 	struct vcpu *vcpu;
 	struct vm_exit *vme;
-	uint64_t gla, gpa;
+	uint64_t gla, gpa, cs_base;
 	struct vm_guest_paging *paging;
 	mem_region_read_t mread;
 	mem_region_write_t mwrite;
@@ -1260,6 +1265,7 @@ vm_handle_inst_emul(struct vm *vm, int v
 
 	gla = vme->u.inst_emul.gla;
 	gpa = vme->u.inst_emul.gpa;
+	cs_base = vme->u.inst_emul.cs_base;
 	cs_d = vme->u.inst_emul.cs_d;
 	vie = &vme->u.inst_emul.vie;
 	paging = &vme->u.inst_emul.paging;
@@ -1274,8 +1280,8 @@ vm_handle_inst_emul(struct vm *vm, int v
 		 * maximum size instruction.
 		 */
 		length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE;
-		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
-		    length, vie);
+		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
+		    cs_base, length, vie);
 	} else {
 		/*
 		 * The instruction bytes have already been copied into 'vie'
@@ -2328,7 +2334,7 @@ vm_copy_setup(struct vm *vm, int vcpuid,
 	remaining = len;
 	while (remaining > 0) {
 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
-		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
+		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
 		if (error)
 			return (error);
 		off = gpa & PAGE_MASK;

Modified: stable/10/sys/amd64/vmm/vmm_dev.c
==============================================================================
--- stable/10/sys/amd64/vmm/vmm_dev.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/vmm/vmm_dev.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -440,10 +440,10 @@ vmmdev_ioctl(struct cdev *cdev, u_long c
 		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
 		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
 		gg = (struct vm_gla2gpa *)data;
-		error = vmm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
+		error = vm_gla2gpa(sc->vm, gg->vcpuid, &gg->paging, gg->gla,
 		    gg->prot, &gg->gpa);
 		KASSERT(error == 0 || error == 1 || error == -1,
-		    ("%s: vmm_gla2gpa unknown error %d", __func__, error));
+		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
 		if (error >= 0) {
 			/*
 			 * error = 0: the translation was successful

Modified: stable/10/sys/amd64/vmm/vmm_instruction_emul.c
==============================================================================
--- stable/10/sys/amd64/vmm/vmm_instruction_emul.c	Sat Jun 27 23:28:56 2015	(r284898)
+++ stable/10/sys/amd64/vmm/vmm_instruction_emul.c	Sun Jun 28 01:21:55 2015	(r284899)
@@ -71,6 +71,8 @@ enum {
 	VIE_OP_TYPE_CMP,
 	VIE_OP_TYPE_POP,
 	VIE_OP_TYPE_MOVS,
+	VIE_OP_TYPE_GROUP1,
+	VIE_OP_TYPE_STOS,
 	VIE_OP_TYPE_LAST
 };
 
@@ -145,6 +147,16 @@ static const struct vie_op one_byte_opco
 		.op_type = VIE_OP_TYPE_MOVS,
 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
 	},
+	[0xAA] = {
+		.op_byte = 0xAA,
+		.op_type = VIE_OP_TYPE_STOS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
+	[0xAB] = {
+		.op_byte = 0xAB,
+		.op_type = VIE_OP_TYPE_STOS,
+		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+	},
 	[0xC6] = {
 		/* XXX Group 11 extended opcode - not just MOV */
 		.op_byte = 0xC6,
@@ -161,15 +173,15 @@ static const struct vie_op one_byte_opco
 		.op_type = VIE_OP_TYPE_AND,
 	},
 	[0x81] = {
-		/* XXX Group 1 extended opcode - not just AND */
+		/* XXX Group 1 extended opcode */
 		.op_byte = 0x81,
-		.op_type = VIE_OP_TYPE_AND,
+		.op_type = VIE_OP_TYPE_GROUP1,
 		.op_flags = VIE_OP_F_IMM,
 	},
 	[0x83] = {
-		/* XXX Group 1 extended opcode - not just OR */
+		/* XXX Group 1 extended opcode */
 		.op_byte = 0x83,
-		.op_type = VIE_OP_TYPE_OR,
+		.op_type = VIE_OP_TYPE_GROUP1,
 		.op_flags = VIE_OP_F_IMM8,
 	},
 	[0x8F] = {
@@ -634,7 +646,7 @@ emulate_movs(void *vm, int vcpuid, uint6
 #else
 	struct iovec copyinfo[2];
 #endif
-	uint64_t dstaddr, srcaddr, val;
+	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
 	uint64_t rcx, rdi, rsi, rflags;
 	int error, opsize, seg, repeat;
 
@@ -669,7 +681,7 @@ emulate_movs(void *vm, int vcpuid, uint6
 	 * (1)  memory		memory		n/a
 	 * (2)  memory		mmio		emulated
 	 * (3)  mmio		memory		emulated
-	 * (4)  mmio		mmio		not emulated
+	 * (4)  mmio		mmio		emulated
 	 *
 	 * At this point we don't have sufficient information to distinguish
 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
@@ -694,7 +706,8 @@ emulate_movs(void *vm, int vcpuid, uint6
 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
-		goto done;
+		if (error)
+			goto done;
 	} else if (error > 0) {
 		/*
 		 * Resume guest execution to handle fault.
@@ -705,37 +718,55 @@ emulate_movs(void *vm, int vcpuid, uint6
 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
 		 * if 'srcaddr' is in the mmio space.
 		 */
-	}
 
-	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
-	    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
-	if (error)
-		goto done;
-
-	error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
-	    PROT_WRITE, copyinfo, nitems(copyinfo));
-	if (error == 0) {
-		/*
-		 * case (3): read from MMIO and write to system memory.
-		 *
-		 * A MMIO read can have side-effects so we commit to it
-		 * only after vm_copy_setup() is successful. If a page-fault
-		 * needs to be injected into the guest then it will happen
-		 * before the MMIO read is attempted.
-		 */
-		error = memread(vm, vcpuid, gpa, &val, opsize, arg);
+		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
 		if (error)
 			goto done;
 
-		vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
-		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-	} else if (error > 0) {
-		/*
-		 * Resume guest execution to handle fault.
-		 */
-		goto done;
-	} else {
-		goto done;
+		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
+		    PROT_WRITE, copyinfo, nitems(copyinfo));
+		if (error == 0) {
+			/*
+			 * case (3): read from MMIO and write to system memory.
+			 *
+			 * A MMIO read can have side-effects so we
+			 * commit to it only after vm_copy_setup() is
+			 * successful. If a page-fault needs to be
+			 * injected into the guest then it will happen
+			 * before the MMIO read is attempted.
+			 */
+			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
+			if (error)
+				goto done;
+
+			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
+			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+		} else if (error > 0) {
+			/*
+			 * Resume guest execution to handle fault.
+			 */
+			goto done;
+		} else {
+			/*
+			 * Case (4): read from and write to mmio.
+			 */
+			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
+			    PROT_READ, &srcgpa);
+			if (error)
+				goto done;
+			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
+			if (error)
+				goto done;
+
+			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
+			   PROT_WRITE, &dstgpa);
+			if (error)
+				goto done;
+			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
+			if (error)
+				goto done;
+		}
 	}
 
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
@@ -783,6 +814,68 @@ done:
 }
 
 static int
+emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *arg)
+{
+	int error, opsize, repeat;
+	uint64_t val;
+	uint64_t rcx, rdi, rflags;
+
+	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
+	repeat = vie->repz_present | vie->repnz_present;
+
+	if (repeat) {
+		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
+		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
+
+		/*
+		 * The count register is %rcx, %ecx or %cx depending on the
+		 * address size of the instruction.
+		 */
+		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
+			return (0);
+	}
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
+
+	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
+	if (error)
+		return (error);
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
+	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
+
+	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+	if (rflags & PSL_D)
+		rdi -= opsize;
+	else
+		rdi += opsize;
+
+	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
+	    vie->addrsize);
+	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
+
+	if (repeat) {
+		rcx = rcx - 1;
+		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
+		    rcx, vie->addrsize);
+		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
+
+		/*
+		 * Repeat the instruction if the count register is not zero.
+		 */
+		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
+			vm_restart_instruction(vm, vcpuid);
+	}
+
+	return (0);
+}
+
+static int
 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 {
@@ -820,16 +913,18 @@ emulate_and(void *vm, int vcpuid, uint64
 		error = vie_update_register(vm, vcpuid, reg, result, size);
 		break;
 	case 0x81:
+	case 0x83:
 		/*
-		 * AND/OR mem (ModRM:r/m) with immediate and store the
+		 * AND mem (ModRM:r/m) with immediate and store the
 		 * result in mem.
 		 *
-		 * AND: i = 4
-		 * OR:  i = 1
-		 * 81 /i		op r/m16, imm16
-		 * 81 /i		op r/m32, imm32
-		 * REX.W + 81 /i	op r/m64, imm32 sign-extended to 64
+		 * 81 /4		and r/m16, imm16
+		 * 81 /4		and r/m32, imm32
+		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
 		 *
+		 * 83 /4		and r/m16, imm8 sign-extended to 16
+		 * 83 /4		and r/m32, imm8 sign-extended to 32
+		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
 		 */
 
 		/* get the first operand */
@@ -838,26 +933,11 @@ emulate_and(void *vm, int vcpuid, uint64
 			break;
 
                 /*
-                 * perform the operation with the pre-fetched immediate
-                 * operand and write the result
-                 */
-		switch (vie->reg & 7) {
-		case 0x4:
-			/* modrm:reg == b100, AND */
-			result = val1 & vie->immediate;
-			break;
-		case 0x1:
-			/* modrm:reg == b001, OR */
-			result = val1 | vie->immediate;
-			break;
-		default:
-			error = EINVAL;
-			break;
-		}
-		if (error)
-			break;
-
-		error = memwrite(vm, vcpuid, gpa, result, size, arg);
+		 * perform the operation with the pre-fetched immediate
+		 * operand and write the result
+		 */
+                result = val1 & vie->immediate;
+                error = memwrite(vm, vcpuid, gpa, result, size, arg);
 		break;
 	default:
 		break;
@@ -894,20 +974,20 @@ emulate_or(void *vm, int vcpuid, uint64_
 	error = EINVAL;
 
 	switch (vie->op.op_byte) {
+	case 0x81:
 	case 0x83:
 		/*
 		 * OR mem (ModRM:r/m) with immediate and store the
 		 * result in mem.
 		 *
-		 * 83 /1		OR r/m16, imm8 sign-extended to 16
-		 * 83 /1		OR r/m32, imm8 sign-extended to 32
-		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
+		 * 81 /1		or r/m16, imm16
+		 * 81 /1		or r/m32, imm32
+		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
 		 *
-		 * Currently, only the OR operation of the 0x83 opcode
-		 * is implemented (ModRM:reg = b001).
+		 * 83 /1		or r/m16, imm8 sign-extended to 16
+		 * 83 /1		or r/m32, imm8 sign-extended to 32
+		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
 		 */
-		if ((vie->reg & 7) != 1)
-			break;
 
 		/* get the first operand */
                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
@@ -978,11 +1058,37 @@ emulate_cmp(void *vm, int vcpuid, uint64
 		if (error)
 			return (error);
 
+		rflags2 = getcc(size, op1, op2);
+		break;
+	case 0x81:
+	case 0x83:
+		/*
+		 * 81 /7		cmp r/m16, imm16
+		 * 81 /7		cmp r/m32, imm32
+		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
+		 *
+		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
+		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
+		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
+		 *
+		 * Compare mem (ModRM:r/m) with immediate and set
+		 * status flags according to the results.  The
+		 * comparison is performed by subtracting the
+		 * immediate from the first operand and then setting
+		 * the status flags.
+		 *
+		 */
+
+		/* get the first operand */
+                error = memread(vm, vcpuid, gpa, &op1, size, arg);
+		if (error)
+			return (error);
+
+		rflags2 = getcc(size, op1, vie->immediate);
 		break;
 	default:
 		return (EINVAL);
 	}
-	rflags2 = getcc(size, op1, op2);
 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 	if (error)
 		return (error);
@@ -1201,6 +1307,34 @@ emulate_pop(void *vm, int vcpuid, uint64
 	return (error);
 }
 
+static int
+emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+    struct vm_guest_paging *paging, mem_region_read_t memread,
+    mem_region_write_t memwrite, void *memarg)
+{
+	int error;
+
+	switch (vie->reg & 7) {
+	case 0x1:	/* OR */
+		error = emulate_or(vm, vcpuid, gpa, vie,
+		    memread, memwrite, memarg);
+		break;
+	case 0x4:	/* AND */
+		error = emulate_and(vm, vcpuid, gpa, vie,
+		    memread, memwrite, memarg);
+		break;
+	case 0x7:	/* CMP */
+		error = emulate_cmp(vm, vcpuid, gpa, vie,
+		    memread, memwrite, memarg);
+		break;
+	default:
+		error = EINVAL;
+		break;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list