svn commit: r269700 - in head: sys/amd64/vmm usr.sbin/bhyve

Neel Natu neel at FreeBSD.org
Fri Aug 8 03:49:03 UTC 2014


Author: neel
Date: Fri Aug  8 03:49:01 2014
New Revision: 269700
URL: http://svnweb.freebsd.org/changeset/base/269700

Log:
  Support PCI extended config space in bhyve.
  
  Add the ACPI MCFG table to advertise the extended config memory window.
  
  Introduce a new flag MEM_F_IMMUTABLE for memory ranges that cannot be deleted
  or moved in the guest's address space. The PCI extended config space is an
  example of an immutable memory range.
  
  Add emulation for the "movzw" instruction. This instruction is used by FreeBSD
  to read a 16-bit extended config space register.
  
  CR:		https://phabric.freebsd.org/D505
  Reviewed by:	jhb, grehan
  Requested by:	tychon

Modified:
  head/sys/amd64/vmm/vmm_instruction_emul.c
  head/usr.sbin/bhyve/acpi.c
  head/usr.sbin/bhyve/mem.c
  head/usr.sbin/bhyve/mem.h
  head/usr.sbin/bhyve/pci_emul.c
  head/usr.sbin/bhyve/pci_emul.h

Modified: head/sys/amd64/vmm/vmm_instruction_emul.c
==============================================================================
--- head/sys/amd64/vmm/vmm_instruction_emul.c	Fri Aug  8 01:57:15 2014	(r269699)
+++ head/sys/amd64/vmm/vmm_instruction_emul.c	Fri Aug  8 03:49:01 2014	(r269700)
@@ -82,6 +82,10 @@ static const struct vie_op two_byte_opco
 		.op_byte = 0xB6,
 		.op_type = VIE_OP_TYPE_MOVZX,
 	},
+	[0xB7] = {
+		.op_byte = 0xB7,
+		.op_type = VIE_OP_TYPE_MOVZX,
+	},
 	[0xBE] = {
 		.op_byte = 0xBE,
 		.op_type = VIE_OP_TYPE_MOVSX,
@@ -505,6 +509,25 @@ emulate_movx(void *vm, int vcpuid, uint6
 		/* write the result */
 		error = vie_update_register(vm, vcpuid, reg, val, size);
 		break;
+	case 0xB7:
+		/*
+		 * MOV and zero extend word from mem (ModRM:r/m) to
+		 * reg (ModRM:reg).
+		 *
+		 * 0F B7/r		movzx r32, r/m16
+		 * REX.W + 0F B7/r	movzx r64, r/m16
+		 */
+		error = memread(vm, vcpuid, gpa, &val, 2, arg);
+		if (error)
+			return (error);
+
+		reg = gpr_map[vie->reg];
+
+		/* zero-extend word */
+		val = (uint16_t)val;
+
+		error = vie_update_register(vm, vcpuid, reg, val, size);
+		break;
 	case 0xBE:
 		/*
 		 * MOV and sign extend byte from mem (ModRM:r/m) to

Modified: head/usr.sbin/bhyve/acpi.c
==============================================================================
--- head/usr.sbin/bhyve/acpi.c	Fri Aug  8 01:57:15 2014	(r269699)
+++ head/usr.sbin/bhyve/acpi.c	Fri Aug  8 03:49:01 2014	(r269700)
@@ -40,12 +40,13 @@
  *  Layout
  *  ------
  *   RSDP  ->   0xf2400    (36 bytes fixed)
- *     RSDT  ->   0xf2440    (36 bytes + 4*N table addrs, 2 used)
- *     XSDT  ->   0xf2480    (36 bytes + 8*N table addrs, 2 used)
+ *     RSDT  ->   0xf2440    (36 bytes + 4*7 table addrs, 4 used)
+ *     XSDT  ->   0xf2480    (36 bytes + 8*7 table addrs, 4 used)
  *       MADT  ->   0xf2500  (depends on #CPUs)
  *       FADT  ->   0xf2600  (268 bytes)
  *       HPET  ->   0xf2740  (56 bytes)
- *         FACS  ->   0xf2780 (64 bytes)
+ *       MCFG  ->   0xf2780  (60 bytes)
+ *         FACS  ->   0xf27C0 (64 bytes)
  *         DSDT  ->   0xf2800 (variable - can go up to 0x100000)
  */
 
@@ -80,7 +81,8 @@ __FBSDID("$FreeBSD$");
 #define MADT_OFFSET		0x100
 #define FADT_OFFSET		0x200
 #define	HPET_OFFSET		0x340
-#define FACS_OFFSET		0x380
+#define	MCFG_OFFSET		0x380
+#define FACS_OFFSET		0x3C0
 #define DSDT_OFFSET		0x400
 
 #define	BHYVE_ASL_TEMPLATE	"bhyve.XXXXXXX"
@@ -178,6 +180,8 @@ basl_fwrite_rsdt(FILE *fp)
 	    basl_acpi_base + FADT_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n",
 	    basl_acpi_base + HPET_OFFSET);
+	EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n",
+	    basl_acpi_base + MCFG_OFFSET);
 
 	EFFLUSH(fp);
 
@@ -216,6 +220,8 @@ basl_fwrite_xsdt(FILE *fp)
 	    basl_acpi_base + FADT_OFFSET);
 	EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n",
 	    basl_acpi_base + HPET_OFFSET);
+	EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n",
+	    basl_acpi_base + MCFG_OFFSET);
 
 	EFFLUSH(fp);
 
@@ -583,6 +589,39 @@ err_exit:
 }
 
 static int
+basl_fwrite_mcfg(FILE *fp)
+{
+	int err = 0;
+
+	EFPRINTF(fp, "/*\n");
+	EFPRINTF(fp, " * bhyve MCFG template\n");
+	EFPRINTF(fp, " */\n");
+	EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n");
+	EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n");
+	EFPRINTF(fp, "[0001]\t\tRevision : 01\n");
+	EFPRINTF(fp, "[0001]\t\tChecksum : 00\n");
+	EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n");
+	EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG  \"\n");
+	EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n");
+
+	/* iasl will fill in the compiler ID/revision fields */
+	EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n");
+	EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n");
+	EFPRINTF(fp, "[0008]\t\tReserved : 0\n");
+	EFPRINTF(fp, "\n");
+
+	EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base());
+	EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n");
+	EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n");
+	EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n");
+	EFPRINTF(fp, "[0004]\t\tReserved : 0\n");
+	EFFLUSH(fp);
+	return (0);
+err_exit:
+	return (errno);
+}
+
+static int
 basl_fwrite_facs(FILE *fp)
 {
 	int err;
@@ -921,6 +960,7 @@ static struct {
 	{ basl_fwrite_madt, MADT_OFFSET },
 	{ basl_fwrite_fadt, FADT_OFFSET },
 	{ basl_fwrite_hpet, HPET_OFFSET },
+	{ basl_fwrite_mcfg, MCFG_OFFSET },
 	{ basl_fwrite_facs, FACS_OFFSET },
 	{ basl_fwrite_dsdt, DSDT_OFFSET },
 	{ NULL }

Modified: head/usr.sbin/bhyve/mem.c
==============================================================================
--- head/usr.sbin/bhyve/mem.c	Fri Aug  8 01:57:15 2014	(r269699)
+++ head/usr.sbin/bhyve/mem.c	Fri Aug  8 03:49:01 2014	(r269700)
@@ -162,7 +162,7 @@ emulate_mem(struct vmctx *ctx, int vcpu,
 
 {
 	struct mmio_rb_range *entry;
-	int err;
+	int err, immutable;
 	
 	pthread_rwlock_rdlock(&mmio_rwlock);
 	/*
@@ -186,9 +186,27 @@ emulate_mem(struct vmctx *ctx, int vcpu,
 	}
 
 	assert(entry != NULL);
+
+	/*
+	 * An 'immutable' memory range is guaranteed never to be removed
+	 * so there is no need to hold 'mmio_rwlock' while calling the
+	 * handler.
+	 *
+	 * XXX writes to the PCIR_COMMAND register can cause register_mem()
+	 * to be called. If the guest is using PCI extended config space
+	 * to modify the PCIR_COMMAND register then register_mem() can
+	 * deadlock on 'mmio_rwlock'. However by registering the extended
+	 * config space window as 'immutable' the deadlock can be avoided.
+	 */
+	immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
+	if (immutable)
+		pthread_rwlock_unlock(&mmio_rwlock);
+
 	err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
 				      mem_read, mem_write, &entry->mr_param);
-	pthread_rwlock_unlock(&mmio_rwlock);
+
+	if (!immutable)
+		pthread_rwlock_unlock(&mmio_rwlock);
 
 	return (err);
 }
@@ -246,6 +264,7 @@ unregister_mem(struct mem_range *memp)
 		mr = &entry->mr_param;
 		assert(mr->name == memp->name);
 		assert(mr->base == memp->base && mr->size == memp->size); 
+		assert((mr->flags & MEM_F_IMMUTABLE) == 0);
 		RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
 
 		/* flush Per-vCPU cache */	

Modified: head/usr.sbin/bhyve/mem.h
==============================================================================
--- head/usr.sbin/bhyve/mem.h	Fri Aug  8 01:57:15 2014	(r269699)
+++ head/usr.sbin/bhyve/mem.h	Fri Aug  8 03:49:01 2014	(r269700)
@@ -48,6 +48,7 @@ struct mem_range {
 #define	MEM_F_READ		0x1
 #define	MEM_F_WRITE		0x2
 #define	MEM_F_RW		0x3
+#define	MEM_F_IMMUTABLE		0x4	/* mem_range cannot be unregistered */
 
 void	init_mem(void);
 int     emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,

Modified: head/usr.sbin/bhyve/pci_emul.c
==============================================================================
--- head/usr.sbin/bhyve/pci_emul.c	Fri Aug  8 01:57:15 2014	(r269699)
+++ head/usr.sbin/bhyve/pci_emul.c	Fri Aug  8 03:49:01 2014	(r269700)
@@ -109,16 +109,20 @@ static uint64_t pci_emul_membase64;
 #define	PCI_EMUL_IOBASE		0x2000
 #define	PCI_EMUL_IOLIMIT	0x10000
 
-#define	PCI_EMUL_MEMLIMIT32	0xE0000000	/* 3.5GB */
+#define	PCI_EMUL_ECFG_BASE	0xE0000000		    /* 3.5GB */
+#define	PCI_EMUL_ECFG_SIZE	(MAXBUSES * 1024 * 1024)    /* 1MB per bus */
+SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
+
+#define	PCI_EMUL_MEMLIMIT32	PCI_EMUL_ECFG_BASE
 
 #define	PCI_EMUL_MEMBASE64	0xD000000000UL
 #define	PCI_EMUL_MEMLIMIT64	0xFD00000000UL
 
 static struct pci_devemu *pci_emul_finddev(char *name);
-static void	pci_lintr_route(struct pci_devinst *pi);
-static void	pci_lintr_update(struct pci_devinst *pi);
-
-static struct mem_range pci_mem_hole;
+static void pci_lintr_route(struct pci_devinst *pi);
+static void pci_lintr_update(struct pci_devinst *pi);
+static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot,
+    int func, int coff, int bytes, uint32_t *val);
 
 /*
  * I/O access
@@ -1023,12 +1027,37 @@ pci_emul_fallback_handler(struct vmctx *
 	return (0);
 }
 
+static int
+pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+    int bytes, uint64_t *val, void *arg1, long arg2)
+{
+	int bus, slot, func, coff, in;
+
+	coff = addr & 0xfff;
+	func = (addr >> 12) & 0x7;
+	slot = (addr >> 15) & 0x1f;
+	bus = (addr >> 20) & 0xff;
+	in = (dir == MEM_F_READ);
+	if (in)
+		*val = ~0UL;
+	pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val);
+	return (0);
+}
+
+uint64_t
+pci_ecfg_base(void)
+{
+
+	return (PCI_EMUL_ECFG_BASE);
+}
+
 #define	BUSIO_ROUNDUP		32
 #define	BUSMEM_ROUNDUP		(1024 * 1024)
 
 int
 init_pci(struct vmctx *ctx)
 {
+	struct mem_range mr;
 	struct pci_devemu *pde;
 	struct businfo *bi;
 	struct slotinfo *si;
@@ -1112,22 +1141,34 @@ init_pci(struct vmctx *ctx)
 	 * The guest physical memory map looks like the following:
 	 * [0,		    lowmem)		guest system memory
 	 * [lowmem,	    lowmem_limit)	memory hole (may be absent)
-	 * [lowmem_limit,   4GB)		PCI hole (32-bit BAR allocation)
+	 * [lowmem_limit,   0xE0000000)		PCI hole (32-bit BAR allocation)
+	 * [0xE0000000,	    0xF0000000)		PCI extended config window
+	 * [0xF0000000,	    4GB)		LAPIC, IOAPIC, HPET, firmware
 	 * [4GB,	    4GB + highmem)
-	 *
+	 */
+
+	/*
 	 * Accesses to memory addresses that are not allocated to system
 	 * memory or PCI devices return 0xff's.
 	 */
 	lowmem = vm_get_lowmem_size(ctx);
+	bzero(&mr, sizeof(struct mem_range));
+	mr.name = "PCI hole";
+	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+	mr.base = lowmem;
+	mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
+	mr.handler = pci_emul_fallback_handler;
+	error = register_mem_fallback(&mr);
+	assert(error == 0);
 
-	memset(&pci_mem_hole, 0, sizeof(struct mem_range));
-	pci_mem_hole.name = "PCI hole";
-	pci_mem_hole.flags = MEM_F_RW;
-	pci_mem_hole.base = lowmem;
-	pci_mem_hole.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
-	pci_mem_hole.handler = pci_emul_fallback_handler;
-
-	error = register_mem_fallback(&pci_mem_hole);
+	/* PCI extended config space */
+	bzero(&mr, sizeof(struct mem_range));
+	mr.name = "PCI ECFG";
+	mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+	mr.base = PCI_EMUL_ECFG_BASE;
+	mr.size = PCI_EMUL_ECFG_SIZE;
+	mr.handler = pci_emul_ecfg_handler;
+	error = register_mem(&mr);
 	assert(error == 0);
 
 	return (0);
@@ -1612,41 +1653,6 @@ pci_emul_hdrtype_fixup(int bus, int slot
 	}
 }
 
-static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
-
-static int
-pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
-		 uint32_t *eax, void *arg)
-{
-	uint32_t x;
-
-	if (bytes != 4) {
-		if (in)
-			*eax = (bytes == 2) ? 0xffff : 0xff;
-		return (0);
-	}
-
-	if (in) {
-		x = (cfgbus << 16) |
-		    (cfgslot << 11) |
-		    (cfgfunc << 8) |
-		    cfgoff;
-                if (cfgenable)
-			x |= CONF1_ENABLE;	       
-		*eax = x;
-	} else {
-		x = *eax;
-		cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
-		cfgoff = x & PCI_REGMAX;
-		cfgfunc = (x >> 8) & PCI_FUNCMAX;
-		cfgslot = (x >> 11) & PCI_SLOTMAX;
-		cfgbus = (x >> 16) & PCI_BUSMAX;
-	}
-
-	return (0);
-}
-INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
-
 static uint32_t
 bits_changed(uint32_t old, uint32_t new, uint32_t mask)
 {
@@ -1709,41 +1715,51 @@ pci_emul_cmdwrite(struct pci_devinst *pi
 	pci_lintr_update(pi);
 }	
 
-static int
-pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
-		 uint32_t *eax, void *arg)
+static void
+pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
+    int coff, int bytes, uint32_t *eax)
 {
 	struct businfo *bi;
 	struct slotinfo *si;
 	struct pci_devinst *pi;
 	struct pci_devemu *pe;
-	int coff, idx, needcfg;
+	int idx, needcfg;
 	uint64_t addr, bar, mask;
 
-	assert(bytes == 1 || bytes == 2 || bytes == 4);
-
-	if ((bi = pci_businfo[cfgbus]) != NULL) {
-		si = &bi->slotinfo[cfgslot];
-		pi = si->si_funcs[cfgfunc].fi_devi;
+	if ((bi = pci_businfo[bus]) != NULL) {
+		si = &bi->slotinfo[slot];
+		pi = si->si_funcs[func].fi_devi;
 	} else
 		pi = NULL;
 
-	coff = cfgoff + (port - CONF1_DATA_PORT);
-
-#if 0
-	printf("pcicfg-%s from 0x%0x of %d bytes (%d/%d/%d)\n\r",
-		in ? "read" : "write", coff, bytes, cfgbus, cfgslot, cfgfunc);
-#endif
-
 	/*
-	 * Just return if there is no device at this cfgslot:cfgfunc,
-	 * if the guest is doing an un-aligned access, or if the config
-	 * address word isn't enabled.
+	 * Just return if there is no device at this slot:func or if the
+	 * guest is doing an un-aligned access.
 	 */
-	if (!cfgenable || pi == NULL || (coff & (bytes - 1)) != 0) {
+	if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
+	    (coff & (bytes - 1)) != 0) {
 		if (in)
 			*eax = 0xffffffff;
-		return (0);
+		return;
+	}
+
+	/*
+	 * Ignore all writes beyond the standard config space and return all
+	 * ones on reads.
+	 */
+	if (coff >= PCI_REGMAX + 1) {
+		if (in) {
+			*eax = 0xffffffff;
+			/*
+			 * Extended capabilities begin at offset 256 in config
+			 * space. Absence of extended capabilities is signaled
+			 * with all 0s in the extended capability header at
+			 * offset 256.
+			 */
+			if (coff <= PCI_REGMAX + 4)
+				*eax = 0x00000000;
+		}
+		return;
 	}
 
 	pe = pi->pi_d;
@@ -1754,8 +1770,8 @@ pci_emul_cfgdata(struct vmctx *ctx, int 
 	if (in) {
 		/* Let the device emulation override the default handler */
 		if (pe->pe_cfgread != NULL) {
-			needcfg = pe->pe_cfgread(ctx, vcpu, pi,
-						    coff, bytes, eax);
+			needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
+			    eax);
 		} else {
 			needcfg = 1;
 		}
@@ -1769,12 +1785,12 @@ pci_emul_cfgdata(struct vmctx *ctx, int 
 				*eax = pci_get_cfgdata32(pi, coff);
 		}
 
-		pci_emul_hdrtype_fixup(cfgbus, cfgslot, coff, bytes, eax);
+		pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
 	} else {
 		/* Let the device emulation override the default handler */
 		if (pe->pe_cfgwrite != NULL &&
 		    (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
-			return (0);
+			return;
 
 		/*
 		 * Special handling for write to BAR registers
@@ -1785,7 +1801,7 @@ pci_emul_cfgdata(struct vmctx *ctx, int 
 			 * 4-byte aligned.
 			 */
 			if (bytes != 4 || (coff & 0x3) != 0)
-				return (0);
+				return;
 			idx = (coff - PCIR_BAR(0)) / 4;
 			mask = ~(pi->pi_bar[idx].size - 1);
 			switch (pi->pi_bar[idx].type) {
@@ -1843,7 +1859,57 @@ pci_emul_cfgdata(struct vmctx *ctx, int 
 			CFGWRITE(pi, coff, *eax, bytes);
 		}
 	}
+}
+
+static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
+
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+		 uint32_t *eax, void *arg)
+{
+	uint32_t x;
 
+	if (bytes != 4) {
+		if (in)
+			*eax = (bytes == 2) ? 0xffff : 0xff;
+		return (0);
+	}
+
+	if (in) {
+		x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
+		if (cfgenable)
+			x |= CONF1_ENABLE;
+		*eax = x;
+	} else {
+		x = *eax;
+		cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
+		cfgoff = x & PCI_REGMAX;
+		cfgfunc = (x >> 8) & PCI_FUNCMAX;
+		cfgslot = (x >> 11) & PCI_SLOTMAX;
+		cfgbus = (x >> 16) & PCI_BUSMAX;
+	}
+
+	return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
+
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+		 uint32_t *eax, void *arg)
+{
+	int coff;
+
+	assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+	coff = cfgoff + (port - CONF1_DATA_PORT);
+	if (cfgenable) {
+		pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes,
+		    eax);
+	} else {
+		/* Ignore accesses to cfgdata if not enabled by cfgaddr */
+		if (in)
+			*eax = 0xffffffff;
+	}
 	return (0);
 }
 

Modified: head/usr.sbin/bhyve/pci_emul.h
==============================================================================
--- head/usr.sbin/bhyve/pci_emul.h	Fri Aug  8 01:57:15 2014	(r269699)
+++ head/usr.sbin/bhyve/pci_emul.h	Fri Aug  8 03:49:01 2014	(r269700)
@@ -235,6 +235,7 @@ uint64_t pci_emul_msix_tread(struct pci_
 int	pci_count_lintr(int bus);
 void	pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
 void	pci_write_dsdt(void);
+uint64_t pci_ecfg_base(void);
 int	pci_bus_configured(int bus);
 
 static __inline void 


More information about the svn-src-all mailing list