git: 346876412a3b - stable/13 - bhyve: add ROM emulation

From: Emmanuel Vadot <manu_at_FreeBSD.org>
Date: Mon, 16 May 2022 12:50:12 UTC
The branch stable/13 has been updated by manu:

URL: https://cgit.FreeBSD.org/src/commit/?id=346876412a3b833757db3506037d4c7045724dee

commit 346876412a3b833757db3506037d4c7045724dee
Author:     Corvin Köhne <CorvinK@beckhoff.com>
AuthorDate: 2022-03-10 10:28:06 +0000
Commit:     Emmanuel Vadot <manu@FreeBSD.org>
CommitDate: 2022-05-16 12:41:14 +0000

    bhyve: add ROM emulation
    
    Some PCI devices especially GPUs require a ROM to work properly.
    The ROM is executed by boot firmware to initialize the device.
    To add a ROM to a device use the new ROM option for passthru device
    (e.g. -s passthru,0/2/0,rom=<path>/<to>/<rom>).
    
    It's necessary that the ROM is executed by the boot firmware.
    It won't be executed by any OS.
    Additionally, the boot firmware should be configured to execute the
    ROM file.
    For that reason, it's only possible to use a ROM when using
    OVMF with enabled bus enumeration.
    
    Differential Revision:  https://reviews.freebsd.org/D33129
    Sponsored by:   Beckhoff Automation GmbH & Co. KG
    MFC after:      1 month
    
    (cherry picked from commit e47fe3183e1f2d531230ffc627ec165ea23e72cb)
---
 lib/libvmmapi/vmmapi.h        |   1 +
 sys/amd64/vmm/vmm.c           |   2 +-
 usr.sbin/bhyve/bhyve.8        |   5 ++
 usr.sbin/bhyve/bhyve_config.5 |   2 +
 usr.sbin/bhyve/pci_emul.c     | 117 ++++++++++++++++++++++++++++++++++++++---
 usr.sbin/bhyve/pci_emul.h     |  11 +++-
 usr.sbin/bhyve/pci_passthru.c | 118 ++++++++++++++++++++++++++++++++++++++----
 7 files changed, 236 insertions(+), 20 deletions(-)

diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 55ca7395d63b..d2ee9cae127b 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -73,6 +73,7 @@ enum {
 	VM_SYSMEM,
 	VM_BOOTROM,
 	VM_FRAMEBUFFER,
+	VM_PCIROM,
 };
 
 /*
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 5a5744a3fb16..3d6fca939498 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -134,7 +134,7 @@ struct mem_seg {
 	bool	sysmem;
 	struct vm_object *object;
 };
-#define	VM_MAX_MEMSEGS	3
+#define	VM_MAX_MEMSEGS	4
 
 struct mem_map {
 	vm_paddr_t	gpa;
diff --git a/usr.sbin/bhyve/bhyve.8 b/usr.sbin/bhyve/bhyve.8
index 4f5b544cfc65..f0c61228cf77 100644
--- a/usr.sbin/bhyve/bhyve.8
+++ b/usr.sbin/bhyve/bhyve.8
@@ -500,6 +500,11 @@ Connect to a PCI device on the host at the selector described by
 and
 .Ar function
 numbers.
+.It Li rom= Ns Ar romfile
+Add
+.Ar romfile
+as option ROM to the PCI device.
+The ROM will be loaded by firmware and should be capable of initializing the device.
 .El
 .Pp
 Guest memory must be wired using the
diff --git a/usr.sbin/bhyve/bhyve_config.5 b/usr.sbin/bhyve/bhyve_config.5
index a9cea3849c01..77d0bbaedaca 100644
--- a/usr.sbin/bhyve/bhyve_config.5
+++ b/usr.sbin/bhyve/bhyve_config.5
@@ -500,6 +500,8 @@ Host PCI bus address of device to pass through.
 Host PCI slot address of device to pass through.
 .It Va func Ta integer Ta Ta
 Host PCI function address of device to pass through.
+.It Va rom Ta path Ta Ta
+ROM file of the device which will be executed by OVMF to init the device.
 .El
 .Ss VirtIO 9p Settings
 Each VirtIO 9p device exposes a single filesystem from a host path.
diff --git a/usr.sbin/bhyve/pci_emul.c b/usr.sbin/bhyve/pci_emul.c
index e9c88922b0ef..7b62b09b53f7 100644
--- a/usr.sbin/bhyve/pci_emul.c
+++ b/usr.sbin/bhyve/pci_emul.c
@@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/linker_set.h>
+#include <sys/mman.h>
 
 #include <ctype.h>
 #include <err.h>
@@ -101,6 +102,9 @@ static struct businfo *pci_businfo[MAXBUSES];
 SET_DECLARE(pci_devemu_set, struct pci_devemu);
 
 static uint64_t pci_emul_iobase;
+static uint8_t *pci_emul_rombase;
+static uint64_t pci_emul_romoffset;
+static uint8_t *pci_emul_romlim;
 static uint64_t pci_emul_membase32;
 static uint64_t pci_emul_membase64;
 static uint64_t pci_emul_memlim64;
@@ -118,6 +122,8 @@ TAILQ_HEAD(pci_bar_list, pci_bar_allocation) pci_bars = TAILQ_HEAD_INITIALIZER(
 #define	PCI_EMUL_IOBASE		0x2000
 #define	PCI_EMUL_IOLIMIT	0x10000
 
+#define PCI_EMUL_ROMSIZE 0x10000000
+
 #define	PCI_EMUL_ECFG_BASE	0xE0000000		    /* 3.5GB */
 #define	PCI_EMUL_ECFG_SIZE	(MAXBUSES * 1024 * 1024)    /* 1MB per bus */
 SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
@@ -562,6 +568,12 @@ modify_bar_registration(struct pci_devinst *pi, int idx, int registration)
 			(*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration,
 					  pi->pi_bar[idx].addr);
 		break;
+	case PCIBAR_ROM:
+		error = 0;
+		if (pe->pe_baraddr != NULL)
+			(*pe->pe_baraddr)(pi->pi_vmctx, pi, idx, registration,
+			    pi->pi_bar[idx].addr);
+		break;
 	default:
 		error = EINVAL;
 		break;
@@ -583,6 +595,14 @@ register_bar(struct pci_devinst *pi, int idx)
 	modify_bar_registration(pi, idx, 1);
 }
 
+/* Is the ROM enabled for the emulated pci device? */
+static int
+romen(struct pci_devinst *pi)
+{
+	return (pi->pi_bar[PCI_ROM_IDX].lobits & PCIM_BIOS_ENABLE) ==
+	    PCIM_BIOS_ENABLE;
+}
+
 /* Are we decoding i/o port accesses for the emulated pci device? */
 static int
 porten(struct pci_devinst *pi)
@@ -649,7 +669,8 @@ int
 pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
     uint64_t size)
 {
-	assert(idx >= 0 && idx <= PCI_BARMAX);
+	assert((type == PCIBAR_ROM) || (idx >= 0 && idx <= PCI_BARMAX));
+	assert((type != PCIBAR_ROM) || (idx == PCI_ROM_IDX));
 
 	if ((size & (size - 1)) != 0)
 		size = 1UL << flsl(size);	/* round up to a power of 2 */
@@ -658,6 +679,9 @@ pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
 	if (type == PCIBAR_IO) {
 		if (size < 4)
 			size = 4;
+	} else if (type == PCIBAR_ROM) {
+		if (size < ~PCIM_BIOS_ADDR_MASK + 1)
+			size = ~PCIM_BIOS_ADDR_MASK + 1;
 	} else {
 		if (size < 16)
 			size = 16;
@@ -773,6 +797,13 @@ pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx,
 		mask = PCIM_BAR_MEM_BASE;
 		lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
 		break;
+	case PCIBAR_ROM:
+		/* do not claim memory for ROM. OVMF will do it for us. */
+		baseptr = NULL;
+		limit = 0;
+		mask = PCIM_BIOS_ADDR_MASK;
+		lobits = 0;
+		break;
 	default:
 		printf("pci_emul_alloc_base: invalid bar type %d\n", type);
 		assert(0);
@@ -807,7 +838,57 @@ pci_emul_assign_bar(struct pci_devinst *const pdi, const int idx,
 		pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
 	}
 
-	register_bar(pdi, idx);
+	if (type != PCIBAR_ROM) {
+		register_bar(pdi, idx);
+	}
+
+	return (0);
+}
+
+int
+pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size,
+    void **const addr)
+{
+	/* allocate ROM space once on first call */
+	if (pci_emul_rombase == 0) {
+		pci_emul_rombase = vm_create_devmem(pdi->pi_vmctx, VM_PCIROM,
+		    "pcirom", PCI_EMUL_ROMSIZE);
+		if (pci_emul_rombase == MAP_FAILED) {
+			warnx("%s: failed to create rom segment", __func__);
+			return (-1);
+		}
+		pci_emul_romlim = pci_emul_rombase + PCI_EMUL_ROMSIZE;
+		pci_emul_romoffset = 0;
+	}
+
+	/* ROM size should be a power of 2 and greater than 2 KB */
+	const uint64_t rom_size = MAX(1UL << flsl(size),
+	    ~PCIM_BIOS_ADDR_MASK + 1);
+
+	/* check if ROM fits into ROM space */
+	if (pci_emul_romoffset + rom_size > PCI_EMUL_ROMSIZE) {
+		warnx("%s: no space left in rom segment:", __func__);
+		warnx("%16lu bytes left",
+		    PCI_EMUL_ROMSIZE - pci_emul_romoffset);
+		warnx("%16lu bytes required by %d/%d/%d", rom_size, pdi->pi_bus,
+		    pdi->pi_slot, pdi->pi_func);
+		return (-1);
+	}
+
+	/* allocate ROM BAR */
+	const int error = pci_emul_alloc_bar(pdi, PCI_ROM_IDX, PCIBAR_ROM,
+	    rom_size);
+	if (error)
+		return error;
+
+	/* return address */
+	*addr = pci_emul_rombase + pci_emul_romoffset;
+
+	/* save offset into ROM Space */
+	pdi->pi_romoffset = pci_emul_romoffset;
+
+	/* increase offset for next ROM */
+	pci_emul_romoffset += rom_size;
 
 	return (0);
 }
@@ -1885,7 +1966,7 @@ pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old)
 	 * If the MMIO or I/O address space decoding has changed then
 	 * register/unregister all BARs that decode that address space.
 	 */
-	for (i = 0; i <= PCI_BARMAX; i++) {
+	for (i = 0; i <= PCI_BARMAX_WITH_ROM; i++) {
 		switch (pi->pi_bar[i].type) {
 			case PCIBAR_NONE:
 			case PCIBAR_MEMHI64:
@@ -1899,6 +1980,11 @@ pci_emul_cmd_changed(struct pci_devinst *pi, uint16_t old)
 						unregister_bar(pi, i);
 				}
 				break;
+			case PCIBAR_ROM:
+				/* skip (un-)register of ROM if it disabled */
+				if (!romen(pi))
+					break;
+				/* fallthrough */
 			case PCIBAR_MEM32:
 			case PCIBAR_MEM64:
 				/* MMIO address space decoding changed? */
@@ -2019,16 +2105,21 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
 			return;
 
 		/*
-		 * Special handling for write to BAR registers
+		 * Special handling for write to BAR and ROM registers
 		 */
-		if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+		if ((coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) ||
+		    (coff >= PCIR_BIOS && coff < PCIR_BIOS + 4)) {
 			/*
 			 * Ignore writes to BAR registers that are not
 			 * 4-byte aligned.
 			 */
 			if (bytes != 4 || (coff & 0x3) != 0)
 				return;
-			idx = (coff - PCIR_BAR(0)) / 4;
+			if (coff != PCIR_BIOS) {
+				idx = (coff - PCIR_BAR(0)) / 4;
+			} else {
+				idx = PCI_ROM_IDX;
+			}
 			mask = ~(pi->pi_bar[idx].size - 1);
 			switch (pi->pi_bar[idx].type) {
 			case PCIBAR_NONE:
@@ -2071,6 +2162,20 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
 							   PCIBAR_MEMHI64);
 				}
 				break;
+			case PCIBAR_ROM:
+				addr = bar = *eax & mask;
+				if (memen(pi) && romen(pi)) {
+					unregister_bar(pi, idx);
+				}
+				pi->pi_bar[idx].addr = addr;
+				pi->pi_bar[idx].lobits = *eax &
+				    PCIM_BIOS_ENABLE;
+				/* romen could have changed it value */
+				if (memen(pi) && romen(pi)) {
+					register_bar(pi, idx);
+				}
+				bar |= pi->pi_bar[idx].lobits;
+				break;
 			default:
 				assert(0);
 			}
diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h
index 6eac0720f09f..c10c4af4698b 100644
--- a/usr.sbin/bhyve/pci_emul.h
+++ b/usr.sbin/bhyve/pci_emul.h
@@ -42,6 +42,8 @@
 #include <assert.h>
 
 #define	PCI_BARMAX	PCIR_MAX_BAR_0	/* BAR registers in a Type 0 header */
+#define PCI_BARMAX_WITH_ROM (PCI_BARMAX + 1)
+#define PCI_ROM_IDX (PCI_BARMAX + 1)
 
 struct vmctx;
 struct pci_devinst;
@@ -92,7 +94,8 @@ enum pcibar_type {
 	PCIBAR_IO,
 	PCIBAR_MEM32,
 	PCIBAR_MEM64,
-	PCIBAR_MEMHI64
+	PCIBAR_MEMHI64,
+	PCIBAR_ROM,
 };
 
 struct pcibar {
@@ -165,7 +168,9 @@ struct pci_devinst {
 	void      *pi_arg;		/* devemu-private data */
 
 	u_char	  pi_cfgdata[PCI_REGMAX + 1];
-	struct pcibar pi_bar[PCI_BARMAX + 1];
+	/* ROM is handled like a BAR */
+	struct pcibar pi_bar[PCI_BARMAX_WITH_ROM + 1];
+	uint64_t pi_romoffset;
 };
 
 struct msicap {
@@ -229,6 +234,8 @@ int	init_pci(struct vmctx *ctx);
 void	pci_callback(void);
 int	pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
 	    enum pcibar_type type, uint64_t size);
+int 	pci_emul_alloc_rom(struct pci_devinst *const pdi, const uint64_t size,
+    	    void **const addr);
 int	pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
 int	pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
 void	pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes,
diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c
index 3fe446f42eee..56f049a6d312 100644
--- a/usr.sbin/bhyve/pci_passthru.c
+++ b/usr.sbin/bhyve/pci_passthru.c
@@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mman.h>
 #include <sys/pciio.h>
 #include <sys/ioctl.h>
+#include <sys/stat.h>
 
 #include <dev/io/iodev.h>
 #include <dev/pci/pcireg.h>
@@ -80,7 +81,8 @@ static int pcifd = -1;
 
 struct passthru_softc {
 	struct pci_devinst *psc_pi;
-	struct pcibar psc_bar[PCI_BARMAX + 1];
+	/* ROM is handled like a BAR */
+	struct pcibar psc_bar[PCI_BARMAX_WITH_ROM + 1];
 	struct {
 		int		capoff;
 		int		msgctrl;
@@ -666,6 +668,58 @@ passthru_legacy_config(nvlist_t *nvl, const char *opts)
 	set_config_value_node(nvl, "slot", value);
 	snprintf(value, sizeof(value), "%d", func);
 	set_config_value_node(nvl, "func", value);
+
+	return (pci_parse_legacy_config(nvl, strchr(opts, ',')));
+}
+
+static int
+passthru_init_rom(struct vmctx *const ctx, struct passthru_softc *const sc,
+    const char *const romfile)
+{
+	if (romfile == NULL) {
+		return (0);
+	}
+
+	const int fd = open(romfile, O_RDONLY);
+	if (fd < 0) {
+		warnx("%s: can't open romfile \"%s\"", __func__, romfile);
+		return (-1);
+	}
+
+	struct stat sbuf;
+	if (fstat(fd, &sbuf) < 0) {
+		warnx("%s: can't fstat romfile \"%s\"", __func__, romfile);
+		close(fd);
+		return (-1);
+	}
+	const uint64_t rom_size = sbuf.st_size;
+
+	void *const rom_data = mmap(NULL, rom_size, PROT_READ, MAP_SHARED, fd,
+	    0);
+	if (rom_data == MAP_FAILED) {
+		warnx("%s: unable to mmap romfile \"%s\" (%d)", __func__,
+		    romfile, errno);
+		close(fd);
+		return (-1);
+	}
+
+	void *rom_addr;
+	int error = pci_emul_alloc_rom(sc->psc_pi, rom_size, &rom_addr);
+	if (error) {
+		warnx("%s: failed to alloc rom segment", __func__);
+		munmap(rom_data, rom_size);
+		close(fd);
+		return (error);
+	}
+	memcpy(rom_addr, rom_data, rom_size);
+
+	sc->psc_bar[PCI_ROM_IDX].type = PCIBAR_ROM;
+	sc->psc_bar[PCI_ROM_IDX].addr = (uint64_t)rom_addr;
+	sc->psc_bar[PCI_ROM_IDX].size = rom_size;
+
+	munmap(rom_data, rom_size);
+	close(fd);
+
 	return (0);
 }
 
@@ -714,7 +768,15 @@ passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
 	sc->psc_pi = pi;
 
 	/* initialize config space */
-	error = cfginit(ctx, pi, bus, slot, func);
+	if ((error = cfginit(ctx, pi, bus, slot, func)) != 0)
+		goto done;
+
+	/* initialize ROM */
+	if ((error = passthru_init_rom(ctx, sc,
+            get_config_value_node(nvl, "rom"))) != 0)
+		goto done;
+
+	error = 0;		/* success */
 done:
 	if (error) {
 		free(sc);
@@ -726,7 +788,8 @@ done:
 static int
 bar_access(int coff)
 {
-	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
+	if ((coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) ||
+	    coff == PCIR_BIOS)
 		return (1);
 	else
 		return (0);
@@ -1018,16 +1081,49 @@ passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
 }
 
 static void
-passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
-	      int enabled, uint64_t address)
+passthru_addr_rom(struct pci_devinst *const pi, const int idx,
+    const int enabled)
 {
+	const uint64_t addr = pi->pi_bar[idx].addr;
+	const uint64_t size = pi->pi_bar[idx].size;
 
-	if (pi->pi_bar[baridx].type == PCIBAR_IO)
-		return;
-	if (baridx == pci_msix_table_bar(pi))
-		passthru_msix_addr(ctx, pi, baridx, enabled, address);
-	else
-		passthru_mmio_addr(ctx, pi, baridx, enabled, address);
+	if (!enabled) {
+		if (vm_munmap_memseg(pi->pi_vmctx, addr, size) != 0) {
+			errx(4, "%s: munmap_memseg @ [%016lx - %016lx] failed",
+			    __func__, addr, addr + size);
+		}
+
+	} else {
+		if (vm_mmap_memseg(pi->pi_vmctx, addr, VM_PCIROM,
+			pi->pi_romoffset, size, PROT_READ | PROT_EXEC) != 0) {
+			errx(4, "%s: mnmap_memseg @ [%016lx - %016lx]  failed",
+			    __func__, addr, addr + size);
+		}
+	}
+}
+
+static void
+passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
+    int enabled, uint64_t address)
+{
+	switch (pi->pi_bar[baridx].type) {
+	case PCIBAR_IO:
+		/* IO BARs are emulated */
+		break;
+	case PCIBAR_ROM:
+		passthru_addr_rom(pi, baridx, enabled);
+		break;
+	case PCIBAR_MEM32:
+	case PCIBAR_MEM64:
+		if (baridx == pci_msix_table_bar(pi))
+			passthru_msix_addr(ctx, pi, baridx, enabled, address);
+		else
+			passthru_mmio_addr(ctx, pi, baridx, enabled, address);
+		break;
+	default:
+		errx(4, "%s: invalid BAR type %d", __func__,
+		    pi->pi_bar[baridx].type);
+	}
 }
 
 struct pci_devemu passthru = {