From nobody Sun Oct 31 14:02:15 2021 X-Original-To: dev-commits-src-branches@mlmmj.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mlmmj.nyi.freebsd.org (Postfix) with ESMTP id BAADE182677D; Sun, 31 Oct 2021 14:02:15 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "R3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 4HhyXR3mh2z4cjn; Sun, 31 Oct 2021 14:02:15 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org (gitrepo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:5]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 509B11F2AA; Sun, 31 Oct 2021 14:02:15 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org ([127.0.1.44]) by gitrepo.freebsd.org (8.16.1/8.16.1) with ESMTP id 19VE2FCk053755; Sun, 31 Oct 2021 14:02:15 GMT (envelope-from git@gitrepo.freebsd.org) Received: (from git@localhost) by gitrepo.freebsd.org (8.16.1/8.16.1/Submit) id 19VE2FXs053754; Sun, 31 Oct 2021 14:02:15 GMT (envelope-from git) Date: Sun, 31 Oct 2021 14:02:15 GMT Message-Id: <202110311402.19VE2FXs053754@gitrepo.freebsd.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org From: Mark Johnston Subject: git: 925211125cec - stable/13 - Revert "bhyve: Map the MSI-X table unconditionally for passthrough" List-Id: Commits to the stable branches of the FreeBSD src repository List-Archive: https://lists.freebsd.org/archives/dev-commits-src-branches List-Help: List-Post: List-Subscribe: List-Unsubscribe: Sender: owner-dev-commits-src-branches@freebsd.org X-BeenThere: dev-commits-src-branches@freebsd.org MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-Git-Committer: markj X-Git-Repository: src X-Git-Refname: refs/heads/stable/13 X-Git-Reftype: branch X-Git-Commit: 925211125cec3481cb0fc14444868ab51d904222 Auto-Submitted: auto-generated X-ThisMailContainsUnwantedMimeParts: N The branch stable/13 has been updated by markj: URL: https://cgit.FreeBSD.org/src/commit/?id=925211125cec3481cb0fc14444868ab51d904222 commit 925211125cec3481cb0fc14444868ab51d904222 Author: Mark Johnston AuthorDate: 2021-10-31 13:59:59 +0000 Commit: Mark Johnston CommitDate: 2021-10-31 13:59:59 +0000 Revert "bhyve: Map the MSI-X table unconditionally for passthrough" This reverts commit 382eec24c0284bd7dc5997b85abc9ee70ea704a1. This change causes a regression where a VM using passthrough no longer starts. Until this is resolved, revert the commit. Reported by: Raúl Muñoz --- usr.sbin/bhyve/pci_emul.h | 4 +- usr.sbin/bhyve/pci_passthru.c | 186 +++++++++++++++++++++++++----------------- 2 files changed, 114 insertions(+), 76 deletions(-) diff --git a/usr.sbin/bhyve/pci_emul.h b/usr.sbin/bhyve/pci_emul.h index 5b6a17119960..031a6113fac4 100644 --- a/usr.sbin/bhyve/pci_emul.h +++ b/usr.sbin/bhyve/pci_emul.h @@ -157,8 +157,8 @@ struct pci_devinst { int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ - uint8_t *mapped_addr; - size_t mapped_size; + void *pba_page; + int pba_page_offset; } pi_msix; void *pi_arg; /* devemu-private data */ diff --git a/usr.sbin/bhyve/pci_passthru.c b/usr.sbin/bhyve/pci_passthru.c index bf99c646c480..2c6a2ebd8dd9 100644 --- a/usr.sbin/bhyve/pci_passthru.c +++ b/usr.sbin/bhyve/pci_passthru.c @@ -43,10 +43,7 @@ __FBSDID("$FreeBSD$"); #include #include -#include - #include -#include #ifndef WITHOUT_CAPSICUM #include @@ -72,12 +69,17 @@ __FBSDID("$FreeBSD$"); #define _PATH_DEVPCI "/dev/pci" #endif +#ifndef _PATH_MEM +#define _PATH_MEM "/dev/mem" +#endif + #define LEGACY_SUPPORT 1 #define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) #define MSIX_CAPLEN 12 static int pcifd = -1; +static int memfd = -1; struct passthru_softc { struct pci_devinst *psc_pi; @@ -288,30 +290,30 @@ msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) uint64_t *src64; uint64_t data; size_t entry_offset; - uint32_t table_offset; - int index, table_count; + int index; pi = sc->psc_pi; - - table_offset = pi->pi_msix.table_offset; - table_count = pi->pi_msix.table_count; - if (offset < table_offset || - offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) { - switch (size) { + if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { case 1: - src8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset); + src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); data = *src8; break; case 2: - src16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset); + src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); data = *src16; break; case 4: - src32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset); + src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); data = *src32; break; case 8: - src64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset); + src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); data = *src64; break; default: @@ -320,28 +322,32 @@ msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) return (data); } - offset -= table_offset; + if (offset < pi->pi_msix.table_offset) + return (-1); + + offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; - assert(index < table_count); + if (index >= pi->pi_msix.table_count) + return (-1); entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; - switch (size) { + switch(size) { case 1: - src8 = (uint8_t *)((uint8_t *)entry + entry_offset); + src8 = (uint8_t *)((void *)entry + entry_offset); data = *src8; break; case 2: - src16 = (uint16_t *)((uint8_t *)entry + entry_offset); + src16 = (uint16_t *)((void *)entry + entry_offset); data = *src16; break; case 4: - src32 = (uint32_t *)((uint8_t *)entry + entry_offset); + src32 = (uint32_t *)((void *)entry + entry_offset); data = *src32; break; case 8: - src64 = (uint64_t *)((uint8_t *)entry + entry_offset); + src64 = (uint64_t *)((void *)entry + entry_offset); data = *src64; break; default: @@ -362,39 +368,46 @@ msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, uint32_t *dest32; uint64_t *dest64; size_t entry_offset; - uint32_t table_offset, vector_control; - int index, table_count; + uint32_t vector_control; + int index; pi = sc->psc_pi; - - table_offset = pi->pi_msix.table_offset; - table_count = pi->pi_msix.table_count; - if (offset < table_offset || - offset >= table_offset + table_count * MSIX_TABLE_ENTRY_SIZE) { - switch (size) { + if (pi->pi_msix.pba_page != NULL && offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { case 1: - dest8 = (uint8_t *)(pi->pi_msix.mapped_addr + offset); + dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); *dest8 = data; break; case 2: - dest16 = (uint16_t *)(pi->pi_msix.mapped_addr + offset); + dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); *dest16 = data; break; case 4: - dest32 = (uint32_t *)(pi->pi_msix.mapped_addr + offset); + dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); *dest32 = data; break; case 8: - dest64 = (uint64_t *)(pi->pi_msix.mapped_addr + offset); + dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); *dest64 = data; break; + default: + break; } return; } - offset -= table_offset; + if (offset < pi->pi_msix.table_offset) + return; + + offset -= pi->pi_msix.table_offset; index = offset / MSIX_TABLE_ENTRY_SIZE; - assert(index < table_count); + if (index >= pi->pi_msix.table_count) + return; entry = &pi->pi_msix.table[index]; entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; @@ -422,10 +435,13 @@ msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, static int init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) { - struct pci_devinst *pi = sc->psc_pi; - struct pci_bar_mmap pbm; int b, s, f; + int idx; + size_t remaining; uint32_t table_size, table_offset; + uint32_t pba_size, pba_offset; + vm_paddr_t start; + struct pci_devinst *pi = sc->psc_pi; assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); @@ -433,48 +449,55 @@ init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) s = sc->psc_sel.pc_dev; f = sc->psc_sel.pc_func; - /* - * Map the region of the BAR containing the MSI-X table. This is - * necessary for two reasons: - * 1. The PBA may reside in the first or last page containing the MSI-X - * table. - * 2. While PCI devices are not supposed to use the page(s) containing - * the MSI-X table for other purposes, some do in practice. + /* + * If the MSI-X table BAR maps memory intended for + * other uses, it is at least assured that the table + * either resides in its own page within the region, + * or it resides in a page shared with only the PBA. */ - memset(&pbm, 0, sizeof(pbm)); - pbm.pbm_sel = sc->psc_sel; - pbm.pbm_flags = PCIIO_BAR_MMAP_RW; - pbm.pbm_reg = PCIR_BAR(pi->pi_msix.pba_bar); - pbm.pbm_memattr = VM_MEMATTR_DEVICE; - - if (ioctl(pcifd, PCIOCBARMMAP, &pbm) != 0) { - warn("Failed to map MSI-X table BAR on %d/%d/%d", b, s, f); - return (-1); - } - assert(pbm.pbm_bar_off == 0); - pi->pi_msix.mapped_addr = (uint8_t *)(uintptr_t)pbm.pbm_map_base; - pi->pi_msix.mapped_size = pbm.pbm_map_length; - table_offset = rounddown2(pi->pi_msix.table_offset, 4096); table_size = pi->pi_msix.table_offset - table_offset; table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; table_size = roundup2(table_size, 4096); - /* - * Unmap any pages not covered by the table, we do not need to emulate - * accesses to them. Avoid releasing address space to help ensure that - * a buggy out-of-bounds access causes a crash. - */ - if (table_offset != 0) - if (mprotect(pi->pi_msix.mapped_addr, table_offset, - PROT_NONE) != 0) - warn("Failed to unmap MSI-X table BAR region"); - if (table_offset + table_size != pi->pi_msix.mapped_size) - if (mprotect(pi->pi_msix.mapped_addr, - pi->pi_msix.mapped_size - (table_offset + table_size), - PROT_NONE) != 0) - warn("Failed to unmap MSI-X table BAR region"); + idx = pi->pi_msix.table_bar; + start = pi->pi_bar[idx].addr; + remaining = pi->pi_bar[idx].size; + + if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { + pba_offset = pi->pi_msix.pba_offset; + pba_size = pi->pi_msix.pba_size; + if (pba_offset >= table_offset + table_size || + table_offset >= pba_offset + pba_size) { + /* + * If the PBA does not share a page with the MSI-x + * tables, no PBA emulation is required. + */ + pi->pi_msix.pba_page = NULL; + pi->pi_msix.pba_page_offset = 0; + } else { + /* + * The PBA overlaps with either the first or last + * page of the MSI-X table region. Map the + * appropriate page. + */ + if (pba_offset <= table_offset) + pi->pi_msix.pba_page_offset = table_offset; + else + pi->pi_msix.pba_page_offset = table_offset + + table_size - 4096; + pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | + PROT_WRITE, MAP_SHARED, memfd, start + + pi->pi_msix.pba_page_offset); + if (pi->pi_msix.pba_page == MAP_FAILED) { + warn( + "Failed to map PBA page for MSI-X on %d/%d/%d", + b, s, f); + return (-1); + } + } + } return (0); } @@ -622,7 +645,7 @@ passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) #ifndef WITHOUT_CAPSICUM cap_rights_t rights; cap_ioctl_t pci_ioctls[] = - { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR, PCIOCBARIO, PCIOCBARMMAP }; + { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR, PCIOCBARIO }; #endif sc = NULL; @@ -653,6 +676,21 @@ passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif + if (memfd < 0) { + memfd = open(_PATH_MEM, O_RDWR, 0); + if (memfd < 0) { + warn("failed to open %s", _PATH_MEM); + return (error); + } + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_clear(&rights, CAP_IOCTL); + cap_rights_set(&rights, CAP_MMAP_RW); + if (caph_rights_limit(memfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + #define GET_INT_CONFIG(var, name) do { \ value = get_config_value_node(nvl, name); \ if (value == NULL) { \