git: bcecad2c24aa - main - riscv: IOMMU support

From: Ruslan Bukin <br_at_FreeBSD.org>
Date: Tue, 12 May 2026 10:20:41 UTC
The branch main has been updated by br:

URL: https://cgit.FreeBSD.org/src/commit/?id=bcecad2c24aa500913559c00f1be8b364a3ff150

commit bcecad2c24aa500913559c00f1be8b364a3ff150
Author:     Ruslan Bukin <br@FreeBSD.org>
AuthorDate: 2026-05-12 10:09:04 +0000
Commit:     Ruslan Bukin <br@FreeBSD.org>
CommitDate: 2026-05-12 10:11:32 +0000

    riscv: IOMMU support
    
    Support for RISC-V IOMMU spec v1.0.1 (ratified)
      https://github.com/riscv-non-isa/riscv-iommu
    
    Supports translation for PCI devices only.
    Supports 1 or 2-level device-directory-table (DDT).
    Supports SV39 and SV48 virtual memory system (on per-device basis).
    Supports both "standard" and "extended" device-context (DC) structure.
    Supports "bypass" mode to disable translation for a particular device.
    Supports WSI (Wire-Signalled Interrupts) only.
    
    This includes both PCI-bus and FDT attachment drivers.
    
    Note that in the case of PCI-bus attachment, interrupts are not
    available, so no error report is provided on a translation fault.
    For the FDT attachment, interrupts are not otherwise required.
    
    Differential Revision:  https://reviews.freebsd.org/D55922
---
 sys/conf/files.riscv             |    9 +
 sys/riscv/conf/GENERIC           |    1 +
 sys/riscv/include/bus_dma_impl.h |    3 +
 sys/riscv/include/iommu.h        |   10 +
 sys/riscv/iommu/iommu.c          | 1351 ++++++++++++++++++++++++++++++++++++++
 sys/riscv/iommu/iommu.h          |  359 ++++++++++
 sys/riscv/iommu/iommu_fdt.c      |  145 ++++
 sys/riscv/iommu/iommu_frontend.c |  505 ++++++++++++++
 sys/riscv/iommu/iommu_frontend.h |   38 ++
 sys/riscv/iommu/iommu_if.m       |  147 +++++
 sys/riscv/iommu/iommu_pci.c      |  172 +++++
 sys/riscv/iommu/iommu_pmap.c     |  629 ++++++++++++++++++
 sys/riscv/iommu/iommu_pmap.h     |   49 ++
 13 files changed, 3418 insertions(+)

diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv
index 90a74367a54f..58a31b5f326e 100644
--- a/sys/conf/files.riscv
+++ b/sys/conf/files.riscv
@@ -6,6 +6,7 @@ cddl/dev/fbt/riscv/fbt_isa.c				optional dtrace_fbt | dtraceall compile-with "${
 crypto/des/des_enc.c		optional	netsmb
 dev/cpufreq/cpufreq_dt.c	optional	cpufreq fdt
 dev/ofw/ofw_cpu.c		optional	fdt
+dev/ofw/ofw_pci.c		optional 	pci fdt
 dev/ofw/ofw_pcib.c		optional 	pci fdt
 dev/pci/pci_dw.c		optional	pci fdt
 dev/pci/pci_dw_if.m		optional	pci fdt
@@ -42,6 +43,14 @@ libkern/memset.c		standard
 libkern/strcmp.c		standard
 libkern/strlen.c		standard
 libkern/strncmp.c		standard
+riscv/iommu/iommu_frontend.c	standard
+riscv/iommu/iommu_if.m		standard
+riscv/iommu/iommu.c		standard
+riscv/iommu/iommu_fdt.c		optional	fdt
+riscv/iommu/iommu_pci.c		optional	pci
+riscv/iommu/iommu_pmap.c	optional	iommu
+dev/iommu/busdma_iommu.c	optional	iommu
+dev/iommu/iommu_gas.c		optional	iommu
 riscv/riscv/aplic.c		standard
 riscv/riscv/autoconf.c		standard
 riscv/riscv/bus_machdep.c	standard
diff --git a/sys/riscv/conf/GENERIC b/sys/riscv/conf/GENERIC
index 827d5efef50b..cce2787ed5d7 100644
--- a/sys/riscv/conf/GENERIC
+++ b/sys/riscv/conf/GENERIC
@@ -77,6 +77,7 @@ options 	RACCT			# Resource accounting framework
 options 	RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default
 options 	RCTL			# Resource limits
 options 	SMP
+options 	IOMMU
 
 # RISC-V SBI console
 device		rcons
diff --git a/sys/riscv/include/bus_dma_impl.h b/sys/riscv/include/bus_dma_impl.h
index 09fd29b74f8e..8c2040a68f52 100644
--- a/sys/riscv/include/bus_dma_impl.h
+++ b/sys/riscv/include/bus_dma_impl.h
@@ -41,6 +41,7 @@ struct bus_dma_tag_common {
 	int		  flags;
 	bus_dma_lock_t	 *lockfunc;
 	void		 *lockfuncarg;
+	int		  domain;
 };
 
 struct bus_dma_impl {
@@ -52,6 +53,8 @@ struct bus_dma_impl {
 	int (*tag_destroy)(bus_dma_tag_t dmat);
 	int (*map_create)(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp);
 	int (*map_destroy)(bus_dma_tag_t dmat, bus_dmamap_t map);
+	int (*tag_set_domain)(bus_dma_tag_t);
+	bool (*id_mapped)(bus_dma_tag_t, vm_paddr_t, bus_size_t);
 	int (*mem_alloc)(bus_dma_tag_t dmat, void** vaddr, int flags,
 	    bus_dmamap_t *mapp);
 	void (*mem_free)(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map);
diff --git a/sys/riscv/include/iommu.h b/sys/riscv/include/iommu.h
new file mode 100644
index 000000000000..38214b7003f4
--- /dev/null
+++ b/sys/riscv/include/iommu.h
@@ -0,0 +1,10 @@
+/*-
+ * This file is in the public domain.
+ */
+
+#ifndef	_MACHINE_IOMMU_H_
+#define	_MACHINE_IOMMU_H_
+
+#include <riscv/iommu/iommu_frontend.h>
+
+#endif /* !_MACHINE_IOMMU_H_ */
diff --git a/sys/riscv/iommu/iommu.c b/sys/riscv/iommu/iommu.c
new file mode 100644
index 000000000000..59df2e68384f
--- /dev/null
+++ b/sys/riscv/iommu/iommu.c
@@ -0,0 +1,1351 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin <br@bsdpad.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/bitstring.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/rman.h>
+#include <sys/lock.h>
+#include <sys/sysctl.h>
+#include <sys/tree.h>
+#include <sys/taskqueue.h>
+#include <sys/refcount.h>
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/rman.h>
+
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+
+#include <dev/iommu/iommu.h>
+#include <riscv/iommu/iommu_pmap.h>
+#include <riscv/iommu/iommu.h>
+
+#include "iommu_if.h"
+
+#define	dprintf(fmt, ...)
+
+MALLOC_DEFINE(M_IOMMU, "RISCV_IOMMU", "RISC-V IOMMU");
+
+#define	RD4(sc, reg)		bus_read_4(sc->res[0], (reg))
+#define	WR4(sc, reg, val)	bus_write_4(sc->res[0], (reg), (val))
+#define	RD8(sc, reg)		bus_read_8(sc->res[0], (reg))
+#define	WR8(sc, reg, val)	bus_write_8(sc->res[0], (reg), (val))
+
+#define	CQ_ENTRY_DWORDS		2	/* 16-byte */
+#define	CQ_ENTRY_COUNT		8192	/* Amount of 16-byte entries. */
+#define	FQ_ENTRY_DWORDS		4	/* 32-byte */
+#define	FQ_ENTRY_COUNT		8192	/* Amount of 32-byte entries. */
+#define	PQ_ENTRY_DWORDS		2	/* 16-byte */
+#define	PQ_ENTRY_COUNT		8192	/* Amount of 16-byte entries. */
+
+#define	DDT_NON_LEAF_DWORDS	1
+#define	DDT_DC_STD_DWORDS	4	/* Standard-format DC. */
+#define	DDT_DC_EXT_DWORDS	8	/* Extended-format DC. */
+#define	DDT_L1_DID_BITS		9	/* All formats. */
+
+#define	QUEUE_ALIGN		(1024 * 1024)	/* TODO */
+#define	QUEUE_HEAD(q)		((q)->csr + RISCV_IOMMU_CQH - RISCV_IOMMU_CQB)
+#define	QUEUE_TAIL(q)		((q)->csr + RISCV_IOMMU_CQT - RISCV_IOMMU_CQB)
+#define	QUEUE_IPSR(q)		(1 << (q)->idx)
+
+#define	PHYS_TO_PPN(p)		((p) >> 12)
+
+struct riscv_iommu_fq_event {
+	uint16_t cause_id;
+	char *descr;
+};
+
+static struct riscv_iommu_fq_event fq_events[] = {
+	{ FQ_CAUSE_INST_FAULT,		"Instruction access fault" },
+	{ FQ_CAUSE_RD_ADDR_MISALIGNED,	"Read address misaligned" },
+	{ FQ_CAUSE_RD_FAULT,		"Read access fault" },
+	{ FQ_CAUSE_WR_ADDR_MISALIGNED,	"Write/AMO address misaligned" },
+	{ FQ_CAUSE_WR_FAULT,		"Write/AMO access fault" },
+	{ FQ_CAUSE_INST_FAULT_S,	"Instruction page fault" },
+	{ FQ_CAUSE_RD_FAULT_S,		"Read page fault" },
+	{ FQ_CAUSE_WR_FAULT_S,		"Write/AMO page fault" },
+	{ FQ_CAUSE_INST_FAULT_VS,	"Instruction guest page fault" },
+	{ FQ_CAUSE_RD_FAULT_VS,		"Read guest-page fault" },
+	{ FQ_CAUSE_WR_FAULT_VS,		"Write/AMO guest-page fault" },
+	{ FQ_CAUSE_DMA_DISABLED,	"All inbound transactions disallowed" },
+	{ FQ_CAUSE_DDT_LOAD_FAULT,	"DDT entry load access fault" },
+	{ FQ_CAUSE_DDT_INVALID,		"DDT entry not valid" },
+	{ FQ_CAUSE_DDT_MISCONFIGURED,	"DDT entry misconfigured" },
+	{ FQ_CAUSE_TR_TYPE_DISALLOWED,	"Transaction type disallowed" },
+	{ FQ_CAUSE_MSI_LOAD_FAULT,	"MSI PTE load access fault" },
+	{ FQ_CAUSE_MSI_INVALID, 	"MSI PTE not valid" },
+	{ FQ_CAUSE_MSI_MISCONFIGURED,	"MSI PTE misconfigured" },
+	{ FQ_CAUSE_MRIF_FAULT,		"MRIF access fault" },
+	{ FQ_CAUSE_PDT_LOAD_FAULT,	"PDT entry load access fault" },
+	{ FQ_CAUSE_PDT_INVALID,		"PDT entry not valid" },
+	{ FQ_CAUSE_PDT_MISCONFIGURED,	"PDT entry misconfigured" },
+	{ FQ_CAUSE_DDT_CORRUPTED,	"DDT data corruption" },
+	{ FQ_CAUSE_PDT_CORRUPTED,	"PDT data corruption" },
+	{ FQ_CAUSE_MSI_PT_CORRUPTED,	"MSI PT data corruption" },
+	{ FQ_CAUSE_MRIF_CORRUPTED,	"MSI MRIF data corruption" },
+	{ FQ_CAUSE_INTERNAL_DP_ERROR,	"Internal data path error" },
+	{ FQ_CAUSE_MSI_WR_FAULT,	"IOMMU MSI write access fault" },
+	{ FQ_CAUSE_PT_CORRUPTED,	"1st/2nd-stage PT data corruption" },
+	{ 0, NULL },
+};
+
+static void
+riscv_iommu_init_pscids(struct riscv_iommu_softc *sc)
+{
+
+	sc->pscid_set_size = (1 << sc->pscid_bits);
+	sc->pscid_set = bit_alloc(sc->pscid_set_size, M_IOMMU, M_WAITOK);
+	mtx_init(&sc->pscid_set_mutex, "pscid set", NULL, MTX_SPIN);
+}
+
+static int
+riscv_iommu_pscid_alloc(struct riscv_iommu_softc *sc, int *new_pscid)
+{
+
+	mtx_lock_spin(&sc->pscid_set_mutex);
+	bit_ffc(sc->pscid_set, sc->pscid_set_size, new_pscid);
+	if (*new_pscid == -1) {
+		mtx_unlock_spin(&sc->pscid_set_mutex);
+		return (ENOMEM);
+	}
+	bit_set(sc->pscid_set, *new_pscid);
+	mtx_unlock_spin(&sc->pscid_set_mutex);
+
+	return (0);
+}
+
+static void
+riscv_iommu_pscid_free(struct riscv_iommu_softc *sc, int pscid)
+{
+
+	mtx_lock_spin(&sc->pscid_set_mutex);
+	bit_clear(sc->pscid_set, pscid);
+	mtx_unlock_spin(&sc->pscid_set_mutex);
+}
+
+static uint32_t
+riscv_iommu_q_inc_tail(struct riscv_iommu_queue *q)
+{
+
+	return ((q->lc.tail + 1) & q->mask);
+}
+
+static uint32_t
+riscv_iommu_q_inc_head(struct riscv_iommu_queue *q)
+{
+
+	return ((q->lc.head + 1) & q->mask);
+}
+
+static int
+riscv_iommu_q_has_space(struct riscv_iommu_queue *q)
+{
+
+	if (riscv_iommu_q_inc_tail(q) != q->lc.head)
+		return (1);
+
+	return (0);
+}
+
+static int
+riscv_iommu_q_empty(struct riscv_iommu_queue *q)
+{
+
+	if (q->lc.tail == q->lc.head)
+		return (1);
+
+	return (0);
+}
+
+static int
+riscv_iommu_dequeue(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q,
+    void *data)
+{
+	void *entry_addr;
+
+	q->lc.val = RD8(sc, q->head_off);
+	entry_addr = (void *)((uint64_t)q->vaddr + q->lc.head * q->entry_size);
+	memcpy(data, entry_addr, q->entry_size);
+	q->lc.head = riscv_iommu_q_inc_head(q);
+	WR4(sc, q->head_off, q->lc.head);
+
+	return (0);
+}
+
+static int
+riscv_iommu_enqueue(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q,
+    void *data)
+{
+	void *entry_addr;
+
+	RISCV_IOMMU_LOCK(sc);
+
+	/* Ensure that a space is available. */
+	do {
+		q->lc.head = RD4(sc, q->head_off);
+	} while (riscv_iommu_q_has_space(q) == 0);
+
+	/* Write the command to the current tail entry. */
+	entry_addr = (void *)((uint64_t)q->vaddr + q->lc.tail * q->entry_size);
+	memcpy(entry_addr, data, q->entry_size);
+
+	/* Increment tail index. */
+	q->lc.tail = riscv_iommu_q_inc_tail(q);
+	WR4(sc, q->tail_off, q->lc.tail);
+
+	RISCV_IOMMU_UNLOCK(sc);
+
+	return (0);
+}
+
+static void
+riscv_iommu_sync(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q)
+{
+	struct riscv_iommu_command cmd;
+	uint64_t reg;
+
+	bzero(&cmd, sizeof(struct riscv_iommu_command));
+	reg = COMMAND_OPCODE_IOFENCE;
+	reg |= FUNC_IOFENCE_FUNC_C | FUNC_IOFENCE_PR | FUNC_IOFENCE_PW;
+	cmd.dword0 = reg;
+
+	riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+	/*
+	 * FUNC_IOFENCE_WSI does not seem to be implemented in QEMU,
+	 * so ensure all requests are processed in polling mode.
+	 */
+	do {
+		q->lc.head = RD4(sc, q->head_off);
+	} while (riscv_iommu_q_empty(q) == 0);
+}
+
+static int
+riscv_iommu_inval_ddt(struct riscv_iommu_softc *sc)
+{
+	struct riscv_iommu_command cmd;
+	uint64_t reg;
+
+	bzero(&cmd, sizeof(struct riscv_iommu_command));
+	reg = COMMAND_OPCODE_IODIR;
+	reg |= FUNC_IODIR_INVAL_DDT;
+	cmd.dword0 = reg;
+
+	riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+	return (0);
+}
+
+static int
+riscv_iommu_inval_ddt_did(struct riscv_iommu_softc *sc, int did)
+{
+	struct riscv_iommu_command cmd;
+	uint64_t reg;
+
+	bzero(&cmd, sizeof(struct riscv_iommu_command));
+	reg = COMMAND_OPCODE_IODIR;
+	reg |= FUNC_IODIR_INVAL_DDT;
+	reg |= FUNC_IODIR_DV;
+	reg |= (uint64_t)did << FUNC_IODIR_DID_S;
+	cmd.dword0 = reg;
+
+	riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+	return (0);
+}
+
+/* Invalidate entire address space. */
+static int
+riscv_iommu_inval_vma(struct riscv_iommu_softc *sc)
+{
+	struct riscv_iommu_command cmd;
+	uint64_t reg;
+
+	bzero(&cmd, sizeof(struct riscv_iommu_command));
+	reg = COMMAND_OPCODE_IOTINVAL;
+	reg |= FUNC_IOTINVAL_VMA;
+	cmd.dword0 = reg;
+
+	riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+	return (0);
+}
+
+static int
+riscv_iommu_inval_vma_page(struct riscv_iommu_softc *sc, vm_offset_t addr,
+    int pscid)
+{
+	struct riscv_iommu_command cmd;
+	uint64_t reg;
+
+	bzero(&cmd, sizeof(struct riscv_iommu_command));
+	reg = COMMAND_OPCODE_IOTINVAL;
+	reg |= FUNC_IOTINVAL_VMA;
+	reg |= FUNC_IOTINVAL_AV;
+	reg |= FUNC_IOTINVAL_PSCV;
+	reg |= pscid << FUNC_IOTINVAL_PSCID_S;
+	cmd.dword0 = reg;
+	cmd.dword1 = PHYS_TO_PPN(addr) << FUNC_IOTINVAL_ADDR_S;
+
+	riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+	return (0);
+}
+
+static int
+riscv_iommu_inval_vma_pscid(struct riscv_iommu_softc *sc, int pscid)
+{
+	struct riscv_iommu_command cmd;
+	uint64_t reg;
+
+	bzero(&cmd, sizeof(struct riscv_iommu_command));
+	reg = COMMAND_OPCODE_IOTINVAL;
+	reg |= FUNC_IOTINVAL_VMA;
+	reg |= FUNC_IOTINVAL_PSCV;
+	reg |= pscid << FUNC_IOTINVAL_PSCID_S;
+	cmd.dword0 = reg;
+
+	riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+	return (0);
+}
+
+static int
+riscv_iommu_set_mode(struct riscv_iommu_softc *sc)
+{
+	struct riscv_iommu_ddt *ddt;
+	uint64_t reg;
+	uint64_t base;
+
+	reg = RD8(sc, RISCV_IOMMU_DDTP);
+	if (reg & DDTP_BUSY)
+		return (ENXIO);
+
+	ddt = &sc->ddt;
+	base = ddt->base | (sc->iommu_mode << DDTP_IOMMU_MODE_S);
+	WR8(sc, RISCV_IOMMU_DDTP, base);
+
+	reg = RD8(sc, RISCV_IOMMU_DDTP);
+	if (reg != base) {
+		device_printf(sc->dev, "could not set mode\n");
+		return (ENXIO);
+	}
+
+	riscv_iommu_inval_ddt(sc);
+	riscv_iommu_inval_vma(sc);
+
+	return (0);
+}
+
+static int
+riscv_iommu_enable_queue(struct riscv_iommu_softc *sc,
+    struct riscv_iommu_queue *q)
+{
+	uint32_t reg;
+	int timeout;
+
+	if (q == &sc->cq)
+		WR4(sc, QUEUE_TAIL(q), 0);
+	else
+		WR4(sc, QUEUE_HEAD(q), 0);
+
+	reg = CQCSR_CQEN | CQCSR_CIE | CQCSR_CQMF;
+	WR4(sc, q->csr, reg);
+
+	timeout = 1000;
+	do {
+		reg = RD4(sc, RISCV_IOMMU_CQCSR);
+		if ((reg & CQCSR_BUSY) == 0)
+			break;
+		DELAY(10);
+	} while (timeout--);
+
+	if (timeout <= 0) {
+		device_printf(sc->dev, "could not enable command queue\n");
+		return (-1);
+	}
+
+	if ((reg & CQCSR_CQON) == 0) {
+		device_printf(sc->dev, "could not activate command queue\n");
+		return (-1);
+	}
+
+	/* RW1C interrupt pending bit. */
+	WR4(sc, RISCV_IOMMU_IPSR, QUEUE_IPSR(q));
+
+	return (0);
+}
+
+static int
+riscv_iommu_init_queue(struct riscv_iommu_softc *sc,
+    struct riscv_iommu_queue *q, uint64_t base, uint32_t dwords)
+{
+	uint64_t reg;
+	int sz;
+
+	q->entry_size = dwords * 8;
+	sz = (1 << q->size_log2) * q->entry_size;
+
+	/* Set up the command circular buffer */
+	q->vaddr = contigmalloc(sz, M_IOMMU, M_WAITOK | M_ZERO, 0,
+	    (1ul << 48) - 1, QUEUE_ALIGN, 0);
+	if (q->vaddr == NULL) {
+		device_printf(sc->dev, "failed to allocate %d bytes\n", sz);
+		return (-1);
+	}
+
+	q->mask = (1 << q->size_log2) - 1;
+	q->head_off = (uint32_t)base - RISCV_IOMMU_CQB + RISCV_IOMMU_CQH;
+	q->tail_off = (uint32_t)base - RISCV_IOMMU_CQB + RISCV_IOMMU_CQT;
+	q->paddr = vtophys(q->vaddr);
+	q->base = (sc->cq.size_log2 - 1) << CQB_LOG2SZ_1_S;
+	q->base |= PHYS_TO_PPN(q->paddr) << CQB_PPN_S;
+	WR8(sc, base, q->base);
+
+	/* Verify it sticks. */
+	reg = RD8(sc, base);
+	if (reg != q->base) {
+		device_printf(sc->dev, "could not init queue\n");
+		return (ENXIO);
+	}
+
+	return (0);
+}
+
+static int
+riscv_iommu_init_queues(struct riscv_iommu_softc *sc)
+{
+	int error;
+
+	sc->cq.size_log2 = ilog2(CQ_ENTRY_COUNT);
+	sc->fq.size_log2 = ilog2(FQ_ENTRY_COUNT);
+	sc->pq.size_log2 = ilog2(PQ_ENTRY_COUNT);
+
+	sc->cq.csr = RISCV_IOMMU_CQCSR;
+	sc->fq.csr = RISCV_IOMMU_FQCSR;
+	sc->pq.csr = RISCV_IOMMU_PQCSR;
+
+	sc->cq.idx = 0;
+	sc->fq.idx = 1;
+	sc->pq.idx = 3;
+
+	/* Command queue (CQ). */
+	error = riscv_iommu_init_queue(sc, &sc->cq, RISCV_IOMMU_CQB,
+	    CQ_ENTRY_DWORDS);
+	if (error)
+		return (error);
+
+	/* Fault queue (FQ). */
+	error = riscv_iommu_init_queue(sc, &sc->fq, RISCV_IOMMU_FQB,
+	    FQ_ENTRY_DWORDS);
+	if (error)
+		return (error);
+
+	/* Page request queue (PQ). */
+	error = riscv_iommu_init_queue(sc, &sc->pq, RISCV_IOMMU_PQB,
+	    PQ_ENTRY_DWORDS);
+	if (error)
+		return (error);
+
+	error = riscv_iommu_enable_queue(sc, &sc->cq);
+	if (error)
+		return (error);
+
+	error = riscv_iommu_enable_queue(sc, &sc->fq);
+	if (error)
+		return (error);
+
+	error = riscv_iommu_enable_queue(sc, &sc->pq);
+	if (error)
+		return (error);
+
+	return (0);
+}
+
+static int
+riscv_iommu_init_pagedir(struct riscv_iommu_softc *sc)
+{
+
+	return (0);
+}
+
+static void
+riscv_iommu_print_fault(struct riscv_iommu_softc *sc,
+    struct riscv_iommu_fq_record *rec)
+{
+	struct riscv_iommu_fq_event *ev;
+	uint16_t cause_id;
+	uint16_t ttyp;
+	uint32_t did;
+	uint32_t pid;
+	bool pv, priv;
+	int i;
+
+	cause_id = (rec->hdr & FQR_HDR_CAUSE_M) >> FQR_HDR_CAUSE_S;
+	ttyp = (rec->hdr & FQR_HDR_TTYP_M) >> FQR_HDR_TTYP_S;
+	did = (rec->hdr & FQR_HDR_DID_M) >> FQR_HDR_DID_S;
+	pid = (rec->hdr & FQR_HDR_PID_M) >> FQR_HDR_PID_S;
+	pv = (rec->hdr & FQR_HDR_PV) ? 1 : 0;
+	priv = (rec->hdr & FQR_HDR_PRIV) ? 1 : 0;
+
+	ev = NULL;
+	for (i = 0; fq_events[i].cause_id != 0; i++) {
+		if (fq_events[i].cause_id == cause_id) {
+			ev = &fq_events[i];
+			break;
+		}
+	}
+
+	if (ev == NULL) {
+		device_printf(sc->dev, "Fault: unknown fault 0x%x received\n",
+		    cause_id);
+		return;
+	}
+
+	device_printf(sc->dev, "Fault: event 0x%x received: %s\n",
+	    ev->cause_id, ev->descr);
+	device_printf(sc->dev, "    hdr 0x%lx\n", rec->hdr);
+	device_printf(sc->dev, "    iotval 0x%lx\n", rec->iotval);
+	device_printf(sc->dev, "    iotval2 0x%lx\n", rec->iotval2);
+	device_printf(sc->dev, "    ttyp 0x%x did 0x%x pid 0x%x pv %d priv %d"
+	    "\n", ttyp, did, pid, pv, priv);
+}
+
+static int
+riscv_cq_intr(void *arg)
+{
+	struct riscv_iommu_softc *sc;
+	struct riscv_iommu_queue *q;
+	uint32_t reg;
+
+	sc = arg;
+	q = &sc->cq;
+
+	reg = RD4(sc, q->csr);
+	printf("%s: pending %x\n", __func__, reg);
+
+	/* Clear pending bit. */
+	WR4(sc, RISCV_IOMMU_IPSR, IPSR_CIP);
+
+	return (FILTER_HANDLED);
+}
+
+static int
+riscv_fq_intr(void *arg)
+{
+	struct riscv_iommu_fq_record rec;
+	struct riscv_iommu_softc *sc;
+	struct riscv_iommu_queue *q;
+	uint32_t reg;
+
+	sc = arg;
+	q = &sc->fq;
+
+	reg = RD4(sc, q->csr);
+	printf("%s: pending %x\n", __func__, reg);
+
+	/* Clear pending bit. */
+	WR4(sc, RISCV_IOMMU_IPSR, IPSR_FIP);
+
+	do {
+		riscv_iommu_dequeue(sc, q, &rec);
+		riscv_iommu_print_fault(sc, &rec);
+	} while (!riscv_iommu_q_empty(q));
+
+	return (FILTER_HANDLED);
+}
+
+static int
+riscv_pm_intr(void *arg)
+{
+	struct riscv_iommu_softc *sc;
+
+	sc = arg;
+
+	printf("%s\n", __func__);
+
+	/* Clear pending bit. */
+	WR4(sc, RISCV_IOMMU_IPSR, IPSR_PMIP);
+
+	return (FILTER_HANDLED);
+}
+
+static int
+riscv_pq_intr(void *arg)
+{
+	struct riscv_iommu_softc *sc;
+	struct riscv_iommu_queue *q;
+	uint32_t reg;
+
+	sc = arg;
+	q = &sc->pq;
+
+	reg = RD4(sc, q->csr);
+	printf("%s: pending %x\n", __func__, reg);
+
+	/* Clear pending bit. */
+	WR4(sc, RISCV_IOMMU_IPSR, IPSR_PIP);
+
+	return (FILTER_HANDLED);
+}
+
+static int
+riscv_iommu_init_ddt_linear(struct riscv_iommu_softc *sc)
+{
+	struct riscv_iommu_ddt *ddt;
+	uint64_t size;
+	uint64_t reg;
+
+	ddt = &sc->ddt;
+	ddt->num_top_entries = (1 << sc->l0_did_bits);
+
+	size = ddt->num_top_entries * (sc->dc_dwords << 3);
+
+	if (bootverbose)
+		device_printf(sc->dev, "linear ddt size %ld, num_top_entries "
+		    "%d\n", size, ddt->num_top_entries);
+
+	ddt->vaddr = contigmalloc(size, M_IOMMU,
+	    M_WAITOK | M_ZERO,	/* flags */
+	    0,			/* low */
+	    (1ul << 48) - 1,	/* high */
+	    size,		/* alignment */
+	    0);			/* boundary */
+	if (ddt->vaddr == NULL) {
+		device_printf(sc->dev, "failed to allocate ddt\n");
+		return (ENXIO);
+	}
+
+	reg = vtophys(ddt->vaddr);
+	if (bootverbose)
+		device_printf(sc->dev, "ddt base %p size %lx\n", ddt->vaddr,
+		    size);
+	ddt->base = PHYS_TO_PPN(reg) << DDTP_PPN_S;
+
+	return (0);
+}
+
+static int
+riscv_iommu_init_ddt_2lvl(struct riscv_iommu_softc *sc)
+{
+	struct riscv_iommu_ddt *ddt;
+	uint64_t size;
+	uint64_t reg;
+	uint64_t sz;
+
+	ddt = &sc->ddt;
+	ddt->num_top_entries = (1 << DDT_L1_DID_BITS);
+
+	size = ddt->num_top_entries * (DDT_NON_LEAF_DWORDS << 3);
+
+	if (bootverbose)
+		device_printf(sc->dev, "%s: size %lu, l1 entries %d, size "
+		    "%lu\n", __func__, size, ddt->num_top_entries, size);
+
+	ddt->vaddr = contigmalloc(size, M_IOMMU,
+	    M_WAITOK | M_ZERO,	/* flags */
+	    0,			/* low */
+	    (1ul << 48) - 1,	/* high */
+	    size,		/* alignment */
+	    0);			/* boundary */
+	if (ddt->vaddr == NULL) {
+		device_printf(sc->dev, "Failed to allocate 2lvl ddt.\n");
+		return (ENOMEM);
+	}
+
+	sz = ddt->num_top_entries * sizeof(struct l1_desc);
+	ddt->l1 = malloc(sz, M_IOMMU, M_WAITOK | M_ZERO);
+
+	reg = vtophys(ddt->vaddr);
+	if (bootverbose)
+		device_printf(sc->dev, "ddt base %p size %lx\n", ddt->vaddr,
+		    size);
+	ddt->base = PHYS_TO_PPN(reg) << DDTP_PPN_S;
+
+	return (0);
+}
+
+static int
+riscv_iommu_init_l0_directory(struct riscv_iommu_softc *sc, int sid)
+{
+	struct riscv_iommu_ddt *ddt;
+	struct l1_desc *l1_desc;
+	uint64_t *l1e;
+	uint64_t val;
+	size_t size;
+	int i;
+
+	ddt = &sc->ddt;
+	l1_desc = &ddt->l1[sid >> sc->l0_did_bits];
+	if (l1_desc->va) {
+		/* Already allocated. */
+		return (0);
+	}
+
+	size = (1 << sc->l0_did_bits) * (sc->dc_dwords << 3);
+
+	l1_desc->va = contigmalloc(size, M_IOMMU,
+	    M_WAITOK | M_ZERO,	/* flags */
+	    0,			/* low */
+	    (1ul << 48) - 1,	/* high */
+	    size,		/* alignment */
+	    0);			/* boundary */
+	if (l1_desc->va == NULL) {
+		device_printf(sc->dev, "failed to allocate l0 directory\n");
+		return (ENXIO);
+	}
+
+	l1_desc->pa = vtophys(l1_desc->va);
+
+	i = sid >> sc->l0_did_bits;
+	l1e = (void *)((uint64_t)ddt->vaddr + DDT_NON_LEAF_DWORDS * 8 * i);
+
+	/* Install the L1 entry. */
+	val = PHYS_TO_PPN(l1_desc->pa) << DC_NON_LEAF_ENTRY_PPN_S;
+	val |= DC_NON_LEAF_ENTRY_VALID;
+	*l1e = val;
+
+	return (0);
+}
+
+static void *
+riscv_iommu_get_dc_addr(struct riscv_iommu_softc *sc, int did)
+{
+	struct riscv_iommu_ddt *ddt;
+	struct l1_desc *l1_desc;
+	uintptr_t l0_base;
+	void *addr;
+	int l0_offs;
+	int l1_idx;
+
+	ddt = &sc->ddt;
+
+	l0_offs = sc->dc_dwords * 8 * (did & ((1 << sc->l0_did_bits) - 1));
+
+	if (sc->iommu_mode == DDTP_IOMMU_MODE_2LVL) {
+		l1_idx = (did >> sc->l0_did_bits) &
+		    ((1 << DDT_L1_DID_BITS) - 1);
+		l1_desc = &ddt->l1[l1_idx];
+		l0_base = (uintptr_t)l1_desc->va;
+	} else
+		l0_base = (uintptr_t)ddt->vaddr;
+
+	addr = (void *)(l0_base + l0_offs);
+
+	dprintf("ddt vaddr %p addr %p\n", ddt->vaddr, addr);
+
+	return (addr);
+}
+
+static int
+riscv_iommu_init_dc(struct riscv_iommu_softc *sc,
+    struct riscv_iommu_domain *domain, int did, bool bypass)
+{
+	struct riscv_iommu_dc_base *dc_base;
+	struct riscv_iommu_dc *dc;
+	struct riscv_iommu_pmap *p;
+
+	dc = riscv_iommu_get_dc_addr(sc, did);
+	dc_base = &dc->base;
+
+	device_printf(sc->dev, "address translation for device id"
+	    " 0x%x is %s.\n", did, bypass ? "bypassed" : "enabled");
+
+	p = &domain->p;
+
+	bzero(dc_base, sizeof(struct riscv_iommu_dc_base));
+	if (bypass == false)
+		dc_base->fsc = p->pm_satp;
+	dc_base->ta = (domain->pscid << DC_TA_PSCID_S) | DC_TA_V;
+
+	riscv_iommu_inval_ddt_did(sc, did);
+	riscv_iommu_sync(sc, &sc->cq);
+	dc_base->tc |= DC_TC_V;
+	riscv_iommu_inval_ddt_did(sc, did);
+	riscv_iommu_inval_vma(sc);
+	riscv_iommu_sync(sc, &sc->cq);
+
+	return (0);
+}
+
+static void
+riscv_iommu_deinit_dc(struct riscv_iommu_softc *sc, int did)
+{
+	struct riscv_iommu_dc_base *dc_base;
+	struct riscv_iommu_dc *dc;
+
+	dc = riscv_iommu_get_dc_addr(sc, did);
+	dc_base = &dc->base;
+	dc_base->tc &= ~DC_TC_V;
+
+	riscv_iommu_inval_ddt_did(sc, did);
+	riscv_iommu_sync(sc, &sc->cq);
+}
+
+static int
+riscv_iommu_setup_interrupts(struct riscv_iommu_softc *sc)
+{
+	device_t dev;
+	int error;
+
+	dev = sc->dev;
+
+	if (sc->res[1] == NULL || sc->res[2] == NULL ||
+	    sc->res[3] == NULL || sc->res[4] == NULL) {
+		device_printf(dev, "Warning: no interrupt resources "
+		    "provided.\n");
+		return (ENXIO);
+	}
+
+	error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_MISC,
+	    riscv_cq_intr, NULL, sc, &sc->intr_cookie[0]);
+	if (error) {
+		device_printf(dev, "Couldn't setup cq interrupt handler\n");
+		return (ENXIO);
+	}
+
+	error = bus_setup_intr(dev, sc->res[2], INTR_TYPE_MISC,
+	    riscv_fq_intr, NULL, sc, &sc->intr_cookie[1]);
+	if (error) {
+		device_printf(dev, "Couldn't setup fq interrupt handler\n");
+		return (ENXIO);
+	}
+
+	error = bus_setup_intr(dev, sc->res[3], INTR_TYPE_MISC,
+	    riscv_pm_intr, NULL, sc, &sc->intr_cookie[2]);
+	if (error) {
+		device_printf(dev, "Couldn't setup pm interrupt handler\n");
+		return (ENXIO);
+	}
+
+	error = bus_setup_intr(dev, sc->res[4], INTR_TYPE_MISC,
+	    riscv_pq_intr, NULL, sc, &sc->intr_cookie[3]);
+	if (error) {
+		device_printf(dev, "Couldn't setup pq interrupt handler\n");
+		return (ENXIO);
+	}
*** 2583 LINES SKIPPED ***