git: 03b330e1916f - main - riscv: add stage 2 translation to pmap.

From: Ruslan Bukin <br_at_FreeBSD.org>
Date: Wed, 05 Jun 2024 13:40:51 UTC
The branch main has been updated by br:

URL: https://cgit.FreeBSD.org/src/commit/?id=03b330e1916f468b16f7dbd0b7bd67b567a1eb1e

commit 03b330e1916f468b16f7dbd0b7bd67b567a1eb1e
Author:     Ruslan Bukin <br@FreeBSD.org>
AuthorDate: 2024-06-05 13:08:35 +0000
Commit:     Ruslan Bukin <br@FreeBSD.org>
CommitDate: 2024-06-05 13:36:57 +0000

    riscv: add stage 2 translation to pmap.
    
    Add basic stage 2 translation support (guest-physical to host-physical).
    
    The RISC-V hypervisor spec[1] introduces new translation schemes: Sv32x4,
    Sv39x4, Sv48x4 and Sv57x4.
    In each case, the size of the incoming address is widened by 2 bits (e.g.
    the 39-bit Sv39 becomes the 41-bit Sv39x4).
    To accommodate the 2 extra bits, the root page table (only) is expanded
    by a factor of four to be 16 KiB instead of the usual 4 KiB. The rest of
    the page table system (including the PTE format) is similar.
    This gives us 4x the address space in each scheme, but it does not make
    sense to support all of that memory for now.
    Allocate the required number of pages for the top directory in the stage 2
    case, but leave the three extra pages unused.
    
    1. https://github.com/riscv/riscv-isa-manual/blob/main/src/hypervisor.adoc
    
    Reviewed by:    mhorne
    Sponsored by:   UKRI
    Differential Revision:  https://reviews.freebsd.org/D45481
---
 sys/riscv/include/pmap.h |  8 ++++++++
 sys/riscv/riscv/pmap.c   | 46 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/sys/riscv/include/pmap.h b/sys/riscv/include/pmap.h
index e10cbacb6e1f..8123231144bb 100644
--- a/sys/riscv/include/pmap.h
+++ b/sys/riscv/include/pmap.h
@@ -67,6 +67,12 @@ struct md_page {
 	vm_memattr_t		pv_memattr;
 };
 
+enum pmap_stage {
+	PM_INVALID,
+	PM_STAGE1,
+	PM_STAGE2,
+};
+
 struct pmap {
 	struct mtx		pm_mtx;
 	struct pmap_statistics	pm_stats;	/* pmap statictics */
@@ -76,6 +82,7 @@ struct pmap {
 	TAILQ_HEAD(,pv_chunk)	pm_pvchunk;	/* list of mappings in pmap */
 	LIST_ENTRY(pmap)	pm_list;	/* List of all pmaps */
 	struct vm_radix		pm_root;
+	enum pmap_stage		pm_stage;
 };
 
 typedef struct pmap *pmap_t;
@@ -134,6 +141,7 @@ vm_paddr_t pmap_kextract(vm_offset_t va);
 void	pmap_kremove(vm_offset_t);
 void	pmap_kremove_device(vm_offset_t, vm_size_t);
 void	*pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma);
+int	pmap_pinit_stage(pmap_t, enum pmap_stage);
 bool	pmap_page_is_mapped(vm_page_t m);
 bool	pmap_ps_enabled(pmap_t);
 
diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c
index 0bdf3be8ea39..8176975b049c 100644
--- a/sys/riscv/riscv/pmap.c
+++ b/sys/riscv/riscv/pmap.c
@@ -632,6 +632,7 @@ pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
 
 	/* Set this early so we can use the pagetable walking functions */
 	kernel_pmap_store.pm_top = (pd_entry_t *)l1pt;
+	kernel_pmap_store.pm_stage = PM_STAGE1;
 	PMAP_LOCK_INIT(kernel_pmap);
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	vm_radix_init(&kernel_pmap->pm_root);
@@ -1324,6 +1325,7 @@ pmap_pinit0(pmap_t pmap)
 {
 	PMAP_LOCK_INIT(pmap);
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
+	pmap->pm_stage = PM_STAGE1;
 	pmap->pm_top = kernel_pmap->pm_top;
 	pmap->pm_satp = pmap_satp_mode() |
 	    (vtophys(pmap->pm_top) >> PAGE_SHIFT);
@@ -1334,23 +1336,35 @@ pmap_pinit0(pmap_t pmap)
 }
 
 int
-pmap_pinit(pmap_t pmap)
+pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage)
 {
 	vm_paddr_t topphys;
-	vm_page_t mtop;
+	vm_page_t m;
 	size_t i;
 
-	mtop = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
-	    VM_ALLOC_WAITOK);
+	/*
+	 * Top directory is 4 pages in hypervisor case.
+	 * Current address space layout makes 3 of them unused.
+	 */
+	if (stage == PM_STAGE1)
+		m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
+		    VM_ALLOC_WAITOK);
+	else
+		m = vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
+		    4, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT);
 
-	topphys = VM_PAGE_TO_PHYS(mtop);
+	topphys = VM_PAGE_TO_PHYS(m);
 	pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
 	pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);
+	pmap->pm_stage = stage;
 
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 
 	CPU_ZERO(&pmap->pm_active);
 
+	if (stage == PM_STAGE2)
+		goto finish;
+
 	if (pmap_mode == PMAP_MODE_SV39) {
 		/*
 		 * Copy L1 entries from the kernel pmap.  This must be done with
@@ -1371,12 +1385,20 @@ pmap_pinit(pmap_t pmap)
 		pmap->pm_top[i] = kernel_pmap->pm_top[i];
 	}
 
+finish:
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	vm_radix_init(&pmap->pm_root);
 
 	return (1);
 }
 
+int
+pmap_pinit(pmap_t pmap)
+{
+
+	return (pmap_pinit_stage(pmap, PM_STAGE1));
+}
+
 /*
  * This routine is called if the desired page table page does not exist.
  *
@@ -1609,6 +1631,8 @@ void
 pmap_release(pmap_t pmap)
 {
 	vm_page_t m;
+	int npages;
+	int i;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
@@ -1616,15 +1640,23 @@ pmap_release(pmap_t pmap)
 	KASSERT(CPU_EMPTY(&pmap->pm_active),
 	    ("releasing active pmap %p", pmap));
 
+	if (pmap->pm_stage == PM_STAGE2)
+		goto finish;
+
 	if (pmap_mode == PMAP_MODE_SV39) {
 		mtx_lock(&allpmaps_lock);
 		LIST_REMOVE(pmap, pm_list);
 		mtx_unlock(&allpmaps_lock);
 	}
 
+finish:
+	npages = pmap->pm_stage == PM_STAGE2 ? 4 : 1;
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
-	vm_page_unwire_noq(m);
-	vm_page_free(m);
+	for (i = 0; i < npages; i++) {
+		vm_page_unwire_noq(m);
+		vm_page_free(m);
+		m++;
+	}
 }
 
 static int