git: de09dcebd720 - main - riscv: rework page table bootstrap

From: Mitchell Horne <mhorne@FreeBSD.org>
Date: Thu, 20 Jun 2024 19:00:21 UTC
The branch main has been updated by mhorne:

URL: https://cgit.FreeBSD.org/src/commit/?id=de09dcebd720d5776df4cc4e67ffc7da757e4305

commit de09dcebd720d5776df4cc4e67ffc7da757e4305
Author:     Mitchell Horne <mhorne@FreeBSD.org>
AuthorDate: 2024-06-20 18:30:17 +0000
Commit:     Mitchell Horne <mhorne@FreeBSD.org>
CommitDate: 2024-06-20 18:33:19 +0000

    riscv: rework page table bootstrap
    
    The overall goal of the change is to reduce the amount of work done in
    locore assembly, and defer as much as possible until pmap_bootstrap().
    Currently, half the setup is done in assembly, and then we pass the l1pt
    address to pmap_bootstrap() where it is amended with other mappings.
    
    Inspiration and understanding have been taken from amd64's
    create_pagetables() routine, and I try to present the page table
    construction in the same way: a linear procedure with commentary
    explaining what we are doing and why. Thus the core of the new
    implementation is contained in pmap_create_pagetables().
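
    A rough, illustrative model of that linear procedure follows (a
    userspace sketch, not kernel code; the howmany() arithmetic mirrors
    pmap_create_pagetables() below, while the page sizes and the 32MB
    example kernel length are assumptions made for illustration):

        #include <stdio.h>

        #define PAGE_SIZE       4096UL
        #define L2_SIZE         (1UL << 21)     /* 2 MiB superpage */
        #define L3_SIZE         PAGE_SIZE
        #define Ln_ENTRIES      512UL
        #define howmany(x, y)   (((x) + ((y) - 1)) / (y))

        int
        main(void)
        {
            unsigned long kernlen = 32UL << 20; /* kernel + preload, example */
            unsigned long devmap = 4 * L2_SIZE; /* PMAP_MAPDEV_EARLY_SIZE */
            unsigned long pages = 0;

            pages += 1;     /* L1 root (plus one L0 page under Sv48) */
            pages += howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES); /* kernel L2 */
            pages += 1;     /* devmap L2 */
            pages += howmany(howmany(devmap, L3_SIZE), Ln_ENTRIES);  /* devmap L3 */
            pages += 16;    /* bootstrap L3 pages, ~32MB of early KVA */

            printf("bootstrap page-table pages: %lu (%lu KiB)\n",
                pages, pages * PAGE_SIZE / 1024);
            return (0);
        }

    All of these pages are carved out of the free memory immediately
    following the kernel image by a simple bump allocator
    (pmap_early_alloc_tables() in the diff) and then linked together.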
    
    Once pmap_create_pagetables() has finished, we switch to the new
    page table root and leave behind the bootstrap tables created by
    locore, resulting in a minimal 8kB of wasted space.
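
    The switch itself amounts to composing a new satp value from the
    physical page number of the new root and the translation mode, then
    writing it to the satp CSR (see pmap_bootstrap() in the diff). A
    minimal sketch of the value being built, assuming the standard RISC-V
    satp encoding (PPN in the low bits, MODE in bits 63:60, 8 = Sv39,
    9 = Sv48) and a hypothetical root address:

        #include <stdio.h>

        #define PAGE_SHIFT      12
        #define SATP_MODE_SV39  (8UL << 60)

        int
        main(void)
        {
            unsigned long root_pt_phys = 0x80400000UL;  /* hypothetical */
            unsigned long satp = (root_pt_phys >> PAGE_SHIFT) | SATP_MODE_SV39;

            /*
             * The kernel stores this value in pm_satp, writes it with
             * csr_write(satp, ...), and issues sfence.vma.
             */
            printf("satp = %#lx\n", satp);
            return (0);
        }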
    
    Having the whole procedure in one place, in C code, allows it to be more
    easily understood, while also making it more amenable to future changes
    which depend on CPU feature/errata detection.
    
    Note that with this change the size of the early devmap is bumped up
    from one to four L2 pages (8MB).
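
    That works out to 4 * L2_SIZE = 4 * 2MB = 8MB. A trivial standalone
    check of the arithmetic (the constants are restated here as
    assumptions; the final count matches ndevmapl3 in the diff):

        #include <stdio.h>

        #define PAGE_SIZE               4096UL
        #define L2_SIZE                 (1UL << 21)     /* 2 MiB */
        #define Ln_ENTRIES              512UL
        #define PMAP_MAPDEV_EARLY_SIZE  (4 * L2_SIZE)
        #define howmany(x, y)           (((x) + ((y) - 1)) / (y))

        int
        main(void)
        {
            unsigned long l3e = howmany(PMAP_MAPDEV_EARLY_SIZE, PAGE_SIZE);

            printf("early devmap: %lu MiB, %lu L3 PTEs, %lu L3 table pages\n",
                PMAP_MAPDEV_EARLY_SIZE >> 20, l3e, howmany(l3e, Ln_ENTRIES));
            return (0);
        }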
    
    Reviewed by:    markj
    MFC after:      1 month
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D45327
---
 sys/riscv/include/pte.h     |   2 -
 sys/riscv/include/vmparam.h |   7 +-
 sys/riscv/riscv/genassym.c  |   2 +
 sys/riscv/riscv/locore.S    |  87 +++++------
 sys/riscv/riscv/pmap.c      | 355 ++++++++++++++++++++++++++++----------------
 5 files changed, 272 insertions(+), 181 deletions(-)
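
A small standalone illustration of the PTE/PDE construction helpers that this
change adds to sys/riscv/riscv/pmap.c (L1_PTE/L2_PTE/L3_PTE and the *_PDE
wrappers in the hunk below). The shift constants are the usual Sv39 values
from pte.h, restated here as assumptions so the sketch compiles on its own:

    #include <stdio.h>

    #define L1_SHIFT    30
    #define L2_SHIFT    21
    #define L3_SHIFT    12
    #define PTE_PPN0_S  10
    #define PTE_PPN1_S  19
    #define PTE_PPN2_S  28
    #define PTE_V       0x01UL

    /* Leaf PTEs: pa must be aligned to the level's mapping size. */
    #define L1_PTE(pa, bits)    ((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits))
    #define L2_PTE(pa, bits)    ((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits))
    #define L3_PTE(pa, bits)    ((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits))
    /* Directory entries may point at any 4K-aligned next-level table. */
    #define L2_PDE(pa, bits)    L3_PTE(pa, bits)

    int
    main(void)
    {
        unsigned long pa = 0x80200000UL;    /* hypothetical, 2 MiB aligned */

        printf("L2 leaf  PTE: %#lx\n", L2_PTE(pa, PTE_V));
        printf("L2 table PDE: %#lx\n", L2_PDE(pa, PTE_V));
        return (0);
    }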

diff --git a/sys/riscv/include/pte.h b/sys/riscv/include/pte.h
index da7bd051e122..031cae667f0c 100644
--- a/sys/riscv/include/pte.h
+++ b/sys/riscv/include/pte.h
@@ -93,5 +93,3 @@ typedef	uint64_t	pn_t;			/* page number */
 #define	PTE_SIZE	8
 
 #endif /* !_MACHINE_PTE_H_ */
-
-/* End of pte.h */
diff --git a/sys/riscv/include/vmparam.h b/sys/riscv/include/vmparam.h
index 7bfa587ce37c..c750791bb280 100644
--- a/sys/riscv/include/vmparam.h
+++ b/sys/riscv/include/vmparam.h
@@ -238,13 +238,16 @@
 extern vm_paddr_t dmap_phys_base;
 extern vm_paddr_t dmap_phys_max;
 extern vm_offset_t dmap_max_addr;
-extern vm_offset_t init_pt_va;
 #endif
 
 #define	ZERO_REGION_SIZE	(64 * 1024)	/* 64KB */
 
+/*
+ * The top of KVA is reserved for early device mappings.
+ */
 #define	DEVMAP_MAX_VADDR	VM_MAX_KERNEL_ADDRESS
-#define	PMAP_MAPDEV_EARLY_SIZE	L2_SIZE
+#define	DEVMAP_MIN_VADDR	(DEVMAP_MAX_VADDR - PMAP_MAPDEV_EARLY_SIZE)
+#define	PMAP_MAPDEV_EARLY_SIZE	(4 * L2_SIZE)
 
 /*
  * No non-transparent large page support in the pmap.
diff --git a/sys/riscv/riscv/genassym.c b/sys/riscv/riscv/genassym.c
index b1e1034fd479..105e17e679b7 100644
--- a/sys/riscv/riscv/genassym.c
+++ b/sys/riscv/riscv/genassym.c
@@ -60,6 +60,8 @@ ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS);
 ASSYM(VM_MAX_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS);
 ASSYM(PMAP_MAPDEV_EARLY_SIZE, PMAP_MAPDEV_EARLY_SIZE);
 
+ASSYM(PM_SATP, offsetof(struct pmap, pm_satp));
+
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_RA, offsetof(struct pcb, pcb_ra));
diff --git a/sys/riscv/riscv/locore.S b/sys/riscv/riscv/locore.S
index 17fdcc8ef55c..f7363fd025a7 100644
--- a/sys/riscv/riscv/locore.S
+++ b/sys/riscv/riscv/locore.S
@@ -1,6 +1,10 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
+ * Copyright (c) 2019-2021 Mitchell Horne <mhorne@FreeBSD.org>
+ * Copyright (c) 2022-2024 The FreeBSD Foundation
  *
  * Portions of this software were developed by SRI International and the
  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
@@ -10,6 +14,9 @@
  * Computer Laboratory as part of the CTSRD Project, with support from the
  * UK Higher Education Innovation Fund (HEIF).
  *
+ * Portions of this software were developed by Mitchell Horne
+ * <mhorne@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -36,7 +43,6 @@
 
 #include <machine/asm.h>
 #include <machine/param.h>
-#include <machine/trap.h>
 #include <machine/riscvreg.h>
 #include <machine/pte.h>
 
@@ -104,16 +110,24 @@ _start:
 	mv	a1, zero
 
 	/*
-	 * Set up page tables: map a 1GB region starting at KERNBASE using 2MB
-	 * superpages, starting from the first 2MB physical page into which the
-	 * kernel was loaded.  Also reserve an L2 page for the early device map
-	 * and map the DTB, if any, using the second-last entry of that L2
-	 * page.  This is hopefully enough to get us to pmap_bootstrap().
+	 * Set up page tables: Our goal is to enable virtual memory, doing the
+	 * minimum amount of work in assembly; just what is required to
+	 * bootstrap. We will construct the real page tables in C code, in
+	 * pmap_bootstrap().
+	 *
+	 * Here we map a 1GB region starting at KERNBASE using 2MB superpages,
+	 * starting from the first 2MB physical page into which the kernel was
+	 * loaded.
 	 *
-	 * Implementations are required to provide SV39 mode, so we use that
-	 * initially and will optionally enable SV48 mode during kernel pmap
-	 * initialization.
+	 * We also use an L1 entry to create a 1GB identity map (1:1 PA->VA).
+	 * This is useful for two reasons:
+	 *  - handling the DTB pointer passed from SBI firmware (physical addr)
+	 *  - simpler construction of pagetables in pmap_bootstrap()
 	 *
+	 * Implementations are required to provide Sv39 mode, so we use that
+	 * here and will conditionally enable Sv48 (or higher) later.
+	 *
+	 * We arrive here with:
 	 *  a0 - modulep or zero
 	 *  a1 - zero or dtbp
 	 */
@@ -122,7 +136,7 @@ pagetables:
 	jal	get_physmem
 
 	/* Construct 1GB Identity Map (1:1 PA->VA) */
-	lla	s1, pagetable_l1
+	lla	s1, bootstrap_pt_l1
 
 	srli	s2, s9, L1_SHIFT	/* kernstart >> L1_SHIFT */
 	andi	a5, s2, Ln_ADDR_MASK	/* & Ln_ADDR_MASK */
@@ -136,11 +150,11 @@ pagetables:
 	add	t0, s1, a5
 	sd	t6, (t0)		/* Store new PTE */
 
-	/* Construct the virtual address space */
+	/* Construct the virtual address space at KERNBASE */
 
 	/* Add L1 entry for kernel */
-	lla	s1, pagetable_l1
-	lla	s2, pagetable_l2	/* Link to next level PN */
+	lla	s1, bootstrap_pt_l1
+	lla	s2, bootstrap_pt_l2	/* Link to next level PN */
 	srli	s2, s2, PAGE_SHIFT
 
 	li	a5, KERNBASE
@@ -157,9 +171,9 @@ pagetables:
 	sd	t6, (t0)
 
 	/* Level 2 superpages (512 x 2MiB) */
-	lla	s1, pagetable_l2
+	lla	s1, bootstrap_pt_l2
 	srli	t4, s9, L2_SHIFT	/* Div physmem base by 2 MiB */
-	li	t2, 512			/* Build 512 entries */
+	li	t2, Ln_ENTRIES		/* Build 512 entries */
 	add	t3, t4, t2
 	li	t0, (PTE_KERN | PTE_X)
 1:
@@ -171,24 +185,6 @@ pagetables:
 	addi	t4, t4, 1
 	bltu	t4, t3, 1b
 
-	/* Create an L1 table entry for early devmap */
-	lla	s1, pagetable_l1
-	lla	s2, pagetable_l2_devmap	/* Link to next level PN */
-	srli	s2, s2, PAGE_SHIFT
-
-	li	a5, (VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE)
-	srli	a5, a5, L1_SHIFT	/* >> L1_SHIFT */
-	andi	a5, a5, Ln_ADDR_MASK	/* & Ln_ADDR_MASK */
-	li	t4, PTE_V
-	slli	t5, s2, PTE_PPN0_S	/* (s2 << PTE_PPN0_S) */
-	or	t6, t4, t5
-
-	/* Store the L1 table entry */
-	li	a6, PTE_SIZE
-	mulw	a5, a5, a6
-	add	t0, s1, a5
-	sd	t6, (t0)
-
 	/* Page tables END */
 
 	/*
@@ -203,7 +199,7 @@ pagetables:
 	csrw	stvec, t0
 
 	/* Set page tables base register */
-	lla	s2, pagetable_l1
+	lla	s2, bootstrap_pt_l1
 	srli	s2, s2, PAGE_SHIFT
 	li	t0, SATP_MODE_SV39
 	or	s2, s2, t0
@@ -244,8 +240,6 @@ va:
 	bltu	t0, t1, 1b
 
 	/* Fill riscv_bootparams */
-	la	t0, pagetable_l1
-	sd	t0, RISCV_BOOTPARAMS_KERN_L1PT(sp)
 	sd	s9, RISCV_BOOTPARAMS_KERN_PHYS(sp)
 
 	la	t0, initstack
@@ -278,12 +272,13 @@ initstack:
 	.space  (PAGE_SIZE * KSTACK_PAGES)
 initstack_end:
 
-	.align	12
-pagetable_l1:
-	.space	PAGE_SIZE
-pagetable_l2:
+/*
+ * Static space for the bootstrap page tables. Unused after pmap_bootstrap().
+ */
+	.balign	PAGE_SIZE
+bootstrap_pt_l1:
 	.space	PAGE_SIZE
-pagetable_l2_devmap:
+bootstrap_pt_l2:
 	.space	PAGE_SIZE
 
 	.align 3
@@ -292,10 +287,6 @@ virt_map:
 hart_lottery:
 	.space	4
 
-	.globl init_pt_va
-init_pt_va:
-	.quad pagetable_l2	/* XXX: Keep page tables VA */
-
 #ifndef SMP
 ENTRY(mpentry)
 1:
@@ -343,10 +334,8 @@ ENTRY(mpentry)
 	csrw	stvec, t0
 
 	/* Set page tables base register */
-	lla	s2, pagetable_l1
-	srli	s2, s2, PAGE_SHIFT
-	li	t0, SATP_MODE_SV39
-	or	s2, s2, t0
+	lla	t2, kernel_pmap_store
+	ld	s2, PM_SATP(t2)
 	sfence.vma
 	csrw	satp, s2
 
diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c
index ca051a9e4416..937bb22371e2 100644
--- a/sys/riscv/riscv/pmap.c
+++ b/sys/riscv/riscv/pmap.c
@@ -243,8 +243,7 @@ CTASSERT((DMAP_MIN_ADDRESS  & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
 CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
 
 /*
- * This code assumes that the early DEVMAP is L2_SIZE aligned and is fully
- * contained within a single L2 entry.
+ * This code assumes that the early DEVMAP is L2_SIZE aligned.
  */
 CTASSERT((PMAP_MAPDEV_EARLY_SIZE & L2_OFFSET) == 0);
 
@@ -324,6 +323,8 @@ static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
 
+static uint64_t pmap_satp_mode(void);
+
 #define	pmap_clear(pte)			pmap_store(pte, 0)
 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
 #define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
@@ -361,6 +362,28 @@ pagezero(void *p)
     ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
 
+/*
+ * Construct a page table entry of the specified level pointing to physical
+ * address pa, with PTE bits 'bits'.
+ *
+ * A leaf PTE of any level must point to an address matching its alignment,
+ * e.g. L2 pages must be 2MB aligned in memory.
+ */
+#define	L1_PTE(pa, bits)	((((pa) >> L1_SHIFT) << PTE_PPN2_S) | (bits))
+#define	L2_PTE(pa, bits)	((((pa) >> L2_SHIFT) << PTE_PPN1_S) | (bits))
+#define	L3_PTE(pa, bits)	((((pa) >> L3_SHIFT) << PTE_PPN0_S) | (bits))
+
+/*
+ * Construct a page directory entry (PDE), pointing to next level entry at pa,
+ * with PTE bits 'bits'.
+ *
+ * Unlike PTEs, page directory entries can point to any 4K-aligned physical
+ * address.
+ */
+#define	L0_PDE(pa, bits)	L3_PTE(pa, bits)
+#define	L1_PDE(pa, bits)	L3_PTE(pa, bits)
+#define	L2_PDE(pa, bits)	L3_PTE(pa, bits)
+
 static __inline pd_entry_t *
 pmap_l0(pmap_t pmap, vm_offset_t va)
 {
@@ -501,45 +524,20 @@ pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
 	mtx_unlock(&allpmaps_lock);
 }
 
+/*
+ * This should only be used during pmap bootstrap e.g. by
+ * pmap_create_pagetables().
+ */
 static pt_entry_t *
-pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
-    u_int *l2_slot)
+pmap_early_alloc_tables(vm_paddr_t *freemempos, int npages)
 {
-	pt_entry_t *l2;
-	pd_entry_t *l1 __diagused;
-
-	l1 = (pd_entry_t *)l1pt;
-	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
+	pt_entry_t *pt;
 
-	/* Check locore has used a table L1 map */
-	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
-		("Invalid bootstrap L1 table"));
+	pt = (pt_entry_t *)*freemempos;
+	*freemempos += npages * PAGE_SIZE;
+	bzero(pt, npages * PAGE_SIZE);
 
-	/* Find the address of the L2 table */
-	l2 = (pt_entry_t *)init_pt_va;
-	*l2_slot = pmap_l2_index(va);
-
-	return (l2);
-}
-
-static vm_paddr_t
-pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
-{
-	u_int l1_slot, l2_slot;
-	pt_entry_t *l2;
-	vm_paddr_t ret;
-
-	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
-
-	/* Check locore has used L2 superpages */
-	KASSERT((l2[l2_slot] & PTE_RX) != 0,
-		("Invalid bootstrap L2 table"));
-
-	/* L2 is superpages */
-	ret = L2PTE_TO_PHYS(l2[l2_slot]);
-	ret += (va & L2_OFFSET);
-
-	return (ret);
+	return (pt);
 }
 
 static void
@@ -575,38 +573,152 @@ pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
 	sfence_vma();
 }
 
-static vm_offset_t
-pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
+/*
+ *	Create a new set of pagetables to run the kernel with.
+ *
+ *	An initial, temporary setup was created in locore.S, which serves well
+ *	enough to get us this far. It mapped kernstart -> KERNBASE, using 2MB
+ *	superpages, and created a 1GB identity map, which allows this function
+ *	to dereference physical addresses.
+ *
+ *	The memory backing these page tables is allocated in the space
+ *	immediately following the kernel's preload area. Depending on the size
+ *	of this area, some, all, or none of these pages can be implicitly
+ *	mapped by the kernel's 2MB mappings. This memory will only ever be
+ *	accessed through the direct map, however.
+ */
+static vm_paddr_t
+pmap_create_pagetables(vm_paddr_t kernstart, vm_size_t kernlen,
+    vm_paddr_t min_pa, vm_paddr_t max_pa, vm_paddr_t *root_pt_phys)
 {
-	vm_offset_t l3pt;
-	pt_entry_t entry;
-	pd_entry_t *l2;
-	vm_paddr_t pa;
-	u_int l2_slot;
-	pn_t pn;
+	pt_entry_t *l0, *l1, *kern_l2, *kern_l3, *devmap_l3;
+	pd_entry_t *devmap_l2;
+	vm_paddr_t kernend, freemempos, pa;
+	int nkernl2, nkernl3, ndevmapl3;
+	int i, slot;
+	int mode;
 
-	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
+	kernend = kernstart + kernlen;
 
-	l2 = pmap_l2(kernel_pmap, va);
-	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
-	l2_slot = pmap_l2_index(va);
-	l3pt = l3_start;
+	/* Static allocations begin after the kernel staging area. */
+	freemempos = roundup2(kernend, PAGE_SIZE);
 
-	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
-		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
+	/* Detect Sv48 mode. */
+	mode = PMAP_MODE_SV39;
+	TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
 
-		pa = pmap_early_vtophys(l1pt, l3pt);
-		pn = (pa / PAGE_SIZE);
-		entry = (PTE_V);
-		entry |= (pn << PTE_PPN0_S);
-		pmap_store(&l2[l2_slot], entry);
-		l3pt += PAGE_SIZE;
+	if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) {
+		/*
+		 * Sv48 mode: allocate an L0 page table to be the root. The
+		 * layout of KVA is otherwise identical to Sv39.
+		 */
+		l0 = pmap_early_alloc_tables(&freemempos, 1);
+		*root_pt_phys = (vm_paddr_t)l0;
+		pmap_mode = PMAP_MODE_SV48;
+	} else {
+		l0 = NULL;
+	}
+
+	/*
+	 * Allocate an L1 page table.
+	 */
+	l1 = pmap_early_alloc_tables(&freemempos, 1);
+	if (pmap_mode == PMAP_MODE_SV39)
+		*root_pt_phys = (vm_paddr_t)l1;
+
+	/*
+	 * Allocate a set of L2 page tables for KVA. Most likely, only 1 is
+	 * needed.
+	 */
+	nkernl2 = howmany(howmany(kernlen, L2_SIZE), Ln_ENTRIES);
+	kern_l2 = pmap_early_alloc_tables(&freemempos, nkernl2);
+
+	/*
+	 * Allocate an L2 page table for the static devmap, located at the end
+	 * of KVA. We can expect that the devmap will always be less than 1GB
+	 * in size.
+	 */
+	devmap_l2 = pmap_early_alloc_tables(&freemempos, 1);
+
+	/* Allocate L3 page tables for the devmap. */
+	ndevmapl3 = howmany(howmany(PMAP_MAPDEV_EARLY_SIZE, L3_SIZE),
+	    Ln_ENTRIES);
+	devmap_l3 = pmap_early_alloc_tables(&freemempos, ndevmapl3);
+
+	/*
+	 * Allocate some L3 bootstrap pages, for early KVA allocations before
+	 * vm_mem_init() has run. For example, the message buffer.
+	 *
+	 * A somewhat arbitrary choice of 32MB. This should be more than enough
+	 * for any early allocations. There is no need to worry about waste, as
+	 * whatever is not used will be consumed by later calls to
+	 * pmap_growkernel().
+	 */
+	nkernl3 = 16;
+	kern_l3 = pmap_early_alloc_tables(&freemempos, nkernl3);
+
+	/* Allocations are done. */
+	if (freemempos < roundup2(kernend, L2_SIZE))
+		freemempos = roundup2(kernend, L2_SIZE);
+
+	/*
+	 * Map the kernel (and preloaded modules or data) using L2 superpages.
+	 *
+	 * kernstart is 2MB-aligned. This is enforced by loader(8) and required
+	 * by locore assembly.
+	 *
+	 * TODO: eventually, this should be done with proper permissions for
+	 * each segment, rather than mapping the entire kernel and preloaded
+	 * modules RWX.
+	 */
+	slot = pmap_l2_index(KERNBASE);
+	for (pa = kernstart; pa < kernend; pa += L2_SIZE, slot++) {
+		pmap_store(&kern_l2[slot], L2_PTE(pa, PTE_KERN | PTE_X));
+	}
+
+	/*
+	 * Connect the L3 bootstrap pages to the kernel L2 table. The L3 PTEs
+	 * themselves are invalid.
+	 */
+	slot = pmap_l2_index(freemempos - kernstart + KERNBASE);
+	for (i = 0; i < nkernl3; i++, slot++) {
+		pa = (vm_paddr_t)kern_l3 + ptoa(i);
+		pmap_store(&kern_l2[slot], L2_PDE(pa, PTE_V));
+	}
+
+	/* Connect the L2 tables to the L1 table. */
+	slot = pmap_l1_index(KERNBASE);
+	for (i = 0; i < nkernl2; i++, slot++) {
+		pa = (vm_paddr_t)kern_l2 + ptoa(i);
+		pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
+	}
+
+	/* Connect the L1 table to L0, if in use. */
+	if (pmap_mode == PMAP_MODE_SV48) {
+		slot = pmap_l0_index(KERNBASE);
+		pmap_store(&l0[slot], L0_PDE((vm_paddr_t)l1, PTE_V));
+	}
+
+	/*
+	 * Connect the devmap L3 pages to the L2 table. The devmap PTEs
+	 * themselves are invalid.
+	 */
+	slot = pmap_l2_index(DEVMAP_MIN_VADDR);
+	for (i = 0; i < ndevmapl3; i++, slot++) {
+		pa = (vm_paddr_t)devmap_l3 + ptoa(i);
+		pmap_store(&devmap_l2[slot], L2_PDE(pa, PTE_V));
 	}
 
-	/* Clean the L2 page table */
-	memset((void *)l3_start, 0, l3pt - l3_start);
+	/* Connect the devmap L2 pages to the L1 table. */
+	slot = pmap_l1_index(DEVMAP_MIN_VADDR);
+	pa = (vm_paddr_t)devmap_l2;
+	pmap_store(&l1[slot], L1_PDE(pa, PTE_V));
+
+	/* Bootstrap the direct map. */
+	pmap_bootstrap_dmap((vm_offset_t)l1, min_pa, max_pa);
 
-	return (l3pt);
+	/* Return the next position of free memory */
+	return (freemempos);
 }
 
 /*
@@ -616,19 +728,17 @@ void
 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
 {
 	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
-	uint64_t satp;
-	vm_offset_t dpcpu, freemempos, l0pv, msgbufpv;
-	vm_paddr_t l0pa, l1pa, max_pa, min_pa, pa;
-	pd_entry_t *l0p;
-	u_int l1_slot, l2_slot;
+	vm_paddr_t freemempos;
+	vm_paddr_t max_pa, min_pa, pa;
+	vm_paddr_t root_pt_phys;
+	vm_offset_t freeva;
+	vm_offset_t dpcpu, msgbufpv;
+	pt_entry_t *pte;
 	u_int physmap_idx;
-	int i, mode;
+	int i;
 
 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
 
-	/* Set this early so we can use the pagetable walking functions */
-	kernel_pmap_store.pm_top = (pd_entry_t *)l1pt;
-	kernel_pmap_store.pm_stage = PM_STAGE1;
 	PMAP_LOCK_INIT(kernel_pmap);
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	vm_radix_init(&kernel_pmap->pm_root);
@@ -664,74 +774,63 @@ pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
 	printf("min_pa %lx\n", min_pa);
 	printf("max_pa %lx\n", max_pa);
 
-	/* Create a direct map region early so we can use it for pa -> va */
-	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
-
-	/*
-	 * Read the page table to find out what is already mapped.
-	 * This assumes we have mapped a block of memory from KERNBASE
-	 * using a single L1 entry.
-	 */
-	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
-
-	/* Sanity check the index, KERNBASE should be the first VA */
-	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
-
-	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);
-
-	/* Create the l3 tables for the early devmap */
-	freemempos = pmap_bootstrap_l3(l1pt,
-	    VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE, freemempos);
+	/* Create a new set of pagetables to run the kernel in. */
+	freemempos = pmap_create_pagetables(kernstart, kernlen, min_pa, max_pa,
+	    &root_pt_phys);
 
+	/* Switch to the newly created page tables. */
+	kernel_pmap->pm_stage = PM_STAGE1;
+	kernel_pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(root_pt_phys);
+	kernel_pmap->pm_satp = atop(root_pt_phys) | pmap_satp_mode();
+	csr_write(satp, kernel_pmap->pm_satp);
 	sfence_vma();
 
-#define alloc_pages(var, np)						\
-	(var) = freemempos;						\
-	freemempos += (np * PAGE_SIZE);					\
-	memset((char *)(var), 0, ((np) * PAGE_SIZE));
-
-	mode = 0;
-	TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
-	if (mode == PMAP_MODE_SV48 && (mmu_caps & MMU_SV48) != 0) {
-		/*
-		 * Enable SV48 mode: allocate an L0 page and set SV48 mode in
-		 * SATP.  If the implementation does not provide SV48 mode,
-		 * the mode read back from the (WARL) SATP register will be
-		 * unchanged, and we continue in SV39 mode.
-		 */
-		alloc_pages(l0pv, 1);
-		l0p = (void *)l0pv;
-		l1pa = pmap_early_vtophys(l1pt, l1pt);
-		l0p[pmap_l0_index(KERNBASE)] = PTE_V |
-		    ((l1pa >> PAGE_SHIFT) << PTE_PPN0_S);
-
-		l0pa = pmap_early_vtophys(l1pt, l0pv);
-		csr_write(satp, (l0pa >> PAGE_SHIFT) | SATP_MODE_SV48);
-		satp = csr_read(satp);
-		if ((satp & SATP_MODE_M) == SATP_MODE_SV48) {
-			pmap_mode = PMAP_MODE_SV48;
-			kernel_pmap_store.pm_top = l0p;
-		} else {
-			/* Mode didn't change, give the page back. */
-			freemempos -= PAGE_SIZE;
-		}
-	}
-
-	/* Allocate dynamic per-cpu area. */
-	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
+	/*
+	 * Now, we need to make a few more static reservations from KVA.
+	 *
+	 * Set freeva to freemempos virtual address, and be sure to advance
+	 * them together.
+	 */
+	freeva = freemempos - kernstart + KERNBASE;
+#define reserve_space(var, pa, size)					\
+	do {								\
+		var = freeva;						\
+		pa = freemempos;					\
+		freeva += size;						\
+		freemempos += size;					\
+	} while (0)
+
+	/* Allocate the dynamic per-cpu area. */
+	reserve_space(dpcpu, pa, DPCPU_SIZE);
+
+	/* Map it. */
+	pte = pmap_l3(kernel_pmap, dpcpu);
+	KASSERT(pte != NULL, ("Bootstrap pages missing"));
+	for (i = 0; i < howmany(DPCPU_SIZE, PAGE_SIZE); i++)
+		pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN));
+
+	/* Now, it can be initialized. */
 	dpcpu_init((void *)dpcpu, 0);
 
 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
-	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
+	reserve_space(msgbufpv, pa, round_page(msgbufsize));
 	msgbufp = (void *)msgbufpv;
 
-	virtual_avail = roundup2(freemempos, L2_SIZE);
-	virtual_end = VM_MAX_KERNEL_ADDRESS - PMAP_MAPDEV_EARLY_SIZE;
-	kernel_vm_end = virtual_avail;
+	/* Map it. */
+	pte = pmap_l3(kernel_pmap, msgbufpv);
+	KASSERT(pte != NULL, ("Bootstrap pages missing"));
+	for (i = 0; i < howmany(msgbufsize, PAGE_SIZE); i++)
+		pmap_store(&pte[i], L3_PTE(pa + ptoa(i), PTE_KERN));
+
+#undef	reserve_space
 
-	pa = pmap_early_vtophys(l1pt, freemempos);
+	/* Mark the bounds of our available virtual address space */
+	virtual_avail = kernel_vm_end = freeva;
+	virtual_end = DEVMAP_MIN_VADDR;
 
-	physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC);
+	/* Exclude the reserved physical memory from allocations. */
+	physmem_exclude_region(kernstart, freemempos - kernstart,
+	    EXFLAG_NOALLOC);
 }
 
 /*