git: 1ae25866767d - main - kexec: Introduce basic arm64 support

From: Justin Hibbits <jhibbits@FreeBSD.org>
Date: Mon, 27 Oct 2025 14:34:38 UTC
The branch main has been updated by jhibbits:

URL: https://cgit.FreeBSD.org/src/commit/?id=1ae25866767d686067fe6678b62681b7a8f0d361

commit 1ae25866767d686067fe6678b62681b7a8f0d361
Author:     Justin Hibbits <jhibbits@FreeBSD.org>
AuthorDate: 2025-10-26 02:45:00 +0000
Commit:     Justin Hibbits <jhibbits@FreeBSD.org>
CommitDate: 2025-10-27 14:33:50 +0000

    kexec: Introduce basic arm64 support
    
    This works on older arm64 platforms, but may not work on devices using
    the GICv3 interrupt controller, due to a quirk whereby some GICv3
    registers are write-once.
    
    Most of the kexec reboot work on arm64 can be done entirely in C by
    disabling the MMU; the kernel itself is carved out of the vm_phys_segs
    array, so it cannot be overwritten.
    
    Reviewed by:    andrew
    Sponsored by:   Juniper Networks, Inc.
    Differential Revision:  https://reviews.freebsd.org/D51621
---
 sys/arm64/arm64/kexec_support.c | 188 ++++++++++++++++++++++++++++++++++++++++
 sys/arm64/arm64/locore.S        |  44 ++++++++++
 sys/arm64/arm64/mp_machdep.c    |  78 +++++++++++++++++
 sys/arm64/include/cpufunc.h     |   7 ++
 sys/arm64/include/kexec.h       |  33 +++++++
 sys/arm64/include/pcpu.h        |   3 +-
 sys/arm64/include/smp.h         |   1 +
 sys/conf/files.arm64            |   1 +
 sys/dev/psci/psci.c             |  13 +++
 sys/dev/psci/psci.h             |   1 +
 10 files changed, 368 insertions(+), 1 deletion(-)
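
The segment copy described in the log runs with the MMU disabled, so it can
only dereference raw physical addresses; no virtually-addressed kernel helper
can be called at that point.  A minimal userland sketch of the same
word-at-a-time loop, with ordinary buffers standing in for physical pages
(copy_page() and md_register_t are illustrative names, not kernel API):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define	PAGE_SIZE	4096
typedef uintptr_t md_register_t;	/* stand-in for the kernel's register_t */

/*
 * Word-at-a-time page copy, mirroring the loop in kexec_reboot_bottom():
 * with the MMU off, "to" and "from" are raw physical addresses, so the
 * copy has to be done by hand.  Plain buffers stand in here.
 */
static void
copy_page(md_register_t to, md_register_t from)
{
	size_t n;

	for (n = PAGE_SIZE / sizeof(md_register_t); n > 0; --n) {
		*(md_register_t *)to = *(md_register_t *)from;
		to += sizeof(md_register_t);
		from += sizeof(md_register_t);
	}
}

int
main(void)
{
	static uint64_t src[PAGE_SIZE / sizeof(uint64_t)];
	static uint64_t dst[PAGE_SIZE / sizeof(uint64_t)];

	src[0] = 0xdeadbeef;
	src[(PAGE_SIZE / sizeof(uint64_t)) - 1] = 0xcafe;
	copy_page((md_register_t)dst, (md_register_t)src);
	assert(memcmp(src, dst, PAGE_SIZE) == 0);
	return (0);
}

Compile and run it; the assert confirms the page is reproduced exactly.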

diff --git a/sys/arm64/arm64/kexec_support.c b/sys/arm64/arm64/kexec_support.c
new file mode 100644
index 000000000000..8b9719c05b67
--- /dev/null
+++ b/sys/arm64/arm64/kexec_support.c
@@ -0,0 +1,188 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/kexec.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+
+#include <machine/armreg.h>
+#include <machine/pmap.h>
+#include <machine/pte.h>
+
+/*
+ * Idea behind this:
+ *
+ * kexec_load_md():
+ * - Update the boot page tables (identity map) to include all pages needed
+ *   before the MMU is disabled.
+ *
+ * kexec_reboot_md():
+ * - Disable the MMU and copy the segment pages to their targets
+ * - Jump to the new kernel's entry point
+ * - Does not return
+ */
+
+extern pt_entry_t pagetable_l0_ttbr0_bootstrap[];
+extern unsigned long initstack_end[];
+void switch_stack(void *, void (*)(void *, void *, struct kexec_image *), void *);
+
+#define	SCTLR_EL1_NO_MMU	(SCTLR_RES1 | SCTLR_LSMAOE | SCTLR_nTLSMD | \
+		SCTLR_EIS | SCTLR_TSCXT | SCTLR_EOS)
+#define	vm_page_offset(m)	((vm_offset_t)(m) - vm_page_base)
+static inline vm_page_t
+phys_vm_page(vm_page_t m, vm_offset_t vm_page_v, vm_paddr_t vm_page_p)
+{
+	return ((vm_page_t)((vm_offset_t)m - vm_page_v + vm_page_p));
+}
+
+/* First 2 args are filler for switch_stack() */
+static void __aligned(16) __dead2
+kexec_reboot_bottom(void *arg1 __unused, void *arg2 __unused,
+    struct kexec_image *image)
+{
+	void (*e)(void) = (void *)image->entry;
+	vm_offset_t	vm_page_base = (vm_offset_t)vm_page_array;
+	vm_paddr_t	vm_page_phys = pmap_kextract((vm_offset_t)vm_page_array);
+	struct kexec_segment_stage *phys_segs =
+	    (void *)pmap_kextract((vm_offset_t)&image->segments);
+	vm_paddr_t from_pa, to_pa;
+	vm_size_t size;
+	vm_page_t	first, m, mp;
+	struct pctrie_iter pct_i;
+
+	/*
+	 * Create a linked list of all pages in the object before we disable the
+	 * MMU.  Once the MMU is disabled we can't use the vm_radix iterators,
+	 * as they rely on virtual address pointers.
+	 */
+	first = NULL;
+	vm_radix_iter_init(&pct_i, &image->map_obj->rtree);
+	VM_RADIX_FORALL(m, &pct_i) {
+		if (first == NULL)
+			first = m;
+		else
+			SLIST_INSERT_AFTER(mp, m, plinks.s.ss);
+		mp = m;
+	}
+
+	/*
+	 * We're running out of the identity map now; disable the MMU before
+	 * continuing.  The copies below may overwrite the live page tables,
+	 * which would be fatal if the MMU were still enabled.
+	 */
+	WRITE_SPECIALREG(sctlr_el1, SCTLR_EL1_NO_MMU);
+	isb();
+	for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+		if (phys_segs[i].size == 0)
+			break;
+		to_pa = phys_segs[i].target;
+		/* Copy the segment here... */
+		for (vm_page_t p = phys_segs[i].first_page;
+		    p != NULL && to_pa - phys_segs[i].target < phys_segs[i].size;
+		    p = SLIST_NEXT(p, plinks.s.ss)) {
+			p = phys_vm_page(p, vm_page_base, vm_page_phys);
+			from_pa = p->phys_addr;
+			if (p->phys_addr == to_pa) {
+				to_pa += PAGE_SIZE;
+				continue;
+			}
+			for (size = PAGE_SIZE / sizeof(register_t);
+			    size > 0; --size) {
+				*(register_t *)to_pa = *(register_t *)from_pa;
+				to_pa += sizeof(register_t);
+				from_pa += sizeof(register_t);
+			}
+		}
+	}
+	invalidate_icache();
+	e();
+	while (1)
+		;
+}
+
+void
+kexec_reboot_md(struct kexec_image *image)
+{
+	uintptr_t ptr;
+	register_t reg;
+
+	for (int i = 0; i < KEXEC_SEGMENT_MAX; i++) {
+		if (image->segments[i].size > 0)
+			cpu_dcache_inv_range((void *)PHYS_TO_DMAP(image->segments[i].target),
+			    image->segments[i].size);
+	}
+	ptr = pmap_kextract((vm_offset_t)kexec_reboot_bottom);
+	serror_disable();
+
+	reg = pmap_kextract((vm_offset_t)pagetable_l0_ttbr0_bootstrap);
+	set_ttbr0(reg);
+	cpu_tlb_flushID();
+
+	typeof(kexec_reboot_bottom) *p = (void *)ptr;
+	switch_stack((void *)pmap_kextract((vm_offset_t)initstack_end),
+	    p, image);
+	while (1)
+		;
+}
+
+int
+kexec_load_md(struct kexec_image *image)
+{
+	vm_paddr_t tmp;
+	pt_entry_t *pte;
+
+	/* Create L2 page blocks for the trampoline. L0/L1 are from the startup. */
+
+	/*
+	 * There are exactly 2 pages before pagetable_l0_ttbr0_bootstrap, so
+	 * move back to the start of the L2 pages.
+	 */
+	pte = pagetable_l0_ttbr0_bootstrap;
+	pte -= (Ln_ENTRIES * 2);	/* move to start of L2 pages */
+
+	/*
+	 * Populate the identity map with symbols we know we'll need before we
+	 * turn off the MMU.
+	 */
+	tmp = pmap_kextract((vm_offset_t)kexec_reboot_bottom);
+	pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+	tmp = pmap_kextract((vm_offset_t)initstack_end);
+	pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+	/* We'll need vm_page_array for doing offset calculations. */
+	tmp = pmap_kextract((vm_offset_t)&vm_page_array);
+	pte[pmap_l2_index(tmp)] = (tmp | L2_BLOCK | ATTR_AF | ATTR_S1_UXN);
+
+	return (0);
+}
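
kexec_load_md() above installs one identity-mapped 2 MB L2 block per symbol
needed after the MMU goes off.  A standalone sketch of the index arithmetic
behind pmap_l2_index(), assuming the default 4 KB translation granule (the
constants and l2_index() are illustrative re-definitions, not the kernel's
pte.h):

#include <stdint.h>
#include <stdio.h>

/*
 * Assumed layout for a 4 KB granule: an L2 entry maps a 2 MB block and
 * each table holds 512 entries.
 */
#define	L2_SHIFT	21
#define	L2_SIZE		(1ULL << L2_SHIFT)
#define	Ln_ENTRIES	512

static unsigned
l2_index(uint64_t pa)
{
	return ((pa >> L2_SHIFT) & (Ln_ENTRIES - 1));
}

int
main(void)
{
	uint64_t pa = 0x40205000;	/* hypothetical kernel physical address */

	/* kexec_load_md() fills one such slot per symbol it needs. */
	printf("L2 slot %u maps 0x%jx-0x%jx\n", l2_index(pa),
	    (uintmax_t)(pa & ~(L2_SIZE - 1)),
	    (uintmax_t)(pa | (L2_SIZE - 1)));
	return (0);
}
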
diff --git a/sys/arm64/arm64/locore.S b/sys/arm64/arm64/locore.S
index d35e334905a7..3ec12140f139 100644
--- a/sys/arm64/arm64/locore.S
+++ b/sys/arm64/arm64/locore.S
@@ -325,6 +325,19 @@ mp_virtdone:
 
 	b	init_secondary
 LEND(mpentry_common)
+
+ENTRY(mp_cpu_spinloop)
+0:
+	wfe
+	ldr	x0, mp_cpu_spin_table_release_addr
+	cbz	x0, 0b
+	blr	x0
+	.globl mp_cpu_spin_table_release_addr
+mp_cpu_spin_table_release_addr:
+	.quad	0
+	.globl mp_cpu_spinloop_end
+mp_cpu_spinloop_end:
+END(mp_cpu_spinloop)
 #endif
 
 /*
@@ -475,6 +488,29 @@ LENTRY(enter_kernel_el)
 	eret
 LEND(enter_kernel_el)
 
+/* Turn off the MMU.  Install ttbr0 from the bootstrap page table, and go there.
+ * Does not return.
+ * - x0 - target address to jump to after stopping the MMU.
+ * - x1 - kernel load address
+ */
+ENTRY(stop_mmu)
+	mov	x16, x0	/* Save target. */
+	ldr	x2, =(1f - KERNBASE)
+	add	x17, x1, x2
+	ldr	x3, =(pagetable_l0_ttbr0_bootstrap - KERNBASE)
+	add	x1, x1, x3
+	msr	ttbr0_el1, x1
+	isb
+	br	x17
+1:
+	BTI_J
+	mrs	x0, sctlr_el1
+	bic	x0, x0, SCTLR_M
+	bic	x0, x0, SCTLR_C
+	msr	sctlr_el1, x0
+	isb
+	br	x16
+END(stop_mmu)
 /*
  * Get the physical address the kernel was loaded at.
  */
@@ -1094,12 +1130,19 @@ tcr:
 	    TCR_SH0_IS | TCR_ORGN0_WBWA | TCR_IRGN0_WBWA)
 LEND(start_mmu)
 
+ENTRY(switch_stack)
+	mov	sp, x0
+	mov	x16, x1
+	br	x16
+END(switch_stack)
+
 ENTRY(abort)
 	b abort
 END(abort)
 
 .bss
 	.align	PAGE_SHIFT
+	.globl	initstack_end
 initstack:
 	.space	BOOT_STACK_SIZE
 initstack_end:
@@ -1116,6 +1159,7 @@ initstack_end:
 	 *           L0 for user
 	 */
 	.globl pagetable_l0_ttbr1
+	.globl pagetable_l0_ttbr0_bootstrap
 pagetable:
 pagetable_l3_ttbr1:
 	.space	(PAGE_SIZE * L3_PAGE_COUNT)
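
stop_mmu() above finds its continuation point by rebasing the link-time
address of its local label against KERNBASE and the physical load address,
since only the identity map is usable from that point on.  The same
arithmetic as a standalone sketch (all values hypothetical, including the
KERNBASE stand-in):

#include <stdint.h>
#include <stdio.h>

#define	KERNBASE	0xffff000000000000ULL	/* illustrative link-time base */

/*
 * stop_mmu computes the physical alias of its label "1:" as
 * (label - KERNBASE) + load_pa, then branches there once ttbr0 points
 * at the bootstrap identity map.
 */
static uint64_t
link_to_phys(uint64_t link_va, uint64_t load_pa)
{
	return (link_va - KERNBASE + load_pa);
}

int
main(void)
{
	uint64_t label = KERNBASE + 0x123456;	/* hypothetical symbol VA */
	uint64_t load_pa = 0x40000000;		/* hypothetical load address */

	printf("continue at 0x%jx with the MMU off\n",
	    (uintmax_t)link_to_phys(label, load_pa));
	return (0);
}
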
diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c
index e4d011df3a06..0bdd2ecfd8a7 100644
--- a/sys/arm64/arm64/mp_machdep.c
+++ b/sys/arm64/arm64/mp_machdep.c
@@ -60,6 +60,7 @@
 #include <machine/debug_monitor.h>
 #include <machine/intr.h>
 #include <machine/smp.h>
+#include <machine/vmparam.h>
 #ifdef VFP
 #include <machine/vfp.h>
 #endif
@@ -103,6 +104,7 @@ static void ipi_hardclock(void *);
 static void ipi_preempt(void *);
 static void ipi_rendezvous(void *);
 static void ipi_stop(void *);
+static void ipi_off(void *);
 
 #ifdef FDT
 static u_int fdt_cpuid;
@@ -193,6 +195,7 @@ release_aps(void *dummy __unused)
 	intr_ipi_setup(IPI_STOP, "stop", ipi_stop, NULL);
 	intr_ipi_setup(IPI_STOP_HARD, "stop hard", ipi_stop, NULL);
 	intr_ipi_setup(IPI_HARDCLOCK, "hardclock", ipi_hardclock, NULL);
+	intr_ipi_setup(IPI_OFF, "off", ipi_off, NULL);
 
 	atomic_store_int(&aps_started, 0);
 	atomic_store_rel_int(&aps_ready, 1);
@@ -390,6 +393,34 @@ ipi_stop(void *dummy __unused)
 	CTR0(KTR_SMP, "IPI_STOP (restart)");
 }
 
+void stop_mmu(vm_paddr_t, vm_paddr_t) __dead2;
+extern uint32_t mp_cpu_spinloop[];
+extern uint32_t mp_cpu_spinloop_end[];
+extern uint64_t mp_cpu_spin_table_release_addr;
+static void
+ipi_off(void *dummy __unused)
+{
+	CTR0(KTR_SMP, "IPI_OFF");
+	if (psci_present)
+		psci_cpu_off();
+	else {
+		uint64_t release_addr;
+		vm_size_t size;
+
+		size = (vm_offset_t)&mp_cpu_spin_table_release_addr -
+		    (vm_offset_t)mp_cpu_spinloop;
+		release_addr = PCPU_GET(release_addr) - size;
+		isb();
+		invalidate_icache();
+		/* Go catatonic, don't take any interrupts. */
+		intr_disable();
+		stop_mmu(release_addr, pmap_kextract(KERNBASE));
+
+
+	}
+	CTR0(KTR_SMP, "IPI_OFF failed");
+}
+
 struct cpu_group *
 cpu_topo(void)
 {
@@ -511,6 +542,7 @@ start_cpu(u_int cpuid, uint64_t target_cpu, int domain, vm_paddr_t release_addr)
 	pcpu_init(pcpup, cpuid, sizeof(struct pcpu));
 	pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK;
 	bootpcpu = pcpup;
+	pcpup->pc_release_addr = release_addr;
 
 	dpcpu[cpuid - 1] = (void *)(pcpup + 1);
 	dpcpu_init(dpcpu[cpuid - 1], cpuid);
@@ -752,6 +784,52 @@ cpu_mp_start(void)
 	}
 }
 
+void
+cpu_mp_stop(void)
+{
+
+	/* Short-circuit for single-CPU */
+	if (CPU_COUNT(&all_cpus) == 1)
+		return;
+
+	KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("Not on the first CPU!"));
+
+	/*
+	 * If we use spin-table, assume U-boot method for now (single address
+	 * shared by all CPUs).
+	 */
+	if (!psci_present) {
+		int cpu;
+		vm_paddr_t release_addr;
+		void *release_vaddr;
+		vm_size_t size;
+
+		/* Find the shared release address. */
+		CPU_FOREACH(cpu) {
+			release_addr = pcpu_find(cpu)->pc_release_addr;
+			if (release_addr != 0)
+				break;
+		}
+		/* No release address? No way of notifying other CPUs. */
+		if (release_addr == 0)
+			return;
+
+		size = (vm_offset_t)&mp_cpu_spinloop_end -
+		    (vm_offset_t)&mp_cpu_spinloop;
+
+		release_addr -= (vm_offset_t)&mp_cpu_spin_table_release_addr -
+		    (vm_offset_t)mp_cpu_spinloop;
+
+		release_vaddr = pmap_mapdev(release_addr, size);
+		bcopy(mp_cpu_spinloop, release_vaddr, size);
+		cpu_dcache_wbinv_range(release_vaddr, size);
+		pmap_unmapdev(release_vaddr, size);
+		invalidate_icache();
+	}
+	ipi_all_but_self(IPI_OFF);
+	DELAY(1000000);
+}
+
 /* Introduce rest of cores to the world */
 void
 cpu_mp_announce(void)
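
For the spin-table case, cpu_mp_stop() copies the mp_cpu_spinloop blob so
that the embedded release-address quad lands exactly on the firmware's
release address, and ipi_off() sends each AP to the copied blob's start.
A standalone sketch of that arithmetic (sizes and addresses hypothetical):

#include <stdint.h>
#include <stdio.h>

/*
 * The spinloop blob carries its release-address quad inside it, so the
 * copy destination is the firmware release address minus the quad's
 * offset within the blob.  24/16 are plausible stand-ins for
 * (mp_cpu_spinloop_end - mp_cpu_spinloop) and the quad's offset.
 */
static const uint64_t spinloop_size = 24;
static const uint64_t quad_offset = 16;

int
main(void)
{
	uint64_t fw_release = 0x80001000;	/* hypothetical spin-table address */
	uint64_t copy_dest = fw_release - quad_offset;

	printf("copy %ju bytes to 0x%jx; APs park at 0x%jx, watching 0x%jx\n",
	    (uintmax_t)spinloop_size, (uintmax_t)copy_dest,
	    (uintmax_t)copy_dest, (uintmax_t)fw_release);
	return (0);
}

Note that the bcopy() also re-zeroes the quad, so the parked APs spin in WFE
until the next kernel stores a fresh entry point at the release address.
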
diff --git a/sys/arm64/include/cpufunc.h b/sys/arm64/include/cpufunc.h
index e6e1f682794e..e9eee643216b 100644
--- a/sys/arm64/include/cpufunc.h
+++ b/sys/arm64/include/cpufunc.h
@@ -96,6 +96,13 @@ serror_enable(void)
 	__asm __volatile("msr daifclr, #(" __XSTRING(DAIF_A) ")");
 }
 
+static __inline void
+serror_disable(void)
+{
+
+	__asm __volatile("msr daifset, #(" __XSTRING(DAIF_A) ")");
+}
+
 static __inline register_t
 get_midr(void)
 {
diff --git a/sys/arm64/include/kexec.h b/sys/arm64/include/kexec.h
new file mode 100644
index 000000000000..0a8c7a053331
--- /dev/null
+++ b/sys/arm64/include/kexec.h
@@ -0,0 +1,33 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Juniper Networks, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef	_ARM64_KEXEC_H_
+#define	_ARM64_KEXEC_H_
+
+#define KEXEC_MD_PAGES(x) 0
+
+#endif	/* _ARM64_KEXEC_H_ */
diff --git a/sys/arm64/include/pcpu.h b/sys/arm64/include/pcpu.h
index 09bd8fa8a966..73399d2c3f8c 100644
--- a/sys/arm64/include/pcpu.h
+++ b/sys/arm64/include/pcpu.h
@@ -50,7 +50,8 @@ struct debug_monitor_state;
 	struct pmap *pc_curvmpmap;					\
 	uint64_t pc_mpidr;						\
 	u_int	pc_bcast_tlbi_workaround;				\
-	char __pad[197]
+	uint64_t pc_release_addr;					\
+	char __pad[189]
 
 #ifdef _KERNEL
 
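The pcpu.h hunk shrinks __pad by exactly sizeof(uint64_t) so that the new
pc_release_addr member comes out of the padding and the MD pcpu area keeps
its size.  A compile-time sketch of that invariant (the two structs are
illustrative, not the real struct pcpu layout):

#include <stdint.h>

struct md_fields_before {
	uint64_t pc_mpidr;
	unsigned int pc_bcast_tlbi_workaround;
	char __pad[197];
};

struct md_fields_after {
	uint64_t pc_mpidr;
	unsigned int pc_bcast_tlbi_workaround;
	uint64_t pc_release_addr;
	char __pad[189];
};

/* 197 == 189 + sizeof(uint64_t): the MD area must not change size. */
_Static_assert(sizeof(struct md_fields_before) ==
    sizeof(struct md_fields_after), "MD pcpu size must not change");

int
main(void)
{
	return (0);
}
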
diff --git a/sys/arm64/include/smp.h b/sys/arm64/include/smp.h
index 500cd1ef4f02..4a5bfda3ac1c 100644
--- a/sys/arm64/include/smp.h
+++ b/sys/arm64/include/smp.h
@@ -40,6 +40,7 @@ enum {
 	IPI_STOP,
 	IPI_STOP_HARD,
 	IPI_HARDCLOCK,
+	IPI_OFF,
 	INTR_IPI_COUNT,
 };
 
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 2f412fa3cb1b..882aca705336 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -55,6 +55,7 @@ arm64/arm64/gic_v3_acpi.c			optional acpi
 arm64/arm64/gic_v3_fdt.c			optional fdt
 arm64/arm64/hyp_stub.S				standard
 arm64/arm64/identcpu.c				standard
+arm64/arm64/kexec_support.c			standard
 arm64/arm64/locore.S				standard no-obj
 arm64/arm64/machdep.c				standard
 arm64/arm64/machdep_boot.c			standard
diff --git a/sys/dev/psci/psci.c b/sys/dev/psci/psci.c
index 497b23d2d4c3..2b250401ae83 100644
--- a/sys/dev/psci/psci.c
+++ b/sys/dev/psci/psci.c
@@ -474,6 +474,19 @@ psci_cpu_on(unsigned long cpu, unsigned long entry, unsigned long context_id)
 	return (psci_call(fnid, cpu, entry, context_id));
 }
 
+int
+psci_cpu_off(void)
+{
+	uint32_t fnid;
+
+	fnid = PSCI_FNID_CPU_OFF;
+	if (psci_softc != NULL)
+		fnid = psci_softc->psci_fnids[PSCI_FN_CPU_OFF];
+
+	/* Returns PSCI_RETVAL_DENIED on error. */
+	return (psci_call(fnid, 0, 0, 0));
+}
+
 static void
 psci_shutdown(void *xsc, int howto)
 {
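
psci_cpu_off() powers down the calling CPU through firmware, so on success it
never returns; reaching any code after the call means the request was denied,
which is why ipi_off() can log a failure afterwards.  A standalone sketch of
that caller contract (mock_psci_cpu_off() is a stand-in, not the real PSCI
call):

#include <stdio.h>

#define	PSCI_RETVAL_DENIED	(-3)	/* DENIED per the PSCI spec */

/* A real CPU_OFF would not return; the mock models the failure path. */
static int
mock_psci_cpu_off(void)
{
	return (PSCI_RETVAL_DENIED);
}

int
main(void)
{
	int error;

	error = mock_psci_cpu_off();
	/* Reaching this point at all means the power-down was refused. */
	printf("CPU_OFF denied: %d\n", error);
	return (1);
}
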
diff --git a/sys/dev/psci/psci.h b/sys/dev/psci/psci.h
index 451d40c0178d..6704eaf26c71 100644
--- a/sys/dev/psci/psci.h
+++ b/sys/dev/psci/psci.h
@@ -39,6 +39,7 @@ typedef int (*psci_callfn_t)(register_t, register_t, register_t, register_t,
 extern bool psci_present;
 
 int	psci_cpu_on(unsigned long, unsigned long, unsigned long);
+int	psci_cpu_off(void);	/* Operates on caller. */
 void	psci_reset(void);
 int32_t	psci_features(uint32_t);
 int	psci_get_version(void);