git: 2069a2a08f6e - main - kboot: Improve amd64 booting

From: Warner Losh <imp_at_FreeBSD.org>
Date: Fri, 03 Feb 2023 15:50:50 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=2069a2a08f6e555285be71be042e85c75b6feb02

commit 2069a2a08f6e555285be71be042e85c75b6feb02
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2023-02-03 15:40:13 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2023-02-03 15:41:40 +0000

    kboot: Improve amd64 booting
    
    Copy more of the necessary state for FreeBSD to boot:
    o Copy EFI memory tables
    o Create custom page tables needed for the kernel to find itself
    o Simplify the passing of args to the trampoline by putting them
      on the stack rather than in dedicated memory.
    
    This is only partially successful... we get only part way through the
    amd64 startup code before dying. However, it's much further than before
    the changes.
    
    Sponsored by:           Netflix
    Reviewed by:            tsoome, kevans
    Differential Revision:  https://reviews.freebsd.org/D38259
---
 stand/kboot/arch/amd64/amd64_tramp.S   | 123 ++++++++++------
 stand/kboot/arch/amd64/elf64_freebsd.c | 249 ++++++++++++++++++++++++---------
 2 files changed, 262 insertions(+), 110 deletions(-)

diff --git a/stand/kboot/arch/amd64/amd64_tramp.S b/stand/kboot/arch/amd64/amd64_tramp.S
index 877705407f92..b95e99cbaf0f 100644
--- a/stand/kboot/arch/amd64/amd64_tramp.S
+++ b/stand/kboot/arch/amd64/amd64_tramp.S
@@ -1,9 +1,6 @@
 /*-
- * Copyright (c) 2013 The FreeBSD Foundation
- * All rights reserved.
+ * Copyright (c) 2022 Netflix, Inc
  *
- * This software was developed by Benno Rice under sponsorship from
- * the FreeBSD Foundation.
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -24,53 +21,87 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * $FreeBSD$
  */
 
-#include <machine/asmacros.h>
-
-#define ASM_FILE
-#include "multiboot2.h"
+/*
+ * This is the trampoline that starts the FreeBSD kernel. Since the Linux kernel
+ * calls this routine with no args, and has a different environment than the
+ * boot loader provides and that the kernel expects, this code is responsible
+ * for setting all that up and calling the normal kernel entry point. It's
+ * analogous to the "purgatory" code in the linux kernel. Details about these
+ * operations are contained in comments below. On amd64, the kernel starts all
+ * the APs so we don't have to worry about them here.
+ */
 
+/*
+ * Keep in sync with elf64_freebsd.c. Kexec starts tramp w/o any parameters, so
+ * store them here. This is constructed to be a useful stack:
+ *
+ * struct trampoline_data {
+ *	uint64_t	pt4;			// Page table address to pop
+ *	uint64_t	entry;			// return address to jump to kernel
+ *	uint32_t	fill1;			// 0
+ *	uint32_t	modulep;		// 4 module metadata
+ *	uint32_t	kernend;		// 8 kernel end
+ *	uint32_t	fill2;			// 12
+ * };
+ *
+ * loader.kboot will construct a stack that btext expects, which is arguments on
+ * the stack, not in registers, and these args are 32-bit not 64
+ *
+ * Processor is already in long mode when we're called, paging is enabled and
+ * boot loader loads things such that:
+ * - kernel mapped at KERNBASE, aligned to 2MB, below 4GB, contiguous memory
+ * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
+ * - kernel is mapped with 2M superpages
+ * - The kernel, modules and metadata are in the first 4GB, which is unity mapped
+ * - There's additional memory after loader provided data for early allocations
+ *
+ * Unlike EFI, we don't support copying the staging area. We tell Linux to land
+ * the kernel in its final location with the needed alignment, etc. We copy the
+ * trampoline code to 1MB offset above KERNBASE since that memory is otherwise
+ * free and safely above the lower 1MB swamp we inherited from IBM PC, though
+ * this code makes no assumptions about where that might be.
+ *
+ * Thus, the trampoline just needs to set %rsp to that stack, pop the %cr3 value,
+ * set it and then retq to jump to the kernel with its stack args filled in.
+ * Since the handoff to this code used to be from 32-bit code, it uses the i386
+ * calling conventions which put the arguments on the stack. The kernel's btext
+ * routine expects this setup.
+ */
 	.text
-	.globl	amd64_tramp
-
+	.globl	tramp
+tramp:
+	cli				/* Make sure we don't get interrupted. */
+	leaq	tramp_pt4(%rip), %rsp	/* Setup our pre-filled-in stack */
+	popq	%rax			/* Pop off the PT4 ptr for %cr3 */
+	movq	%rax, %cr3		/* set the page table */
+	retq				/* Return addr and args already on stack */
 /*
- * void amd64_tramp(uint64_t stack, void *copy_finish, uint64_t kernend,
- *		    uint64_t modulep, uint64_t pagetable, uint64_t entry)
+ * The following is the stack for the above code. The stack will increase in
+ * address as things are popped off of it, so we start with the stack pointing
+ * to tramp_pt4.
  */
-amd64_tramp:
-	cli			/* Make sure we don't get interrupted. */
-	movq	%rdi,%rsp	/* Switch to our temporary stack. */
-
-	movq	%rdx,%r12	/* Stash the kernel values for later. */
-	movq	%rcx,%r13
-	movq	%r8,%r14
-	movq	%r9,%r15
-
-	callq	*%rsi		/* Call copy_finish so we're all ready to go. */
-
-	pushq	%r12		/* Push kernend. */
-	salq	$32,%r13	/* Shift modulep and push it. */
-	pushq	%r13
-	pushq	%r15		/* Push the entry address. */
-	movq	%r14,%cr3	/* Switch page tables. */
-	ret			/* "Return" to kernel entry. */
-
-	ALIGN_TEXT
-amd64_tramp_end:
-
-/* void multiboot2_exec(uint64_t entry, uint64_t multiboot_info, uint64_t stack) */
-	.globl	multiboot2_exec
-multiboot2_exec:
-	movq	%rdx,%rsp
-	pushq	%rdi
-	movq	%rsi,%rbx
-	movq	$MULTIBOOT2_BOOTLOADER_MAGIC,%rax
-	ret
+	.p2align	3		/* Stack has to be 8 byte aligned */
+trampoline_data:
+tramp_pt4:	.quad	0		/* New %cr3 value */
+tramp_entry:	.quad	0		/* Entry to kernel (btext) */
+	/* %rsp points here on entry to amd64 kernel's btext */
+		.long	0		/* 0 filler, ignored (current loaders set to 0) */
+tramp_modulep:	.long	0		/* 4 modulep */
+tramp_kernend:	.long	0		/* 8 kernend */
+		.long	0		/* 12 alignment filler (also 0) */
+tramp_end:
 
 	.data
-	.globl	amd64_tramp_size
-amd64_tramp_size:
-	.long	amd64_tramp_end-amd64_tramp
+	.type   tramp_size,@object
+	.globl	tramp_size
+tramp_size:
+	.long	tramp_end-tramp
+	.size	tramp_size, 4
+
+	.type   tramp_data_offset,@object
+	.globl	tramp_data_offset
+tramp_data_offset:
+	.long	trampoline_data-tramp
+	.size	tramp_data_offset, 4
diff --git a/stand/kboot/arch/amd64/elf64_freebsd.c b/stand/kboot/arch/amd64/elf64_freebsd.c
index a45a0db32e44..68588c0f2f02 100644
--- a/stand/kboot/arch/amd64/elf64_freebsd.c
+++ b/stand/kboot/arch/amd64/elf64_freebsd.c
@@ -41,9 +41,12 @@ __FBSDID("$FreeBSD$");
 #ifdef EFI
 #include <efi.h>
 #include <efilib.h>
+#else
+#include "host_syscall.h"
 #endif
 
 #include "bootstrap.h"
+#include "kboot.h"
 
 #include "platform/acfreebsd.h"
 #include "acconfig.h"
@@ -53,9 +56,7 @@ __FBSDID("$FreeBSD$");
 
 #ifdef EFI
 #include "loader_efi.h"
-#endif
 
-#ifdef EFI
 static EFI_GUID acpi_guid = ACPI_TABLE_GUID;
 static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID;
 #endif
@@ -63,9 +64,11 @@ static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID;
 #ifdef EFI
 #define LOADER_PAGE_SIZE EFI_PAGE_SIZE
 #else
-#define LOADER_PAGE_SIZE 8192
+#define LOADER_PAGE_SIZE PAGE_SIZE
 #endif
 
+extern vm_offset_t kboot_get_phys_load_segment(void);
+
 extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp,
     bool exit_bs);
 
@@ -81,13 +84,13 @@ static struct file_format amd64_elf_obj = {
 	.l_exec = elf64_obj_exec,
 };
 
-#if 0
+#ifdef EFI
 extern struct file_format multiboot2;
 extern struct file_format multiboot2_obj;
 #endif
 
 struct file_format *file_formats[] = {
-#if 0
+#ifdef EFI
 	&multiboot2,
 	&multiboot2_obj,
 #endif
@@ -96,21 +99,44 @@ struct file_format *file_formats[] = {
 	NULL
 };
 
-#ifdef EFI
+#ifndef	EFI
+/*
+ * We create the stack that we want. We have the address of the page tables
+ * we make on top (so we pop that off and set %cr3). We have the entry point
+ * to the kernel (which retq pops off) This leaves the stack that the btext
+ * wants: offset 4 is modulep and offset 8 is kernend, with the filler bytes
+ * to keep this aligned. This makes the trampoline very simple.
+ */
+struct trampoline_data {
+	uint64_t	pt4;			// Page table address to pop
+	uint64_t	entry;			// return address to jump to kernel
+	uint32_t	fill1;			// 0
+	uint32_t	modulep;		// 4 module metadata
+	uint32_t	kernend;		// 8 kernel end
+	uint32_t	fill2;			// 12
+};
+_Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data");
+#endif
+
 static pml4_entry_t *PT4;
-static pdp_entry_t *PT3;
 static pdp_entry_t *PT3_l, *PT3_u;
-static pd_entry_t *PT2;
 static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;
 
+#ifdef EFI
+static pdp_entry_t *PT3;
+static pd_entry_t *PT2;
+
 extern EFI_PHYSICAL_ADDRESS staging;
 
 static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
     uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
 #endif
 
-extern uintptr_t amd64_tramp;
-extern uint32_t amd64_tramp_size;
+extern uintptr_t tramp;
+extern uint32_t tramp_size;
+#ifndef EFI
+extern uint32_t tramp_data_offset;
+#endif
 
 /*
  * There is an ELF kernel and one or more ELF modules loaded.
@@ -120,15 +146,27 @@ extern uint32_t amd64_tramp_size;
 static int
 elf64_exec(struct preloaded_file *fp)
 {
-#ifdef EFI
 	struct file_metadata	*md;
 	Elf_Ehdr 		*ehdr;
-	vm_offset_t		modulep, kernend, trampcode, trampstack;
+	vm_offset_t		modulep, kernend;
 	int			err, i;
-	ACPI_TABLE_RSDP		*rsdp;
 	char			buf[24];
+#ifdef EFI
+	ACPI_TABLE_RSDP		*rsdp = NULL;
 	int			revision;
-	bool			copy_auto;
+	int			copy_auto;
+	vm_offset_t		trampstack, trampcode;
+#else
+	vm_offset_t		rsdp = 0;
+	void			*trampcode;
+	int			nseg;
+	void			*kseg;
+	vm_offset_t		trampolinebase;
+	uint64_t		*trampoline;
+	struct trampoline_data	*trampoline_data;
+	vm_offset_t		staging;
+	int			error;
+#endif
 
 #ifdef EFI
 	copy_auto = copy_staging == COPY_STAGING_AUTO;
@@ -136,66 +174,49 @@ elf64_exec(struct preloaded_file *fp)
 		copy_staging = fp->f_kernphys_relocatable ?
 		    COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
 #else
-	copy_auto = COPY_STAGING_DISABLE; /* XXX */
+	/*
+	 * Figure out where to put it.
+	 *
+	 * Linux does not allow kexec_load into just any part of memory. Ask
+	 * arch_loadaddr to resolve the first available chunk of physical memory
+	 * where loading is possible (load_addr).
+	 *
+	 * The kernel is loaded at the 'base' address in contiguous physical
+	 * pages (using 2MB super pages). The first such page is unused by the
+	 * kernel and serves as a good place to put not only the trampoline, but
+	 * the page table pages that the trampoline needs to setup the proper
+	 * kernel starting environment.
+	 */
+	staging = trampolinebase = kboot_get_phys_load_segment();
+	trampolinebase += 1ULL << 20;	/* Copy trampoline to base + 1MB, kernel will wind up at 2MB */
+	printf("Load address at %#jx\n", (uintmax_t)trampolinebase);
+	printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset);
 #endif
 
 	/*
 	 * Report the RSDP to the kernel. While this can be found with
 	 * a BIOS boot, the RSDP may be elsewhere when booted from UEFI.
-	 * The old code used the 'hints' method to communite this to
-	 * the kernel. However, while convenient, the 'hints' method
-	 * is fragile and does not work when static hints are compiled
-	 * into the kernel. Instead, move to setting different tunables
-	 * that start with acpi. The old 'hints' can be removed before
-	 * we branch for FreeBSD 12.
 	 */
-
 #ifdef EFI
 	rsdp = efi_get_table(&acpi20_guid);
 	if (rsdp == NULL) {
 		rsdp = efi_get_table(&acpi_guid);
 	}
 #else
-	rsdp = NULL;
-#warning "write me"
+	rsdp = acpi_rsdp();
 #endif
-	if (rsdp != NULL) {
+	if (rsdp != 0) {
 		sprintf(buf, "0x%016llx", (unsigned long long)rsdp);
-		setenv("hint.acpi.0.rsdp", buf, 1);
 		setenv("acpi.rsdp", buf, 1);
-		revision = rsdp->Revision;
-		if (revision == 0)
-			revision = 1;
-		sprintf(buf, "%d", revision);
-		setenv("hint.acpi.0.revision", buf, 1);
-		setenv("acpi.revision", buf, 1);
-		strncpy(buf, rsdp->OemId, sizeof(rsdp->OemId));
-		buf[sizeof(rsdp->OemId)] = '\0';
-		setenv("hint.acpi.0.oem", buf, 1);
-		setenv("acpi.oem", buf, 1);
-		sprintf(buf, "0x%016x", rsdp->RsdtPhysicalAddress);
-		setenv("hint.acpi.0.rsdt", buf, 1);
-		setenv("acpi.rsdt", buf, 1);
-		if (revision >= 2) {
-			/* XXX extended checksum? */
-			sprintf(buf, "0x%016llx",
-			    (unsigned long long)rsdp->XsdtPhysicalAddress);
-			setenv("hint.acpi.0.xsdt", buf, 1);
-			setenv("acpi.xsdt", buf, 1);
-			sprintf(buf, "%d", rsdp->Length);
-			setenv("hint.acpi.0.xsdt_length", buf, 1);
-			setenv("acpi.xsdt_length", buf, 1);
-		}
 	}
-
 	if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
 		return (EFTYPE);
 	ehdr = (Elf_Ehdr *)&(md->md_data);
 
+#ifdef EFI
 	trampcode = copy_staging == COPY_STAGING_ENABLE ?
 	    (vm_offset_t)0x0000000040000000 /* 1G */ :
 	    (vm_offset_t)0x0000000100000000; /* 4G */;
-#ifdef EFI
 	err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
 	    (EFI_PHYSICAL_ADDRESS *)&trampcode);
 	if (EFI_ERROR(err)) {
@@ -204,17 +225,22 @@ elf64_exec(struct preloaded_file *fp)
 			copy_staging = COPY_STAGING_AUTO;
 		return (ENOMEM);
 	}
+	trampstack = trampcode + LOADER_PAGE_SIZE - 8;
 #else
-#warning "Write me"
+	// XXX Question: why not just use malloc?
+	trampcode = host_getmem(LOADER_PAGE_SIZE);
+	if (trampcode == NULL) {
+		printf("Unable to allocate trampoline\n");
+		return (ENOMEM);
+	}
 #endif
 	bzero((void *)trampcode, LOADER_PAGE_SIZE);
-	trampstack = trampcode + LOADER_PAGE_SIZE - 8;
-	bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size);
+	bcopy((void *)&tramp, (void *)trampcode, tramp_size);
 	trampoline = (void *)trampcode;
 
+#ifdef EFI
 	if (copy_staging == COPY_STAGING_ENABLE) {
 		PT4 = (pml4_entry_t *)0x0000000040000000;
-#ifdef EFI
 		err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
 		    (EFI_PHYSICAL_ADDRESS *)&PT4);
 		if (EFI_ERROR(err)) {
@@ -224,9 +250,6 @@ elf64_exec(struct preloaded_file *fp)
 				copy_staging = COPY_STAGING_AUTO;
 			return (ENOMEM);
 		}
-#else
-#warning "Write me"
-#endif
 		bzero(PT4, 3 * LOADER_PAGE_SIZE);
 		PT3 = &PT4[512];
 		PT2 = &PT3[512];
@@ -259,7 +282,6 @@ elf64_exec(struct preloaded_file *fp)
 		}
 	} else {
 		PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
-#ifdef EFI
 		err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
 		    (EFI_PHYSICAL_ADDRESS *)&PT4);
 		if (EFI_ERROR(err)) {
@@ -269,10 +291,6 @@ elf64_exec(struct preloaded_file *fp)
 				copy_staging = COPY_STAGING_AUTO;
 			return (ENOMEM);
 		}
-#else
-#warning "Write me"
-#endif
-
 		bzero(PT4, 9 * LOADER_PAGE_SIZE);
 
 		PT3_l = &PT4[NPML4EPG * 1];
@@ -308,10 +326,84 @@ elf64_exec(struct preloaded_file *fp)
 			    PG_V | PG_RW | PG_PS;
 		}
 	}
+#else
+	{
+		vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1, pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1;
 
+		/* We'll find a place for these later */
+		PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE);
+		bzero(PT4, 9 * LOADER_PAGE_SIZE);
+
+		PT3_l = &PT4[NPML4EPG * 1];
+		PT3_u = &PT4[NPML4EPG * 2];
+		PT2_l0 = &PT4[NPML4EPG * 3];
+		PT2_l1 = &PT4[NPML4EPG * 4];
+		PT2_l2 = &PT4[NPML4EPG * 5];
+		PT2_l3 = &PT4[NPML4EPG * 6];
+		PT2_u0 = &PT4[NPML4EPG * 7];
+		PT2_u1 = &PT4[NPML4EPG * 8];
+
+		pabase = trampolinebase + LOADER_PAGE_SIZE;
+		pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1;
+		pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2;
+		pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3;
+		pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4;
+		pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5;
+		pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6;
+		pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7;
+		pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8;
+
+		/* 1:1 mapping of lower 4G */
+		PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW;
+		PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW;
+		PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW;
+		PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW;
+		PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW;
+		for (i = 0; i < 4 * NPDEPG; i++) {	/* we overflow PT2_l0 into _l1, etc */
+			PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
+			    PG_RW | PG_PS;
+		}
+
+		/* mapping of kernel 2G below top */
+		PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW;
+		PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW;
+		PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW;
+		/* compat mapping of phys @0 */
+		PT2_u0[0] = PG_PS | PG_V | PG_RW;
+		/* this maps past staging area */
+		/*
+		 * Kernel uses the KERNSTART (== KERNBASE + 2MB) entry to figure
+	 * out where we loaded the kernel. This is PT2_u0[1] (since
+	 * these map 2MB pages). So the PA that this maps has to be
+		 * kboot's staging + 2MB.  For UEFI we do 'i - 1' since we load
+		 * the kernel right at staging (and assume the first address we
+		 * load is 2MB in efi_copyin). However for kboot, staging + 1 *
+		 * NBPDR == staging + 2MB which is where the kernel starts. Our
+		 * trampoline need not be mapped into the kernel space since we
+		 * execute PA==VA for that, and the trampoline can just go away
+		 * once the kernel is called.
+		 *
+		 * Staging should likely be as low as possible, though, because
+		 * all the 'early' allocations are at kernend (which the kernel
+		 * calls physfree).
+		 */
+		for (i = 1; i < 2 * NPDEPG; i++) {	/* we overflow PT2_u0 into _u1 */
+			PT2_u0[i] = ((pd_entry_t)staging +
+			    ((pd_entry_t)i) * NBPDR) |
+			    PG_V | PG_RW | PG_PS;
+			if (i < 10) printf("Mapping %d to %#lx staging %#lx\n", i, PT2_u0[i], staging);
+		}
+	}
+#endif
+
+#ifdef EFI
 	printf("staging %#lx (%scopying) tramp %p PT4 %p\n",
 	    staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
 	    trampoline, PT4);
+#else
+	printf("staging %#lx tramp %p PT4 %p\n", staging, (void *)trampolinebase,
+	    (void *)trampolinebase + LOADER_PAGE_SIZE);
+#endif
 	printf("Start @ 0x%lx ...\n", ehdr->e_entry);
 
 #ifdef EFI
@@ -321,17 +413,46 @@ elf64_exec(struct preloaded_file *fp)
 	if (err != 0) {
 #ifdef EFI
 		efi_time_init();
-#endif
 		if (copy_auto)
 			copy_staging = COPY_STAGING_AUTO;
+#endif
 		return (err);
 	}
 
 	dev_cleanup();
 
+#ifdef EFI
 	trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
 	    efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
 	    PT4, ehdr->e_entry);
+#else
+	trampoline_data = (void *)trampoline + tramp_data_offset;
+	trampoline_data->entry = ehdr->e_entry;
+	trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE;
+	/*
+	 * So we compute the VA of the module data by modulep + KERNBASE....
+	 * need to make sure that that address is mapped right. We calculate
+	 * the start of available memory to allocate via kernend (which is
+	 * calculated with a phyaddr of "kernend + PA(PT_u0[1])"), so we better
+	 * make sure we're not overwriting the last 2MB of the kernel :).
+	 */
+	trampoline_data->modulep = modulep;	/* Offset from KERNBASE */
+	trampoline_data->kernend = kernend;	/* Offset from the load address */
+	trampoline_data->fill1 = trampoline_data->fill2 = 0;
+	printf("Modulep = %lx kernend %lx\n", modulep, kernend);
+	/* NOTE: when copying in, it's relative to the start of our 'area' not an abs addr */
+	/* Copy the trampoline to the ksegs */
+	archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size);
+	/* Copy the page table to the ksegs */
+	archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE);
+
+	if (archsw.arch_kexec_kseg_get == NULL)
+		panic("architecture did not provide kexec segment mapping");
+	archsw.arch_kexec_kseg_get(&nseg, &kseg);
+	error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64);
+	if (error != 0)
+		panic("kexec_load returned error: %d", error);
+	host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0);
 #endif
 
 	panic("exec returned");