git: 2069a2a08f6e - main - kboot: Improve amd64 booting
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Fri, 03 Feb 2023 15:50:50 UTC
The branch main has been updated by imp: URL: https://cgit.FreeBSD.org/src/commit/?id=2069a2a08f6e555285be71be042e85c75b6feb02 commit 2069a2a08f6e555285be71be042e85c75b6feb02 Author: Warner Losh <imp@FreeBSD.org> AuthorDate: 2023-02-03 15:40:13 +0000 Commit: Warner Losh <imp@FreeBSD.org> CommitDate: 2023-02-03 15:41:40 +0000 kboot: Improve amd64 booting Copy more of the necessary state for FreeBSD to boot: o Copy EFI memory tables o Create custom page tables needed for the kernel to find itself o Simplify the passing of args to the trampoline by putting them on the stack rather than in dedicated memory. This is only partially successful... we get only part way through the amd64 startup code before dying. However, it's much further than before the changes. Sponsored by: Netflix Reviewed by: tsoome, kevans Differential Revision: https://reviews.freebsd.org/D38259 --- stand/kboot/arch/amd64/amd64_tramp.S | 123 ++++++++++------ stand/kboot/arch/amd64/elf64_freebsd.c | 249 ++++++++++++++++++++++++--------- 2 files changed, 262 insertions(+), 110 deletions(-) diff --git a/stand/kboot/arch/amd64/amd64_tramp.S b/stand/kboot/arch/amd64/amd64_tramp.S index 877705407f92..b95e99cbaf0f 100644 --- a/stand/kboot/arch/amd64/amd64_tramp.S +++ b/stand/kboot/arch/amd64/amd64_tramp.S @@ -1,9 +1,6 @@ /*- - * Copyright (c) 2013 The FreeBSD Foundation - * All rights reserved. + * Copyright (c) 2022 Netflix, Inc * - * This software was developed by Benno Rice under sponsorship from - * the FreeBSD Foundation. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -24,53 +21,87 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
- * - * $FreeBSD$ */ -#include <machine/asmacros.h> - -#define ASM_FILE -#include "multiboot2.h" +/* + * This is the trampoline that starts the FreeBSD kernel. Since the Linux kernel + * calls this routine with no args, and has a different environment than the + * boot loader provides and that the kernel expects, this code is responsible + * for setting all that up and calling the normal kernel entry point. It's + * analogous to the "purgatory" code in the linux kernel. Details about these + * operations are contained in comments below. On amd64, the kernel starts all + * the APs so we don't have to worry about them here. + */ +/* + * Keep in sync with elf64_freebsd.c. Kexec starts tramp w/o any parameters, so + * store them here. This is constructed to be a useful stack: + * + * struct trampoline_data { + * uint64_t pt4; // Page table address to pop + * uint64_t entry; // return address to jump to kernel + * uint32_t fill1; // 0 + * uint32_t modulep; // 4 module metadata + * uint32_t kernend; // 8 kernel end + * uint32_t fill2; // 12 + * }; + * + * loader.kboot will construct a stack that btext expects, which is arguments on + * the stack, not in registers, and these args are 32-bit not 64 + * + * Processor is already in long mode when we're called, paging is enabled and + * boot loader loads things such that: + * - kernel mapped at KERNBASE, aligned to 2MB, below 4GB, contiguous memory + * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M) + * - kernel is mapped with 2M superpages + * - The kernel, modules and metadata is in first 4GB which is unity mapped + * - There's additional memory after loader provided data for early allocations + * + * Unlike EFI, we don't support copying the staging area. We tell Linux to land + * the kernel in its final location with the needed alignment, etc. 
We copy the + * trampoline code to 1MB offset above KERNBASE since that memory is otherwise + * free and safely above the lower 1MB swamp we inherited from IBM PC, though + * this code makes no assumptions about where that might. + * + * Thus, the trampoline just needs to set %rsp to that stack pop the %cr3 value, + * set it and then retq to jump to the kernel with its stack args filled in. + * Since the handoff to this code used to be from 32-bit code, it uses the i386 + * calling conventions which put the arguments on the stack. The kernel's btext + * routine expects this setup. + */ .text - .globl amd64_tramp - + .globl tramp +tramp: + cli /* Make sure we don't get interrupted. */ + leaq tramp_pt4(%rip), %rsp /* Setup our pre-filled-in stack */ + popq %rax /* Pop off the PT4 ptr for %cr3 */ + movq %rax, %cr3 /* set the page table */ + retq /* Return addr and args already on stack */ /* - * void amd64_tramp(uint64_t stack, void *copy_finish, uint64_t kernend, - * uint64_t modulep, uint64_t pagetable, uint64_t entry) + * The following is the stack for the above code. The stack will increase in + * address as things are popped off of it, so we start with the stack pointing + * to tramp_pt4. */ -amd64_tramp: - cli /* Make sure we don't get interrupted. */ - movq %rdi,%rsp /* Switch to our temporary stack. */ - - movq %rdx,%r12 /* Stash the kernel values for later. */ - movq %rcx,%r13 - movq %r8,%r14 - movq %r9,%r15 - - callq *%rsi /* Call copy_finish so we're all ready to go. */ - - pushq %r12 /* Push kernend. */ - salq $32,%r13 /* Shift modulep and push it. */ - pushq %r13 - pushq %r15 /* Push the entry address. */ - movq %r14,%cr3 /* Switch page tables. */ - ret /* "Return" to kernel entry. 
*/ - - ALIGN_TEXT amd64_tramp_end: - -/* void multiboot2_exec(uint64_t entry, uint64_t multiboot_info, uint64_t stack) */ - .globl multiboot2_exec -multiboot2_exec: - movq %rdx,%rsp - pushq %rdi - movq %rsi,%rbx - movq $MULTIBOOT2_BOOTLOADER_MAGIC,%rax - ret + .p2align 3 /* Stack has to be 8 byte aligned */ +trampoline_data: +tramp_pt4: .quad 0 /* New %cr3 value */ +tramp_entry: .quad 0 /* Entry to kernel (btext) */ + /* %rsp points here on entry to amd64 kernel's btext */ + .long 0 /* 0 filler, ignored (current loaders set to 0) */ +tramp_modulep: .long 0 /* 4 modulep */ +tramp_kernend: .long 0 /* 8 kernend */ + .long 0 /* 12 alignment filler (also 0) */ +tramp_end: .data - .globl amd64_tramp_size -amd64_tramp_size: - .long amd64_tramp_end-amd64_tramp + .type tramp_size,@object + .globl tramp_size +tramp_size: + .long tramp_end-tramp + .size tramp_size, 4 + + .type tramp_data_offset,@object + .globl tramp_data_offset +tramp_data_offset: + .long trampoline_data-tramp + .size tramp_data_offset, 4 diff --git a/stand/kboot/arch/amd64/elf64_freebsd.c b/stand/kboot/arch/amd64/elf64_freebsd.c index a45a0db32e44..68588c0f2f02 100644 --- a/stand/kboot/arch/amd64/elf64_freebsd.c +++ b/stand/kboot/arch/amd64/elf64_freebsd.c @@ -41,9 +41,12 @@ __FBSDID("$FreeBSD$"); #ifdef EFI #include <efi.h> #include <efilib.h> +#else +#include "host_syscall.h" #endif #include "bootstrap.h" +#include "kboot.h" #include "platform/acfreebsd.h" #include "acconfig.h" @@ -53,9 +56,7 @@ __FBSDID("$FreeBSD$"); #ifdef EFI #include "loader_efi.h" -#endif -#ifdef EFI static EFI_GUID acpi_guid = ACPI_TABLE_GUID; static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID; #endif @@ -63,9 +64,11 @@ static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID; #ifdef EFI #define LOADER_PAGE_SIZE EFI_PAGE_SIZE #else -#define LOADER_PAGE_SIZE 8192 +#define LOADER_PAGE_SIZE PAGE_SIZE #endif +extern vm_offset_t kboot_get_phys_load_segment(void); + extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp, bool
exit_bs); @@ -81,13 +84,13 @@ static struct file_format amd64_elf_obj = { .l_exec = elf64_obj_exec, }; -#if 0 +#ifdef EFI extern struct file_format multiboot2; extern struct file_format multiboot2_obj; #endif struct file_format *file_formats[] = { -#if 0 +#ifdef EFI &multiboot2, &multiboot2_obj, #endif @@ -96,21 +99,44 @@ struct file_format *file_formats[] = { NULL }; -#ifdef EFI +#ifndef EFI +/* + * We create the stack that we want. We have the address of the page tables + * we make on top (so we pop that off and set %cr3). We have the entry point + * to the kernel (which retq pops off) This leaves the stack that the btext + * wants: offset 4 is modulep and offset8 is kernend, with the filler bytes + * to keep this aligned. This makes the trampoline very simple. + */ +struct trampoline_data { + uint64_t pt4; // Page table address to pop + uint64_t entry; // return address to jump to kernel + uint32_t fill1; // 0 + uint32_t modulep; // 4 module metadata + uint32_t kernend; // 8 kernel end + uint32_t fill2; // 12 +}; +_Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data"); +#endif + static pml4_entry_t *PT4; -static pdp_entry_t *PT3; static pdp_entry_t *PT3_l, *PT3_u; -static pd_entry_t *PT2; static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1; +#ifdef EFI +static pdp_entry_t *PT3; +static pd_entry_t *PT2; + extern EFI_PHYSICAL_ADDRESS staging; static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend, uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry); #endif -extern uintptr_t amd64_tramp; -extern uint32_t amd64_tramp_size; +extern uintptr_t tramp; +extern uint32_t tramp_size; +#ifndef EFI +extern uint32_t tramp_data_offset; +#endif /* * There is an ELF kernel and one or more ELF modules loaded. 
@@ -120,15 +146,27 @@ extern uint32_t amd64_tramp_size; static int elf64_exec(struct preloaded_file *fp) { -#ifdef EFI struct file_metadata *md; Elf_Ehdr *ehdr; - vm_offset_t modulep, kernend, trampcode, trampstack; + vm_offset_t modulep, kernend; int err, i; - ACPI_TABLE_RSDP *rsdp; char buf[24]; +#ifdef EFI + ACPI_TABLE_RSDP *rsdp = NULL; int revision; - bool copy_auto; + int copy_auto; + vm_offset_t trampstack, trampcode; +#else + vm_offset_t rsdp = 0; + void *trampcode; + int nseg; + void *kseg; + vm_offset_t trampolinebase; + uint64_t *trampoline; + struct trampoline_data *trampoline_data; + vm_offset_t staging; + int error; +#endif #ifdef EFI copy_auto = copy_staging == COPY_STAGING_AUTO; @@ -136,66 +174,49 @@ elf64_exec(struct preloaded_file *fp) copy_staging = fp->f_kernphys_relocatable ? COPY_STAGING_DISABLE : COPY_STAGING_ENABLE; #else - copy_auto = COPY_STAGING_DISABLE; /* XXX */ + /* + * Figure out where to put it. + * + * Linux does not allow to do kexec_load into any part of memory. Ask + * arch_loadaddr to resolve the first available chunk of physical memory + * where loading is possible (load_addr). + * + * The kernel is loaded at the 'base' address in continguous physical + * pages (using 2MB super pages). The first such page is unused by the + * kernel and serves as a good place to put not only the trampoline, but + * the page table pages that the trampoline needs to setup the proper + * kernel starting environment. + */ + staging = trampolinebase = kboot_get_phys_load_segment(); + trampolinebase += 1ULL << 20; /* Copy trampoline to base + 1MB, kernel will wind up at 2MB */ + printf("Load address at %#jx\n", (uintmax_t)trampolinebase); + printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset); #endif /* * Report the RSDP to the kernel. While this can be found with * a BIOS boot, the RSDP may be elsewhere when booted from UEFI. - * The old code used the 'hints' method to communite this to - * the kernel. 
However, while convenient, the 'hints' method - * is fragile and does not work when static hints are compiled - * into the kernel. Instead, move to setting different tunables - * that start with acpi. The old 'hints' can be removed before - * we branch for FreeBSD 12. */ - #ifdef EFI rsdp = efi_get_table(&acpi20_guid); if (rsdp == NULL) { rsdp = efi_get_table(&acpi_guid); } #else - rsdp = NULL; -#warning "write me" + rsdp = acpi_rsdp(); #endif - if (rsdp != NULL) { + if (rsdp != 0) { sprintf(buf, "0x%016llx", (unsigned long long)rsdp); - setenv("hint.acpi.0.rsdp", buf, 1); setenv("acpi.rsdp", buf, 1); - revision = rsdp->Revision; - if (revision == 0) - revision = 1; - sprintf(buf, "%d", revision); - setenv("hint.acpi.0.revision", buf, 1); - setenv("acpi.revision", buf, 1); - strncpy(buf, rsdp->OemId, sizeof(rsdp->OemId)); - buf[sizeof(rsdp->OemId)] = '\0'; - setenv("hint.acpi.0.oem", buf, 1); - setenv("acpi.oem", buf, 1); - sprintf(buf, "0x%016x", rsdp->RsdtPhysicalAddress); - setenv("hint.acpi.0.rsdt", buf, 1); - setenv("acpi.rsdt", buf, 1); - if (revision >= 2) { - /* XXX extended checksum? */ - sprintf(buf, "0x%016llx", - (unsigned long long)rsdp->XsdtPhysicalAddress); - setenv("hint.acpi.0.xsdt", buf, 1); - setenv("acpi.xsdt", buf, 1); - sprintf(buf, "%d", rsdp->Length); - setenv("hint.acpi.0.xsdt_length", buf, 1); - setenv("acpi.xsdt_length", buf, 1); - } } - if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL) return (EFTYPE); ehdr = (Elf_Ehdr *)&(md->md_data); +#ifdef EFI trampcode = copy_staging == COPY_STAGING_ENABLE ? 
(vm_offset_t)0x0000000040000000 /* 1G */ : (vm_offset_t)0x0000000100000000; /* 4G */; -#ifdef EFI err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, (EFI_PHYSICAL_ADDRESS *)&trampcode); if (EFI_ERROR(err)) { @@ -204,17 +225,22 @@ elf64_exec(struct preloaded_file *fp) copy_staging = COPY_STAGING_AUTO; return (ENOMEM); } + trampstack = trampcode + LOADER_PAGE_SIZE - 8; #else -#warning "Write me" + // XXX Question: why not just use malloc? + trampcode = host_getmem(LOADER_PAGE_SIZE); + if (trampcode == NULL) { + printf("Unable to allocate trampoline\n"); + return (ENOMEM); + } #endif bzero((void *)trampcode, LOADER_PAGE_SIZE); - trampstack = trampcode + LOADER_PAGE_SIZE - 8; - bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size); + bcopy((void *)&tramp, (void *)trampcode, tramp_size); trampoline = (void *)trampcode; +#ifdef EFI if (copy_staging == COPY_STAGING_ENABLE) { PT4 = (pml4_entry_t *)0x0000000040000000; -#ifdef EFI err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, (EFI_PHYSICAL_ADDRESS *)&PT4); if (EFI_ERROR(err)) { @@ -224,9 +250,6 @@ elf64_exec(struct preloaded_file *fp) copy_staging = COPY_STAGING_AUTO; return (ENOMEM); } -#else -#warning "Write me" -#endif bzero(PT4, 3 * LOADER_PAGE_SIZE); PT3 = &PT4[512]; PT2 = &PT3[512]; @@ -259,7 +282,6 @@ elf64_exec(struct preloaded_file *fp) } } else { PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */ -#ifdef EFI err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9, (EFI_PHYSICAL_ADDRESS *)&PT4); if (EFI_ERROR(err)) { @@ -269,10 +291,6 @@ elf64_exec(struct preloaded_file *fp) copy_staging = COPY_STAGING_AUTO; return (ENOMEM); } -#else -#warning "Write me" -#endif - bzero(PT4, 9 * LOADER_PAGE_SIZE); PT3_l = &PT4[NPML4EPG * 1]; @@ -308,10 +326,84 @@ elf64_exec(struct preloaded_file *fp) PG_V | PG_RW | PG_PS; } } +#else + { + vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1, pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1; + /* We'll find a place for these later */ + PT4 
= (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE); + bzero(PT4, 9 * LOADER_PAGE_SIZE); + + PT3_l = &PT4[NPML4EPG * 1]; + PT3_u = &PT4[NPML4EPG * 2]; + PT2_l0 = &PT4[NPML4EPG * 3]; + PT2_l1 = &PT4[NPML4EPG * 4]; + PT2_l2 = &PT4[NPML4EPG * 5]; + PT2_l3 = &PT4[NPML4EPG * 6]; + PT2_u0 = &PT4[NPML4EPG * 7]; + PT2_u1 = &PT4[NPML4EPG * 8]; + + pabase = trampolinebase + LOADER_PAGE_SIZE; + pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1; + pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2; + pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3; + pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4; + pa_pt2_l2 = pabase + LOADER_PAGE_SIZE * 5; + pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6; + pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7; + pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8; + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW; + PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW; + PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW; + PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW; + PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PT2_l0 into _l1, etc */ + PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } + + /* mapping of kernel 2G below top */ + PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW; + PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW; + PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW; + /* compat mapping of phys @0 */ + PT2_u0[0] = PG_PS | PG_V | PG_RW; + /* this maps past staging area */ + /* + * Kernel uses the KERNSTART (== KERNBASE + 2MB) entry to figure + * out where we loaded the kernel. This is PT2_u0[1] (since + * these map 2MB pages. So the PA that this maps has to be + * kboot's staging + 2MB. For UEFI we do 'i - 1' since we load + * the kernel right at staging (and assume the first address we + * load is 2MB in efi_copyin). However for kboot, staging + 1 * + * NBPDR == staging + 2MB which is where the kernel starts. 
Our + * trampoline need not be mapped into the kernel space since we + * execute PA==VA for that, and the trampoline can just go away + * once the kernel is called. + * + * Staging should likely be as low as possible, though, because + * all the 'early' allocations are at kernend (which the kernel + * calls physfree). + */ + for (i = 1; i < 2 * NPDEPG; i++) { /* we overflow PT2_u0 into _u1 */ + PT2_u0[i] = ((pd_entry_t)staging + + ((pd_entry_t)i) * NBPDR) | + PG_V | PG_RW | PG_PS; + if (i < 10) printf("Mapping %d to %#lx staging %#lx\n", i, PT2_u0[i], staging); + } + } +#endif + +#ifdef EFI printf("staging %#lx (%scopying) tramp %p PT4 %p\n", staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ", trampoline, PT4); +#else + printf("staging %#lx tramp %p PT4 %p\n", staging, (void *)trampolinebase, + (void *)trampolinebase + LOADER_PAGE_SIZE); +#endif printf("Start @ 0x%lx ...\n", ehdr->e_entry); #ifdef EFI @@ -321,17 +413,46 @@ elf64_exec(struct preloaded_file *fp) if (err != 0) { #ifdef EFI efi_time_init(); -#endif if (copy_auto) copy_staging = COPY_STAGING_AUTO; +#endif return (err); } dev_cleanup(); +#ifdef EFI trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ? efi_copy_finish : efi_copy_finish_nop, kernend, modulep, PT4, ehdr->e_entry); +#else + trampoline_data = (void *)trampoline + tramp_data_offset; + trampoline_data->entry = ehdr->e_entry; + trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE; + /* + * So we compute the VA of the module data by modulep + KERNBASE.... + * need to make sure that that address is mapped right. We calculate + * the start of available memory to allocate via kernend (which is + * calculated with a phyaddr of "kernend + PA(PT_u0[1])"), so we better + * make sure we're not overwriting the last 2MB of the kernel :). 
+ */ + trampoline_data->modulep = modulep; /* Offset from KERNBASE */ + trampoline_data->kernend = kernend; /* Offset from the load address */ + trampoline_data->fill1 = trampoline_data->fill2 = 0; + printf("Modulep = %lx kernend %lx\n", modulep, kernend); + /* NOTE: when copying in, it's relative to the start of our 'area' not an abs addr */ + /* Copy the trampoline to the ksegs */ + archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size); + /* Copy the page table to the ksegs */ + archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE); + + if (archsw.arch_kexec_kseg_get == NULL) + panic("architecture did not provide kexec segment mapping"); + archsw.arch_kexec_kseg_get(&nseg, &kseg); + error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64); + if (error != 0) + panic("kexec_load returned error: %d", error); + host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0); #endif panic("exec returned");