Anybody willing to test out kload?

Russell Cattelan cattelan at thebarn.com
Thu Nov 15 04:30:07 UTC 2012


A few people have pointed out that I sent out the patch reversed. :-(
I messed up and swapped the order of the tags given to git diff.

Here is the corrected patch,
along with a few fixes and cleanups.

-Russell

-------------- next part --------------
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c609633
diff --git a/sys/amd64/amd64/kload.c b/sys/amd64/amd64/kload.c
new file mode 100644
index 0000000..ed203ae
--- /dev/null
+++ b/sys/amd64/amd64/kload.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2011 - 2012
+ *	Russell Cattelan Digital Elves Inc
+ * Copyright (c) 2011 - 2012
+ *	Isilon Systems, LLC.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/kload.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+
+#include <vm/vm_param.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+
+#define	GUEST_NULL_SEL		0
+#define	GUEST_CODE_SEL		1
+#define	GUEST_DATA_SEL		2
+
+void
+setup_freebsd_gdt(uint64_t *gdtr)	/* fills the three descriptors named by GUEST_*_SEL */
+{
+	gdtr[GUEST_NULL_SEL] = 0x0000000000000000;	/* null descriptor */
+	gdtr[GUEST_CODE_SEL] = 0x0020980000000000;	/* access 0x98, flags 0x2 (L bit): 64-bit code */
+	gdtr[GUEST_DATA_SEL] = 0x0000920000000000;	/* access 0x92: present, writable data */
+}
+
+/*
+ * Build the three-page (PT4/PT3/PT2) page table installed by relocate_kernel.
+ * Returns the physical address of the level 4 page, or NULL when the pages
+ * cannot be allocated (sys_kload checks for NULL).
+ */
+pt_entry_t *
+kload_build_page_table(void)
+{
+	pt_entry_t *PT4, *PT3, *PT2;
+	unsigned long va;
+	int i;
+
+	/* kmem_alloc() hands back 0 on failure; never bzero() through it. */
+	va = (unsigned long)kmem_alloc(kernel_map, PAGE_SIZE * 3);
+	if (va == 0)
+		return (NULL);
+	PT4 = (pt_entry_t *)va;
+	PT3 = PT4 + (PAGE_SIZE / sizeof(pt_entry_t));
+	PT2 = PT3 + (PAGE_SIZE / sizeof(pt_entry_t));
+
+	if (bootverbose)
+		printf("%s PT4 0x%lx (0x%lx) PT3 0x%lx (0x%lx) "
+		    "PT2 0x%lx (0x%lx)\n",
+		    __func__,
+		    (unsigned long)PT4, (unsigned long)vtophys(PT4),
+		    (unsigned long)PT3, (unsigned long)vtophys(PT3),
+		    (unsigned long)PT2, (unsigned long)vtophys(PT2));
+
+	/*
+	 * The following section is a direct copy of
+	 * head/src/sys/boot/i386/libi386/elf64_freebsd.c:92 at r236688
+	 */
+
+	bzero(PT4, PAGE_SIZE);
+	bzero(PT3, PAGE_SIZE);
+	bzero(PT2, PAGE_SIZE);
+
+	/*
+	 * This is kinda brutal, but every single 1GB VM memory segment points
+	 * to the same first 1GB of physical memory.  But it is more than
+	 * adequate.
+	 */
+	for (i = 0; i < 512; i++) {
+		/* Every level 4 slot points to the same level 3 page. */
+		PT4[i] = (pt_entry_t)(vtophys(PT3));
+		PT4[i] |= PG_V | PG_RW | PG_U;
+
+		/* Every level 3 slot points to the same level 2 page. */
+		PT3[i] = (pt_entry_t)(vtophys(PT2));
+		PT3[i] |= PG_V | PG_RW | PG_U;
+
+		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
+		PT2[i] = i * (2 * 1024 * 1024);
+		PT2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+	}
+	return ((pt_entry_t *)vtophys(PT4));
+}
diff --git a/sys/amd64/amd64/kload_exec.S b/sys/amd64/amd64/kload_exec.S
new file mode 100644
index 0000000..75bff3b
--- /dev/null
+++ b/sys/amd64/amd64/kload_exec.S
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2011 - 2012
+ *	Russell Cattelan Digital Elves Inc
+ * Copyright (c) 2011 - 2012
+ *	Isilon Systems, LLC.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include "assym.s"
+
+#define	CR0_PG	0x80000000	/* PaGing enable */
+
+#define X86_CR0_PE	0x00000001 /* Protection Enable */
+#define X86_CR0_MP	0x00000002 /* Monitor Coprocessor */
+#define X86_CR0_EM	0x00000004 /* Emulation */
+#define X86_CR0_TS	0x00000008 /* Task Switched */
+#define X86_CR0_ET	0x00000010 /* Extension Type */
+#define X86_CR0_NE	0x00000020 /* Numeric Error */
+#define X86_CR0_WP	0x00010000 /* Write Protect */
+#define X86_CR0_AM	0x00040000 /* Alignment Mask */
+#define X86_CR0_NW	0x20000000 /* Not Write-through */
+#define X86_CR0_CD	0x40000000 /* Cache Disable */
+#define X86_CR0_PG	0x80000000 /* Paging */
+
+#define X86_CR4_PSE	0x00000010 /* enable page size extensions */
+#define X86_CR4_PAE	0x00000020 /* enable physical address extensions */
+
+	.globl relocate_kernel
+relocate_kernel:	/* in: %rdi va_list, %rsi kern base, %rdx code page, %rcx control page */
+	/* first install the new page table */
+	movq	32(%rcx), %rax /* kcp_pgtbl (offset 32 in struct kload_cpage): page table */
+	movq	40(%rcx), %r9  /* kcp_cp (offset 40): address of control_page with new PT */
+	movq	%rax, %cr3
+
+	/*
+	 * Set cr4 to a known state:
+	 *  - page size extensions
+	 *  - physical address extension enabled
+	 */
+	movq	$(X86_CR4_PSE | X86_CR4_PAE), %rax
+	movq	%rax, %cr4
+
+	/* then move the stack to the end of control page */
+	lea 4096(%r9), %rsp
+
+	/* now save stuff onto the new stack */
+	pushq	%rcx	/* arg 4 control page */
+	pushq	%rdx	/* arg 3 code page */
+	pushq	%rsi	/* arg 2 kern base */
+	pushq	%rdi	/* arg 1 va_list */
+
+	/* zero out flags, and disable interrupts */
+	pushq $0
+	popfq
+	cli
+
+	/* install simple gdt */
+	movq	24(%r9), %rax	/* kcp_gdt (offset 24) */
+	lgdt	(%rax)
+	movq	56(%r9), %rax	/* kcp_idt (offset 56) */
+	lidt	(%rax)		/* null idt */
+	/*
+	 * now move to the code page
+	 * should have been passed code_page based
+	 * on new page table
+	 */
+	movq %rdx, %r8
+	addq	$(identity_mapped - relocate_kernel), %r8
+	/* offset of code segment in new gdt */
+	pushq $0x08
+	pushq %r8
+	/* jump to this spot in the new page */
+	lretq
+identity_mapped:
+
+	movq $0x10,%rax	/* selector 0x10: slot 2 (data) of the new gdt */
+	movq %rax,%ds
+	movq %rax,%es
+	movq %rax,%fs
+	movq %rax,%gs
+	movq %rax,%ss
+	
+	/*
+	 * Set cr0 to a known state:
+	 *  - Paging enabled
+	 *  - Alignment check disabled
+	 *  - Write protect disabled
+	 *  - No task switch
+	 *  - Don't do FP software emulation.
+	 *  - Protected mode enabled
+	 */
+	movq	%cr0, %rax
+	andq	$~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM |  X86_CR0_MP | X86_CR0_NE), %rax
+	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
+	movq	%rax, %cr0
+
+	/* Do the copies */
+	cld
+	/* saved list of source pages */
+	movq 0(%rsp), %rbx
+	/*
+	 * the initial dest page
+	 * this is KERNBASE + 0x200000
+	 * kernel is contiguous in memory
+	 */
+	movq 8(%rsp), %rdi
+0:	/* top, read another word for the indirection page */
+	movq	(%rbx), %rcx
+
+	addq	$8, %rbx
+	testq	$0x1,   %rcx  /* is it a destination page */
+	jz	1f
+	movq	%rcx,	%rdi
+	andq	$0xFFFFFFFFfffff000, %rdi
+	jmp     0b
+1:
+	testq	$0x2,	%rcx  /* is it an indirection page */
+	jz	1f
+	movq	%rcx,	%rbx
+	andq	$0xFFFFFFFFfffff000, %rbx
+	jmp     0b
+1:
+	testq   $0x4,   %rcx /* is it the done indicator */
+	jz      1f
+	jmp     2f
+1:
+	testq   $0x8,   %rcx /* is it the source indicator */
+	jz      0b	     /* Ignore it otherwise */
+	movq    %rcx,   %rsi /* For every source page do a copy */
+	andq    $0xfffffffffffff000, %rsi
+	movq    $512, %rcx   /* 512 quadwords = one 4096-byte page */
+	rep
+	movsq
+	jmp     0b
+2:
+	/*
+	 * set all of the registers to known values
+	 * leave %rsp alone
+	 */
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq    %rcx, %rcx
+	xorq    %rdx, %rdx
+	xorq    %rsi, %rsi
+	xorq    %rdi, %rdi
+	xorq    %rbp, %rbp
+
+	pushq	16(%r9)	/* kcp_physfree (offset 16) */
+	movq	8(%r9), %rax	/* kcp_modulep (offset 8) */
+	salq	$32, %rax
+	pushq	%rax
+
+	pushq $0x8
+	pushq	48(%r9)	/* kcp_entry_pt (offset 48): kernel entry pt */
+	lretq
+relocate_kernel_end:
+	.globl relocate_kernel_size
+relocate_kernel_size:
+	.long relocate_kernel_end - relocate_kernel
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index d2e4aad..b085326 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -78,6 +78,9 @@ __FBSDID("$FreeBSD$");
 #define BIOS_RESET		(0x0f)
 #define BIOS_WARM		(0x0a)
 
+
+/* quick hack to access the kload page table so we can set the APs to a known pgtbl */
+extern unsigned long kload_pgtbl;
 /* lock region used by kernel profiling */
 int	mcount_lock;
 
@@ -1409,10 +1412,20 @@ cpustop_handler(void)
 void
 cpususpend_handler(void)
 {
+	register_t cr3, rf;
+	register_t cr0, cr4;
 	u_int cpu;
 
 	cpu = PCPU_GET(cpuid);
 
+	printf("%s called on cpu%d\n",__FUNCTION__,cpu);
+
+	rf = intr_disable();
+	cr3 = rcr3();
+
+	lapic_clear_lapic(1 /* disable lapic */);
+ 	/* shutdown interrupts to the cpu and then set the mask as stopped */
+
 	if (savectx(susppcbs[cpu])) {
 		ctx_fpusave(susppcbs[cpu]->pcb_fpususpend);
 		wbinvd();
@@ -1422,20 +1435,37 @@ cpususpend_handler(void)
 		initializecpu();
 		PCPU_SET(switchtime, 0);
 		PCPU_SET(switchticks, ticks);
-
-		/* Indicate that we are resumed */
-		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	}
 
+	/* make sure the page table is not the same one that boot process sets up */
+	load_cr3(kload_pgtbl);
+
+	/* Disable PGE. */
+	cr4 = rcr4();
+	load_cr4(cr4 & ~CR4_PGE);
+
+	/* Disable caches (CD = 1, NW = 0); the paging bit stays set */
+	cr0 = rcr0();
+	load_cr0((cr0 & ~CR0_NW) | CR0_CD | CR0_PG);
+
+	/* Flushes caches and TLBs. */
+	wbinvd();
+	invltlb();
+
+	halt();
+
 	/* Wait for resume */
 	while (!CPU_ISSET(cpu, &started_cpus))
 		ia32_pause();
 
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
+	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
 
-	/* Resume MCA and local APIC */
+	/* Restore CR3 and enable interrupts */
+	load_cr3(cr3);
 	mca_resume();
 	lapic_setup(0);
+	intr_restore(rf);
 }
 
 /*
diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h
index ae2f5b9..7fded95 100644
--- a/sys/amd64/include/apicvar.h
+++ b/sys/amd64/include/apicvar.h
@@ -227,6 +227,7 @@ int	lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
 	    enum intr_trigger trigger);
 void	lapic_set_tpr(u_int vector);
 void	lapic_setup(int boot);
+void	lapic_clear_lapic(u_int);
 
 #endif /* !LOCORE */
 #endif /* _MACHINE_APICVAR_H_ */
diff --git a/sys/amd64/include/intr_machdep.h b/sys/amd64/include/intr_machdep.h
index 700e35f..a8ef1fc 100644
--- a/sys/amd64/include/intr_machdep.h
+++ b/sys/amd64/include/intr_machdep.h
@@ -158,6 +158,7 @@ struct intsrc *intr_lookup_source(int vector);
 int	intr_register_pic(struct pic *pic);
 int	intr_register_source(struct intsrc *isrc);
 int	intr_remove_handler(void *cookie);
+int	intr_clear_all_handlers(void);
 void	intr_resume(void);
 void	intr_suspend(void);
 void	intrcnt_add(const char *name, u_long **countp);
diff --git a/sys/boot/common/load_elf.c b/sys/boot/common/load_elf.c
index e1e6de7..36df22c 100644
--- a/sys/boot/common/load_elf.c
+++ b/sys/boot/common/load_elf.c
@@ -317,25 +317,30 @@ __elfN(loadimage)(struct preloaded_file *fp, elf_file_t ef, u_int64_t off)
 	    continue;
 
 #ifdef ELF_VERBOSE
-	printf("Segment: 0x%lx at 0x%lx -> 0x%lx-0x%lx",
-	    (long)phdr[i].p_filesz, (long)phdr[i].p_offset,
-	    (long)(phdr[i].p_vaddr + off),
-	    (long)(phdr[i].p_vaddr + off + phdr[i].p_memsz - 1));
+	printf("Segment: filesz 0x%llx @ 0x%016llx ->  vaddr_range 0x%016llx - 0x%016llx",
+	    (long long)phdr[i].p_filesz, (long long)phdr[i].p_offset,
+	    (long long)(phdr[i].p_vaddr + off),
+	    (long long)(phdr[i].p_vaddr + off + phdr[i].p_memsz - 1));
 #else
 	if ((phdr[i].p_flags & PF_W) == 0) {
-	    printf("text=0x%lx ", (long)phdr[i].p_filesz);
+	    printf("text=0x%llx ", (long long)phdr[i].p_filesz);
 	} else {
-	    printf("data=0x%lx", (long)phdr[i].p_filesz);
+	    printf("data=0x%llx", (long long)phdr[i].p_filesz);
 	    if (phdr[i].p_filesz < phdr[i].p_memsz)
-		printf("+0x%lx", (long)(phdr[i].p_memsz -phdr[i].p_filesz));
+		printf("+0x%llx", (long long)(phdr[i].p_memsz -phdr[i].p_filesz));
 	    printf(" ");
 	}
 #endif
 	fpcopy = 0;
 	if (ef->firstlen > phdr[i].p_offset) {
 	    fpcopy = ef->firstlen - phdr[i].p_offset;
-	    archsw.arch_copyin(ef->firstpage + phdr[i].p_offset,
-			       phdr[i].p_vaddr + off, fpcopy);
+	    printf("\n%s:%d firstpage 0x%lx p_offset 0x%lx p_vaddr 0x%lx off 0x%lx\n",
+		   __FUNCTION__,__LINE__,
+		   (unsigned long)ef->firstpage,
+		   (unsigned long) phdr[i].p_offset,
+		   (unsigned long)phdr[i].p_vaddr,
+		   (unsigned long)off);
+	    archsw.arch_copyin(ef->firstpage + phdr[i].p_offset, phdr[i].p_vaddr + off, fpcopy);
 	}
 	if (phdr[i].p_filesz > fpcopy) {
 	    if (kern_pread(ef->fd, phdr[i].p_vaddr + off + fpcopy,
@@ -348,9 +353,11 @@ __elfN(loadimage)(struct preloaded_file *fp, elf_file_t ef, u_int64_t off)
 	/* clear space from oversized segments; eg: bss */
 	if (phdr[i].p_filesz < phdr[i].p_memsz) {
 #ifdef ELF_VERBOSE
-	    printf(" (bss: 0x%lx-0x%lx)",
-		(long)(phdr[i].p_vaddr + off + phdr[i].p_filesz),
-		(long)(phdr[i].p_vaddr + off + phdr[i].p_memsz - 1));
+	    printf("\n\t(bss: 0x%lx-0x%lx) vaddr 0x%lx  size 0x%lx clearing\n",
+		   (long)(phdr[i].p_vaddr + off + phdr[i].p_filesz),
+		   (long)(phdr[i].p_vaddr + off + phdr[i].p_memsz - 1),
+		   (long)(phdr[i].p_vaddr + phdr[i].p_filesz),
+		   (long)(phdr[i].p_memsz - phdr[i].p_filesz) );
 #endif
 
 	    kern_bzero(phdr[i].p_vaddr + off + phdr[i].p_filesz,
diff --git a/sys/boot/userboot/ficl/Makefile b/sys/boot/userboot/ficl/Makefile
index 42b9309..d7818b9 100644
--- a/sys/boot/userboot/ficl/Makefile
+++ b/sys/boot/userboot/ficl/Makefile
@@ -62,6 +62,10 @@ softcore.c: ${SOFTWORDS} softcore.awk
 	(cd ${.CURDIR}/../../ficl/softwords; cat ${SOFTWORDS} \
 	    | awk -f softcore.awk -v datestamp="`LC_ALL=C date`") > ${.TARGET}
 
+beforedepend ${OBJS}: no-machine
+
+no-machine:
+	rm -f   ${.CURDIR}/../../ficl/machine
 #.if ${MACHINE_CPUARCH} == "amd64"
 #${SRCS:M*.c:R:S/$/.o/g}: machine
 #
diff --git a/sys/boot/userboot/test/test.c b/sys/boot/userboot/test/test.c
index 36258a7..77202c1 100644
--- a/sys/boot/userboot/test/test.c
+++ b/sys/boot/userboot/test/test.c
@@ -376,6 +376,12 @@ test_getenv(void *arg, int idx)
 	return (vars[idx]);
 }
 
+static int	/* stub: the test harness has no smap to hand back */
+test_buildsmap(void *arg, void **smap_void, size_t *outlen)
+{
+	return (1);	/* non-zero so bios_addsmapdata() falls back to getmem */
+}
+
 struct loader_callbacks cb = {
 	.putc = test_putc,
 	.getc = test_getc,
@@ -405,6 +411,7 @@ struct loader_callbacks cb = {
         .getmem = test_getmem,
 
 	.getenv = test_getenv,
+	.buildsmap = test_buildsmap,
 };
 
 void
@@ -464,5 +471,5 @@ main(int argc, char** argv)
 	term.c_lflag &= ~(ICANON|ECHO);
 	tcsetattr(0, TCSAFLUSH, &term);
 
-	func(&cb, NULL, USERBOOT_VERSION_3, disk_fd >= 0);
+	func(&cb, NULL, USERBOOT_VERSION_4, disk_fd >= 0);
 }
diff --git a/sys/boot/userboot/userboot.h b/sys/boot/userboot/userboot.h
index e38927e..0a9d2f1 100644
--- a/sys/boot/userboot/userboot.h
+++ b/sys/boot/userboot/userboot.h
@@ -32,6 +32,7 @@
 #define	USERBOOT_VERSION_1      1
 #define	USERBOOT_VERSION_2      2
 #define	USERBOOT_VERSION_3      3
+#define	USERBOOT_VERSION_4      4
 
 /*
  * Exit codes from the loader
@@ -195,4 +196,11 @@ struct loader_callbacks {
 	 * each invocation will add 1 to the previous value of 'num'.
 	 */
 	const char *	(*getenv)(void *arg, int num);
+
+	/*
+	 * Build the system smap.
+	 * This lets kload pass back a copy of the running system's smap;
+	 * a non-zero return means no smap was built.
+	 */
+	int (*buildsmap)(void *arg, void **smap, size_t *len);
 };
diff --git a/sys/boot/userboot/userboot/bootinfo64.c b/sys/boot/userboot/userboot/bootinfo64.c
index fc7c14d..28c47ab 100644
--- a/sys/boot/userboot/userboot/bootinfo64.c
+++ b/sys/boot/userboot/userboot/bootinfo64.c
@@ -187,33 +187,46 @@ bios_addsmapdata(struct preloaded_file *kfp)
 {
 	uint64_t lowmem, highmem;
 	int smapnum, len;
-	struct smap smap[3], *sm;
+	struct smap *smap = NULL, *sm = NULL;
+	int error = 1;
 
-	CALLBACK(getmem, &lowmem, &highmem);
+	printf("%s\n",__FUNCTION__);
 
-	sm = &smap[0];
+	if (callbacks->buildsmap)
+		error = callbacks->buildsmap(NULL, (void **)&smap, &len);
 
-	sm->base = 0;				/* base memory */
-	sm->length = 640 * 1024;
-	sm->type = SMAP_TYPE_MEMORY;
-	sm++;
+	/* either there is no buildsmap function or it failed
+	 * revert back to using getmem and a simple smap
+	 */
 
-	sm->base = 0x100000;			/* extended memory */
-	sm->length = lowmem - 0x100000;
-	sm->type = SMAP_TYPE_MEMORY;
-	sm++;
+	if (error) {
+		smap = sm = malloc(3 * sizeof(struct smap));
+		CALLBACK(getmem, &lowmem, &highmem);
 
-	smapnum = 2;
+		sm->base = 0;				/* base memory */
+		sm->length = 640 * 1024;
+		sm->type = SMAP_TYPE_MEMORY;
+		sm++;
 
-        if (highmem != 0) {
-                sm->base = 4 * GB;
-                sm->length = highmem;
-                sm->type = SMAP_TYPE_MEMORY;
-		smapnum++;
-        }
+		sm->base = 0x100000;			/* extended memory */
+		sm->length = lowmem - 0x100000;
+		sm->type = SMAP_TYPE_MEMORY;
+		sm++;
 
-        len = smapnum * sizeof (struct smap);
-        file_addmetadata(kfp, MODINFOMD_SMAP, len, &smap[0]);
+		smapnum = 2;
+
+		if (highmem != 0) {
+			sm->base = 4 * GB;
+			sm->length = highmem;
+			sm->type = SMAP_TYPE_MEMORY;
+			smapnum++;
+		}
+
+		len = smapnum * sizeof (struct smap);
+	}
+
+	file_addmetadata(kfp, MODINFOMD_SMAP, len, smap);
+	free(smap);
 }
 
 /*
diff --git a/sys/boot/userboot/userboot/conf.c b/sys/boot/userboot/userboot/conf.c
index 0c57eba..d2c1067 100644
--- a/sys/boot/userboot/userboot/conf.c
+++ b/sys/boot/userboot/userboot/conf.c
@@ -86,8 +86,11 @@ struct file_format *file_formats[] = {
  * data structures from bootstrap.h as well.
  */
 extern struct console userboot_console;
+extern struct console comconsole;
+
 
 struct console *consoles[] = {
 	&userboot_console,
+	&comconsole,
 	NULL
 };
diff --git a/sys/boot/userboot/userboot/main.c b/sys/boot/userboot/userboot/main.c
index 4092b9b..0e2e0b7 100644
--- a/sys/boot/userboot/userboot/main.c
+++ b/sys/boot/userboot/userboot/main.c
@@ -36,8 +36,9 @@ __FBSDID("$FreeBSD$");
 #include "disk.h"
 #include "libuserboot.h"
 
-#define	USERBOOT_VERSION	USERBOOT_VERSION_3
+#define	USERBOOT_VERSION	USERBOOT_VERSION_4
 
+static char malloc_buf[512*1024];
 struct loader_callbacks *callbacks;
 void *callbacks_arg;
 
@@ -67,31 +68,47 @@ exit(int v)
 }
 
 void
+loader_init(void)
+{
+	/*
+	 * Point the allocator at the static heap.  Re-calling is harmless:
+	 * it only sets global pointers to the same fixed buffer.
+	 */
+	setheap((void *)malloc_buf, (void *)(malloc_buf + sizeof(malloc_buf)));
+}
+
+int
 loader_main(struct loader_callbacks *cb, void *arg, int version, int ndisks)
 {
-	static char malloc[512*1024];
 	const char *var;
 	int i;
-
-        if (version != USERBOOT_VERSION)
-                abort();
-
-	callbacks = cb;
-        callbacks_arg = arg;
-	userboot_disk_maxunit = ndisks;
-
+	
 	/*
 	 * initialise the heap as early as possible.  Once this is done,
 	 * alloc() is usable. The stack is buried inside us, so this is
 	 * safe.
 	 */
-	setheap((void *)malloc, (void *)(malloc + 512*1024));
+	loader_init();
+  
+	if (cb != NULL) {
+		callbacks = cb;
+		callbacks_arg = arg;
+		userboot_disk_maxunit = ndisks;
+	} else {
+		return (EFAULT);
+	}
 
-        /*
+	/*
          * Hook up the console
          */
 	cons_probe();
 
+        if (version != USERBOOT_VERSION) {
+		printf("%s: version expected %d got %d\n", __func__,
+		      USERBOOT_VERSION, version);
+		return(EOPNOTSUPP);
+	}
+
 	/*
 	 * March through the device switch probing for things.
 	 */
@@ -128,11 +145,11 @@ loader_main(struct loader_callbacks *cb, void *arg, int version, int ndisks)
 	extract_currdev();
 
 	if (setjmp(jb))
-		return;
+		return (0);
 
 	interact();			/* doesn't return */
 
-	exit(0);
+	return(0);
 }
 
 /*
diff --git a/sys/boot/userboot/userboot/userboot_cons.c b/sys/boot/userboot/userboot/userboot_cons.c
index 5ecb7c8..5a9a573 100644
--- a/sys/boot/userboot/userboot/userboot_cons.c
+++ b/sys/boot/userboot/userboot/userboot_cons.c
@@ -50,6 +50,18 @@ struct console userboot_console = {
 	userboot_cons_poll,
 };
 
+
+struct console comconsole = {	/* second console entry sharing the userboot callbacks */
+	"comconsole",
+	"comconsole",		/* was misspelled "comsonsole" */
+	0,
+	userboot_cons_probe,
+	userboot_cons_init,
+	userboot_cons_putchar,
+	userboot_cons_getchar,
+	userboot_cons_poll,
+};
+
 static void
 userboot_cons_probe(struct console *cp)
 {
diff --git a/sys/conf/files b/sys/conf/files
index 5554ec0..49de90a 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2607,6 +2607,7 @@ kern/kern_khelp.c		standard
 kern/kern_kthread.c		standard
 kern/kern_ktr.c			optional ktr
 kern/kern_ktrace.c		standard
+kern/kern_kload.c		standard
 kern/kern_linker.c		standard
 kern/kern_lock.c		standard
 kern/kern_lockf.c		standard
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index b6a474e..2447c7bd 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -93,6 +93,9 @@ acpi_wakedata.h			optional	acpi			\
 	no-obj no-implicit-rule	before-depend				\
 	clean		"acpi_wakedata.h"
 #
+amd64/amd64/kload_exec.S	standard
+amd64/amd64/kload.c             standard
+#
 amd64/amd64/amd64_mem.c		optional	mem
 #amd64/amd64/apic_vector.S	standard
 amd64/amd64/atomic.c		standard
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 68c24e0..f81a05f 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -3,7 +3,7 @@
  *
  * DO NOT EDIT-- this file is automatically generated.
  * $FreeBSD$
- * created from FreeBSD: head/sys/kern/syscalls.master 242958 2012-11-13 12:52:31Z kib 
+ * created from FreeBSD
  */
 
 #include "opt_compat.h"
@@ -567,4 +567,5 @@ struct sysent sysent[] = {
 	{ AS(posix_fallocate_args), (sy_call_t *)sys_posix_fallocate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 530 = posix_fallocate */
 	{ AS(posix_fadvise_args), (sy_call_t *)sys_posix_fadvise, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 531 = posix_fadvise */
 	{ AS(wait6_args), (sy_call_t *)sys_wait6, AUE_WAIT6, NULL, 0, 0, 0, SY_THR_STATIC },	/* 532 = wait6 */
+	{ AS(kload_args), (sy_call_t *)sys_kload, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 533 = kload */
 };
diff --git a/sys/kern/kern_kload.c b/sys/kern/kern_kload.c
new file mode 100644
index 0000000..636830e
--- /dev/null
+++ b/sys/kern/kern_kload.c
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2011 - 2012
+ *	Russell Cattelan Digital Elves Inc
+ * Copyright (c) 2011 - 2012
+ *	Isilon Systems, LLC.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/kload.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
+#include <machine/segments.h>
+
+#include <vm/vm_param.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_map.h>
+
+
+static struct kload_items *k_items = NULL;
+static MALLOC_DEFINE(M_KLOAD, "kload_items", "kload items");
+int kload_ready = 0;
+
+static vm_offset_t kload_image_va = 0;
+/*
+ * Warning: this size is somewhat arbitrary, but it should go
+ * away once the allocation delays in kmem_alloc_attr are
+ * fixed.
+ */
+#define	IMAGE_PREALLOC	(24 * 1024 * 1024)
+
+static void kload_init(void);
+SYSINIT(kload_mem, SI_SUB_DRIVERS, SI_ORDER_ANY, kload_init, NULL);
+
+static int kload_copyin_segment(struct kload_segment *,int);
+static int kload_add_page(struct kload_items *, unsigned long);
+static void kload_shutdown_final(void *, int);
+static struct region_descriptor *mygdt;
+static	vm_offset_t control_page;
+static	vm_offset_t code_page;
+static 	void *gdt_desc;
+static pt_entry_t *pgtbl;
+unsigned long kload_pgtbl;
+static unsigned long max_addr = 0 , min_addr = 0;
+
+#define GIGMASK			(~((1<<30)-1))
+#define	ONEGIG			(1<<30)
+#define	GUEST_GDTR_LIMIT	(3 * 8 - 1)
+
+extern char kernphys[];
+#define	KLOADBASE		KERNBASE
+
+static void
+update_max_min(vm_offset_t addr, int count)
+{
+	vm_paddr_t pa;
+
+	/* Widen [min_addr, max_addr] to cover each page's physical address. */
+	for (; count > 0; count--, addr += PAGE_SIZE) {
+		pa = vtophys(addr);
+		min_addr = (pa < min_addr) ? pa : min_addr;
+		max_addr = (pa > max_addr) ? pa : max_addr;
+	}
+}
+
+/*
+ * Allocate zeroed kernel memory from physical pages below 1GB and fold
+ * those pages into the min_addr/max_addr bookkeeping.  Returns 0 on failure.
+ */
+static vm_offset_t
+kload_kmem_alloc(vm_map_t map, vm_size_t size)
+{
+	vm_offset_t va;
+
+	va = kmem_alloc_attr(map, size, M_WAITOK | M_ZERO,
+	    0, (1 << 30) /* 1Gig limit */, VM_MEMATTR_WRITE_COMBINING);
+	/* A failed allocation has no pages to account for. */
+	if (va != 0)
+		update_max_min(va, roundup2(size, PAGE_SIZE) >> PAGE_SHIFT);
+	return (va);
+}
+
+struct kload_cpage {			/* control page header; kload_exec.S reads it by fixed offset */
+	unsigned long kcp_magic;	/* 0 */
+	unsigned long kcp_modulep;	/* 1: read as 8(%r9) */
+	unsigned long kcp_physfree;	/* 2: read as 16(%r9) */
+	unsigned long kcp_gdt;		/* 3: read as 24(%r9) */
+	unsigned long kcp_pgtbl;	/* 4: read as 32(%rcx), loaded into %cr3 */
+	unsigned long kcp_cp;		/* 5: read as 40(%rcx) */
+	unsigned long kcp_entry_pt;	/* 6: read as 48(%r9) */
+	unsigned long kcp_idt;		/* 7: read as 56(%r9) */
+} __packed;
+
+/*
+ * Append item_m to the entry list, chaining in a fresh page through a
+ * KLOAD_INDIRECT entry when the current one is full; ENOMEM if that fails.
+ */
+static int
+kload_add_page(struct kload_items *items, unsigned long item_m)
+{
+	vm_paddr_t phys;
+	unsigned long va;
+
+	if (*items->item != 0) {
+		printf(" item != 0 0x%lx\n",*items->item);
+		items->item++;
+		items->i_count--;
+	}
+	if ((items->item == items->last_item) || (items->i_count == 0)) {
+		/* out of space in current page grab a new one */
+		va = (unsigned long)kload_kmem_alloc(kernel_map, PAGE_SIZE);
+		if (va == 0)
+			return (ENOMEM);
+		if (items->head_va == 0)
+			items->head_va = va;
+		/* store the address of the indirect page, then move onto it */
+		phys = vtophys(va);
+		*items->item = (unsigned long)(phys + KLOADBASE) | KLOAD_INDIRECT;
+		items->item = (unsigned long *)va;
+		items->last_item = items->item +
+		    ((PAGE_SIZE / sizeof(unsigned long)) - 1);
+		items->i_count = ((PAGE_SIZE / sizeof(unsigned long)) - 1);
+	}
+	*items->item = item_m;
+	items->item++;
+	items->i_count--;
+	return (0);
+}
+
+/* SYSINIT hook: preallocate IMAGE_PREALLOC bytes for the kernel image. */
+static void
+kload_init(void)
+{
+	kload_image_va = kload_kmem_alloc(kernel_map, IMAGE_PREALLOC);
+	printf("%s 0x%lx preallocated size %d\n", __func__, kload_image_va,
+	    IMAGE_PREALLOC);
+}
+
+/*
+ * Copy one segment in from userland and queue each of its pages as a
+ * KLOAD_SOURCE entry.  Sizes are kept in vm_size_t so k_memsz cannot wrap.
+ */
+static int
+kload_copyin_segment(struct kload_segment *khdr, int seg)
+{
+	vm_offset_t va = kload_image_va;
+	vm_size_t off, size;
+	int error;
+
+	size = roundup2((vm_size_t)khdr->k_memsz, PAGE_SIZE);
+	/* check to make sure the preallocated space is big enough */
+	if (va && size > IMAGE_PREALLOC) {
+		printf("%s size over 24Meg %lu\n", __func__, (unsigned long)size);
+		kmem_free(kernel_map, va, IMAGE_PREALLOC);
+		/* Forget the freed region so it is never reused or freed again. */
+		kload_image_va = va = 0;
+	}
+	if (va == 0) {
+		va = kload_kmem_alloc(kernel_map, size);
+		if (va == 0)
+			return (ENOMEM);
+	}
+
+	/*  need to set up a START dst page */
+	for (off = 0; off < size; off += PAGE_SIZE) {
+		kload_add_page(k_items,
+		    (vtophys(va + off) + KLOADBASE) | KLOAD_SOURCE);
+	}
+	printf("%s starting copyin... ", __func__);
+	*k_items->item = KLOAD_DONE;
+	if ((error = copyin(khdr->k_buf, (void *)va, khdr->k_memsz)) != 0)
+		return (error);
+	printf("copied %d bytes to va %p done marker at %p\n",
+	       (int)khdr->k_memsz, (void *)va, k_items->item);
+	return (0);
+}
+
+/* kload system call: stage a new kernel image; optionally exec or reboot. */
+int
+sys_kload(struct thread *td, struct kload_args *uap)
+{
+	struct region_descriptor *null_idt;
+	struct kload_cpage *k_cpage;
+	struct kload kld;
+	int error, i;
+	size_t bufsize = uap->buflen;
+
+	error = priv_check(td, PRIV_REBOOT);
+	if (error)
+		return (error);
+
+	/*
+	 * Hook kload_shutdown_final into the shutdown/reboot path so it runs
+	 * before cpu reset.  XXX: a handler is registered on every call.
+	 */
+	EVENTHANDLER_REGISTER(shutdown_final, kload_shutdown_final,
+	    NULL, SHUTDOWN_PRI_KLOAD);
+
+	max_addr = 0;
+	min_addr = ~0UL;
+
+	if (bufsize != sizeof(struct kload)) {
+		printf("Hmm size not right %ju %ju\n", (uintmax_t)bufsize,
+		    (uintmax_t)sizeof(struct kload));
+		return (EINVAL);
+	}
+	if ((error = copyin(uap->buf, &kld, bufsize)) != 0)
+		return (error);
+
+	if (k_items == NULL) {
+		if((k_items = malloc(sizeof(struct kload_items),
+				     M_KLOAD, M_WAITOK|M_ZERO)) == NULL)
+			return (ENOMEM);
+
+		k_items->head = 0;
+		k_items->head_va = 0;
+		k_items->item = &k_items->head;
+		k_items->last_item = &k_items->head;
+	}
+
+	control_page = kload_kmem_alloc(kernel_map, PAGE_SIZE * 2);
+	k_cpage = (struct kload_cpage *)control_page;
+	code_page = control_page + PAGE_SIZE;
+
+	printf("copy from %p kernel_kump to 0x%lx size %d\n",
+	       relocate_kernel, (unsigned long)code_page, relocate_kernel_size);
+	memset((void *)control_page, 0, PAGE_SIZE * 2);
+	memcpy((void *)code_page, relocate_kernel, relocate_kernel_size);
+
+	k_cpage->kcp_magic = 0xC0DE;
+	k_cpage->kcp_modulep = kld.k_modulep;
+	k_cpage->kcp_physfree = kld.k_physfree;
+
+	mygdt = (struct region_descriptor *)kload_kmem_alloc(kernel_map,
+	    PAGE_SIZE);
+	k_cpage->kcp_gdt = (unsigned long)vtophys(mygdt) + KLOADBASE;
+
+	gdt_desc = (char *)mygdt + sizeof(struct region_descriptor);
+	setup_freebsd_gdt(gdt_desc);
+	mygdt->rd_limit = GUEST_GDTR_LIMIT;
+	mygdt->rd_base = (unsigned long)(vtophys(gdt_desc) + KLOADBASE);
+
+	/*
+	 * we pass the virt addr of control_page but we need
+	 * new virt addr as well
+	 */
+	k_cpage->kcp_cp = (unsigned long)(vtophys(control_page) + KLOADBASE);
+	k_cpage->kcp_entry_pt = kld.k_entry_pt;
+
+	/* khdr[] has 10 slots. NOTE(review): copyin_segment errors unchecked */
+	for (i = 0; i < kld.num_hdrs && i < 10; i++)
+		kload_copyin_segment(&kld.khdr[i], i);
+
+	null_idt = (struct region_descriptor*)
+	    kload_kmem_alloc(kernel_map,PAGE_SIZE);
+	k_cpage->kcp_idt = (unsigned long)vtophys(null_idt) + KLOADBASE;
+	/* Wipe the IDT. */
+	null_idt->rd_limit = 0;
+	null_idt->rd_base = 0;
+	/*
+	 * This must be built after all other allocations so it can
+	 * build a page table entry based on min max addresses
+	 */
+	/* returns new page table phys addr */
+	pgtbl = kload_build_page_table();
+	if (pgtbl == NULL)
+		return (ENOMEM);
+	kload_pgtbl = (unsigned long)pgtbl;
+	k_cpage->kcp_pgtbl = (unsigned long)pgtbl;
+
+	kload_ready = 1;
+
+	if (bootverbose)
+		printf("%s:\n\t"
+		       "head_va         0x%lx (phys 0x%lx)\n\t"
+		       "kernbase        0x%lx\n\t"
+		       "code_page       0x%lx (phys 0x%lx)\n\t"
+		       "control_page    0x%lx (phys 0x%lx)\n\t"
+		       "gdt             0x%lx (phys 0x%lx)\n\t"
+		       "idt             0x%lx (phys 0x%lx)\n\t"
+		       "k_entry_pt      0x%lx\n\t"
+		       "pgtbl                              (phys 0x%lx)\n\t"
+		       "max_addr                           (phys 0x%lx)\n\t"
+		       "min_addr                           (phys 0x%lx)\n\t"
+		       "modulep                            (phys 0x%lx)\n\t"
+		       "physfree                            (phys 0x%lx)\n",
+		       __func__,
+		       (unsigned long)k_items->head_va,
+		       (unsigned long)vtophys(k_items->head_va),
+		       (unsigned long)(KERNBASE + (vm_paddr_t)kernphys),
+		       (unsigned long)(control_page + PAGE_SIZE),
+		       (unsigned long)vtophys(control_page + PAGE_SIZE),
+		       (unsigned long)control_page,
+		       (unsigned long)vtophys(control_page),
+		       (unsigned long)mygdt,(unsigned long)vtophys(mygdt),
+		       (unsigned long)null_idt,(unsigned long)vtophys(null_idt),
+		       (unsigned long)kld.k_entry_pt,
+		       (unsigned long)pgtbl,
+		       (unsigned long)max_addr,
+		       (unsigned long)min_addr,
+		       (unsigned long)kld.k_modulep,
+		       (unsigned long)kld.k_physfree);
+
+	if(!(uap->flags & (KLOAD_EXEC | KLOAD_REBOOT)))
+		goto just_load;
+#if defined(SMP)
+	/*
+	 * Bind us to CPU 0 so that all shutdown code runs there.  Some
+	 * systems don't shutdown properly (i.e., ACPI power off) if we
+	 * run on another processor.
+	 */
+	printf("Binding process to cpu 0\n");
+	thread_lock(curthread);
+	sched_bind(curthread, 0);
+	thread_unlock(curthread);
+	KASSERT(PCPU_GET(cpuid) == 0, ("%s: not running on cpu 0", __func__));
+#endif
+	if(uap->flags & KLOAD_REBOOT) {
+		mtx_lock(&Giant);
+		kern_reboot(RB_KLOAD);
+		/* should not return */
+		mtx_unlock(&Giant);
+	}
+	/*
+	 * the reboot code will do a module shutdown so it is not
+	 * part of kload_shutdown_final but it needs to happen.
+	 * So in the case of exec run it here
+	 */
+	if (bootverbose)
+		printf("%s: module_shutdown\n", __func__);
+	kload_module_shutdown();
+	kload_shutdown_final(NULL, RB_KLOAD);
+just_load:
+	printf("%s: Kernel image loaded waiting for reboot\n", __func__);
+	return (0);
+}
+
+static void
+kload_shutdown_final(void *arg, int howto)
+{
+	int ret;
+	cpuset_t map;
+
+	/* Just to make sure we are on cpu 0 */
+	KASSERT(PCPU_GET(cpuid) == 0, ("%s: not running on cpu 0", __func__));
+	if (kload_ready) {
+		printf("%s: suspend APs\n",__FUNCTION__);
+		map = all_cpus;
+		/* we should be bound to cpu 0 at this point */
+		printf("%s  cpuid %d\n",__FUNCTION__,PCPU_GET(cpuid));
+		CPU_CLR(PCPU_GET(cpuid), &map);
+		CPU_NAND(&map, &stopped_cpus);
+		if (!CPU_EMPTY(&map)) {
+			printf("cpu_reset: Stopping other CPUs\n");
+			suspend_cpus(map);
+		}
+
+		if (bootverbose)
+			printf("%s: clear all handlers\n", __func__);
+		intr_clear_all_handlers();
+
+		if (bootverbose)
+			printf("%s: loapic_clear_lapic\n", __func__);
+		lapic_clear_lapic(1);
+
+		intr_suspend();
+
+		if (bootverbose)
+			printf("%s disable_interrupts cpuid %d\n",
+			    __func__, PCPU_GET(cpuid));
+		disable_intr();
+
+		printf("calling relocate_kernel\n");
+		ret = relocate_kernel(vtophys(k_items->head_va) + KLOADBASE,
+				      /* dest addr i.e. overwrite existing kernel */
+				      KERNBASE + (vm_paddr_t)kernphys,
+				      vtophys(code_page) + KLOADBASE,
+				      control_page);
+		/* currently this will never happen */
+		printf("\trelocate_new_kernel returned %d\n",ret);
+	} else {
+		printf("kload_shutdown_final called without "
+		    "a new kernel loaded\n");
+	}
+}
diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c
index b769320..2a880c5 100644
--- a/sys/kern/kern_module.c
+++ b/sys/kern/kern_module.c
@@ -64,6 +64,7 @@ static TAILQ_HEAD(modulelist, module) modules;
 struct sx modules_sx;
 static int nextid = 1;
 static void module_shutdown(void *, int);
+void kload_module_shutdown(void);
 
 static int
 modevent_nop(module_t mod, int what, void *arg)
@@ -107,6 +108,12 @@ module_shutdown(void *arg1, int arg2)
 }
 
 void
+kload_module_shutdown(void) {
+	module_shutdown(NULL, 0);
+}
+
+
+void
 module_register_init(const void *arg)
 {
 	const moduledata_t *data = (const moduledata_t *)arg;
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 96f2400..6edd7fa 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -3,7 +3,7 @@
  *
  * DO NOT EDIT-- this file is automatically generated.
  * $FreeBSD$
- * created from FreeBSD: head/sys/kern/syscalls.master 242958 2012-11-13 12:52:31Z kib 
+ * created from FreeBSD
  */
 
 const char *syscallnames[] = {
@@ -540,4 +540,5 @@ const char *syscallnames[] = {
 	"posix_fallocate",			/* 530 = posix_fallocate */
 	"posix_fadvise",			/* 531 = posix_fadvise */
 	"wait6",			/* 532 = wait6 */
+	"kload",			/* 533 = kload */
 };
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 148dea3..eb2b648 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -955,5 +955,7 @@
 				    int *status, int options, \
 				    struct __wrusage *wrusage, \
 				    siginfo_t *info); }
+533	AUE_NULL	STD 	{ int kload(const void *buf, size_t buflen, \
+				  int flags); }
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
index c755f92..c48057a 100644
--- a/sys/kern/systrace_args.c
+++ b/sys/kern/systrace_args.c
@@ -3286,6 +3286,15 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
 		*n_args = 6;
 		break;
 	}
+	/* kload */
+	case 533: {
+		struct kload_args *p = params;
+		uarg[0] = (intptr_t) p->buf; /* const void * */
+		uarg[1] = p->buflen; /* size_t */
+		iarg[2] = p->flags; /* int */
+		*n_args = 3;
+		break;
+	}
 	default:
 		*n_args = 0;
 		break;
@@ -8745,6 +8754,22 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
 			break;
 		};
 		break;
+	/* kload */
+	case 533:
+		switch(ndx) {
+		case 0:
+			p = "const void *";
+			break;
+		case 1:
+			p = "size_t";
+			break;
+		case 2:
+			p = "int";
+			break;
+		default:
+			break;
+		};
+		break;
 	default:
 		break;
 	};
@@ -10638,6 +10663,11 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
 		if (ndx == 0 || ndx == 1)
 			p = "int";
 		break;
+	/* kload */
+	case 533:
+		if (ndx == 0 || ndx == 1)
+			p = "int";
+		break;
 	default:
 		break;
 	};
diff --git a/sys/sys/eventhandler.h b/sys/sys/eventhandler.h
index 6d37bf4..31aa4e4 100644
--- a/sys/sys/eventhandler.h
+++ b/sys/sys/eventhandler.h
@@ -173,6 +173,7 @@ typedef void (*shutdown_fn)(void *, int);
 #define	SHUTDOWN_PRI_FIRST	EVENTHANDLER_PRI_FIRST
 #define	SHUTDOWN_PRI_DEFAULT	EVENTHANDLER_PRI_ANY
 #define	SHUTDOWN_PRI_LAST	EVENTHANDLER_PRI_LAST
+#define	SHUTDOWN_PRI_KLOAD	(EVENTHANDLER_PRI_LAST - 100)
 
 EVENTHANDLER_DECLARE(shutdown_pre_sync, shutdown_fn);	/* before fs sync */
 EVENTHANDLER_DECLARE(shutdown_post_sync, shutdown_fn);	/* after fs sync */
diff --git a/sys/sys/kload.h b/sys/sys/kload.h
new file mode 100644
index 0000000..0920176
--- /dev/null
+++ b/sys/sys/kload.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2011 - 2012
+ *	Russell Cattelan Digital Elves Inc
+ * Copyright (c) 
+ *	Isilon Systems, LLC.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef __KLOAD_H__
+#define __KLOAD_H__
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#define KLOAD_LOAD		 0
+#define KLOAD_REBOOT		(1 << 0 )
+#define KLOAD_EXEC		(1 << 1 )
+
+struct kload_segment {
+	void		       *k_buf;
+	size_t			k_memsz;
+	unsigned long	       *k_pages;
+	unsigned long		k_seg_start;
+};
+
+struct kload {
+	struct kload_segment	khdr[10];
+	int			num_hdrs;
+	unsigned long		k_entry_pt;
+	unsigned int		k_modulep;
+	unsigned int		k_physfree;
+};
+
+//typedef u_long kload_item_t;
+#define KLOAD_DESTINATION  0x1
+#define KLOAD_INDIRECT     0x2
+#define KLOAD_DONE         0x4
+#define KLOAD_SOURCE       0x8
+
+struct kload_items {
+	unsigned long head;
+	vm_offset_t head_va;
+	unsigned long *last_item;
+	unsigned long *item;
+	int i_count;
+	unsigned long flags;  /* not used yet */
+};
+
+/*
+ * defined in <arch>/kload.c
+ */
+pt_entry_t * kload_build_page_table(void);
+void setup_freebsd_gdt(uint64_t *);
+void kload_module_shutdown(void);
+
+/*
+ * defined in <arch>/kload_exec.S
+ */
+unsigned long relocate_kernel(unsigned long indirection_page,
+    unsigned long page_list, unsigned long code_page,
+    unsigned long control_page);
+extern int relocate_kernel_size;
+
+#endif
diff --git a/sys/sys/reboot.h b/sys/sys/reboot.h
index 6b8e25e..9b70160 100644
--- a/sys/sys/reboot.h
+++ b/sys/sys/reboot.h
@@ -59,6 +59,7 @@
 #define	RB_RESERVED1	0x40000	/* reserved for internal use of boot blocks */
 #define	RB_RESERVED2	0x80000	/* reserved for internal use of boot blocks */
 #define	RB_PAUSE	0x100000 /* pause after each output line during probe */
+#define	RB_KLOAD	0x200000 /* reboot using kload'ed kernel image */
 #define	RB_MULTIPLE	0x20000000	/* use multiple consoles */
 
 #define	RB_BOOTINFO	0x80000000	/* have `struct bootinfo *' arg */
diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h
index 6b0cd67..f90cad2 100644
--- a/sys/sys/syscall.h
+++ b/sys/sys/syscall.h
@@ -3,7 +3,7 @@
  *
  * DO NOT EDIT-- this file is automatically generated.
  * $FreeBSD$
- * created from FreeBSD: head/sys/kern/syscalls.master 242958 2012-11-13 12:52:31Z kib 
+ * created from FreeBSD
  */
 
 #define	SYS_syscall	0
@@ -452,4 +452,5 @@
 #define	SYS_posix_fallocate	530
 #define	SYS_posix_fadvise	531
 #define	SYS_wait6	532
-#define	SYS_MAXSYSCALL	533
+#define	SYS_kload	533
+#define	SYS_MAXSYSCALL	534
diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk
index 25f0470..39fd05b 100644
--- a/sys/sys/syscall.mk
+++ b/sys/sys/syscall.mk
@@ -1,7 +1,7 @@
 # FreeBSD system call names.
 # DO NOT EDIT-- this file is automatically generated.
 # $FreeBSD$
-# created from FreeBSD: head/sys/kern/syscalls.master 242958 2012-11-13 12:52:31Z kib 
+# created from FreeBSD
 MIASM =  \
 	syscall.o \
 	exit.o \
@@ -400,4 +400,5 @@ MIASM =  \
 	rctl_remove_rule.o \
 	posix_fallocate.o \
 	posix_fadvise.o \
-	wait6.o
+	wait6.o \
+	kload.o
diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h
index ef59ad5..81dac15 100644
--- a/sys/sys/sysproto.h
+++ b/sys/sys/sysproto.h
@@ -3,7 +3,7 @@
  *
  * DO NOT EDIT-- this file is automatically generated.
  * $FreeBSD$
- * created from FreeBSD: head/sys/kern/syscalls.master 242958 2012-11-13 12:52:31Z kib 
+ * created from FreeBSD
  */
 
 #ifndef _SYS_SYSPROTO_H_
@@ -1762,6 +1762,11 @@ struct wait6_args {
 	char wrusage_l_[PADL_(struct __wrusage *)]; struct __wrusage * wrusage; char wrusage_r_[PADR_(struct __wrusage *)];
 	char info_l_[PADL_(siginfo_t *)]; siginfo_t * info; char info_r_[PADR_(siginfo_t *)];
 };
+struct kload_args {
+	char buf_l_[PADL_(const void *)]; const void * buf; char buf_r_[PADR_(const void *)];
+	char buflen_l_[PADL_(size_t)]; size_t buflen; char buflen_r_[PADR_(size_t)];
+	char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
+};
 int	nosys(struct thread *, struct nosys_args *);
 void	sys_sys_exit(struct thread *, struct sys_exit_args *);
 int	sys_fork(struct thread *, struct fork_args *);
@@ -2144,6 +2149,7 @@ int	sys_rctl_remove_rule(struct thread *, struct rctl_remove_rule_args *);
 int	sys_posix_fallocate(struct thread *, struct posix_fallocate_args *);
 int	sys_posix_fadvise(struct thread *, struct posix_fadvise_args *);
 int	sys_wait6(struct thread *, struct wait6_args *);
+int	sys_kload(struct thread *, struct kload_args *);
 
 #ifdef COMPAT_43
 
@@ -2840,6 +2846,7 @@ int	freebsd7_shmctl(struct thread *, struct freebsd7_shmctl_args *);
 #define	SYS_AUE_posix_fallocate	AUE_NULL
 #define	SYS_AUE_posix_fadvise	AUE_NULL
 #define	SYS_AUE_wait6	AUE_WAIT6
+#define	SYS_AUE_kload	AUE_NULL
 
 #undef PAD_
 #undef PADL_
diff --git a/sys/x86/x86/intr_machdep.c b/sys/x86/x86/intr_machdep.c
index 31cc80b..eee7678 100644
--- a/sys/x86/x86/intr_machdep.c
+++ b/sys/x86/x86/intr_machdep.c
@@ -197,6 +197,37 @@ intr_add_handler(const char *name, int vector, driver_filter_t filter,
 }
 
 int
+intr_clear_all_handlers(void)
+{
+	int i;
+	struct intsrc *isrc;
+	/* Drop one handler count per source; disable sources that reach zero. */
+	mtx_lock(&intr_table_lock);
+	for (i = 0; i < NUM_IO_INTS; i++) {
+		isrc = interrupt_sources[i];
+		if (isrc != NULL && isrc->is_handlers > 0) {
+			printf("%s:%d isrc[%d] %p is_handlers %d\n",
+			       __FUNCTION__,__LINE__,i,isrc,
+			       isrc->is_handlers);
+			isrc->is_handlers--;
+			if (isrc->is_handlers == 0) {
+				printf("\t dis_source %p dis_intr %p\n",
+				       isrc->is_pic->pic_disable_source,
+				       isrc->is_pic->pic_disable_intr);
+				isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI);
+				isrc->is_pic->pic_disable_intr(isrc);
+			}
+			intrcnt_updatename(isrc);
+			/* NOTE(review): sources with >1 handler stay enabled */
+		}
+	}
+	mtx_unlock(&intr_table_lock);
+	return 0;
+}
+
+
+
+int
 intr_remove_handler(void *cookie)
 {
 	struct intsrc *isrc;
diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c
index e994172..6593e8b 100644
--- a/sys/x86/x86/local_apic.c
+++ b/sys/x86/x86/local_apic.c
@@ -346,6 +346,60 @@ lapic_dump(const char* str)
 }
 
 void
+lapic_clear_lapic(u_int disable) {
+	/* Mask the local APIC LVT entries; lapic_disable() if `disable` is set. */
+	struct lapic *la;
+	la = &lapics[lapic_id()];
+
+	uint32_t value;
+
+	if (bootverbose)
+		printf("%s lapic_id(%d) cpu(%d) la %p lapic %p\n",__FUNCTION__,
+		       lapic_id(), PCPU_GET(cpuid), la, lapic);
+
+	/*
+	 * First we set the mask bit to keep any new interrupts from
+	 * arriving while allowing any pending interrupts to finish,
+	 * *THEN* set the registers to default values.
+	 * If the interrupts are not allowed to clear, a kload'ed / booted
+	 * kernel will see the old interrupts before the appropriate handlers
+	 * are in place and trigger a panic.
+	 */
+#ifdef notyet
+	/* this seems to be causing APIC error in the new kernel */
+	value = lapic->lvt_error;
+	value |= APIC_LVT_M;
+	lapic->lvt_error = value;
+#endif
+
+	value = lapic->lvt_timer;
+	value |= APIC_LVT_M;
+	lapic->lvt_timer = value;
+
+	value = lapic->lvt_lint0;
+	value |= APIC_LVT_M;
+	lapic->lvt_lint0 = value;
+
+	value = lapic->lvt_lint1;
+	value |= APIC_LVT_M;
+	lapic->lvt_lint1 = value;
+
+	value = lapic->lvt_pcint;
+	value |= APIC_LVT_M;
+	lapic->lvt_pcint = value;
+
+	/* Now overwrite the timer and LINT0/LINT1 LVT entries with masked values. */
+	lapic->lvt_timer = APIC_LVTT_M; /* masked */
+	lapic->lvt_lint0 = APIC_LVT_M; /* masked */
+	lapic->lvt_lint1 = APIC_LVT_M; /* masked */
+
+	if (disable) {
+		printf("\tlapic disable\n");
+		lapic_disable();
+	}
+}
+
+void
 lapic_setup(int boot)
 {
 	struct lapic *la;
@@ -924,7 +978,20 @@ lapic_handle_error(void)
 	lapic->esr = 0;
 	esr = lapic->esr;
 
-	printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr);
+	printf("CPU%d: local APIC error 0x%x\t", PCPU_GET(cpuid), esr);
+	if (lapic->esr & APIC_ESR_SEND_CS_ERROR)
+		printf("send_cs_error\n");
+	if (lapic->esr & APIC_ESR_RECEIVE_CS_ERROR)
+		printf("receive_cs_error\n");
+	if (lapic->esr & APIC_ESR_SEND_ACCEPT)
+		printf("send_accept\n");
+	if (lapic->esr & APIC_ESR_RECEIVE_ACCEPT)
+		printf("receive_accept\n");
+	if (lapic->esr & APIC_ESR_SEND_ILLEGAL_VECTOR)
+		printf("send_illegal_vector\n");
+	if (lapic->esr & APIC_ESR_ILLEGAL_REGISTER)
+		printf("illegal_register\n");
+
 	lapic_eoi();
 }
 
diff --git a/sys/x86/x86/nexus.c b/sys/x86/x86/nexus.c
index 9ead8c8..0b28465 100644
--- a/sys/x86/x86/nexus.c
+++ b/sys/x86/x86/nexus.c
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/intr_machdep.h>
 #include <sys/rman.h>
 #include <sys/interrupt.h>
+#include <sys/sysctl.h>
 
 #include <machine/vmparam.h>
 #include <vm/vm.h>
@@ -675,6 +676,52 @@ ram_probe(device_t dev)
 }
 
 static int
+smap_hdlr(SYSCTL_HANDLER_ARGS) {
+	/*
+	 * hw.smap handler: export the loader-provided BIOS system memory map
+	 * as an opaque blob.  SYSCTL_HANDLER_ARGS expands to:
+	 *   struct sysctl_oid *oidp, void *arg1,
+	 *   intptr_t arg2, struct sysctl_req *req
+	 */
+	struct bios_smap *smapbase;
+	caddr_t kmdp;
+	uint32_t smapsize = 0;
+
+	/* Retrieve the system memory map from the loader. */
+	kmdp = preload_search_by_type("elf kernel");
+	if (kmdp == NULL)
+		kmdp = preload_search_by_type(ELF_KERN_STR);
+	smapbase = NULL;
+	if (kmdp != NULL)
+		smapbase = (struct bios_smap *)preload_search_info(kmdp,
+		    MODINFO_METADATA | MODINFOMD_SMAP);
+	/* No kernel metadata or no SMAP record: report an empty map. */
+	if (smapbase == NULL)
+		goto out;
+
+	printf("%s smapbase %p\n",__FUNCTION__,smapbase);
+	/* The 32-bit word just before the data is read as its byte length. */
+	smapsize = *((u_int32_t *)smapbase - 1);
+
+#if 0
+	{
+		struct bios_smap *smap, *smapend;
+		smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
+		for (smap = smapbase; smap < smapend; smap++) {
+			printf("\ttype %d base 0x%lx length 0x%lx\n",
+			       smap->type,smap->base, smap->length);
+		}
+	}
+#endif
+
+out:
+	return (sysctl_handle_opaque(oidp, smapbase, smapsize, req));
+}
+SYSCTL_PROC(_hw, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
+	    0, sizeof(struct bios_smap), smap_hdlr, "S,smap",
+	    "Bios System Map");
+
+static int
 ram_attach(device_t dev)
 {
 	struct bios_smap *smapbase, *smap, *smapend;
diff --git a/usr.sbin/kload/Makefile b/usr.sbin/kload/Makefile
new file mode 100644
index 0000000..0d4a27a
--- /dev/null
+++ b/usr.sbin/kload/Makefile
@@ -0,0 +1,15 @@
+# $FreeBSD$
+
+PROG=	kload
+SRCS=	kload.c
+NO_MAN=
+
+#DPADD+=	${LIBVMMAPI}
+#LDADD+=	-lvmmapi
+
+WARNS?=	3
+
+CFLAGS+=-I${.CURDIR}/../../sys/boot/userboot
+CFLAGS+=-I${.CURDIR}/../../sys
+
+.include <bsd.prog.mk>
diff --git a/usr.sbin/kload/kload.c b/usr.sbin/kload/kload.c
new file mode 100644
index 0000000..51ac3b7
--- /dev/null
+++ b/usr.sbin/kload/kload.c
@@ -0,0 +1,748 @@
+/*
+ * Copyright (c) 2011 - 2012
+ *	Russell Cattelan Digital Elves Inc
+ * Copyright (c) 2011 - 2012
+ *	Isilon Systems, LLC.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * process kill code borrowed from halt.c
+ */
+
+#include <sys/param.h>
+#include <sys/kload.h>
+#include <sys/ioctl.h>
+#include <sys/module.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#include <sys/param.h>
+
+#include <dirent.h>
+#include <dlfcn.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <termios.h>
+#include <userboot.h>
+#include <unistd.h>
+
+char *host_base = "/";
+/* how can we get rid of these? I don't think we need them */
+
+struct termios term, oldterm;
+char *image;
+size_t image_size;
+size_t image_max_used = 0;
+int disk_fd = -1;
+uint64_t regs[16];
+uint64_t pc;
+static int k_execute = 0;
+static int k_reboot = 0;
+static void *dl_lib;
+typedef void *(*M_func)(size_t bytes, const char *file, int line);
+M_func Malloc_func;
+static void k_exit(void *, int);
+static int shutdown_processes(void);
+static u_int get_pageins(void);
+static int kload_load_image(void *image,unsigned long entry_pt);
+
+struct load_file {
+	int l_isdir;
+	size_t l_size;
+	struct stat l_stat;
+	union {
+		int fd;
+		DIR *dir;
+	} l_u;
+};
+
+struct smap {
+	uint64_t	base;
+	uint64_t	length;
+	uint32_t	type;
+} __packed;
+
+static int
+name2oid(char *name, int *oidp)
+{
+	int oid[2];
+	int i;
+	size_t j;
+
+	oid[0] = 0;
+	oid[1] = 3;
+
+	j = CTL_MAXNAME * sizeof(int);
+	i = sysctl(oid, 2, oidp, &j, name, strlen(name));
+	if (i < 0)
+		return (i);
+	j /= sizeof(int);
+
+	return (j);
+}
+
+static void
+k_putc(void *arg, int chr)
+{
+	write(1, &chr, 1);
+}
+
+/* Loader callback: read one byte from stdin; returns it, or -1 on failure. */
+static int
+k_getc(void *arg)
+{
+	unsigned char chr;	/* unsigned so byte 0xff is not mistaken for -1 */
+
+	return (read(0, &chr, 1) == 1 ? chr : -1);
+}
+
+static int
+k_poll(void *arg)
+{
+	int n;
+	if (ioctl(0, FIONREAD, &n) >= 0)
+		return (n > 0);
+	return 0;
+}
+
+/* Loader callback: open a host file or directory found under host_base. */
+static int
+k_open(void *arg, const char *filename, void **lf_ret)
+{
+	struct load_file *lf;
+	size_t len;
+	int error = EINVAL;
+	char path[PATH_MAX];
+
+	if (!host_base) {
+		printf("Host base not set\n");
+		return (ENOENT);
+	}
+
+	/* Build host_base + filename, dropping a trailing '/' from the base. */
+	len = strlcpy(path, host_base, PATH_MAX);
+	if (len > 0 && len < PATH_MAX && path[len - 1] == '/')
+		path[len - 1] = 0;
+	strlcat(path, filename, PATH_MAX);
+	lf = malloc(sizeof(struct load_file));
+	if (lf == NULL)
+		return (ENOMEM);
+	if (stat(path, &lf->l_stat) < 0) {
+		error = errno;
+		goto out;
+	}
+
+	lf->l_size = lf->l_stat.st_size;
+	if (S_ISDIR(lf->l_stat.st_mode)) {
+		lf->l_isdir = 1;
+		lf->l_u.dir = opendir(path);
+		if (!lf->l_u.dir)
+			goto out;
+		*lf_ret = lf;
+		return (0);
+	}
+	if (S_ISREG(lf->l_stat.st_mode)) {
+		lf->l_isdir = 0;
+		lf->l_u.fd = open(path, O_RDONLY);
+		if (lf->l_u.fd < 0)
+			goto out;
+		*lf_ret = lf;
+		return (0);
+	}
+	/* Neither a directory nor a regular file: fall through with EINVAL. */
+out:
+	free(lf);
+	return (error);
+}
+
+static int
+k_close(void *arg, void *h)
+{
+	struct load_file *lf = (struct load_file *)h;
+
+	if (lf->l_isdir)
+		closedir(lf->l_u.dir);
+	else
+		close(lf->l_u.fd);
+	free(lf);
+
+	return (0);
+}
+
+static int
+k_isdir(void *arg, void *h)
+{
+	return (((struct load_file *)h)->l_isdir);
+}
+
+static int
+k_read(void *arg, void *h, void *dst, size_t size, size_t *resid_return)
+{
+	struct load_file *lf = (struct load_file *)h;
+	ssize_t sz;
+
+	if (lf->l_isdir)
+		return (EINVAL);
+
+	if((sz = read(lf->l_u.fd, dst, size)) < 0)
+		return (EINVAL);
+	*resid_return = size - sz;
+	return (0);
+}
+
+static int
+k_readdir(void *arg, void *h, uint32_t *fileno_return, uint8_t *type_return,
+    size_t *namelen_return, char *name)
+{
+	struct load_file *lf = (struct load_file *)h;
+	struct dirent *dp;
+
+	if (!lf->l_isdir)
+		return (EINVAL);
+
+	dp = readdir(lf->l_u.dir);
+	if (!dp)
+		return (ENOENT);
+
+	/*
+	 * Note: d_namlen is in the range 0..255 and therefore less
+	 * than PATH_MAX so we don't need to test before copying.
+	 */
+	*fileno_return = dp->d_fileno;
+	*type_return = dp->d_type;
+	*namelen_return = dp->d_namlen;
+	memcpy(name, dp->d_name, dp->d_namlen);
+	name[dp->d_namlen] = 0;
+
+	return (0);
+}
+
+static int
+k_seek(void *arg, void *h, uint64_t offset, int whence)
+{
+	struct load_file *lf = (struct load_file *)h;
+
+	if (lf->l_isdir)
+		return (EINVAL);
+
+	if (lseek(lf->l_u.fd, offset, whence) < 0)
+		return (errno);
+
+	return (0);
+}
+
+static int
+k_stat(void *arg, void *h,
+       int *mode_return, int *uid_return,
+       int *gid_return, uint64_t *size_return)
+{
+
+	struct load_file *lf = (struct load_file *)h;
+
+	*mode_return = lf->l_stat.st_mode;
+	*uid_return = lf->l_stat.st_uid;
+	*gid_return = lf->l_stat.st_gid;
+	*size_return = lf->l_stat.st_size;
+	return (0);
+}
+
+static int
+k_diskread(void *arg, int unit, uint64_t offset, void *dst, size_t size,
+    size_t *resid_return)
+{
+	ssize_t n;
+
+	if (unit != 0 || disk_fd == -1)
+		return (EIO);
+	n = pread(disk_fd, dst, size, offset);
+	if (n < 0)
+		return (errno);
+	*resid_return = size - n;
+	return (0);
+}
+
+static int
+k_diskioctl(void *arg, int unit, u_long cmd, void *data)
+{
+	/* disk ioctls are not supported by kload */
+	return (ENOTTY);
+}
+
+/*
+ * Despite being the loader's "copyin" callback, this is not like the kernel's
+ * copyin / copyout: it copies the data pointed to by the "from" ptr into the
+ * load image at offset "to".
+ */
+static int
+k_copy_to_image(void *arg, const void *from, uint64_t to, size_t size)
+{
+	to &= 0x7fffffff;
+	if (to > image_size)
+		return (EFAULT);
+	if (to + size > image_size) {
+		size = image_size - to;
+		printf("WARNING this should never happen\n");
+	}
+	memcpy(&image[to], from, size);
+
+	if (to + size > image_max_used)
+		image_max_used = to + size;
+
+	return (0);
+}
+
+/*
+ * The loader's "copyout" callback: copy FROM the image at offset "from" to the
+ * memory pointed to by the "to" ptr.
+ */
+static int
+k_copy_from_image(void *arg, uint64_t from, void *to, size_t size)
+{
+	from &= 0x7fffffff;
+	if (from > image_size)
+		return (EFAULT);
+	if (from + size > image_size)
+		size = image_size - from;
+	memcpy(to, &image[from], size);
+
+	return (0);
+}
+
+static void
+k_setreg(void *arg, int r, uint64_t v)
+{
+	if (r < 0 || r >= 16)
+		return;
+	regs[r] = v;
+}
+
+static void
+k_setmsr(void *arg, int r, uint64_t v)
+{
+	/* Unneeded */
+}
+
+static void
+k_setcr(void *arg, int r, uint64_t v)
+{
+	/* Unneeded */
+}
+
+static void
+k_setgdt(void *arg, uint64_t v, size_t sz)
+{
+	/* Unneeded */
+}
+
+static void
+k_exec(void *arg, uint64_t entry_pt)
+{
+#ifdef DEBUG
+	printf("Execute at 0x%jx\n", entry_pt);
+	printf("image size max used %jd endof page %jd\n", image_max_used,
+	    roundup2(image_max_used, PAGE_SIZE));
+#endif
+	kload_load_image(image, entry_pt);
+	k_exit(arg, 0);
+}
+
+static void
+k_delay(void *arg, int usec)
+{
+	usleep(usec);
+}
+
+static void
+k_exit(void *arg, int v)
+{
+	tcsetattr(0, TCSAFLUSH, &oldterm);
+	exit(v);
+}
+
+static void
+k_getmem(void *arg, uint64_t *lowmem, uint64_t *highmem)
+{
+	int mib[2];
+	unsigned long long physmem;
+	size_t len;
+
+	mib[0] = CTL_HW;
+	mib[1] = HW_PHYSMEM;
+	len = sizeof(physmem);
+	sysctl(mib, 2, &physmem, &len, NULL, 0);
+
+	*lowmem = physmem;
+	*highmem = 0;
+
+	printf("%s:%d lowmem %ju highmem %ju\n",__FUNCTION__,__LINE__,
+	       *lowmem,
+	       *highmem
+		);
+}
+
+static const char *
+k_getenv(void *arg, int idx)
+{
+	static const char *vars[] = {
+		"foo=bar",
+		"bar=barbar",
+		NULL
+	};
+
+	return (vars[idx]);
+}
+
+/* Loader callback: return a Malloc'ed copy of the hw.smap sysctl data. */
+static int
+k_buildsmap(void *arg, void **smap_void, size_t *outlen)
+{
+	struct smap *smapbase;
+	size_t len = 0;
+	int miblen;
+	char name[] = "hw.smap";
+	int mib[CTL_MAXNAME];
+
+	miblen = name2oid(name, mib);
+	if (miblen < 0 || sysctl(mib, miblen, NULL, &len, NULL, 0) < 0) {
+		printf("kload failed to query hw.smap\n");
+		return 1;
+	}
+	/*
+	 * Use the malloc function from libstand/userboot.so since
+	 * bios_addsmapdata will free the memory using the libstand Free
+	 * so be careful not to use standard malloc here
+	 */
+	smapbase = Malloc_func(len, __FILE__, __LINE__);
+	if (!smapbase) {
+		printf("kload failed to allocate space for smap\n");
+		return 1;
+	}
+	if (sysctl(mib, miblen, smapbase, &len, NULL, 0) < 0)
+		return 1;	/* XXX leaks smapbase (no libstand Free here) */
+
+	*outlen = len;
+	*smap_void = smapbase;
+
+#ifdef DEBUG
+	{
+		struct smap *smap, *smapend;
+		smapend = (struct smap *)((uintptr_t)smapbase + len);
+		for (smap = smapbase; smap < smapend; smap++) {
+			printf("\ttype %d base 0x%016lx length 0x%016lx\n",
+			       smap->type, smap->base, smap->length);
+		}
+	}
+#endif
+
+	return 0;
+}
+
+struct loader_callbacks cb = {
+
+	.open = k_open,
+	.close = k_close,
+	.isdir = k_isdir,
+	.read = k_read,
+	.readdir = k_readdir,
+	.seek = k_seek,
+	.stat = k_stat,
+
+	.diskread = k_diskread,
+	.diskioctl = k_diskioctl,
+
+	.copyin = k_copy_to_image,
+	.copyout = k_copy_from_image,
+	.setreg = k_setreg,
+	.setmsr = k_setmsr,
+	.setcr = k_setcr,
+	.setgdt = k_setgdt,
+	.exec = k_exec,
+
+	.delay = k_delay,
+	.exit = k_exit,
+	.getmem = k_getmem,
+
+	.putc = k_putc,
+	.getc = k_getc,
+	.poll = k_poll,
+	.getenv = k_getenv,
+	.buildsmap = k_buildsmap,
+};
+
+static void
+usage(void)
+{
+	printf("usage: kload [-d <disk image path>] "
+	    "[-h <host filesystem path>] [-e | -r]\n");
+	exit(1);
+}
+
+/* kload entry point: parse options, then hand control to userboot's loader. */
+int
+main(int argc, char** argv)
+{
+	int (*loader_main)(struct loader_callbacks *, void *, int, int);
+	void (*loader_init)(void);
+	int (*setenv)(const char *, const char *, int);
+	int opt;
+	char *disk_image = NULL;
+	char karg[20];
+	char kval[128];
+
+	if (geteuid()) {
+		errno = EPERM;
+		err(1, NULL);
+	}
+
+	dl_lib = dlopen("/boot/userboot.so", RTLD_LAZY | RTLD_LOCAL);
+	if (!dl_lib) {
+		printf("%s\n", dlerror());
+		return (1);
+	}
+	loader_main = dlsym(dl_lib, "loader_main");
+	if (!loader_main) {
+		printf("%s\n", dlerror());
+		return (1);
+	}
+	Malloc_func = dlsym(dl_lib, "Malloc");
+	if (!Malloc_func) {
+		printf("%s\n", dlerror());
+		return (1);
+	}
+	/*
+	 * pull in the libstand setenv for setting name value pairs
+	 * in the kernel env page
+	 */
+	setenv = dlsym(dl_lib, "setenv");
+	if (!setenv) {
+		printf("%s\n", dlerror());
+		return (1);
+	}
+	loader_init = dlsym(dl_lib, "loader_init");
+	if (!loader_init) {
+		printf("%s\n", dlerror());
+		return (1);
+	}
+	/* call libstand setheap to init memory allocations */
+	loader_init();
+
+	while ((opt = getopt(argc, argv, "d:h:erk:")) != -1) {
+		switch (opt) {
+		case 'd':
+			disk_image = optarg;
+			break;
+		case 'h':
+			host_base = optarg;
+			break;
+		case 'e':
+			k_execute = 1;
+			break;
+		case 'r':
+			k_reboot = 1;
+			break;
+		case 'k':
+			/* field widths bound the writes to karg[20]/kval[128] */
+			if(sscanf(optarg,"%19[a-zA-Z_-]=%127s",karg,kval) == 2) {
+				printf("got value %s %s\n",karg,kval);
+				setenv(karg, kval, 1);
+			} else {
+				fprintf(stderr,"-k failure %s\n",optarg);
+			}
+			break;
+		case '?':
+			usage();
+		}
+	}
+
+	image_size = 128*1024*1024;
+	image = malloc(image_size);
+	if (image == NULL)
+		err(1, "Can't allocate %zu byte load image", image_size);
+	if (disk_image) {
+		disk_fd = open(disk_image, O_RDONLY);
+		if (disk_fd < 0)
+			err(1, "Can't open disk image '%s'", disk_image);
+	}
+
+	tcgetattr(0, &term);
+	oldterm = term;
+	term.c_iflag &= ~(ICRNL);
+	term.c_lflag &= ~(ICANON|ECHO);
+	tcsetattr(0, TCSAFLUSH, &term);
+
+	return(loader_main(&cb, NULL, USERBOOT_VERSION_4, disk_fd >= 0));
+}
+
+/*
+ * Describe the loaded kernel image in a struct kload and pass it to the
+ * kernel with the kload system call.
+ *
+ * image	base of the in-memory image populated by the loader
+ * entry_pt	kernel entry point, stored in the request unchanged
+ *
+ * The request always carries KLOAD_LOAD.  KLOAD_EXEC is added when
+ * k_execute is set and KLOAD_REBOOT when k_reboot is set; if both are
+ * set, reboot wins.  In the reboot case shutdown_processes() is run
+ * before the system call is made.
+ *
+ * Returns the result of syscall(SYS_kload, ...), or -1 with errno set to
+ * EINVAL if the image does not extend past kernphys.
+ */
+static int
+kload_load_image(void *image, unsigned long entry_pt)
+{
+	char *stack = (char *)image + 0x1000; /* PAGESIZE */
+	struct kload kld;
+	int flags = KLOAD_LOAD;
+	/*
+	 * This must be the same value as in sys/conf/ldscript.xxx.
+	 * This value was changed at one point when a new version
+	 * of binutils was imported. The value is aligned to the
+	 * max page size supported by the given processor.
+	 */
+	unsigned long kernphys = 0x200000;
+	unsigned long image_end = roundup2(image_max_used, PAGE_SIZE);
+
+	/*
+	 * Nothing was loaded above kernphys: the size computed below would
+	 * wrap around to a huge unsigned value, so refuse the request.
+	 */
+	if (image_end <= kernphys) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	/*
+	 * Zero the whole request so that fields and header slots not set
+	 * below do not carry indeterminate stack contents into the kernel.
+	 */
+	memset(&kld, 0, sizeof(kld));
+
+	kld.khdr[0].k_buf = &((char *)image)[kernphys];
+	kld.khdr[0].k_memsz = image_end - kernphys;
+	kld.k_entry_pt = entry_pt;
+	kld.num_hdrs = 1;
+
+	/*
+	 * Pull parameters from the stack page (32-bit words 1 and 2).
+	 * A better interface should be developed for kload
+	 * in the future.
+	 */
+	kld.k_modulep = ((unsigned int *)stack)[1];
+	kld.k_physfree = ((unsigned int *)stack)[2];
+
+	/*
+	 * Make sure there are 4 pages of kenv pages between the end of the
+	 * kernel and the start of free memory.
+	 * Why, you ask? Well, that is a question without a good answer as
+	 * of yet: for some strange reason some ata chips will not respond
+	 * correctly unless free memory starts at greater than 2 pages out.
+	 * The obvious assumption is that something is getting stomped on,
+	 * but that has yet to be determined. Adding this workaround.
+	 */
+	kld.k_physfree = MAX(kld.k_modulep + (4 * PAGE_SIZE), kld.k_physfree);
+
+	printf("WARNING kernphys set to 0x%lx make sure this matches kernphys "
+	    "from sys/conf/ldscript\n", kernphys);
+
+	if (k_execute) {
+		flags &= ~KLOAD_REBOOT;
+		flags |= KLOAD_EXEC;
+	}
+	if (k_reboot) {
+		flags &= ~KLOAD_EXEC;
+		flags |= KLOAD_REBOOT;
+		shutdown_processes();
+	}
+
+	return (syscall(SYS_kload, &kld, sizeof(kld), flags));
+}
+
+/*
+ * Bring the system down to a state where only this process is running.
+ *
+ * Sequence: sync the disks, ignore the signals we are about to trigger,
+ * stop init (pid 1) with SIGTSTP, send SIGTERM to every process, wait
+ * for swap page-in activity to settle, then send SIGKILL repeatedly
+ * until no process is left or the retry limit is reached.
+ *
+ * Returns 1 when the sequence completes (including the case where some
+ * processes refused to die).  Exits via err()/errx() with status 1 if
+ * init cannot be stopped, if SIGTERM cannot be delivered, or if SIGKILL
+ * fails for a reason other than ESRCH; in the last case init is sent
+ * SIGHUP first.
+ */
+static int
+shutdown_processes(void)
+{
+	/*
+	 * Signals we may receive as a side effect of killing parents,
+	 * group leaders, etc.  SIGPIPE is included so that we survive
+	 * killing whatever we are writing to when run in a pipeline.
+	 */
+	static const int ignored_signals[] = {
+		SIGHUP, SIGINT, SIGQUIT, SIGTERM, SIGTSTP, SIGPIPE
+	};
+	size_t n;
+	int round, attempt;
+	u_int before;
+	int saved_errno;
+
+	/*
+	 * Start disk transfers early so they proceed while the processes
+	 * are being killed.
+	 */
+	sync();
+
+	for (n = 0; n < sizeof(ignored_signals) / sizeof(ignored_signals[0]);
+	    n++)
+		(void)signal(ignored_signals[n], SIG_IGN);
+
+	/* Only suspend init here; the failure path below sends it SIGHUP. */
+	if (kill(1, SIGTSTP) == -1)
+		err(1, "SIGTSTP init");
+
+	/* Polite request first, so processes can save their state. */
+	if (kill(-1, SIGTERM) == -1 && errno != ESRCH)
+		err(1, "SIGTERM processes");
+
+	/*
+	 * Give the processes time to react: 2 seconds up front, then up to
+	 * 20 rounds of sync + 3 seconds, ending as soon as a round sees no
+	 * change in the swap page-in counter.
+	 */
+	sleep(2);
+	round = 0;
+	while (round < 20) {
+		before = get_pageins();
+		sync();
+		sleep(3);
+		if (get_pageins() == before)
+			break;
+		round++;
+	}
+
+	/*
+	 * Now insist.  ESRCH means nobody is left to signal.  Any other
+	 * failure is fatal: try to send init SIGHUP, then report the
+	 * original error.
+	 */
+	attempt = 1;
+	for (;;) {
+		if (kill(-1, SIGKILL) == -1) {
+			if (errno != ESRCH) {
+				saved_errno = errno;
+				errx(1, "%s%s", kill(1, SIGHUP) == -1 ?
+				    "(can't restart init): " : "",
+				    strerror(saved_errno));
+				/* NOTREACHED */
+			}
+			break;
+		}
+		if (attempt > 5) {
+			(void)fprintf(stderr,
+			    "WARNING: some process(es) wouldn't die\n");
+			break;
+		}
+		/* Back off a little longer after every attempt. */
+		(void)sleep(2 * attempt);
+		attempt++;
+	}
+	return 1;
+}
+
+/*
+ * Return the current value of the vm.stats.vm.v_swappgsin sysctl.
+ * If the sysctl cannot be read, print a warning and return 0.
+ */
+static u_int
+get_pageins(void)
+{
+	u_int swapped_in;
+	size_t sz = sizeof(swapped_in);
+
+	if (sysctlbyname("vm.stats.vm.v_swappgsin", &swapped_in, &sz,
+	    NULL, 0) == 0)
+		return (swapped_in);
+
+	warnx("v_swappgsin");
+	return (0);
+}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 256 bytes
Desc: OpenPGP digital signature
URL: <http://lists.freebsd.org/pipermail/freebsd-current/attachments/20121114/015f73c3/attachment-0001.sig>


More information about the freebsd-current mailing list