git: 1811c1e957ee - main - exec: Reimplement stack address randomization

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Mon, 17 Jan 2022 21:13:09 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=1811c1e957ee1250b08b3246fc0db37ddf64b736

commit 1811c1e957ee1250b08b3246fc0db37ddf64b736
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2022-01-17 16:42:56 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2022-01-17 21:12:36 +0000

    exec: Reimplement stack address randomization
    
    The approach taken by the stack gap implementation was to insert a
    random gap between the top of the fixed stack mapping and the true top
    of the main process stack.  This approach was chosen so as to avoid
    randomizing the previously fixed address of certain process metadata
    stored at the top of the stack, but had some shortcomings.  In
    particular, mlockall(2) calls would wire the gap, bloating the process's
    memory usage, and RLIMIT_STACK included the size of the gap, so small
    (< several MB) limits could not be used.
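
    For illustration, a process that wires its mappings as in the sketch
    below (not part of this commit) would previously have had the unused
    gap pages wired as well:

        #include <sys/mman.h>
        #include <err.h>

        int
        main(void)
        {
                /*
                 * Wire all current and future mappings.  Under the old
                 * scheme this also wired the never-used gap pages,
                 * inflating the process's resident memory.
                 */
                if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
                        err(1, "mlockall");
                return (0);
        }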
    
    There is little value in storing each process's ps_strings at a fixed
    location, as only very old programs hard-code this address; consumers
    were converted decades ago to use a sysctl-based interface for this
    purpose.  Thus, this change re-implements stack address randomization by
    simply breaking the convention of storing ps_strings at a fixed
    location, and randomizing the location of the entire stack mapping.
    This implementation is simpler and avoids the problems mentioned above,
    while being unlikely to break compatibility anywhere the default ASLR
    settings are used.
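
    A minimal sketch of such a consumer, using the kern.ps_strings sysctl
    mentioned in the sys/exec.h comment below:

        #include <sys/types.h>
        #include <sys/sysctl.h>
        #include <stdio.h>
        #include <stdlib.h>

        int
        main(void)
        {
                unsigned long psstrings;
                size_t len = sizeof(psstrings);

                /* Ask the kernel where ps_strings lives. */
                if (sysctlbyname("kern.ps_strings", &psstrings, &len,
                    NULL, 0) == -1) {
                        perror("sysctlbyname");
                        exit(1);
                }
                printf("ps_strings at %#lx\n", psstrings);
                return (0);
        }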
    
    The kern.elfN.aslr.stack_gap sysctl is renamed to kern.elfN.aslr.stack,
    and is re-enabled by default.
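
    Because the sysctl is now declared CTLFLAG_RWTUN (see the
    imgact_elf.c hunk below), it can be set both as a loader tunable and
    at runtime; for example, "sysctl kern.elf64.aslr.stack=0" opts
    64-bit binaries out of stack address randomization.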
    
    PR:             260303
    Reviewed by:    kib
    Discussed with: emaste, mw
    MFC after:      1 month
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D33704
---
 share/man/man7/security.7     | 16 ++++----
 sys/i386/linux/imgact_linux.c |  4 ++
 sys/kern/imgact_aout.c        |  4 ++
 sys/kern/imgact_elf.c         | 27 +++++++++-----
 sys/kern/kern_exec.c          | 86 ++++++++++++++++++++++++++++++-------------
 sys/sys/exec.h                |  3 +-
 sys/sys/imgact.h              |  1 +
 sys/vm/vm_map.c               |  4 +-
 sys/vm/vm_map.h               |  9 +++--
 9 files changed, 103 insertions(+), 51 deletions(-)

diff --git a/share/man/man7/security.7 b/share/man/man7/security.7
index bb7e120a1d46..1bb5338e54e6 100644
--- a/share/man/man7/security.7
+++ b/share/man/man7/security.7
@@ -28,7 +28,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 28, 2021
+.Dd January 14, 2022
 .Dt SECURITY 7
 .Os
 .Sh NAME
@@ -1062,19 +1062,19 @@ position-independent (PIE) 32bit binaries.
 .It Dv kern.elf32.aslr.honor_sbrk
 Makes ASLR less aggressive and more compatible with old binaries
 relying on the sbrk area.
-.It Dv kern.elf32.aslr.stack_gap
-If ASLR is enabled for a binary, a non-zero value creates a randomized
-stack gap between strings and the end of the aux vector.
-The value is the maximum percentage of main stack to waste on the gap.
-Cannot be greater than 50, i.e., at most half of the stack.
+.It Dv kern.elf32.aslr.stack
+If ASLR is enabled for a binary, a non-zero value enables randomization
+of the stack.
+Otherwise, the stack is mapped at a fixed location determined by the
+process ABI.
 .It Dv kern.elf64.aslr.enable
 64bit binaries ASLR control.
 .It Dv kern.elf64.aslr.pie_enable
 64bit PIE binaries ASLR control.
 .It Dv kern.elf64.aslr.honor_sbrk
 64bit binaries ASLR sbrk compatibility control.
-.It Dv kern.elf64.aslr.stack_gap
-Controls stack gap for 64bit binaries.
+.It Dv kern.elf64.aslr.stack
+Controls stack address randomization for 64bit binaries.
 .It Dv kern.elf32.nxstack
 Enables non-executable stack for 32bit processes.
 Enabled by default if supported by hardware and corresponding binary.
diff --git a/sys/i386/linux/imgact_linux.c b/sys/i386/linux/imgact_linux.c
index 661620b6ceaf..85357f41a705 100644
--- a/sys/i386/linux/imgact_linux.c
+++ b/sys/i386/linux/imgact_linux.c
@@ -213,6 +213,10 @@ exec_linux_imgact(struct image_params *imgp)
 	vmspace->vm_daddr =
 	    (caddr_t)(void *)(uintptr_t)(virtual_offset + a_out->a_text);
 
+	error = exec_map_stack(imgp);
+	if (error != 0)
+		goto fail;
+
 	/* Fill in image_params */
 	imgp->interpreted = 0;
 	imgp->entry_addr = a_out->a_entry;
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
index 6510488c3edd..1818e5665caf 100644
--- a/sys/kern/imgact_aout.c
+++ b/sys/kern/imgact_aout.c
@@ -350,6 +350,10 @@ exec_aout_imgact(struct image_params *imgp)
 	vmspace->vm_daddr = (caddr_t) (uintptr_t)
 			    (virtual_offset + a_out->a_text);
 
+	error = exec_map_stack(imgp);
+	if (error != 0)
+		return (error);
+
 	/* Fill in image_params */
 	imgp->interpreted = 0;
 	imgp->entry_addr = a_out->a_entry;
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index c3d19064f6e5..a0266108ec84 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -201,11 +201,11 @@ SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, honor_sbrk, CTLFLAG_RW,
     &__elfN(aslr_honor_sbrk), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": assume sbrk is used");
 
-static int __elfN(aslr_stack_gap) = 0;
-SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack_gap, CTLFLAG_RW,
-    &__elfN(aslr_stack_gap), 0,
+static int __elfN(aslr_stack) = 1;
+SYSCTL_INT(ASLR_NODE_OID, OID_AUTO, stack, CTLFLAG_RWTUN,
+    &__elfN(aslr_stack), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
-    ": maximum percentage of main stack to waste on a random gap");
+    ": enable stack address randomization");
 
 static int __elfN(sigfastblock) = 1;
 SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, sigfastblock,
@@ -1301,6 +1301,8 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 		if (!__elfN(aslr_honor_sbrk) ||
 		    (imgp->proc->p_flag2 & P2_ASLR_IGNSTART) != 0)
 			imgp->map_flags |= MAP_ASLR_IGNSTART;
+		if (__elfN(aslr_stack))
+			imgp->map_flags |= MAP_ASLR_STACK;
 	}
 
 	if ((!__elfN(allow_wx) && (fctl0 & NT_FREEBSD_FCTL_WXNEEDED) == 0 &&
@@ -1309,14 +1311,16 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 		imgp->map_flags |= MAP_WXORX;
 
 	error = exec_new_vmspace(imgp, sv);
-	vmspace = imgp->proc->p_vmspace;
-	map = &vmspace->vm_map;
 
 	imgp->proc->p_sysent = sv;
 	imgp->proc->p_elf_brandinfo = brand_info;
 
-	maxv = vm_map_max(map) - lim_max(td, RLIMIT_STACK);
-	if (mapsz >= maxv - vm_map_min(map)) {
+	vmspace = imgp->proc->p_vmspace;
+	map = &vmspace->vm_map;
+	maxv = sv->sv_usrstack;
+	if ((imgp->map_flags & MAP_ASLR_STACK) == 0)
+		maxv -= lim_max(td, RLIMIT_STACK);
+	if (error == 0 && mapsz >= maxv - vm_map_min(map)) {
 		uprintf("Excessive mapping size\n");
 		error = ENOEXEC;
 	}
@@ -1342,8 +1346,6 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 	if (error != 0)
 		goto ret;
 
-	entry = (u_long)hdr->e_entry + et_dyn_addr;
-
 	/*
 	 * We load the dynamic linker where a userland call
 	 * to mmap(0, ...) would put it.  The rationale behind this
@@ -1364,6 +1366,7 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 		map->anon_loc = addr;
 	}
 
+	entry = (u_long)hdr->e_entry + et_dyn_addr;
 	imgp->entry_addr = entry;
 
 	if (interp != NULL) {
@@ -1384,6 +1387,10 @@ __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 	} else
 		addr = et_dyn_addr;
 
+	error = exec_map_stack(imgp);
+	if (error != 0)
+		goto ret;
+
 	/*
 	 * Construct auxargs table (used by the copyout_auxargs routine)
 	 */
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 09d2461e4053..0494b73fc405 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -178,19 +178,19 @@ static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
-	int error;
+	vm_offset_t val;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
-		unsigned int val;
-		val = (unsigned int)p->p_sysent->sv_usrstack;
-		error = SYSCTL_OUT(req, &val, sizeof(val));
-	} else
+		unsigned int val32;
+
+		val32 = round_page((unsigned int)p->p_vmspace->vm_stacktop);
+		return (SYSCTL_OUT(req, &val32, sizeof(val32)));
+	}
 #endif
-		error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
-		    sizeof(p->p_sysent->sv_usrstack));
-	return error;
+	val = round_page(p->p_vmspace->vm_stacktop);
+	return (SYSCTL_OUT(req, &val, sizeof(val)));
 }
 
 static int
@@ -1106,9 +1106,8 @@ exec_free_abi_mappings(struct proc *p)
 }
 
 /*
- * Destroy old address space, and allocate a new stack.
- *	The new stack is only sgrowsiz large because it is grown
- *	automatically on a page fault.
+ * Run down the current address space and install a new one.  Map the shared
+ * page.
  */
 int
 exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
@@ -1118,11 +1117,8 @@ exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
 	struct vmspace *vmspace = p->p_vmspace;
 	struct thread *td = curthread;
 	vm_object_t obj;
-	struct rlimit rlim_stack;
-	vm_offset_t sv_minuser, stack_addr;
+	vm_offset_t sv_minuser;
 	vm_map_t map;
-	vm_prot_t stack_prot;
-	u_long ssiz;
 
 	imgp->vmspace_destroyed = true;
 	imgp->sysent = sv;
@@ -1157,7 +1153,7 @@ exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
 		 */
 		vm_map_lock(map);
 		vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR |
-		    MAP_ASLR_IGNSTART | MAP_WXORX);
+		    MAP_ASLR_IGNSTART | MAP_ASLR_STACK | MAP_WXORX);
 		vm_map_unlock(map);
 	} else {
 		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
@@ -1183,7 +1179,28 @@ exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
 		}
 	}
 
-	/* Allocate a new stack */
+	return (sv->sv_onexec != NULL ? sv->sv_onexec(p, imgp) : 0);
+}
+
+/*
+ * Compute the stack size limit and map the main process stack.
+ */
+int
+exec_map_stack(struct image_params *imgp)
+{
+	struct rlimit rlim_stack;
+	struct sysentvec *sv;
+	struct proc *p;
+	vm_map_t map;
+	struct vmspace *vmspace;
+	vm_offset_t stack_addr, stack_top;
+	u_long ssiz;
+	int error, find_space, stack_off;
+	vm_prot_t stack_prot;
+
+	p = imgp->proc;
+	sv = p->p_sysent;
+
 	if (imgp->stack_sz != 0) {
 		ssiz = trunc_page(imgp->stack_sz);
 		PROC_LOCK(p);
@@ -1200,27 +1217,46 @@ exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
 	} else {
 		ssiz = maxssiz;
 	}
-	stack_addr = sv->sv_usrstack - ssiz;
-	stack_prot = obj != NULL && imgp->stack_prot != 0 ?
+
+	vmspace = p->p_vmspace;
+	map = &vmspace->vm_map;
+
+	stack_prot = sv->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ?
 	    imgp->stack_prot : sv->sv_stackprot;
-	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, stack_prot,
-	    VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
+	if ((map->flags & MAP_ASLR_STACK) != 0) {
+		stack_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
+		    lim_max(curthread, RLIMIT_DATA));
+		find_space = VMFS_ANY_SPACE;
+	} else {
+		stack_addr = sv->sv_usrstack - ssiz;
+		find_space = VMFS_NO_SPACE;
+	}
+	error = vm_map_find(map, NULL, 0, &stack_addr, (vm_size_t)ssiz,
+	    sv->sv_usrstack, find_space, stack_prot, VM_PROT_ALL,
+	    MAP_STACK_GROWS_DOWN);
 	if (error != KERN_SUCCESS) {
 		uprintf("exec_new_vmspace: mapping stack size %#jx prot %#x "
-		    "failed mach error %d errno %d\n", (uintmax_t)ssiz,
+		    "failed, mach error %d errno %d\n", (uintmax_t)ssiz,
 		    stack_prot, error, vm_mmap_to_errno(error));
 		return (vm_mmap_to_errno(error));
 	}
-	vmspace->vm_stkgap = 0;
+
+	stack_top = stack_addr + ssiz;
+	if ((map->flags & MAP_ASLR_STACK) != 0) {
+		/* Randomize within the first page of the stack. */
+		arc4rand(&stack_off, sizeof(stack_off), 0);
+		stack_top -= rounddown2(stack_off & PAGE_MASK, sizeof(void *));
+	}
 
 	/*
 	 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they
 	 * are still used to enforce the stack rlimit on the process stack.
 	 */
-	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)stack_addr;
+	vmspace->vm_stacktop = stack_top;
+	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 
-	return (sv->sv_onexec != NULL ? sv->sv_onexec(p, imgp) : 0);
+	return (0);
 }
 
 /*
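
The randomization step above shifts stack_top down by up to nearly a
page while keeping it pointer-aligned, via
rounddown2(stack_off & PAGE_MASK, sizeof(void *)).  A standalone
userland sketch of the same arithmetic, with an illustrative starting
address rather than anything taken from the commit:

        #include <sys/param.h>  /* PAGE_MASK, rounddown2() */
        #include <stdint.h>
        #include <stdio.h>
        #include <stdlib.h>

        int
        main(void)
        {
                unsigned int stack_off;
                uintptr_t stack_top = 0x7fffffffe000UL;  /* hypothetical */

                /*
                 * Subtract a random, pointer-aligned, sub-page offset:
                 * stack_off & PAGE_MASK is in [0, PAGE_SIZE - 1], and
                 * rounddown2() truncates it to a multiple of
                 * sizeof(void *).
                 */
                stack_off = arc4random();
                stack_top -= rounddown2(stack_off & PAGE_MASK,
                    sizeof(void *));
                printf("randomized stack top: %#jx\n",
                    (uintmax_t)stack_top);
                return (0);
        }
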
diff --git a/sys/sys/exec.h b/sys/sys/exec.h
index 0ae2095c297f..82ee16befe28 100644
--- a/sys/sys/exec.h
+++ b/sys/sys/exec.h
@@ -87,7 +87,8 @@ struct execsw {
  * Prefer the kern.ps_strings or kern.proc.ps_strings sysctls to this constant.
  */
 #define	PS_STRINGS	(USRSTACK - sizeof(struct ps_strings))
-#define	PROC_PS_STRINGS(p)	((p)->p_sysent->sv_psstrings)
+#define	PROC_PS_STRINGS(p)	\
+	((p)->p_vmspace->vm_stacktop - (p)->p_sysent->sv_psstringssz)
 
 int exec_map_first_page(struct image_params *);        
 void exec_unmap_first_page(struct image_params *);       
diff --git a/sys/sys/imgact.h b/sys/sys/imgact.h
index 70e5c2e81579..bc1ab77a491e 100644
--- a/sys/sys/imgact.h
+++ b/sys/sys/imgact.h
@@ -113,6 +113,7 @@ int	exec_check_permissions(struct image_params *);
 void	exec_cleanup(struct thread *td, struct vmspace *);
 int	exec_copyout_strings(struct image_params *, uintptr_t *);
 void	exec_free_args(struct image_args *);
+int	exec_map_stack(struct image_params *);
 int	exec_new_vmspace(struct image_params *, struct sysentvec *);
 void	exec_setregs(struct thread *, struct image_params *, uintptr_t);
 int	exec_shell_imgact(struct image_params *);
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index 1b2b5eb8d5e9..98d3d1e5bb1d 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -343,7 +343,6 @@ vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
 	vm->vm_taddr = 0;
 	vm->vm_daddr = 0;
 	vm->vm_maxsaddr = 0;
-	vm->vm_stkgap = 0;
 	return (vm);
 }
 
@@ -4264,7 +4263,6 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
-	vm2->vm_stkgap = vm1->vm_stkgap;
 	vm_map_lock(old_map);
 	if (old_map->busy)
 		vm_map_wait_busy(old_map);
@@ -4283,7 +4281,7 @@ vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
 
 	new_map->anon_loc = old_map->anon_loc;
 	new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
-	    MAP_WXORX);
+	    MAP_ASLR_STACK | MAP_WXORX);
 
 	VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
 		if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
diff --git a/sys/vm/vm_map.h b/sys/vm/vm_map.h
index 873ff62eec4a..8f318b34e601 100644
--- a/sys/vm/vm_map.h
+++ b/sys/vm/vm_map.h
@@ -223,12 +223,13 @@ struct vm_map {
  * vm_flags_t values
  */
 #define MAP_WIREFUTURE		0x01	/* wire all future pages */
-#define	MAP_BUSY_WAKEUP		0x02
+#define	MAP_BUSY_WAKEUP		0x02	/* thread(s) waiting on busy state */
 #define	MAP_IS_SUB_MAP		0x04	/* has parent */
 #define	MAP_ASLR		0x08	/* enabled ASLR */
-#define	MAP_ASLR_IGNSTART	0x10
-#define	MAP_REPLENISH		0x20
+#define	MAP_ASLR_IGNSTART	0x10	/* ASLR ignores data segment */
+#define	MAP_REPLENISH		0x20	/* kmapent zone needs to be refilled */
 #define	MAP_WXORX		0x40	/* enforce W^X */
+#define	MAP_ASLR_STACK		0x80	/* stack location is randomized */
 
 #ifdef	_KERNEL
 #if defined(KLD_MODULE) && !defined(KLD_TIED)
@@ -293,7 +294,7 @@ struct vmspace {
 	caddr_t vm_taddr;	/* (c) user virtual address of text */
 	caddr_t vm_daddr;	/* (c) user virtual address of data */
 	caddr_t vm_maxsaddr;	/* user VA at max stack growth */
-	vm_size_t vm_stkgap;	/* stack gap size in bytes */
+	vm_offset_t vm_stacktop; /* top of the stack, may not be page-aligned */
 	u_int vm_refcnt;	/* number of references */
 	/*
 	 * Keep the PMAP last, so that CPU-specific variations of that