git: df8dd6025af8 - main - amd64: stop using top of the thread's kernel stack for FPU user save area

Tue Sep 21 17:21:18 UTC 2021

The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=df8dd6025af88a99d34f549fa9591a9b8f9b75b1

commit df8dd6025af88a99d34f549fa9591a9b8f9b75b1
Author:     Konstantin Belousov <kib at FreeBSD.org>
AuthorDate: 2021-09-13 21:05:47 +0000
Commit:     Konstantin Belousov <kib at FreeBSD.org>
CommitDate: 2021-09-21 17:20:15 +0000

    amd64: stop using top of the thread's kernel stack for FPU user save area
    
    Instead, do one more allocation at thread creation time.  This frees
    a lot of space on the stack.
    
    Also do not use alloca() for temporary storage in the signal delivery
    function sendsig() and the signal return syscall sys_sigreturn().  This
    saves an equal amount of space, again at the cost of one more allocation
    at thread creation time.
    
    A useful experiment now would be to reduce KSTACK_PAGES.
    
    Reviewed by:    jhb, markj
    Tested by:      pho
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D31954
---
 sys/amd64/amd64/exec_machdep.c |  4 ++--
 sys/amd64/amd64/fpu.c          |  2 ++
 sys/amd64/amd64/machdep.c      | 14 --------------
 sys/amd64/amd64/vm_machdep.c   | 22 +++++++++++++---------
 sys/amd64/ia32/ia32_signal.c   |  6 +++---
 sys/amd64/include/proc.h       |  2 ++
 sys/kern/kern_thread.c         |  2 +-
 7 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c
index 1297117638d6..48bda05f9685 100644
--- a/sys/amd64/amd64/exec_machdep.c
+++ b/sys/amd64/amd64/exec_machdep.c
@@ -135,7 +135,7 @@ sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
-		xfpusave = __builtin_alloca(xfpusave_len);
+		xfpusave = (char *)td->td_md.md_fpu_scratch;
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
@@ -674,7 +674,7 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
-		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
+		xfpustate = (char *)td->td_md.md_fpu_scratch;
 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c
index d7936b3b1922..24986958d4ca 100644
--- a/sys/amd64/amd64/fpu.c
+++ b/sys/amd64/amd64/fpu.c
@@ -448,6 +448,8 @@ fpuinitstate(void *arg __unused)
 		    xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO);
 	}
 
+	cpu_thread_alloc(&thread0);
+
 	saveintr = intr_disable();
 	stop_emulating();
 
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index d4e2356a9ae1..5c9b64526609 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1258,7 +1258,6 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
-	struct xstate_hdr *xhdr;
 	uint64_t cr3, rsp0;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
@@ -1564,19 +1563,6 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	msgbufinit(msgbufp, msgbufsize);
 	fpuinit();
 
-	/*
-	 * Reinitialize thread0's stack base now that the xsave area size is
-	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
-	 * save area size.  Zero out the extended state header in fpu save area.
-	 */
-	set_top_of_stack_td(&thread0);
-	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
-	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
-	if (use_xsave) {
-		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
-		    1);
-		xhdr->xstate_bv = xsave_mask;
-	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	rsp0 = thread0.td_md.md_stack_base;
 	/* Ensure the stack is aligned to 16 bytes */
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index 4567e6e0eb5d..e42d16d61b3a 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -90,19 +90,17 @@ void
 set_top_of_stack_td(struct thread *td)
 {
 	td->td_md.md_stack_base = td->td_kstack +
-	    td->td_kstack_pages * PAGE_SIZE -
-	    roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
+	    td->td_kstack_pages * PAGE_SIZE;
 }
 
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)
 {
-	vm_offset_t p;
-
-	p = td->td_md.md_stack_base;
-	KASSERT((p % XSAVE_AREA_ALIGN) == 0,
-	    ("Unaligned pcb_user_save area ptr %#lx td %p", p, td));
-	return ((struct savefpu *)p);
+	KASSERT(((vm_offset_t)td->td_md.md_usr_fpu_save %
+	    XSAVE_AREA_ALIGN) == 0,
+	    ("Unaligned pcb_user_save area ptr %p td %p",
+	    td->td_md.md_usr_fpu_save, td));
+	return (td->td_md.md_usr_fpu_save);
 }
 
 struct pcb *
@@ -393,6 +391,8 @@ cpu_thread_alloc(struct thread *td)
 	set_top_of_stack_td(td);
 	td->td_pcb = pcb = get_pcb_td(td);
 	td->td_frame = (struct trapframe *)td->td_md.md_stack_base - 1;
+	td->td_md.md_usr_fpu_save = fpu_save_area_alloc();
+	td->td_md.md_fpu_scratch = fpu_save_area_alloc();
 	pcb->pcb_save = get_pcb_user_save_pcb(pcb);
 	if (use_xsave) {
 		xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1);
@@ -404,8 +404,12 @@ cpu_thread_alloc(struct thread *td)
 void
 cpu_thread_free(struct thread *td)
 {
-
 	cpu_thread_clean(td);
+
+	fpu_save_area_free(td->td_md.md_usr_fpu_save);
+	td->td_md.md_usr_fpu_save = NULL;
+	fpu_save_area_free(td->td_md.md_fpu_scratch);
+	td->td_md.md_fpu_scratch = NULL;
 }
 
 bool
diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c
index 49b5797d68fd..9b67c7001a87 100644
--- a/sys/amd64/ia32/ia32_signal.c
+++ b/sys/amd64/ia32/ia32_signal.c
@@ -210,7 +210,7 @@ ia32_set_mcontext(struct thread *td, struct ia32_mcontext *mcp)
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
-		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
+		xfpustate = (char *)td->td_md.md_fpu_scratch;
 		ret = copyin(PTRIN(mcp->mc_xfpustate), xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
@@ -579,7 +579,7 @@ ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
-		xfpusave = __builtin_alloca(xfpusave_len);
+		xfpusave = (char *)td->td_md.md_fpu_scratch;
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
@@ -882,7 +882,7 @@ freebsd32_sigreturn(td, uap)
 			    td->td_proc->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
-		xfpustate = __builtin_alloca(xfpustate_len);
+		xfpustate = (char *)td->td_md.md_fpu_scratch;
 		error = copyin(PTRIN(ucp->uc_mcontext.mc_xfpustate),
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h
index 0f8cf50e326d..bd07f70f8d44 100644
--- a/sys/amd64/include/proc.h
+++ b/sys/amd64/include/proc.h
@@ -75,6 +75,8 @@ struct mdthread {
 	int	md_efirt_dis_pf;	/* (k) */
 	struct pcb md_pcb;
 	vm_offset_t md_stack_base;
+	struct savefpu *md_usr_fpu_save;
+	struct savefpu *md_fpu_scratch;
 };
 
 struct mdproc {
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 65c5cc65c87e..62f939406374 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -91,7 +91,7 @@ _Static_assert(offsetof(struct thread, td_pflags) == 0x110,
     "struct thread KBI td_pflags");
 _Static_assert(offsetof(struct thread, td_frame) == 0x4a8,
     "struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x6c0,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0xb8,
     "struct proc KBI p_flag");