PERFORCE change 137455 for review
Peter Wemm
peter at FreeBSD.org
Wed Mar 12 00:37:08 UTC 2008
http://perforce.freebsd.org/chv.cgi?CH=137455
Change 137455 by peter at peter_melody on 2008/03/12 00:36:14
Checkpoint cpu_switch speedup. This gets almost double the gain
that Jeff's patch does on my boxes.
There are still some loose ends in here. WIP.
Affected files ...
.. //depot/projects/hammer/sys/amd64/amd64/cpu_switch.S#43 edit
.. //depot/projects/hammer/sys/amd64/amd64/genassym.c#49 edit
Differences ...
==== //depot/projects/hammer/sys/amd64/amd64/cpu_switch.S#43 (text+ko) ====
@@ -97,43 +97,27 @@
movq TD_PCB(%rdi),%r8
movq (%rsp),%rax /* Hardware registers */
+ movq %r15,PCB_R15(%r8)
+ movq %r14,PCB_R14(%r8)
+ movq %r13,PCB_R13(%r8)
+ movq %r12,PCB_R12(%r8)
+ movq %rbp,PCB_RBP(%r8)
+ movq %rsp,PCB_RSP(%r8)
+ movq %rbx,PCB_RBX(%r8)
movq %rax,PCB_RIP(%r8)
- movq %rbx,PCB_RBX(%r8)
- movq %rsp,PCB_RSP(%r8)
- movq %rbp,PCB_RBP(%r8)
- movq %r12,PCB_R12(%r8)
- movq %r13,PCB_R13(%r8)
- movq %r14,PCB_R14(%r8)
- movq %r15,PCB_R15(%r8)
+
+#if 0
+ /* Save copy of pcb pointer */
+ movq %r8,%r9
+#endif
testl $PCB_32BIT,PCB_FLAGS(%r8)
- jz 1f /* no, skip over */
+ jnz store_gs /* static predict not taken */
+done_store_gs:
- /* Save userland %gs */
- movl %gs,PCB_GS(%r8)
- movq PCB_GS32P(%r8),%rax
- movq (%rax),%rax
- movq %rax,PCB_GS32SD(%r8)
-
-1:
- /* Test if debug registers should be saved. */
testl $PCB_DBREGS,PCB_FLAGS(%r8)
- jz 1f /* no, skip over */
- movq %dr7,%rax /* yes, do the save */
- movq %rax,PCB_DR7(%r8)
- andq $0x0000fc00, %rax /* disable all watchpoints */
- movq %rax,%dr7
- movq %dr6,%rax
- movq %rax,PCB_DR6(%r8)
- movq %dr3,%rax
- movq %rax,PCB_DR3(%r8)
- movq %dr2,%rax
- movq %rax,PCB_DR2(%r8)
- movq %dr1,%rax
- movq %rax,PCB_DR1(%r8)
- movq %dr0,%rax
- movq %rax,PCB_DR0(%r8)
-1:
+ jnz store_dr /* static predict not taken */
+done_store_dr:
/* have we used fp, and need a save? */
cmpq %rdi,PCPU(FPCURTHREAD)
@@ -181,82 +165,133 @@
cmpq %rcx, %rdx
pause
je 1b
- lfence
#endif
/*
* At this point, we've switched address spaces and are ready
* to load up the rest of the next context.
*/
+#if 1
movq TD_PCB(%rsi),%r8
+#endif
+
+ /* Skip loading user fsbase/gsbase for kthreads */
+ testl $TDP_KTHREAD,TD_PFLAGS(%rsi)
+ jnz 2f
+ movq TD_PCB(%rdi),%r9
+ movq PCB_FSBASE(%r8),%r10
+ cmpq PCB_FSBASE(%r9),%r10
+ jz 1f
/* Restore userland %fs */
movl $MSR_FSBASE,%ecx
movl PCB_FSBASE(%r8),%eax
movl PCB_FSBASE+4(%r8),%edx
wrmsr
+1:
+ movq PCB_GSBASE(%r8),%r10
+ cmpq PCB_GSBASE(%r9),%r10
+ jz 2f
/* Restore userland %gs */
movl $MSR_KGSBASE,%ecx
movl PCB_GSBASE(%r8),%eax
movl PCB_GSBASE+4(%r8),%edx
wrmsr
+2:
/* Update the TSS_RSP0 pointer for the next interrupt */
movq PCPU(TSSP), %rax
+ movq %r8, PCPU(RSP0)
+ movq %r8, PCPU(CURPCB)
addq $COMMON_TSS_RSP0, %rax
- leaq -16(%r8), %rbx
- movq %rbx, (%rax)
- movq %rbx, PCPU(RSP0)
+ movq %rsi, PCPU(CURTHREAD) /* into next thread */
+ movq %r8, (%rax)
- movq %r8, PCPU(CURPCB)
- movq %rsi, PCPU(CURTHREAD) /* into next thread */
+ /* Test if debug registers should be restored. */
+ testl $PCB_DBREGS,PCB_FLAGS(%r8)
+ jnz load_dr /* static predict not taken */
+done_load_dr:
testl $PCB_32BIT,PCB_FLAGS(%r8)
- jz 1f /* no, skip over */
+ jnz load_gs /* static predict not taken */
+done_load_gs:
+
+ /* Restore context. */
+ movq PCB_R15(%r8),%r15
+ movq PCB_R14(%r8),%r14
+ movq PCB_R13(%r8),%r13
+ movq PCB_R12(%r8),%r12
+ movq PCB_RBP(%r8),%rbp
+ movq PCB_RSP(%r8),%rsp
+ movq PCB_RBX(%r8),%rbx
+ movq PCB_RIP(%r8),%rax
+ movq %rax,(%rsp)
+ ret
+
+ /*
+ * We order these strangely for two reasons.
+ * 1: I wanted to use static branch prediction hints.
+ * 2: Most athlon64/opteron cpus don't have them. They define
+ * a forward branch as 'predict not taken'. Intel cores have
+ * branch hint prefixes (2Eh/3Eh) to invert this.
+ * So, to make it work on both forms of cpu we do the detour.
+ * We use jumps rather than call in order to avoid the stack.
+ */
+store_gs:
+ movl %gs,PCB_GS(%r8)
+ movq PCB_GS32P(%r8),%rax
+ movq (%rax),%rax
+ movq %rax,PCB_GS32SD(%r8)
+ jmp done_store_gs
+load_gs:
/* Restore userland %gs while preserving kernel gsbase */
movq PCB_GS32P(%r8),%rax
- movq PCB_GS32SD(%r8),%rbx
- movq %rbx,(%rax)
+ movq PCB_GS32SD(%r8),%rcx
+ movq %rcx,(%rax)
movl $MSR_GSBASE,%ecx
rdmsr
movl PCB_GS(%r8),%gs
wrmsr
+ jmp done_load_gs
-1:
- /* Restore context. */
- movq PCB_RBX(%r8),%rbx
- movq PCB_RSP(%r8),%rsp
- movq PCB_RBP(%r8),%rbp
- movq PCB_R12(%r8),%r12
- movq PCB_R13(%r8),%r13
- movq PCB_R14(%r8),%r14
- movq PCB_R15(%r8),%r15
- movq PCB_RIP(%r8),%rax
- movq %rax,(%rsp)
+store_dr:
+ movq %dr7,%rax /* yes, do the save */
+ movq %rax,PCB_DR7(%r8)
+ andq $0x0000fc00, %rax /* disable all watchpoints */
+ movq %rax,%dr7
+ movq %dr6,%r11
+ movq %dr3,%r12
+ movq %dr2,%r13
+ movq %dr1,%r14
+ movq %dr0,%r15
+ movq %r11,PCB_DR6(%r8)
+ movq %r12,PCB_DR3(%r8)
+ movq %r13,PCB_DR2(%r8)
+ movq %r14,PCB_DR1(%r8)
+ movq %r15,PCB_DR0(%r8)
+ jmp done_store_dr
- /* Test if debug registers should be restored. */
- testl $PCB_DBREGS,PCB_FLAGS(%r8)
- jz 1f
- movq PCB_DR6(%r8),%rax
- movq %rax,%dr6
- movq PCB_DR3(%r8),%rax
- movq %rax,%dr3
- movq PCB_DR2(%r8),%rax
- movq %rax,%dr2
- movq PCB_DR1(%r8),%rax
- movq %rax,%dr1
- movq PCB_DR0(%r8),%rax
- movq %rax,%dr0
+load_dr:
+ movq PCB_DR6(%r8),%r11
+ movq PCB_DR3(%r8),%r12
+ movq PCB_DR2(%r8),%r13
+ movq PCB_DR1(%r8),%r14
+ movq PCB_DR0(%r8),%r15
+ movq %r11,%dr6
+ movq %r12,%dr3
+ movq %r13,%dr2
+ movq %r14,%dr1
+ movq %r15,%dr0
/* But preserve reserved bits in %dr7 */
movq %dr7,%rax
+ movq PCB_DR7(%r8),%rcx
andq $0x0000fc00,%rax
- movq PCB_DR7(%r8),%rcx
andq $~0x0000fc00,%rcx
orq %rcx,%rax
movq %rax,%dr7
-1:
- ret
+ jmp done_load_dr
+
END(cpu_switch)
/*
==== //depot/projects/hammer/sys/amd64/amd64/genassym.c#49 (text+ko) ====
@@ -86,6 +86,7 @@
ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN);
+ASSYM(TDP_KTHREAD, TDP_KTHREAD);
ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap));
ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall));
More information about the p4-projects
mailing list