svn commit: r328083 - in head/sys: amd64/amd64 amd64/ia32 amd64/include amd64/vmm amd64/vmm/intel dev/hyperv/vmbus dev/hyperv/vmbus/amd64 dev/hyperv/vmbus/i386 i386/i386 x86/include x86/isa x86/x86

Konstantin Belousov kib at FreeBSD.org
Wed Jan 17 11:44:23 UTC 2018


Author: kib
Date: Wed Jan 17 11:44:21 2018
New Revision: 328083
URL: https://svnweb.freebsd.org/changeset/base/328083

Log:
  PTI for amd64.
  
  The implementation of the Kernel Page Table Isolation (KPTI) for
  amd64, first version. It provides a workaround for the 'meltdown'
  vulnerability.  PTI is turned off by default for now, enable with the
  loader tunable vm.pmap.pti=1.
  
  The pmap page table is split into kernel-mode table and user-mode
  table. Kernel-mode table is identical to the non-PTI table, while
  usermode table is obtained from kernel table by leaving userspace
  mappings intact, but only leaving the following parts of the kernel
  mapped:
  
      kernel text (but not modules text)
      PCPU
      GDT/IDT/user LDT/task structures
      IST stacks for NMI and doublefault handlers.
  
  Kernel switches to user page table before returning to usermode, and
  restores full kernel page table on the entry. Initial kernel-mode
  stack for PTI trampoline is allocated in PCPU, it is only 16
  qwords.  Kernel entry trampoline switches page tables, then the
  hardware trap frame is copied to the normal kstack, and execution
  continues.
  
  IST stacks are kept mapped and no trampoline is needed for
  NMI/doublefault, but of course page table switch is performed.
  
  On return to usermode, the trampoline is used again, iret frame is
  copied to the trampoline stack, page tables are switched and iretq is
  executed.  The case of iretq faulting due to the invalid usermode
  context is tricky, since the frame for fault is appended to the
  trampoline frame.  Besides copying the fault frame and original
  (corrupted) frame to kstack, the fault frame must be patched to make
  it look as if the fault occurred on the kstack, see the comment in
  doret_iret detection code in trap().
  
  Currently kernel pages which are mapped during trampoline operation
  are identical for all pmaps.  They are registered using
  pmap_pti_add_kva().  Besides initial registrations done during boot,
  LDT and non-common TSS segments are registered if user requested their
  use.  In principle, they can be installed into kernel page table per
  pmap with some work.  Similarly, PCPU can be hidden from userspace
  mapping using trampoline PCPU page, but again I do not see much
  benefit besides complexity.
  
  PDPE pages for the kernel half of the user page tables are
  pre-allocated during boot because we need to know pml4 entries which
  are copied to the top-level paging structure page, in advance on a new
  pmap creation.  I enforce this to avoid iterating over all
  existing pmaps if a new PDPE page is needed for PTI kernel mappings.
  The iteration is a known problematic operation on i386.
  
  The need to flush hidden kernel translations on the switch to user
  mode makes global tables (PG_G) meaningless and even harmful, so PG_G
  use is disabled for PTI case.  Our existing use of PCID is
  incompatible with PTI and is automatically disabled if PTI is
  enabled.  PCID can be forced on only for developer's benefit.
  
  MCE is known to be broken, it requires IST stack to operate completely
  correctly even for non-PTI case, and absolutely needs dedicated IST
  stack because MCE delivery while trampoline has not switched from PTI
  stack is fatal.  The fix is pending.
  
  Reviewed by:	markj (partially)
  Tested by:	pho (previous version)
  Discussed with:	jeff, jhb
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks

Modified:
  head/sys/amd64/amd64/apic_vector.S
  head/sys/amd64/amd64/atpic_vector.S
  head/sys/amd64/amd64/cpu_switch.S
  head/sys/amd64/amd64/exception.S
  head/sys/amd64/amd64/genassym.c
  head/sys/amd64/amd64/machdep.c
  head/sys/amd64/amd64/mp_machdep.c
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/amd64/sys_machdep.c
  head/sys/amd64/amd64/trap.c
  head/sys/amd64/amd64/vm_machdep.c
  head/sys/amd64/ia32/ia32_exception.S
  head/sys/amd64/ia32/ia32_syscall.c
  head/sys/amd64/include/asmacros.h
  head/sys/amd64/include/frame.h
  head/sys/amd64/include/pcpu.h
  head/sys/amd64/include/pmap.h
  head/sys/amd64/include/smp.h
  head/sys/amd64/vmm/intel/vmx.c
  head/sys/amd64/vmm/vmm.c
  head/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
  head/sys/dev/hyperv/vmbus/i386/vmbus_vector.S
  head/sys/dev/hyperv/vmbus/vmbus.c
  head/sys/i386/i386/apic_vector.s
  head/sys/i386/i386/atpic_vector.s
  head/sys/i386/i386/exception.s
  head/sys/i386/i386/pmap.c
  head/sys/x86/include/apicvar.h
  head/sys/x86/include/x86_var.h
  head/sys/x86/isa/atpic.c
  head/sys/x86/x86/local_apic.c

Modified: head/sys/amd64/amd64/apic_vector.S
==============================================================================
--- head/sys/amd64/amd64/apic_vector.S	Wed Jan 17 11:21:03 2018	(r328082)
+++ head/sys/amd64/amd64/apic_vector.S	Wed Jan 17 11:44:21 2018	(r328083)
@@ -2,7 +2,13 @@
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * All rights reserved.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib at FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -38,12 +44,12 @@
 
 #include "opt_smp.h"
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
-#include "assym.s"
-
 #ifdef SMP
 #define LK	lock ;
 #else
@@ -73,30 +79,28 @@ as_lapic_eoi:
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
-#define	ISR_VEC(index, vec_name)					\
-	.text ;								\
-	SUPERALIGN_TEXT ;						\
-IDTVEC(vec_name) ;							\
-	PUSH_FRAME ;							\
-	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
-	cmpl	$0,x2apic_mode ;					\
-	je	1f ;							\
-	movl	$(MSR_APIC_ISR0 + index),%ecx ;				\
-	rdmsr ;								\
-	jmp	2f ;							\
-1: ;									\
-	movq	lapic_map, %rdx ;	/* pointer to local APIC */	\
-	movl	LA_ISR + 16 * (index)(%rdx), %eax ;	/* load ISR */	\
-2: ;									\
-	bsrl	%eax, %eax ;	/* index of highest set bit in ISR */	\
-	jz	3f ;							\
-	addl	$(32 * index),%eax ;					\
-	movq	%rsp, %rsi	;                                       \
-	movl	%eax, %edi ;	/* pass the IRQ */			\
-	call	lapic_handle_intr ;					\
-3: ;									\
-	MEXITCOUNT ;							\
+	.macro	ISR_VEC	index, vec_name
+	INTR_HANDLER	\vec_name
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	cmpl	$0,x2apic_mode
+	je	1f
+	movl	$(MSR_APIC_ISR0 + \index),%ecx
+	rdmsr
+	jmp	2f
+1:
+	movq	lapic_map, %rdx		/* pointer to local APIC */
+	movl	LA_ISR + 16 * (\index)(%rdx), %eax	/* load ISR */
+2:
+	bsrl	%eax, %eax	/* index of highest set bit in ISR */
+	jz	3f
+	addl	$(32 * \index),%eax
+	movq	%rsp, %rsi
+	movl	%eax, %edi	/* pass the IRQ */
+	call	lapic_handle_intr
+3:
+	MEXITCOUNT
 	jmp	doreti
+	.endm
 
 /*
  * Handle "spurious INTerrupts".
@@ -108,26 +112,21 @@ IDTVEC(vec_name) ;							\
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
-
 	/* No EOI cycle used here */
-
 	jmp	doreti_iret
 
-	ISR_VEC(1, apic_isr1)
-	ISR_VEC(2, apic_isr2)
-	ISR_VEC(3, apic_isr3)
-	ISR_VEC(4, apic_isr4)
-	ISR_VEC(5, apic_isr5)
-	ISR_VEC(6, apic_isr6)
-	ISR_VEC(7, apic_isr7)
+	ISR_VEC	1, apic_isr1
+	ISR_VEC	2, apic_isr2
+	ISR_VEC	3, apic_isr3
+	ISR_VEC	4, apic_isr4
+	ISR_VEC	5, apic_isr5
+	ISR_VEC	6, apic_isr6
+	ISR_VEC	7, apic_isr7
 
 /*
  * Local APIC periodic timer handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(timerint)
-	PUSH_FRAME
+	INTR_HANDLER	timerint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	lapic_handle_timer
@@ -137,10 +136,7 @@ IDTVEC(timerint)
 /*
  * Local APIC CMCI handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cmcint)
-	PUSH_FRAME
+	INTR_HANDLER cmcint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_cmc
 	MEXITCOUNT
@@ -149,10 +145,7 @@ IDTVEC(cmcint)
 /*
  * Local APIC error interrupt handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(errorint)
-	PUSH_FRAME
+	INTR_HANDLER errorint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_error
 	MEXITCOUNT
@@ -163,10 +156,7 @@ IDTVEC(errorint)
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(xen_intr_upcall)
-	PUSH_FRAME
+	INTR_HANDLER xen_intr_upcall
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	xen_intr_handle_upcall
@@ -183,74 +173,48 @@ IDTVEC(xen_intr_upcall)
 	SUPERALIGN_TEXT
 invltlb_ret:
 	call	as_lapic_eoi
-	POP_FRAME
-	jmp	doreti_iret
+	jmp	ld_regs
 
 	SUPERALIGN_TEXT
-IDTVEC(invltlb)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb
 	call	invltlb_handler
 	jmp	invltlb_ret
 
-IDTVEC(invltlb_pcid)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb_pcid
 	call	invltlb_pcid_handler
 	jmp	invltlb_ret
 
-IDTVEC(invltlb_invpcid)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb_invpcid
 	call	invltlb_invpcid_handler
 	jmp	invltlb_ret
 
 /*
  * Single page TLB shootdown
  */
-	.text
-
-	SUPERALIGN_TEXT
-IDTVEC(invlpg)
-	PUSH_FRAME
-
+	INTR_HANDLER invlpg
 	call	invlpg_handler
 	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(invlrng)
-	PUSH_FRAME
-
+	INTR_HANDLER invlrng
 	call	invlrng_handler
 	jmp	invltlb_ret
 
 /*
  * Invalidate cache.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(invlcache)
-	PUSH_FRAME
-
+	INTR_HANDLER invlcache
 	call	invlcache_handler
 	jmp	invltlb_ret
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(ipi_intr_bitmap_handler)		
-	PUSH_FRAME
-
+	INTR_HANDLER ipi_intr_bitmap_handler
 	call	as_lapic_eoi
-	
 	FAKE_MCOUNT(TF_RIP(%rsp))
-
 	call	ipi_bitmap_handler
 	MEXITCOUNT
 	jmp	doreti
@@ -258,24 +222,15 @@ IDTVEC(ipi_intr_bitmap_handler)		
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cpustop)
-	PUSH_FRAME
-
+	INTR_HANDLER cpustop
 	call	as_lapic_eoi
-
 	call	cpustop_handler
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cpususpend)
-	PUSH_FRAME
-
+	INTR_HANDLER cpususpend
 	call	cpususpend_handler
 	call	as_lapic_eoi
 	jmp	doreti
@@ -285,10 +240,7 @@ IDTVEC(cpususpend)
  *
  * - Calls the generic rendezvous action function.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(rendezvous)
-	PUSH_FRAME
+	INTR_HANDLER rendezvous
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movq	ipi_rendezvous_counts(,%rax,8), %rax
@@ -327,5 +279,9 @@ IDTVEC(justreturn)
 	popq	%rcx
 	popq	%rax
 	jmp	doreti_iret
+
+	INTR_HANDLER	justreturn1
+	call	as_lapic_eoi
+	jmp	doreti
 
 #endif /* SMP */

Modified: head/sys/amd64/amd64/atpic_vector.S
==============================================================================
--- head/sys/amd64/amd64/atpic_vector.S	Wed Jan 17 11:21:03 2018	(r328082)
+++ head/sys/amd64/amd64/atpic_vector.S	Wed Jan 17 11:44:21 2018	(r328083)
@@ -36,38 +36,35 @@
  * master and slave interrupt controllers.
  */
 
+#include "assym.s"
 #include <machine/asmacros.h>
 
-#include "assym.s"
-
 /*
  * Macros for interrupt entry, call to handler, and exit.
  */
-#define	INTR(irq_num, vec_name) \
-	.text ;								\
-	SUPERALIGN_TEXT ;						\
-IDTVEC(vec_name) ;							\
-	PUSH_FRAME ;							\
-	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
-	movq	%rsp, %rsi	;                                       \
-	movl	$irq_num, %edi; 	/* pass the IRQ */		\
-	call	atpic_handle_intr ;					\
-	MEXITCOUNT ;							\
+	.macro	INTR	irq_num, vec_name
+	INTR_HANDLER	\vec_name
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp, %rsi
+	movl	$\irq_num, %edi	 	/* pass the IRQ */
+	call	atpic_handle_intr
+	MEXITCOUNT
 	jmp	doreti
+	.endm
 
-	INTR(0, atpic_intr0)
-	INTR(1, atpic_intr1)
-	INTR(2, atpic_intr2)
-	INTR(3, atpic_intr3)
-	INTR(4, atpic_intr4)
-	INTR(5, atpic_intr5)
-	INTR(6, atpic_intr6)
-	INTR(7, atpic_intr7)
-	INTR(8, atpic_intr8)
-	INTR(9, atpic_intr9)
-	INTR(10, atpic_intr10)
-	INTR(11, atpic_intr11)
-	INTR(12, atpic_intr12)
-	INTR(13, atpic_intr13)
-	INTR(14, atpic_intr14)
-	INTR(15, atpic_intr15)
+	INTR	0, atpic_intr0
+	INTR	1, atpic_intr1
+	INTR	2, atpic_intr2
+	INTR	3, atpic_intr3
+	INTR	4, atpic_intr4
+	INTR	5, atpic_intr5
+	INTR	6, atpic_intr6
+	INTR	7, atpic_intr7
+	INTR	8, atpic_intr8
+	INTR	9, atpic_intr9
+	INTR	10, atpic_intr10
+	INTR	11, atpic_intr11
+	INTR	12, atpic_intr12
+	INTR	13, atpic_intr13
+	INTR	14, atpic_intr14
+	INTR	15, atpic_intr15

Modified: head/sys/amd64/amd64/cpu_switch.S
==============================================================================
--- head/sys/amd64/amd64/cpu_switch.S	Wed Jan 17 11:21:03 2018	(r328082)
+++ head/sys/amd64/amd64/cpu_switch.S	Wed Jan 17 11:44:21 2018	(r328083)
@@ -215,8 +215,10 @@ done_tss:
 	movq	%r8,PCPU(RSP0)
 	movq	%r8,PCPU(CURPCB)
 	/* Update the TSS_RSP0 pointer for the next interrupt */
+	cmpb	$0,pti(%rip)
+	jne	1f
 	movq	%r8,TSS_RSP0(%rdx)
-	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
+1:	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
 
 	/* Test if debug registers should be restored. */
 	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
@@ -293,7 +295,12 @@ do_tss:	movq	%rdx,PCPU(TSSP)
 	shrq	$8,%rcx
 	movl	%ecx,8(%rax)
 	movb	$0x89,5(%rax)	/* unset busy */
-	movl	$TSSSEL,%eax
+	cmpb	$0,pti(%rip)
+	je	1f
+	movq	PCPU(PRVSPACE),%rax
+	addq	$PC_PTI_STACK+PC_PTI_STACK_SZ*8,%rax
+	movq	%rax,TSS_RSP0(%rdx)
+1:	movl	$TSSSEL,%eax
 	ltr	%ax
 	jmp	done_tss
 

Modified: head/sys/amd64/amd64/exception.S
==============================================================================
--- head/sys/amd64/amd64/exception.S	Wed Jan 17 11:21:03 2018	(r328082)
+++ head/sys/amd64/amd64/exception.S	Wed Jan 17 11:44:21 2018	(r328083)
@@ -1,12 +1,16 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
- * Copyright (c) 2007 The FreeBSD Foundation
+ * Copyright (c) 2007-2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib at FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -38,13 +42,13 @@
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/psl.h>
 #include <machine/trap.h>
 #include <machine/specialreg.h>
 
-#include "assym.s"
-
 #ifdef KDTRACE_HOOKS
 	.bss
 	.globl	dtrace_invop_jump_addr
@@ -100,69 +104,63 @@ dtrace_invop_calltrap_addr:
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
-/* Traps that we leave interrupts disabled for.. */
-#define	TRAP_NOEN(a)	\
-	subq $TF_RIP,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
-	movq $0,TF_ERR(%rsp) ; \
+/* Traps that we leave interrupts disabled for. */
+	.macro	TRAP_NOEN	l, trapno
+	PTI_ENTRY	\l,X\l
+	.globl	X\l
+	.type	X\l, at function
+X\l:	subq $TF_RIP,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
+	movq $0,TF_ERR(%rsp)
 	jmp alltraps_noen
-IDTVEC(dbg)
-	TRAP_NOEN(T_TRCTRAP)
-IDTVEC(bpt)
-	TRAP_NOEN(T_BPTFLT)
+	.endm
+
+	TRAP_NOEN	dbg, T_TRCTRAP
+	TRAP_NOEN	bpt, T_BPTFLT
 #ifdef KDTRACE_HOOKS
-IDTVEC(dtrace_ret)
-	TRAP_NOEN(T_DTRACE_RET)
+	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
 #endif
 
 /* Regular traps; The cpu does not supply tf_err for these. */
-#define	TRAP(a)	 \
-	subq $TF_RIP,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
-	movq $0,TF_ERR(%rsp) ; \
+	.macro	TRAP	l, trapno
+	PTI_ENTRY	\l,X\l
+	.globl	X\l
+	.type	X\l, at function
+X\l:
+	subq $TF_RIP,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
+	movq $0,TF_ERR(%rsp)
 	jmp alltraps
-IDTVEC(div)
-	TRAP(T_DIVIDE)
-IDTVEC(ofl)
-	TRAP(T_OFLOW)
-IDTVEC(bnd)
-	TRAP(T_BOUND)
-IDTVEC(ill)
-	TRAP(T_PRIVINFLT)
-IDTVEC(dna)
-	TRAP(T_DNA)
-IDTVEC(fpusegm)
-	TRAP(T_FPOPFLT)
-IDTVEC(mchk)
-	TRAP(T_MCHK)
-IDTVEC(rsvd)
-	TRAP(T_RESERVED)
-IDTVEC(fpu)
-	TRAP(T_ARITHTRAP)
-IDTVEC(xmm)
-	TRAP(T_XMMFLT)
+	.endm
 
-/* This group of traps have tf_err already pushed by the cpu */
-#define	TRAP_ERR(a)	\
-	subq $TF_ERR,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
+	TRAP	div, T_DIVIDE
+	TRAP	ofl, T_OFLOW
+	TRAP	bnd, T_BOUND
+	TRAP	ill, T_PRIVINFLT
+	TRAP	dna, T_DNA
+	TRAP	fpusegm, T_FPOPFLT
+	TRAP	mchk, T_MCHK
+	TRAP	rsvd, T_RESERVED
+	TRAP	fpu, T_ARITHTRAP
+	TRAP	xmm, T_XMMFLT
+
+/* This group of traps have tf_err already pushed by the cpu. */
+	.macro	TRAP_ERR	l, trapno
+	PTI_ENTRY	\l,X\l,has_err=1
+	.globl	X\l
+	.type	X\l, at function
+X\l:
+	subq $TF_ERR,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
 	jmp alltraps
-IDTVEC(tss)
-	TRAP_ERR(T_TSSFLT)
-IDTVEC(missing)
-	subq	$TF_ERR,%rsp
-	movl	$T_SEGNPFLT,TF_TRAPNO(%rsp)
-	jmp	prot_addrf
-IDTVEC(stk)
-	subq	$TF_ERR,%rsp
-	movl	$T_STKFLT,TF_TRAPNO(%rsp)
-	jmp	prot_addrf
-IDTVEC(align)
-	TRAP_ERR(T_ALIGNFLT)
+	.endm
 
+	TRAP_ERR	tss, T_TSSFLT
+	TRAP_ERR	align, T_ALIGNFLT
+
 	/*
 	 * alltraps entry point.  Use swapgs if this is the first time in the
 	 * kernel from userland.  Reenable interrupts if they were enabled
@@ -174,15 +172,12 @@ IDTVEC(align)
 alltraps:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	alltraps_testi		/* already running with kernel GS.base */
+	jz	alltraps_segs		/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
-alltraps_testi:
+alltraps_segs:
+	SAVE_SEGS
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	alltraps_pushregs_no_rdi
 	sti
@@ -249,14 +244,12 @@ calltrap:
 alltraps_noen:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f	/* already running with kernel GS.base */
+	jz	alltraps_noen_segs /* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1:	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+alltraps_noen_segs:
+	SAVE_SEGS
 	jmp	alltraps_pushregs_no_rdi
 
 IDTVEC(dblfault)
@@ -279,37 +272,36 @@ IDTVEC(dblfault)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f			/* already running with kernel GS.base */
 	swapgs
 1:
-	movq	%rsp,%rdi
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	2f
+	movq	%rax,%cr3
+2:	movq	%rsp,%rdi
 	call	dblfault_handler
-2:
-	hlt
-	jmp	2b
+3:	hlt
+	jmp	3b
 
+	PTI_ENTRY	page, Xpage, has_err=1
 IDTVEC(page)
 	subq	$TF_ERR,%rsp
-	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f			/* already running with kernel GS.base */
+	jz	page_cr2		/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1:	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
+page_cr2:
+	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
 	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
+	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	alltraps_pushregs_no_rdi
 	sti
@@ -320,10 +312,43 @@ IDTVEC(page)
 	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
 	 * to do a special the swapgs in this case even coming from the kernel.
 	 * XXX linux has a trap handler for their equivalent of load_gs().
+	 *
+	 * On the stack, we have the hardware interrupt frame to return
+	 * to usermode (faulted) and another frame with error code, for
+	 * fault.  For PTI, copy both frames to the main thread stack.
 	 */
-IDTVEC(prot)
+	.macro PROTF_ENTRY name,trapno
+\name\()_pti_doreti:
+	pushq	%rax
+	pushq	%rdx
+	swapgs
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	movq	PCPU(RSP0),%rax
+	subq	$2*PTI_SIZE-3*8,%rax
+	MOVE_STACKS	(PTI_SIZE / 4 - 3)
+	movq	%rax,%rsp
+	popq	%rdx
+	popq	%rax
+	swapgs
+	jmp	X\name
+IDTVEC(\name\()_pti)
+	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
+	je	\name\()_pti_doreti
+	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
+	jz	X\name
+	PTI_UENTRY has_err=1
+	swapgs
+IDTVEC(\name)
 	subq	$TF_ERR,%rsp
-	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
+	movl	$\trapno,TF_TRAPNO(%rsp)
+	jmp	prot_addrf
+	.endm
+
+	PROTF_ENTRY	missing, T_SEGNPFLT
+	PROTF_ENTRY	stk, T_STKFLT
+	PROTF_ENTRY	prot, T_PROTFLT
+
 prot_addrf:
 	movq	$0,TF_ADDR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
@@ -375,8 +400,18 @@ prot_addrf:
  * We do not support invoking this from a custom segment registers,
  * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
  */
+	SUPERALIGN_TEXT
+IDTVEC(fast_syscall_pti)
+	swapgs
+	movq	%rax,PCPU(SCRATCH_RAX)
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	jmp	fast_syscall_common
+	SUPERALIGN_TEXT
 IDTVEC(fast_syscall)
 	swapgs
+	movq	%rax,PCPU(SCRATCH_RAX)
+fast_syscall_common:
 	movq	%rsp,PCPU(SCRATCH_RSP)
 	movq	PCPU(RSP0),%rsp
 	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
@@ -386,10 +421,9 @@ IDTVEC(fast_syscall)
 	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
 	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
 	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	movq	PCPU(SCRATCH_RAX),%rax
+	movq	%rax,TF_RAX(%rsp)	/* syscall number */
+	SAVE_SEGS
 	movq	PCPU(CURPCB),%r11
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
 	sti
@@ -402,7 +436,6 @@ IDTVEC(fast_syscall)
 	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
 	movq	%r8,TF_R8(%rsp)		/* arg 5 */
 	movq	%r9,TF_R9(%rsp)		/* arg 6 */
-	movq	%rax,TF_RAX(%rsp)	/* syscall number */
 	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
 	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
 	movq	%r12,TF_R12(%rsp)	/* C preserved */
@@ -420,11 +453,11 @@ IDTVEC(fast_syscall)
 	/* Disable interrupts before testing PCB_FULL_IRET. */
 	cli
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
-	jnz	3f
+	jnz	4f
 	/* Check for and handle AST's on return to userland. */
 	movq	PCPU(CURTHREAD),%rax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
-	jne	2f
+	jne	3f
 	/* Restore preserved registers. */
 	MEXITCOUNT
 	movq	TF_RDI(%rsp),%rdi	/* bonus; preserve arg 1 */
@@ -434,16 +467,21 @@ IDTVEC(fast_syscall)
 	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
 	movq	TF_RIP(%rsp),%rcx	/* original %rip */
 	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
-	swapgs
+	cmpb	$0,pti
+	je	2f
+	movq	PCPU(UCR3),%r9
+	movq	%r9,%cr3
+	xorl	%r9d,%r9d
+2:	swapgs
 	sysretq
 
-2:	/* AST scheduled. */
+3:	/* AST scheduled. */
 	sti
 	movq	%rsp,%rdi
 	call	ast
 	jmp	1b
 
-3:	/* Requested full context restore, use doreti for that. */
+4:	/* Requested full context restore, use doreti for that. */
 	MEXITCOUNT
 	jmp	doreti
 
@@ -499,17 +537,15 @@ IDTVEC(nmi)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	xorl	%ebx,%ebx
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jnz	nmi_fromuserspace
 	/*
-	 * We've interrupted the kernel.  Preserve GS.base in %r12.
+	 * We've interrupted the kernel.  Preserve GS.base in %r12
+	 * and %cr3 in %r13.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
@@ -521,27 +557,38 @@ IDTVEC(nmi)
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	nmi_calltrap
+	movq	%rax,%cr3
 	jmp	nmi_calltrap
 nmi_fromuserspace:
 	incl	%ebx
 	swapgs
-	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
-	jz	2f
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
 	movq	PCPU(CURPCB),%rdi
 	testq	%rdi,%rdi
-	jz	2f
+	jz	3f
+	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
+1:	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
+	jz	3f
 	cmpw	$KUF32SEL,TF_FS(%rsp)
-	jne	1f
+	jne	2f
 	rdfsbase %rax
 	movq	%rax,PCB_FSBASE(%rdi)
-1:	cmpw	$KUG32SEL,TF_GS(%rsp)
-	jne	2f
+2:	cmpw	$KUG32SEL,TF_GS(%rsp)
+	jne	3f
 	movl	$MSR_KGSBASE,%ecx
 	rdmsr
 	shlq	$32,%rdx
 	orq	%rdx,%rax
 	movq	%rax,PCB_GSBASE(%rdi)
-2:
+3:
 /* Note: this label is also used by ddb and gdb: */
 nmi_calltrap:
 	FAKE_MCOUNT(TF_RIP(%rsp))
@@ -564,26 +611,29 @@ nmi_calltrap:
 	movq	PCPU(CURTHREAD),%rax
 	orq	%rax,%rax	/* curthread present? */
 	jz	nocallchain
-	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
-	jz	nocallchain
 	/*
-	 * A user callchain is to be captured, so:
-	 * - Move execution to the regular kernel stack, to allow for
-	 *   nested NMI interrupts.
-	 * - Take the processor out of "NMI" mode by faking an "iret".
-	 * - Enable interrupts, so that copyin() can work.
+	 * Move execution to the regular kernel stack, because we
+	 * committed to return through doreti.
 	 */
 	movq	%rsp,%rsi	/* source stack pointer */
 	movq	$TF_SIZE,%rcx
 	movq	PCPU(RSP0),%rdx
 	subq	%rcx,%rdx
 	movq	%rdx,%rdi	/* destination stack pointer */
-
 	shrq	$3,%rcx		/* trap frame size in long words */
 	cld
 	rep
 	movsq			/* copy trapframe */
+	movq	%rdx,%rsp	/* we are on the regular kstack */
 
+	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
+	jz	nocallchain
+	/*
+	 * A user callchain is to be captured, so:
+	 * - Take the processor out of "NMI" mode by faking an "iret",
+	 *   to allow for nested NMI interrupts.
+	 * - Enable interrupts, so that copyin() can work.
+	 */
 	movl	%ss,%eax
 	pushq	%rax		/* tf_ss */
 	pushq	%rdx		/* tf_rsp (on kernel stack) */
@@ -624,22 +674,9 @@ nmi_kernelexit:
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
+	movq	%r13,%cr3
 nmi_restoreregs:
-	movq	TF_RDI(%rsp),%rdi
-	movq	TF_RSI(%rsp),%rsi
-	movq	TF_RDX(%rsp),%rdx
-	movq	TF_RCX(%rsp),%rcx
-	movq	TF_R8(%rsp),%r8
-	movq	TF_R9(%rsp),%r9
-	movq	TF_RAX(%rsp),%rax
-	movq	TF_RBX(%rsp),%rbx
-	movq	TF_RBP(%rsp),%rbp
-	movq	TF_R10(%rsp),%r10
-	movq	TF_R11(%rsp),%r11
-	movq	TF_R12(%rsp),%r12
-	movq	TF_R13(%rsp),%r13
-	movq	TF_R14(%rsp),%r14
-	movq	TF_R15(%rsp),%r15
+	RESTORE_REGS
 	addq	$TF_RIP,%rsp
 	jmp	doreti_iret
 
@@ -807,27 +844,38 @@ ld_es:
 ld_ds:
 	movw	TF_DS(%rsp),%ds
 ld_regs:
-	movq	TF_RDI(%rsp),%rdi
-	movq	TF_RSI(%rsp),%rsi
-	movq	TF_RDX(%rsp),%rdx
-	movq	TF_RCX(%rsp),%rcx
-	movq	TF_R8(%rsp),%r8
-	movq	TF_R9(%rsp),%r9
-	movq	TF_RAX(%rsp),%rax
-	movq	TF_RBX(%rsp),%rbx
-	movq	TF_RBP(%rsp),%rbp
-	movq	TF_R10(%rsp),%r10
-	movq	TF_R11(%rsp),%r11
-	movq	TF_R12(%rsp),%r12
-	movq	TF_R13(%rsp),%r13
-	movq	TF_R14(%rsp),%r14
-	movq	TF_R15(%rsp),%r15
+	RESTORE_REGS
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f			/* keep running with kernel GS.base */
+	jz	2f			/* keep running with kernel GS.base */
 	cli
+	cmpb	$0,pti
+	je	1f
+	pushq	%rdx
+	movq	PCPU(PRVSPACE),%rdx
+	addq	$PC_PTI_STACK+PC_PTI_STACK_SZ*8-PTI_SIZE,%rdx
+	movq	%rax,PTI_RAX(%rdx)
+	popq	%rax
+	movq	%rax,PTI_RDX(%rdx)
+	movq	TF_RIP(%rsp),%rax
+	movq	%rax,PTI_RIP(%rdx)
+	movq	TF_CS(%rsp),%rax
+	movq	%rax,PTI_CS(%rdx)
+	movq	TF_RFLAGS(%rsp),%rax
+	movq	%rax,PTI_RFLAGS(%rdx)
+	movq	TF_RSP(%rsp),%rax
+	movq	%rax,PTI_RSP(%rdx)
+	movq	TF_SS(%rsp),%rax
+	movq	%rax,PTI_SS(%rdx)
+	movq	PCPU(UCR3),%rax
 	swapgs
-1:
-	addq	$TF_RIP,%rsp		/* skip over tf_err, tf_trapno */
+	movq	%rdx,%rsp
+	movq	%rax,%cr3
+	popq	%rdx
+	popq	%rax
+	addq	$8,%rsp
+	jmp	doreti_iret
+1:	swapgs
+2:	addq	$TF_RIP,%rsp
 	.globl	doreti_iret
 doreti_iret:
 	iretq
@@ -851,14 +899,11 @@ set_segs:
 	.globl	doreti_iret_fault
 doreti_iret_fault:
 	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
-	testl	$PSL_I,TF_RFLAGS(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	1f
 	sti
 1:
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
@@ -885,7 +930,7 @@ doreti_iret_fault:
 	.globl	ds_load_fault
 ds_load_fault:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
-	testl	$PSL_I,TF_RFLAGS(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	1f
 	sti
 1:

Modified: head/sys/amd64/amd64/genassym.c
==============================================================================
--- head/sys/amd64/amd64/genassym.c	Wed Jan 17 11:21:03 2018	(r328082)
+++ head/sys/amd64/amd64/genassym.c	Wed Jan 17 11:44:21 2018	(r328083)
@@ -186,6 +186,16 @@ ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags));
 ASSYM(TF_SIZE, sizeof(struct trapframe));
 ASSYM(TF_HASSEGS, TF_HASSEGS);
 
+ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx));
+ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax));
+ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err));
+ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip));
+ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs));
+ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags));
+ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp));
+ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss));
+ASSYM(PTI_SIZE, sizeof(struct pti_frame));
+
 ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
 ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
 ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags));
@@ -202,6 +212,7 @@ ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethre
 ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
 ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
 ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp));
+ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax));
 ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
 ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp));
 ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0));
@@ -211,6 +222,10 @@ ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt));
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
 ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
+ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3));
+ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3));
+ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack));
+ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ);
  
 ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
 ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list