svn commit: r323722 - in head/sys: i386/i386 i386/include i386/isa kern
Konstantin Belousov
kib at FreeBSD.org
Mon Sep 18 20:22:44 UTC 2017
Author: kib
Date: Mon Sep 18 20:22:42 2017
New Revision: 323722
URL: https://svnweb.freebsd.org/changeset/base/323722
Log:
Fix handling of the segment registers on i386.
Suppose that userspace is executing with non-standard segment
descriptors. Then, until the exception or interrupt handler has executed
SET_KERNEL_SREGS, the kernel is still executing with user %ds, %es and %fs.
If an interrupt occurs in this window, the interrupt handler is
executed unsafely, relying on usability of the usermode registers. If
the interrupt results in the context switch on return, the
contamination of the kernel state spreads to the thread we switched
to. As a result, kernel data accesses might fault or, if only the base
is changed, be completely messed up.
Moreover, if the user segment was allocated in the LDT, another thread
might mark the descriptor as invalid before the doreti code tries to
reload the segment registers. In this case the kernel panics.
The issue exists for all exception entry points which use a trap gate,
and thus do not automatically disable interrupts on entry, and for
the lcall_syscall call gate handler.
The fix is two-fold: first, we need to disable interrupts for all kernel
entries, changing the IDT descriptor types from trap gate to interrupt
gate. Interrupts are re-enabled no earlier than the point where the
kernel segments are loaded into the segment registers. Second, we only
load the segment registers from the trap frame when returning to
usermode. For the latter, all interrupt return paths must go through
the doreti common code.
There is no way to disable interrupts on a call gate, which is the
intended mechanism for servicing lcall $7,$0 syscalls. Change the LDT
descriptor 0 into a code segment type and point it to the userspace
trampoline which redirects the syscall to int $0x80.
All these measures make the segment register handling similar to that
of amd64. We do not apply the amd64 optimization of not reloading
segment registers on return from the syscall.
Reported by: Maxime Villard <max at m00nbsd.net>
Tested by: pho (the non-lcall part)
Reviewed by: jhb
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
Differential revision: https://reviews.freebsd.org/D12402
Modified:
head/sys/i386/i386/apic_vector.s
head/sys/i386/i386/db_trace.c
head/sys/i386/i386/exception.s
head/sys/i386/i386/locore.s
head/sys/i386/i386/machdep.c
head/sys/i386/i386/trap.c
head/sys/i386/include/md_var.h
head/sys/i386/isa/npx.c
head/sys/kern/imgact_aout.c
Modified: head/sys/i386/i386/apic_vector.s
==============================================================================
--- head/sys/i386/i386/apic_vector.s Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/i386/i386/apic_vector.s Mon Sep 18 20:22:42 2017 (r323722)
@@ -189,8 +189,7 @@ IDTVEC(xen_intr_upcall)
SUPERALIGN_TEXT
invltlb_ret:
call as_lapic_eoi
- POP_FRAME
- iret
+ jmp doreti
SUPERALIGN_TEXT
IDTVEC(invltlb)
@@ -274,10 +273,8 @@ IDTVEC(cpustop)
call as_lapic_eoi
call cpustop_handler
+ jmp doreti
- POP_FRAME
- iret
-
/*
* Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
*/
@@ -290,10 +287,8 @@ IDTVEC(cpususpend)
call as_lapic_eoi
call cpususpend_handler
+ jmp doreti
- POP_FRAME
- jmp doreti_iret
-
/*
* Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
*
@@ -314,7 +309,6 @@ IDTVEC(rendezvous)
call smp_rendezvous_action
call as_lapic_eoi
- POP_FRAME
- iret
+ jmp doreti
#endif /* SMP */
Modified: head/sys/i386/i386/db_trace.c
==============================================================================
--- head/sys/i386/i386/db_trace.c Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/i386/i386/db_trace.c Mon Sep 18 20:22:42 2017 (r323722)
@@ -326,8 +326,7 @@ db_nextframe(struct i386_frame **fp, db_addr_t *ip, st
else if (strncmp(name, "Xatpic_intr", 11) == 0 ||
strncmp(name, "Xapic_isr", 9) == 0)
frame_type = INTERRUPT;
- else if (strcmp(name, "Xlcall_syscall") == 0 ||
- strcmp(name, "Xint0x80_syscall") == 0)
+ else if (strcmp(name, "Xint0x80_syscall") == 0)
frame_type = SYSCALL;
else if (strcmp(name, "dblfault_handler") == 0)
frame_type = DOUBLE_FAULT;
Modified: head/sys/i386/i386/exception.s
==============================================================================
--- head/sys/i386/i386/exception.s Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/i386/i386/exception.s Mon Sep 18 20:22:42 2017 (r323722)
@@ -98,15 +98,16 @@ MCOUNT_LABEL(user)
MCOUNT_LABEL(btrap)
#define TRAP(a) pushl $(a) ; jmp alltraps
+#define TRAP_NOEN(a) pushl $(a) ; jmp alltraps_noen
IDTVEC(div)
pushl $0; TRAP(T_DIVIDE)
IDTVEC(dbg)
- pushl $0; TRAP(T_TRCTRAP)
+ pushl $0; TRAP_NOEN(T_TRCTRAP)
IDTVEC(nmi)
- pushl $0; TRAP(T_NMI)
+ pushl $0; TRAP_NOEN(T_NMI)
IDTVEC(bpt)
- pushl $0; TRAP(T_BPTFLT)
+ pushl $0; TRAP_NOEN(T_BPTFLT)
IDTVEC(dtrace_ret)
pushl $0; TRAP(T_DTRACE_RET)
IDTVEC(ofl)
@@ -130,7 +131,7 @@ IDTVEC(stk)
IDTVEC(prot)
TRAP(T_PROTFLT)
IDTVEC(page)
- TRAP(T_PAGEFLT)
+ TRAP_NOEN(T_PAGEFLT)
IDTVEC(mchk)
pushl $0; TRAP(T_MCHK)
IDTVEC(rsvd)
@@ -142,6 +143,21 @@ IDTVEC(align)
IDTVEC(xmm)
pushl $0; TRAP(T_XMMFLT)
+ SUPERALIGN_TEXT
+ .globl alltraps_noen
+alltraps_noen:
+ pushal
+ pushl $0
+ movw %ds,(%esp)
+ pushl $0
+ movw %es,(%esp)
+ pushl $0
+ movw %fs,(%esp)
+ SET_KERNEL_SREGS
+ cld
+ FAKE_MCOUNT(TF_EIP(%esp))
+ jmp calltrap
+
/*
* All traps except ones for syscalls jump to alltraps. If
* interrupts were enabled when the trap occurred, then interrupts
@@ -164,6 +180,7 @@ alltraps:
movw %fs,(%esp)
alltraps_with_regs_pushed:
SET_KERNEL_SREGS
+ sti
cld
FAKE_MCOUNT(TF_EIP(%esp))
calltrap:
@@ -225,40 +242,6 @@ norm_ill:
#endif
/*
- * Call gate entry for syscalls (lcall 7,0).
- * This is used by FreeBSD 1.x a.out executables and "old" NetBSD executables.
- *
- * The intersegment call has been set up to specify one dummy parameter.
- * This leaves a place to put eflags so that the call frame can be
- * converted to a trap frame. Note that the eflags is (semi-)bogusly
- * pushed into (what will be) tf_err and then copied later into the
- * final spot. It has to be done this way because esp can't be just
- * temporarily altered for the pushfl - an interrupt might come in
- * and clobber the saved cs/eip.
- */
- SUPERALIGN_TEXT
-IDTVEC(lcall_syscall)
- pushfl /* save eflags */
- popl 8(%esp) /* shuffle into tf_eflags */
- pushl $7 /* sizeof "lcall 7,0" */
- pushl $0 /* tf_trapno */
- pushal
- pushl $0
- movw %ds,(%esp)
- pushl $0
- movw %es,(%esp)
- pushl $0
- movw %fs,(%esp)
- SET_KERNEL_SREGS
- cld
- FAKE_MCOUNT(TF_EIP(%esp))
- pushl %esp
- call syscall
- add $4, %esp
- MEXITCOUNT
- jmp doreti
-
-/*
* Trap gate entry for syscalls (int 0x80).
* This is used by FreeBSD ELF executables, "new" NetBSD executables, and all
* Linux executables.
@@ -279,6 +262,7 @@ IDTVEC(int0x80_syscall)
pushl $0
movw %fs,(%esp)
SET_KERNEL_SREGS
+ sti
cld
FAKE_MCOUNT(TF_EIP(%esp))
pushl %esp
@@ -362,7 +346,7 @@ doreti_next:
#ifdef HWPMC_HOOKS
je doreti_nmi
#else
- je doreti_exit
+ je doreti_notvm86
#endif
/*
* PSL_VM must be checked first since segment registers only
@@ -378,7 +362,7 @@ doreti_next:
doreti_notvm86:
testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */
- jz doreti_exit /* can't handle ASTs now if not */
+ jz doreti_nosegs /* can't handle ASTs now if not */
doreti_ast:
/*
@@ -415,6 +399,12 @@ doreti_popl_es:
.globl doreti_popl_ds
doreti_popl_ds:
popl %ds
+ jmp doreti_iret_popal
+
+doreti_nosegs:
+ MEXITCOUNT
+ addl $12,%esp
+doreti_iret_popal:
popal
addl $8,%esp
.globl doreti_iret
@@ -457,7 +447,7 @@ doreti_nmi:
* needs a user call chain capture.
*/
testb $SEL_RPL_MASK,TF_CS(%esp)
- jz doreti_exit
+ jz doreti_nosegs
movl PCPU(CURTHREAD),%eax /* curthread present? */
orl %eax,%eax
jz doreti_exit
Modified: head/sys/i386/i386/locore.s
==============================================================================
--- head/sys/i386/i386/locore.s Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/i386/i386/locore.s Mon Sep 18 20:22:42 2017 (r323722)
@@ -335,6 +335,44 @@ osigcode:
pushl %eax /* junk to fake return addr. */
int $0x80 /* enter kernel with args */
0: jmp 0b
+
+/*
+ * The lcall $7,$0 handler cannot use the call gate that does an
+ * inter-privilege transition. The reason is that the call gate
+ * does not disable interrupts, and, before the kernel segment registers
+ * are loaded, we would have a window where the ring 0 code is
+ * executed with the wrong segments.
+ *
+ * Instead, set LDT descriptor 0 as code segment, which reflects
+ * the lcall $7,$0 back to ring 3 trampoline. The trampoline sets up
+ * the frame for int $0x80.
+ */
+ ALIGN_TEXT
+lcall_tramp:
+ cmpl $SYS_vfork,%eax
+ je 1f
+ pushl %ebp
+ movl %esp,%ebp
+ pushl 0x24(%ebp) /* arg 6 */
+ pushl 0x20(%ebp)
+ pushl 0x1c(%ebp)
+ pushl 0x18(%ebp)
+ pushl 0x14(%ebp)
+ pushl 0x10(%ebp) /* arg 1 */
+ subl $4,%esp /* gap */
+ int $0x80
+ leavel
+ lretl
+1:
+ /*
+ * vfork handling is special and relies on the libc stub saving
+ * the return ip in %ecx. Also, we assume that the call was done
+ * with ucode32 selector in %cs.
+ */
+ int $0x80
+ movl $0x33,4(%esp) /* GUCODE32_SEL | SEL_UPL */
+ movl %ecx,(%esp)
+ lretl
#endif /* COMPAT_43 */
ALIGN_TEXT
@@ -353,6 +391,9 @@ szfreebsd4_sigcode:
.globl szosigcode
szosigcode:
.long esigcode-osigcode
+ .globl szlcallcode
+szlcallcode:
+ .long esigcode-lcall_tramp
#endif
.text
Modified: head/sys/i386/i386/machdep.c
==============================================================================
--- head/sys/i386/i386/machdep.c Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/i386/i386/machdep.c Mon Sep 18 20:22:42 2017 (r323722)
@@ -1513,7 +1513,7 @@ extern inthand_t
#ifdef XENHVM
IDTVEC(xen_intr_upcall),
#endif
- IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
+ IDTVEC(int0x80_syscall);
#ifdef DDB
/*
@@ -2157,7 +2157,9 @@ i386_kdb_init(void)
register_t
init386(int first)
{
- struct gate_descriptor *gdp;
+#ifdef COMPAT_43
+ struct segment_descriptor *gdp;
+#endif
int gsel_tss, metadata_missing, x, pa;
struct pcpu *pc;
struct xstate_hdr *xhdr;
@@ -2246,9 +2248,9 @@ init386(int first)
/* exceptions */
for (x = 0; x < NIDT; x++)
- setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
+ setidt(x, &IDTVEC(rsvd), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_DE, &IDTVEC(div), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
@@ -2256,39 +2258,39 @@ init386(int first)
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
+ setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL
+ setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386IGT, SEL_KPL
, GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
- setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_AC, &IDTVEC(align), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
+ setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
#ifdef KDTRACE_HOOKS
- setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
+ setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
#endif
#ifdef XENHVM
@@ -2329,9 +2331,9 @@ init386(int first)
clock_init();
finishidentcpu(); /* Final stage of CPU initialization */
- setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
- setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
initializecpu(); /* Initialize CPU registers */
initializecpucache();
@@ -2436,17 +2438,21 @@ init386(int first)
gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */
ltr(gsel_tss);
- /* make a call gate to reenter kernel with */
- gdp = &ldt[LSYS5CALLS_SEL].gd;
-
- x = (int) &IDTVEC(lcall_syscall);
- gdp->gd_looffset = x;
- gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
- gdp->gd_stkcpy = 1;
- gdp->gd_type = SDT_SYS386CGT;
- gdp->gd_dpl = SEL_UPL;
- gdp->gd_p = 1;
- gdp->gd_hioffset = x >> 16;
+#ifdef COMPAT_43
+ /*
+ * Make a code descriptor to emulate lcall $7,$0 with int
+ * $0x80. sd_hibase and sd_lobase are set after the sigtramp
+ * base in the shared table is known.
+ */
+ gdp = &ldt[LSYS5CALLS_SEL].sd;
+ gdp->sd_type = SDT_MEMERA;
+ gdp->sd_dpl = SEL_UPL;
+ gdp->sd_p = 1;
+ gdp->sd_def32 = 1;
+ gdp->sd_gran = 1;
+ gdp->sd_lolimit = 0xffff;
+ gdp->sd_hilimit = 0xf;
+#endif
/* transfer to user mode */
Modified: head/sys/i386/i386/trap.c
==============================================================================
--- head/sys/i386/i386/trap.c Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/i386/i386/trap.c Mon Sep 18 20:22:42 2017 (r323722)
@@ -114,8 +114,6 @@ static int trap_pfault(struct trapframe *, int, vm_off
static void trap_fatal(struct trapframe *, vm_offset_t);
void dblfault_handler(void);
-extern inthand_t IDTVEC(lcall_syscall);
-
#define MAX_TRAP_MSG 32
static char *trap_msg[] = {
"", /* 0 unused */
@@ -629,23 +627,6 @@ user_trctrap_out:
case T_TRCTRAP: /* trace trap */
kernel_trctrap:
- if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
- /*
- * We've just entered system mode via the
- * syscall lcall. Continue single stepping
- * silently until the syscall handler has
- * saved the flags.
- */
- return;
- }
- if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
- /*
- * The syscall handler has now saved the
- * flags. Stop single stepping it.
- */
- frame->tf_eflags &= ~PSL_T;
- return;
- }
/*
* Ignore debug register trace traps due to
* accesses in the user's address space, which
Modified: head/sys/i386/include/md_var.h
==============================================================================
--- head/sys/i386/include/md_var.h Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/i386/include/md_var.h Mon Sep 18 20:22:42 2017 (r323722)
@@ -43,6 +43,7 @@ extern int szfreebsd4_sigcode;
#endif
#ifdef COMPAT_43
extern int szosigcode;
+extern int szlcallcode;
#endif
extern uint32_t *vm_page_dump;
Modified: head/sys/i386/isa/npx.c
==============================================================================
--- head/sys/i386/isa/npx.c Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/i386/isa/npx.c Mon Sep 18 20:22:42 2017 (r323722)
@@ -237,7 +237,7 @@ npx_probe(void)
}
save_idt_npxtrap = idt[IDT_MF];
- setidt(IDT_MF, probetrap, SDT_SYS386TGT, SEL_KPL,
+ setidt(IDT_MF, probetrap, SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
/*
Modified: head/sys/kern/imgact_aout.c
==============================================================================
--- head/sys/kern/imgact_aout.c Mon Sep 18 20:17:08 2017 (r323721)
+++ head/sys/kern/imgact_aout.c Mon Sep 18 20:22:42 2017 (r323722)
@@ -27,6 +27,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_compat.h"
+
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/imgact.h>
@@ -337,3 +339,18 @@ exec_aout_imgact(struct image_params *imgp)
*/
static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
EXEC_SET(aout, aout_execsw);
+
+#if defined(__i386__) && defined(COMPAT_43)
+static void
+exec_init_lcall(void *arg __unused)
+{
+ struct segment_descriptor *gdp;
+ u_int lcall_addr;
+
+ gdp = &ldt[LSYS5CALLS_SEL].sd;
+ lcall_addr = aout_sysvec.sv_psstrings - szlcallcode;
+ gdp->sd_hibase = lcall_addr >> 24;
+ gdp->sd_lobase = lcall_addr;
+}
+SYSINIT(aout, SI_SUB_EXEC + 1, SI_ORDER_ANY, exec_init_lcall, NULL);
+#endif
More information about the svn-src-head
mailing list