Re: git: 95f773e59482 - main - i386 copyout_fast: improve detection of a fault on accessing userspace

From: Konstantin Belousov <kostikbel_at_gmail.com>
Date: Tue, 06 Sep 2022 15:30:01 UTC
On Tue, Sep 06, 2022 at 05:18:26PM +0200, Tijl Coosemans wrote:
> On Wed, 24 Aug 2022 19:25:09 GMT Konstantin Belousov <kib@FreeBSD.org>
> wrote:
> > The branch main has been updated by kib:
> > 
> > URL: https://cgit.FreeBSD.org/src/commit/?id=95f773e59482b1a3462d2fe3901532d51fb053b3
> > 
> > commit 95f773e59482b1a3462d2fe3901532d51fb053b3
> > Author:     Konstantin Belousov <kib@FreeBSD.org>
> > AuthorDate: 2022-08-09 00:56:54 +0000
> > Commit:     Konstantin Belousov <kib@FreeBSD.org>
> > CommitDate: 2022-08-24 19:11:40 +0000
> > 
> >     i386 copyout_fast: improve detection of a fault on accessing userspace
> >     
> >     Do not blindly account a page fault occuring on the trampoline area,
> >     as the userspace access fault.  Check that it occured exactly in the
> >     instruction that does that.
> >     
> >     This avoids unneeded switches of address space on faults not needing the
> >     switch, effectively converting machine resets due to tripple faults,
> >     into regular panics.
> >     
> >     Reviewed by:    jhb
> >     Tested by:      pho
> >     Sponsored by:   The FreeBSD Foundation
> >     MFC after:      1 week
> >     Differential revision:  https://reviews.freebsd.org/D36302
> > ---
> >  sys/i386/i386/copyout_fast.s | 16 ++++++++--------
> >  sys/i386/i386/exception.s    | 32 ++++++++++++++++++++++++++++----
> >  2 files changed, 36 insertions(+), 12 deletions(-)
> > 
> > diff --git a/sys/i386/i386/copyout_fast.s b/sys/i386/i386/copyout_fast.s
> > index 715952f5fe20..d1d17f775872 100644
> > --- a/sys/i386/i386/copyout_fast.s
> > +++ b/sys/i386/i386/copyout_fast.s
> > @@ -93,7 +93,7 @@ ENTRY(copyout_fast)
> >  	popl	%ecx
> >  	popl	%edi
> >  	popl	%esi
> > -	rep; movsb
> > +pf_x1:	rep; movsb
> >  
> >  	movl	%ebx,%cr3
> >  	movl	%eax,%esp
> > @@ -150,7 +150,7 @@ ENTRY(copyin_fast)
> >  	popl	%ecx
> >  	popl	%edi
> >  	popl	%esi
> > -	rep; movsb
> > +pf_x2:	rep; movsb
> >  
> >  	movl	%ebx,%cr3
> >  
> > @@ -197,7 +197,7 @@ ENTRY(fueword_fast)
> >  	cli
> >  	movl	PCPU(TRAMPSTK),%esp
> >  	movl	%eax,%cr3
> > -	movl	(%ecx),%eax
> > +pf_x3:	movl	(%ecx),%eax
> >  	movl	%ebx,%cr3
> >  	movl	%esi,%esp
> >  	sti
> > @@ -226,7 +226,7 @@ ENTRY(fuword16_fast)
> >  	cli
> >  	movl	PCPU(TRAMPSTK),%esp
> >  	movl	%eax,%cr3
> > -	movzwl	(%ecx),%eax
> > +pf_x4:	movzwl	(%ecx),%eax
> >  	movl	%ebx,%cr3
> >  	movl	%esi,%esp
> >  	sti
> > @@ -252,7 +252,7 @@ ENTRY(fubyte_fast)
> >  	cli
> >  	movl	PCPU(TRAMPSTK),%esp
> >  	movl	%eax,%cr3
> > -	movzbl	(%ecx),%eax
> > +pf_x5:	movzbl	(%ecx),%eax
> >  	movl	%ebx,%cr3
> >  	movl	%esi,%esp
> >  	sti
> > @@ -291,7 +291,7 @@ ENTRY(suword_fast)
> >  	cli
> >  	movl	PCPU(TRAMPSTK),%esp
> >  	movl	%eax,%cr3
> > -	movl	%edi,(%ecx)
> > +pf_x6:	movl	%edi,(%ecx)
> >  	movl	%ebx,%cr3
> >  	movl	%esi,%esp
> >  	sti
> > @@ -319,7 +319,7 @@ ENTRY(suword16_fast)
> >  	cli
> >  	movl	PCPU(TRAMPSTK),%esp
> >  	movl	%eax,%cr3
> > -	movw	%di,(%ecx)
> > +pf_x7:	movw	%di,(%ecx)
> >  	movl	%ebx,%cr3
> >  	movl	%esi,%esp
> >  	sti
> > @@ -348,7 +348,7 @@ ENTRY(subyte_fast)
> >  	movl	PCPU(TRAMPSTK),%esp
> >  	movl	%eax,%cr3
> >  	movl	%edi,%eax
> > -	movb	%al,(%ecx)
> > +pf_x8:	movb	%al,(%ecx)
> >  	movl	%ebx,%cr3
> >  	movl	%esi,%esp
> >  	sti
> > diff --git a/sys/i386/i386/exception.s b/sys/i386/i386/exception.s
> > index f4135548fd81..42e9c474c3cd 100644
> > --- a/sys/i386/i386/exception.s
> > +++ b/sys/i386/i386/exception.s
> > @@ -130,17 +130,41 @@ IDTVEC(prot)
> >  	jmp	irettraps
> >  IDTVEC(page)
> >  	testl	$PSL_VM, TF_EFLAGS-TF_ERR(%esp)
> > -	jnz	1f
> > +	jnz	4f
> >  	testb	$SEL_RPL_MASK, TF_CS-TF_ERR(%esp)
> > -	jnz	1f
> > +	jnz	4f
> >  	cmpl	$PMAP_TRM_MIN_ADDRESS, TF_EIP-TF_ERR(%esp)
> > -	jb	1f
> > +	jb	4f
> > +	pushl	%eax
> > +	movl	TF_EIP-TF_ERR+4(%esp), %eax
> > +	addl	$1f, %eax
> > +	call	5f
> > +1:	cmpl	$pf_x1, %eax
> > +	je	2f
> > +	cmpl	$pf_x2, %eax
> > +	je	2f
> > +	cmpl	$pf_x3, %eax
> > +	je	2f
> > +	cmpl	$pf_x4, %eax
> > +	je	2f
> > +	cmpl	$pf_x5, %eax
> > +	je	2f
> > +	cmpl	$pf_x6, %eax
> > +	je	2f
> > +	cmpl	$pf_x7, %eax
> > +	je	2f
> > +	cmpl	$pf_x8, %eax
> > +	jne	3f
> > +2:	popl	%eax
> >  	movl	%ebx, %cr3
> >  	movl	%edx, TF_EIP-TF_ERR(%esp)
> >  	addl	$4, %esp
> >  	iret
> > -1:	pushl	$T_PAGEFLT
> > +3:	popl	%eax
> > +4:	pushl	$T_PAGEFLT
> >  	jmp	alltraps
> > +5:	subl	(%esp), %eax
> > +	retl
> >  IDTVEC(rsvd_pti)
> >  IDTVEC(rsvd)
> >  	pushl $0; TRAP(T_RESERVED)
> 
> I'm sporadically seeing a panic after this commit.  It's caused by a
> page fault during the second rep; movsb in copyin_fast (copying between
> copyout_buf and the kernel).  When I add the same special treatment as
> above for this instruction the panic goes away.
The purpose of the patch was to change some triple faults into normal
panics.  The check for %eip belonging to the trampoline area is too
broad to be correct.

I suspect you are seeing those leftover panics, which I am working on right now.

> 
> The panics happened while in X and the crash dump doesn't seem to
> contain the trampoline stack so I ended up rewriting copyin_fast and
> copyout_fast so the copying between the kernel and copyout_buf ran on
> the kernel stack instead of the trampoline stack (see attached patch).
> Now with this patch the panics are also gone so I suspect the problem is
> simply that the trampoline stack is too small to handle some page
> faults.
> 
> So, is this patch correct then?  I'm not sure it's actually safe to
> handle page faults in this context with interrupts disabled.
I do not think that the patch is correct, and I am even surprised that you
do not see sporadic reboots with it applied (do you?).  When you change
the address space to do the fast bcopy, the kernel stack gets unmapped.  So if
a page fault occurs because the user page is not resident, the fault must cause
a triple fault and reboot the system.

The idea that the trampoline stack is too small might have some merit; did
you try increasing the trampoline stack size?

I am currently reworking copyin/copyout_fast to avoid pushing the
bcopy args onto the trampoline stack, but there is one bug not yet fixed
in the change.  See
https://kib.kiev.ua/git/gitweb.cgi?p=deviant3.git;a=shortlog;h=refs/heads/ast

> diff --git a/sys/i386/i386/copyout_fast.s b/sys/i386/i386/copyout_fast.s
> index d1d17f775872..4b7d4c293466 100644
> --- a/sys/i386/i386/copyout_fast.s
> +++ b/sys/i386/i386/copyout_fast.s
> @@ -46,53 +46,28 @@ ENTRY(copyout_fast)
>  	pushl	%edi
>  	pushl	%ebx
>  
> -	movl	$copyout_fault,%edx
> +	movl	PCPU(CURPCB),%edx
> +	movl	16(%ebp),%ecx	/* len */
> +	movl	8(%ebp),%esi	/* kaddr */
> +	movl	PCPU(COPYOUT_BUF),%edi
>  	movl	20(%ebp),%ebx	/* KCR3 */
> -
> -	movl	PCPU(CURPCB),%eax
> -	movl	PCB_CR3(%eax),%edi
> +	movl	PCB_CR3(%edx),%edx
>  
>  	cli
> -	movl	PCPU(TRAMPSTK),%esi
> -	movl	PCPU(COPYOUT_BUF),%eax
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -	movl	12(%ebp),%eax	/* udaddr */
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -	movl	16(%ebp),%eax	/* len */
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -
> -	subl	$4, %esi
> -	movl	%edi, (%esi)
> +	/* bcopy(%esi = kaddr, %edi = PCPU(copyout_buf), %ecx = len) */
> +	rep; movsb
>  
> -	movl	8(%ebp),%eax	/* kaddr */
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -	movl	PCPU(COPYOUT_BUF),%eax
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -	movl	16(%ebp),%eax	/* len */
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> +	movl	16(%ebp),%ecx	/* len */
> +	movl	PCPU(COPYOUT_BUF),%esi
> +	movl	12(%ebp),%edi	/* udaddr */
>  
>  	movl	%esp,%eax
> -	movl	%esi,%esp
> -
> -	/* bcopy(%esi = kaddr, %edi = PCPU(copyout_buf), %ecx = len) */
> -	popl	%ecx
> -	popl	%edi
> -	popl	%esi
> -	rep; movsb
> +	movl	PCPU(TRAMPSTK),%esp
> +	movl	%edx,%cr3
>  
> -	popl	%edi
> -	movl	%edi,%cr3
> +	movl	$copyout_fault,%edx
>  
>  	/* bcopy(%esi = PCPU(copyout_buf), %edi = udaddr, %ecx = len) */
> -	popl	%ecx
> -	popl	%edi
> -	popl	%esi
>  pf_x1:	rep; movsb
>  
>  	movl	%ebx,%cr3
> @@ -114,53 +89,33 @@ ENTRY(copyin_fast)
>  	pushl	%edi
>  	pushl	%ebx
>  
> -	movl	$copyout_fault,%edx
> +	movl	PCPU(CURPCB),%edx
> +	movl	16(%ebp),%ecx	/* len */
> +	movl	8(%ebp),%esi	/* udaddr */
> +	movl	PCPU(COPYOUT_BUF),%edi
>  	movl	20(%ebp),%ebx	/* KCR3 */
> +	movl	PCB_CR3(%edx),%edx
>  
> -	movl	PCPU(CURPCB),%eax
> -	movl	PCB_CR3(%eax),%edi
> -
> +	movl	%esp,%eax
>  	cli
> -	movl	PCPU(TRAMPSTK),%esi
> -	movl	PCPU(COPYOUT_BUF),%eax
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -	movl	12(%ebp),%eax	/* kaddr */
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -	movl	16(%ebp),%eax	/* len */
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -
> -	movl	8(%ebp),%eax	/* udaddr */
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -	movl	PCPU(COPYOUT_BUF),%eax
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> -	movl	16(%ebp),%eax	/* len */
> -	subl	$4,%esi
> -	movl	%eax,(%esi)
> +	movl	PCPU(TRAMPSTK),%esp
> +	movl	%edx,%cr3
>  
> -	movl	%esp,%eax
> -	movl	%esi,%esp
> -	movl	%edi,%cr3
> +	movl	$copyout_fault,%edx
>  
>  	/* bcopy(%esi = udaddr, %edi = PCPU(copyout_buf), %ecx = len) */
> -	popl	%ecx
> -	popl	%edi
> -	popl	%esi
>  pf_x2:	rep; movsb
>  
>  	movl	%ebx,%cr3
> +	movl	%eax,%esp
> +
> +	movl	16(%ebp),%ecx	/* len */
> +	movl	PCPU(COPYOUT_BUF),%esi
> +	movl	12(%ebp),%edi	/* kaddr */
>  
>  	/* bcopy(%esi = PCPU(copyout_buf), %edi = kaddr, %ecx = len) */
> -	popl	%ecx
> -	popl	%edi
> -	popl	%esi
>  	rep; movsb
>  
> -	movl	%eax,%esp
>  	sti
>  
>  	xorl	%eax,%eax