9.1-stable crashes while copying data from a NFS mounted directory

Thu Jan 24 20:49:16 UTC 2013

On Thursday 24 January 2013 20:37:09 Konstantin Belousov wrote:
> On Thu, Jan 24, 2013 at 07:50:49PM +0100, Christian Gusenbauer wrote:
> > On Thursday 24 January 2013 19:07:23 Konstantin Belousov wrote:
> > > On Thu, Jan 24, 2013 at 08:03:59PM +0200, Konstantin Belousov wrote:
> > > > On Thu, Jan 24, 2013 at 06:05:57PM +0100, Christian Gusenbauer wrote:
> > > > > Hi!
> > > > > 
> > > > > I'm using 9.1 stable svn revision 245605 and I get the panic below
> > > > > if I execute the following commands (as single user):
> > > > > 
> > > > > # swapon -a
> > > > > # dumpon /dev/ada0s3b
> > > > > # mount -u /
> > > > > # ifconfig age0 inet 192.168.2.2 mtu 6144 up
> > > > > # mount -t nfs -o rsize=32768 data:/multimedia /mnt
> > > > > # cp /mnt/Movies/test/a.m2ts /tmp
> > > > > 
> > > > > then the system panics almost immediately. I'll attach the stack
> > > > > trace.
> > > > > 
> > > > > Note that I'm using jumbo frames (6144 bytes) on a 1Gbit network;
> > > > > maybe that's the cause of the panic, because the bcopy (see stack
> > > > > frame #15) fails.
> > > > > 
> > > > > Any clues?
> > > > 
> > > > I tried a similar operation with the nfs mount of rsize=32768 and mtu
> > > > 6144, but the machine runs HEAD and em instead of age. I was unable
> > > > to reproduce the panic on the copy of the 5GB file from nfs mount.
> > 
> > Hmmm, I did a quick test. If I do not change the MTU, so just configuring
> > age0 with
> > 
> > # ifconfig age0 inet 192.168.2.2 up
> > 
> > then I can copy all files from the mounted directory without any
> > problems, too. So it's probably age0 related?
> 
> From your backtrace and the buffer printout, I see a somewhat strange thing.
> The buffer data address is 0xffffff8171418000, while the kernel faulted
> on an attempt to write at 0xffffff8171413000, which is lower than
> the buffer data pointer, during the bcopy to the buffer.
> 
> The other data suggests that there was no overflow of the data from the
> server response. So it might be that mbuf_len(mp) returned a negative
> number? I am not sure whether that is possible at all.
> 
> Try this debugging patch, please. You need to add INVARIANTS etc to the
> kernel config.
> 
> diff --git a/sys/fs/nfs/nfs_commonsubs.c b/sys/fs/nfs/nfs_commonsubs.c
> index efc0786..9a6bda5 100644
> --- a/sys/fs/nfs/nfs_commonsubs.c
> +++ b/sys/fs/nfs/nfs_commonsubs.c
> @@ -218,6 +218,7 @@ nfsm_mbufuio(struct nfsrv_descript *nd, struct uio
> *uiop, int siz) }
>  				mbufcp = NFSMTOD(mp, caddr_t);
>  				len = mbuf_len(mp);
> +				KASSERT(len > 0, ("len %d", len));
>  			}
>  			xfer = (left > len) ? len : left;
>  #ifdef notdef
> @@ -239,6 +240,8 @@ nfsm_mbufuio(struct nfsrv_descript *nd, struct uio
> *uiop, int siz) uiop->uio_resid -= xfer;
>  		}
>  		if (uiop->uio_iov->iov_len <= siz) {
> +			KASSERT(uiop->uio_iovcnt > 1, ("uio_iovcnt %d",
> +			    uiop->uio_iovcnt));
>  			uiop->uio_iovcnt--;
>  			uiop->uio_iov++;
>  		} else {
> 
> I thought that the server had returned too long a response, but that does
> not seem to be the case from your data. Still, I think the patch below
> might be due.
> 
> diff --git a/sys/fs/nfsclient/nfs_clrpcops.c
> b/sys/fs/nfsclient/nfs_clrpcops.c index be0476a..a89b907 100644
> --- a/sys/fs/nfsclient/nfs_clrpcops.c
> +++ b/sys/fs/nfsclient/nfs_clrpcops.c
> @@ -1444,7 +1444,7 @@ nfsrpc_readrpc(vnode_t vp, struct uio *uiop, struct
> ucred *cred, NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
>  			eof = fxdr_unsigned(int, *tl);
>  		}
> -		NFSM_STRSIZ(retlen, rsize);
> +		NFSM_STRSIZ(retlen, len);
>  		error = nfsm_mbufuio(nd, uiop, retlen);
>  		if (error)
>  			goto nfsmout;

I applied your patches and now I get a

panic: len -4
cpuid = 1
KDB: enter: panic
Dumping 377 out of 6116 MB:..5%..13%..22%..34%..43%..51%..64%..73%..81%..94%


#0  doadump (textdump=0)
    at /spare/tmp/src-stable9/sys/kern/kern_shutdown.c:265
265             if (textdump && textdump_pending) {
(kgdb) #0  doadump (textdump=0)
    at /spare/tmp/src-stable9/sys/kern/kern_shutdown.c:265
#1  0xffffffff802a7490 in db_dump (dummy=<value optimized out>,
    dummy2=<value optimized out>, dummy3=<value optimized out>,
    dummy4=<value optimized out>)
    at /spare/tmp/src-stable9/sys/ddb/db_command.c:538
#2  0xffffffff802a6a7e in db_command (last_cmdp=0xffffffff808ca140,
    cmd_table=<value optimized out>, dopager=1)
    at /spare/tmp/src-stable9/sys/ddb/db_command.c:449
#3  0xffffffff802a6cd0 in db_command_loop ()
    at /spare/tmp/src-stable9/sys/ddb/db_command.c:502
#4  0xffffffff802a8e29 in db_trap (type=<value optimized out>,
    code=<value optimized out>)
    at /spare/tmp/src-stable9/sys/ddb/db_main.c:231
#5  0xffffffff803bf548 in kdb_trap (type=3, code=0, tf=0xffffff81b2ba1080)
    at /spare/tmp/src-stable9/sys/kern/subr_kdb.c:649
#6  0xffffffff80594c28 in trap (frame=0xffffff81b2ba1080)
    at /spare/tmp/src-stable9/sys/amd64/amd64/trap.c:579
#7  0xffffffff8057e06f in calltrap ()
    at /spare/tmp/src-stable9/sys/amd64/amd64/exception.S:228
#8  0xffffffff803beffb in kdb_enter (why=0xffffffff8060ebcf "panic",
    msg=0x80 <Address 0x80 out of bounds>) at cpufunc.h:63
#9  0xffffffff80389391 in panic (fmt=<value optimized out>)
    at /spare/tmp/src-stable9/sys/kern/kern_shutdown.c:627
#10 0xffffffff81e5bab2 in nfsm_mbufuio (nd=0xffffff81b2ba1340, uiop=0x7cf,
    siz=18)
    at /spare/tmp/src-stable9/sys/modules/nfscommon/../../fs/nfs/nfs_commonsubs.c:202
#11 0xffffffff81e195c1 in nfsrpc_read (vp=0xfffffe0006c94dc8,
    uiop=0xffffff81b2ba15c0, cred=<value optimized out>,
    p=0xfffffe0006aa6490, nap=0xffffff81b2ba14a0,
    attrflagp=0xffffff81b2ba156c, stuff=0x0)
    at /spare/tmp/src-stable9/sys/modules/nfscl/../../fs/nfsclient/nfs_clrpcops.c:1343
#12 0xffffffff81e3bd80 in ncl_readrpc (vp=0xfffffe0006c94dc8,
    uiop=0xffffff81b2ba15c0, cred=<value optimized out>)
    at /spare/tmp/src-stable9/sys/modules/nfscl/../../fs/nfsclient/nfs_clvnops.c:1366
#13 0xffffffff81e3086b in ncl_doio (vp=0xfffffe0006c94dc8,
    bp=0xffffff816f8f4120, cr=0xfffffe0002d58e00, td=0xfffffe0006aa6490,
    called_from_strategy=0)
    at /spare/tmp/src-stable9/sys/modules/nfscl/../../fs/nfsclient/nfs_clbio.c:1605
#14 0xffffffff81e3254f in ncl_bioread (vp=0xfffffe0006c94dc8,
    uio=0xffffff81b2ba1ad0, ioflag=<value optimized out>,
    cred=0xfffffe0002d58e00)
    at /spare/tmp/src-stable9/sys/modules/nfscl/../../fs/nfsclient/nfs_clbio.c:541
#15 0xffffffff80434ae8 in vn_read (fp=0xfffffe0006abda50,
    uio=0xffffff81b2ba1ad0, active_cred=<value optimized out>,
    flags=<value optimized out>, td=<value optimized out>) at vnode_if.h:384
#16 0xffffffff8043206e in vn_io_fault (fp=0xfffffe0006abda50,
    uio=0xffffff81b2ba1ad0, active_cred=0xfffffe0002d58e00, flags=0,
    td=0xfffffe0006aa6490) at /spare/tmp/src-stable9/sys/kern/vfs_vnops.c:903
#17 0xffffffff803d7ac1 in dofileread (td=0xfffffe0006aa6490, fd=3,
    fp=0xfffffe0006abda50, auio=0xffffff81b2ba1ad0,
    offset=<value optimized out>, flags=0) at file.h:287
#18 0xffffffff803d7e1c in kern_readv (td=0xfffffe0006aa6490, fd=3,
    auio=0xffffff81b2ba1ad0)
    at /spare/tmp/src-stable9/sys/kern/sys_generic.c:250
#19 0xffffffff803d7f34 in sys_read (td=<value optimized out>,
    uap=<value optimized out>)
    at /spare/tmp/src-stable9/sys/kern/sys_generic.c:166
#20 0xffffffff80593cb3 in amd64_syscall (td=0xfffffe0006aa6490, traced=0)
    at subr_syscall.c:135
#21 0xffffffff8057e357 in Xfast_syscall ()
    at /spare/tmp/src-stable9/sys/amd64/amd64/exception.S:387
#22 0x00000008009245fc in ?? ()
Previous frame inner to this frame (corrupt stack?)
(kgdb)